mirror of
https://github.com/elastic/logstash.git
synced 2025-04-19 04:15:23 -04:00
Json normalization performance (#16313)
* licenses: allow elv2, standard abbreviation for Elastic License version 2

* json-dump: reduce unicode normalization cost

  Since the underlying JrJackson now properly (and efficiently) encodes the UTF-8 transcode of whichever strings it is given, we no longer need to pre-normalize to UTF-8 in ruby _except_ when the string is flagged as BINARY, because we have alternate behaviour to preserve valid UTF-8 sequences.

  By emitting a _copy_ of binary-flagged strings that have been re-flagged as UTF-8, we allow the downstream (efficient) encoding operation in jrjackson to produce equivalent behaviour at much lower cost.

* cleanup: remove orphan unicode normalizer
This commit is contained in:
parent
2404bad9a9
commit
66aeeeef83
3 changed files with 7 additions and 41 deletions
|
@ -16,7 +16,6 @@
|
|||
# under the License.
|
||||
|
||||
require "logstash/environment"
|
||||
require "logstash/util/unicode_normalizer"
|
||||
require "jrjackson"
|
||||
|
||||
module LogStash
|
||||
|
@ -51,7 +50,12 @@ module LogStash
|
|||
def normalize_encoding(data)
|
||||
case data
|
||||
when String
|
||||
LogStash::UnicodeNormalizer.normalize_string_encoding(data)
|
||||
if data.encoding == Encoding::ASCII_8BIT
|
||||
# when given BINARY-flagged string, assume it is UTF-8 so that
|
||||
# subsequent cleanup retains valid UTF-8 sequences
|
||||
data = data.dup.force_encoding(Encoding::UTF_8)
|
||||
end
|
||||
data
|
||||
when Array
|
||||
data.map { |item| normalize_encoding(item) }
|
||||
when Hash
|
||||
|
|
|
@ -1,38 +0,0 @@
|
|||
# Licensed to Elasticsearch B.V. under one or more contributor
|
||||
# license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright
|
||||
# ownership. Elasticsearch B.V. licenses this file to you under
|
||||
# the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
module LogStash
|
||||
|
||||
# A class to normalize the invalid unicode data
|
||||
class UnicodeNormalizer
|
||||
|
||||
include LogStash::Util::Loggable
|
||||
|
||||
# Tries to normalize input string to UTF-8 when
|
||||
# input string encoding is not UTF-8,
|
||||
# and replaces invalid unicode bytes with replacement characters ('\uFFFD')
|
||||
# string_data - The String data to be normalized.
|
||||
# Returns the normalized string data.
|
||||
def self.normalize_string_encoding(string_data)
|
||||
# when given BINARY-flagged string, assume it is UTF-8 so that
|
||||
# subsequent cleanup retains valid UTF-8 sequences
|
||||
source_encoding = string_data.encoding
|
||||
source_encoding = Encoding::UTF_8 if source_encoding == Encoding::BINARY
|
||||
string_data.encode(Encoding::UTF_8, source_encoding, invalid: :replace, undef: :replace).scrub
|
||||
end
|
||||
end
|
||||
end
|
|
@ -31,7 +31,7 @@ describe "Project licenses" do
|
|||
/ruby/,
|
||||
/lgpl/,
|
||||
/epl/,
|
||||
/elastic/i
|
||||
/elastic|elv2/i
|
||||
])
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue