mirror of
https://github.com/elastic/logstash.git
synced 2025-04-24 06:37:19 -04:00
proper charset encoding with specs
This commit is contained in:
parent
76a8f7667d
commit
2c8fdbc8b1
2 changed files with 104 additions and 6 deletions
|
@ -1,5 +1,6 @@
|
|||
# encoding: utf-8
|
||||
require "logstash/codecs/base"
|
||||
require "logstash/util/charset"
|
||||
|
||||
# The multiline codec will collapse multiline messages and merge them into a
|
||||
# single event.
|
||||
|
@ -150,16 +151,13 @@ class LogStash::Codecs::Multiline < LogStash::Codecs::Base
|
|||
@buffer = []
|
||||
@handler = method("do_#{@what}".to_sym)
|
||||
|
||||
@charset_encoding = Encoding.find(@charset)
|
||||
@converter = LogStash::Util::Charset.new(@charset)
|
||||
@converter.logger = @logger
|
||||
end # def register
|
||||
|
||||
public
|
||||
def decode(text, &block)
|
||||
text.force_encoding(@charset_encoding)
|
||||
if @charset_encoding != Encoding::UTF_8
|
||||
# Convert to UTF-8 if not in that character set.
|
||||
text = text.encode(Encoding::UTF_8, :invalid => :replace, :undef => :replace)
|
||||
end
|
||||
text = @converter.convert(text)
|
||||
|
||||
match = @grok.match(text)
|
||||
@logger.debug("Multiline", :pattern => @pattern, :text => text,
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# encoding: utf-8
|
||||
|
||||
require "logstash/codecs/multiline"
|
||||
require "logstash/event"
|
||||
require "insist"
|
||||
|
@ -56,5 +58,103 @@ describe LogStash::Codecs::Multiline do
|
|||
insist { events.size } == 1
|
||||
insist { events.first["message"] } == lines.join("\n")
|
||||
end
|
||||
|
||||
|
||||
context "using default UTF-8 charset" do
|
||||
|
||||
it "should decode valid UTF-8 input" do
|
||||
codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
|
||||
lines = [ "foobar", "κόσμε" ]
|
||||
events = []
|
||||
lines.each do |line|
|
||||
insist { line.encoding.name } == "UTF-8"
|
||||
insist { line.valid_encoding? } == true
|
||||
|
||||
codec.decode(line) { |event| events << event }
|
||||
end
|
||||
codec.flush { |e| events << e }
|
||||
insist { events.size } == 2
|
||||
|
||||
events.zip(lines).each do |tuple|
|
||||
insist { tuple[0]["message"] } == tuple[1]
|
||||
insist { tuple[0]["message"].encoding.name } == "UTF-8"
|
||||
end
|
||||
end
|
||||
|
||||
it "should escape invalid sequences" do
|
||||
codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
|
||||
lines = [ "foo \xED\xB9\x81\xC3", "bar \xAD" ]
|
||||
events = []
|
||||
lines.each do |line|
|
||||
insist { line.encoding.name } == "UTF-8"
|
||||
insist { line.valid_encoding? } == false
|
||||
|
||||
codec.decode(line) { |event| events << event }
|
||||
end
|
||||
codec.flush { |e| events << e }
|
||||
insist { events.size } == 2
|
||||
|
||||
events.zip(lines).each do |tuple|
|
||||
insist { tuple[0]["message"] } == tuple[1].inspect[1..-2]
|
||||
insist { tuple[0]["message"].encoding.name } == "UTF-8"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
context "with valid non UTF-8 source encoding" do
|
||||
|
||||
it "should encode to UTF-8" do
|
||||
codec = LogStash::Codecs::Multiline.new("charset" => "ISO-8859-1", "pattern" => "^\\s", "what" => "previous")
|
||||
samples = [
|
||||
["foobar", "foobar"],
|
||||
["\xE0 Montr\xE9al", "à Montréal"],
|
||||
]
|
||||
|
||||
# lines = [ "foo \xED\xB9\x81\xC3", "bar \xAD" ]
|
||||
events = []
|
||||
samples.map{|(a, b)| a.force_encoding("ISO-8859-1")}.each do |line|
|
||||
insist { line.encoding.name } == "ISO-8859-1"
|
||||
insist { line.valid_encoding? } == true
|
||||
|
||||
codec.decode(line) { |event| events << event }
|
||||
end
|
||||
codec.flush { |e| events << e }
|
||||
insist { events.size } == 2
|
||||
|
||||
events.zip(samples.map{|(a, b)| b}).each do |tuple|
|
||||
insist { tuple[1].encoding.name } == "UTF-8"
|
||||
insist { tuple[0]["message"] } == tuple[1]
|
||||
insist { tuple[0]["message"].encoding.name } == "UTF-8"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
context "with invalid non UTF-8 source encoding" do
|
||||
|
||||
it "should encode to UTF-8" do
|
||||
codec = LogStash::Codecs::Multiline.new("charset" => "ASCII-8BIT", "pattern" => "^\\s", "what" => "previous")
|
||||
samples = [
|
||||
["\xE0 Montr\xE9al", "<EFBFBD> Montr<74>al"],
|
||||
["\xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5", "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"],
|
||||
]
|
||||
events = []
|
||||
samples.map{|(a, b)| a.force_encoding("ASCII-8BIT")}.each do |line|
|
||||
insist { line.encoding.name } == "ASCII-8BIT"
|
||||
insist { line.valid_encoding? } == true
|
||||
|
||||
codec.decode(line) { |event| events << event }
|
||||
end
|
||||
codec.flush { |e| events << e }
|
||||
insist { events.size } == 2
|
||||
|
||||
events.zip(samples.map{|(a, b)| b}).each do |tuple|
|
||||
insist { tuple[1].encoding.name } == "UTF-8"
|
||||
insist { tuple[0]["message"] } == tuple[1]
|
||||
insist { tuple[0]["message"].encoding.name } == "UTF-8"
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue