proper charset encoding with specs

This commit is contained in:
Colin Surprenant 2014-04-02 16:00:16 -04:00
parent 76a8f7667d
commit 2c8fdbc8b1
2 changed files with 104 additions and 6 deletions

View file

@ -1,5 +1,6 @@
# encoding: utf-8
require "logstash/codecs/base"
require "logstash/util/charset"
# The multiline codec will collapse multiline messages and merge them into a
# single event.
@ -150,16 +151,13 @@ class LogStash::Codecs::Multiline < LogStash::Codecs::Base
@buffer = []
@handler = method("do_#{@what}".to_sym)
@charset_encoding = Encoding.find(@charset)
@converter = LogStash::Util::Charset.new(@charset)
@converter.logger = @logger
end # def register
public
def decode(text, &block)
text.force_encoding(@charset_encoding)
if @charset_encoding != Encoding::UTF_8
# Convert to UTF-8 if not in that character set.
text = text.encode(Encoding::UTF_8, :invalid => :replace, :undef => :replace)
end
text = @converter.convert(text)
match = @grok.match(text)
@logger.debug("Multiline", :pattern => @pattern, :text => text,

View file

@ -1,3 +1,5 @@
# encoding: utf-8
require "logstash/codecs/multiline"
require "logstash/event"
require "insist"
@ -56,5 +58,103 @@ describe LogStash::Codecs::Multiline do
insist { events.size } == 1
insist { events.first["message"] } == lines.join("\n")
end
context "using default UTF-8 charset" do

  # Well-formed UTF-8 lines are expected to reach the event unchanged.
  it "should decode valid UTF-8 input" do
    codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
    inputs = ["foobar", "κόσμε"]
    decoded = []

    inputs.each do |input|
      # Sanity-check the fixture before handing it to the codec.
      insist { input.encoding.name } == "UTF-8"
      insist { input.valid_encoding? } == true
      codec.decode(input) { |event| decoded << event }
    end
    # Flush so the last buffered line is emitted as an event too.
    codec.flush { |event| decoded << event }

    insist { decoded.size } == 2
    decoded.zip(inputs).each do |event, input|
      insist { event["message"] } == input
      insist { event["message"].encoding.name } == "UTF-8"
    end
  end

  # Malformed byte sequences are expected to show up in their escaped
  # (String#inspect-style) form, still tagged as UTF-8.
  it "should escape invalid sequences" do
    codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
    inputs = ["foo \xED\xB9\x81\xC3", "bar \xAD"]
    decoded = []

    inputs.each do |input|
      # The fixtures are tagged UTF-8 but deliberately contain invalid bytes.
      insist { input.encoding.name } == "UTF-8"
      insist { input.valid_encoding? } == false
      codec.decode(input) { |event| decoded << event }
    end
    codec.flush { |event| decoded << event }

    insist { decoded.size } == 2
    decoded.zip(inputs).each do |event, input|
      # inspect wraps the string in double quotes; [1..-2] strips them.
      insist { event["message"] } == input.inspect[1..-2]
      insist { event["message"].encoding.name } == "UTF-8"
    end
  end
end
context "with valid non UTF-8 source encoding" do

  # Input declared as ISO-8859-1 is expected to come out transcoded to UTF-8.
  it "should encode to UTF-8" do
    codec = LogStash::Codecs::Multiline.new("charset" => "ISO-8859-1", "pattern" => "^\\s", "what" => "previous")
    # Each pair is [raw bytes fed to the codec, expected UTF-8 message].
    samples = [
      ["foobar", "foobar"],
      ["\xE0 Montr\xE9al", "à Montréal"],
    ]
    # force_encoding retags the raw literal in place and returns it.
    inputs = samples.map { |raw, _expected| raw.force_encoding("ISO-8859-1") }
    decoded = []

    inputs.each do |input|
      insist { input.encoding.name } == "ISO-8859-1"
      insist { input.valid_encoding? } == true
      codec.decode(input) { |event| decoded << event }
    end
    # Flush so the last buffered line is emitted as an event too.
    codec.flush { |event| decoded << event }

    insist { decoded.size } == 2
    expectations = samples.map { |_raw, expected| expected }
    decoded.zip(expectations).each do |event, expected|
      insist { expected.encoding.name } == "UTF-8"
      insist { event["message"] } == expected
      insist { event["message"].encoding.name } == "UTF-8"
    end
  end
end
context "with invalid non UTF-8 source encoding" do

  # Bytes tagged ASCII-8BIT (binary) have no UTF-8 mapping above 0x7F, so
  # every such byte is expected to be replaced by U+FFFD REPLACEMENT CHARACTER
  # while plain ASCII bytes pass through unchanged.
  it "should encode to UTF-8" do
    codec = LogStash::Codecs::Multiline.new("charset" => "ASCII-8BIT", "pattern" => "^\\s", "what" => "previous")
    # Each pair is [raw bytes fed to the codec, expected UTF-8 message].
    # Expectations use explicit \uFFFD escapes so the fixture cannot be
    # mangled by editors or viewers that mishandle the literal character.
    samples = [
      # two high bytes (\xE0, \xE9) => two replacement characters
      ["\xE0 Montr\xE9al", "\uFFFD Montr\uFFFDal"],
      # ten high bytes => ten replacement characters
      ["\xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5", "\uFFFD" * 10],
    ]
    events = []
    samples.map { |raw, _expected| raw.force_encoding("ASCII-8BIT") }.each do |line|
      insist { line.encoding.name } == "ASCII-8BIT"
      # Any byte sequence is "valid" when interpreted as binary.
      insist { line.valid_encoding? } == true
      codec.decode(line) { |event| events << event }
    end
    # Flush so the last buffered line is emitted as an event too.
    codec.flush { |e| events << e }

    insist { events.size } == 2
    events.zip(samples.map { |_raw, expected| expected }).each do |event, expected|
      insist { expected.encoding.name } == "UTF-8"
      insist { event["message"] } == expected
      insist { event["message"].encoding.name } == "UTF-8"
    end
  end
end
end
end