Actually encode to UTF-8 when the input charset is not UTF-8; add basic specs

This commit is contained in:
Colin Surprenant 2014-03-07 19:35:47 -05:00 committed by Jordan Sissel
parent b67e223b3b
commit d083dcb264
3 changed files with 93 additions and 24 deletions

View file

@ -105,7 +105,7 @@ clean:
.PHONY: vendor-clean
# Delete the listed vendor/ subdirectories. The leading '-' on each recipe
# line tells make to keep going even if that rm invocation fails.
vendor-clean:
	-$(QUIET)rm -rf vendor/kibana vendor/geoip vendor/collectd
	-$(QUIET)rm -rf vendor/jar vendor/ua-parser
.PHONY: clean-vendor

View file

@ -4,36 +4,31 @@ require "logstash/util"
# Converts strings that arrive in a configured charset into UTF-8 strings.
class LogStash::Util::Charset
  # Logger that receives a warning when UTF-8 was declared but the data is
  # not valid UTF-8. It is not set by the constructor; assign it afterwards.
  attr_accessor :logger

  # @param charset [String] name of the encoding incoming data is declared
  #   to be in (e.g. "UTF-8", "ISO-8859-1")
  def initialize(charset)
    @charset = charset
  end

  # Returns +data+ as a UTF-8 string.
  #
  # Note that +data+ itself is re-tagged in place with the configured charset
  # (String#force_encoding changes the receiver).
  #
  # @param data [String] raw bytes to interpret in the configured charset
  # @return [String] for a non UTF-8 charset, a transcoded copy in which
  #   invalid or unmappable bytes are replaced; for UTF-8, +data+ itself when
  #   it is valid, otherwise an escaped rendering of its bytes
  def convert(data)
    data.force_encoding(@charset)

    # NON UTF-8 charset declared.
    # Let's convert it (as cleanly as possible) into UTF-8 so we can use it with JSON, etc.
    return data.encode("UTF-8", :invalid => :replace, :undef => :replace) unless @charset == "UTF-8"

    # UTF-8 charset declared and the bytes really are UTF-8: nothing to do.
    return data if data.valid_encoding?

    # Some users don't know the charset of their logs or just don't know they
    # can set the charset setting.
    # A silly hack to help convert some of the unknown bytes to
    # somewhat-readable escape codes. The [1..-2] is to trim the quotes
    # ruby puts on the value.
    escaped = data.inspect[1..-2]

    # The logger is optional (it is only ever assigned through the accessor),
    # so do not let a missing logger abort the conversion.
    if @logger
      @logger.warn("Received an event that has a different character encoding than you configured.", :text => escaped, :expected_charset => @charset)
    end

    escaped
  end # def convert
end # class LogStash::Util::Charset

74
spec/util/charset_spec.rb Normal file
View file

@ -0,0 +1,74 @@
# encoding: utf-8
require "test_utils"
require "logstash/util/charset"
# Specs for LogStash::Util::Charset#convert, grouped by the charset the
# converter is configured with and by whether the input bytes are valid in it.
describe LogStash::Util::Charset do
  let(:logger) { double("logger") }

  context "with valid UTF-8 source encoding" do
    subject { LogStash::Util::Charset.new("UTF-8") }

    it "should return untouched data" do
      ["foobar", "κόσμε"].each do |data|
        insist { data.encoding.name } == "UTF-8"
        insist { subject.convert(data) } == data
        insist { subject.convert(data).encoding.name } == "UTF-8"
      end
    end
  end

  context "with invalid UTF-8 source encoding" do
    subject do
      LogStash::Util::Charset.new("UTF-8").tap do |charset|
        charset.logger = logger
      end
    end

    it "should escape invalid sequences" do
      ["foo \xED\xB9\x81\xC3", "bar \xAD"].each do |data|
        insist { data.encoding.name } == "UTF-8"
        insist { data.valid_encoding? } == false
        # convert is called twice per sample below, hence two warnings.
        logger.should_receive(:warn).twice
        insist { subject.convert(data) } == data.inspect[1..-2]
        insist { subject.convert(data).encoding.name } == "UTF-8"
      end
    end
  end

  context "with valid non UTF-8 source encoding" do
    subject { LogStash::Util::Charset.new("ISO-8859-1") }

    it "should encode to UTF-8" do
      samples = [
        ["foobar", "foobar"],
        ["\xE0 Montr\xE9al", "à Montréal"],
      ]
      samples.map { |(a, b)| [a.force_encoding("ISO-8859-1"), b] }.each do |(a, b)|
        insist { a.encoding.name } == "ISO-8859-1"
        insist { b.encoding.name } == "UTF-8"
        insist { a.valid_encoding? } == true
        insist { subject.convert(a).encoding.name } == "UTF-8"
        insist { subject.convert(a) } == b
      end
    end
  end

  context "with invalid non UTF-8 source encoding" do
    subject { LogStash::Util::Charset.new("ASCII-8BIT") }

    it "should encode to UTF-8 and replace invalid chars" do
      # Bytes >= 0x80 tagged as ASCII-8BIT have no UTF-8 mapping, so each one
      # is expected to come back as U+FFFD (the Unicode replacement character).
      # Written as \uFFFD escapes so the expectation survives re-encoding of
      # this file.
      samples = [
        ["\xE0 Montr\xE9al", "\uFFFD Montr\uFFFDal"],
        ["\xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5", "\uFFFD" * 10],
      ]
      samples.map { |(a, b)| [a.force_encoding("ASCII-8BIT"), b] }.each do |(a, b)|
        insist { a.encoding.name } == "ASCII-8BIT"
        insist { b.encoding.name } == "UTF-8"
        insist { subject.convert(a).encoding.name } == "UTF-8"
        insist { subject.convert(a) } == b
      end
    end
  end
end