mirror of
https://github.com/elastic/logstash.git
synced 2025-04-23 22:27:21 -04:00
actually encode to UTF-8 when input charset is non UTF-8 + basic specs
This commit is contained in:
parent
b67e223b3b
commit
d083dcb264
3 changed files with 93 additions and 24 deletions
2
Makefile
2
Makefile
|
@ -105,7 +105,7 @@ clean:
|
|||
|
||||
.PHONY: vendor-clean
|
||||
vendor-clean:
|
||||
-$(QUIET)rm -rf vendor/kibana vendor/geoip vendor/collectd
|
||||
-$(QUIET)rm -rf vendor/kibana vendor/geoip vendor/collectd
|
||||
-$(QUIET)rm -rf vendor/jar vendor/ua-parser
|
||||
|
||||
.PHONY: clean-vendor
|
||||
|
|
|
@ -4,36 +4,31 @@ require "logstash/util"
|
|||
|
||||
class LogStash::Util::Charset
|
||||
attr_accessor :logger
|
||||
|
||||
def initialize(charset)
|
||||
@charset = charset
|
||||
end
|
||||
|
||||
def convert(data)
|
||||
data.force_encoding(@charset)
|
||||
if @charset == "UTF-8"
|
||||
# Some users don't know the charset of their logs or just don't know they
|
||||
# can set the charset setting.
|
||||
if !data.valid_encoding?
|
||||
@logger.warn("Received an event that has a different character encoding than you configured.", :text => data.inspect[1..-2], :expected_charset => @charset)
|
||||
#if @force_lossy_charset_conversion
|
||||
## Janky hack to force ruby to re-encode UTF-8 with replacement chars.
|
||||
#data.force_encoding("CP65001")
|
||||
#data = data.encode("UTF-8", :invalid => :replace, :undef => :replace)
|
||||
#else
|
||||
#end
|
||||
|
||||
# A silly hack to help convert some of the unknown bytes to
|
||||
# somewhat-readable escape codes. The [1..-2] is to trim the quotes
|
||||
# ruby puts on the value.
|
||||
data = data.inspect[1..-2]
|
||||
else
|
||||
# The user has declared the character encoding of this data is
|
||||
# something other than UTF-8. Let's convert it (as cleanly as possible)
|
||||
# into UTF-8 so we can use it with JSON, etc.
|
||||
data = data.encode("UTF-8", :invalid => :replace, :undef => :replace)
|
||||
# NON UTF-8 charset declared.
|
||||
# Let's convert it (as cleanly as possible) into UTF-8 so we can use it with JSON, etc.
|
||||
return data.encode("UTF-8", :invalid => :replace, :undef => :replace) unless @charset == "UTF-8"
|
||||
|
||||
# UTF-8 charset declared.
|
||||
# Some users don't know the charset of their logs or just don't know they
|
||||
# can set the charset setting.
|
||||
unless data.valid_encoding?
|
||||
# A silly hack to help convert some of the unknown bytes to
|
||||
# somewhat-readable escape codes. The [1..-2] is to trim the quotes
|
||||
# ruby puts on the value.
|
||||
return data.inspect[1..-2].tap do |escaped|
|
||||
@logger.warn("Received an event that has a different character encoding than you configured.", :text => escaped, :expected_charset => @charset)
|
||||
end
|
||||
end
|
||||
return data
|
||||
end # def convert
|
||||
end # class LogStash::Util::Charset
|
||||
|
||||
data
|
||||
end # def convert
|
||||
|
||||
end # class LogStash::Util::Charset
|
||||
|
|
74
spec/util/charset_spec.rb
Normal file
74
spec/util/charset_spec.rb
Normal file
|
@ -0,0 +1,74 @@
|
|||
# encoding: utf-8
|
||||
|
||||
require "test_utils"
|
||||
require "logstash/util/charset"
|
||||
|
||||
describe LogStash::Util::Charset do
|
||||
let(:logger) { double("logger") }
|
||||
|
||||
context "with valid UTF-8 source encoding" do
|
||||
subject {LogStash::Util::Charset.new("UTF-8")}
|
||||
|
||||
it "should return untouched data" do
|
||||
["foobar", "κόσμε"].each do |data|
|
||||
insist { data.encoding.name } == "UTF-8"
|
||||
insist { subject.convert(data) } == data
|
||||
insist { subject.convert(data).encoding.name } == "UTF-8"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
context "with invalid UTF-8 source encoding" do
|
||||
subject do
|
||||
LogStash::Util::Charset.new("UTF-8").tap do |charset|
|
||||
charset.logger = logger
|
||||
end
|
||||
end
|
||||
|
||||
it "should escape invalid sequences" do
|
||||
["foo \xED\xB9\x81\xC3", "bar \xAD"].each do |data|
|
||||
insist { data.encoding.name } == "UTF-8"
|
||||
insist { data.valid_encoding? } == false
|
||||
logger.should_receive(:warn).twice
|
||||
insist { subject.convert(data) } == data.inspect[1..-2]
|
||||
insist { subject.convert(data).encoding.name } == "UTF-8"
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
context "with valid non UTF-8 source encoding" do
|
||||
subject {LogStash::Util::Charset.new("ISO-8859-1")}
|
||||
|
||||
it "should encode to UTF-8" do
|
||||
samples = [
|
||||
["foobar", "foobar"],
|
||||
["\xE0 Montr\xE9al", "à Montréal"],
|
||||
]
|
||||
samples.map{|(a, b)| [a.force_encoding("ISO-8859-1"), b]}.each do |(a, b)|
|
||||
insist { a.encoding.name } == "ISO-8859-1"
|
||||
insist { b.encoding.name } == "UTF-8"
|
||||
insist { a.valid_encoding? } == true
|
||||
insist { subject.convert(a).encoding.name } == "UTF-8"
|
||||
insist { subject.convert(a) } == b
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
context "with invalid non UTF-8 source encoding" do
|
||||
subject {LogStash::Util::Charset.new("ASCII-8BIT")}
|
||||
|
||||
it "should encode to UTF-8 and replace invalid chars" do
|
||||
samples = [
|
||||
["\xE0 Montr\xE9al", "<EFBFBD> Montr<74>al"],
|
||||
["\xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5", "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"],
|
||||
]
|
||||
samples.map{|(a, b)| [a.force_encoding("ASCII-8BIT"), b]}.each do |(a, b)|
|
||||
insist { a.encoding.name } == "ASCII-8BIT"
|
||||
insist { b.encoding.name } == "UTF-8"
|
||||
insist { subject.convert(a).encoding.name } == "UTF-8"
|
||||
insist { subject.convert(a) } == b
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Add table
Add a link
Reference in a new issue