mirror of
https://github.com/elastic/logstash.git
synced 2025-04-24 06:37:19 -04:00
parent
fd4824c23f
commit
13ed78790f
3 changed files with 134 additions and 0 deletions
1
Gemfile
1
Gemfile
|
@ -15,3 +15,4 @@ gem "stud", "~> 0.0.19", :group => :build
|
|||
gem "fpm", "~> 1.3.3", :group => :build
|
||||
gem "rubyzip", "~> 1.1.7", :group => :build
|
||||
gem "gems", "~> 0.8.3", :group => :build
|
||||
gem "flores", "~> 0.0.4", :group => :development
|
80
lib/logstash/util/unicode_trimmer.rb
Normal file
80
lib/logstash/util/unicode_trimmer.rb
Normal file
|
@ -0,0 +1,80 @@
|
|||
module UnicodeTrimmer
|
||||
# The largest possible unicode chars are 4 bytes
|
||||
# http://stackoverflow.com/questions/9533258/what-is-the-maximum-number-of-bytes-for-a-utf-8-encoded-character
|
||||
# http://tools.ietf.org/html/rfc3629
|
||||
MAX_CHAR_BYTES = 4
|
||||
|
||||
# Takes a unicode string and makes sure it fits in a max of `desired_bytes`
|
||||
# This aims to be somewhat efficient about this for the average case and get as close to
|
||||
# O(1) as possible. Given certain distributions of multi-byte characters it'll be slower
|
||||
# It tries to find the point the truncation *should* happen based on the average byte size.
|
||||
# If that snips it in the wrong place it'll try to add or remove chars to get it to the right
|
||||
# spot and preserve as much data as possible.
|
||||
public
|
||||
def self.trim_bytes(orig_str, desired_bytes)
|
||||
return orig_str if orig_str.bytesize <= desired_bytes
|
||||
|
||||
pre_shortened = pre_shorten(orig_str, desired_bytes)
|
||||
|
||||
case pre_shortened.bytesize <=> desired_bytes
|
||||
when 0
|
||||
pre_shortened
|
||||
when 1
|
||||
shrink_bytes(pre_shortened, orig_str, desired_bytes)
|
||||
when -1
|
||||
grow_bytes(pre_shortened, orig_str, desired_bytes)
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
# Try to cut the string at the right place based on the avg. byte size
|
||||
def self.pre_shorten(orig_str, desired_bytes)
|
||||
# Compute the average size to get an idea of where should chop
|
||||
orig_len = orig_str.length
|
||||
orig_bs = orig_str.bytesize
|
||||
avg_size = (orig_bs.to_f / orig_len.to_f)
|
||||
|
||||
# Try to do an initial shortening based on the average char size
|
||||
# The goal here is to get us somewhere above or below the boundary quickly
|
||||
orig_extra_bytes = orig_bs - desired_bytes
|
||||
pre_shorten_by = (orig_extra_bytes / avg_size).to_i
|
||||
orig_str.slice(0, orig_len - pre_shorten_by)
|
||||
end
|
||||
|
||||
private
|
||||
def self.grow_bytes(pre_shortened, orig_str, desired_bytes)
|
||||
res_str = pre_shortened.clone()
|
||||
|
||||
loop do
|
||||
bs = res_str.bytesize
|
||||
deficit = desired_bytes - bs
|
||||
lengthen_by = deficit / MAX_CHAR_BYTES
|
||||
lengthen_by = 1 if lengthen_by < 1
|
||||
append = orig_str.slice(res_str.length, lengthen_by)
|
||||
|
||||
break if (bs + append.bytesize) > desired_bytes
|
||||
|
||||
res_str << append
|
||||
end
|
||||
|
||||
res_str
|
||||
end
|
||||
|
||||
private
|
||||
def self.shrink_bytes(pre_shortened, orig_str, desired_bytes)
|
||||
res_str = pre_shortened.clone()
|
||||
|
||||
loop do
|
||||
bs = res_str.bytesize
|
||||
break if bs <= desired_bytes
|
||||
|
||||
extra = bs - desired_bytes
|
||||
shorten_by = extra / MAX_CHAR_BYTES
|
||||
shorten_by = 1 if shorten_by < 1
|
||||
|
||||
res_str.slice!(res_str.length - shorten_by)
|
||||
end
|
||||
|
||||
res_str
|
||||
end
|
||||
end
|
53
spec/util/unicode_trimmer_spec.rb
Normal file
53
spec/util/unicode_trimmer_spec.rb
Normal file
|
@ -0,0 +1,53 @@
|
|||
# encoding: utf-8
|
||||
require "spec_helper"
|
||||
require "logstash/util/unicode_trimmer"
|
||||
require "flores/rspec"
|
||||
require "flores/random"
|
||||
|
||||
RSpec.configure do |config|
|
||||
Flores::RSpec.configure(config)
|
||||
end
|
||||
|
||||
describe "truncating unicode strings correctly" do
|
||||
context "with extra bytes before the snip" do
|
||||
let(:ustr) { "Testing «ταБЬℓσ»: 1<2 & 4+1>3, now 20% off!" }
|
||||
|
||||
it "should truncate to exact byte boundaries when possible" do
|
||||
expect(UnicodeTrimmer.trim_bytes(ustr, 21).bytesize).to eql(21)
|
||||
end
|
||||
|
||||
it "should truncate below the bytesize when splitting a byte" do
|
||||
expect(UnicodeTrimmer.trim_bytes(ustr, 20).bytesize).to eql(18)
|
||||
end
|
||||
|
||||
it "should not truncate the string when the bytesize is already OK" do
|
||||
expect(UnicodeTrimmer.trim_bytes(ustr, ustr.bytesize)).to eql(ustr)
|
||||
end
|
||||
end
|
||||
|
||||
context "with extra bytes after the snip" do
|
||||
let(:ustr) { ": 1<2 & 4+1>3, now 20% off! testing «ταБЬℓσ»" }
|
||||
|
||||
it "should truncate to exact byte boundaries when possible" do
|
||||
expect(UnicodeTrimmer.trim_bytes(ustr, 21).bytesize).to eql(21)
|
||||
end
|
||||
|
||||
it "should truncate below the bytesize when splitting a byte" do
|
||||
expect(UnicodeTrimmer.trim_bytes(ustr, 52).bytesize).to eql(51)
|
||||
end
|
||||
|
||||
it "should not truncate the string when the bytesize is already OK" do
|
||||
expect(UnicodeTrimmer.trim_bytes(ustr, ustr.bytesize)).to eql(ustr)
|
||||
end
|
||||
end
|
||||
|
||||
context "randomized testing" do
|
||||
let(:text) { Flores::Random.text(1..1000) }
|
||||
let(:size) { Flores::Random.integer(1..text.bytesize) }
|
||||
let(:expected_range) { (size - 4)..size }
|
||||
|
||||
stress_it "should be near the boundary of requested size" do
|
||||
expect(expected_range).to include(UnicodeTrimmer.trim_bytes(text, size).bytesize)
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Add table
Add a link
Reference in a new issue