Add unicode trimmer utility function

Fixes #3505
2025-04-24 06:37:19 -04:00 · 2015-06-26 14:03:58 -05:00 · 2015-06-26 14:03:58 -05:00 · 13ed78790f
commit 13ed78790f
parent fd4824c23f
3 changed files with 134 additions and 0 deletions
--- a/1
+++ b/1
@ -15,3 +15,4 @@ gem "stud", "~> 0.0.19", :group => :build
 gem "fpm", "~> 1.3.3", :group => :build
 gem "rubyzip", "~> 1.1.7", :group => :build
 gem "gems", "~> 0.8.3", :group => :build
+gem "flores", "~> 0.0.4", :group => :development
--- a/lib/logstash/util/unicode_trimmer.rb
+++ b/lib/logstash/util/unicode_trimmer.rb
@ -0,0 +1,80 @@
+module UnicodeTrimmer
+  # The largest possible unicode chars are 4 bytes
+  # http://stackoverflow.com/questions/9533258/what-is-the-maximum-number-of-bytes-for-a-utf-8-encoded-character
+  # http://tools.ietf.org/html/rfc3629
+  MAX_CHAR_BYTES = 4
+
+  # Takes a unicode string and makes sure it fits in a max of `desired_bytes`
+  # This aims to be somewhat efficient about this for the average case and get as close to
+  # O(1) as possible. Given certain distributions of multi-byte characters it'll be slower
+  # It tries to find the point the truncation *should* happen based on the average byte size.
+  # If that snips it in the wrong place it'll try to add or remove chars to get it to the right
+  # spot and preserve as much data as possible.
+  public
+  def self.trim_bytes(orig_str, desired_bytes)
+    return orig_str if orig_str.bytesize <= desired_bytes
+
+    pre_shortened = pre_shorten(orig_str, desired_bytes)
+
+    case pre_shortened.bytesize <=> desired_bytes
+    when 0
+      pre_shortened
+    when 1
+      shrink_bytes(pre_shortened, orig_str, desired_bytes)
+    when -1
+      grow_bytes(pre_shortened, orig_str, desired_bytes)
+    end
+  end
+
+  private
+  # Try to cut the string at the right place based on the avg. byte size
+  def self.pre_shorten(orig_str, desired_bytes)
+    # Compute the average size to get an idea of where should chop
+    orig_len = orig_str.length
+    orig_bs = orig_str.bytesize
+    avg_size = (orig_bs.to_f / orig_len.to_f)
+
+    # Try to do an initial shortening based on the average char size
+    # The goal here is to get us somewhere above or below the boundary quickly
+    orig_extra_bytes = orig_bs - desired_bytes
+    pre_shorten_by = (orig_extra_bytes  / avg_size).to_i
+    orig_str.slice(0, orig_len - pre_shorten_by)
+  end
+
+  private
+  def self.grow_bytes(pre_shortened, orig_str, desired_bytes)
+    res_str = pre_shortened.clone()
+
+    loop do
+      bs = res_str.bytesize
+      deficit = desired_bytes - bs
+      lengthen_by = deficit / MAX_CHAR_BYTES
+      lengthen_by = 1 if lengthen_by < 1
+      append = orig_str.slice(res_str.length, lengthen_by)
+
+      break if (bs + append.bytesize) > desired_bytes
+
+      res_str << append
+    end
+
+    res_str
+  end
+
+  private
+  def self.shrink_bytes(pre_shortened, orig_str, desired_bytes)
+    res_str = pre_shortened.clone()
+
+    loop do
+      bs = res_str.bytesize
+      break if bs <= desired_bytes
+
+      extra = bs - desired_bytes
+      shorten_by = extra / MAX_CHAR_BYTES
+      shorten_by = 1 if shorten_by < 1
+
+      res_str.slice!(res_str.length - shorten_by)
+    end
+
+    res_str
+  end
+end
--- a/spec/util/unicode_trimmer_spec.rb
+++ b/spec/util/unicode_trimmer_spec.rb
@ -0,0 +1,53 @@
+# encoding: utf-8
+require "spec_helper"
+require "logstash/util/unicode_trimmer"
+require "flores/rspec"
+require "flores/random"
+
+RSpec.configure do |config|
+  Flores::RSpec.configure(config)
+end
+
+describe "truncating unicode strings correctly" do
+  context "with extra bytes before the snip" do
+    let(:ustr) { "Testing «ταБЬℓσ»: 1<2 & 4+1>3, now 20% off!" }
+
+    it "should truncate to exact byte boundaries when possible" do
+      expect(UnicodeTrimmer.trim_bytes(ustr, 21).bytesize).to eql(21)
+    end
+
+    it "should truncate below the bytesize when splitting a byte" do
+      expect(UnicodeTrimmer.trim_bytes(ustr, 20).bytesize).to eql(18)
+    end
+
+    it "should not truncate the string when the bytesize is already OK" do
+      expect(UnicodeTrimmer.trim_bytes(ustr, ustr.bytesize)).to eql(ustr)
+    end
+  end
+
+  context "with extra bytes after the snip" do
+    let(:ustr) { ": 1<2 & 4+1>3, now 20% off! testing «ταБЬℓσ»" }
+
+    it "should truncate to exact byte boundaries when possible" do
+      expect(UnicodeTrimmer.trim_bytes(ustr, 21).bytesize).to eql(21)
+    end
+
+    it "should truncate below the bytesize when splitting a byte" do
+      expect(UnicodeTrimmer.trim_bytes(ustr, 52).bytesize).to eql(51)
+    end
+
+    it "should not truncate the string when the bytesize is already OK" do
+      expect(UnicodeTrimmer.trim_bytes(ustr, ustr.bytesize)).to eql(ustr)
+    end
+  end
+
+  context "randomized testing" do
+    let(:text) { Flores::Random.text(1..1000) }
+    let(:size) { Flores::Random.integer(1..text.bytesize) }
+    let(:expected_range) { (size - 4)..size }
+
+    stress_it "should be near the boundary of requested size" do
+      expect(expected_range).to include(UnicodeTrimmer.trim_bytes(text, size).bytesize)
+    end
+  end
+end