Mirror of https://github.com/elastic/elasticsearch.git (synced 2025-06-28 17:34:17 -04:00)
[ML] Update Deberta tokenizer (#116358)
* Was using byte position for end of offset, but it seems like using char position is correct
* Update docs/changelog/116358.yaml
* Update UnigramTokenizer.java

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
commit 770551498b (parent 311412db2f)
2 changed files with 8 additions and 1 deletion
docs/changelog/116358.yaml (new file, +5)

@@ -0,0 +1,5 @@
+pr: 116358
+summary: Update Deberta tokenizer
+area: Machine Learning
+type: bug
+issues: []
UnigramTokenizer.java (+3, -1)

@@ -367,8 +367,10 @@ public final class UnigramTokenizer extends Tokenizer {
                         new DelimitedToken.Encoded(
                             Strings.format("<0x%02X>", bytes[i]),
                             pieces[i],
+                            // even though we are changing the number of characters in the output, we don't
+                            // need to change the offsets. The offsets refer to the input characters
                             offsetCorrection.apply(node.startsAtCharPos),
-                            offsetCorrection.apply(startsAtBytes + i)
+                            offsetCorrection.apply(endsAtChars)
                         )
                     );
                 }
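For context on why the fix matters, here is a minimal standalone sketch (not part of the commit; the class name OffsetSketch is hypothetical): in Java, char positions and UTF-8 byte positions only coincide for ASCII text, so an end offset computed from a byte position overshoots the input whenever it contains multi-byte characters.

import java.nio.charset.StandardCharsets;

public class OffsetSketch {
    public static void main(String[] args) {
        String ascii = "hello";
        String accented = "héllo"; // 'é' is 2 bytes in UTF-8 but 1 Java char

        System.out.println(ascii.length());                                   // 5 chars
        System.out.println(ascii.getBytes(StandardCharsets.UTF_8).length);    // 5 bytes
        System.out.println(accented.length());                                // 5 chars
        System.out.println(accented.getBytes(StandardCharsets.UTF_8).length); // 6 bytes

        // Using the byte count as a char end offset for "héllo" would yield 6,
        // one past the end of the 5-char string. That is the mismatch the diff
        // addresses by switching the end offset from a byte-based position
        // (startsAtBytes + i) to a char-based one (endsAtChars).
    }
}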