[ML] Update Deberta tokenizer (#116358)

* Was using byte position for end of offset, but it seems like using char position is correct

* Update docs/changelog/116358.yaml

* Update UnigramTokenizer.java

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
This commit is contained in:
Max Hniebergall 2024-11-20 15:08:44 -05:00 committed by GitHub
parent 311412db2f
commit 770551498b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 8 additions and 1 deletions

View file

@@ -0,0 +1,5 @@
pr: 116358
summary: Update Deberta tokenizer
area: Machine Learning
type: bug
issues: []

View file

@@ -367,8 +367,10 @@ public final class UnigramTokenizer extends Tokenizer {
                         new DelimitedToken.Encoded(
                             Strings.format("<0x%02X>", bytes[i]),
                             pieces[i],
+                            // even though we are changing the number of characters in the output, we don't
+                            // need to change the offsets. The offsets refer to the input characters
                             offsetCorrection.apply(node.startsAtCharPos),
-                            offsetCorrection.apply(startsAtBytes + i)
+                            offsetCorrection.apply(endsAtChars)
                         )
                     );
                 }