Mirror of https://github.com/elastic/elasticsearch.git (synced 2025-06-28 17:34:17 -04:00)
[ML] Update Deberta tokenizer (#116358)
* Was using byte position for end of offset, but it seems like using char position is correct
* Update docs/changelog/116358.yaml
* Update UnigramTokenizer.java

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
commit 770551498b (parent 311412db2f)
2 changed files with 8 additions and 1 deletion
docs/changelog/116358.yaml (new file, +5)

@@ -0,0 +1,5 @@
+pr: 116358
+summary: Update Deberta tokenizer
+area: Machine Learning
+type: bug
+issues: []
UnigramTokenizer.java (+3, -1)

@@ -367,8 +367,10 @@ public final class UnigramTokenizer extends Tokenizer {
                         new DelimitedToken.Encoded(
                             Strings.format("<0x%02X>", bytes[i]),
                             pieces[i],
+                            // even though we are changing the number of characters in the output, we don't
+                            // need to change the offsets. The offsets refer to the input characters
                             offsetCorrection.apply(node.startsAtCharPos),
-                            offsetCorrection.apply(startsAtBytes + i)
+                            offsetCorrection.apply(endsAtChars)
                         )
                     );
                 }
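For context on why the fix matters, here is a minimal standalone sketch (not part of the commit; the class name OffsetSketch is hypothetical): in Java, char positions and UTF-8 byte positions only coincide for ASCII text, so an end offset computed from a byte position overshoots the input whenever it contains multi-byte characters.

import java.nio.charset.StandardCharsets;

public class OffsetSketch {
    public static void main(String[] args) {
        String ascii = "hello";
        String accented = "héllo"; // 'é' is 2 bytes in UTF-8 but 1 Java char

        System.out.println(ascii.length());                                   // 5 chars
        System.out.println(ascii.getBytes(StandardCharsets.UTF_8).length);    // 5 bytes
        System.out.println(accented.length());                                // 5 chars
        System.out.println(accented.getBytes(StandardCharsets.UTF_8).length); // 6 bytes

        // Using the byte count as a char end offset for "héllo" would yield 6,
        // one past the end of the 5-char string. That is the mismatch the diff
        // addresses by switching the end offset from a byte-based position
        // (startsAtBytes + i) to a char-based one (endsAtChars).
    }
}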