From 5d1999781aed30ff7c656e61a593ba0af02c2fea Mon Sep 17 00:00:00 2001 From: Jordan Powers Date: Tue, 17 Jun 2025 08:15:40 -0700 Subject: [PATCH] Use optimized text in match_only_text fields (#129371) Follow-up to #126492 to use the json parsing optimizations for match_only_text fields. Relates to #129072. --- benchmarks/build.gradle | 1 + .../index/mapper/MapperServiceFactory.java | 8 ++- .../xcontent/OptimizedTextBenchmark.java | 10 ++-- .../extras/MatchOnlyTextFieldMapper.java | 15 +++--- .../extras/SourceConfirmedTextQuery.java | 9 +++- .../extras/MatchOnlyTextFieldMapperTests.java | 7 ++- .../common/text/UTF8DecodingReader.java | 53 +++++++++++++++++++ 7 files changed, 90 insertions(+), 13 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/common/text/UTF8DecodingReader.java diff --git a/benchmarks/build.gradle b/benchmarks/build.gradle index 091a061a29b6..ccba1fb82db5 100644 --- a/benchmarks/build.gradle +++ b/benchmarks/build.gradle @@ -41,6 +41,7 @@ dependencies { } api(project(':libs:h3')) api(project(':modules:aggregations')) + implementation project(':modules:mapper-extras'); api(project(':x-pack:plugin:esql-core')) api(project(':x-pack:plugin:core')) api(project(':x-pack:plugin:esql')) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/MapperServiceFactory.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/MapperServiceFactory.java index 74cea5d5f154..53aecb936d71 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/MapperServiceFactory.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/MapperServiceFactory.java @@ -29,6 +29,7 @@ import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.ProvidedIdFieldMapper; import org.elasticsearch.index.similarity.SimilarityService; import org.elasticsearch.indices.IndicesModule; +import org.elasticsearch.plugins.MapperPlugin; import 
org.elasticsearch.script.Script; import org.elasticsearch.script.ScriptCompiler; import org.elasticsearch.script.ScriptContext; @@ -38,11 +39,16 @@ import org.elasticsearch.xcontent.XContentParserConfiguration; import java.io.IOException; import java.io.UncheckedIOException; import java.util.Collections; +import java.util.List; import java.util.Map; public class MapperServiceFactory { public static MapperService create(String mappings) { + return create(mappings, Collections.emptyList()); + } + + public static MapperService create(String mappings, List mapperPlugins) { Settings settings = Settings.builder() .put("index.number_of_replicas", 0) .put("index.number_of_shards", 1) @@ -51,7 +57,7 @@ public class MapperServiceFactory { .build(); IndexMetadata meta = IndexMetadata.builder("index").settings(settings).build(); IndexSettings indexSettings = new IndexSettings(meta, settings); - MapperRegistry mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry(); + MapperRegistry mapperRegistry = new IndicesModule(mapperPlugins).getMapperRegistry(); SimilarityService similarityService = new SimilarityService(indexSettings, null, Map.of()); BitsetFilterCache bitsetFilterCache = new BitsetFilterCache(indexSettings, BitsetFilterCache.Listener.NOOP); diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java index 9984205445fc..d424c93f6723 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java @@ -15,6 +15,7 @@ import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.logging.LogConfigurator; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.SourceToParse; +import 
org.elasticsearch.index.mapper.extras.MapperExtrasPlugin; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentFactory; import org.elasticsearch.xcontent.XContentType; @@ -34,6 +35,7 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; import java.io.IOException; +import java.util.List; import java.util.Random; import java.util.concurrent.TimeUnit; @@ -66,7 +68,7 @@ public class OptimizedTextBenchmark { private SourceToParse[] sources; private String randomValue(int length) { - final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; Random random = new Random(); StringBuilder builder = new StringBuilder(length); for (int i = 0; i < length; i++) { @@ -83,17 +85,17 @@ public class OptimizedTextBenchmark { "dynamic": false, "properties": { "field": { - "type": "keyword" + "type": "match_only_text" } } } } - """); + """, List.of(new MapperExtrasPlugin())); sources = new SourceToParse[nDocs]; for (int i = 0; i < nDocs; i++) { XContentBuilder b = XContentFactory.jsonBuilder(); - b.startObject().field("field", randomValue(8)).endObject(); + b.startObject().field("field", randomValue(512)).endObject(); sources[i] = new SourceToParse(UUIDs.randomBase64UUID(), BytesReference.bytes(b), XContentType.JSON); } } diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java index 47b5be1a89c6..3333b004df40 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java @@ -31,6 +31,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOFunction; 
import org.elasticsearch.common.CheckedIntFunction; import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.text.UTF8DecodingReader; import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.analysis.IndexAnalyzers; @@ -364,7 +365,7 @@ public class MatchOnlyTextFieldMapper extends FieldMapper { @Override public BlockLoader blockLoader(BlockLoaderContext blContext) { if (textFieldType.isSyntheticSource()) { - return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(storedFieldNameForSyntheticSource()); + return new BlockStoredFieldsReader.BytesFromBytesRefsBlockLoader(storedFieldNameForSyntheticSource()); } SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name())); // MatchOnlyText never has norms, so we have to use the field names field @@ -385,7 +386,7 @@ public class MatchOnlyTextFieldMapper extends FieldMapper { ) { @Override protected BytesRef storedToBytesRef(Object stored) { - return new BytesRef((String) stored); + return (BytesRef) stored; } }; } @@ -443,18 +444,20 @@ public class MatchOnlyTextFieldMapper extends FieldMapper { @Override protected void parseCreateField(DocumentParserContext context) throws IOException { - final String value = context.parser().textOrNull(); + final var value = context.parser().optimizedTextOrNull(); if (value == null) { return; } - Field field = new Field(fieldType().name(), value, fieldType); + final var utfBytes = value.bytes(); + Field field = new Field(fieldType().name(), new UTF8DecodingReader(utfBytes), fieldType); context.doc().add(field); context.addToFieldNames(fieldType().name()); if (storeSource) { - context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), value)); + final var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length()); + context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), bytesRef)); } } @@ -474,7 
+477,7 @@ public class MatchOnlyTextFieldMapper extends FieldMapper { () -> new StringStoredFieldFieldLoader(fieldType().storedFieldNameForSyntheticSource(), fieldType().name(), leafName()) { @Override protected void write(XContentBuilder b, Object value) throws IOException { - b.value((String) value); + b.value(((BytesRef) value).utf8ToString()); } } ); diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java index b35be28d4b59..2628de589d77 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java @@ -41,6 +41,7 @@ import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.similarities.Similarity.SimScorer; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOFunction; import org.elasticsearch.common.CheckedIntFunction; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; @@ -438,7 +439,13 @@ public final class SourceConfirmedTextQuery extends Query { if (value == null) { continue; } - cacheEntry.memoryIndex.addField(field, value.toString(), indexAnalyzer); + String valueStr; + if (value instanceof BytesRef valueRef) { + valueStr = valueRef.utf8ToString(); + } else { + valueStr = value.toString(); + } + cacheEntry.memoryIndex.addField(field, valueStr, indexAnalyzer); } } return cacheEntry.memoryIndex; diff --git a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapperTests.java b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapperTests.java index 22841f8c42bf..f427c6c1b7c0 100644 --- 
a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapperTests.java +++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapperTests.java @@ -123,7 +123,12 @@ public class MatchOnlyTextFieldMapperTests extends MapperTestCase { ParsedDocument doc = mapper.parse(source(b -> b.field("field", "1234"))); List fields = doc.rootDoc().getFields("field"); assertEquals(1, fields.size()); - assertEquals("1234", fields.get(0).stringValue()); + + var reader = fields.get(0).readerValue(); + char[] buff = new char[20]; + assertEquals(4, reader.read(buff)); + assertEquals("1234", new String(buff, 0, 4)); + IndexableFieldType fieldType = fields.get(0).fieldType(); assertThat(fieldType.omitNorms(), equalTo(true)); assertTrue(fieldType.tokenized()); diff --git a/server/src/main/java/org/elasticsearch/common/text/UTF8DecodingReader.java b/server/src/main/java/org/elasticsearch/common/text/UTF8DecodingReader.java new file mode 100644 index 000000000000..8f85f74b17b8 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/common/text/UTF8DecodingReader.java @@ -0,0 +1,53 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.common.text; + +import org.elasticsearch.xcontent.XContentString; + +import java.io.Reader; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.StandardCharsets; + +/** + * Reader that decodes UTF-8 formatted bytes into chars. 
+ *
+ * <p>Malformed or truncated byte sequences are decoded as U+FFFD (the decoder's replacement character)
+ * instead of stalling the reader, which mirrors what {@code new String(bytes, StandardCharsets.UTF_8)} does.
+ * A non-empty destination always receives at least one char while input remains, even when the next
+ * code point needs a surrogate pair and only one slot is free.
+ */
+public final class UTF8DecodingReader extends Reader {
+    /** Marker meaning that no decoded char is waiting in {@link #pendingChar}. */
+    private static final int NO_PENDING_CHAR = -1;
+
+    // REPLACE rather than the default REPORT: with REPORT the decoder stops in front of the offending
+    // bytes and, because the CoderResult is not inspected, every later read would return 0 forever.
+    private final CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
+        .onMalformedInput(java.nio.charset.CodingErrorAction.REPLACE)
+        .onUnmappableCharacter(java.nio.charset.CodingErrorAction.REPLACE);
+    private final ByteBuffer bytes;
+    /** Second half of a surrogate pair that did not fit into the previous destination, or {@link #NO_PENDING_CHAR}. */
+    private int pendingChar = NO_PENDING_CHAR;
+
+    /**
+     * Creates a reader over the remaining content of {@code bytes}.
+     * The buffer is not copied; reading advances its position.
+     *
+     * @param bytes UTF-8 encoded input
+     */
+    public UTF8DecodingReader(ByteBuffer bytes) {
+        this.bytes = bytes;
+    }
+
+    /**
+     * Creates a reader over the {@code offset}/{@code length} slice of the array backing {@code utf8bytes}.
+     *
+     * @param utf8bytes UTF-8 encoded input; the backing array is wrapped, not copied
+     */
+    public UTF8DecodingReader(XContentString.UTF8Bytes utf8bytes) {
+        this.bytes = ByteBuffer.wrap(utf8bytes.bytes(), utf8bytes.offset(), utf8bytes.length());
+    }
+
+    /**
+     * Decodes up to {@code len} chars into {@code cbuf} starting at {@code off}.
+     *
+     * @return the number of chars written, or {@code -1} once all input has been consumed
+     */
+    @Override
+    public int read(char[] cbuf, int off, int len) {
+        return read(CharBuffer.wrap(cbuf, off, len));
+    }
+
+    /**
+     * Decodes as many chars as fit into the remaining space of {@code cbuf}.
+     *
+     * @return the number of chars written, or {@code -1} once all input has been consumed
+     */
+    @Override
+    public int read(CharBuffer cbuf) {
+        if (pendingChar == NO_PENDING_CHAR && bytes.hasRemaining() == false) {
+            return -1;
+        }
+        if (cbuf.hasRemaining() == false) {
+            return 0;
+        }
+
+        int startPos = cbuf.position();
+        if (pendingChar != NO_PENDING_CHAR) {
+            // Finish the surrogate pair that was split by the previous call.
+            cbuf.put((char) pendingChar);
+            pendingChar = NO_PENDING_CHAR;
+        }
+
+        // endOfInput stays true on every call: the whole input is available up front.
+        decoder.decode(bytes, cbuf, true);
+
+        if (cbuf.position() == startPos && bytes.hasRemaining()) {
+            // No progress although there is room and input: the next code point needs two chars but only
+            // one slot is free. Decode it into a scratch pair, hand out the first char now and keep the
+            // second one for the next call, so callers never see a 0-length read here.
+            CharBuffer pair = CharBuffer.allocate(2);
+            decoder.decode(bytes, pair, true);
+            pair.flip();
+            if (pair.hasRemaining()) {
+                cbuf.put(pair.get());
+            }
+            if (pair.hasRemaining()) {
+                pendingChar = pair.get();
+            }
+        }
+        return cbuf.position() - startPos;
+    }
+
+    /** No-op: the reader holds no resources beyond the wrapped buffer. */
+    @Override
+    public void close() {}
+}