Use optimized text in match_only_text fields (#129371)

Follow-up to #126492 to use the json parsing optimizations for match_only_text fields. Relates to #129072.
2025-06-27 17:10:22 -04:00 · 2025-06-17 08:15:40 -07:00 · 2025-06-17 08:15:40 -07:00 · 5d1999781a
commit 5d1999781a
parent 9f78d11639
7 changed files with 90 additions and 13 deletions
--- a/benchmarks/build.gradle
+++ b/benchmarks/build.gradle
@ -41,6 +41,7 @@ dependencies {
  }
  api(project(':libs:h3'))
  api(project(':modules:aggregations'))
+  implementation project(':modules:mapper-extras');
  api(project(':x-pack:plugin:esql-core'))
  api(project(':x-pack:plugin:core'))
  api(project(':x-pack:plugin:esql'))
--- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/MapperServiceFactory.java
+++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/MapperServiceFactory.java
@ -29,6 +29,7 @@ import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.index.mapper.ProvidedIdFieldMapper;
 import org.elasticsearch.index.similarity.SimilarityService;
 import org.elasticsearch.indices.IndicesModule;
+import org.elasticsearch.plugins.MapperPlugin;
 import org.elasticsearch.script.Script;
 import org.elasticsearch.script.ScriptCompiler;
 import org.elasticsearch.script.ScriptContext;
@ -38,11 +39,16 @@ import org.elasticsearch.xcontent.XContentParserConfiguration;
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.Collections;
+import java.util.List;
 import java.util.Map;

 public class MapperServiceFactory {

    public static MapperService create(String mappings) {
+        return create(mappings, Collections.emptyList());
+    }
+
+    public static MapperService create(String mappings, List<MapperPlugin> mapperPlugins) {
        Settings settings = Settings.builder()
            .put("index.number_of_replicas", 0)
            .put("index.number_of_shards", 1)
@ -51,7 +57,7 @@ public class MapperServiceFactory {
            .build();
        IndexMetadata meta = IndexMetadata.builder("index").settings(settings).build();
        IndexSettings indexSettings = new IndexSettings(meta, settings);
-        MapperRegistry mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry();
+        MapperRegistry mapperRegistry = new IndicesModule(mapperPlugins).getMapperRegistry();

        SimilarityService similarityService = new SimilarityService(indexSettings, null, Map.of());
        BitsetFilterCache bitsetFilterCache = new BitsetFilterCache(indexSettings, BitsetFilterCache.Listener.NOOP);
--- a/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java
+++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java
@ -15,6 +15,7 @@ import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.logging.LogConfigurator;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.index.mapper.SourceToParse;
+import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentFactory;
 import org.elasticsearch.xcontent.XContentType;
@ -34,6 +35,7 @@ import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;

 import java.io.IOException;
+import java.util.List;
 import java.util.Random;
 import java.util.concurrent.TimeUnit;

@ -66,7 +68,7 @@ public class OptimizedTextBenchmark {
    private SourceToParse[] sources;

    private String randomValue(int length) {
-        final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+        final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
        Random random = new Random();
        StringBuilder builder = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
@ -83,17 +85,17 @@ public class OptimizedTextBenchmark {
                    "dynamic": false,
                    "properties": {
                        "field": {
-                            "type": "keyword"
+                            "type": "match_only_text"
                        }
                    }
                }
            }
-            """);
+            """, List.of(new MapperExtrasPlugin()));

        sources = new SourceToParse[nDocs];
        for (int i = 0; i < nDocs; i++) {
            XContentBuilder b = XContentFactory.jsonBuilder();
-            b.startObject().field("field", randomValue(8)).endObject();
+            b.startObject().field("field", randomValue(512)).endObject();
            sources[i] = new SourceToParse(UUIDs.randomBase64UUID(), BytesReference.bytes(b), XContentType.JSON);
        }
    }
--- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java
+++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java
@ -31,6 +31,7 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOFunction;
 import org.elasticsearch.common.CheckedIntFunction;
 import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.text.UTF8DecodingReader;
 import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
@ -364,7 +365,7 @@ public class MatchOnlyTextFieldMapper extends FieldMapper {
        @Override
        public BlockLoader blockLoader(BlockLoaderContext blContext) {
            if (textFieldType.isSyntheticSource()) {
-                return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(storedFieldNameForSyntheticSource());
+                return new BlockStoredFieldsReader.BytesFromBytesRefsBlockLoader(storedFieldNameForSyntheticSource());
            }
            SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name()));
            // MatchOnlyText never has norms, so we have to use the field names field
@ -385,7 +386,7 @@ public class MatchOnlyTextFieldMapper extends FieldMapper {
                ) {
                    @Override
                    protected BytesRef storedToBytesRef(Object stored) {
-                        return new BytesRef((String) stored);
+                        return (BytesRef) stored;
                    }
                };
            }
@ -443,18 +444,20 @@ public class MatchOnlyTextFieldMapper extends FieldMapper {

    @Override
    protected void parseCreateField(DocumentParserContext context) throws IOException {
-        final String value = context.parser().textOrNull();
+        final var value = context.parser().optimizedTextOrNull();

        if (value == null) {
            return;
        }

-        Field field = new Field(fieldType().name(), value, fieldType);
+        final var utfBytes = value.bytes();
+        Field field = new Field(fieldType().name(), new UTF8DecodingReader(utfBytes), fieldType);
        context.doc().add(field);
        context.addToFieldNames(fieldType().name());

        if (storeSource) {
-            context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), value));
+            final var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
+            context.doc().add(new StoredField(fieldType().storedFieldNameForSyntheticSource(), bytesRef));
        }
    }

@ -474,7 +477,7 @@ public class MatchOnlyTextFieldMapper extends FieldMapper {
            () -> new StringStoredFieldFieldLoader(fieldType().storedFieldNameForSyntheticSource(), fieldType().name(), leafName()) {
                @Override
                protected void write(XContentBuilder b, Object value) throws IOException {
-                    b.value((String) value);
+                    b.value(((BytesRef) value).utf8ToString());
                }
            }
        );
--- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java
+++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java
@ -41,6 +41,7 @@ import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOFunction;
 import org.elasticsearch.common.CheckedIntFunction;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
@ -438,7 +439,13 @@ public final class SourceConfirmedTextQuery extends Query {
                    if (value == null) {
                        continue;
                    }
-                    cacheEntry.memoryIndex.addField(field, value.toString(), indexAnalyzer);
+                    String valueStr;
+                    if (value instanceof BytesRef valueRef) {
+                        valueStr = valueRef.utf8ToString();
+                    } else {
+                        valueStr = value.toString();
+                    }
+                    cacheEntry.memoryIndex.addField(field, valueStr, indexAnalyzer);
                }
            }
            return cacheEntry.memoryIndex;
--- a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapperTests.java
+++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapperTests.java
@ -123,7 +123,12 @@ public class MatchOnlyTextFieldMapperTests extends MapperTestCase {
        ParsedDocument doc = mapper.parse(source(b -> b.field("field", "1234")));
        List<IndexableField> fields = doc.rootDoc().getFields("field");
        assertEquals(1, fields.size());
-        assertEquals("1234", fields.get(0).stringValue());
+
+        var reader = fields.get(0).readerValue();
+        char[] buff = new char[20];
+        assertEquals(4, reader.read(buff));
+        assertEquals("1234", new String(buff, 0, 4));
+
        IndexableFieldType fieldType = fields.get(0).fieldType();
        assertThat(fieldType.omitNorms(), equalTo(true));
        assertTrue(fieldType.tokenized());
--- a/server/src/main/java/org/elasticsearch/common/text/UTF8DecodingReader.java
+++ b/server/src/main/java/org/elasticsearch/common/text/UTF8DecodingReader.java
@ -0,0 +1,53 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.common.text;
+
+import org.elasticsearch.xcontent.XContentString;
+
+import java.io.Reader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Reader that decodes UTF-8 formatted bytes into chars.
+ * <p>
+ * Malformed or unmappable byte sequences are decoded to the decoder's replacement character instead of being reported,
+ * so every call to {@code read} with a non-empty destination either produces at least one char or signals end of stream.
+ * A supplementary code point that does not fit into a destination with room for a single char is split across two reads:
+ * the high surrogate is returned first and the low surrogate on the following call.
+ * <p>
+ * Reading advances the position of the wrapped {@link ByteBuffer}. Instances hold mutable decoding state without any
+ * synchronization and must not be shared between threads.
+ */
+public final class UTF8DecodingReader extends Reader {
+    /** Sentinel for {@link #pendingLowSurrogate} meaning that no char is waiting to be returned. */
+    private static final int NO_PENDING_CHAR = -1;
+
+    // REPLACE (rather than the default REPORT) guarantees that invalid bytes are consumed; with REPORT the decoder stops
+    // in front of them and a caller looping until -1 would never terminate.
+    private final CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
+        .onMalformedInput(CodingErrorAction.REPLACE)
+        .onUnmappableCharacter(CodingErrorAction.REPLACE);
+    private final ByteBuffer bytes;
+    /** Scratch space for decoding one surrogate pair when the destination has room for a single char only. */
+    private final CharBuffer surrogateScratch = CharBuffer.allocate(2);
+    /** Low surrogate decoded by a previous read that still has to be handed out, or {@link #NO_PENDING_CHAR}. */
+    private int pendingLowSurrogate = NO_PENDING_CHAR;
+
+    /**
+     * @param bytes UTF-8 encoded bytes to decode; the buffer is consumed (its position advances) as the reader is read
+     */
+    public UTF8DecodingReader(ByteBuffer bytes) {
+        this.bytes = bytes;
+    }
+
+    /**
+     * @param utf8bytes UTF-8 encoded slice to decode; the backing array is wrapped, not copied
+     */
+    public UTF8DecodingReader(XContentString.UTF8Bytes utf8bytes) {
+        this.bytes = ByteBuffer.wrap(utf8bytes.bytes(), utf8bytes.offset(), utf8bytes.length());
+    }
+
+    /**
+     * Decodes up to {@code len} chars into {@code cbuf} starting at {@code off}.
+     *
+     * @return the number of chars stored, {@code 0} if {@code len} is zero, or {@code -1} once all input has been decoded
+     */
+    @Override
+    public int read(char[] cbuf, int off, int len) {
+        return read(CharBuffer.wrap(cbuf, off, len));
+    }
+
+    /**
+     * Decodes as many chars as fit into the remaining space of {@code cbuf}.
+     *
+     * @return the number of chars stored, {@code 0} if {@code cbuf} has no remaining space, or {@code -1} once all input
+     *         has been decoded
+     */
+    @Override
+    public int read(CharBuffer cbuf) {
+        // Reader contract: a zero-length request reads nothing and returns 0, even at end of stream.
+        if (cbuf.hasRemaining() == false) {
+            return 0;
+        }
+
+        final int startPos = cbuf.position();
+        if (pendingLowSurrogate != NO_PENDING_CHAR) {
+            cbuf.put((char) pendingLowSurrogate);
+            pendingLowSurrogate = NO_PENDING_CHAR;
+        }
+
+        if (bytes.hasRemaining()) {
+            decoder.decode(bytes, cbuf, true);
+            if (cbuf.position() == startPos) {
+                // No progress although input and one char of space are available: the next code point needs two chars.
+                decodeSplitSurrogatePair(cbuf);
+            }
+        }
+
+        final int charsRead = cbuf.position() - startPos;
+        return charsRead == 0 ? -1 : charsRead;
+    }
+
+    /**
+     * Decodes the next code point into the scratch buffer, stores its first char in {@code cbuf} and keeps the second
+     * char, if any, for the next read. Expects {@code cbuf} to have room for at least one char.
+     */
+    private void decodeSplitSurrogatePair(CharBuffer cbuf) {
+        surrogateScratch.clear();
+        decoder.decode(bytes, surrogateScratch, true);
+        surrogateScratch.flip();
+        if (surrogateScratch.hasRemaining()) {
+            cbuf.put(surrogateScratch.get());
+        }
+        if (surrogateScratch.hasRemaining()) {
+            pendingLowSurrogate = surrogateScratch.get();
+        }
+    }
+
+    /** No-op: the reader holds no resources beyond the wrapped buffer. */
+    @Override
+    public void close() {}
+}