Implement synthetic source support for annotated text field (#107735)

This PR adds synthetic source support for annotated_text fields. The existing implementation for text fields is reused, including its test infrastructure, so the majority of the change consists of moving code and making it accessible.

Contributes to #106460, #78744.
This commit is contained in:
Oleksandr Kolomiiets 2024-04-25 10:31:27 -07:00 committed by GitHub
parent 4ef8b3825e
commit e1d902d33b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 824 additions and 300 deletions

View file

@ -0,0 +1,19 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
/**
 * Module descriptor for the annotated text mapper.
 * <p>
 * The module depends on the Elasticsearch base, server and x-content modules and on the
 * Lucene core and highlighter modules. It exports no packages; its only outward-facing
 * declaration is the {@code FeatureSpecification} service implementation below.
 */
module org.elasticsearch.index.mapper.annotatedtext {
requires org.elasticsearch.base;
requires org.elasticsearch.server;
requires org.elasticsearch.xcontent;
requires org.apache.lucene.core;
requires org.apache.lucene.highlighter;
// exports nothing
// Registers Features as this module's FeatureSpecification service provider.
provides org.elasticsearch.features.FeatureSpecification with org.elasticsearch.index.mapper.annotatedtext.Features;
}

View file

@ -21,17 +21,22 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.features.NodeFeature;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.DocumentParserContext;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MapperBuilderContext;
import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.StringStoredFieldFieldLoader;
import org.elasticsearch.index.mapper.TextFieldMapper;
import org.elasticsearch.index.mapper.TextParams;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.xcontent.XContentBuilder;
import java.io.IOException;
import java.io.Reader;
@ -41,6 +46,7 @@ import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -58,6 +64,8 @@ import java.util.regex.Pattern;
**/
public class AnnotatedTextFieldMapper extends FieldMapper {
/** Node feature identifying synthetic source support for {@code annotated_text} fields. */
public static final NodeFeature SYNTHETIC_SOURCE_SUPPORT = new NodeFeature("mapper.annotated_text.synthetic_source");
public static final String CONTENT_TYPE = "annotated_text";
private static Builder builder(FieldMapper in) {
@ -114,7 +122,7 @@ public class AnnotatedTextFieldMapper extends FieldMapper {
meta };
}
private AnnotatedTextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext context) {
private AnnotatedTextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext context, MultiFields multiFields) {
TextSearchInfo tsi = new TextSearchInfo(
fieldType,
similarity.get(),
@ -126,12 +134,14 @@ public class AnnotatedTextFieldMapper extends FieldMapper {
store.getValue(),
tsi,
context.isSourceSynthetic(),
TextFieldMapper.SyntheticSourceHelper.syntheticSourceDelegate(fieldType, multiFields),
meta.getValue()
);
}
@Override
public AnnotatedTextFieldMapper build(MapperBuilderContext context) {
MultiFields multiFields = multiFieldsBuilder.build(this, context);
FieldType fieldType = TextParams.buildFieldType(() -> true, store, indexOptions, norms, termVectors);
if (fieldType.indexOptions() == IndexOptions.NONE) {
throw new IllegalArgumentException("[" + CONTENT_TYPE + "] fields must be indexed");
@ -146,8 +156,8 @@ public class AnnotatedTextFieldMapper extends FieldMapper {
return new AnnotatedTextFieldMapper(
name(),
fieldType,
buildFieldType(fieldType, context),
multiFieldsBuilder.build(this, context),
buildFieldType(fieldType, context, multiFields),
multiFields,
copyTo,
this
);
@ -472,15 +482,15 @@ public class AnnotatedTextFieldMapper extends FieldMapper {
}
public static final class AnnotatedTextFieldType extends TextFieldMapper.TextFieldType {
private AnnotatedTextFieldType(
String name,
boolean store,
TextSearchInfo tsi,
boolean isSyntheticSource,
KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate,
Map<String, String> meta
) {
super(name, true, store, tsi, isSyntheticSource, null, meta, false, false);
super(name, true, store, tsi, isSyntheticSource, syntheticSourceDelegate, meta, false, false);
}
public AnnotatedTextFieldType(String name, Map<String, String> meta) {
@ -544,4 +554,36 @@ public class AnnotatedTextFieldMapper extends FieldMapper {
public FieldMapper.Builder getMergeBuilder() {
return new Builder(simpleName(), builder.indexCreatedVersion, builder.analyzers.indexAnalyzers).init(this);
}
/**
 * Chooses how this field's value is reconstructed for synthetic {@code _source}.
 * <p>
 * A stored field is read back from its own stored values. Otherwise the loader of the keyword
 * sub-field returned by {@code TextFieldMapper.SyntheticSourceHelper} is reused.
 *
 * @return the loader that writes this field into synthetic source
 * @throws IllegalArgumentException if the field declares {@code copy_to}, or if it is neither
 *         stored nor backed by a usable keyword sub-field
 */
@Override
public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() {
    final boolean declaresCopyTo = copyTo.copyToFields().isEmpty() == false;
    if (declaresCopyTo) {
        throw new IllegalArgumentException(
            "field [" + name() + "] of type [" + typeName() + "] doesn't support synthetic source because it declares copy_to"
        );
    }

    if (fieldType.stored() == false) {
        // Not stored: the only remaining option is a suitable keyword sub-field.
        var keywordMapper = TextFieldMapper.SyntheticSourceHelper.getKeywordFieldMapperForSyntheticSource(this);
        if (keywordMapper == null) {
            final String message = String.format(
                Locale.ROOT,
                "field [%s] of type [%s] doesn't support synthetic source unless it is stored or has a sub-field of"
                    + " type [keyword] with doc values or stored and without a normalizer",
                name(),
                typeName()
            );
            throw new IllegalArgumentException(message);
        }
        return keywordMapper.syntheticFieldLoader(simpleName());
    }

    // Stored: emit each stored value as a plain string.
    return new StringStoredFieldFieldLoader(name(), simpleName(), null) {
        @Override
        protected void write(XContentBuilder b, Object value) throws IOException {
            b.value((String) value);
        }
    };
}
}

View file

@ -0,0 +1,26 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.index.mapper.annotatedtext;
import org.elasticsearch.features.FeatureSpecification;
import org.elasticsearch.features.NodeFeature;
import java.util.Set;
/**
 * Feature specification for the annotated text mapper: publishes the node features
 * that this mapper contributes.
 */
public class Features implements FeatureSpecification {

    /**
     * @return a set containing only {@link AnnotatedTextFieldMapper#SYNTHETIC_SOURCE_SUPPORT}
     */
    @Override
    public Set<NodeFeature> getFeatures() {
        // Added in 8.15
        final NodeFeature syntheticSourceSupport = AnnotatedTextFieldMapper.SYNTHETIC_SOURCE_SUPPORT;
        return Set.of(syntheticSourceSupport);
    }
}

View file

@ -0,0 +1,9 @@
#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0 and the Server Side Public License, v 1; you may not use this file except
# in compliance with, at your election, the Elastic License 2.0 or the Server
# Side Public License, v 1.
#
org.elasticsearch.index.mapper.annotatedtext.Features

View file

@ -14,6 +14,7 @@ import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
@ -29,6 +30,7 @@ import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.LowercaseNormalizer;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
@ -38,6 +40,7 @@ import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.MapperTestCase;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.index.mapper.TextFieldFamilySyntheticSourceTestSetup;
import org.elasticsearch.index.mapper.TextFieldMapper;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.xcontent.ToXContent;
@ -54,6 +57,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
@ -144,7 +148,8 @@ public class AnnotatedTextFieldMapperTests extends MapperTestCase {
)
);
return IndexAnalyzers.of(
Map.of("default", dflt, "standard", standard, "keyword", keyword, "whitespace", whitespace, "my_stop_analyzer", stop)
Map.of("default", dflt, "standard", standard, "keyword", keyword, "whitespace", whitespace, "my_stop_analyzer", stop),
Map.of("lowercase", new NamedAnalyzer("lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer()))
);
}
@ -595,7 +600,23 @@ public class AnnotatedTextFieldMapperTests extends MapperTestCase {
@Override
protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) {
throw new AssumptionViolatedException("not supported");
assumeFalse("ignore_malformed not supported", ignoreMalformed);
return TextFieldFamilySyntheticSourceTestSetup.syntheticSourceSupport("annotated_text", false);
}
/**
 * Returns the block reader support for the given mapper and loader field name by
 * delegating to the shared {@code TextFieldFamilySyntheticSourceTestSetup} helper.
 */
@Override
protected BlockReaderSupport getSupportedReaders(MapperService mapper, String loaderFieldName) {
return TextFieldFamilySyntheticSourceTestSetup.getSupportedReaders(mapper, loaderFieldName);
}
/**
 * Returns the function that produces the expected block-loaded value, as computed by the
 * shared {@code TextFieldFamilySyntheticSourceTestSetup} helper for the given reader
 * support and reader kind.
 */
@Override
protected Function<Object, Object> loadBlockExpected(BlockReaderSupport blockReaderSupport, boolean columnReader) {
return TextFieldFamilySyntheticSourceTestSetup.loadBlockExpected(blockReaderSupport, columnReader);
}
/**
 * Validates the round-tripped reader against the original one by delegating to the shared
 * {@code TextFieldFamilySyntheticSourceTestSetup} helper.
 */
@Override
protected void validateRoundTripReader(String syntheticSource, DirectoryReader reader, DirectoryReader roundTripReader) {
TextFieldFamilySyntheticSourceTestSetup.validateRoundTripReader(syntheticSource, reader, roundTripReader);
}
@Override

View file

@ -0,0 +1,197 @@
---
# Every test in this file requires the cluster to advertise the annotated_text
# synthetic source feature.
setup:
- requires:
cluster_features: ["mapper.annotated_text.synthetic_source"]
reason: introduced in 8.15.0
---
# Single value, field mapped with store: true and synthetic _source enabled.
# The indexed string is expected back unchanged in _source.
stored annotated_text field:
- do:
indices.create:
index: test
body:
mappings:
_source:
mode: synthetic
properties:
annotated_text:
type: annotated_text
store: true
- do:
index:
index: test
id: 1
refresh: true
body:
annotated_text: the quick brown fox
- do:
search:
index: test
- match:
hits.hits.0._source:
annotated_text: the quick brown fox
---
# Single value, field not stored but mapped with a keyword sub-field.
# The indexed string is expected back unchanged in _source.
annotated_text field with keyword multi-field:
- do:
indices.create:
index: test
body:
mappings:
_source:
mode: synthetic
properties:
annotated_text:
type: annotated_text
fields:
keyword:
type: keyword
- do:
index:
index: test
id: 1
refresh: true
body:
annotated_text: the quick brown fox
- do:
search:
index: test
- match:
hits.hits.0._source:
annotated_text: the quick brown fox
---
# Multiple values, field mapped with store: true.
# The expected _source keeps the original order and the duplicate "world".
multiple values in stored annotated_text field:
- do:
indices.create:
index: test
body:
mappings:
_source:
mode: synthetic
properties:
annotated_text:
type: annotated_text
store: true
- do:
index:
index: test
id: 1
refresh: true
body:
annotated_text: ["world", "hello", "world"]
- do:
search:
index: test
- match:
hits.hits.0._source:
annotated_text: ["world", "hello", "world"]
---
# Multiple values, field not stored, keyword sub-field with default settings.
# Unlike the stored variants, the expected _source here has the duplicate "world"
# removed and the values in sorted order.
# NOTE(review): presumably because the values are read back from the keyword
# sub-field's doc values; confirm against the keyword mapper.
multiple values in annotated_text field with keyword multi-field:
- do:
indices.create:
index: test
body:
mappings:
_source:
mode: synthetic
properties:
annotated_text:
type: annotated_text
fields:
keyword:
type: keyword
- do:
index:
index: test
id: 1
refresh: true
body:
annotated_text: ["world", "hello", "world"]
- do:
search:
index: test
- match:
hits.hits.0._source:
annotated_text: ["hello", "world"]
---
# Multiple values, field not stored, keyword sub-field mapped with store: true and
# doc_values: false.
# The expected _source keeps the original order and the duplicate "world".
multiple values in annotated_text field with stored keyword multi-field:
- do:
indices.create:
index: test
body:
mappings:
_source:
mode: synthetic
properties:
annotated_text:
type: annotated_text
fields:
keyword:
type: keyword
store: true
doc_values: false
- do:
index:
index: test
id: 1
refresh: true
body:
annotated_text: ["world", "hello", "world"]
- do:
search:
index: test
- match:
hits.hits.0._source:
annotated_text: ["world", "hello", "world"]
---
# Multiple values, field mapped with store: true AND a keyword sub-field.
# The expected _source keeps the original order and the duplicate "world", matching
# the stored-only case rather than the keyword-only case.
multiple values in stored annotated_text field with keyword multi-field:
- do:
indices.create:
index: test
body:
mappings:
_source:
mode: synthetic
properties:
annotated_text:
type: annotated_text
store: true
fields:
keyword:
type: keyword
- do:
index:
index: test
id: 1
refresh: true
body:
annotated_text: ["world", "hello", "world"]
- do:
search:
index: test
- match:
hits.hits.0._source:
annotated_text: ["world", "hello", "world"]