diff --git a/docs/changelog/107735.yaml b/docs/changelog/107735.yaml new file mode 100644 index 000000000000..372cb59ba8b1 --- /dev/null +++ b/docs/changelog/107735.yaml @@ -0,0 +1,5 @@ +pr: 107735 +summary: Implement synthetic source support for annotated text field +area: Mapping +type: feature +issues: [] diff --git a/docs/plugins/mapper-annotated-text.asciidoc b/docs/plugins/mapper-annotated-text.asciidoc index 14669d857817..900eaa5e97a0 100644 --- a/docs/plugins/mapper-annotated-text.asciidoc +++ b/docs/plugins/mapper-annotated-text.asciidoc @@ -6,7 +6,7 @@ experimental[] The mapper-annotated-text plugin provides the ability to index text that is a combination of free-text and special markup that is typically used to identify items of interest such as people or organisations (see NER or Named Entity Recognition -tools). +tools). The elasticsearch markup allows one or more additional tokens to be injected, unchanged, into the token @@ -18,7 +18,7 @@ include::install_remove.asciidoc[] [[mapper-annotated-text-usage]] ==== Using the `annotated-text` field -The `annotated-text` tokenizes text content as per the more common {ref}/text.html[`text`] field (see +The `annotated-text` tokenizes text content as per the more common {ref}/text.html[`text`] field (see "limitations" below) but also injects any marked-up annotation tokens directly into the search index: @@ -49,7 +49,7 @@ in the search index: -------------------------- GET my-index-000001/_analyze { - "field": "my_field", + "field": "my_field", "text":"Investors in [Apple](Apple+Inc.) rejoiced." } -------------------------- @@ -76,7 +76,7 @@ Response: "position": 1 }, { - "token": "Apple Inc.", <1> + "token": "Apple Inc.", <1> "start_offset": 13, "end_offset": 18, "type": "annotation", @@ -106,7 +106,7 @@ the token stream and at the same position (position 2) as the text token (`apple We can now perform searches for annotations using regular `term` queries that don't tokenize -the provided search values. 
Annotations are a more precise way of matching as can be seen +the provided search values. Annotations are a more precise way of matching as can be seen in this example where a search for `Beck` will not match `Jeff Beck` : [source,console] @@ -133,18 +133,119 @@ GET my-index-000001/_search } -------------------------- -<1> As well as tokenising the plain text into single words e.g. `beck`, here we +<1> As well as tokenising the plain text into single words e.g. `beck`, here we inject the single token value `Beck` at the same position as `beck` in the token stream. <2> Note annotations can inject multiple tokens at the same position - here we inject both the very specific value `Jeff Beck` and the broader term `Guitarist`. This enables broader positional queries e.g. finding mentions of a `Guitarist` near to `strat`. -<3> A benefit of searching with these carefully defined annotation tokens is that a query for +<3> A benefit of searching with these carefully defined annotation tokens is that a query for `Beck` will not match document 2 that contains the tokens `jeff`, `beck` and `Jeff Beck` -WARNING: Any use of `=` signs in annotation values eg `[Prince](person=Prince)` will +WARNING: Any use of `=` signs in annotation values eg `[Prince](person=Prince)` will cause the document to be rejected with a parse failure. In future we hope to have a use for the equals signs so wil actively reject documents that contain this today. +[[annotated-text-synthetic-source]] +===== Synthetic `_source` + +IMPORTANT: Synthetic `_source` is Generally Available only for TSDB indices +(indices that have `index.mode` set to `time_series`). For other indices +synthetic `_source` is in technical preview. Features in technical preview may +be changed or removed in a future release. Elastic will work to fix +any issues, but features in technical preview are not subject to the support SLA +of official GA features. 
+ +`annotated_text` fields support {ref}/mapping-source-field.html#synthetic-source[synthetic `_source`] if they have +a {ref}/keyword.html#keyword-synthetic-source[`keyword`] sub-field that supports synthetic +`_source` or if the `annotated_text` field sets `store` to `true`. Either way, it may +not have {ref}/copy-to.html[`copy_to`]. + +If using a sub-`keyword` field then the values are sorted in the same way as +a `keyword` field's values are sorted. By default, that means sorted with +duplicates removed. So: +[source,console,id=synthetic-source-text-example-default] +---- +PUT idx +{ + "mappings": { + "_source": { "mode": "synthetic" }, + "properties": { + "text": { + "type": "annotated_text", + "fields": { + "raw": { + "type": "keyword" + } + } + } + } + } +} +PUT idx/_doc/1 +{ + "text": [ + "the quick brown fox", + "the quick brown fox", + "jumped over the lazy dog" + ] +} +---- +// TEST[s/$/\nGET idx\/_doc\/1?filter_path=_source\n/] + +Will become: +[source,console-result] +---- +{ + "text": [ + "jumped over the lazy dog", + "the quick brown fox" + ] +} +---- +// TEST[s/^/{"_source":/ s/\n$/}/] + +NOTE: Reordering text fields can have an effect on {ref}/query-dsl-match-query-phrase.html[phrase] +and {ref}/span-queries.html[span] queries. See the discussion about {ref}/position-increment-gap.html[`position_increment_gap`] for more detail. You +can avoid this by making sure the `slop` parameter on the phrase queries +is lower than the `position_increment_gap`. This is the default. + +If the `annotated_text` field sets `store` to true then order and duplicates +are preserved. 
+[source,console,id=synthetic-source-text-example-stored] +---- +PUT idx +{ + "mappings": { + "_source": { "mode": "synthetic" }, + "properties": { + "text": { "type": "annotated_text", "store": true } + } + } +} +PUT idx/_doc/1 +{ + "text": [ + "the quick brown fox", + "the quick brown fox", + "jumped over the lazy dog" + ] +} +---- +// TEST[s/$/\nGET idx\/_doc\/1?filter_path=_source\n/] + +Will become: +[source,console-result] +---- +{ + "text": [ + "the quick brown fox", + "the quick brown fox", + "jumped over the lazy dog" + ] +} +---- +// TEST[s/^/{"_source":/ s/\n$/}/] + [[mapper-annotated-text-tips]] ==== Data modelling tips @@ -153,13 +254,13 @@ the equals signs so wil actively reject documents that contain this today. Annotations are normally a way of weaving structured information into unstructured text for higher-precision search. -`Entity resolution` is a form of document enrichment undertaken by specialist software or people +`Entity resolution` is a form of document enrichment undertaken by specialist software or people where references to entities in a document are disambiguated by attaching a canonical ID. The ID is used to resolve any number of aliases or distinguish between people with the -same name. The hyperlinks connecting Wikipedia's articles are a good example of resolved -entity IDs woven into text. +same name. The hyperlinks connecting Wikipedia's articles are a good example of resolved +entity IDs woven into text. -These IDs can be embedded as annotations in an annotated_text field but it often makes +These IDs can be embedded as annotations in an annotated_text field but it often makes sense to include them in dedicated structured fields to support discovery via aggregations: [source,console] @@ -214,20 +315,20 @@ GET my-index-000001/_search -------------------------- <1> Note the `my_twitter_handles` contains a list of the annotation values -also used in the unstructured text. (Note the annotated_text syntax requires escaping). 
-By repeating the annotation values in a structured field this application has ensured that -the tokens discovered in the structured field can be used for search and highlighting -in the unstructured field. +also used in the unstructured text. (Note the annotated_text syntax requires escaping). +By repeating the annotation values in a structured field this application has ensured that +the tokens discovered in the structured field can be used for search and highlighting +in the unstructured field. <2> In this example we search for documents that talk about components of the elastic stack <3> We use the `my_twitter_handles` field here to discover people who are significantly associated with the elastic stack. ===== Avoiding over-matching annotations -By design, the regular text tokens and the annotation tokens co-exist in the same indexed +By design, the regular text tokens and the annotation tokens co-exist in the same indexed field but in rare cases this can lead to some over-matching. The value of an annotation often denotes a _named entity_ (a person, place or company). -The tokens for these named entities are inserted untokenized, and differ from typical text +The tokens for these named entities are inserted untokenized, and differ from typical text tokens because they are normally: * Mixed case e.g. `Madonna` @@ -235,19 +336,19 @@ tokens because they are normally: * Can have punctuation or numbers e.g. `Apple Inc.` or `@kimchy` This means, for the most part, a search for a named entity in the annotated text field will -not have any false positives e.g. when selecting `Apple Inc.` from an aggregation result -you can drill down to highlight uses in the text without "over matching" on any text tokens +not have any false positives e.g. 
when selecting `Apple Inc.` from an aggregation result +you can drill down to highlight uses in the text without "over matching" on any text tokens like the word `apple` in this context: the apple was very juicy - -However, a problem arises if your named entity happens to be a single term and lower-case e.g. the + +However, a problem arises if your named entity happens to be a single term and lower-case e.g. the company `elastic`. In this case, a search on the annotated text field for the token `elastic` may match a text document such as this: they fired an elastic band -To avoid such false matches users should consider prefixing annotation values to ensure +To avoid such false matches users should consider prefixing annotation values to ensure they don't name clash with text tokens e.g. [elastic](Company_elastic) released version 7.0 of the elastic stack today @@ -273,7 +374,7 @@ GET my-index-000001/_search { "query": { "query_string": { - "query": "cats" + "query": "cats" } }, "highlight": { @@ -291,21 +392,21 @@ GET my-index-000001/_search The annotated highlighter is based on the `unified` highlighter and supports the same settings but does not use the `pre_tags` or `post_tags` parameters. Rather than using -html-like markup such as `cat` the annotated highlighter uses the same +html-like markup such as `cat` the annotated highlighter uses the same markdown-like syntax used for annotations and injects a key=value annotation where `_hit_term` -is the key and the matched search term is the value e.g. +is the key and the matched search term is the value e.g. 
The [cat](_hit_term=cat) sat on the [mat](sku3578) -The annotated highlighter tries to be respectful of any existing markup in the original +The annotated highlighter tries to be respectful of any existing markup in the original text: -* If the search term matches exactly the location of an existing annotation then the +* If the search term matches exactly the location of an existing annotation then the `_hit_term` key is merged into the url-like syntax used in the `(...)` part of the -existing annotation. +existing annotation. * However, if the search term overlaps the span of an existing annotation it would break the markup formatting so the original annotation is removed in favour of a new annotation -with just the search hit information in the results. +with just the search hit information in the results. * Any non-overlapping annotations in the original text are preserved in highlighter selections diff --git a/docs/reference/mapping/fields/synthetic-source.asciidoc b/docs/reference/mapping/fields/synthetic-source.asciidoc index ec6f51f78eda..21e98cd55bf3 100644 --- a/docs/reference/mapping/fields/synthetic-source.asciidoc +++ b/docs/reference/mapping/fields/synthetic-source.asciidoc @@ -41,6 +41,7 @@ There are a couple of restrictions to be aware of: types: ** <> +** {plugins}/mapper-annotated-text-usage.html#annotated-text-synthetic-source[`annotated-text`] ** <> ** <> ** <> diff --git a/plugins/mapper-annotated-text/src/main/java/module-info.java b/plugins/mapper-annotated-text/src/main/java/module-info.java new file mode 100644 index 000000000000..3aa8e46e2980 --- /dev/null +++ b/plugins/mapper-annotated-text/src/main/java/module-info.java @@ -0,0 +1,19 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +module org.elasticsearch.index.mapper.annotatedtext { + requires org.elasticsearch.base; + requires org.elasticsearch.server; + requires org.elasticsearch.xcontent; + requires org.apache.lucene.core; + requires org.apache.lucene.highlighter; + + // exports nothing + + provides org.elasticsearch.features.FeatureSpecification with org.elasticsearch.index.mapper.annotatedtext.Features; +} diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java index fae2ab19aee3..6d2b83185d5b 100644 --- a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java @@ -21,17 +21,22 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.analysis.AnalyzerScope; import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.mapper.DocumentParserContext; import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.KeywordFieldMapper; import org.elasticsearch.index.mapper.MapperBuilderContext; +import org.elasticsearch.index.mapper.SourceLoader; +import org.elasticsearch.index.mapper.StringStoredFieldFieldLoader; import 
org.elasticsearch.index.mapper.TextFieldMapper; import org.elasticsearch.index.mapper.TextParams; import org.elasticsearch.index.mapper.TextSearchInfo; import org.elasticsearch.index.similarity.SimilarityProvider; +import org.elasticsearch.xcontent.XContentBuilder; import java.io.IOException; import java.io.Reader; @@ -41,6 +46,7 @@ import java.net.URLDecoder; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -58,6 +64,8 @@ import java.util.regex.Pattern; **/ public class AnnotatedTextFieldMapper extends FieldMapper { + public static final NodeFeature SYNTHETIC_SOURCE_SUPPORT = new NodeFeature("mapper.annotated_text.synthetic_source"); + public static final String CONTENT_TYPE = "annotated_text"; private static Builder builder(FieldMapper in) { @@ -114,7 +122,7 @@ public class AnnotatedTextFieldMapper extends FieldMapper { meta }; } - private AnnotatedTextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext context) { + private AnnotatedTextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext context, MultiFields multiFields) { TextSearchInfo tsi = new TextSearchInfo( fieldType, similarity.get(), @@ -126,12 +134,14 @@ public class AnnotatedTextFieldMapper extends FieldMapper { store.getValue(), tsi, context.isSourceSynthetic(), + TextFieldMapper.SyntheticSourceHelper.syntheticSourceDelegate(fieldType, multiFields), meta.getValue() ); } @Override public AnnotatedTextFieldMapper build(MapperBuilderContext context) { + MultiFields multiFields = multiFieldsBuilder.build(this, context); FieldType fieldType = TextParams.buildFieldType(() -> true, store, indexOptions, norms, termVectors); if (fieldType.indexOptions() == IndexOptions.NONE) { throw new IllegalArgumentException("[" + CONTENT_TYPE + "] fields must be indexed"); @@ -146,8 +156,8 @@ public class AnnotatedTextFieldMapper extends 
FieldMapper { return new AnnotatedTextFieldMapper( name(), fieldType, - buildFieldType(fieldType, context), - multiFieldsBuilder.build(this, context), + buildFieldType(fieldType, context, multiFields), + multiFields, copyTo, this ); @@ -472,15 +482,15 @@ public class AnnotatedTextFieldMapper extends FieldMapper { } public static final class AnnotatedTextFieldType extends TextFieldMapper.TextFieldType { - private AnnotatedTextFieldType( String name, boolean store, TextSearchInfo tsi, boolean isSyntheticSource, + KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate, Map meta ) { - super(name, true, store, tsi, isSyntheticSource, null, meta, false, false); + super(name, true, store, tsi, isSyntheticSource, syntheticSourceDelegate, meta, false, false); } public AnnotatedTextFieldType(String name, Map meta) { @@ -544,4 +554,36 @@ public class AnnotatedTextFieldMapper extends FieldMapper { public FieldMapper.Builder getMergeBuilder() { return new Builder(simpleName(), builder.indexCreatedVersion, builder.analyzers.indexAnalyzers).init(this); } + + @Override + public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() { + if (copyTo.copyToFields().isEmpty() != true) { + throw new IllegalArgumentException( + "field [" + name() + "] of type [" + typeName() + "] doesn't support synthetic source because it declares copy_to" + ); + } + if (fieldType.stored()) { + return new StringStoredFieldFieldLoader(name(), simpleName(), null) { + @Override + protected void write(XContentBuilder b, Object value) throws IOException { + b.value((String) value); + } + }; + } + + var kwd = TextFieldMapper.SyntheticSourceHelper.getKeywordFieldMapperForSyntheticSource(this); + if (kwd != null) { + return kwd.syntheticFieldLoader(simpleName()); + } + + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "field [%s] of type [%s] doesn't support synthetic source unless it is stored or has a sub-field of" + + " type [keyword] with doc values or stored and without a 
normalizer", + name(), + typeName() + ) + ); + } } diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/Features.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/Features.java new file mode 100644 index 000000000000..1c4bd22e8814 --- /dev/null +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/Features.java @@ -0,0 +1,26 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.index.mapper.annotatedtext; + +import org.elasticsearch.features.FeatureSpecification; +import org.elasticsearch.features.NodeFeature; + +import java.util.Set; + +/** + * Provides features for annotated text mapper. + */ +public class Features implements FeatureSpecification { + @Override + public Set getFeatures() { + return Set.of( + AnnotatedTextFieldMapper.SYNTHETIC_SOURCE_SUPPORT // Added in 8.15 + ); + } +} diff --git a/plugins/mapper-annotated-text/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification b/plugins/mapper-annotated-text/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification new file mode 100644 index 000000000000..a19d9deb9c52 --- /dev/null +++ b/plugins/mapper-annotated-text/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification @@ -0,0 +1,9 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. 
Licensed under the Elastic License +# 2.0 and the Server Side Public License, v 1; you may not use this file except +# in compliance with, at your election, the Elastic License 2.0 or the Server +# Side Public License, v 1. +# + +org.elasticsearch.index.mapper.annotatedtext.Features diff --git a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java index 9f1d063433d8..3b27cdb13285 100644 --- a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java +++ b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java @@ -14,6 +14,7 @@ import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; @@ -29,6 +30,7 @@ import org.elasticsearch.index.analysis.AnalyzerScope; import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.CustomAnalyzer; import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.LowercaseNormalizer; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.analysis.StandardTokenizerFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; @@ -38,6 +40,7 @@ import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.MapperTestCase; import 
org.elasticsearch.index.mapper.ParsedDocument; +import org.elasticsearch.index.mapper.TextFieldFamilySyntheticSourceTestSetup; import org.elasticsearch.index.mapper.TextFieldMapper; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.xcontent.ToXContent; @@ -54,6 +57,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Function; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -144,7 +148,8 @@ public class AnnotatedTextFieldMapperTests extends MapperTestCase { ) ); return IndexAnalyzers.of( - Map.of("default", dflt, "standard", standard, "keyword", keyword, "whitespace", whitespace, "my_stop_analyzer", stop) + Map.of("default", dflt, "standard", standard, "keyword", keyword, "whitespace", whitespace, "my_stop_analyzer", stop), + Map.of("lowercase", new NamedAnalyzer("lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer())) ); } @@ -595,7 +600,23 @@ public class AnnotatedTextFieldMapperTests extends MapperTestCase { @Override protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) { - throw new AssumptionViolatedException("not supported"); + assumeFalse("ignore_malformed not supported", ignoreMalformed); + return TextFieldFamilySyntheticSourceTestSetup.syntheticSourceSupport("annotated_text", false); + } + + @Override + protected BlockReaderSupport getSupportedReaders(MapperService mapper, String loaderFieldName) { + return TextFieldFamilySyntheticSourceTestSetup.getSupportedReaders(mapper, loaderFieldName); + } + + @Override + protected Function loadBlockExpected(BlockReaderSupport blockReaderSupport, boolean columnReader) { + return TextFieldFamilySyntheticSourceTestSetup.loadBlockExpected(blockReaderSupport, columnReader); + } + + @Override + protected void validateRoundTripReader(String syntheticSource, DirectoryReader reader, DirectoryReader roundTripReader) { + 
TextFieldFamilySyntheticSourceTestSetup.validateRoundTripReader(syntheticSource, reader, roundTripReader); } @Override diff --git a/plugins/mapper-annotated-text/src/yamlRestTest/resources/rest-api-spec/test/mapper_annotatedtext/20_synthetic_source.yml b/plugins/mapper-annotated-text/src/yamlRestTest/resources/rest-api-spec/test/mapper_annotatedtext/20_synthetic_source.yml new file mode 100644 index 000000000000..54a51e60f56d --- /dev/null +++ b/plugins/mapper-annotated-text/src/yamlRestTest/resources/rest-api-spec/test/mapper_annotatedtext/20_synthetic_source.yml @@ -0,0 +1,197 @@ +--- +setup: + - requires: + cluster_features: ["mapper.annotated_text.synthetic_source"] + reason: introduced in 8.15.0 + +--- +stored annotated_text field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + store: true + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: the quick brown fox + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: the quick brown fox + +--- +annotated_text field with keyword multi-field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + fields: + keyword: + type: keyword + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: the quick brown fox + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: the quick brown fox + +--- +multiple values in stored annotated_text field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + store: true + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: ["world", "hello", "world"] + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: ["world", "hello", 
"world"] + +--- +multiple values in annotated_text field with keyword multi-field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + fields: + keyword: + type: keyword + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: ["world", "hello", "world"] + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: ["hello", "world"] + + +--- +multiple values in annotated_text field with stored keyword multi-field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + fields: + keyword: + type: keyword + store: true + doc_values: false + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: ["world", "hello", "world"] + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: ["world", "hello", "world"] + +--- +multiple values in stored annotated_text field with keyword multi-field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + store: true + fields: + keyword: + type: keyword + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: ["world", "hello", "world"] + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: ["world", "hello", "world"] diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index bdf25307d334..eeb452204091 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -1026,7 +1026,7 @@ public final class KeywordFieldMapper extends FieldMapper { return 
syntheticFieldLoader(simpleName()); } - SourceLoader.SyntheticFieldLoader syntheticFieldLoader(String simpleName) { + public SourceLoader.SyntheticFieldLoader syntheticFieldLoader(String simpleName) { if (hasScript()) { return SourceLoader.SyntheticFieldLoader.NOTHING; } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index ef512e2bbd46..57dd2fa0b920 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -390,7 +390,7 @@ public final class TextFieldMapper extends FieldMapper { store.getValue(), tsi, context.isSourceSynthetic(), - syntheticSourceDelegate(fieldType, multiFields), + SyntheticSourceHelper.syntheticSourceDelegate(fieldType, multiFields), meta.getValue(), eagerGlobalOrdinals.getValue(), indexPhrases.getValue() @@ -402,17 +402,6 @@ public final class TextFieldMapper extends FieldMapper { return ft; } - private static KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate(FieldType fieldType, MultiFields multiFields) { - if (fieldType.stored()) { - return null; - } - var kwd = getKeywordFieldMapperForSyntheticSource(multiFields); - if (kwd != null) { - return kwd.fieldType(); - } - return null; - } - private SubFieldInfo buildPrefixInfo(MapperBuilderContext context, FieldType fieldType, TextFieldType tft) { if (indexPrefixes.get() == null) { return null; @@ -1094,7 +1083,7 @@ public final class TextFieldMapper extends FieldMapper { return isSyntheticSource; } - KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate() { + public KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate() { return syntheticSourceDelegate; } } @@ -1473,7 +1462,7 @@ public final class TextFieldMapper extends FieldMapper { }; } - var kwd = getKeywordFieldMapperForSyntheticSource(this); + var kwd = 
SyntheticSourceHelper.getKeywordFieldMapperForSyntheticSource(this); if (kwd != null) { return kwd.syntheticFieldLoader(simpleName()); } @@ -1489,16 +1478,29 @@ public final class TextFieldMapper extends FieldMapper { ); } - private static KeywordFieldMapper getKeywordFieldMapperForSyntheticSource(Iterable multiFields) { - for (Mapper sub : multiFields) { - if (sub.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) { - KeywordFieldMapper kwd = (KeywordFieldMapper) sub; - if (kwd.hasNormalizer() == false && (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored())) { - return kwd; - } + public static class SyntheticSourceHelper { + public static KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate(FieldType fieldType, MultiFields multiFields) { + if (fieldType.stored()) { + return null; } + var kwd = getKeywordFieldMapperForSyntheticSource(multiFields); + if (kwd != null) { + return kwd.fieldType(); + } + return null; } - return null; + public static KeywordFieldMapper getKeywordFieldMapperForSyntheticSource(Iterable multiFields) { + for (Mapper sub : multiFields) { + if (sub.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) { + KeywordFieldMapper kwd = (KeywordFieldMapper) sub; + if (kwd.hasNormalizer() == false && (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored())) { + return kwd; + } + } + } + + return null; + } } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java index 70e375a89d5e..4824bd337f5b 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java @@ -25,7 +25,6 @@ import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.common.Strings; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.settings.Settings; -import 
org.elasticsearch.core.Tuple; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.analysis.AnalyzerScope; @@ -45,14 +44,11 @@ import org.elasticsearch.script.StringFieldScript; import org.elasticsearch.xcontent.XContentBuilder; import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.function.Function; -import java.util.stream.Collectors; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -658,7 +654,7 @@ public class KeywordFieldMapperTests extends MapperTestCase { @Override protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) { assertFalse("keyword doesn't support ignore_malformed", ignoreMalformed); - return new KeywordSyntheticSourceSupport( + return new KeywordFieldSyntheticSourceSupport( randomBoolean() ? null : between(10, 100), randomBoolean(), usually() ? null : randomAlphaOfLength(2), @@ -666,110 +662,6 @@ public class KeywordFieldMapperTests extends MapperTestCase { ); } - static class KeywordSyntheticSourceSupport implements SyntheticSourceSupport { - private final Integer ignoreAbove; - private final boolean allIgnored; - private final boolean store; - private final boolean docValues; - private final String nullValue; - private final boolean exampleSortsUsingIgnoreAbove; - - KeywordSyntheticSourceSupport(Integer ignoreAbove, boolean store, String nullValue, boolean exampleSortsUsingIgnoreAbove) { - this.ignoreAbove = ignoreAbove; - this.allIgnored = ignoreAbove != null && rarely(); - this.store = store; - this.nullValue = nullValue; - this.exampleSortsUsingIgnoreAbove = exampleSortsUsingIgnoreAbove; - this.docValues = store ? 
randomBoolean() : true; - } - - @Override - public SyntheticSourceExample example(int maxValues) { - return example(maxValues, false); - } - - public SyntheticSourceExample example(int maxValues, boolean loadBlockFromSource) { - if (randomBoolean()) { - Tuple v = generateValue(); - Object loadBlock = v.v2(); - if (loadBlockFromSource == false && ignoreAbove != null && v.v2().length() > ignoreAbove) { - loadBlock = null; - } - return new SyntheticSourceExample(v.v1(), v.v2(), loadBlock, this::mapping); - } - List> values = randomList(1, maxValues, this::generateValue); - List in = values.stream().map(Tuple::v1).toList(); - List outPrimary = new ArrayList<>(); - List outExtraValues = new ArrayList<>(); - values.stream().map(Tuple::v2).forEach(v -> { - if (exampleSortsUsingIgnoreAbove && ignoreAbove != null && v.length() > ignoreAbove) { - outExtraValues.add(v); - } else { - outPrimary.add(v); - } - }); - List outList = store ? outPrimary : new HashSet<>(outPrimary).stream().sorted().collect(Collectors.toList()); - List loadBlock; - if (loadBlockFromSource) { - // The block loader infrastructure will never return nulls. Just zap them all. - loadBlock = in.stream().filter(m -> m != null).toList(); - } else if (docValues) { - loadBlock = new HashSet<>(outPrimary).stream().sorted().collect(Collectors.toList()); - } else { - loadBlock = List.copyOf(outList); - } - Object loadBlockResult = loadBlock.size() == 1 ? loadBlock.get(0) : loadBlock; - outList.addAll(outExtraValues); - Object out = outList.size() == 1 ? 
outList.get(0) : outList; - return new SyntheticSourceExample(in, out, loadBlockResult, this::mapping); - } - - private Tuple generateValue() { - if (nullValue != null && randomBoolean()) { - return Tuple.tuple(null, nullValue); - } - int length = 5; - if (ignoreAbove != null && (allIgnored || randomBoolean())) { - length = ignoreAbove + 5; - } - String v = randomAlphaOfLength(length); - return Tuple.tuple(v, v); - } - - private void mapping(XContentBuilder b) throws IOException { - b.field("type", "keyword"); - if (nullValue != null) { - b.field("null_value", nullValue); - } - if (ignoreAbove != null) { - b.field("ignore_above", ignoreAbove); - } - if (store) { - b.field("store", true); - } - if (docValues == false) { - b.field("doc_values", false); - } - } - - @Override - public List invalidExample() throws IOException { - return List.of( - new SyntheticSourceInvalidExample( - equalTo( - "field [field] of type [keyword] doesn't support synthetic source because " - + "it doesn't have doc values and isn't stored" - ), - b -> b.field("type", "keyword").field("doc_values", false) - ), - new SyntheticSourceInvalidExample( - equalTo("field [field] of type [keyword] doesn't support synthetic source because it declares a normalizer"), - b -> b.field("type", "keyword").field("normalizer", "lowercase") - ) - ); - } - } - @Override protected IngestScriptSupport ingestScriptSupport() { return new IngestScriptSupport() { diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java index 5d0c1c01ecdc..50d15be2256e 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java @@ -75,7 +75,6 @@ import org.elasticsearch.search.lookup.SourceProvider; import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContentBuilder; import 
org.elasticsearch.xcontent.XContentFactory; -import org.hamcrest.Matcher; import org.junit.AssumptionViolatedException; import java.io.IOException; @@ -1178,120 +1177,12 @@ public class TextFieldMapperTests extends MapperTestCase { @Override protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) { assumeFalse("ignore_malformed not supported", ignoreMalformed); - boolean storeTextField = randomBoolean(); - boolean storedKeywordField = storeTextField || randomBoolean(); - boolean indexText = randomBoolean(); - Integer ignoreAbove = randomBoolean() ? null : between(10, 100); - KeywordFieldMapperTests.KeywordSyntheticSourceSupport keywordSupport = new KeywordFieldMapperTests.KeywordSyntheticSourceSupport( - ignoreAbove, - storedKeywordField, - null, - false == storeTextField - ); - return new SyntheticSourceSupport() { - @Override - public SyntheticSourceExample example(int maxValues) { - if (storeTextField) { - SyntheticSourceExample delegate = keywordSupport.example(maxValues, true); - return new SyntheticSourceExample( - delegate.inputValue(), - delegate.expectedForSyntheticSource(), - delegate.expectedForBlockLoader(), - b -> { - b.field("type", "text"); - b.field("store", true); - if (indexText == false) { - b.field("index", false); - } - } - ); - } - // We'll load from _source if ignore_above is defined, otherwise we load from the keyword field. 
- boolean loadingFromSource = ignoreAbove != null; - SyntheticSourceExample delegate = keywordSupport.example(maxValues, loadingFromSource); - return new SyntheticSourceExample( - delegate.inputValue(), - delegate.expectedForSyntheticSource(), - delegate.expectedForBlockLoader(), - b -> { - b.field("type", "text"); - if (indexText == false) { - b.field("index", false); - } - b.startObject("fields"); - { - b.startObject(randomAlphaOfLength(4)); - delegate.mapping().accept(b); - b.endObject(); - } - b.endObject(); - } - ); - } - - @Override - public List invalidExample() throws IOException { - Matcher err = equalTo( - "field [field] of type [text] doesn't support synthetic source unless it is stored or" - + " has a sub-field of type [keyword] with doc values or stored and without a normalizer" - ); - return List.of( - new SyntheticSourceInvalidExample(err, TextFieldMapperTests.this::minimalMapping), - new SyntheticSourceInvalidExample(err, b -> { - b.field("type", "text"); - b.startObject("fields"); - { - b.startObject("l"); - b.field("type", "long"); - b.endObject(); - } - b.endObject(); - }), - new SyntheticSourceInvalidExample(err, b -> { - b.field("type", "text"); - b.startObject("fields"); - { - b.startObject("kwd"); - b.field("type", "keyword"); - b.field("normalizer", "lowercase"); - b.endObject(); - } - b.endObject(); - }), - new SyntheticSourceInvalidExample(err, b -> { - b.field("type", "text"); - b.startObject("fields"); - { - b.startObject("kwd"); - b.field("type", "keyword"); - b.field("doc_values", "false"); - b.endObject(); - } - b.endObject(); - }) - ); - } - }; + return TextFieldFamilySyntheticSourceTestSetup.syntheticSourceSupport("text", true); } @Override protected Function loadBlockExpected(BlockReaderSupport blockReaderSupport, boolean columnReader) { - if (nullLoaderExpected(blockReaderSupport.mapper(), blockReaderSupport.loaderFieldName())) { - return null; - } - return v -> ((BytesRef) v).utf8ToString(); - } - - private boolean 
nullLoaderExpected(MapperService mapper, String fieldName) { - MappedFieldType type = mapper.fieldType(fieldName); - if (type instanceof TextFieldType t) { - if (t.isSyntheticSource() == false || t.canUseSyntheticSourceDelegateForQuerying() || t.isStored()) { - return false; - } - String parentField = mapper.mappingLookup().parentField(fieldName); - return parentField == null || nullLoaderExpected(mapper, parentField); - } - return false; + return TextFieldFamilySyntheticSourceTestSetup.loadBlockExpected(blockReaderSupport, columnReader); } @Override @@ -1300,9 +1191,8 @@ public class TextFieldMapperTests extends MapperTestCase { } @Override - protected void validateRoundTripReader(String syntheticSource, DirectoryReader reader, DirectoryReader roundTripReader) - throws IOException { - // Disabled because it currently fails + protected void validateRoundTripReader(String syntheticSource, DirectoryReader reader, DirectoryReader roundTripReader) { + TextFieldFamilySyntheticSourceTestSetup.validateRoundTripReader(syntheticSource, reader, roundTripReader); } public void testUnknownAnalyzerOnLegacyIndex() throws IOException { @@ -1433,21 +1323,7 @@ public class TextFieldMapperTests extends MapperTestCase { @Override protected BlockReaderSupport getSupportedReaders(MapperService mapper, String loaderFieldName) { - MappedFieldType ft = mapper.fieldType(loaderFieldName); - String parentName = mapper.mappingLookup().parentField(ft.name()); - if (parentName == null) { - TextFieldMapper.TextFieldType text = (TextFieldType) ft; - boolean supportsColumnAtATimeReader = text.syntheticSourceDelegate() != null - && text.syntheticSourceDelegate().hasDocValues() - && text.canUseSyntheticSourceDelegateForQuerying(); - return new BlockReaderSupport(supportsColumnAtATimeReader, mapper, loaderFieldName); - } - MappedFieldType parent = mapper.fieldType(parentName); - if (false == parent.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) { - throw new UnsupportedOperationException(); - } 
-        KeywordFieldMapper.KeywordFieldType kwd = (KeywordFieldMapper.KeywordFieldType) parent;
-        return new BlockReaderSupport(kwd.hasDocValues(), mapper, loaderFieldName);
+        return TextFieldFamilySyntheticSourceTestSetup.getSupportedReaders(mapper, loaderFieldName);
     }
 
     public void testBlockLoaderFromParentColumnReader() throws IOException {
@@ -1460,7 +1336,7 @@ public class TextFieldMapperTests extends MapperTestCase {
 
     private void testBlockLoaderFromParent(boolean columnReader, boolean syntheticSource) throws IOException {
         boolean storeParent = randomBoolean();
-        KeywordFieldMapperTests.KeywordSyntheticSourceSupport kwdSupport = new KeywordFieldMapperTests.KeywordSyntheticSourceSupport(
+        KeywordFieldSyntheticSourceSupport kwdSupport = new KeywordFieldSyntheticSourceSupport(
             null,
             storeParent,
             null,
diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/KeywordFieldSyntheticSourceSupport.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/KeywordFieldSyntheticSourceSupport.java
new file mode 100644
index 000000000000..53ecb75c18d9
--- /dev/null
+++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/KeywordFieldSyntheticSourceSupport.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.elasticsearch.core.Tuple;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.xcontent.XContentBuilder;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.hamcrest.Matchers.equalTo;
+
+/**
+ * Builds randomized synthetic source examples for {@code keyword} fields, for use by mapper tests.
+ * <p>
+ * The {@code ignore_above}, {@code store} and {@code null_value} settings are supplied by the caller;
+ * {@code doc_values} is randomized when the field is stored and always enabled otherwise.
+ */
+public class KeywordFieldSyntheticSourceSupport implements MapperTestCase.SyntheticSourceSupport {
+    private final Integer ignoreAbove;
+    private final boolean allIgnored;
+    private final boolean store;
+    private final boolean docValues;
+    private final String nullValue;
+    private final boolean exampleSortsUsingIgnoreAbove;
+
+    /**
+     * @param ignoreAbove value for {@code ignore_above} in the mapping, or {@code null} to leave it out
+     * @param store whether the mapping sets {@code store} to {@code true}
+     * @param nullValue value for {@code null_value} in the mapping, or {@code null} to leave it out
+     * @param exampleSortsUsingIgnoreAbove whether multi-valued examples move values longer than {@code ignore_above}
+     *                                     to the end of the expected synthetic source
+     */
+    KeywordFieldSyntheticSourceSupport(Integer ignoreAbove, boolean store, String nullValue, boolean exampleSortsUsingIgnoreAbove) {
+        this.ignoreAbove = ignoreAbove;
+        // Rarely force every randomly generated string to be longer than ignore_above.
+        this.allIgnored = ignoreAbove != null && LuceneTestCase.rarely();
+        this.store = store;
+        this.nullValue = nullValue;
+        this.exampleSortsUsingIgnoreAbove = exampleSortsUsingIgnoreAbove;
+        // doc_values is only ever disabled when the field is stored.
+        this.docValues = store ? ESTestCase.randomBoolean() : true;
+    }
+
+    @Override
+    public MapperTestCase.SyntheticSourceExample example(int maxValues) {
+        return example(maxValues, false);
+    }
+
+    /**
+     * Generates a single-valued or a multi-valued example, chosen at random.
+     *
+     * @param maxValues upper bound on the number of values in a multi-valued example
+     * @param loadBlockFromSource {@code true} to derive the expected block loader output from the input values
+     *                            instead of from the values expected in synthetic source
+     */
+    public MapperTestCase.SyntheticSourceExample example(int maxValues, boolean loadBlockFromSource) {
+        if (ESTestCase.randomBoolean()) {
+            Tuple<String, String> v = generateValue();
+            Object loadBlock = v.v2();
+            if (loadBlockFromSource == false && ignoreAbove != null && v.v2().length() > ignoreAbove) {
+                loadBlock = null;
+            }
+            return new MapperTestCase.SyntheticSourceExample(v.v1(), v.v2(), loadBlock, this::mapping);
+        }
+        List<Tuple<String, String>> values = ESTestCase.randomList(1, maxValues, this::generateValue);
+        List<String> in = values.stream().map(Tuple::v1).toList();
+        List<String> outPrimary = new ArrayList<>();
+        List<String> outExtraValues = new ArrayList<>();
+        values.stream().map(Tuple::v2).forEach(v -> {
+            if (exampleSortsUsingIgnoreAbove && ignoreAbove != null && v.length() > ignoreAbove) {
+                outExtraValues.add(v);
+            } else {
+                outPrimary.add(v);
+            }
+        });
+        List<String> outList = store ? outPrimary : new HashSet<>(outPrimary).stream().sorted().collect(Collectors.toList());
+        List<String> loadBlock;
+        if (loadBlockFromSource) {
+            // The block loader infrastructure will never return nulls. Just zap them all.
+            loadBlock = in.stream().filter(m -> m != null).toList();
+        } else if (docValues) {
+            loadBlock = new HashSet<>(outPrimary).stream().sorted().collect(Collectors.toList());
+        } else {
+            loadBlock = List.copyOf(outList);
+        }
+        Object loadBlockResult = loadBlock.size() == 1 ? loadBlock.get(0) : loadBlock;
+        outList.addAll(outExtraValues);
+        Object out = outList.size() == 1 ? outList.get(0) : outList;
+        return new MapperTestCase.SyntheticSourceExample(in, out, loadBlockResult, this::mapping);
+    }
+
+    /**
+     * Generates one value as a tuple of (input value, value expected in synthetic source); the input side is
+     * {@code null} when {@code null_value} is expected to stand in for it.
+     */
+    private Tuple<String, String> generateValue() {
+        if (nullValue != null && ESTestCase.randomBoolean()) {
+            return Tuple.tuple(null, nullValue);
+        }
+        int length = 5;
+        if (ignoreAbove != null && (allIgnored || ESTestCase.randomBoolean())) {
+            length = ignoreAbove + 5;
+        }
+        String v = ESTestCase.randomAlphaOfLength(length);
+        return Tuple.tuple(v, v);
+    }
+
+    /** Writes the {@code keyword} mapping matching the settings of this instance. */
+    private void mapping(XContentBuilder b) throws IOException {
+        b.field("type", "keyword");
+        if (nullValue != null) {
+            b.field("null_value", nullValue);
+        }
+        if (ignoreAbove != null) {
+            b.field("ignore_above", ignoreAbove);
+        }
+        if (store) {
+            b.field("store", true);
+        }
+        if (docValues == false) {
+            b.field("doc_values", false);
+        }
+    }
+
+    @Override
+    public List<MapperTestCase.SyntheticSourceInvalidExample> invalidExample() throws IOException {
+        return List.of(
+            new MapperTestCase.SyntheticSourceInvalidExample(
+                equalTo(
+                    "field [field] of type [keyword] doesn't support synthetic source because "
+                        + "it doesn't have doc values and isn't stored"
+                ),
+                b -> b.field("type", "keyword").field("doc_values", false)
+            ),
+            new MapperTestCase.SyntheticSourceInvalidExample(
+                equalTo("field [field] of type [keyword] doesn't support synthetic source because it declares a normalizer"),
+                b -> b.field("type", "keyword").field("normalizer", "lowercase")
+            )
+        );
+    }
+}
diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java
index fa0f0e1b95f5..097c23b96bb7 100644
--- a/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java
@@ -1286,7 +1286,7 @@ public abstract class MapperTestCase extends MapperServiceTestCase {
      * @param loaderFieldName the field name to use for loading the field
      */
     public record 
BlockReaderSupport(boolean columnAtATimeReader, boolean syntheticSource, MapperService mapper, String loaderFieldName) {
-        BlockReaderSupport(boolean columnAtATimeReader, MapperService mapper, String loaderFieldName) {
+        public BlockReaderSupport(boolean columnAtATimeReader, MapperService mapper, String loaderFieldName) {
             this(columnAtATimeReader, true, mapper, loaderFieldName);
         }
 
diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/TextFieldFamilySyntheticSourceTestSetup.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/TextFieldFamilySyntheticSourceTestSetup.java
new file mode 100644
index 000000000000..df4377adc3e3
--- /dev/null
+++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/TextFieldFamilySyntheticSourceTestSetup.java
@@ -0,0 +1,247 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.util.BytesRef;
+import org.hamcrest.Matcher;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Locale;
+import java.util.function.Function;
+
+import static org.elasticsearch.test.ESTestCase.between;
+import static org.elasticsearch.test.ESTestCase.randomAlphaOfLength;
+import static org.elasticsearch.test.ESTestCase.randomBoolean;
+import static org.hamcrest.Matchers.equalTo;
+
+/**
+ * Provides functionality needed to test synthetic source support in text and text-like fields (e.g. "text", "annotated_text").
+ */
+public final class TextFieldFamilySyntheticSourceTestSetup {
+    private TextFieldFamilySyntheticSourceTestSetup() {
+        // static utility class, never instantiated
+    }
+
+    /**
+     * Creates randomized synthetic source examples for a field of the given type.
+     *
+     * @param fieldType mapping type written to the generated mappings
+     * @param supportsCustomIndexConfiguration whether the generated mappings may set {@code index} to {@code false}
+     */
+    public static MapperTestCase.SyntheticSourceSupport syntheticSourceSupport(String fieldType, boolean supportsCustomIndexConfiguration) {
+        return new TextFieldFamilySyntheticSourceSupport(fieldType, supportsCustomIndexConfiguration);
+    }
+
+    /**
+     * Works out which block readers the field named {@code loaderFieldName} is expected to support.
+     * For a field without a parent the column-at-a-time reader is expected only if the text field has a
+     * synthetic source delegate with doc values that can also be used for querying; otherwise the parent
+     * must be a keyword field, and its doc values decide.
+     *
+     * @throws UnsupportedOperationException if the field has a parent that is not a keyword field
+     */
+    public static MapperTestCase.BlockReaderSupport getSupportedReaders(MapperService mapper, String loaderFieldName) {
+        MappedFieldType ft = mapper.fieldType(loaderFieldName);
+        String parentName = mapper.mappingLookup().parentField(ft.name());
+        if (parentName == null) {
+            TextFieldMapper.TextFieldType text = (TextFieldMapper.TextFieldType) ft;
+            boolean supportsColumnAtATimeReader = text.syntheticSourceDelegate() != null
+                && text.syntheticSourceDelegate().hasDocValues()
+                && text.canUseSyntheticSourceDelegateForQuerying();
+            return new MapperTestCase.BlockReaderSupport(supportsColumnAtATimeReader, mapper, loaderFieldName);
+        }
+        MappedFieldType parent = mapper.fieldType(parentName);
+        if (false == parent.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) {
+            throw new UnsupportedOperationException();
+        }
+        KeywordFieldMapper.KeywordFieldType kwd = (KeywordFieldMapper.KeywordFieldType) parent;
+        return new MapperTestCase.BlockReaderSupport(kwd.hasDocValues(), mapper, loaderFieldName);
+    }
+
+    /**
+     * Returns the conversion applied to loaded block values ({@link BytesRef} to {@link String}),
+     * or {@code null} when a null loader is expected for the field.
+     *
+     * @param columnReader not consulted by this implementation
+     */
+    public static Function<Object, Object> loadBlockExpected(MapperTestCase.BlockReaderSupport blockReaderSupport, boolean columnReader) {
+        if (nullLoaderExpected(blockReaderSupport.mapper(), blockReaderSupport.loaderFieldName())) {
+            return null;
+        }
+        return v -> ((BytesRef) v).utf8ToString();
+    }
+
+    /**
+     * A text field that uses synthetic source, is not stored and cannot use its synthetic source delegate for
+     * querying expects a null loader when it has no parent field or when its parent expects one too.
+     */
+    private static boolean nullLoaderExpected(MapperService mapper, String fieldName) {
+        MappedFieldType type = mapper.fieldType(fieldName);
+        if (type instanceof TextFieldMapper.TextFieldType t) {
+            if (t.isSyntheticSource() == false || t.canUseSyntheticSourceDelegateForQuerying() || t.isStored()) {
+                return false;
+            }
+            String parentField = mapper.mappingLookup().parentField(fieldName);
+            return parentField == null || nullLoaderExpected(mapper, parentField);
+        }
+        return false;
+    }
+
+    /** Intentionally a no-op; the body explains why. */
+    public static void validateRoundTripReader(String syntheticSource, DirectoryReader reader, DirectoryReader roundTripReader) {
+        // `reader` here is reader of original document and `roundTripReader` reads document
+        // created from synthetic source.
+        // This check fails when synthetic source is constructed using keyword subfield
+        // since in that case values are sorted (due to being read from doc values) but original document isn't.
+        //
+        // So it is disabled.
+    }
+
+    /**
+     * Wraps {@link KeywordFieldSyntheticSourceSupport}: examples either use a stored text-like field or a
+     * text-like field with a single keyword sub-field.
+     */
+    private static class TextFieldFamilySyntheticSourceSupport implements MapperTestCase.SyntheticSourceSupport {
+        private final String fieldType;
+        private final boolean storeTextField;
+        private final boolean storedKeywordField;
+        private final boolean indexText;
+        private final Integer ignoreAbove;
+        private final KeywordFieldSyntheticSourceSupport keywordSupport;
+
+        TextFieldFamilySyntheticSourceSupport(String fieldType, boolean supportsCustomIndexConfiguration) {
+            this.fieldType = fieldType;
+            this.storeTextField = randomBoolean();
+            // A stored text field uses the keyword helper in "stored" mode, which keeps values in input order instead of sorting them.
+            this.storedKeywordField = storeTextField || randomBoolean();
+            this.indexText = supportsCustomIndexConfiguration ? randomBoolean() : true;
+            this.ignoreAbove = randomBoolean() ? null : between(10, 100);
+            this.keywordSupport = new KeywordFieldSyntheticSourceSupport(ignoreAbove, storedKeywordField, null, false == storeTextField);
+        }
+
+        @Override
+        public MapperTestCase.SyntheticSourceExample example(int maxValues) {
+            if (storeTextField) {
+                MapperTestCase.SyntheticSourceExample delegate = keywordSupport.example(maxValues, true);
+                return new MapperTestCase.SyntheticSourceExample(
+                    delegate.inputValue(),
+                    delegate.expectedForSyntheticSource(),
+                    delegate.expectedForBlockLoader(),
+                    b -> {
+                        b.field("type", fieldType);
+                        b.field("store", true);
+                        if (indexText == false) {
+                            b.field("index", false);
+                        }
+                    }
+                );
+            }
+            // We'll load from _source if ignore_above is defined, otherwise we load from the keyword field.
+            boolean loadingFromSource = ignoreAbove != null;
+            MapperTestCase.SyntheticSourceExample delegate = keywordSupport.example(maxValues, loadingFromSource);
+            return new MapperTestCase.SyntheticSourceExample(
+                delegate.inputValue(),
+                delegate.expectedForSyntheticSource(),
+                delegate.expectedForBlockLoader(),
+                b -> {
+                    b.field("type", fieldType);
+                    if (indexText == false) {
+                        b.field("index", false);
+                    }
+                    b.startObject("fields");
+                    {
+                        b.startObject(randomAlphaOfLength(4));
+                        delegate.mapping().accept(b);
+                        b.endObject();
+                    }
+                    b.endObject();
+                }
+            );
+        }
+
+        @Override
+        public List<MapperTestCase.SyntheticSourceInvalidExample> invalidExample() throws IOException {
+            Matcher<String> err = equalTo(
+                String.format(
+                    Locale.ROOT,
+                    "field [field] of type [%s] doesn't support synthetic source unless it is stored or"
+                        + " has a sub-field of type [keyword] with doc values or stored and without a normalizer",
+                    fieldType
+                )
+            );
+            return List.of(
+                // no store setting and no sub-fields
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> b.field("type", fieldType)),
+                // the only sub-field is not a keyword
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.startObject("fields");
+                    {
+                        b.startObject("l");
+                        b.field("type", "long");
+                        b.endObject();
+                    }
+                    b.endObject();
+                }),
+                // keyword sub-field with a normalizer
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.startObject("fields");
+                    {
+                        b.startObject("kwd");
+                        b.field("type", "keyword");
+                        b.field("normalizer", "lowercase");
+                        b.endObject();
+                    }
+                    b.endObject();
+                }),
+                // keyword sub-field with doc values disabled
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.startObject("fields");
+                    {
+                        b.startObject("kwd");
+                        b.field("type", "keyword");
+                        b.field("doc_values", "false");
+                        b.endObject();
+                    }
+                    b.endObject();
+                }),
+                // doc values disabled on the keyword sub-field and store explicitly disabled on the field itself
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.field("store", "false");
+                    b.startObject("fields");
+                    {
+                        b.startObject("kwd");
+                        b.field("type", "keyword");
+                        b.field("doc_values", "false");
+                        b.endObject();
+                    }
+                    b.endObject();
+                }),
+                // keyword sub-field with both doc values and store disabled
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.startObject("fields");
+                    {
+                        b.startObject("kwd");
+                        b.field("type", "keyword");
+                        b.field("doc_values", "false");
+                        b.field("store", "false");
+                        b.endObject();
+                    }
+                    b.endObject();
+                })
+            );
+        }
+    }
+}