Enable Mapped Field Types to Override Default Highlighter (#121176)

This commit introduces `MappedFieldType#getDefaultHighlighter`, allowing a specific highlighter to be enforced for a field.
The semantic field mapper utilizes this new functionality to set the `semantic` highlighter as the default.
All other fields will continue to use the `unified` highlighter by default.
This commit is contained in:
Jim Ferenczi 2025-01-29 21:55:53 +00:00 committed by GitHub
parent 6486299371
commit dbeb55cb3d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 128 additions and 26 deletions

View file

@ -133,14 +133,13 @@ You can extract the most relevant fragments from a semantic text field by using
POST test-index/_search
{
"query": {
"semantic": {
"field": "my_semantic_field"
"match": {
"my_semantic_field": "Which country is Paris in?"
}
},
"highlight": {
"fields": {
"my_semantic_field": {
"type": "semantic",
"number_of_fragments": 2, <1>
"order": "score" <2>
}
@ -152,6 +151,33 @@ POST test-index/_search
<1> Specifies the maximum number of fragments to return.
<2> Sorts highlighted fragments by score when set to `score`. By default, fragments will be output in the order they appear in the field (order: none).
Highlighting is supported on fields other than `semantic_text`.
However, if you want to restrict highlighting to the semantic highlighter and return no fragments when the field is not of type `semantic_text`,
you can explicitly enforce the `semantic` highlighter in the query:
[source,console]
------------------------------------------------------------
POST test-index/_search
{
"query": {
"match": {
"my_field": "Which country is Paris in?"
}
},
"highlight": {
"fields": {
"my_field": {
"type": "semantic", <1>
"number_of_fragments": 2,
"order": "score"
}
}
}
}
------------------------------------------------------------
// TEST[skip:Requires inference endpoint]
<1> Ensures that highlighting is applied exclusively to semantic_text fields.
[discrete]
[[custom-indexing]]
==== Customizing `semantic_text` indexing

View file

@ -37,8 +37,8 @@ GET /_search
// TEST[setup:my_index]
{es} supports three highlighters: `unified`, `plain`, and `fvh` (fast vector
highlighter). You can specify the highlighter `type` you want to use
for each field.
highlighter) for `text` and `keyword` fields and the `semantic` highlighter for `semantic_text` fields.
You can specify the highlighter `type` you want to use for each field or rely on the field type's default highlighter.
[discrete]
[[unified-highlighter]]
@ -48,7 +48,19 @@ highlighter breaks the text into sentences and uses the BM25 algorithm to score
individual sentences as if they were documents in the corpus. It also supports
accurate phrase and multi-term (fuzzy, prefix, regex) highlighting. The `unified`
highlighter can combine matches from multiple fields into one result (see
`matched_fields`). This is the default highlighter.
`matched_fields`).
This is the default highlighter for all `text` and `keyword` fields.
[discrete]
[[semantic-highlighter]]
==== Semantic Highlighter
The `semantic` highlighter is specifically designed for use with the <<semantic-text, `semantic_text`>> field.
It identifies and extracts the most relevant fragments from the field based on semantic
similarity between the query and each fragment.
By default, <<semantic-text, `semantic_text`>> fields use the semantic highlighter.
[discrete]
[[plain-highlighter]]

View file

@ -41,6 +41,7 @@ import org.elasticsearch.index.query.QueryShardException;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.fetch.subphase.FetchFieldsPhase;
import org.elasticsearch.search.fetch.subphase.highlight.DefaultHighlighter;
import org.elasticsearch.search.lookup.SearchLookup;
import java.io.IOException;
@ -217,6 +218,13 @@ public abstract class MappedFieldType {
return null;
}
/**
* Returns the default highlighter type to use when highlighting the field.
*/
public String getDefaultHighlighter() {
return DefaultHighlighter.NAME;
}
/** Generates a query that will only match documents that contain the given value.
* The default implementation returns a {@link TermQuery} over the value bytes
* @throws IllegalArgumentException if {@code value} cannot be converted to the expected data type or if the field is not searchable

View file

@ -913,7 +913,7 @@ public class SearchModule {
NamedRegistry<Highlighter> highlighters = new NamedRegistry<>("highlighter");
highlighters.register("fvh", new FastVectorHighlighter(settings));
highlighters.register("plain", new PlainHighlighter());
highlighters.register("unified", new DefaultHighlighter());
highlighters.register(DefaultHighlighter.NAME, new DefaultHighlighter());
highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters);
return unmodifiableMap(highlighters.getRegistry());

View file

@ -50,6 +50,8 @@ import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighte
public class DefaultHighlighter implements Highlighter {
public static final String NAME = "unified";
@Override
public boolean canHighlight(MappedFieldType fieldType) {
return true;

View file

@ -66,7 +66,7 @@ public class HighlightPhase implements FetchSubPhase {
Map<String, Function<HitContext, FieldHighlightContext>> contextBuilders = fieldContext.builders;
for (String field : contextBuilders.keySet()) {
FieldHighlightContext fieldContext = contextBuilders.get(field).apply(hitContext);
Highlighter highlighter = getHighlighter(fieldContext.field);
Highlighter highlighter = getHighlighter(fieldContext.field, fieldContext.fieldType);
HighlightField highlightField = highlighter.highlight(fieldContext);
if (highlightField != null) {
// Note that we make sure to use the original field name in the response. This is because the
@ -80,10 +80,10 @@ public class HighlightPhase implements FetchSubPhase {
};
}
private Highlighter getHighlighter(SearchHighlightContext.Field field) {
private Highlighter getHighlighter(SearchHighlightContext.Field field, MappedFieldType fieldType) {
String highlighterType = field.fieldOptions().highlighterType();
if (highlighterType == null) {
highlighterType = "unified";
highlighterType = fieldType.getDefaultHighlighter();
}
Highlighter highlighter = highlighters.get(highlighterType);
if (highlighter == null) {
@ -103,8 +103,6 @@ public class HighlightPhase implements FetchSubPhase {
Map<String, Function<HitContext, FieldHighlightContext>> builders = new LinkedHashMap<>();
StoredFieldsSpec storedFieldsSpec = StoredFieldsSpec.NO_REQUIREMENTS;
for (SearchHighlightContext.Field field : highlightContext.fields()) {
Highlighter highlighter = getHighlighter(field);
Collection<String> fieldNamesToHighlight = context.getSearchExecutionContext().getMatchingFieldNames(field.field());
boolean fieldNameContainsWildcards = field.field().contains("*");
@ -112,6 +110,7 @@ public class HighlightPhase implements FetchSubPhase {
boolean sourceRequired = false;
for (String fieldName : fieldNamesToHighlight) {
MappedFieldType fieldType = context.getSearchExecutionContext().getFieldType(fieldName);
Highlighter highlighter = getHighlighter(field, fieldType);
// We should prevent highlighting if a field is anything but a text, match_only_text,
// or keyword field.

View file

@ -25,6 +25,7 @@ import static org.elasticsearch.xpack.inference.queries.SemanticSparseVectorQuer
public class InferenceFeatures implements FeatureSpecification {
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");
private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT = new NodeFeature("semantic_text.highlighter.default");
@Override
public Set<NodeFeature> getTestFeatures() {
@ -40,7 +41,8 @@ public class InferenceFeatures implements FeatureSpecification {
SemanticInferenceMetadataFieldsMapper.EXPLICIT_NULL_FIXES,
SEMANTIC_KNN_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED,
TextSimilarityRankRetrieverBuilder.TEXT_SIMILARITY_RERANKER_ALIAS_HANDLING_FIX,
SemanticInferenceMetadataFieldsMapper.INFERENCE_METADATA_FIELDS_ENABLED_BY_DEFAULT
SemanticInferenceMetadataFieldsMapper.INFERENCE_METADATA_FIELDS_ENABLED_BY_DEFAULT,
SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT
);
}
}

View file

@ -73,6 +73,7 @@ import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.xpack.core.ml.inference.results.MlTextEmbeddingResults;
import org.elasticsearch.xpack.core.ml.inference.results.TextExpansionResults;
import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryBuilder;
import org.elasticsearch.xpack.inference.highlight.SemanticTextHighlighter;
import java.io.IOException;
import java.io.UncheckedIOException;
@ -580,6 +581,11 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
return TextFieldMapper.CONTENT_TYPE;
}
@Override
public String getDefaultHighlighter() {
return SemanticTextHighlighter.NAME;
}
public String getInferenceId() {
return inferenceId;
}

View file

@ -55,22 +55,32 @@ setup:
index.mapping.semantic_text.use_legacy_format: false
mappings:
properties:
title:
type: text
body:
type: semantic_text
inference_id: dense-inference-id
---
"Highlighting using a sparse embedding model":
- do:
index:
index: test-sparse-index
id: doc_1
body:
title: "Elasticsearch"
body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
refresh: true
- match: { result: created }
- do:
index:
index: test-dense-index
id: doc_1
body:
title: "Elasticsearch"
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
refresh: true
---
"Highlighting using a sparse embedding model":
- do:
search:
index: test-sparse-index
@ -153,16 +163,6 @@ setup:
---
"Highlighting using a dense embedding model":
- do:
index:
index: test-dense-index
id: doc_1
body:
body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
refresh: true
- match: { result: created }
- do:
search:
index: test-dense-index
@ -243,4 +243,51 @@ setup:
- match: { hits.hits.0.highlight.body.0: "You Know, for Search!" }
- match: { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
---
"Default highlighter for fields":
- requires:
cluster_features: "semantic_text.highlighter.default"
reason: semantic text field defaults to the semantic highlighter
- do:
search:
index: test-dense-index
body:
query:
match:
body: "What is Elasticsearch?"
highlight:
fields:
body:
order: "score"
number_of_fragments: 2
- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 2 }
- match: { hits.hits.0.highlight.body.0: "You Know, for Search!" }
- match: { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
---
"semantic highlighter ignores non-inference fields":
- requires:
cluster_features: "semantic_text.highlighter.default"
reason: semantic text field defaults to the semantic highlighter
- do:
search:
index: test-dense-index
body:
query:
match:
title: "Elasticsearch"
highlight:
fields:
title:
type: semantic
number_of_fragments: 2
- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- not_exists: hits.hits.0.highlight.title