mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-28 17:34:17 -04:00
Update text expansion/weighted tokens documentation make examples consistent with clients (#103663)
* Update text expansion docs and clarify int/float for token pruning config * Fix formatting * Fix tests * Fix tests
This commit is contained in:
parent
4744215536
commit
0570b0baaa
5 changed files with 224 additions and 56 deletions
|
@ -60,3 +60,5 @@ include::wrapper-query.asciidoc[]
|
||||||
include::pinned-query.asciidoc[]
|
include::pinned-query.asciidoc[]
|
||||||
|
|
||||||
include::rule-query.asciidoc[]
|
include::rule-query.asciidoc[]
|
||||||
|
|
||||||
|
include::weighted-tokens-query.asciidoc[]
|
||||||
|
|
|
@ -62,7 +62,7 @@ Default: Disabled.
|
||||||
Parameters for `<pruning_config>` are:
|
Parameters for `<pruning_config>` are:
|
||||||
|
|
||||||
`tokens_freq_ratio_threshold`::
|
`tokens_freq_ratio_threshold`::
|
||||||
(Optional, float)
|
(Optional, integer)
|
||||||
preview:[]
|
preview:[]
|
||||||
Tokens whose frequency is more than `tokens_freq_ratio_threshold` times the average frequency of all tokens in the specified field are considered outliers and pruned.
|
Tokens whose frequency is more than `tokens_freq_ratio_threshold` times the average frequency of all tokens in the specified field are considered outliers and pruned.
|
||||||
This value must be between 1 and 100.
|
This value must be between 1 and 100.
|
||||||
|
@ -110,29 +110,96 @@ GET my-index/_search
|
||||||
----
|
----
|
||||||
// TEST[skip: TBD]
|
// TEST[skip: TBD]
|
||||||
|
|
||||||
[discrete]
|
Multiple `text_expansion` queries can be combined with each other or other query types.
|
||||||
[[text-expansion-query-with-pruning-config-example]]
|
This can be achieved by wrapping them in <<query-dsl-bool-query, boolean query clauses>> and using linear boosting:
|
||||||
=== Example ELSER query with pruning configuration
|
|
||||||
|
|
||||||
The following is an extension to the above example that adds a preview:[] pruning configuration to the `text_expansion` query.
|
|
||||||
The pruning configuration identifies non-significant tokens to prune from the query in order to improve query performance.
|
|
||||||
[source,console]
|
[source,console]
|
||||||
----
|
----
|
||||||
GET my-index/_search
|
GET my-index/_search
|
||||||
{
|
{
|
||||||
"query":{
|
"query": {
|
||||||
"text_expansion":{
|
"bool": {
|
||||||
"ml.tokens":{
|
"should": [
|
||||||
"model_id":".elser_model_2",
|
{
|
||||||
"model_text":"How is the weather in Jamaica?"
|
"text_expansion": {
|
||||||
},
|
"ml.inference.title_expanded.predicted_value": {
|
||||||
"pruning_config": {
|
"model_id": ".elser_model_2",
|
||||||
"tokens_freq_ratio_threshold": 5,
|
"model_text": "How is the weather in Jamaica?",
|
||||||
"tokens_weight_threshold": 0.4,
|
"boost": 1
|
||||||
"only_score_pruned_tokens": false
|
}
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text_expansion": {
|
||||||
|
"ml.inference.description_expanded.predicted_value": {
|
||||||
|
"model_id": ".elser_model_2",
|
||||||
|
"model_text": "How is the weather in Jamaica?",
|
||||||
|
"boost": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"multi_match": {
|
||||||
|
"query": "How is the weather in Jamaica?",
|
||||||
|
"fields": [
|
||||||
|
"title",
|
||||||
|
"description"
|
||||||
|
],
|
||||||
|
"boost": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
----
|
||||||
|
// TEST[skip: TBD]
|
||||||
|
|
||||||
|
This can also be achieved by using sub searches combined with <<rrf>>.
|
||||||
|
|
||||||
|
[source,console]
|
||||||
|
----
|
||||||
|
GET my-index/_search
|
||||||
|
{
|
||||||
|
"sub_searches": [
|
||||||
|
{
|
||||||
|
"query": {
|
||||||
|
"multi_match": {
|
||||||
|
"query": "How is the weather in Jamaica?",
|
||||||
|
"fields": [
|
||||||
|
"title",
|
||||||
|
"description"
|
||||||
|
]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
{
|
||||||
|
"query": {
|
||||||
|
"text_expansion": {
|
||||||
|
"ml.inference.title_expanded.predicted_value": {
|
||||||
|
"model_id": ".elser_model_2",
|
||||||
|
"model_text": "How is the weather in Jamaica?"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": {
|
||||||
|
"text_expansion": {
|
||||||
|
"ml.inference.description_expanded.predicted_value": {
|
||||||
|
"model_id": ".elser_model_2",
|
||||||
|
"model_text": "How is the weather in Jamaica?"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rank": {
|
||||||
|
"rrf": {
|
||||||
|
"window_size": 10,
|
||||||
|
"rank_constant": 20
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
----
|
----
|
||||||
// TEST[skip: TBD]
|
// TEST[skip: TBD]
|
||||||
|
@ -141,9 +208,13 @@ GET my-index/_search
|
||||||
[[text-expansion-query-with-pruning-config-and-rescore-example]]
|
[[text-expansion-query-with-pruning-config-and-rescore-example]]
|
||||||
=== Example ELSER query with pruning configuration and rescore
|
=== Example ELSER query with pruning configuration and rescore
|
||||||
|
|
||||||
The following is an extension to the above example that adds a <<rescore>> function on top of the preview:[] pruning configuration to the `text_expansion` query.
|
The following is an extension to the above example that adds a preview:[] pruning configuration to the `text_expansion` query.
|
||||||
The pruning configuration identifies non-significant tokens to prune from the query in order to improve query performance.
|
The pruning configuration identifies non-significant tokens to prune from the query in order to improve query performance.
|
||||||
Rescoring the query with the tokens that were originally pruned from the query may improve overall search relevance when using this pruning strategy.
|
|
||||||
|
Token pruning happens at the shard level.
|
||||||
|
While this should result in the same tokens being labeled as insignificant across shards, this is not guaranteed, because it depends on the composition of each shard.
|
||||||
|
Therefore, if you are running `text_expansion` with a `pruning_config` on a multi-shard index, we strongly recommend adding a <<rescore>> function with the tokens that were originally pruned from the query.
|
||||||
|
This will help mitigate any shard-level inconsistency with pruned tokens and provide better relevance overall.
|
||||||
|
|
||||||
[source,console]
|
[source,console]
|
||||||
----
|
----
|
||||||
|
@ -188,30 +259,3 @@ GET my-index/_search
|
||||||
====
|
====
|
||||||
Depending on your data, the text expansion query may be faster with `track_total_hits: false`.
|
Depending on your data, the text expansion query may be faster with `track_total_hits: false`.
|
||||||
====
|
====
|
||||||
|
|
||||||
[discrete]
|
|
||||||
[[weighted-tokens-query-example]]
|
|
||||||
=== Example Weighted token query
|
|
||||||
|
|
||||||
In order to quickly iterate during tests, we exposed a new preview:[] `weighted_tokens` query for evaluation of tokenized datasets.
|
|
||||||
While this is not a query that is intended for production use, it can be used to quickly evaluate relevance using various pruning configurations.
|
|
||||||
|
|
||||||
[source,console]
|
|
||||||
----
|
|
||||||
POST /docs/_search
|
|
||||||
{
|
|
||||||
"query": {
|
|
||||||
"weighted_tokens": {
|
|
||||||
"query_expansion": {
|
|
||||||
"tokens": {"2161": 0.4679, "2621": 0.307, "2782": 0.1299, "2851": 0.1056, "3088": 0.3041, "3376": 0.1038, "3467": 0.4873, "3684": 0.8958, "4380": 0.334, "4542": 0.4636, "4633": 2.2805, "4785": 1.2628, "4860": 1.0655, "5133": 1.0709, "7139": 1.0016, "7224": 0.2486, "7387": 0.0985, "7394": 0.0542, "8915": 0.369, "9156": 2.8947, "10505": 0.2771, "11464": 0.3996, "13525": 0.0088, "14178": 0.8161, "16893": 0.1376, "17851": 1.5348, "19939": 0.6012},
|
|
||||||
"pruning_config": {
|
|
||||||
"tokens_freq_ratio_threshold": 5,
|
|
||||||
"tokens_weight_threshold": 0.4,
|
|
||||||
"only_score_pruned_tokens": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
----
|
|
||||||
//TEST[skip: TBD]
|
|
||||||
|
|
122
docs/reference/query-dsl/weighted-tokens-query.asciidoc
Normal file
122
docs/reference/query-dsl/weighted-tokens-query.asciidoc
Normal file
|
@ -0,0 +1,122 @@
|
||||||
|
[[query-dsl-weighted-tokens-query]]
|
||||||
|
=== Weighted tokens query
|
||||||
|
++++
|
||||||
|
<titleabbrev>Weighted tokens</titleabbrev>
|
||||||
|
++++
|
||||||
|
|
||||||
|
preview::[]
|
||||||
|
|
||||||
|
The weighted tokens query requires a list of token-weight pairs that are sent in with a query rather than calculated using a {nlp} model.
|
||||||
|
These token pairs are then used in a query against a <<sparse-vector,sparse vector>> or <<rank-features,rank features>> field.
|
||||||
|
|
||||||
|
Weighted tokens queries are useful when you want to use an external query expansion model, or quickly prototype changes without reindexing a new model.
|
||||||
|
|
||||||
|
[discrete]
|
||||||
|
[[weighted-tokens-query-ex-request]]
|
||||||
|
==== Example request
|
||||||
|
|
||||||
|
[source,console]
|
||||||
|
----
|
||||||
|
POST _search
|
||||||
|
{
|
||||||
|
"query": {
|
||||||
|
"weighted_tokens": {
|
||||||
|
"query_expansion_field": {
|
||||||
|
"tokens": {"2161": 0.4679, "2621": 0.307, "2782": 0.1299, "2851": 0.1056, "3088": 0.3041, "3376": 0.1038, "3467": 0.4873, "3684": 0.8958, "4380": 0.334, "4542": 0.4636, "4633": 2.2805, "4785": 1.2628, "4860": 1.0655, "5133": 1.0709, "7139": 1.0016, "7224": 0.2486, "7387": 0.0985, "7394": 0.0542, "8915": 0.369, "9156": 2.8947, "10505": 0.2771, "11464": 0.3996, "13525": 0.0088, "14178": 0.8161, "16893": 0.1376, "17851": 1.5348, "19939": 0.6012},
|
||||||
|
"pruning_config": {
|
||||||
|
"tokens_freq_ratio_threshold": 5,
|
||||||
|
"tokens_weight_threshold": 0.4,
|
||||||
|
"only_score_pruned_tokens": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
----
|
||||||
|
// TEST[skip: TBD]
|
||||||
|
|
||||||
|
[discrete]
|
||||||
|
[[weighted-token-query-params]]
|
||||||
|
=== Top level parameters for `weighted_tokens`
|
||||||
|
|
||||||
|
`<tokens>`:::
|
||||||
|
(Required, dictionary)
|
||||||
|
A dictionary of token-weight pairs.
|
||||||
|
|
||||||
|
`pruning_config` ::::
|
||||||
|
(Optional, object)
|
||||||
|
Optional pruning configuration. If enabled, this will omit non-significant tokens from the query in order to improve query performance.
|
||||||
|
Default: Disabled.
|
||||||
|
+
|
||||||
|
--
|
||||||
|
Parameters for `<pruning_config>` are:
|
||||||
|
|
||||||
|
`tokens_freq_ratio_threshold`::
|
||||||
|
(Optional, integer)
|
||||||
|
Tokens whose frequency is more than `tokens_freq_ratio_threshold` times the average frequency of all tokens in the specified field are considered outliers and pruned.
|
||||||
|
This value must be between 1 and 100.
|
||||||
|
Default: `5`.
|
||||||
|
|
||||||
|
`tokens_weight_threshold`::
|
||||||
|
(Optional, float)
|
||||||
|
Tokens whose weight is less than `tokens_weight_threshold` are considered non-significant and pruned.
|
||||||
|
This value must be between 0 and 1.
|
||||||
|
Default: `0.4`.
|
||||||
|
|
||||||
|
`only_score_pruned_tokens`::
|
||||||
|
(Optional, boolean)
|
||||||
|
If `true`, only the pruned tokens are used for scoring, and the non-pruned tokens are discarded.
|
||||||
|
It is strongly recommended to set this to `false` for the main query, but this can be set to `true` for a rescore query to get more relevant results.
|
||||||
|
Default: `false`.
|
||||||
|
|
||||||
|
NOTE: The default values for `tokens_freq_ratio_threshold` and `tokens_weight_threshold` were chosen based on tests using ELSER that provided the best results.
|
||||||
|
--
|
||||||
|
|
||||||
|
[discrete]
|
||||||
|
[[weighted-tokens-query-with-pruning-config-and-rescore-example]]
|
||||||
|
==== Example weighted tokens query with pruning configuration and rescore
|
||||||
|
|
||||||
|
The following example adds a pruning configuration to the `weighted_tokens` query.
|
||||||
|
The pruning configuration identifies non-significant tokens to prune from the query in order to improve query performance.
|
||||||
|
|
||||||
|
Token pruning happens at the shard level.
|
||||||
|
While this should result in the same tokens being labeled as insignificant across shards, this is not guaranteed, because it depends on the composition of each shard.
|
||||||
|
Therefore, if you are running `weighted_tokens` with a `pruning_config` on a multi-shard index, we strongly recommend adding a <<rescore>> function with the tokens that were originally pruned from the query.
|
||||||
|
This will help mitigate any shard-level inconsistency with pruned tokens and provide better relevance overall.
|
||||||
|
|
||||||
|
[source,console]
|
||||||
|
----
|
||||||
|
GET my-index/_search
|
||||||
|
{
|
||||||
|
"query":{
|
||||||
|
"weighted_tokens": {
|
||||||
|
"query_expansion_field": {
|
||||||
|
"tokens": {"2161": 0.4679, "2621": 0.307, "2782": 0.1299, "2851": 0.1056, "3088": 0.3041, "3376": 0.1038, "3467": 0.4873, "3684": 0.8958, "4380": 0.334, "4542": 0.4636, "4633": 2.2805, "4785": 1.2628, "4860": 1.0655, "5133": 1.0709, "7139": 1.0016, "7224": 0.2486, "7387": 0.0985, "7394": 0.0542, "8915": 0.369, "9156": 2.8947, "10505": 0.2771, "11464": 0.3996, "13525": 0.0088, "14178": 0.8161, "16893": 0.1376, "17851": 1.5348, "19939": 0.6012},
|
||||||
|
"pruning_config": {
|
||||||
|
"tokens_freq_ratio_threshold": 5,
|
||||||
|
"tokens_weight_threshold": 0.4,
|
||||||
|
"only_score_pruned_tokens": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"rescore": {
|
||||||
|
"window_size": 100,
|
||||||
|
"query": {
|
||||||
|
"rescore_query": {
|
||||||
|
"weighted_tokens": {
|
||||||
|
"query_expansion_field": {
|
||||||
|
"tokens": {"2161": 0.4679, "2621": 0.307, "2782": 0.1299, "2851": 0.1056, "3088": 0.3041, "3376": 0.1038, "3467": 0.4873, "3684": 0.8958, "4380": 0.334, "4542": 0.4636, "4633": 2.2805, "4785": 1.2628, "4860": 1.0655, "5133": 1.0709, "7139": 1.0016, "7224": 0.2486, "7387": 0.0985, "7394": 0.0542, "8915": 0.369, "9156": 2.8947, "10505": 0.2771, "11464": 0.3996, "13525": 0.0088, "14178": 0.8161, "16893": 0.1376, "17851": 1.5348, "19939": 0.6012},
|
||||||
|
"pruning_config": {
|
||||||
|
"tokens_freq_ratio_threshold": 5,
|
||||||
|
"tokens_weight_threshold": 0.4,
|
||||||
|
"only_score_pruned_tokens": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
----
|
||||||
|
// TEST[skip: TBD]
|
|
@ -47,8 +47,8 @@ public class TokenPruningConfig implements Writeable, ToXContentObject {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
"["
|
"["
|
||||||
+ TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName()
|
+ TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName()
|
||||||
+ "] must be between [1.0] and ["
|
+ "] must be between [1] and ["
|
||||||
+ String.format(Locale.ROOT, "%.1f", MAX_TOKENS_FREQ_RATIO_THRESHOLD)
|
+ String.format(Locale.ROOT, "%d", (int) MAX_TOKENS_FREQ_RATIO_THRESHOLD)
|
||||||
+ "], got "
|
+ "], got "
|
||||||
+ tokensFreqRatioThreshold
|
+ tokensFreqRatioThreshold
|
||||||
);
|
);
|
||||||
|
|
|
@ -231,7 +231,7 @@ public class WeightedTokensQueryBuilderTests extends AbstractQueryTestCase<Weigh
|
||||||
WeightedTokensQueryBuilder queryThatShouldBePruned = new WeightedTokensQueryBuilder(
|
WeightedTokensQueryBuilder queryThatShouldBePruned = new WeightedTokensQueryBuilder(
|
||||||
RANK_FEATURES_FIELD,
|
RANK_FEATURES_FIELD,
|
||||||
inputTokens,
|
inputTokens,
|
||||||
new TokenPruningConfig(1.5f, 0.5f, false)
|
new TokenPruningConfig(2, 0.5f, false)
|
||||||
);
|
);
|
||||||
query = queryThatShouldBePruned.doToQuery(context);
|
query = queryThatShouldBePruned.doToQuery(context);
|
||||||
assertCorrectLuceneQuery("queryThatShouldBePruned", query, List.of("dog", "jumped", "on", "me"));
|
assertCorrectLuceneQuery("queryThatShouldBePruned", query, List.of("dog", "jumped", "on", "me"));
|
||||||
|
@ -239,7 +239,7 @@ public class WeightedTokensQueryBuilderTests extends AbstractQueryTestCase<Weigh
|
||||||
WeightedTokensQueryBuilder onlyScorePrunedTokensQuery = new WeightedTokensQueryBuilder(
|
WeightedTokensQueryBuilder onlyScorePrunedTokensQuery = new WeightedTokensQueryBuilder(
|
||||||
RANK_FEATURES_FIELD,
|
RANK_FEATURES_FIELD,
|
||||||
inputTokens,
|
inputTokens,
|
||||||
new TokenPruningConfig(1.5f, 0.5f, true)
|
new TokenPruningConfig(2, 0.5f, true)
|
||||||
);
|
);
|
||||||
query = onlyScorePrunedTokensQuery.doToQuery(context);
|
query = onlyScorePrunedTokensQuery.doToQuery(context);
|
||||||
assertCorrectLuceneQuery("onlyScorePrunedTokensQuery", query, List.of("the", "black"));
|
assertCorrectLuceneQuery("onlyScorePrunedTokensQuery", query, List.of("the", "black"));
|
||||||
|
@ -361,21 +361,21 @@ public class WeightedTokensQueryBuilderTests extends AbstractQueryTestCase<Weigh
|
||||||
{
|
{
|
||||||
IllegalArgumentException e = expectThrows(
|
IllegalArgumentException e = expectThrows(
|
||||||
IllegalArgumentException.class,
|
IllegalArgumentException.class,
|
||||||
() -> new WeightedTokensQueryBuilder("field name", weightedTokens, new TokenPruningConfig(-1f, 0.0f, false))
|
() -> new WeightedTokensQueryBuilder("field name", weightedTokens, new TokenPruningConfig(-1, 0.0f, false))
|
||||||
);
|
);
|
||||||
assertEquals("[tokens_freq_ratio_threshold] must be between [1.0] and [100.0], got -1.0", e.getMessage());
|
assertEquals("[tokens_freq_ratio_threshold] must be between [1] and [100], got -1.0", e.getMessage());
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
IllegalArgumentException e = expectThrows(
|
IllegalArgumentException e = expectThrows(
|
||||||
IllegalArgumentException.class,
|
IllegalArgumentException.class,
|
||||||
() -> new WeightedTokensQueryBuilder("field name", weightedTokens, new TokenPruningConfig(101f, 0.0f, false))
|
() -> new WeightedTokensQueryBuilder("field name", weightedTokens, new TokenPruningConfig(101, 0.0f, false))
|
||||||
);
|
);
|
||||||
assertEquals("[tokens_freq_ratio_threshold] must be between [1.0] and [100.0], got 101.0", e.getMessage());
|
assertEquals("[tokens_freq_ratio_threshold] must be between [1] and [100], got 101.0", e.getMessage());
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
IllegalArgumentException e = expectThrows(
|
IllegalArgumentException e = expectThrows(
|
||||||
IllegalArgumentException.class,
|
IllegalArgumentException.class,
|
||||||
() -> new WeightedTokensQueryBuilder("field name", weightedTokens, new TokenPruningConfig(5f, 5f, false))
|
() -> new WeightedTokensQueryBuilder("field name", weightedTokens, new TokenPruningConfig(5, 5f, false))
|
||||||
);
|
);
|
||||||
assertEquals("[tokens_weight_threshold] must be between 0 and 1", e.getMessage());
|
assertEquals("[tokens_weight_threshold] must be between 0 and 1", e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue