Fix a bug in significant_terms (#127975)

Fix a bug in the `significant_terms` agg where the "subsetSize" array is
too small because we never collect the ordinal for the agg "above" it.

This mostly hits when you do a `range` agg containing a
`significant_terms` AND you only collect the first few ranges. `range`
isn't particularly popular, but `date_histogram` is super popular and it
rewrites into a `range` pretty commonly - so that's likely what's really
hitting this - a `date_histogram` containing a `significant_text` where
the matches are all early in the date range held by the shard.
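
For illustration, here is a minimal sketch of the read-side guard the fix applies (compare the `subsetSize` helper added to `MapStringTermsAggregator` in the diff below). The class and method names here are hypothetical, not the actual Elasticsearch code: the per-bucket counter array only grows when a bucket actually collects a document, so a read for an owning bucket ordinal that never collected anything has to treat the missing slot as zero instead of indexing past the end of the array.

```java
import java.util.Arrays;

/**
 * Hypothetical stand-in for a per-bucket counter backed by a lazily grown array.
 * This is not the real Elasticsearch implementation (which uses a BigArrays-backed
 * LongArray); it only sketches the guard the fix introduces on the read path.
 */
class PerBucketCounter {
    private long[] counts = new long[0];

    /** Bump the count for bucket `ord`, growing the array only when that bucket collects. */
    void increment(int ord) {
        if (ord >= counts.length) {
            counts = Arrays.copyOf(counts, ord + 1);
        }
        counts[ord]++;
    }

    /** Buggy read: assumes every owning bucket ordinal collected at least one document. */
    long getUnsafe(int ord) {
        return counts[ord]; // throws ArrayIndexOutOfBoundsException for never-collected buckets
    }

    /** Fixed read: an ordinal beyond the array means the bucket is empty, so its count is 0. */
    long get(int ord) {
        return ord < counts.length ? counts[ord] : 0;
    }
}
```

In the bug's terms, the trailing `range`/`date_histogram` buckets that matched no documents were never collected, so the backing `subsetSizes` array stayed smaller than the owning bucket ordinals being read while building results.
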
Nik Everett authored 2025-05-09 13:48:19 -04:00, committed by GitHub
parent 5447909dc3
commit da553b11e3
5 changed files with 268 additions and 4 deletions

@@ -0,0 +1,5 @@
pr: 127975
summary: Fix a bug in `significant_terms`
area: Aggregations
type: bug
issues: []

@@ -73,7 +73,7 @@
- match: {aggregations.class.buckets.1.sig_terms.buckets.0.key: "good"}
---
-"Test background filter count ":
+"Test background filter count":
- requires:
cluster_features: ["gte_v7.15.0"]
reason: bugfix introduced in 7.15.0
@@ -153,6 +153,257 @@
index: goodbad*
body: {"aggs": {"sig_terms": {"significant_terms": {"field": "text", "background_filter": {"bool": {"filter": [{"term": {"class": "good" }}]}}}}}}
- match: { aggregations.sig_terms.bg_count: 2 }
---
"Test background filter count as sub - global ords":
- requires:
capabilities:
- method: POST
path: /_search
capabilities: [ significant_terms_background_filter_as_sub ]
test_runner_features: capabilities
reason: "bug fix"
- do:
indices.create:
index: goodbad
body:
settings:
number_of_shards: 1
mappings:
properties:
text:
type: keyword
class:
type: keyword
- do:
indices.create:
index: goodbad-2
body:
settings:
number_of_shards: 1
mappings:
properties:
text:
type: keyword
class:
type: keyword
- do:
index:
index: goodbad-2
id: "1"
body: { group: 1, class: "bad" }
- do:
index:
index: goodbad-2
id: "2"
body: { group: 1, class: "bad" }
- do:
index:
index: goodbad
id: "1"
body: { group: 1, text: "good", class: "good" }
- do:
index:
index: goodbad
id: "2"
body: { group: 1, text: "good", class: "good" }
- do:
index:
index: goodbad
id: "3"
body: { group: 1, text: "bad", class: "bad" }
- do:
index:
index: goodbad
id: "4"
body: { group: 2, text: "bad", class: "bad" }
- do:
indices.refresh:
index: [goodbad, goodbad-2]
- do:
search:
rest_total_hits_as_int: true
index: goodbad*
- match: {hits.total: 6}
- do:
search:
index: goodbad*
body:
aggs:
group:
range:
field: group
ranges:
# Having many ranges helps catch an issue building no hits buckets
- to: 1
- from: 1
to: 2
- from: 2
to: 3
- from: 3
to: 4
- from: 4
to: 5
- from: 5
to: 6
aggs:
sig_terms:
significant_terms:
execution_hint: global_ordinals
field: text
background_filter:
bool:
filter: [{term: {class: good }}]
- match: { aggregations.group.buckets.0.key: "*-1.0" }
- match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.1.key: "1.0-2.0" }
- match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
- match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.2.key: "2.0-3.0" }
- match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
- match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.3.key: "3.0-4.0" }
- match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.4.key: "4.0-5.0" }
- match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.5.key: "5.0-6.0" }
- match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
---
"Test background filter count as sub - map":
- requires:
capabilities:
- method: POST
path: /_search
capabilities: [ significant_terms_background_filter_as_sub ]
test_runner_features: capabilities
reason: "bug fix"
- do:
indices.create:
index: goodbad
body:
settings:
number_of_shards: 1
mappings:
properties:
text:
type: keyword
class:
type: keyword
- do:
indices.create:
index: goodbad-2
body:
settings:
number_of_shards: 1
mappings:
properties:
text:
type: keyword
class:
type: keyword
- do:
index:
index: goodbad-2
id: "1"
body: { group: 1, class: "bad" }
- do:
index:
index: goodbad-2
id: "2"
body: { group: 1, class: "bad" }
- do:
index:
index: goodbad
id: "1"
body: { group: 1, text: "good", class: "good" }
- do:
index:
index: goodbad
id: "2"
body: { group: 1, text: "good", class: "good" }
- do:
index:
index: goodbad
id: "3"
body: { group: 1, text: "bad", class: "bad" }
- do:
index:
index: goodbad
id: "4"
body: { group: 2, text: "bad", class: "bad" }
- do:
indices.refresh:
index: [goodbad, goodbad-2]
- do:
search:
rest_total_hits_as_int: true
index: goodbad*
- match: {hits.total: 6}
- do:
search:
index: goodbad*
body:
aggs:
group:
range:
field: group
ranges:
# Having many ranges helps catch an issue building no hits buckets
- to: 1
- from: 1
to: 2
- from: 2
to: 3
- from: 3
to: 4
- from: 4
to: 5
- from: 5
to: 6
aggs:
sig_terms:
significant_terms:
execution_hint: map
field: text
background_filter:
bool:
filter: [{term: {class: good }}]
- match: { aggregations.group.buckets.0.key: "*-1.0" }
- match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.1.key: "1.0-2.0" }
- match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
- match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.2.key: "2.0-3.0" }
- match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
- match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.3.key: "3.0-4.0" }
- match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.4.key: "4.0-5.0" }
- match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.5.key: "5.0-6.0" }
- match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
---
"IP test":
- do:

@@ -48,6 +48,8 @@ public final class SearchCapabilities {
private static final String INDEX_SELECTOR_SYNTAX = "index_expression_selectors";
private static final String SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB = "significant_terms_background_filter_as_sub";
public static final Set<String> CAPABILITIES;
static {
HashSet<String> capabilities = new HashSet<>();
@@ -66,6 +68,7 @@
capabilities.add(KQL_QUERY_SUPPORTED);
capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT);
capabilities.add(INDEX_SELECTOR_SYNTAX);
capabilities.add(SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB);
CAPABILITIES = Set.copyOf(capabilities);
}
}

@@ -1087,7 +1087,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
@Override
SignificantStringTerms buildNoValuesResult(long owningBucketOrdinal) {
-return buildEmptySignificantTermsAggregation(subsetSizes.get(owningBucketOrdinal), supersetSize, significanceHeuristic);
+return buildEmptySignificantTermsAggregation(subsetSize(owningBucketOrdinal), supersetSize, significanceHeuristic);
}
@Override

@@ -649,7 +649,7 @@ public final class MapStringTermsAggregator extends AbstractStringTermsAggregato
@Override
BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd) {
-long subsetSize = subsetSizes.get(owningBucketOrd);
+long subsetSize = subsetSize(owningBucketOrd);
return (spare, ordsEnum, docCount) -> {
ordsEnum.readValue(spare.termBytes);
spare.subsetDf = docCount;
@@ -696,7 +696,7 @@
bucketCountThresholds.getMinDocCount(),
metadata(),
format,
-subsetSizes.get(owningBucketOrd),
+subsetSize(owningBucketOrd),
supersetSize,
significanceHeuristic,
Arrays.asList(topBuckets)
@@ -712,5 +712,10 @@
public void close() {
Releasables.close(backgroundFrequencies, subsetSizes);
}
private long subsetSize(long owningBucketOrd) {
// if the owningBucketOrd is not in the array that means the bucket is empty so the size has to be 0
return owningBucketOrd < subsetSizes.size() ? subsetSizes.get(owningBucketOrd) : 0;
}
}
}