mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-28 09:28:55 -04:00
Fix a bug in significant_terms (#127975)
Fix a bug in the `significant_terms` agg where the "subsetSize" array is too small because we never collect the ordinal for the agg "above" it. This mostly hits when you do a `range` agg containing a `significant_terms` AND you only collect the first few ranges. `range` isn't particularly popular, but `date_histogram` is super popular and it rewrites into a `range` pretty commonly, so that's likely what's really hitting this: a `date_histogram` followed by a `significant_text` where the matches are all early in the date range held by the shard.
This commit is contained in:
parent
5447909dc3
commit
da553b11e3
5 changed files with 268 additions and 4 deletions
5
docs/changelog/127975.yaml
Normal file
5
docs/changelog/127975.yaml
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
pr: 127975
|
||||||
|
summary: Fix a bug in `significant_terms`
|
||||||
|
area: Aggregations
|
||||||
|
type: bug
|
||||||
|
issues: []
|
|
@ -73,7 +73,7 @@
|
||||||
- match: {aggregations.class.buckets.1.sig_terms.buckets.0.key: "good"}
|
- match: {aggregations.class.buckets.1.sig_terms.buckets.0.key: "good"}
|
||||||
|
|
||||||
---
|
---
|
||||||
"Test background filter count ":
|
"Test background filter count":
|
||||||
- requires:
|
- requires:
|
||||||
cluster_features: ["gte_v7.15.0"]
|
cluster_features: ["gte_v7.15.0"]
|
||||||
reason: bugfix introduced in 7.15.0
|
reason: bugfix introduced in 7.15.0
|
||||||
|
@ -153,6 +153,257 @@
|
||||||
index: goodbad*
|
index: goodbad*
|
||||||
body: {"aggs": {"sig_terms": {"significant_terms": {"field": "text", "background_filter": {"bool": {"filter": [{"term": {"class": "good" }}]}}}}}}
|
body: {"aggs": {"sig_terms": {"significant_terms": {"field": "text", "background_filter": {"bool": {"filter": [{"term": {"class": "good" }}]}}}}}}
|
||||||
- match: { aggregations.sig_terms.bg_count: 2 }
|
- match: { aggregations.sig_terms.bg_count: 2 }
|
||||||
|
|
||||||
|
---
|
||||||
|
"Test background filter count as sub - global ords":
|
||||||
|
- requires:
|
||||||
|
capabilities:
|
||||||
|
- method: POST
|
||||||
|
path: /_search
|
||||||
|
capabilities: [ significant_terms_background_filter_as_sub ]
|
||||||
|
test_runner_features: capabilities
|
||||||
|
reason: "bug fix"
|
||||||
|
|
||||||
|
- do:
|
||||||
|
indices.create:
|
||||||
|
index: goodbad
|
||||||
|
body:
|
||||||
|
settings:
|
||||||
|
number_of_shards: 1
|
||||||
|
mappings:
|
||||||
|
properties:
|
||||||
|
text:
|
||||||
|
type: keyword
|
||||||
|
class:
|
||||||
|
type: keyword
|
||||||
|
- do:
|
||||||
|
indices.create:
|
||||||
|
index: goodbad-2
|
||||||
|
body:
|
||||||
|
settings:
|
||||||
|
number_of_shards: 1
|
||||||
|
mappings:
|
||||||
|
properties:
|
||||||
|
text:
|
||||||
|
type: keyword
|
||||||
|
class:
|
||||||
|
type: keyword
|
||||||
|
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad-2
|
||||||
|
id: "1"
|
||||||
|
body: { group: 1, class: "bad" }
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad-2
|
||||||
|
id: "2"
|
||||||
|
body: { group: 1, class: "bad" }
|
||||||
|
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad
|
||||||
|
id: "1"
|
||||||
|
body: { group: 1, text: "good", class: "good" }
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad
|
||||||
|
id: "2"
|
||||||
|
body: { group: 1, text: "good", class: "good" }
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad
|
||||||
|
id: "3"
|
||||||
|
body: { group: 1, text: "bad", class: "bad" }
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad
|
||||||
|
id: "4"
|
||||||
|
body: { group: 2, text: "bad", class: "bad" }
|
||||||
|
|
||||||
|
- do:
|
||||||
|
indices.refresh:
|
||||||
|
index: [goodbad, goodbad-2]
|
||||||
|
|
||||||
|
- do:
|
||||||
|
search:
|
||||||
|
rest_total_hits_as_int: true
|
||||||
|
index: goodbad*
|
||||||
|
- match: {hits.total: 6}
|
||||||
|
|
||||||
|
- do:
|
||||||
|
search:
|
||||||
|
index: goodbad*
|
||||||
|
body:
|
||||||
|
aggs:
|
||||||
|
group:
|
||||||
|
range:
|
||||||
|
field: group
|
||||||
|
ranges:
|
||||||
|
# Having many ranges helps catch an issue building no hits buckets
|
||||||
|
- to: 1
|
||||||
|
- from: 1
|
||||||
|
to: 2
|
||||||
|
- from: 2
|
||||||
|
to: 3
|
||||||
|
- from: 3
|
||||||
|
to: 4
|
||||||
|
- from: 4
|
||||||
|
to: 5
|
||||||
|
- from: 5
|
||||||
|
to: 6
|
||||||
|
aggs:
|
||||||
|
sig_terms:
|
||||||
|
significant_terms:
|
||||||
|
execution_hint: global_ordinals
|
||||||
|
field: text
|
||||||
|
background_filter:
|
||||||
|
bool:
|
||||||
|
filter: [{term: {class: good }}]
|
||||||
|
- match: { aggregations.group.buckets.0.key: "*-1.0" }
|
||||||
|
- match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
|
||||||
|
- match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.1.key: "1.0-2.0" }
|
||||||
|
- match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
|
||||||
|
- match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.2.key: "2.0-3.0" }
|
||||||
|
- match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
|
||||||
|
- match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.3.key: "3.0-4.0" }
|
||||||
|
- match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
|
||||||
|
- match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.4.key: "4.0-5.0" }
|
||||||
|
- match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
|
||||||
|
- match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.5.key: "5.0-6.0" }
|
||||||
|
- match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
|
||||||
|
- match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
|
||||||
|
|
||||||
|
---
|
||||||
|
"Test background filter count as sub - map":
|
||||||
|
- requires:
|
||||||
|
capabilities:
|
||||||
|
- method: POST
|
||||||
|
path: /_search
|
||||||
|
capabilities: [ significant_terms_background_filter_as_sub ]
|
||||||
|
test_runner_features: capabilities
|
||||||
|
reason: "bug fix"
|
||||||
|
|
||||||
|
- do:
|
||||||
|
indices.create:
|
||||||
|
index: goodbad
|
||||||
|
body:
|
||||||
|
settings:
|
||||||
|
number_of_shards: 1
|
||||||
|
mappings:
|
||||||
|
properties:
|
||||||
|
text:
|
||||||
|
type: keyword
|
||||||
|
class:
|
||||||
|
type: keyword
|
||||||
|
- do:
|
||||||
|
indices.create:
|
||||||
|
index: goodbad-2
|
||||||
|
body:
|
||||||
|
settings:
|
||||||
|
number_of_shards: 1
|
||||||
|
mappings:
|
||||||
|
properties:
|
||||||
|
text:
|
||||||
|
type: keyword
|
||||||
|
class:
|
||||||
|
type: keyword
|
||||||
|
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad-2
|
||||||
|
id: "1"
|
||||||
|
body: { group: 1, class: "bad" }
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad-2
|
||||||
|
id: "2"
|
||||||
|
body: { group: 1, class: "bad" }
|
||||||
|
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad
|
||||||
|
id: "1"
|
||||||
|
body: { group: 1, text: "good", class: "good" }
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad
|
||||||
|
id: "2"
|
||||||
|
body: { group: 1, text: "good", class: "good" }
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad
|
||||||
|
id: "3"
|
||||||
|
body: { group: 1, text: "bad", class: "bad" }
|
||||||
|
- do:
|
||||||
|
index:
|
||||||
|
index: goodbad
|
||||||
|
id: "4"
|
||||||
|
body: { group: 2, text: "bad", class: "bad" }
|
||||||
|
|
||||||
|
- do:
|
||||||
|
indices.refresh:
|
||||||
|
index: [goodbad, goodbad-2]
|
||||||
|
|
||||||
|
- do:
|
||||||
|
search:
|
||||||
|
rest_total_hits_as_int: true
|
||||||
|
index: goodbad*
|
||||||
|
- match: {hits.total: 6}
|
||||||
|
|
||||||
|
- do:
|
||||||
|
search:
|
||||||
|
index: goodbad*
|
||||||
|
body:
|
||||||
|
aggs:
|
||||||
|
group:
|
||||||
|
range:
|
||||||
|
field: group
|
||||||
|
ranges:
|
||||||
|
# Having many ranges helps catch an issue building no hits buckets
|
||||||
|
- to: 1
|
||||||
|
- from: 1
|
||||||
|
to: 2
|
||||||
|
- from: 2
|
||||||
|
to: 3
|
||||||
|
- from: 3
|
||||||
|
to: 4
|
||||||
|
- from: 4
|
||||||
|
to: 5
|
||||||
|
- from: 5
|
||||||
|
to: 6
|
||||||
|
aggs:
|
||||||
|
sig_terms:
|
||||||
|
significant_terms:
|
||||||
|
execution_hint: map
|
||||||
|
field: text
|
||||||
|
background_filter:
|
||||||
|
bool:
|
||||||
|
filter: [{term: {class: good }}]
|
||||||
|
- match: { aggregations.group.buckets.0.key: "*-1.0" }
|
||||||
|
- match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
|
||||||
|
- match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.1.key: "1.0-2.0" }
|
||||||
|
- match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
|
||||||
|
- match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.2.key: "2.0-3.0" }
|
||||||
|
- match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
|
||||||
|
- match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.3.key: "3.0-4.0" }
|
||||||
|
- match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
|
||||||
|
- match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.4.key: "4.0-5.0" }
|
||||||
|
- match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
|
||||||
|
- match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
|
||||||
|
- match: { aggregations.group.buckets.5.key: "5.0-6.0" }
|
||||||
|
- match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
|
||||||
|
- match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
|
||||||
|
|
||||||
---
|
---
|
||||||
"IP test":
|
"IP test":
|
||||||
- do:
|
- do:
|
||||||
|
|
|
@ -48,6 +48,8 @@ public final class SearchCapabilities {
|
||||||
|
|
||||||
private static final String INDEX_SELECTOR_SYNTAX = "index_expression_selectors";
|
private static final String INDEX_SELECTOR_SYNTAX = "index_expression_selectors";
|
||||||
|
|
||||||
|
private static final String SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB = "significant_terms_background_filter_as_sub";
|
||||||
|
|
||||||
public static final Set<String> CAPABILITIES;
|
public static final Set<String> CAPABILITIES;
|
||||||
static {
|
static {
|
||||||
HashSet<String> capabilities = new HashSet<>();
|
HashSet<String> capabilities = new HashSet<>();
|
||||||
|
@ -66,6 +68,7 @@ public final class SearchCapabilities {
|
||||||
capabilities.add(KQL_QUERY_SUPPORTED);
|
capabilities.add(KQL_QUERY_SUPPORTED);
|
||||||
capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT);
|
capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT);
|
||||||
capabilities.add(INDEX_SELECTOR_SYNTAX);
|
capabilities.add(INDEX_SELECTOR_SYNTAX);
|
||||||
|
capabilities.add(SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB);
|
||||||
CAPABILITIES = Set.copyOf(capabilities);
|
CAPABILITIES = Set.copyOf(capabilities);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1087,7 +1087,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
SignificantStringTerms buildNoValuesResult(long owningBucketOrdinal) {
|
SignificantStringTerms buildNoValuesResult(long owningBucketOrdinal) {
|
||||||
return buildEmptySignificantTermsAggregation(subsetSizes.get(owningBucketOrdinal), supersetSize, significanceHeuristic);
|
return buildEmptySignificantTermsAggregation(subsetSize(owningBucketOrdinal), supersetSize, significanceHeuristic);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -649,7 +649,7 @@ public final class MapStringTermsAggregator extends AbstractStringTermsAggregato
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd) {
|
BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd) {
|
||||||
long subsetSize = subsetSizes.get(owningBucketOrd);
|
long subsetSize = subsetSize(owningBucketOrd);
|
||||||
return (spare, ordsEnum, docCount) -> {
|
return (spare, ordsEnum, docCount) -> {
|
||||||
ordsEnum.readValue(spare.termBytes);
|
ordsEnum.readValue(spare.termBytes);
|
||||||
spare.subsetDf = docCount;
|
spare.subsetDf = docCount;
|
||||||
|
@ -696,7 +696,7 @@ public final class MapStringTermsAggregator extends AbstractStringTermsAggregato
|
||||||
bucketCountThresholds.getMinDocCount(),
|
bucketCountThresholds.getMinDocCount(),
|
||||||
metadata(),
|
metadata(),
|
||||||
format,
|
format,
|
||||||
subsetSizes.get(owningBucketOrd),
|
subsetSize(owningBucketOrd),
|
||||||
supersetSize,
|
supersetSize,
|
||||||
significanceHeuristic,
|
significanceHeuristic,
|
||||||
Arrays.asList(topBuckets)
|
Arrays.asList(topBuckets)
|
||||||
|
@ -712,5 +712,10 @@ public final class MapStringTermsAggregator extends AbstractStringTermsAggregato
|
||||||
public void close() {
|
public void close() {
|
||||||
Releasables.close(backgroundFrequencies, subsetSizes);
|
Releasables.close(backgroundFrequencies, subsetSizes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private long subsetSize(long owningBucketOrd) {
|
||||||
|
// if the owningBucketOrd is not in the array that means the bucket is empty so the size has to be 0
|
||||||
|
return owningBucketOrd < subsetSizes.size() ? subsetSizes.get(owningBucketOrd) : 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue