Fix a bug in significant_terms (#127975)

Fix a bug in the `significant_terms` agg where the "subsetSize" array is
too small because we never collect the ordinal for the agg "above" it.

This mostly hits when you do a `range` agg containing a
`significant_terms` AND you only collect the first few ranges. `range`
isn't particularly popular, but `date_histogram` is super popular and it
rewrites into a `range` pretty commonly - so that's likely what's really
hitting this - a `date_histogram` containing a `significant_text` where
the matches are all early in the date range held by the shard.
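
For illustration, here is a minimal sketch of the read-side guard the fix applies (compare the `subsetSize` helper added to `MapStringTermsAggregator` in the diff below). The class and method names here are hypothetical, not the actual Elasticsearch code: the per-bucket counter array only grows when a bucket actually collects a document, so a read for an owning bucket ordinal that never collected anything has to treat the missing slot as zero instead of indexing past the end of the array.

```java
import java.util.Arrays;

/**
 * Hypothetical stand-in for a per-bucket counter backed by a lazily grown array.
 * This is not the real Elasticsearch implementation (which uses a BigArrays-backed
 * LongArray); it only sketches the guard the fix introduces on the read path.
 */
class PerBucketCounter {
    private long[] counts = new long[0];

    /** Bump the count for bucket `ord`, growing the array only when that bucket collects. */
    void increment(int ord) {
        if (ord >= counts.length) {
            counts = Arrays.copyOf(counts, ord + 1);
        }
        counts[ord]++;
    }

    /** Buggy read: assumes every owning bucket ordinal collected at least one document. */
    long getUnsafe(int ord) {
        return counts[ord]; // throws ArrayIndexOutOfBoundsException for never-collected buckets
    }

    /** Fixed read: an ordinal beyond the array means the bucket is empty, so its count is 0. */
    long get(int ord) {
        return ord < counts.length ? counts[ord] : 0;
    }
}
```

In the bug's terms, the trailing `range`/`date_histogram` buckets that matched no documents were never collected, so the backing `subsetSizes` array stayed smaller than the owning bucket ordinals being read while building results.
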
Nik Everett authored 2025-05-09 13:48:19 -04:00, committed by GitHub
parent 5447909dc3
commit da553b11e3
5 changed files with 268 additions and 4 deletions

@@ -0,0 +1,5 @@
pr: 127975
summary: Fix a bug in `significant_terms`
area: Aggregations
type: bug
issues: []

@@ -73,7 +73,7 @@
- match: {aggregations.class.buckets.1.sig_terms.buckets.0.key: "good"}
---
-"Test background filter count ":
+"Test background filter count":
- requires:
cluster_features: ["gte_v7.15.0"]
reason: bugfix introduced in 7.15.0
@@ -153,6 +153,257 @@
index: goodbad*
body: {"aggs": {"sig_terms": {"significant_terms": {"field": "text", "background_filter": {"bool": {"filter": [{"term": {"class": "good" }}]}}}}}}
- match: { aggregations.sig_terms.bg_count: 2 }
---
"Test background filter count as sub - global ords":
- requires:
capabilities:
- method: POST
path: /_search
capabilities: [ significant_terms_background_filter_as_sub ]
test_runner_features: capabilities
reason: "bug fix"
- do:
indices.create:
index: goodbad
body:
settings:
number_of_shards: 1
mappings:
properties:
text:
type: keyword
class:
type: keyword
- do:
indices.create:
index: goodbad-2
body:
settings:
number_of_shards: 1
mappings:
properties:
text:
type: keyword
class:
type: keyword
- do:
index:
index: goodbad-2
id: "1"
body: { group: 1, class: "bad" }
- do:
index:
index: goodbad-2
id: "2"
body: { group: 1, class: "bad" }
- do:
index:
index: goodbad
id: "1"
body: { group: 1, text: "good", class: "good" }
- do:
index:
index: goodbad
id: "2"
body: { group: 1, text: "good", class: "good" }
- do:
index:
index: goodbad
id: "3"
body: { group: 1, text: "bad", class: "bad" }
- do:
index:
index: goodbad
id: "4"
body: { group: 2, text: "bad", class: "bad" }
- do:
indices.refresh:
index: [goodbad, goodbad-2]
- do:
search:
rest_total_hits_as_int: true
index: goodbad*
- match: {hits.total: 6}
- do:
search:
index: goodbad*
body:
aggs:
group:
range:
field: group
ranges:
# Having many ranges helps catch an issue building no hits buckets
- to: 1
- from: 1
to: 2
- from: 2
to: 3
- from: 3
to: 4
- from: 4
to: 5
- from: 5
to: 6
aggs:
sig_terms:
significant_terms:
execution_hint: global_ordinals
field: text
background_filter:
bool:
filter: [{term: {class: good }}]
- match: { aggregations.group.buckets.0.key: "*-1.0" }
- match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.1.key: "1.0-2.0" }
- match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
- match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.2.key: "2.0-3.0" }
- match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
- match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.3.key: "3.0-4.0" }
- match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.4.key: "4.0-5.0" }
- match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.5.key: "5.0-6.0" }
- match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
---
"Test background filter count as sub - map":
- requires:
capabilities:
- method: POST
path: /_search
capabilities: [ significant_terms_background_filter_as_sub ]
test_runner_features: capabilities
reason: "bug fix"
- do:
indices.create:
index: goodbad
body:
settings:
number_of_shards: 1
mappings:
properties:
text:
type: keyword
class:
type: keyword
- do:
indices.create:
index: goodbad-2
body:
settings:
number_of_shards: 1
mappings:
properties:
text:
type: keyword
class:
type: keyword
- do:
index:
index: goodbad-2
id: "1"
body: { group: 1, class: "bad" }
- do:
index:
index: goodbad-2
id: "2"
body: { group: 1, class: "bad" }
- do:
index:
index: goodbad
id: "1"
body: { group: 1, text: "good", class: "good" }
- do:
index:
index: goodbad
id: "2"
body: { group: 1, text: "good", class: "good" }
- do:
index:
index: goodbad
id: "3"
body: { group: 1, text: "bad", class: "bad" }
- do:
index:
index: goodbad
id: "4"
body: { group: 2, text: "bad", class: "bad" }
- do:
indices.refresh:
index: [goodbad, goodbad-2]
- do:
search:
rest_total_hits_as_int: true
index: goodbad*
- match: {hits.total: 6}
- do:
search:
index: goodbad*
body:
aggs:
group:
range:
field: group
ranges:
# Having many ranges helps catch an issue building no hits buckets
- to: 1
- from: 1
to: 2
- from: 2
to: 3
- from: 3
to: 4
- from: 4
to: 5
- from: 5
to: 6
aggs:
sig_terms:
significant_terms:
execution_hint: map
field: text
background_filter:
bool:
filter: [{term: {class: good }}]
- match: { aggregations.group.buckets.0.key: "*-1.0" }
- match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.1.key: "1.0-2.0" }
- match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 }
- match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.2.key: "2.0-3.0" }
- match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 }
- match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.3.key: "3.0-4.0" }
- match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.4.key: "4.0-5.0" }
- match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 }
- match: { aggregations.group.buckets.5.key: "5.0-6.0" }
- match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 }
- match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 }
---
"IP test":
- do:

@@ -48,6 +48,8 @@ public final class SearchCapabilities {
private static final String INDEX_SELECTOR_SYNTAX = "index_expression_selectors";
private static final String SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB = "significant_terms_background_filter_as_sub";
public static final Set<String> CAPABILITIES;
static {
HashSet<String> capabilities = new HashSet<>();
@@ -66,6 +68,7 @@
capabilities.add(KQL_QUERY_SUPPORTED);
capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT);
capabilities.add(INDEX_SELECTOR_SYNTAX);
capabilities.add(SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB);
CAPABILITIES = Set.copyOf(capabilities);
}
}

@@ -1087,7 +1087,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
@Override
SignificantStringTerms buildNoValuesResult(long owningBucketOrdinal) {
-return buildEmptySignificantTermsAggregation(subsetSizes.get(owningBucketOrdinal), supersetSize, significanceHeuristic);
+return buildEmptySignificantTermsAggregation(subsetSize(owningBucketOrdinal), supersetSize, significanceHeuristic);
}
@Override

@@ -649,7 +649,7 @@ public final class MapStringTermsAggregator extends AbstractStringTermsAggregato
@Override
BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd) {
-long subsetSize = subsetSizes.get(owningBucketOrd);
+long subsetSize = subsetSize(owningBucketOrd);
return (spare, ordsEnum, docCount) -> {
ordsEnum.readValue(spare.termBytes);
spare.subsetDf = docCount;
@@ -696,7 +696,7 @@
bucketCountThresholds.getMinDocCount(),
metadata(),
format,
-subsetSizes.get(owningBucketOrd),
+subsetSize(owningBucketOrd),
supersetSize,
significanceHeuristic,
Arrays.asList(topBuckets)
@@ -712,5 +712,10 @@
public void close() {
Releasables.close(backgroundFrequencies, subsetSizes);
}
private long subsetSize(long owningBucketOrd) {
// if the owningBucketOrd is not in the array that means the bucket is empty so the size has to be 0
return owningBucketOrd < subsetSizes.size() ? subsetSizes.get(owningBucketOrd) : 0;
}
}
}