[Response Ops][Alerting] Expose shard failures from querying over CCS indices in ES query rule type (#189312)

2025-04-24 09:48:58 -04:00 · 2024-09-05 21:48:46 -04:00 · 2024-09-05 21:48:46 -04:00 · 4a0919a7ea
commit 4a0919a7ea
parent ffda01784d
9 changed files with 657 additions and 8 deletions
--- a/x-pack/plugins/alerting/server/lib/rule_execution_status.test.ts
+++ b/x-pack/plugins/alerting/server/lib/rule_execution_status.test.ts
@ -188,6 +188,53 @@ describe('RuleExecutionStatus', () => {
      expect(status.error).toEqual({ message: 'an error', reason: 'unknown' });
      expect(status.warning).toBe(undefined);
    });
+
+    test('task state with rule execution warning', () => {
+      const ruleResultService = new RuleResultService();
+      // Only the rule type reports a warning here; the framework adds none of its own.
+      ruleResultService.getLastRunSetters().addLastRunWarning('a rule execution warning');
+      const { status, metrics } = executionStatusFromState({
+        stateWithMetrics: {
+          alertInstances: { a: {} },
+          metrics: executionMetrics,
+        },
+        ruleResultService,
+      });
+      checkDateIsNearNow(status.lastExecutionDate);
+      expect(status.warning).toEqual({
+        message: `a rule execution warning`,
+        reason: RuleExecutionStatusWarningReasons.EXECUTION,
+      });
+      expect(status.status).toBe('warning');
+      expect(status.error).toBe(undefined);
+
+      testExpectedMetrics(metrics!, executionMetrics);
+    });
+
+    test('task state with framework warning and rule execution warning - only show framework warning', () => {
+      const ruleResultService = new RuleResultService();
+      // PARTIAL actions status yields a framework warning, which must win over the rule's own.
+      ruleResultService.getLastRunSetters().addLastRunWarning('a rule execution warning');
+      const { status, metrics } = executionStatusFromState({
+        stateWithMetrics: {
+          alertInstances: { a: {} },
+          metrics: { ...executionMetrics, triggeredActionsStatus: ActionsCompletion.PARTIAL },
+        },
+        ruleResultService,
+      });
+      checkDateIsNearNow(status.lastExecutionDate);
+      expect(status.warning).toEqual({
+        message: translations.taskRunner.warning.maxExecutableActions,
+        reason: RuleExecutionStatusWarningReasons.MAX_EXECUTABLE_ACTIONS,
+      });
+      expect(status.status).toBe('warning');
+      expect(status.error).toBe(undefined);
+
+      testExpectedMetrics(metrics!, {
+        ...executionMetrics,
+        triggeredActionsStatus: ActionsCompletion.PARTIAL,
+      });
+    });
  });

  describe('executionStatusFromError()', () => {
--- a/x-pack/plugins/alerting/server/lib/rule_execution_status.ts
+++ b/x-pack/plugins/alerting/server/lib/rule_execution_status.ts
@ -70,7 +70,8 @@ export function executionStatusFromState({
  }

  // Overwrite status to be error if last run reported any errors
-  const { errors: errorsFromLastRun } = ruleResultService.getLastRunResults();
+  const { errors: errorsFromLastRun, warnings: warningsFromLastRun } =
+    ruleResultService.getLastRunResults();
  if (errorsFromLastRun.length > 0) {
    status = RuleExecutionStatusValues[2];
    // These errors are reported by ruleResultService.addLastRunError, therefore they are landed in successful execution map
@ -80,6 +81,15 @@ export function executionStatusFromState({
    };
  }

+  // Set warning status if last run reported any warnings and framework has not set any warnings
+  if (warningsFromLastRun.length > 0 && !warning) {
+    status = RuleExecutionStatusValues[5];
+    warning = {
+      reason: RuleExecutionStatusWarningReasons.EXECUTION,
+      message: warningsFromLastRun.join(','),
+    };
+  }
+
  return {
    status: {
      lastExecutionDate: lastExecutionDate ?? new Date(),
--- a/x-pack/plugins/stack_alerts/server/rule_types/es_query/executor.ts
+++ b/x-pack/plugins/stack_alerts/server/rule_types/es_query/executor.ts
@ -53,7 +53,7 @@ export async function executor(core: CoreSetup, options: ExecutorOptions<EsQuery
    logger,
    getTimeRange,
  } = options;
-  const { alertsClient, scopedClusterClient, share } = services;
+  const { alertsClient, ruleResultService, scopedClusterClient, share } = services;

  if (!alertsClient) {
    throw new AlertsClientError();
@ -89,6 +89,7 @@ export async function executor(core: CoreSetup, options: ExecutorOptions<EsQuery
          getSearchSourceClient: services.getSearchSourceClient,
          logger,
          getDataViews: services.getDataViews,
+          ruleResultService,
        },
        dateStart,
        dateEnd,
@ -119,6 +120,7 @@ export async function executor(core: CoreSetup, options: ExecutorOptions<EsQuery
        services: {
          scopedClusterClient,
          logger,
+          ruleResultService,
        },
        dateStart,
        dateEnd,
--- a/x-pack/plugins/stack_alerts/server/rule_types/es_query/lib/fetch_es_query.test.ts
+++ b/x-pack/plugins/stack_alerts/server/rule_types/es_query/lib/fetch_es_query.test.ts
@ -9,7 +9,9 @@ import { OnlyEsQueryRuleParams } from '../types';
 import { Comparator } from '../../../../common/comparator_types';
 import { fetchEsQuery } from './fetch_es_query';
 import { elasticsearchServiceMock } from '@kbn/core-elasticsearch-server-mocks';
+import { elasticsearchClientMock } from '@kbn/core-elasticsearch-client-server-mocks';
 import { loggerMock } from '@kbn/logging-mocks';
+import { publicRuleResultServiceMock } from '@kbn/alerting-plugin/server/monitoring/rule_result_service.mock';

 jest.mock('@kbn/triggers-actions-ui-plugin/common', () => {
  const actual = jest.requireActual('@kbn/triggers-actions-ui-plugin/common');
@ -37,6 +39,7 @@ const defaultParams: OnlyEsQueryRuleParams = {

 const logger = loggerMock.create();
 const scopedClusterClientMock = elasticsearchServiceMock.createScopedClusterClient();
+const mockRuleResultService = publicRuleResultServiceMock.create();

 describe('fetchEsQuery', () => {
  beforeAll(() => {
@ -52,6 +55,7 @@ describe('fetchEsQuery', () => {
  const services = {
    scopedClusterClient: scopedClusterClientMock,
    logger,
+    ruleResultService: mockRuleResultService,
  };
  it('should add time filter if timestamp if defined and excludeHitsFromPreviousRun is true', async () => {
    const params = defaultParams;
@ -479,4 +483,139 @@ describe('fetchEsQuery', () => {
      { meta: true }
    );
  });
+
+  it('should bubble up CCS errors stored in the _shards field of the search result', async () => {
+    scopedClusterClientMock.asCurrentUser.search.mockResolvedValueOnce(
+      elasticsearchClientMock.createSuccessTransportRequestPromise({
+        took: 16,
+        timed_out: false,
+        _shards: {
+          total: 51,
+          successful: 48,
+          skipped: 48,
+          failed: 3,
+          failures: [
+            {
+              shard: 0,
+              index: 'ccs-index',
+              node: '8jMc8jz-Q6qFmKZXfijt-A',
+              reason: {
+                type: 'illegal_argument_exception',
+                reason:
+                  "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+              },
+            },
+          ],
+        },
+        hits: {
+          total: {
+            value: 0,
+            relation: 'eq',
+          },
+          max_score: 0,
+          hits: [],
+        },
+      })
+    );
+
+    await fetchEsQuery({
+      ruleId: 'abc',
+      name: 'test-rule',
+      params: defaultParams,
+      timestamp: '2020-02-09T23:15:41.941Z',
+      services,
+      spacePrefix: '',
+      publicBaseUrl: '',
+      dateStart: new Date().toISOString(),
+      dateEnd: new Date().toISOString(),
+    });
+
+    expect(mockRuleResultService.addLastRunWarning).toHaveBeenCalledWith(
+      `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+    );
+    expect(mockRuleResultService.setLastRunOutcomeMessage).toHaveBeenCalledWith(
+      `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+    );
+  });
+
+  it('should bubble up CCS errors stored in the _clusters field of the search result', async () => {
+    scopedClusterClientMock.asCurrentUser.search.mockResolvedValueOnce(
+      // @ts-expect-error - _clusters.details not a valid response but it is irl
+      elasticsearchClientMock.createSuccessTransportRequestPromise({
+        took: 6,
+        timed_out: false,
+        num_reduce_phases: 0,
+        _shards: { total: 0, successful: 0, skipped: 0, failed: 0 },
+        _clusters: {
+          total: 1,
+          successful: 0,
+          skipped: 1,
+          running: 0,
+          partial: 0,
+          failed: 0,
+          details: {
+            test: {
+              status: 'skipped',
+              indices: '.kibana-event-log*',
+              timed_out: false,
+              failures: [
+                {
+                  shard: -1,
+                  index: null,
+                  reason: {
+                    type: 'search_phase_execution_exception',
+                    reason: 'all shards failed',
+                    phase: 'query',
+                    grouped: true,
+                    failed_shards: [
+                      {
+                        shard: 0,
+                        index: 'test:.ds-.kibana-event-log-ds-2024.07.31-000001',
+                        node: 'X1aMu4BpQR-7PHi-bEI8Fw',
+                        reason: {
+                          type: 'illegal_argument_exception',
+                          reason:
+                            "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                        },
+                      },
+                    ],
+                    caused_by: {
+                      type: '',
+                      reason:
+                        "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                      caused_by: {
+                        type: 'illegal_argument_exception',
+                        reason:
+                          "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                      },
+                    },
+                  },
+                },
+              ],
+            },
+          },
+        },
+        hits: { total: { value: 0, relation: 'eq' }, max_score: 0, hits: [] },
+      })
+    );
+
+    await fetchEsQuery({
+      ruleId: 'abc',
+      name: 'test-rule',
+      params: defaultParams,
+      timestamp: '2020-02-09T23:15:41.941Z',
+      services,
+      spacePrefix: '',
+      publicBaseUrl: '',
+      dateStart: new Date().toISOString(),
+      dateEnd: new Date().toISOString(),
+    });
+
+    expect(mockRuleResultService.addLastRunWarning).toHaveBeenCalledWith(
+      `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+    );
+    expect(mockRuleResultService.setLastRunOutcomeMessage).toHaveBeenCalledWith(
+      `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+    );
+  });
 });
--- a/x-pack/plugins/stack_alerts/server/rule_types/es_query/lib/fetch_es_query.ts
+++ b/x-pack/plugins/stack_alerts/server/rule_types/es_query/lib/fetch_es_query.ts
@ -13,10 +13,11 @@ import {
 } from '@kbn/triggers-actions-ui-plugin/common';
 import { isGroupAggregation } from '@kbn/triggers-actions-ui-plugin/common';
 import { ES_QUERY_ID } from '@kbn/rule-data-utils';
+import { PublicRuleResultService } from '@kbn/alerting-plugin/server/types';
 import { getComparatorScript } from '../../../../common';
 import { OnlyEsQueryRuleParams } from '../types';
 import { buildSortedEventsQuery } from '../../../../common/build_sorted_events_query';
-import { getParsedQuery } from '../util';
+import { getParsedQuery, checkForShardFailures } from '../util';

 export interface FetchEsQueryOpts {
  ruleId: string;
@ -28,6 +29,7 @@ export interface FetchEsQueryOpts {
  services: {
    scopedClusterClient: IScopedClusterClient;
    logger: Logger;
+    ruleResultService?: PublicRuleResultService;
  };
  alertLimit?: number;
  dateStart: string;
@ -49,7 +51,7 @@ export async function fetchEsQuery({
  dateStart,
  dateEnd,
 }: FetchEsQueryOpts) {
-  const { scopedClusterClient, logger } = services;
+  const { scopedClusterClient, logger, ruleResultService } = services;
  const esClient = scopedClusterClient.asCurrentUser;
  const isGroupAgg = isGroupAggregation(params.termField);
  const isCountAgg = isCountAggregation(params.aggType);
@ -135,6 +137,14 @@ export async function fetchEsQuery({
      ` es query rule ${ES_QUERY_ID}:${ruleId} "${name}" result - ${JSON.stringify(searchResult)}`
  );

+  // result against CCS indices will return success response with errors nested within
+  // the _shards or _clusters field; look for these errors and bubble them up
+  const anyShardFailures = checkForShardFailures(searchResult);
+  if (anyShardFailures && ruleResultService) {
+    ruleResultService.addLastRunWarning(anyShardFailures);
+    ruleResultService.setLastRunOutcomeMessage(anyShardFailures);
+  }
+
  const link = `${publicBaseUrl}${spacePrefix}/app/management/insightsAndAlerting/triggersActions/rule/${ruleId}`;

  return {
--- a/x-pack/plugins/stack_alerts/server/rule_types/es_query/lib/fetch_search_source_query.test.ts
+++ b/x-pack/plugins/stack_alerts/server/rule_types/es_query/lib/fetch_search_source_query.test.ts
@ -6,8 +6,11 @@
 */

 import { OnlySearchSourceRuleParams } from '../types';
+import {
+  createSearchSourceMock,
+  searchSourceInstanceMock,
+} from '@kbn/data-plugin/common/search/search_source/mocks';
 import { searchSourceCommonMock } from '@kbn/data-plugin/common/search/search_source/mocks';
-import { createSearchSourceMock } from '@kbn/data-plugin/common/search/search_source/mocks';
 import { loggerMock } from '@kbn/logging-mocks';
 import {
  updateSearchSource,
@ -26,6 +29,7 @@ import { Comparator } from '../../../../common/comparator_types';
 import { dataViewPluginMocks } from '@kbn/data-views-plugin/public/mocks';
 import { DiscoverAppLocatorParams } from '@kbn/discover-plugin/common';
 import { LocatorPublic } from '@kbn/share-plugin/common';
+import { publicRuleResultServiceMock } from '@kbn/alerting-plugin/server/monitoring/rule_result_service.mock';
 import { SavedObjectsErrorHelpers } from '@kbn/core-saved-objects-server';
 import {
  getErrorSource,
@ -72,12 +76,13 @@ const defaultParams: OnlySearchSourceRuleParams = {
 };

 const logger = loggerMock.create();
+const mockRuleResultService = publicRuleResultServiceMock.create();

 describe('fetchSearchSourceQuery', () => {
  const dataViewMock = createDataView();

  afterAll(() => {
-    jest.resetAllMocks();
+    jest.clearAllMocks();
  });

  const fakeNow = new Date('2020-02-09T23:15:41.941Z');
@ -431,6 +436,204 @@ describe('fetchSearchSourceQuery', () => {
      expect(logger.warn).toHaveBeenCalledWith('Top hits size is capped at 100');
    });

+    it('should bubble up CCS errors stored in the _shards field of the search result', async () => {
+      const response = {
+        took: 16,
+        timed_out: false,
+        _shards: {
+          total: 51,
+          successful: 48,
+          skipped: 48,
+          failed: 3,
+          failures: [
+            {
+              shard: 0,
+              index: 'ccs-index',
+              node: '8jMc8jz-Q6qFmKZXfijt-A',
+              reason: {
+                type: 'illegal_argument_exception',
+                reason:
+                  "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+              },
+            },
+          ],
+        },
+        hits: {
+          total: {
+            value: 0,
+            relation: 'eq',
+          },
+          max_score: 0,
+          hits: [],
+        },
+      };
+
+      (searchSourceInstanceMock.getField as jest.Mock).mockImplementationOnce(
+        jest.fn().mockReturnValue(dataViewMock)
+      );
+      (searchSourceInstanceMock.setField as jest.Mock).mockImplementationOnce(
+        jest.fn().mockReturnValue(undefined)
+      );
+      (searchSourceInstanceMock.createChild as jest.Mock).mockImplementationOnce(
+        jest.fn().mockReturnValue(searchSourceInstanceMock)
+      );
+      (searchSourceInstanceMock.fetch as jest.Mock).mockImplementationOnce(
+        jest.fn().mockReturnValue(response)
+      );
+
+      // const searchSourceInstance = createSearchSourceMock({}, response);
+      searchSourceCommonMock.createLazy.mockResolvedValueOnce(searchSourceInstanceMock);
+
+      await fetchSearchSourceQuery({
+        ruleId: 'abc',
+        params: defaultParams,
+        services: {
+          logger,
+          getSearchSourceClient: async () => searchSourceCommonMock,
+          ruleResultService: mockRuleResultService,
+          share: {
+            url: {
+              // @ts-expect-error
+              locators: {
+                get: jest.fn().mockReturnValue({
+                  getRedirectUrl: jest.fn(() => '/app/r?l=DISCOVER_APP_LOCATOR'),
+                } as unknown as LocatorPublic<DiscoverAppLocatorParams>),
+              },
+            },
+          },
+          getDataViews: async () => {
+            return {
+              ...dataViewPluginMocks.createStartContract(),
+              create: async (spec: DataViewSpec) =>
+                new DataView({ spec, fieldFormats: fieldFormatsMock }),
+            };
+          },
+        },
+        spacePrefix: '',
+        dateStart: new Date().toISOString(),
+        dateEnd: new Date().toISOString(),
+      });
+
+      expect(mockRuleResultService.addLastRunWarning).toHaveBeenCalledWith(
+        `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+      );
+      expect(mockRuleResultService.setLastRunOutcomeMessage).toHaveBeenCalledWith(
+        `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+      );
+    });
+
+    it('should bubble up CCS errors stored in the _clusters field of the search result', async () => {
+      const response = {
+        took: 6,
+        timed_out: false,
+        num_reduce_phases: 0,
+        _shards: { total: 0, successful: 0, skipped: 0, failed: 0 },
+        _clusters: {
+          total: 1,
+          successful: 0,
+          skipped: 1,
+          running: 0,
+          partial: 0,
+          failed: 0,
+          details: {
+            test: {
+              status: 'skipped',
+              indices: '.kibana-event-log*',
+              timed_out: false,
+              failures: [
+                {
+                  shard: -1,
+                  index: null,
+                  reason: {
+                    type: 'search_phase_execution_exception',
+                    reason: 'all shards failed',
+                    phase: 'query',
+                    grouped: true,
+                    failed_shards: [
+                      {
+                        shard: 0,
+                        index: 'test:.ds-.kibana-event-log-ds-2024.07.31-000001',
+                        node: 'X1aMu4BpQR-7PHi-bEI8Fw',
+                        reason: {
+                          type: 'illegal_argument_exception',
+                          reason:
+                            "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                        },
+                      },
+                    ],
+                    caused_by: {
+                      type: '',
+                      reason:
+                        "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                      caused_by: {
+                        type: 'illegal_argument_exception',
+                        reason:
+                          "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                      },
+                    },
+                  },
+                },
+              ],
+            },
+          },
+        },
+        hits: { total: { value: 0, relation: 'eq' }, max_score: 0, hits: [] },
+      };
+
+      (searchSourceInstanceMock.getField as jest.Mock).mockImplementationOnce(
+        jest.fn().mockReturnValue(dataViewMock)
+      );
+      (searchSourceInstanceMock.setField as jest.Mock).mockImplementationOnce(
+        jest.fn().mockReturnValue(undefined)
+      );
+      (searchSourceInstanceMock.createChild as jest.Mock).mockImplementationOnce(
+        jest.fn().mockReturnValue(searchSourceInstanceMock)
+      );
+      (searchSourceInstanceMock.fetch as jest.Mock).mockImplementationOnce(
+        jest.fn().mockReturnValue(response)
+      );
+
+      // const searchSourceInstance = createSearchSourceMock({}, response);
+      searchSourceCommonMock.createLazy.mockResolvedValueOnce(searchSourceInstanceMock);
+
+      await fetchSearchSourceQuery({
+        ruleId: 'abc',
+        params: defaultParams,
+        services: {
+          logger,
+          getSearchSourceClient: async () => searchSourceCommonMock,
+          ruleResultService: mockRuleResultService,
+          share: {
+            url: {
+              // @ts-expect-error
+              locators: {
+                get: jest.fn().mockReturnValue({
+                  getRedirectUrl: jest.fn(() => '/app/r?l=DISCOVER_APP_LOCATOR'),
+                } as unknown as LocatorPublic<DiscoverAppLocatorParams>),
+              },
+            },
+          },
+          getDataViews: async () => {
+            return {
+              ...dataViewPluginMocks.createStartContract(),
+              create: async (spec: DataViewSpec) =>
+                new DataView({ spec, fieldFormats: fieldFormatsMock }),
+            };
+          },
+        },
+        spacePrefix: '',
+        dateStart: new Date().toISOString(),
+        dateEnd: new Date().toISOString(),
+      });
+
+      expect(mockRuleResultService.addLastRunWarning).toHaveBeenCalledWith(
+        `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+      );
+      expect(mockRuleResultService.setLastRunOutcomeMessage).toHaveBeenCalledWith(
+        `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+      );
+    });
+
    it('should throw user error if data view is not found', async () => {
      searchSourceCommonMock.createLazy.mockImplementationOnce(() => {
        throw SavedObjectsErrorHelpers.createGenericNotFoundError('index-pattern', 'abc');
--- a/x-pack/plugins/stack_alerts/server/rule_types/es_query/lib/fetch_search_source_query.ts
+++ b/x-pack/plugins/stack_alerts/server/rule_types/es_query/lib/fetch_search_source_query.ts
@ -24,9 +24,11 @@ import { SharePluginStart } from '@kbn/share-plugin/server';
 import { DiscoverAppLocatorParams } from '@kbn/discover-plugin/common';
 import { Logger, SavedObjectsErrorHelpers } from '@kbn/core/server';
 import { LocatorPublic } from '@kbn/share-plugin/common';
+import { PublicRuleResultService } from '@kbn/alerting-plugin/server/types';
 import { createTaskRunError, TaskErrorSource } from '@kbn/task-manager-plugin/server';
 import { OnlySearchSourceRuleParams } from '../types';
 import { getComparatorScript } from '../../../../common';
+import { checkForShardFailures } from '../util';

 export interface FetchSearchSourceQueryOpts {
  ruleId: string;
@ -39,6 +41,7 @@ export interface FetchSearchSourceQueryOpts {
    getSearchSourceClient: () => Promise<ISearchStartSearchSource>;
    share: SharePluginStart;
    getDataViews: () => Promise<DataViewsContract>;
+    ruleResultService?: PublicRuleResultService;
  };
  dateStart: string;
  dateEnd: string;
@ -54,7 +57,7 @@ export async function fetchSearchSourceQuery({
  dateStart,
  dateEnd,
 }: FetchSearchSourceQueryOpts) {
-  const { logger, getSearchSourceClient } = services;
+  const { logger, getSearchSourceClient, ruleResultService } = services;
  const searchSourceClient = await getSearchSourceClient();
  const isGroupAgg = isGroupAggregation(params.termField);
  const isCountAgg = isCountAggregation(params.aggType);
@ -88,6 +91,14 @@ export async function fetchSearchSourceQuery({

  const searchResult = await searchSource.fetch();

+  // result against CCS indices will return success response with errors nested within
+  // the _shards or _clusters field; look for these errors and bubble them up
+  const anyShardFailures = checkForShardFailures(searchResult);
+  if (anyShardFailures && ruleResultService) {
+    ruleResultService.addLastRunWarning(anyShardFailures);
+    ruleResultService.setLastRunOutcomeMessage(anyShardFailures);
+  }
+
  const link = await generateLink(
    initialSearchSource,
    services.share.url.locators.get<DiscoverAppLocatorParams>('DISCOVER_APP_LOCATOR')!,
--- a/x-pack/plugins/stack_alerts/server/rule_types/es_query/util.test.ts
+++ b/x-pack/plugins/stack_alerts/server/rule_types/es_query/util.test.ts
@ -7,7 +7,7 @@

 import { OnlyEsQueryRuleParams } from './types';
 import { Comparator } from '../../../common/comparator_types';
-import { getParsedQuery } from './util';
+import { getParsedQuery, checkForShardFailures } from './util';

 describe('es_query utils', () => {
  const defaultProps = {
@ -48,4 +48,209 @@ describe('es_query utils', () => {
      ).toThrow('invalid query specified: "{ "someProperty": "test-query" }" - query must be JSON');
    });
  });
+
+  describe('parseShardFailures', () => {
+    it('should return error message if any failures in the shard response', () => {
+      expect(
+        checkForShardFailures({
+          took: 16,
+          timed_out: false,
+          _shards: {
+            total: 51,
+            successful: 48,
+            skipped: 48,
+            failed: 3,
+            failures: [
+              {
+                shard: 0,
+                index: 'ccs-index',
+                node: '8jMc8jz-Q6qFmKZXfijt-A',
+                reason: {
+                  type: 'illegal_argument_exception',
+                  reason:
+                    "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                },
+              },
+            ],
+          },
+          _clusters: { total: 1, successful: 1, running: 0, partial: 0, failed: 0, skipped: 0 },
+          hits: { total: { value: 0, relation: 'eq' }, max_score: 0, hits: [] },
+        })
+      ).toEqual(
+        `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+      );
+    });
+
+    it('should return default error message if malformed error', () => {
+      expect(
+        checkForShardFailures({
+          took: 16,
+          timed_out: false,
+          _shards: {
+            total: 51,
+            successful: 48,
+            skipped: 48,
+            failed: 3,
+            failures: [
+              // @ts-expect-error
+              {
+                shard: 0,
+                index: 'ccs-index',
+                node: '8jMc8jz-Q6qFmKZXfijt-A',
+              },
+            ],
+          },
+          _clusters: { total: 1, successful: 1, running: 0, partial: 0, failed: 0, skipped: 0 },
+          hits: { total: { value: 0, relation: 'eq' }, max_score: 0, hits: [] },
+        })
+      ).toEqual(`Search returned partial results due to shard failures.`);
+
+      expect(
+        checkForShardFailures({
+          took: 16,
+          timed_out: false,
+          _shards: { total: 51, successful: 48, skipped: 48, failed: 3, failures: [] },
+          hits: {
+            total: {
+              value: 0,
+              relation: 'eq',
+            },
+            max_score: 0,
+            hits: [],
+          },
+        })
+      ).toEqual(`Search returned partial results due to shard failures.`);
+    });
+
+    it('should return error if any skipped clusters with failures', () => {
+      expect(
+        checkForShardFailures({
+          took: 6,
+          timed_out: false,
+          num_reduce_phases: 0,
+          _shards: { total: 0, successful: 0, skipped: 0, failed: 0 },
+          _clusters: {
+            total: 1,
+            successful: 0,
+            skipped: 1,
+            running: 0,
+            partial: 0,
+            failed: 0,
+            details: {
+              test: {
+                status: 'skipped',
+                indices: '.kibana-event-log*',
+                timed_out: false,
+                failures: [
+                  {
+                    shard: -1,
+                    // @ts-expect-error
+                    index: null,
+                    reason: {
+                      type: 'search_phase_execution_exception',
+                      reason: 'all shards failed',
+                      phase: 'query',
+                      grouped: true,
+                      failed_shards: [
+                        {
+                          shard: 0,
+                          index: 'test:.ds-.kibana-event-log-ds-2024.07.31-000001',
+                          node: 'X1aMu4BpQR-7PHi-bEI8Fw',
+                          reason: {
+                            type: 'illegal_argument_exception',
+                            reason:
+                              "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                          },
+                        },
+                      ],
+                      caused_by: {
+                        type: '',
+                        reason:
+                          "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                        caused_by: {
+                          type: 'illegal_argument_exception',
+                          reason:
+                            "Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.",
+                        },
+                      },
+                    },
+                  },
+                ],
+              },
+            },
+          },
+          hits: { total: { value: 0, relation: 'eq' }, max_score: 0, hits: [] },
+        })
+      ).toEqual(
+        `Top hits result window is too large, the top hits aggregator [topHitsAgg]'s from + size must be less than or equal to: [100] but was [300]. This limit can be set by changing the [index.max_inner_result_window] index level setting.`
+      );
+    });
+
+    it('should return default error message if malformed skipped cluster error', () => {
+      expect(
+        checkForShardFailures({
+          took: 6,
+          timed_out: false,
+          num_reduce_phases: 0,
+          _shards: { total: 0, successful: 0, skipped: 0, failed: 0 },
+          _clusters: {
+            total: 1,
+            successful: 0,
+            skipped: 1,
+            running: 0,
+            partial: 0,
+            failed: 0,
+            details: {
+              test: {
+                status: 'skipped',
+                indices: '.kibana-event-log*',
+                timed_out: false,
+                failures: [],
+              },
+            },
+          },
+          hits: { total: { value: 0, relation: 'eq' }, max_score: 0, hits: [] },
+        })
+      ).toEqual(`Search returned partial results due to skipped cluster errors.`);
+
+      expect(
+        checkForShardFailures({
+          took: 6,
+          timed_out: false,
+          num_reduce_phases: 0,
+          _shards: { total: 0, successful: 0, skipped: 0, failed: 0 },
+          _clusters: {
+            total: 1,
+            successful: 0,
+            skipped: 1,
+            running: 0,
+            partial: 0,
+            failed: 0,
+            details: {
+              test: {
+                status: 'skipped',
+                indices: '.kibana-event-log*',
+                timed_out: false,
+                // @ts-expect-error
+                failures: [{ shard: -1 }],
+              },
+            },
+          },
+          hits: { total: { value: 0, relation: 'eq' }, max_score: 0, hits: [] },
+        })
+      ).toEqual(`Search returned partial results due to skipped cluster errors.`);
+    });
+
+    it('should return undefined if no failures', () => {
+      expect(
+        checkForShardFailures({
+          took: 16,
+          timed_out: false,
+          _shards: { total: 51, successful: 51, skipped: 51, failed: 0, failures: [] },
+          _clusters: { total: 1, successful: 1, running: 0, partial: 0, failed: 0, skipped: 0 },
+          hits: { total: { value: 0, relation: 'eq' }, max_score: 0, hits: [] },
+        })
+      ).toBeUndefined();
+    });
+  });
 });
--- a/x-pack/plugins/stack_alerts/server/rule_types/es_query/util.ts
+++ b/x-pack/plugins/stack_alerts/server/rule_types/es_query/util.ts
@ -6,6 +6,7 @@
 */

 import { i18n } from '@kbn/i18n';
+import { SearchResponse } from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
 import { OnlyEsQueryRuleParams } from './types';
 import { EsQueryRuleParams } from './rule_type_params';

@ -46,3 +47,24 @@ function getInvalidQueryError(query: string) {
    },
  });
 }
+
+/**
+ * Surfaces failures nested inside an otherwise successful search response. Returns the first
+ * readable reason from `_shards.failures`, else from any `_clusters.details` entry when clusters
+ * were skipped; falls back to a generic message, or undefined if nothing indicates a failure.
+ */
+export function checkForShardFailures(searchResult: SearchResponse<unknown>): string | undefined {
+  if ((searchResult?._shards?.failed ?? 0) > 0) {
+    const shardReason = searchResult?._shards?.failures?.[0]?.reason?.reason;
+    return shardReason || 'Search returned partial results due to shard failures.';
+  }
+
+  const clusterDetails = Object.values(searchResult?._clusters?.details ?? {});
+  if ((searchResult?._clusters?.skipped ?? 0) > 0 && clusterDetails.length > 0) {
+    // Scan every cluster; a healthy first entry must not mask a later cluster's failure.
+    const clusterReason = clusterDetails
+      .map((detail) => detail?.failures?.[0]?.reason?.caused_by?.reason)
+      .find((reason) => !!reason);
+    return clusterReason || 'Search returned partial results due to skipped cluster errors.';
+  }
+}