[Response Ops][Alerting] Switch to logger.warn for no_shard_available_action_exception encountered when collecting telemetry (#205520)

Resolves https://github.com/elastic/response-ops-team/issues/221

## Summary

Looking through the logs, the majority of these errors share the
following stack trace (14,000+ occurrences in the last 7 days):

```
ResponseError: search_phase_execution_exception
	Root causes:
		no_shard_available_action_exception: [es-es-search-686bf6b747-xldl8][100.65.77.183:9300][indices:data/read/search[phase/query]]
		no_shard_available_action_exception: null
    at KibanaTransport._request (/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:543:27)
    at processTicksAndRejections (node:internal/process/task_queues:95:5)
    at /usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:641:32
    at KibanaTransport.request (/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:637:20)
    at KibanaTransport.request (/usr/share/kibana/node_modules/@kbn/core-elasticsearch-client-server-internal/src/create_transport.js:60:16)
    at ClientTraced.SearchApi [as search] (/usr/share/kibana/node_modules/@elastic/elasticsearch/lib/api/api/search.js:72:12)
    at getTotalAlertsCountAggregations (/usr/share/kibana/node_modules/@kbn/alerting-plugin/server/usage/lib/get_telemetry_from_alerts.js:42:21)
    at async Promise.all (index 6)
    at TaskManagerRunner.run (/usr/share/kibana/node_modules/@kbn/task-manager-plugin/server/task_running/task_runner.js:325:22)
```

Looking through the code, we already catch these errors and return a
default response for that telemetry object. The
`no_shard_available_action_exception` is not an issue with Kibana but an
ES issue, so this PR catches these errors and logs them at the debug
level instead of the warn level to avoid polluting the logs with errors
we have no control over.
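
The handling lives in a small helper (`parse_and_log_error.ts`), added separately for the actions and alerting plugins in the diff below. A condensed sketch of the alerting variant, slightly simplified from the committed code:

```ts
import { Logger } from '@kbn/core/server';

// Condensed from the parseAndLogError helper added in this PR: shard-availability
// errors are downgraded to debug, everything else keeps logging at warn.
export function parseAndLogError(err: Error, errType: string, logger: Logger): string {
  const logMessage = `Error executing alerting telemetry task: ${errType} - ${err}`;
  const logOptions = {
    tags: ['alerting', 'telemetry-failed'],
    error: { stack_trace: err.stack },
  };

  if (JSON.stringify(err).includes('no_shard_available_action_exception')) {
    // The exception text is wordy and the returned message ends up in the task
    // state, so collapse it to the exception type and only log at debug.
    if (logger.isLevelEnabled('debug')) {
      logger.debug(logMessage, logOptions);
    }
    return 'no_shard_available_action_exception';
  }

  logger.warn(logMessage, logOptions);
  return err.message ? err.message : err.toString();
}
```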

Excluding those results, we see a different, less frequent stack trace
(100+ occurrences in the last 15 days):

```
TypeError: Cannot read properties of undefined (reading 'by_rule_type_id')
    at getTotalAlertsCountAggregations (/usr/share/kibana/node_modules/@kbn/alerting-plugin/server/usage/lib/get_telemetry_from_alerts.js:49:109)
    at processTicksAndRejections (node:internal/process/task_queues:95:5)
    at async Promise.all (index 6)
    at TaskManagerRunner.run (/usr/share/kibana/node_modules/@kbn/task-manager-plugin/server/task_running/task_runner.js:325:22)
```
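
This `TypeError` comes from dereferencing `aggregations.by_rule_type_id` when the search response carries no aggregations at all; the diff below guards that access with optional chaining. A minimal, illustrative-only sketch of the pattern (the helper and types here are not part of the PR):

```ts
interface RuleTypeBucket {
  key: string;
  doc_count: number;
}

interface AlertAggregations {
  by_rule_type_id?: { buckets: RuleTypeBucket[] };
}

// Illustration of the guard added in the diff below: optional chaining turns a
// missing aggregations object into undefined instead of throwing
// "Cannot read properties of undefined (reading 'by_rule_type_id')".
function countAlertsByRuleType(aggregations?: AlertAggregations): Record<string, number> {
  const buckets = aggregations?.by_rule_type_id?.buckets ?? [];
  return Object.fromEntries(buckets.map((bucket) => [bucket.key, bucket.doc_count]));
}
```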

For actions telemetry, the volume of these errors is much lower.
Unfortunately, we were not logging the stack trace for these errors, so
the source is harder to track down. I've updated the code to include the
stack trace in the log meta and added the same handling for
`no_shard_available_action_exception`, on the assumption that actions
telemetry could also be hitting these errors.
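
The updated tests pin both behaviors down by collecting the mocked logger calls and asserting on the message and the structured meta. A condensed Jest-style sketch using the same mocks as the diff below (the surrounding test setup is elided):

```ts
import { loggerMock } from '@kbn/logging-mocks';
import { loggingSystemMock } from '@kbn/core/server/mocks';

const logger = loggerMock.create();
// ...invoke the telemetry collector so the mocked ES client rejects and the catch block runs...

// Condensed from the test changes below: the warn call should carry the telemetry
// tags and a populated stack trace in its structured meta argument.
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.warn[0][0]).toEqual(
  'Error executing actions telemetry task: getTotalCount - Error: oh no'
);
expect(loggerCalls.warn[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.warn[0][1]?.error?.stack_trace).toBeDefined();
```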

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
Ying Mao 2025-02-03 09:44:03 -05:00 committed by GitHub
parent 504510b92b
commit 4e0c0a785a
12 changed files with 823 additions and 378 deletions

@ -6,7 +6,9 @@
*/
import { elasticsearchClientMock } from '@kbn/core-elasticsearch-client-server-mocks';
import { MockedLogger, loggerMock } from '@kbn/logging-mocks';
import { loggingSystemMock } from '@kbn/core/server/mocks';
import { errors } from '@elastic/elasticsearch';
import {
getCounts,
getExecutionsPerDayCount,
@ -14,9 +16,13 @@ import {
getTotalCount,
} from './actions_telemetry';
const mockLogger = loggingSystemMock.create().get();
let logger: MockedLogger;
describe('actions telemetry', () => {
beforeEach(() => {
logger = loggerMock.create();
});
test('getTotalCount should replace first symbol . to __ for action types names', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockResponse(
@ -107,7 +113,7 @@ describe('actions telemetry', () => {
},
}
);
const telemetry = await getTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
@ -126,16 +132,24 @@ describe('actions telemetry', () => {
`);
});
test('getTotalCount should return empty results if query throws error', async () => {
test('getTotalCount should return empty results and log warning if query throws error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValue(new Error('oh no'));
const telemetry = await getTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
expect(mockLogger.warn).toHaveBeenCalledWith(
`Error executing actions telemetry task: getTotalCount - {}`
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(0);
expect(loggerCalls.warn).toHaveLength(1);
expect(loggerCalls.warn[0][0]).toEqual(
`Error executing actions telemetry task: getTotalCount - Error: oh no`
);
// logger meta
expect(loggerCalls.warn[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.warn[0][1]?.error?.stack_trace).toBeDefined();
expect(telemetry).toMatchInlineSnapshot(`
Object {
"countByType": Object {},
@ -147,6 +161,60 @@ describe('actions telemetry', () => {
`);
});
test('getTotalCount should return empty results and log debug if query throws search_phase_execution_exception error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(1);
expect(loggerCalls.debug[0][0]).toMatchInlineSnapshot(`
"Error executing actions telemetry task: getTotalCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.debug[0][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toMatchInlineSnapshot(`
Object {
"countByType": Object {},
"countGenAiProviderTypes": Object {},
"countTotal": 0,
"errorMessage": "no_shard_available_action_exception",
"hasErrors": true,
}
`);
});
test('getInUseTotalCount', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockResponseOnce(
@ -202,7 +270,7 @@ describe('actions telemetry', () => {
],
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(2);
expect(telemetry).toMatchInlineSnapshot(`
@ -287,7 +355,7 @@ describe('actions telemetry', () => {
],
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger, undefined, [
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger, undefined, [
{
id: 'test',
actionTypeId: '.email',
@ -332,16 +400,23 @@ describe('actions telemetry', () => {
`);
});
test('getInUseTotalCount should return empty results if query throws error', async () => {
test('getInUseTotalCount should return empty results and log warning if query throws error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValue(new Error('oh no'));
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
expect(mockLogger.warn).toHaveBeenCalledWith(
`Error executing actions telemetry task: getInUseTotalCount - {}`
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(0);
expect(loggerCalls.warn).toHaveLength(1);
expect(loggerCalls.warn[0][0]).toEqual(
`Error executing actions telemetry task: getInUseTotalCount - Error: oh no`
);
// logger meta
expect(loggerCalls.warn[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.warn[0][1]?.error?.stack_trace).toBeDefined();
expect(telemetry).toMatchInlineSnapshot(`
Object {
"countByAlertHistoryConnectorType": 0,
@ -355,6 +430,62 @@ describe('actions telemetry', () => {
`);
});
test('getInUseTotalCount should return empty results and log debug if query throws search_phase_execution_exception error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(1);
expect(loggerCalls.debug[0][0]).toMatchInlineSnapshot(`
"Error executing actions telemetry task: getInUseTotalCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.debug[0][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toMatchInlineSnapshot(`
Object {
"countByAlertHistoryConnectorType": 0,
"countByType": Object {},
"countEmailByService": Object {},
"countNamespaces": 0,
"countTotal": 0,
"errorMessage": "no_shard_available_action_exception",
"hasErrors": true,
}
`);
});
test('getTotalCount accounts for preconfigured connectors', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockResponse(
@ -445,7 +576,7 @@ describe('actions telemetry', () => {
},
}
);
const telemetry = await getTotalCount(mockEsClient, 'test', mockLogger, [
const telemetry = await getTotalCount(mockEsClient, 'test', logger, [
{
id: 'test',
actionTypeId: '.test',
@ -501,7 +632,7 @@ describe('actions telemetry', () => {
},
}
);
const telemetry = await getTotalCount(mockEsClient, 'test', mockLogger, [
const telemetry = await getTotalCount(mockEsClient, 'test', logger, [
{
id: 'system_action:system-connector-test.system-action',
actionTypeId: 'test.system-action',
@ -615,7 +746,7 @@ describe('actions telemetry', () => {
],
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger, undefined, [
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger, undefined, [
{
id: 'anotherServerLog',
actionTypeId: '.server-log',
@ -721,7 +852,7 @@ describe('actions telemetry', () => {
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger, undefined, []);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger, undefined, []);
expect(mockEsClient.search).toHaveBeenCalledTimes(2);
expect(telemetry).toMatchInlineSnapshot(`
@ -827,7 +958,7 @@ describe('actions telemetry', () => {
],
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(2);
expect(telemetry).toMatchInlineSnapshot(`
@ -961,7 +1092,7 @@ describe('actions telemetry', () => {
},
}
);
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger);
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
expect(telemetry).toStrictEqual({
@ -995,16 +1126,24 @@ describe('actions telemetry', () => {
});
});
test('getExecutionsPerDayCount should return empty results if query throws error', async () => {
test('getExecutionsPerDayCount should return empty results and log warning if query throws error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValue(new Error('oh no'));
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger);
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
expect(mockLogger.warn).toHaveBeenCalledWith(
`Error executing actions telemetry task: getExecutionsPerDayCount - {}`
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(0);
expect(loggerCalls.warn).toHaveLength(1);
expect(loggerCalls.warn[0][0]).toEqual(
`Error executing actions telemetry task: getExecutionsPerDayCount - Error: oh no`
);
// logger meta
expect(loggerCalls.warn[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.warn[0][1]?.error?.stack_trace).toBeDefined();
expect(telemetry).toMatchInlineSnapshot(`
Object {
"avgExecutionTime": 0,
@ -1020,6 +1159,64 @@ describe('actions telemetry', () => {
`);
});
test('getExecutionsPerDayCount should return empty results and log debug if query throws search_phase_execution_exception error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(1);
expect(loggerCalls.debug[0][0]).toMatchInlineSnapshot(`
"Error executing actions telemetry task: getExecutionsPerDayCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.debug[0][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toMatchInlineSnapshot(`
Object {
"avgExecutionTime": 0,
"avgExecutionTimeByType": Object {},
"countByType": Object {},
"countFailed": 0,
"countFailedByType": Object {},
"countRunOutcomeByConnectorType": Object {},
"countTotal": 0,
"errorMessage": "no_shard_available_action_exception",
"hasErrors": true,
}
`);
});
it('getCounts', () => {
const aggs = {
'.d3security': 2,

@ -20,6 +20,7 @@ import {
getActionExecutions,
getActionsCount,
} from './lib/actions_telemetry_util';
import { parseAndLogError } from './lib/parse_and_log_error';
export interface InMemoryAggRes {
total: number;
@ -108,9 +109,7 @@ export async function getTotalCount(
countGenAiProviderTypes,
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(`Error executing actions telemetry task: getTotalCount - ${JSON.stringify(err)}`);
const errorMessage = parseAndLogError(err, `getTotalCount`, logger);
return {
hasErrors: true,
@ -387,11 +386,8 @@ export async function getInUseTotalCount(
countNamespaces: namespacesList.size,
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
const errorMessage = parseAndLogError(err, `getInUseTotalCount`, logger);
logger.warn(
`Error executing actions telemetry task: getInUseTotalCount - ${JSON.stringify(err)}`
);
return {
hasErrors: true,
errorMessage,
@ -657,10 +653,8 @@ export async function getExecutionsPerDayCount(
),
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing actions telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}`
);
const errorMessage = parseAndLogError(err, `getExecutionsPerDayCount`, logger);
return {
hasErrors: true,
errorMessage,

@ -0,0 +1,34 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { Logger } from '@kbn/core/server';
export function parseAndLogError(err: Error, errType: string, logger: Logger): string {
const errorMessage = err && err.message ? err.message : err.toString();
let returnedErrorMessage = errorMessage;
const errorStr = JSON.stringify(err);
const logMessage = `Error executing actions telemetry task: ${errType} - ${err}`;
const logOptions = {
tags: ['actions', 'telemetry-failed'],
error: { stack_trace: err.stack },
};
// If error string contains "no_shard_available_action_exception", debug log it
if (errorStr.includes('no_shard_available_action_exception')) {
// the no_shard_available_action_exception can be wordy and the error message returned from this function
// gets stored in the task state so lets simplify
returnedErrorMessage = 'no_shard_available_action_exception';
if (logger.isLevelEnabled('debug')) {
logger.debug(logMessage, logOptions);
}
} else {
logger.warn(logMessage, logOptions);
}
return returnedErrorMessage;
}

@ -6,64 +6,50 @@
*/
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { MockedLogger, loggerMock } from '@kbn/logging-mocks';
import { getTotalAlertsCountAggregations } from './get_telemetry_from_alerts';
import { errors } from '@elastic/elasticsearch';
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
let logger: MockedLogger;
describe('kibana index telemetry', () => {
beforeEach(() => {
jest.resetAllMocks();
logger = loggerMock.create();
});
it('should return total alert couts and alert counts by rule type id', async () => {
it('should return total alert counts and alert counts by rule type id', async () => {
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 6,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 6, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_rule_type_id: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{
key: '.index-threshold',
doc_count: 1,
},
{
key: 'logs.alert.document.count',
doc_count: 2,
},
{
key: 'document.test.',
doc_count: 3,
},
{ key: '.index-threshold', doc_count: 1 },
{ key: 'logs.alert.document.count', doc_count: 2 },
{ key: 'document.test.', doc_count: 3 },
],
},
},
});
const telemetry = await getTotalAlertsCountAggregations({
esClient,
logger,
});
const telemetry = await getTotalAlertsCountAggregations({ esClient, logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
expect(logger.debug).toHaveBeenCalledTimes(2);
const debugLogs = loggingSystemMock.collect(logger).debug;
expect(debugLogs).toHaveLength(2);
expect(debugLogs[0][0]).toEqual(
`query for getTotalAlertsCountAggregations - {\"index\":\".alerts-*\",\"size\":0,\"body\":{\"query\":{\"match_all\":{}},\"aggs\":{\"by_rule_type_id\":{\"terms\":{\"field\":\"kibana.alert.rule.rule_type_id\",\"size\":33}}}}}`
);
expect(debugLogs[1][0]).toEqual(
`results for getTotalAlertsCountAggregations query - {\"took\":4,\"timed_out\":false,\"_shards\":{\"total\":1,\"successful\":1,\"skipped\":0,\"failed\":0},\"hits\":{\"total\":{\"value\":6,\"relation\":\"eq\"},\"max_score\":null,\"hits\":[]},\"aggregations\":{\"by_rule_type_id\":{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\".index-threshold\",\"doc_count\":1},{\"key\":\"logs.alert.document.count\",\"doc_count\":2},{\"key\":\"document.test.\",\"doc_count\":3}]}}}`
);
expect(telemetry).toEqual({
hasErrors: false,
@ -77,37 +63,18 @@ describe('kibana index telemetry', () => {
});
});
it('should return ', async () => {
it('should return on empty results', async () => {
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 0,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 0, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_rule_type_id: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [],
},
by_rule_type_id: { doc_count_error_upper_bound: 0, sum_other_doc_count: 0, buckets: [] },
},
});
const telemetry = await getTotalAlertsCountAggregations({
esClient,
logger,
});
const telemetry = await getTotalAlertsCountAggregations({ esClient, logger });
expect(telemetry).toEqual({
hasErrors: false,
@ -116,17 +83,25 @@ describe('kibana index telemetry', () => {
});
});
test('should return empty results and log warning if query throws error', async () => {
it('should return empty results and log warning if query throws error', async () => {
esClient.search.mockRejectedValueOnce(new Error('test'));
const telemetry = await getTotalAlertsCountAggregations({
esClient,
logger,
});
const telemetry = await getTotalAlertsCountAggregations({ esClient, logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
expect(logger.debug).toHaveBeenCalledTimes(1);
expect(logger.warn).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(1);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getTotalAlertsCountAggregations - {\"index\":\".alerts-*\",\"size\":0,\"body\":{\"query\":{\"match_all\":{}},\"aggs\":{\"by_rule_type_id\":{\"terms\":{\"field\":\"kibana.alert.rule.rule_type_id\",\"size\":33}}}}}`
);
expect(loggerCalls.warn).toHaveLength(1);
expect(loggerCalls.warn[0][0]).toEqual(
`Error executing alerting telemetry task: getTotalAlertsCountAggregations - Error: test`
);
// logger meta
expect(loggerCalls.warn[0][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.warn[0][1]?.error?.stack_trace).toBeDefined();
expect(telemetry).toEqual({
hasErrors: true,
@ -135,4 +110,56 @@ describe('kibana index telemetry', () => {
count_alerts_by_rule_type: {},
});
});
it('should return empty results and log debug log if query throws search_phase_execution_exception error', async () => {
esClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getTotalAlertsCountAggregations({ esClient, logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(2);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getTotalAlertsCountAggregations - {\"index\":\".alerts-*\",\"size\":0,\"body\":{\"query\":{\"match_all\":{}},\"aggs\":{\"by_rule_type_id\":{\"terms\":{\"field\":\"kibana.alert.rule.rule_type_id\",\"size\":33}}}}}`
);
expect(loggerCalls.debug[1][0]).toMatchInlineSnapshot(`
"Error executing alerting telemetry task: getTotalAlertsCountAggregations - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[1][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.debug[1][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toEqual({
hasErrors: true,
errorMessage: `no_shard_available_action_exception`,
count_alerts_total: 0,
count_alerts_by_rule_type: {},
});
});
});

@ -14,6 +14,7 @@ import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { AlertingUsage } from '../types';
import { parseAndLogError } from './parse_and_log_error';
interface Opts {
esClient: ElasticsearchClient;
@ -69,20 +70,10 @@ export async function getTotalAlertsCountAggregations({
return {
hasErrors: false,
count_alerts_total: totalAlertsCount ?? 0,
count_alerts_by_rule_type: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
count_alerts_by_rule_type: parseSimpleRuleTypeBucket(aggregations?.by_rule_type_id?.buckets),
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getTotalAlertsCountAggregations - ${JSON.stringify(
err
)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
const errorMessage = parseAndLogError(err, `getTotalAlertsCountAggregations`, logger);
return {
hasErrors: true,

@ -6,6 +6,8 @@
*/
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { MockedLogger, loggerMock } from '@kbn/logging-mocks';
import { errors } from '@elastic/elasticsearch';
import {
getExecutionsPerDayCount,
parseExecutionFailureByRuleType,
@ -17,11 +19,12 @@ import {
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
let logger: MockedLogger;
describe('event log telemetry', () => {
beforeEach(() => {
jest.resetAllMocks();
logger = loggerMock.create();
});
describe('parseRuleTypeBucket', () => {
@ -1106,20 +1109,8 @@ describe('event log telemetry', () => {
esClient.search.mockResponse({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 148,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 148, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_rule_type_id: {
doc_count_error_upper_bound: 0,
@ -1302,11 +1293,7 @@ describe('event log telemetry', () => {
},
});
const telemetry = await getExecutionsPerDayCount({
esClient,
eventLogIndex: 'test',
logger,
});
const telemetry = await getExecutionsPerDayCount({ esClient, eventLogIndex: 'test', logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
@ -1416,17 +1403,14 @@ describe('event log telemetry', () => {
test('should return empty results and log warning if query throws error', async () => {
esClient.search.mockRejectedValue(new Error('oh no'));
const telemetry = await getExecutionsPerDayCount({
esClient,
eventLogIndex: 'test',
logger,
});
const telemetry = await getExecutionsPerDayCount({ esClient, eventLogIndex: 'test', logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCall = logger.warn.mock.calls[0][0];
const loggerMeta = logger.warn.mock.calls[0][1];
expect(loggerCall as string).toMatchInlineSnapshot(
`"Error executing alerting telemetry task: getExecutionsPerDayCount - {}"`
`"Error executing alerting telemetry task: getExecutionsPerDayCount - Error: oh no"`
);
expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerMeta?.error?.stack_trace).toBeDefined();
@ -1451,6 +1435,72 @@ describe('event log telemetry', () => {
countRulesByExecutionStatus: {},
});
});
it('should return empty results and log debug log if query throws search_phase_execution_exception error', async () => {
esClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getExecutionsPerDayCount({ esClient, eventLogIndex: 'test', logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(2);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getExecutionsPerDayCount - {\"index\":\"test\",\"size\":0,\"body\":{\"query\":{\"bool\":{\"filter\":{\"bool\":{\"must\":[{\"term\":{\"event.action\":\"execute\"}},{\"term\":{\"event.provider\":\"alerting\"}},{\"range\":{\"@timestamp\":{\"gte\":\"now-1d\"}}}]}}}},\"aggs\":{\"avg_execution_time\":{\"avg\":{\"field\":\"event.duration\"}},\"avg_es_search_duration\":{\"avg\":{\"field\":\"kibana.alert.rule.execution.metrics.es_search_duration_ms\"}},\"avg_total_search_duration\":{\"avg\":{\"field\":\"kibana.alert.rule.execution.metrics.total_search_duration_ms\"}},\"percentile_scheduled_actions\":{\"percentiles\":{\"field\":\"kibana.alert.rule.execution.metrics.number_of_generated_actions\",\"percents\":[50,90,99]}},\"percentile_alerts\":{\"percentiles\":{\"field\":\"kibana.alert.rule.execution.metrics.alert_counts.active\",\"percents\":[50,90,99]}},\"execution_failures\":{\"filter\":{\"term\":{\"event.outcome\":\"failure\"}},\"aggs\":{\"by_reason\":{\"terms\":{\"field\":\"event.reason\",\"size\":5}}}},\"by_rule_type_id\":{\"terms\":{\"field\":\"rule.category\",\"size\":33},\"aggs\":{\"avg_execution_time\":{\"avg\":{\"field\":\"event.duration\"}},\"avg_es_search_duration\":{\"avg\":{\"field\":\"kibana.alert.rule.execution.metrics.es_search_duration_ms\"}},\"avg_total_search_duration\":{\"avg\":{\"field\":\"kibana.alert.rule.execution.metrics.total_search_duration_ms\"}},\"percentile_scheduled_actions\":{\"percentiles\":{\"field\":\"kibana.alert.rule.execution.metrics.number_of_generated_actions\",\"percents\":[50,90,99]}},\"percentile_alerts\":{\"percentiles\":{\"field\":\"kibana.alert.rule.execution.metrics.alert_counts.active\",\"percents\":[50,90,99]}},\"execution_failures\":{\"filter\":{\"term\":{\"event.outcome\":\"failure\"}},\"aggs\":{\"by_reason\":{\"terms\":{\"field\":\"event.reason\",\"size\":5}}}}}},\"by_execution_status\":{\"terms\":{\"field\":\"event.outcome\"}}}}}`
);
expect(loggerCalls.debug[1][0]).toMatchInlineSnapshot(`
"Error executing alerting telemetry task: getExecutionsPerDayCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[1][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.debug[1][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toStrictEqual({
hasErrors: true,
errorMessage: 'no_shard_available_action_exception',
countTotalRuleExecutions: 0,
countRuleExecutionsByType: {},
countTotalFailedExecutions: 0,
countFailedExecutionsByReason: {},
countFailedExecutionsByReasonByType: {},
avgExecutionTime: 0,
avgExecutionTimeByType: {},
avgEsSearchDuration: 0,
avgEsSearchDurationByType: {},
avgTotalSearchDuration: 0,
avgTotalSearchDurationByType: {},
generatedActionsPercentiles: {},
generatedActionsPercentilesByType: {},
alertsPercentiles: {},
alertsPercentilesByType: {},
countRulesByExecutionStatus: {},
});
});
});
describe('getExecutionTimeoutsPerDayCount', () => {
@ -1458,37 +1508,16 @@ describe('event log telemetry', () => {
esClient.search.mockResponse({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 4,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 4, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_rule_type_id: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{
key: '.index-threshold',
doc_count: 2,
},
{
key: 'logs.alert.document.count',
doc_count: 1,
},
{
key: 'document.test.',
doc_count: 1,
},
{ key: '.index-threshold', doc_count: 2 },
{ key: 'logs.alert.document.count', doc_count: 1 },
{ key: 'document.test.', doc_count: 1 },
],
},
},
@ -1527,7 +1556,7 @@ describe('event log telemetry', () => {
const loggerCall = logger.warn.mock.calls[0][0];
const loggerMeta = logger.warn.mock.calls[0][1];
expect(loggerCall as string).toMatchInlineSnapshot(
`"Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - {}"`
`"Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - Error: oh no"`
);
expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerMeta?.error?.stack_trace).toBeDefined();
@ -1538,5 +1567,61 @@ describe('event log telemetry', () => {
hasErrors: true,
});
});
it('should return empty results and log debug log if query throws search_phase_execution_exception error', async () => {
esClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getExecutionTimeoutsPerDayCount({
esClient,
eventLogIndex: 'test',
logger,
});
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(2);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getExecutionTimeoutsPerDayCount - {\"index\":\"test\",\"size\":0,\"body\":{\"query\":{\"bool\":{\"filter\":{\"bool\":{\"must\":[{\"term\":{\"event.action\":\"execute-timeout\"}},{\"term\":{\"event.provider\":\"alerting\"}},{\"range\":{\"@timestamp\":{\"gte\":\"now-1d\"}}}]}}}},\"aggs\":{\"by_rule_type_id\":{\"terms\":{\"field\":\"rule.category\",\"size\":33}}}}}`
);
expect(loggerCalls.debug[1][0]).toMatchInlineSnapshot(`
"Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[1][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.debug[1][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toStrictEqual({
hasErrors: true,
errorMessage: 'no_shard_available_action_exception',
countExecutionTimeouts: 0,
countExecutionTimeoutsByType: {},
});
});
});
});

@ -22,6 +22,7 @@ import {
} from '../alerting_usage_collector';
import { replaceDotSymbols } from './replace_dots_with_underscores';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { parseAndLogError } from './parse_and_log_error';
const Millis2Nanos = 1000 * 1000;
const percentileFieldNameMapping: Record<string, string> = {
@ -189,14 +190,8 @@ export async function getExecutionsPerDayCount({
),
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
const errorMessage = parseAndLogError(err, `getExecutionsPerDayCount`, logger);
return {
hasErrors: true,
errorMessage,
@ -262,17 +257,8 @@ export async function getExecutionTimeoutsPerDayCount({
countExecutionTimeoutsByType: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
const errorMessage = parseAndLogError(err, `getExecutionsTimeoutsPerDayCount`, logger);
logger.warn(
`Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ${JSON.stringify(
err
)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
return {
hasErrors: true,
errorMessage,

File diff suppressed because one or more lines are too long

@ -25,6 +25,7 @@ import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { groupRulesBySearchType } from './group_rules_by_search_type';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../../common';
import { MaintenanceWindowAttributes } from '../../data/maintenance_window/types';
import { parseAndLogError } from './parse_and_log_error';
interface Opts {
esClient: ElasticsearchClient;
@ -376,15 +377,8 @@ export async function getTotalCountAggregations({
},
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
const errorMessage = parseAndLogError(err, `getTotalCountAggregations`, logger);
logger.warn(
`Error executing alerting telemetry task: getTotalCountAggregations - ${JSON.stringify(err)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
return {
hasErrors: true,
errorMessage,
@ -491,14 +485,8 @@ export async function getTotalCountInUse({
countNamespaces: aggregations.namespaces_count.value ?? 0,
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getTotalCountInUse - ${JSON.stringify(err)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
const errorMessage = parseAndLogError(err, `getTotalCountInUse`, logger);
return {
hasErrors: true,
errorMessage,
@ -548,14 +536,8 @@ export async function getMWTelemetry({
count_mw_with_filter_alert_toggle_on: countMWWithFilterAlertToggleON,
};
} catch (err) {
const errorMessage = err?.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getTotalMWCount - ${JSON.stringify(err)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err?.stack },
}
);
const errorMessage = parseAndLogError(err, `getTotalMWCount`, logger);
return {
hasErrors: true,
errorMessage,

@ -5,7 +5,9 @@
* 2.0.
*/
import { errors } from '@elastic/elasticsearch';
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { MockedLogger, loggerMock } from '@kbn/logging-mocks';
import {
getFailedAndUnrecognizedTasksPerDay,
parseBucket,
@ -13,11 +15,12 @@ import {
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
let logger: MockedLogger;
describe('task manager telemetry', () => {
beforeEach(() => {
jest.resetAllMocks();
logger = loggerMock.create();
});
describe('parseBucket', () => {
@ -145,20 +148,8 @@ describe('task manager telemetry', () => {
esClient.search.mockResponse({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 40,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 40, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_status: {
doc_count_error_upper_bound: 0,
@ -243,7 +234,7 @@ describe('task manager telemetry', () => {
const loggerCall = logger.warn.mock.calls[0][0];
const loggerMeta = logger.warn.mock.calls[0][1];
expect(loggerCall as string).toMatchInlineSnapshot(
`"Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - {}"`
`"Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - Error: oh no"`
);
expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerMeta?.error?.stack_trace).toBeDefined();
@ -255,5 +246,62 @@ describe('task manager telemetry', () => {
countFailedAndUnrecognizedTasksByStatusByType: {},
});
});
test('should return empty results and log debug log if query throws search_phase_execution_exception error', async () => {
esClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getFailedAndUnrecognizedTasksPerDay({
esClient,
taskManagerIndex: 'test',
logger,
});
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(2);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getFailedAndUnrecognizedTasksPerDay - {\"index\":\"test\",\"size\":0,\"body\":{\"query\":{\"bool\":{\"must\":[{\"bool\":{\"should\":[{\"term\":{\"task.status\":\"unrecognized\"}},{\"term\":{\"task.status\":\"failed\"}}]}},{\"wildcard\":{\"task.taskType\":{\"value\":\"alerting:*\"}}},{\"range\":{\"task.runAt\":{\"gte\":\"now-1d\"}}}]}},\"aggs\":{\"by_status\":{\"terms\":{\"field\":\"task.status\",\"size\":10},\"aggs\":{\"by_task_type\":{\"terms\":{\"field\":\"task.taskType\",\"size\":33}}}}}}}`
);
expect(loggerCalls.debug[1][0]).toMatchInlineSnapshot(`
"Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[1][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.debug[1][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toStrictEqual({
errorMessage: 'no_shard_available_action_exception',
hasErrors: true,
countFailedAndUnrecognizedTasks: 0,
countFailedAndUnrecognizedTasksByStatus: {},
countFailedAndUnrecognizedTasksByStatusByType: {},
});
});
});
});

@ -14,6 +14,7 @@ import type {
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { replaceDotSymbols } from './replace_dots_with_underscores';
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
import { parseAndLogError } from './parse_and_log_error';
interface Opts {
esClient: ElasticsearchClient;
@ -122,16 +123,8 @@ export async function getFailedAndUnrecognizedTasksPerDay({
countFailedAndUnrecognizedTasks: totalFailedAndUnrecognizedTasks ?? 0,
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify(
err
)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
const errorMessage = parseAndLogError(err, `getFailedAndUnrecognizedTasksPerDay`, logger);
return {
hasErrors: true,
errorMessage,

@ -0,0 +1,34 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { Logger } from '@kbn/core/server';
export function parseAndLogError(err: Error, errType: string, logger: Logger): string {
const errorMessage = err && err.message ? err.message : err.toString();
let returnedErrorMessage = errorMessage;
const errorStr = JSON.stringify(err);
const logMessage = `Error executing alerting telemetry task: ${errType} - ${err}`;
const logOptions = {
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
};
// If error string contains "no_shard_available_action_exception", debug log it
if (errorStr.includes('no_shard_available_action_exception')) {
// the no_shard_available_action_exception can be wordy and the error message returned from this function
// gets stored in the task state so lets simplify
returnedErrorMessage = 'no_shard_available_action_exception';
if (logger.isLevelEnabled('debug')) {
logger.debug(logMessage, logOptions);
}
} else {
logger.warn(logMessage, logOptions);
}
return returnedErrorMessage;
}