[8.x] [Response Ops][Alerting] Switch to `logger.warn` for `no_shard_available_action_exception` encountered when collecting telemetry (#205520) (#209323)

# Backport

This will backport the following commits from `main` to `8.x`:
- [[Response Ops][Alerting] Switch to `logger.warn` for
`no_shard_available_action_exception` encountered when
collecting telemetry
(#205520)](https://github.com/elastic/kibana/pull/205520)

<!--- Backport version: 9.4.3 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"Ying
Mao","email":"ying.mao@elastic.co"},"sourceCommit":{"committedDate":"2025-02-03T14:44:03Z","message":"[Response
Ops][Alerting] Switch to `logger.warn` for
`no_shard_available_action_exception` encountered when collecting
telemetry (#205520)\n\nResolves
https://github.com/elastic/response-ops-team/issues/221\r\n\r\n##
Summary\r\n\r\nLooking through the logs for this, the majority of these
errors have the\r\nfollowing stack trace (14,000+ in the last 7
days):\r\n\r\n```\r\nResponseError:
search_phase_execution_exception\r\n\tRoot
causes:\r\n\t\tno_shard_available_action_exception:
[es-es-search-686bf6b747-xldl8][100.65.77.183:9300][indices:data/read/search[phase/query]]\r\n\t\tno_shard_available_action_exception:
null\r\n at KibanaTransport._request
(/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:543:27)\r\n
at processTicksAndRejections
(node:internal/process/task_queues:95:5)\r\n at
/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:641:32\r\n
at KibanaTransport.request
(/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:637:20)\r\n
at KibanaTransport.request
(/usr/share/kibana/node_modules/@kbn/core-elasticsearch-client-server-internal/src/create_transport.js:60:16)\r\n
at ClientTraced.SearchApi [as search]
(/usr/share/kibana/node_modules/@elastic/elasticsearch/lib/api/api/search.js:72:12)\r\n
at getTotalAlertsCountAggregations
(/usr/share/kibana/node_modules/@kbn/alerting-plugin/server/usage/lib/get_telemetry_from_alerts.js:42:21)\r\n
at async Promise.all (index 6)\r\n at TaskManagerRunner.run
(/usr/share/kibana/node_modules/@kbn/task-manager-plugin/server/task_running/task_runner.js:325:22)\r\n```\r\n\r\nLooking
through the code, we are already catching these errors and\r\nreturning
a default response for that telemetry object.
The\r\n`no_shard_available_action_exception` is not an issue with
Kibana, it's\r\nan ES issue, so this PR catches these types of errors
and log them at a\r\ndebug level instead of a warn level to avoid
polluting the logs with\r\nerrors we have no control
over.\r\n\r\nExcluding those results, we see a different and less
frequent stack\r\ntrace (100+ in the last 15
days):\r\n\r\n```\r\nTypeError: Cannot read properties of undefined
(reading 'by_rule_type_id')\r\n at getTotalAlertsCountAggregations
(/usr/share/kibana/node_modules/@kbn/alerting-plugin/server/usage/lib/get_telemetry_from_alerts.js:49:109)\r\n
at processTicksAndRejections
(node:internal/process/task_queues:95:5)\r\n at async Promise.all (index
6)\r\n at TaskManagerRunner.run
(/usr/share/kibana/node_modules/@kbn/task-manager-plugin/server/task_running/task_runner.js:325:22)\r\n```\r\n\r\nFor
actions telemetry, the volume of these errors is much
lower.\r\nUnfortunately, we are not logging the stack trace for these
errors so\r\nit's harder to track down the source. I've updated the code
to store the\r\nstack trace as well as adding the same handling
for\r\n`no_shard_available_action_exception` under the assumption that
it could\r\nalso be getting these
errors.\r\n\r\n---------\r\n\r\nCo-authored-by: Elastic Machine
<elasticmachine@users.noreply.github.com>","sha":"4e0c0a785ad8d8d6df66406542acda1189e2b817","branchLabelMapping":{"^v9.1.0$":"main","^v8.19.0$":"8.x","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["Feature:Alerting","release_note:skip","Feature:Actions","Team:ResponseOps","backport:prev-minor","ci:project-deploy-security","v9.1.0","v8.19.0"],"title":"[Response
Ops][Alerting] Switch to `logger.warn` for
`no_shard_available_action_exception` encountered when collecting
telemetry","number":205520,"url":"https://github.com/elastic/kibana/pull/205520","mergeCommit":{"message":"[Response
Ops][Alerting] Switch to `logger.warn` for
`no_shard_available_action_exception` encountered when collecting
telemetry (#205520)\n\nResolves
https://github.com/elastic/response-ops-team/issues/221\r\n\r\n##
Summary\r\n\r\nLooking through the logs for this, the majority of these
errors have the\r\nfollowing stack trace (14,000+ in the last 7
days):\r\n\r\n```\r\nResponseError:
search_phase_execution_exception\r\n\tRoot
causes:\r\n\t\tno_shard_available_action_exception:
[es-es-search-686bf6b747-xldl8][100.65.77.183:9300][indices:data/read/search[phase/query]]\r\n\t\tno_shard_available_action_exception:
null\r\n at KibanaTransport._request
(/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:543:27)\r\n
at processTicksAndRejections
(node:internal/process/task_queues:95:5)\r\n at
/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:641:32\r\n
at KibanaTransport.request
(/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:637:20)\r\n
at KibanaTransport.request
(/usr/share/kibana/node_modules/@kbn/core-elasticsearch-client-server-internal/src/create_transport.js:60:16)\r\n
at ClientTraced.SearchApi [as search]
(/usr/share/kibana/node_modules/@elastic/elasticsearch/lib/api/api/search.js:72:12)\r\n
at getTotalAlertsCountAggregations
(/usr/share/kibana/node_modules/@kbn/alerting-plugin/server/usage/lib/get_telemetry_from_alerts.js:42:21)\r\n
at async Promise.all (index 6)\r\n at TaskManagerRunner.run
(/usr/share/kibana/node_modules/@kbn/task-manager-plugin/server/task_running/task_runner.js:325:22)\r\n```\r\n\r\nLooking
through the code, we are already catching these errors and\r\nreturning
a default response for that telemetry object.
The\r\n`no_shard_available_action_exception` is not an issue with
Kibana, it's\r\nan ES issue, so this PR catches these types of errors
and log them at a\r\ndebug level instead of a warn level to avoid
polluting the logs with\r\nerrors we have no control
over.\r\n\r\nExcluding those results, we see a different and less
frequent stack\r\ntrace (100+ in the last 15
days):\r\n\r\n```\r\nTypeError: Cannot read properties of undefined
(reading 'by_rule_type_id')\r\n at getTotalAlertsCountAggregations
(/usr/share/kibana/node_modules/@kbn/alerting-plugin/server/usage/lib/get_telemetry_from_alerts.js:49:109)\r\n
at processTicksAndRejections
(node:internal/process/task_queues:95:5)\r\n at async Promise.all (index
6)\r\n at TaskManagerRunner.run
(/usr/share/kibana/node_modules/@kbn/task-manager-plugin/server/task_running/task_runner.js:325:22)\r\n```\r\n\r\nFor
actions telemetry, the volume of these errors is much
lower.\r\nUnfortunately, we are not logging the stack trace for these
errors so\r\nit's harder to track down the source. I've updated the code
to store the\r\nstack trace as well as adding the same handling
for\r\n`no_shard_available_action_exception` under the assumption that
it could\r\nalso be getting these
errors.\r\n\r\n---------\r\n\r\nCo-authored-by: Elastic Machine
<elasticmachine@users.noreply.github.com>","sha":"4e0c0a785ad8d8d6df66406542acda1189e2b817"}},"sourceBranch":"main","suggestedTargetBranches":["8.x"],"targetPullRequestStates":[{"branch":"main","label":"v9.1.0","branchLabelMappingKey":"^v9.1.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/205520","number":205520,"mergeCommit":{"message":"[Response
Ops][Alerting] Switch to `logger.warn` for
`no_shard_available_action_exception` encountered when collecting
telemetry (#205520)\n\nResolves
https://github.com/elastic/response-ops-team/issues/221\r\n\r\n##
Summary\r\n\r\nLooking through the logs for this, the majority of these
errors have the\r\nfollowing stack trace (14,000+ in the last 7
days):\r\n\r\n```\r\nResponseError:
search_phase_execution_exception\r\n\tRoot
causes:\r\n\t\tno_shard_available_action_exception:
[es-es-search-686bf6b747-xldl8][100.65.77.183:9300][indices:data/read/search[phase/query]]\r\n\t\tno_shard_available_action_exception:
null\r\n at KibanaTransport._request
(/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:543:27)\r\n
at processTicksAndRejections
(node:internal/process/task_queues:95:5)\r\n at
/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:641:32\r\n
at KibanaTransport.request
(/usr/share/kibana/node_modules/@elastic/transport/lib/Transport.js:637:20)\r\n
at KibanaTransport.request
(/usr/share/kibana/node_modules/@kbn/core-elasticsearch-client-server-internal/src/create_transport.js:60:16)\r\n
at ClientTraced.SearchApi [as search]
(/usr/share/kibana/node_modules/@elastic/elasticsearch/lib/api/api/search.js:72:12)\r\n
at getTotalAlertsCountAggregations
(/usr/share/kibana/node_modules/@kbn/alerting-plugin/server/usage/lib/get_telemetry_from_alerts.js:42:21)\r\n
at async Promise.all (index 6)\r\n at TaskManagerRunner.run
(/usr/share/kibana/node_modules/@kbn/task-manager-plugin/server/task_running/task_runner.js:325:22)\r\n```\r\n\r\nLooking
through the code, we are already catching these errors and\r\nreturning
a default response for that telemetry object.
The\r\n`no_shard_available_action_exception` is not an issue with
Kibana, it's\r\nan ES issue, so this PR catches these types of errors
and log them at a\r\ndebug level instead of a warn level to avoid
polluting the logs with\r\nerrors we have no control
over.\r\n\r\nExcluding those results, we see a different and less
frequent stack\r\ntrace (100+ in the last 15
days):\r\n\r\n```\r\nTypeError: Cannot read properties of undefined
(reading 'by_rule_type_id')\r\n at getTotalAlertsCountAggregations
(/usr/share/kibana/node_modules/@kbn/alerting-plugin/server/usage/lib/get_telemetry_from_alerts.js:49:109)\r\n
at processTicksAndRejections
(node:internal/process/task_queues:95:5)\r\n at async Promise.all (index
6)\r\n at TaskManagerRunner.run
(/usr/share/kibana/node_modules/@kbn/task-manager-plugin/server/task_running/task_runner.js:325:22)\r\n```\r\n\r\nFor
actions telemetry, the volume of these errors is much
lower.\r\nUnfortunately, we are not logging the stack trace for these
errors so\r\nit's harder to track down the source. I've updated the code
to store the\r\nstack trace as well as adding the same handling
for\r\n`no_shard_available_action_exception` under the assumption that
it could\r\nalso be getting these
errors.\r\n\r\n---------\r\n\r\nCo-authored-by: Elastic Machine
<elasticmachine@users.noreply.github.com>","sha":"4e0c0a785ad8d8d6df66406542acda1189e2b817"}},{"branch":"8.x","label":"v8.19.0","branchLabelMappingKey":"^v8.19.0$","isSourceBranch":false,"state":"NOT_CREATED"}]}]
BACKPORT-->

Co-authored-by: Ying Mao <ying.mao@elastic.co>
Kibana Machine 2025-02-04 03:48:11 +11:00 committed by GitHub
parent d77bab7c6b
commit b1e03010ab
12 changed files with 823 additions and 378 deletions

View file

@ -6,7 +6,9 @@
*/
import { elasticsearchClientMock } from '@kbn/core-elasticsearch-client-server-mocks';
import { MockedLogger, loggerMock } from '@kbn/logging-mocks';
import { loggingSystemMock } from '@kbn/core/server/mocks';
import { errors } from '@elastic/elasticsearch';
import {
getCounts,
getExecutionsPerDayCount,
@ -14,9 +16,13 @@ import {
getTotalCount,
} from './actions_telemetry';
const mockLogger = loggingSystemMock.create().get();
let logger: MockedLogger;
describe('actions telemetry', () => {
beforeEach(() => {
logger = loggerMock.create();
});
test('getTotalCount should replace first symbol . to __ for action types names', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockResponse(
@ -107,7 +113,7 @@ describe('actions telemetry', () => {
},
}
);
const telemetry = await getTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
@ -126,16 +132,24 @@ describe('actions telemetry', () => {
`);
});
test('getTotalCount should return empty results if query throws error', async () => {
test('getTotalCount should return empty results and log warning if query throws error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValue(new Error('oh no'));
const telemetry = await getTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
expect(mockLogger.warn).toHaveBeenCalledWith(
`Error executing actions telemetry task: getTotalCount - {}`
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(0);
expect(loggerCalls.warn).toHaveLength(1);
expect(loggerCalls.warn[0][0]).toEqual(
`Error executing actions telemetry task: getTotalCount - Error: oh no`
);
// logger meta
expect(loggerCalls.warn[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.warn[0][1]?.error?.stack_trace).toBeDefined();
expect(telemetry).toMatchInlineSnapshot(`
Object {
"countByType": Object {},
@ -147,6 +161,60 @@ describe('actions telemetry', () => {
`);
});
test('getTotalCount should return empty results and log debug if query throws search_phase_execution_exception error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(1);
expect(loggerCalls.debug[0][0]).toMatchInlineSnapshot(`
"Error executing actions telemetry task: getTotalCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.debug[0][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toMatchInlineSnapshot(`
Object {
"countByType": Object {},
"countGenAiProviderTypes": Object {},
"countTotal": 0,
"errorMessage": "no_shard_available_action_exception",
"hasErrors": true,
}
`);
});
test('getInUseTotalCount', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockResponseOnce(
@ -202,7 +270,7 @@ describe('actions telemetry', () => {
],
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(2);
expect(telemetry).toMatchInlineSnapshot(`
@ -287,7 +355,7 @@ describe('actions telemetry', () => {
],
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger, undefined, [
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger, undefined, [
{
id: 'test',
actionTypeId: '.email',
@ -332,16 +400,23 @@ describe('actions telemetry', () => {
`);
});
test('getInUseTotalCount should return empty results if query throws error', async () => {
test('getInUseTotalCount should return empty results and log warning if query throws error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValue(new Error('oh no'));
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
expect(mockLogger.warn).toHaveBeenCalledWith(
`Error executing actions telemetry task: getInUseTotalCount - {}`
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(0);
expect(loggerCalls.warn).toHaveLength(1);
expect(loggerCalls.warn[0][0]).toEqual(
`Error executing actions telemetry task: getInUseTotalCount - Error: oh no`
);
// logger meta
expect(loggerCalls.warn[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.warn[0][1]?.error?.stack_trace).toBeDefined();
expect(telemetry).toMatchInlineSnapshot(`
Object {
"countByAlertHistoryConnectorType": 0,
@ -355,6 +430,62 @@ describe('actions telemetry', () => {
`);
});
test('getInUseTotalCount should return empty results and log debug if query throws search_phase_execution_exception error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(1);
expect(loggerCalls.debug[0][0]).toMatchInlineSnapshot(`
"Error executing actions telemetry task: getInUseTotalCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.debug[0][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toMatchInlineSnapshot(`
Object {
"countByAlertHistoryConnectorType": 0,
"countByType": Object {},
"countEmailByService": Object {},
"countNamespaces": 0,
"countTotal": 0,
"errorMessage": "no_shard_available_action_exception",
"hasErrors": true,
}
`);
});
test('getTotalCount accounts for preconfigured connectors', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockResponse(
@ -445,7 +576,7 @@ describe('actions telemetry', () => {
},
}
);
const telemetry = await getTotalCount(mockEsClient, 'test', mockLogger, [
const telemetry = await getTotalCount(mockEsClient, 'test', logger, [
{
id: 'test',
actionTypeId: '.test',
@ -501,7 +632,7 @@ describe('actions telemetry', () => {
},
}
);
const telemetry = await getTotalCount(mockEsClient, 'test', mockLogger, [
const telemetry = await getTotalCount(mockEsClient, 'test', logger, [
{
id: 'system_action:system-connector-test.system-action',
actionTypeId: 'test.system-action',
@ -615,7 +746,7 @@ describe('actions telemetry', () => {
],
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger, undefined, [
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger, undefined, [
{
id: 'anotherServerLog',
actionTypeId: '.server-log',
@ -721,7 +852,7 @@ describe('actions telemetry', () => {
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger, undefined, []);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger, undefined, []);
expect(mockEsClient.search).toHaveBeenCalledTimes(2);
expect(telemetry).toMatchInlineSnapshot(`
@ -827,7 +958,7 @@ describe('actions telemetry', () => {
],
},
});
const telemetry = await getInUseTotalCount(mockEsClient, 'test', mockLogger);
const telemetry = await getInUseTotalCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(2);
expect(telemetry).toMatchInlineSnapshot(`
@ -961,7 +1092,7 @@ describe('actions telemetry', () => {
},
}
);
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger);
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
expect(telemetry).toStrictEqual({
@ -995,16 +1126,24 @@ describe('actions telemetry', () => {
});
});
test('getExecutionsPerDayCount should return empty results if query throws error', async () => {
test('getExecutionsPerDayCount should return empty results and log warning if query throws error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValue(new Error('oh no'));
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger);
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
expect(mockLogger.warn).toHaveBeenCalledWith(
`Error executing actions telemetry task: getExecutionsPerDayCount - {}`
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(0);
expect(loggerCalls.warn).toHaveLength(1);
expect(loggerCalls.warn[0][0]).toEqual(
`Error executing actions telemetry task: getExecutionsPerDayCount - Error: oh no`
);
// logger meta
expect(loggerCalls.warn[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.warn[0][1]?.error?.stack_trace).toBeDefined();
expect(telemetry).toMatchInlineSnapshot(`
Object {
"avgExecutionTime": 0,
@ -1020,6 +1159,64 @@ describe('actions telemetry', () => {
`);
});
test('getExecutionsPerDayCount should return empty results and log debug if query throws search_phase_execution_exception error', async () => {
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
mockEsClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', logger);
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(1);
expect(loggerCalls.debug[0][0]).toMatchInlineSnapshot(`
"Error executing actions telemetry task: getExecutionsPerDayCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[0][1]?.tags).toEqual(['actions', 'telemetry-failed']);
expect(loggerCalls.debug[0][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toMatchInlineSnapshot(`
Object {
"avgExecutionTime": 0,
"avgExecutionTimeByType": Object {},
"countByType": Object {},
"countFailed": 0,
"countFailedByType": Object {},
"countRunOutcomeByConnectorType": Object {},
"countTotal": 0,
"errorMessage": "no_shard_available_action_exception",
"hasErrors": true,
}
`);
});
it('getCounts', () => {
const aggs = {
'.d3security': 2,

View file

@ -20,6 +20,7 @@ import {
getActionExecutions,
getActionsCount,
} from './lib/actions_telemetry_util';
import { parseAndLogError } from './lib/parse_and_log_error';
export interface InMemoryAggRes {
total: number;
@ -108,9 +109,7 @@ export async function getTotalCount(
countGenAiProviderTypes,
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(`Error executing actions telemetry task: getTotalCount - ${JSON.stringify(err)}`);
const errorMessage = parseAndLogError(err, `getTotalCount`, logger);
return {
hasErrors: true,
@ -387,11 +386,8 @@ export async function getInUseTotalCount(
countNamespaces: namespacesList.size,
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
const errorMessage = parseAndLogError(err, `getInUseTotalCount`, logger);
logger.warn(
`Error executing actions telemetry task: getInUseTotalCount - ${JSON.stringify(err)}`
);
return {
hasErrors: true,
errorMessage,
@ -657,10 +653,8 @@ export async function getExecutionsPerDayCount(
),
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing actions telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}`
);
const errorMessage = parseAndLogError(err, `getExecutionsPerDayCount`, logger);
return {
hasErrors: true,
errorMessage,

View file

@ -0,0 +1,34 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { Logger } from '@kbn/core/server';
export function parseAndLogError(err: Error, errType: string, logger: Logger): string {
const errorMessage = err && err.message ? err.message : err.toString();
let returnedErrorMessage = errorMessage;
const errorStr = JSON.stringify(err);
const logMessage = `Error executing actions telemetry task: ${errType} - ${err}`;
const logOptions = {
tags: ['actions', 'telemetry-failed'],
error: { stack_trace: err.stack },
};
// If error string contains "no_shard_available_action_exception", debug log it
if (errorStr.includes('no_shard_available_action_exception')) {
// the no_shard_available_action_exception can be wordy and the error message returned from this function
// gets stored in the task state so lets simplify
returnedErrorMessage = 'no_shard_available_action_exception';
if (logger.isLevelEnabled('debug')) {
logger.debug(logMessage, logOptions);
}
} else {
logger.warn(logMessage, logOptions);
}
return returnedErrorMessage;
}
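
For orientation, here is a minimal sketch of how the collectors in this PR call the new helper from their `catch` blocks (the real signatures are `getTotalCount(esClient, kibanaIndex, logger, ...)` and friends). `getExampleCount` and `ExampleTelemetry` are hypothetical names for illustration only; `parseAndLogError`, `Logger`, and `ElasticsearchClient` are the actual imports used in the diff above.

```ts
// Sketch of the caller pattern introduced in this PR: run the telemetry query,
// and on failure delegate to parseAndLogError, which logs at debug for
// no_shard_available_action_exception and at warn for everything else,
// returning a (possibly simplified) error message to store in the result.
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { parseAndLogError } from './lib/parse_and_log_error';

// Hypothetical, trimmed-down result shape; the real collectors return richer objects.
interface ExampleTelemetry {
  hasErrors: boolean;
  errorMessage?: string;
  countTotal: number;
}

export async function getExampleCount(
  esClient: ElasticsearchClient,
  index: string,
  logger: Logger
): Promise<ExampleTelemetry> {
  try {
    const results = await esClient.search({ index, size: 0, query: { match_all: {} } });
    const total =
      typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value ?? 0;
    return { hasErrors: false, countTotal: total };
  } catch (err) {
    // debug-logged and simplified for shard-availability errors, warn-logged otherwise
    const errorMessage = parseAndLogError(err, `getExampleCount`, logger);
    return { hasErrors: true, errorMessage, countTotal: 0 };
  }
}
```

The `errorMessage` that ends up stored (and asserted in the tests above) is the simplified `no_shard_available_action_exception` string for shard-availability failures, or the original error message otherwise.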

View file

@ -6,64 +6,50 @@
*/
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { MockedLogger, loggerMock } from '@kbn/logging-mocks';
import { getTotalAlertsCountAggregations } from './get_telemetry_from_alerts';
import { errors } from '@elastic/elasticsearch';
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
let logger: MockedLogger;
describe('kibana index telemetry', () => {
beforeEach(() => {
jest.resetAllMocks();
logger = loggerMock.create();
});
it('should return total alert couts and alert counts by rule type id', async () => {
it('should return total alert counts and alert counts by rule type id', async () => {
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 6,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 6, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_rule_type_id: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{
key: '.index-threshold',
doc_count: 1,
},
{
key: 'logs.alert.document.count',
doc_count: 2,
},
{
key: 'document.test.',
doc_count: 3,
},
{ key: '.index-threshold', doc_count: 1 },
{ key: 'logs.alert.document.count', doc_count: 2 },
{ key: 'document.test.', doc_count: 3 },
],
},
},
});
const telemetry = await getTotalAlertsCountAggregations({
esClient,
logger,
});
const telemetry = await getTotalAlertsCountAggregations({ esClient, logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
expect(logger.debug).toHaveBeenCalledTimes(2);
const debugLogs = loggingSystemMock.collect(logger).debug;
expect(debugLogs).toHaveLength(2);
expect(debugLogs[0][0]).toEqual(
`query for getTotalAlertsCountAggregations - {\"index\":\".alerts-*\",\"size\":0,\"body\":{\"query\":{\"match_all\":{}},\"aggs\":{\"by_rule_type_id\":{\"terms\":{\"field\":\"kibana.alert.rule.rule_type_id\",\"size\":33}}}}}`
);
expect(debugLogs[1][0]).toEqual(
`results for getTotalAlertsCountAggregations query - {\"took\":4,\"timed_out\":false,\"_shards\":{\"total\":1,\"successful\":1,\"skipped\":0,\"failed\":0},\"hits\":{\"total\":{\"value\":6,\"relation\":\"eq\"},\"max_score\":null,\"hits\":[]},\"aggregations\":{\"by_rule_type_id\":{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\".index-threshold\",\"doc_count\":1},{\"key\":\"logs.alert.document.count\",\"doc_count\":2},{\"key\":\"document.test.\",\"doc_count\":3}]}}}`
);
expect(telemetry).toEqual({
hasErrors: false,
@ -77,37 +63,18 @@ describe('kibana index telemetry', () => {
});
});
it('should return ', async () => {
it('should return on empty results', async () => {
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 0,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 0, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_rule_type_id: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [],
},
by_rule_type_id: { doc_count_error_upper_bound: 0, sum_other_doc_count: 0, buckets: [] },
},
});
const telemetry = await getTotalAlertsCountAggregations({
esClient,
logger,
});
const telemetry = await getTotalAlertsCountAggregations({ esClient, logger });
expect(telemetry).toEqual({
hasErrors: false,
@ -116,17 +83,25 @@ describe('kibana index telemetry', () => {
});
});
test('should return empty results and log warning if query throws error', async () => {
it('should return empty results and log warning if query throws error', async () => {
esClient.search.mockRejectedValueOnce(new Error('test'));
const telemetry = await getTotalAlertsCountAggregations({
esClient,
logger,
});
const telemetry = await getTotalAlertsCountAggregations({ esClient, logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
expect(logger.debug).toHaveBeenCalledTimes(1);
expect(logger.warn).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(1);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getTotalAlertsCountAggregations - {\"index\":\".alerts-*\",\"size\":0,\"body\":{\"query\":{\"match_all\":{}},\"aggs\":{\"by_rule_type_id\":{\"terms\":{\"field\":\"kibana.alert.rule.rule_type_id\",\"size\":33}}}}}`
);
expect(loggerCalls.warn).toHaveLength(1);
expect(loggerCalls.warn[0][0]).toEqual(
`Error executing alerting telemetry task: getTotalAlertsCountAggregations - Error: test`
);
// logger meta
expect(loggerCalls.warn[0][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.warn[0][1]?.error?.stack_trace).toBeDefined();
expect(telemetry).toEqual({
hasErrors: true,
@ -135,4 +110,56 @@ describe('kibana index telemetry', () => {
count_alerts_by_rule_type: {},
});
});
it('should return empty results and log debug log if query throws search_phase_execution_exception error', async () => {
esClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getTotalAlertsCountAggregations({ esClient, logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(2);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getTotalAlertsCountAggregations - {\"index\":\".alerts-*\",\"size\":0,\"body\":{\"query\":{\"match_all\":{}},\"aggs\":{\"by_rule_type_id\":{\"terms\":{\"field\":\"kibana.alert.rule.rule_type_id\",\"size\":33}}}}}`
);
expect(loggerCalls.debug[1][0]).toMatchInlineSnapshot(`
"Error executing alerting telemetry task: getTotalAlertsCountAggregations - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[1][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.debug[1][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toEqual({
hasErrors: true,
errorMessage: `no_shard_available_action_exception`,
count_alerts_total: 0,
count_alerts_by_rule_type: {},
});
});
});

View file

@ -14,6 +14,7 @@ import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { AlertingUsage } from '../types';
import { parseAndLogError } from './parse_and_log_error';
interface Opts {
esClient: ElasticsearchClient;
@ -69,20 +70,10 @@ export async function getTotalAlertsCountAggregations({
return {
hasErrors: false,
count_alerts_total: totalAlertsCount ?? 0,
count_alerts_by_rule_type: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
count_alerts_by_rule_type: parseSimpleRuleTypeBucket(aggregations?.by_rule_type_id?.buckets),
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getTotalAlertsCountAggregations - ${JSON.stringify(
err
)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
const errorMessage = parseAndLogError(err, `getTotalAlertsCountAggregations`, logger);
return {
hasErrors: true,

View file

@ -6,6 +6,8 @@
*/
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { MockedLogger, loggerMock } from '@kbn/logging-mocks';
import { errors } from '@elastic/elasticsearch';
import {
getExecutionsPerDayCount,
parseExecutionFailureByRuleType,
@ -17,11 +19,12 @@ import {
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
let logger: MockedLogger;
describe('event log telemetry', () => {
beforeEach(() => {
jest.resetAllMocks();
logger = loggerMock.create();
});
describe('parseRuleTypeBucket', () => {
@ -1106,20 +1109,8 @@ describe('event log telemetry', () => {
esClient.search.mockResponse({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 148,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 148, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_rule_type_id: {
doc_count_error_upper_bound: 0,
@ -1302,11 +1293,7 @@ describe('event log telemetry', () => {
},
});
const telemetry = await getExecutionsPerDayCount({
esClient,
eventLogIndex: 'test',
logger,
});
const telemetry = await getExecutionsPerDayCount({ esClient, eventLogIndex: 'test', logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
@ -1416,17 +1403,14 @@ describe('event log telemetry', () => {
test('should return empty results and log warning if query throws error', async () => {
esClient.search.mockRejectedValue(new Error('oh no'));
const telemetry = await getExecutionsPerDayCount({
esClient,
eventLogIndex: 'test',
logger,
});
const telemetry = await getExecutionsPerDayCount({ esClient, eventLogIndex: 'test', logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCall = logger.warn.mock.calls[0][0];
const loggerMeta = logger.warn.mock.calls[0][1];
expect(loggerCall as string).toMatchInlineSnapshot(
`"Error executing alerting telemetry task: getExecutionsPerDayCount - {}"`
`"Error executing alerting telemetry task: getExecutionsPerDayCount - Error: oh no"`
);
expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerMeta?.error?.stack_trace).toBeDefined();
@ -1451,6 +1435,72 @@ describe('event log telemetry', () => {
countRulesByExecutionStatus: {},
});
});
it('should return empty results and log debug log if query throws search_phase_execution_exception error', async () => {
esClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getExecutionsPerDayCount({ esClient, eventLogIndex: 'test', logger });
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(2);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getExecutionsPerDayCount - {\"index\":\"test\",\"size\":0,\"body\":{\"query\":{\"bool\":{\"filter\":{\"bool\":{\"must\":[{\"term\":{\"event.action\":\"execute\"}},{\"term\":{\"event.provider\":\"alerting\"}},{\"range\":{\"@timestamp\":{\"gte\":\"now-1d\"}}}]}}}},\"aggs\":{\"avg_execution_time\":{\"avg\":{\"field\":\"event.duration\"}},\"avg_es_search_duration\":{\"avg\":{\"field\":\"kibana.alert.rule.execution.metrics.es_search_duration_ms\"}},\"avg_total_search_duration\":{\"avg\":{\"field\":\"kibana.alert.rule.execution.metrics.total_search_duration_ms\"}},\"percentile_scheduled_actions\":{\"percentiles\":{\"field\":\"kibana.alert.rule.execution.metrics.number_of_generated_actions\",\"percents\":[50,90,99]}},\"percentile_alerts\":{\"percentiles\":{\"field\":\"kibana.alert.rule.execution.metrics.alert_counts.active\",\"percents\":[50,90,99]}},\"execution_failures\":{\"filter\":{\"term\":{\"event.outcome\":\"failure\"}},\"aggs\":{\"by_reason\":{\"terms\":{\"field\":\"event.reason\",\"size\":5}}}},\"by_rule_type_id\":{\"terms\":{\"field\":\"rule.category\",\"size\":33},\"aggs\":{\"avg_execution_time\":{\"avg\":{\"field\":\"event.duration\"}},\"avg_es_search_duration\":{\"avg\":{\"field\":\"kibana.alert.rule.execution.metrics.es_search_duration_ms\"}},\"avg_total_search_duration\":{\"avg\":{\"field\":\"kibana.alert.rule.execution.metrics.total_search_duration_ms\"}},\"percentile_scheduled_actions\":{\"percentiles\":{\"field\":\"kibana.alert.rule.execution.metrics.number_of_generated_actions\",\"percents\":[50,90,99]}},\"percentile_alerts\":{\"percentiles\":{\"field\":\"kibana.alert.rule.execution.metrics.alert_counts.active\",\"percents\":[50,90,99]}},\"execution_failures\":{\"filter\":{\"term\":{\"event.outcome\":\"failure\"}},\"aggs\":{\"by_reason\":{\"terms\":{\"field\":\"event.reason\",\"size\":5}}}}}},\"by_execution_status\":{\"terms\":{\"field\":\"event.outcome\"}}}}}`
);
expect(loggerCalls.debug[1][0]).toMatchInlineSnapshot(`
"Error executing alerting telemetry task: getExecutionsPerDayCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[1][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.debug[1][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toStrictEqual({
hasErrors: true,
errorMessage: 'no_shard_available_action_exception',
countTotalRuleExecutions: 0,
countRuleExecutionsByType: {},
countTotalFailedExecutions: 0,
countFailedExecutionsByReason: {},
countFailedExecutionsByReasonByType: {},
avgExecutionTime: 0,
avgExecutionTimeByType: {},
avgEsSearchDuration: 0,
avgEsSearchDurationByType: {},
avgTotalSearchDuration: 0,
avgTotalSearchDurationByType: {},
generatedActionsPercentiles: {},
generatedActionsPercentilesByType: {},
alertsPercentiles: {},
alertsPercentilesByType: {},
countRulesByExecutionStatus: {},
});
});
});
describe('getExecutionTimeoutsPerDayCount', () => {
@ -1458,37 +1508,16 @@ describe('event log telemetry', () => {
esClient.search.mockResponse({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 4,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 4, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_rule_type_id: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{
key: '.index-threshold',
doc_count: 2,
},
{
key: 'logs.alert.document.count',
doc_count: 1,
},
{
key: 'document.test.',
doc_count: 1,
},
{ key: '.index-threshold', doc_count: 2 },
{ key: 'logs.alert.document.count', doc_count: 1 },
{ key: 'document.test.', doc_count: 1 },
],
},
},
@ -1527,7 +1556,7 @@ describe('event log telemetry', () => {
const loggerCall = logger.warn.mock.calls[0][0];
const loggerMeta = logger.warn.mock.calls[0][1];
expect(loggerCall as string).toMatchInlineSnapshot(
`"Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - {}"`
`"Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - Error: oh no"`
);
expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerMeta?.error?.stack_trace).toBeDefined();
@ -1538,5 +1567,61 @@ describe('event log telemetry', () => {
hasErrors: true,
});
});
it('should return empty results and log debug log if query throws search_phase_execution_exception error', async () => {
esClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getExecutionTimeoutsPerDayCount({
esClient,
eventLogIndex: 'test',
logger,
});
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(2);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getExecutionTimeoutsPerDayCount - {\"index\":\"test\",\"size\":0,\"body\":{\"query\":{\"bool\":{\"filter\":{\"bool\":{\"must\":[{\"term\":{\"event.action\":\"execute-timeout\"}},{\"term\":{\"event.provider\":\"alerting\"}},{\"range\":{\"@timestamp\":{\"gte\":\"now-1d\"}}}]}}}},\"aggs\":{\"by_rule_type_id\":{\"terms\":{\"field\":\"rule.category\",\"size\":33}}}}}`
);
expect(loggerCalls.debug[1][0]).toMatchInlineSnapshot(`
"Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[1][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.debug[1][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toStrictEqual({
hasErrors: true,
errorMessage: 'no_shard_available_action_exception',
countExecutionTimeouts: 0,
countExecutionTimeoutsByType: {},
});
});
});
});

View file

@ -22,6 +22,7 @@ import {
} from '../alerting_usage_collector';
import { replaceDotSymbols } from './replace_dots_with_underscores';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { parseAndLogError } from './parse_and_log_error';
const Millis2Nanos = 1000 * 1000;
const percentileFieldNameMapping: Record<string, string> = {
@ -189,14 +190,8 @@ export async function getExecutionsPerDayCount({
),
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
const errorMessage = parseAndLogError(err, `getExecutionsPerDayCount`, logger);
return {
hasErrors: true,
errorMessage,
@ -262,17 +257,8 @@ export async function getExecutionTimeoutsPerDayCount({
countExecutionTimeoutsByType: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
const errorMessage = parseAndLogError(err, `getExecutionsTimeoutsPerDayCount`, logger);
logger.warn(
`Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ${JSON.stringify(
err
)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
return {
hasErrors: true,
errorMessage,

File diff suppressed because one or more lines are too long

View file

@ -25,6 +25,7 @@ import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { groupRulesBySearchType } from './group_rules_by_search_type';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../../common';
import { MaintenanceWindowAttributes } from '../../data/maintenance_window/types';
import { parseAndLogError } from './parse_and_log_error';
interface Opts {
esClient: ElasticsearchClient;
@ -376,15 +377,8 @@ export async function getTotalCountAggregations({
},
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
const errorMessage = parseAndLogError(err, `getTotalCountAggregations`, logger);
logger.warn(
`Error executing alerting telemetry task: getTotalCountAggregations - ${JSON.stringify(err)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
return {
hasErrors: true,
errorMessage,
@ -491,14 +485,8 @@ export async function getTotalCountInUse({
countNamespaces: aggregations.namespaces_count.value ?? 0,
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getTotalCountInUse - ${JSON.stringify(err)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
const errorMessage = parseAndLogError(err, `getTotalCountInUse`, logger);
return {
hasErrors: true,
errorMessage,
@ -548,14 +536,8 @@ export async function getMWTelemetry({
count_mw_with_filter_alert_toggle_on: countMWWithFilterAlertToggleON,
};
} catch (err) {
const errorMessage = err?.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getTotalMWCount - ${JSON.stringify(err)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err?.stack },
}
);
const errorMessage = parseAndLogError(err, `getTotalMWCount`, logger);
return {
hasErrors: true,
errorMessage,

View file

@ -5,7 +5,9 @@
* 2.0.
*/
import { errors } from '@elastic/elasticsearch';
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { MockedLogger, loggerMock } from '@kbn/logging-mocks';
import {
getFailedAndUnrecognizedTasksPerDay,
parseBucket,
@ -13,11 +15,12 @@ import {
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
let logger: MockedLogger;
describe('task manager telemetry', () => {
beforeEach(() => {
jest.resetAllMocks();
logger = loggerMock.create();
});
describe('parseBucket', () => {
@ -145,20 +148,8 @@ describe('task manager telemetry', () => {
esClient.search.mockResponse({
took: 4,
timed_out: false,
_shards: {
total: 1,
successful: 1,
skipped: 0,
failed: 0,
},
hits: {
total: {
value: 40,
relation: 'eq',
},
max_score: null,
hits: [],
},
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 40, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_status: {
doc_count_error_upper_bound: 0,
@ -243,7 +234,7 @@ describe('task manager telemetry', () => {
const loggerCall = logger.warn.mock.calls[0][0];
const loggerMeta = logger.warn.mock.calls[0][1];
expect(loggerCall as string).toMatchInlineSnapshot(
`"Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - {}"`
`"Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - Error: oh no"`
);
expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerMeta?.error?.stack_trace).toBeDefined();
@ -255,5 +246,62 @@ describe('task manager telemetry', () => {
countFailedAndUnrecognizedTasksByStatusByType: {},
});
});
test('should return empty results and log debug log if query throws search_phase_execution_exception error', async () => {
esClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getFailedAndUnrecognizedTasksPerDay({
esClient,
taskManagerIndex: 'test',
logger,
});
expect(esClient.search).toHaveBeenCalledTimes(1);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(2);
expect(loggerCalls.debug[0][0]).toEqual(
`query for getFailedAndUnrecognizedTasksPerDay - {\"index\":\"test\",\"size\":0,\"body\":{\"query\":{\"bool\":{\"must\":[{\"bool\":{\"should\":[{\"term\":{\"task.status\":\"unrecognized\"}},{\"term\":{\"task.status\":\"failed\"}}]}},{\"wildcard\":{\"task.taskType\":{\"value\":\"alerting:*\"}}},{\"range\":{\"task.runAt\":{\"gte\":\"now-1d\"}}}]}},\"aggs\":{\"by_status\":{\"terms\":{\"field\":\"task.status\",\"size\":10},\"aggs\":{\"by_task_type\":{\"terms\":{\"field\":\"task.taskType\",\"size\":33}}}}}}}`
);
expect(loggerCalls.debug[1][0]).toMatchInlineSnapshot(`
"Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[1][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.debug[1][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
expect(telemetry).toStrictEqual({
errorMessage: 'no_shard_available_action_exception',
hasErrors: true,
countFailedAndUnrecognizedTasks: 0,
countFailedAndUnrecognizedTasksByStatus: {},
countFailedAndUnrecognizedTasksByStatusByType: {},
});
});
});
});

View file

@ -14,6 +14,7 @@ import type {
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { replaceDotSymbols } from './replace_dots_with_underscores';
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
import { parseAndLogError } from './parse_and_log_error';
interface Opts {
esClient: ElasticsearchClient;
@ -122,16 +123,8 @@ export async function getFailedAndUnrecognizedTasksPerDay({
countFailedAndUnrecognizedTasks: totalFailedAndUnrecognizedTasks ?? 0,
};
} catch (err) {
const errorMessage = err && err.message ? err.message : err.toString();
logger.warn(
`Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify(
err
)}`,
{
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
}
);
const errorMessage = parseAndLogError(err, `getFailedAndUnrecognizedTasksPerDay`, logger);
return {
hasErrors: true,
errorMessage,

View file

@ -0,0 +1,34 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { Logger } from '@kbn/core/server';
export function parseAndLogError(err: Error, errType: string, logger: Logger): string {
const errorMessage = err && err.message ? err.message : err.toString();
let returnedErrorMessage = errorMessage;
const errorStr = JSON.stringify(err);
const logMessage = `Error executing alerting telemetry task: ${errType} - ${err}`;
const logOptions = {
tags: ['alerting', 'telemetry-failed'],
error: { stack_trace: err.stack },
};
// If error string contains "no_shard_available_action_exception", debug log it
if (errorStr.includes('no_shard_available_action_exception')) {
// the no_shard_available_action_exception can be wordy and the error message returned from this function
// gets stored in the task state so lets simplify
returnedErrorMessage = 'no_shard_available_action_exception';
if (logger.isLevelEnabled('debug')) {
logger.debug(logMessage, logOptions);
}
} else {
logger.warn(logMessage, logOptions);
}
return returnedErrorMessage;
}
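
A closing note on why the expected log messages in the updated tests changed from `... - {}` to `... - Error: oh no`: the previous code interpolated `JSON.stringify(err)`, and plain `Error` instances serialize to `{}` because `message` and `stack` are non-enumerable, while the new helpers interpolate the error itself (`${err}`), which falls back to `Error#toString()`. A minimal TypeScript sketch of the difference:

```ts
// Plain Error properties (message, stack) are non-enumerable, so the old
// `JSON.stringify(err)` logging produced an unhelpful "{}".
const err = new Error('oh no');

console.log(JSON.stringify(err)); // {}
console.log(`${err}`); // Error: oh no  (via Error#toString())
console.log(err.stack?.split('\n')[0]); // Error: oh no (the stack begins with the same text)
```

For the mocked `ResponseError` cases, the client-built message already carries the `Caused by` detail, which is what the inline snapshots above capture.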