[Response Ops] Replace scripted metric aggs in alerting telemetry queries with terms aggregations (#134769)
* Updating getTotalCountAggregations query
* Replacing scripted metric aggs with terms aggregations
* Fixing task manager query
* Updating replaceDotSymbols fn
* Adding stack trace to logger meta
* Reusing event log query
* Adding fallback for bucket key and doc_count
* Switch reduce for for loop
* combining aggs
* Fixing nulls issue

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
parent 8440cec9a6
commit 4b7b363e9c

14 changed files with 3267 additions and 1706 deletions
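The commit message above is terse, so here is a minimal sketch of the central pattern it describes: replacing a Painless scripted_metric aggregation (visible in the deleted file below) with a plain terms aggregation, then folding the buckets back into the Record<string, number> shape the telemetry payload expects. This is illustrative, not the literal PR code — parseRuleTypeBucketAggs and the placeholder NUM_ALERTING_RULE_TYPES value are made up for the example; the real file imports that constant from the usage collector, and replaceDotSymbols here just matches the "." -> "__" normalization exercised by the tests below.

// Sketch only — illustrative names, not the literal PR code.
const NUM_ALERTING_RULE_TYPES = 50; // placeholder; the real file imports this constant

// Matches the "." -> "__" normalization exercised by the tests below.
const replaceDotSymbols = (s: string) => s.replace(/\./g, '__');

// The terms aggregation that takes over the per-rule-type counting
// previously done by a scripted_metric agg.
const byRuleTypeIdTermsAgg = {
  terms: {
    field: 'rule.category',
    size: NUM_ALERTING_RULE_TYPES,
  },
};

interface TermsBucket {
  key?: string;
  doc_count?: number;
}

// "Switch reduce for for loop" + "Adding fallback for bucket key and doc_count":
// a plain for loop over the buckets, defaulting key and doc_count when absent.
function parseRuleTypeBucketAggs(buckets: TermsBucket[] = []): Record<string, number> {
  const countByType: Record<string, number> = {};
  for (const bucket of buckets) {
    countByType[replaceDotSymbols(`${bucket?.key ?? ''}`)] = bucket?.doc_count ?? 0;
  }
  return countByType;
}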
@@ -1,725 +0,0 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

/* eslint-disable @typescript-eslint/naming-convention */

// eslint-disable-next-line @kbn/eslint/no-restricted-paths
import { elasticsearchClientMock } from '@kbn/core/server/elasticsearch/client/mocks';
import { loggingSystemMock } from '@kbn/core/server/mocks';
import {
  getTotalCountAggregations,
  getTotalCountInUse,
  getExecutionsPerDayCount,
  getExecutionTimeoutsPerDayCount,
  getFailedAndUnrecognizedTasksPerDay,
  parsePercentileAggsByRuleType,
} from './alerting_telemetry';

const mockLogger = loggingSystemMock.create().get();
describe('alerting telemetry', () => {
  test('getTotalCountInUse should replace "." symbols with "__" in rule types names', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockResponse(
      // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
      {
        aggregations: {
          byRuleTypeId: {
            value: {
              ruleTypes: {
                '.index-threshold': 2,
                'logs.alert.document.count': 1,
                'document.test.': 1,
              },
              namespaces: {
                default: 1,
              },
            },
          },
        },
        hits: {
          hits: [],
        },
      }
    );

    const telemetry = await getTotalCountInUse(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);

    expect(telemetry).toMatchInlineSnapshot(`
      Object {
        "countByType": Object {
          "__index-threshold": 2,
          "document__test__": 1,
          "logs__alert__document__count": 1,
        },
        "countNamespaces": 1,
        "countTotal": 4,
      }
    `);
  });

  test('getTotalCountInUse should return empty results if query throws error', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockRejectedValue(new Error('oh no'));

    const telemetry = await getTotalCountInUse(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);
    expect(mockLogger.warn).toHaveBeenCalledWith(
      `Error executing alerting telemetry task: getTotalCountInUse - {}`
    );
    expect(telemetry).toMatchInlineSnapshot(`
      Object {
        "countByType": Object {},
        "countNamespaces": 0,
        "countTotal": 0,
      }
    `);
  });

  test('getTotalCountAggregations should return min/max connectors in use', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockResponse(
      // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
      {
        aggregations: {
          byRuleTypeId: {
            value: {
              ruleTypes: {
                '.index-threshold': 2,
                'logs.alert.document.count': 1,
                'document.test.': 1,
              },
            },
          },
          max_throttle_time: { value: 60 },
          min_throttle_time: { value: 0 },
          avg_throttle_time: { value: 30 },
          max_interval_time: { value: 10 },
          min_interval_time: { value: 1 },
          avg_interval_time: { value: 4.5 },
          max_actions_count: { value: 4 },
          min_actions_count: { value: 0 },
          avg_actions_count: { value: 2.5 },
        },
        hits: {
          hits: [],
        },
      }
    );

    const telemetry = await getTotalCountAggregations(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);

    expect(telemetry).toMatchInlineSnapshot(`
      Object {
        "connectors_per_alert": Object {
          "avg": 2.5,
          "max": 4,
          "min": 0,
        },
        "count_by_type": Object {
          "__index-threshold": 2,
          "document__test__": 1,
          "logs__alert__document__count": 1,
        },
        "count_rules_namespaces": 0,
        "count_total": 4,
        "schedule_time": Object {
          "avg": "4.5s",
          "max": "10s",
          "min": "1s",
        },
        "schedule_time_number_s": Object {
          "avg": 4.5,
          "max": 10,
          "min": 1,
        },
        "throttle_time": Object {
          "avg": "30s",
          "max": "60s",
          "min": "0s",
        },
        "throttle_time_number_s": Object {
          "avg": 30,
          "max": 60,
          "min": 0,
        },
      }
    `);
  });

  test('getTotalCountAggregations should return empty results if query throws error', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockRejectedValue(new Error('oh no'));

    const telemetry = await getTotalCountAggregations(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);
    expect(mockLogger.warn).toHaveBeenCalledWith(
      `Error executing alerting telemetry task: getTotalCountAggregations - {}`
    );
    expect(telemetry).toMatchInlineSnapshot(`
      Object {
        "connectors_per_alert": Object {
          "avg": 0,
          "max": 0,
          "min": 0,
        },
        "count_by_type": Object {},
        "count_rules_namespaces": 0,
        "count_total": 0,
        "schedule_time": Object {
          "avg": "0s",
          "max": "0s",
          "min": "0s",
        },
        "schedule_time_number_s": Object {
          "avg": 0,
          "max": 0,
          "min": 0,
        },
        "throttle_time": Object {
          "avg": "0s",
          "max": "0s",
          "min": "0s",
        },
        "throttle_time_number_s": Object {
          "avg": 0,
          "max": 0,
          "min": 0,
        },
      }
    `);
  });

  test('getExecutionsPerDayCount should return execution aggregations for total count, count by rule type and number of failed executions', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockResponse(
      // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
      {
        aggregations: {
          byRuleTypeId: {
            value: {
              ruleTypes: {
                '.index-threshold': 2,
                'logs.alert.document.count': 1,
                'document.test.': 1,
              },
              ruleTypesDuration: {
                '.index-threshold': 2087868,
                'logs.alert.document.count': 1675765,
                'document.test.': 17687687,
              },
              ruleTypesEsSearchDuration: {
                '.index-threshold': 23,
                'logs.alert.document.count': 526,
                'document.test.': 534,
              },
              ruleTypesTotalSearchDuration: {
                '.index-threshold': 62,
                'logs.alert.document.count': 588,
                'document.test.': 637,
              },
            },
          },
          failuresByReason: {
            value: {
              reasons: {
                unknown: {
                  '.index-threshold': 2,
                  'logs.alert.document.count': 1,
                  'document.test.': 1,
                },
              },
            },
          },
          avgDuration: { value: 10 },
          avgEsSearchDuration: {
            value: 25.785714285714285,
          },
          avgTotalSearchDuration: {
            value: 30.642857142857142,
          },
          percentileScheduledActions: {
            values: {
              '50.0': 4.0,
              '90.0': 26.0,
              '99.0': 26.0,
            },
          },
          percentileAlerts: {
            values: {
              '50.0': 10.0,
              '90.0': 22.0,
              '99.0': 22.0,
            },
          },
          aggsByType: {
            doc_count_error_upper_bound: 0,
            sum_other_doc_count: 0,
            buckets: [
              {
                key: '.index-threshold',
                doc_count: 149,
                percentileScheduledActions: {
                  values: {
                    '50.0': 4.0,
                    '90.0': 26.0,
                    '99.0': 26.0,
                  },
                },
                percentileAlerts: {
                  values: {
                    '50.0': 10.0,
                    '90.0': 22.0,
                    '99.0': 22.0,
                  },
                },
              },
              {
                key: 'logs.alert.document.count',
                doc_count: 1,
                percentileScheduledActions: {
                  values: {
                    '50.0': 10.0,
                    '90.0': 10.0,
                    '99.0': 10.0,
                  },
                },
                percentileAlerts: {
                  values: {
                    '50.0': 5.0,
                    '90.0': 13.0,
                    '99.0': 13.0,
                  },
                },
              },
            ],
          },
        },
        hits: {
          hits: [],
        },
      }
    );

    const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);

    expect(telemetry).toStrictEqual({
      avgEsSearchDuration: 26,
      avgEsSearchDurationByType: {
        '__index-threshold': 12,
        document__test__: 534,
        logs__alert__document__count: 526,
      },
      avgExecutionTime: 0,
      avgExecutionTimeByType: {
        '__index-threshold': 1043934,
        document__test__: 17687687,
        logs__alert__document__count: 1675765,
      },
      avgTotalSearchDuration: 31,
      avgTotalSearchDurationByType: {
        '__index-threshold': 31,
        document__test__: 637,
        logs__alert__document__count: 588,
      },
      countByType: {
        '__index-threshold': 2,
        document__test__: 1,
        logs__alert__document__count: 1,
      },
      countFailuresByReason: {
        unknown: 4,
      },
      countFailuresByReasonByType: {
        unknown: {
          '__index-threshold': 2,
          document__test__: 1,
          logs__alert__document__count: 1,
        },
      },
      countTotal: 4,
      countTotalFailures: 4,
      generatedActionsPercentiles: {
        p50: 4,
        p90: 26,
        p99: 26,
      },
      generatedActionsPercentilesByType: {
        p50: {
          '__index-threshold': 4,
          logs__alert__document__count: 10,
        },
        p90: {
          '__index-threshold': 26,
          logs__alert__document__count: 10,
        },
        p99: {
          '__index-threshold': 26,
          logs__alert__document__count: 10,
        },
      },
      alertsPercentiles: {
        p50: 10,
        p90: 22,
        p99: 22,
      },
      alertsPercentilesByType: {
        p50: {
          '__index-threshold': 10,
          logs__alert__document__count: 5,
        },
        p90: {
          '__index-threshold': 22,
          logs__alert__document__count: 13,
        },
        p99: {
          '__index-threshold': 22,
          logs__alert__document__count: 13,
        },
      },
    });
  });

  test('getExecutionsPerDayCount should return empty results if query throws error', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockRejectedValue(new Error('oh no'));

    const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);
    expect(mockLogger.warn).toHaveBeenCalledWith(
      `Error executing alerting telemetry task: getExecutionsPerDayCount - {}`
    );
    expect(telemetry).toStrictEqual({
      avgEsSearchDuration: 0,
      avgEsSearchDurationByType: {},
      avgExecutionTime: 0,
      avgExecutionTimeByType: {},
      avgTotalSearchDuration: 0,
      avgTotalSearchDurationByType: {},
      countByType: {},
      countFailuresByReason: {},
      countFailuresByReasonByType: {},
      countTotal: 0,
      countTotalFailures: 0,
      generatedActionsPercentiles: {},
      generatedActionsPercentilesByType: {},
      alertsPercentiles: {},
      alertsPercentilesByType: {},
    });
  });

  test('getExecutionTimeoutsPerDayCount should return execution aggregations for total timeout count and count by rule type', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockResponse(
      // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
      {
        aggregations: {
          byRuleTypeId: {
            value: {
              ruleTypes: {
                '.index-threshold': 2,
                'logs.alert.document.count': 1,
                'document.test.': 1,
              },
            },
          },
        },
        hits: {
          hits: [],
        },
      }
    );

    const telemetry = await getExecutionTimeoutsPerDayCount(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);

    expect(telemetry).toStrictEqual({
      countTotal: 4,
      countByType: {
        '__index-threshold': 2,
        document__test__: 1,
        logs__alert__document__count: 1,
      },
    });
  });

  test('getExecutionTimeoutsPerDayCount should return empty results if query throws error', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockRejectedValue(new Error('oh no'));

    const telemetry = await getExecutionTimeoutsPerDayCount(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);
    expect(mockLogger.warn).toHaveBeenCalledWith(
      `Error executing alerting telemetry task: getExecutionsPerDayCount - {}`
    );
    expect(telemetry).toStrictEqual({
      countTotal: 0,
      countByType: {},
    });
  });

  test('getFailedAndUnrecognizedTasksPerDay should aggregations for total count, count by status and count by status and rule type for failed and unrecognized tasks', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockResponse(
      // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
      {
        aggregations: {
          byTaskTypeId: {
            value: {
              statuses: {
                failed: {
                  '.index-threshold': 2,
                  'logs.alert.document.count': 1,
                  'document.test.': 1,
                },
                unrecognized: {
                  'o.l.d.task-type': 1,
                },
              },
            },
          },
        },
        hits: {
          hits: [],
        },
      }
    );

    const telemetry = await getFailedAndUnrecognizedTasksPerDay(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);

    expect(telemetry).toStrictEqual({
      countByStatus: {
        failed: 4,
        unrecognized: 1,
      },
      countByStatusByRuleType: {
        failed: {
          '__index-threshold': 2,
          document__test__: 1,
          logs__alert__document__count: 1,
        },
        unrecognized: {
          'o__l__d__task-type': 1,
        },
      },
      countTotal: 5,
    });
  });

  test('getFailedAndUnrecognizedTasksPerDay should return empty results if query throws error', async () => {
    const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
    mockEsClient.search.mockRejectedValue(new Error('oh no'));

    const telemetry = await getFailedAndUnrecognizedTasksPerDay(mockEsClient, 'test', mockLogger);

    expect(mockEsClient.search).toHaveBeenCalledTimes(1);
    expect(mockLogger.warn).toHaveBeenCalledWith(
      `Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - {}`
    );
    expect(telemetry).toStrictEqual({
      countByStatus: {},
      countByStatusByRuleType: {},
      countTotal: 0,
    });
  });

  test('parsePercentileAggsByRuleType', () => {
    const aggsByType = {
      doc_count_error_upper_bound: 0,
      sum_other_doc_count: 0,
      buckets: [
        {
          key: '.index-threshold',
          doc_count: 149,
          percentileScheduledActions: {
            values: {
              '50.0': 4.0,
              '90.0': 26.0,
              '99.0': 26.0,
            },
          },
          percentileAlerts: {
            values: {
              '50.0': 3.0,
              '90.0': 22.0,
              '99.0': 22.0,
            },
          },
        },
        {
          key: 'logs.alert.document.count',
          doc_count: 1,
          percentileScheduledActions: {
            values: {
              '50.0': 10.0,
              '90.0': 10.0,
              '99.0': 10.0,
            },
          },
          percentileAlerts: {
            values: {
              '50.0': 5.0,
              '90.0': 16.0,
              '99.0': 16.0,
            },
          },
        },
        {
          key: 'document.test.',
          doc_count: 1,
          percentileScheduledActions: {
            values: {
              '50.0': null,
              '90.0': null,
              '99.0': null,
            },
          },
          percentileAlerts: {
            values: {
              '50.0': null,
              '90.0': null,
              '99.0': null,
            },
          },
        },
      ],
    };
    expect(
      parsePercentileAggsByRuleType(aggsByType.buckets, 'percentileScheduledActions.values')
    ).toEqual({
      p50: {
        '__index-threshold': 4,
        document__test__: 0,
        logs__alert__document__count: 10,
      },
      p90: {
        '__index-threshold': 26,
        document__test__: 0,
        logs__alert__document__count: 10,
      },
      p99: {
        '__index-threshold': 26,
        document__test__: 0,
        logs__alert__document__count: 10,
      },
    });
    expect(parsePercentileAggsByRuleType(aggsByType.buckets, 'percentileAlerts.values')).toEqual({
      p50: {
        '__index-threshold': 3,
        document__test__: 0,
        logs__alert__document__count: 5,
      },
      p90: {
        '__index-threshold': 22,
        document__test__: 0,
        logs__alert__document__count: 16,
      },
      p99: {
        '__index-threshold': 22,
        document__test__: 0,
        logs__alert__document__count: 16,
      },
    });
  });

  test('parsePercentileAggsByRuleType handles unknown path', () => {
    const aggsByType = {
      doc_count_error_upper_bound: 0,
      sum_other_doc_count: 0,
      buckets: [
        {
          key: '.index-threshold',
          doc_count: 149,
          percentileScheduledActions: {
            values: {
              '50.0': 4.0,
              '90.0': 26.0,
              '99.0': 26.0,
            },
          },
        },
        {
          key: 'logs.alert.document.count',
          doc_count: 1,
          percentileScheduledActions: {
            values: {
              '50.0': 10.0,
              '90.0': 10.0,
              '99.0': 10.0,
            },
          },
        },
      ],
    };
    expect(parsePercentileAggsByRuleType(aggsByType.buckets, 'foo.values')).toEqual({
      p50: {},
      p90: {},
      p99: {},
    });
  });

  test('parsePercentileAggsByRuleType handles unrecognized percentiles', () => {
    const aggsByType = {
      doc_count_error_upper_bound: 0,
      sum_other_doc_count: 0,
      buckets: [
        {
          key: '.index-threshold',
          doc_count: 149,
          percentileScheduledActions: {
            values: {
              '50.0': 4.0,
              '75.0': 8.0,
              '90.0': 26.0,
              '99.0': 26.0,
            },
          },
        },
        {
          key: 'logs.alert.document.count',
          doc_count: 1,
          percentileScheduledActions: {
            values: {
              '50.0': 10.0,
              '75.0': 10.0,
              '90.0': 10.0,
              '99.0': 10.0,
            },
          },
        },
      ],
    };
    expect(
      parsePercentileAggsByRuleType(aggsByType.buckets, 'percentileScheduledActions.values')
    ).toEqual({
      p50: {
        '__index-threshold': 4,
        logs__alert__document__count: 10,
      },
      p90: {
        '__index-threshold': 26,
        logs__alert__document__count: 10,
      },
      p99: {
        '__index-threshold': 26,
        logs__alert__document__count: 10,
      },
    });
  });
});
@ -1,962 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
|
||||||
* or more contributor license agreements. Licensed under the Elastic License
|
|
||||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
|
||||||
* 2.0.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
|
||||||
import { ElasticsearchClient, Logger } from '@kbn/core/server';
|
|
||||||
import { get, merge } from 'lodash';
|
|
||||||
import { AlertingUsage } from './types';
|
|
||||||
import { NUM_ALERTING_RULE_TYPES } from './alerting_usage_collector';
|
|
||||||
|
|
||||||
const percentileFieldNameMapping: Record<string, string> = {
|
|
||||||
'50.0': 'p50',
|
|
||||||
'90.0': 'p90',
|
|
||||||
'99.0': 'p99',
|
|
||||||
};
|
|
||||||
|
|
||||||
const ruleTypeMetric = {
|
|
||||||
scripted_metric: {
|
|
||||||
init_script: 'state.ruleTypes = [:]; state.namespaces = [:]',
|
|
||||||
map_script: `
|
|
||||||
String ruleType = doc['alert.alertTypeId'].value;
|
|
||||||
String namespace = doc['namespaces'] !== null && doc['namespaces'].size() > 0 ? doc['namespaces'].value : 'default';
|
|
||||||
state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1);
|
|
||||||
if (state.namespaces.containsKey(namespace) === false) {
|
|
||||||
state.namespaces.put(namespace, 1);
|
|
||||||
}
|
|
||||||
`,
|
|
||||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
|
||||||
// Despite docs that say this is optional, this script can't be blank.
|
|
||||||
combine_script: 'return state',
|
|
||||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
|
||||||
// This also needs to account for having no data
|
|
||||||
reduce_script: `
|
|
||||||
HashMap result = new HashMap();
|
|
||||||
HashMap combinedRuleTypes = new HashMap();
|
|
||||||
HashMap combinedNamespaces = new HashMap();
|
|
||||||
for (state in states) {
|
|
||||||
for (String ruleType : state.ruleTypes.keySet()) {
|
|
||||||
int ruleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + state.ruleTypes.get(ruleType) : state.ruleTypes.get(ruleType);
|
|
||||||
combinedRuleTypes.put(ruleType, ruleTypeCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (String namespace : state.namespaces.keySet()) {
|
|
||||||
combinedNamespaces.put(namespace, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
result.ruleTypes = combinedRuleTypes;
|
|
||||||
result.namespaces = combinedNamespaces;
|
|
||||||
return result;
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
const generatedActionsPercentilesAgg = {
|
|
||||||
percentiles: {
|
|
||||||
field: 'kibana.alert.rule.execution.metrics.number_of_generated_actions',
|
|
||||||
percents: [50, 90, 99],
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
const alertsPercentilesAgg = {
|
|
||||||
percentiles: {
|
|
||||||
field: 'kibana.alert.rule.execution.metrics.alert_counts.active',
|
|
||||||
percents: [50, 90, 99],
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
const ruleTypeExecutionsWithDurationMetric = {
|
|
||||||
scripted_metric: {
|
|
||||||
init_script:
|
|
||||||
'state.ruleTypes = [:]; state.ruleTypesDuration = [:]; state.ruleTypesEsSearchDuration = [:]; state.ruleTypesTotalSearchDuration = [:];',
|
|
||||||
map_script: `
|
|
||||||
String ruleType = doc['rule.category'].value;
|
|
||||||
long duration = doc['event.duration'].value / (1000 * 1000);
|
|
||||||
long esSearchDuration = doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].value;
|
|
||||||
long totalSearchDuration = doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].value;
|
|
||||||
state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1);
|
|
||||||
state.ruleTypesDuration.put(ruleType, state.ruleTypesDuration.containsKey(ruleType) ? state.ruleTypesDuration.get(ruleType) + duration : duration);
|
|
||||||
state.ruleTypesEsSearchDuration.put(ruleType, state.ruleTypesEsSearchDuration.containsKey(ruleType) ? state.ruleTypesEsSearchDuration.get(ruleType) + esSearchDuration : esSearchDuration);
|
|
||||||
state.ruleTypesTotalSearchDuration.put(ruleType, state.ruleTypesTotalSearchDuration.containsKey(ruleType) ? state.ruleTypesTotalSearchDuration.get(ruleType) + totalSearchDuration : totalSearchDuration);
|
|
||||||
`,
|
|
||||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
|
||||||
// Despite docs that say this is optional, this script can't be blank.
|
|
||||||
combine_script: 'return state',
|
|
||||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
|
||||||
// This also needs to account for having no data
|
|
||||||
reduce_script: `
|
|
||||||
HashMap result = new HashMap();
|
|
||||||
HashMap combinedRuleTypes = new HashMap();
|
|
||||||
HashMap combinedRuleTypeDurations = new HashMap();
|
|
||||||
HashMap combinedRuleTypeEsSearchDurations = new HashMap();
|
|
||||||
HashMap combinedRuleTypeTotalSearchDurations = new HashMap();
|
|
||||||
for (state in states) {
|
|
||||||
for (String ruleType : state.ruleTypes.keySet()) {
|
|
||||||
int ruleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + state.ruleTypes.get(ruleType) : state.ruleTypes.get(ruleType);
|
|
||||||
combinedRuleTypes.put(ruleType, ruleTypeCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (String ruleType : state.ruleTypesDuration.keySet()) {
|
|
||||||
long ruleTypeDurationTotal = combinedRuleTypeDurations.containsKey(ruleType) ? combinedRuleTypeDurations.get(ruleType) + state.ruleTypesDuration.get(ruleType) : state.ruleTypesDuration.get(ruleType);
|
|
||||||
combinedRuleTypeDurations.put(ruleType, ruleTypeDurationTotal);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (String ruleType : state.ruleTypesEsSearchDuration.keySet()) {
|
|
||||||
long ruleTypeEsSearchDurationTotal = combinedRuleTypeEsSearchDurations.containsKey(ruleType) ? combinedRuleTypeEsSearchDurations.get(ruleType) + state.ruleTypesEsSearchDuration.get(ruleType) : state.ruleTypesEsSearchDuration.get(ruleType);
|
|
||||||
combinedRuleTypeEsSearchDurations.put(ruleType, ruleTypeEsSearchDurationTotal);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (String ruleType : state.ruleTypesTotalSearchDuration.keySet()) {
|
|
||||||
long ruleTypeTotalSearchDurationTotal = combinedRuleTypeTotalSearchDurations.containsKey(ruleType) ? combinedRuleTypeTotalSearchDurations.get(ruleType) + state.ruleTypesTotalSearchDuration.get(ruleType) : state.ruleTypesTotalSearchDuration.get(ruleType);
|
|
||||||
combinedRuleTypeTotalSearchDurations.put(ruleType, ruleTypeTotalSearchDurationTotal);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
result.ruleTypes = combinedRuleTypes;
|
|
||||||
result.ruleTypesDuration = combinedRuleTypeDurations;
|
|
||||||
result.ruleTypesEsSearchDuration = combinedRuleTypeEsSearchDurations;
|
|
||||||
result.ruleTypesTotalSearchDuration = combinedRuleTypeTotalSearchDurations;
|
|
||||||
return result;
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
const ruleTypeExecutionsMetric = {
|
|
||||||
scripted_metric: {
|
|
||||||
init_script: 'state.ruleTypes = [:]',
|
|
||||||
map_script: `
|
|
||||||
String ruleType = doc['rule.category'].value;
|
|
||||||
state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1);
|
|
||||||
`,
|
|
||||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
|
||||||
// Despite docs that say this is optional, this script can't be blank.
|
|
||||||
combine_script: 'return state',
|
|
||||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
|
||||||
// This also needs to account for having no data
|
|
||||||
reduce_script: `
|
|
||||||
HashMap result = new HashMap();
|
|
||||||
HashMap combinedRuleTypes = new HashMap();
|
|
||||||
for (state in states) {
|
|
||||||
for (String ruleType : state.ruleTypes.keySet()) {
|
|
||||||
int ruleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + state.ruleTypes.get(ruleType) : state.ruleTypes.get(ruleType);
|
|
||||||
combinedRuleTypes.put(ruleType, ruleTypeCount);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
result.ruleTypes = combinedRuleTypes;
|
|
||||||
return result;
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
const taskTypeExecutionsMetric = {
|
|
||||||
scripted_metric: {
|
|
||||||
init_script: 'state.statuses = [:]',
|
|
||||||
map_script: `
|
|
||||||
String status = doc['task.status'].value;
|
|
||||||
String taskType = doc['task.taskType'].value.replace('alerting:', '');
|
|
||||||
Map taskTypes = state.statuses.containsKey(status) ? state.statuses.get(status) : [:];
|
|
||||||
taskTypes.put(taskType, taskTypes.containsKey(taskType) ? taskTypes.get(taskType) + 1 : 1);
|
|
||||||
state.statuses.put(status, taskTypes);
|
|
||||||
`,
|
|
||||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
|
||||||
// Despite docs that say this is optional, this script can't be blank.
|
|
||||||
combine_script: 'return state',
|
|
||||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
|
||||||
// This also needs to account for having no data
|
|
||||||
reduce_script: `
|
|
||||||
HashMap result = new HashMap();
|
|
||||||
HashMap combinedStatuses = new HashMap();
|
|
||||||
for (state in states) {
|
|
||||||
for (String status : state.statuses.keySet()) {
|
|
||||||
HashMap combinedTaskTypes = new HashMap();
|
|
||||||
Map statusTaskTypes = state.statuses.get(status);
|
|
||||||
for (String taskType : statusTaskTypes.keySet()) {
|
|
||||||
int statusByTaskTypeCount = combinedTaskTypes.containsKey(taskType) ? combinedTaskTypes.get(taskType) + statusTaskTypes.get(taskType) : statusTaskTypes.get(taskType);
|
|
||||||
combinedTaskTypes.put(taskType, statusByTaskTypeCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
combinedStatuses.put(status, combinedTaskTypes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
result.statuses = combinedStatuses;
|
|
||||||
return result;
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
const ruleTypeFailureExecutionsMetric = {
|
|
||||||
scripted_metric: {
|
|
||||||
init_script: 'state.reasons = [:]',
|
|
||||||
map_script: `
|
|
||||||
if (doc['event.outcome'].value == 'failure') {
|
|
||||||
String reason = doc['event.reason'].value;
|
|
||||||
String ruleType = doc['rule.category'].value;
|
|
||||||
Map ruleTypes = state.reasons.containsKey(reason) ? state.reasons.get(reason) : [:];
|
|
||||||
ruleTypes.put(ruleType, ruleTypes.containsKey(ruleType) ? ruleTypes.get(ruleType) + 1 : 1);
|
|
||||||
state.reasons.put(reason, ruleTypes);
|
|
||||||
}
|
|
||||||
`,
|
|
||||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
|
||||||
// Despite docs that say this is optional, this script can't be blank.
|
|
||||||
combine_script: 'return state',
|
|
||||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
|
||||||
// This also needs to account for having no data
|
|
||||||
reduce_script: `
|
|
||||||
HashMap result = new HashMap();
|
|
||||||
HashMap combinedReasons = new HashMap();
|
|
||||||
for (state in states) {
|
|
||||||
for (String reason : state.reasons.keySet()) {
|
|
||||||
HashMap combinedRuleTypes = new HashMap();
|
|
||||||
Map reasonRuleTypes = state.reasons.get(reason);
|
|
||||||
for (String ruleType : state.reasons.get(reason).keySet()) {
|
|
||||||
int reasonByRuleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + reasonRuleTypes.get(ruleType) : reasonRuleTypes.get(ruleType);
|
|
||||||
combinedRuleTypes.put(ruleType, reasonByRuleTypeCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
combinedReasons.put(reason, combinedRuleTypes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
result.reasons = combinedReasons;
|
|
||||||
return result;
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
export async function getTotalCountAggregations(
|
|
||||||
esClient: ElasticsearchClient,
|
|
||||||
kibanaIndex: string,
|
|
||||||
logger: Logger
|
|
||||||
): Promise<
|
|
||||||
Pick<
|
|
||||||
AlertingUsage,
|
|
||||||
| 'count_total'
|
|
||||||
| 'count_by_type'
|
|
||||||
| 'throttle_time'
|
|
||||||
| 'schedule_time'
|
|
||||||
| 'throttle_time_number_s'
|
|
||||||
| 'schedule_time_number_s'
|
|
||||||
| 'connectors_per_alert'
|
|
||||||
| 'count_rules_namespaces'
|
|
||||||
>
|
|
||||||
> {
|
|
||||||
try {
|
|
||||||
const results = await esClient.search({
|
|
||||||
index: kibanaIndex,
|
|
||||||
body: {
|
|
||||||
size: 0,
|
|
||||||
query: {
|
|
||||||
bool: {
|
|
||||||
filter: [{ term: { type: 'alert' } }],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
runtime_mappings: {
|
|
||||||
alert_action_count: {
|
|
||||||
type: 'long',
|
|
||||||
script: {
|
|
||||||
source: `
|
|
||||||
def alert = params._source['alert'];
|
|
||||||
if (alert != null) {
|
|
||||||
def actions = alert.actions;
|
|
||||||
if (actions != null) {
|
|
||||||
emit(actions.length);
|
|
||||||
} else {
|
|
||||||
emit(0);
|
|
||||||
}
|
|
||||||
}`,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
alert_interval: {
|
|
||||||
type: 'long',
|
|
||||||
script: {
|
|
||||||
source: `
|
|
||||||
int parsed = 0;
|
|
||||||
if (doc['alert.schedule.interval'].size() > 0) {
|
|
||||||
def interval = doc['alert.schedule.interval'].value;
|
|
||||||
|
|
||||||
if (interval.length() > 1) {
|
|
||||||
// get last char
|
|
||||||
String timeChar = interval.substring(interval.length() - 1);
|
|
||||||
// remove last char
|
|
||||||
interval = interval.substring(0, interval.length() - 1);
|
|
||||||
|
|
||||||
if (interval.chars().allMatch(Character::isDigit)) {
|
|
||||||
// using of regex is not allowed in painless language
|
|
||||||
parsed = Integer.parseInt(interval);
|
|
||||||
|
|
||||||
if (timeChar.equals("s")) {
|
|
||||||
parsed = parsed;
|
|
||||||
} else if (timeChar.equals("m")) {
|
|
||||||
parsed = parsed * 60;
|
|
||||||
} else if (timeChar.equals("h")) {
|
|
||||||
parsed = parsed * 60 * 60;
|
|
||||||
} else if (timeChar.equals("d")) {
|
|
||||||
parsed = parsed * 24 * 60 * 60;
|
|
||||||
}
|
|
||||||
emit(parsed);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
emit(parsed);
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
alert_throttle: {
|
|
||||||
type: 'long',
|
|
||||||
script: {
|
|
||||||
source: `
|
|
||||||
int parsed = 0;
|
|
||||||
if (doc['alert.throttle'].size() > 0) {
|
|
||||||
def throttle = doc['alert.throttle'].value;
|
|
||||||
|
|
||||||
if (throttle.length() > 1) {
|
|
||||||
// get last char
|
|
||||||
String timeChar = throttle.substring(throttle.length() - 1);
|
|
||||||
// remove last char
|
|
||||||
throttle = throttle.substring(0, throttle.length() - 1);
|
|
||||||
|
|
||||||
if (throttle.chars().allMatch(Character::isDigit)) {
|
|
||||||
// using of regex is not allowed in painless language
|
|
||||||
parsed = Integer.parseInt(throttle);
|
|
||||||
|
|
||||||
if (timeChar.equals("s")) {
|
|
||||||
parsed = parsed;
|
|
||||||
} else if (timeChar.equals("m")) {
|
|
||||||
parsed = parsed * 60;
|
|
||||||
} else if (timeChar.equals("h")) {
|
|
||||||
parsed = parsed * 60 * 60;
|
|
||||||
} else if (timeChar.equals("d")) {
|
|
||||||
parsed = parsed * 24 * 60 * 60;
|
|
||||||
}
|
|
||||||
emit(parsed);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
emit(parsed);
|
|
||||||
`,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
aggs: {
|
|
||||||
byRuleTypeId: ruleTypeMetric,
|
|
||||||
max_throttle_time: { max: { field: 'alert_throttle' } },
|
|
||||||
min_throttle_time: { min: { field: 'alert_throttle' } },
|
|
||||||
avg_throttle_time: { avg: { field: 'alert_throttle' } },
|
|
||||||
max_interval_time: { max: { field: 'alert_interval' } },
|
|
||||||
min_interval_time: { min: { field: 'alert_interval' } },
|
|
||||||
avg_interval_time: { avg: { field: 'alert_interval' } },
|
|
||||||
max_actions_count: { max: { field: 'alert_action_count' } },
|
|
||||||
min_actions_count: { min: { field: 'alert_action_count' } },
|
|
||||||
avg_actions_count: { avg: { field: 'alert_action_count' } },
|
|
||||||
},
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
const aggregations = results.aggregations as {
|
|
||||||
byRuleTypeId: { value: { ruleTypes: Record<string, string> } };
|
|
||||||
max_throttle_time: { value: number };
|
|
||||||
min_throttle_time: { value: number };
|
|
||||||
avg_throttle_time: { value: number };
|
|
||||||
max_interval_time: { value: number };
|
|
||||||
min_interval_time: { value: number };
|
|
||||||
avg_interval_time: { value: number };
|
|
||||||
max_actions_count: { value: number };
|
|
||||||
min_actions_count: { value: number };
|
|
||||||
avg_actions_count: { value: number };
|
|
||||||
};
|
|
||||||
|
|
||||||
const totalRulesCount = Object.keys(aggregations.byRuleTypeId.value.ruleTypes).reduce(
|
|
||||||
(total: number, key: string) =>
|
|
||||||
parseInt(aggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
|
||||||
0
|
|
||||||
);
|
|
||||||
|
|
||||||
return {
|
|
||||||
count_total: totalRulesCount,
|
|
||||||
count_by_type: replaceDotSymbolsInRuleTypeIds(aggregations.byRuleTypeId.value.ruleTypes),
|
|
||||||
throttle_time: {
|
|
||||||
min: `${aggregations.min_throttle_time.value}s`,
|
|
||||||
avg: `${aggregations.avg_throttle_time.value}s`,
|
|
||||||
max: `${aggregations.max_throttle_time.value}s`,
|
|
||||||
},
|
|
||||||
schedule_time: {
|
|
||||||
min: `${aggregations.min_interval_time.value}s`,
|
|
||||||
avg: `${aggregations.avg_interval_time.value}s`,
|
|
||||||
max: `${aggregations.max_interval_time.value}s`,
|
|
||||||
},
|
|
||||||
throttle_time_number_s: {
|
|
||||||
min: aggregations.min_throttle_time.value,
|
|
||||||
avg: aggregations.avg_throttle_time.value,
|
|
||||||
max: aggregations.max_throttle_time.value,
|
|
||||||
},
|
|
||||||
schedule_time_number_s: {
|
|
||||||
min: aggregations.min_interval_time.value,
|
|
||||||
avg: aggregations.avg_interval_time.value,
|
|
||||||
max: aggregations.max_interval_time.value,
|
|
||||||
},
|
|
||||||
connectors_per_alert: {
|
|
||||||
min: aggregations.min_actions_count.value,
|
|
||||||
avg: aggregations.avg_actions_count.value,
|
|
||||||
max: aggregations.max_actions_count.value,
|
|
||||||
},
|
|
||||||
count_rules_namespaces: 0,
|
|
||||||
};
|
|
||||||
} catch (err) {
|
|
||||||
logger.warn(
|
|
||||||
`Error executing alerting telemetry task: getTotalCountAggregations - ${JSON.stringify(err)}`
|
|
||||||
);
|
|
||||||
return {
|
|
||||||
count_total: 0,
|
|
||||||
count_by_type: {},
|
|
||||||
throttle_time: {
|
|
||||||
min: '0s',
|
|
||||||
avg: '0s',
|
|
||||||
max: '0s',
|
|
||||||
},
|
|
||||||
schedule_time: {
|
|
||||||
min: '0s',
|
|
||||||
avg: '0s',
|
|
||||||
max: '0s',
|
|
||||||
},
|
|
||||||
throttle_time_number_s: {
|
|
||||||
min: 0,
|
|
||||||
avg: 0,
|
|
||||||
max: 0,
|
|
||||||
},
|
|
||||||
schedule_time_number_s: {
|
|
||||||
min: 0,
|
|
||||||
avg: 0,
|
|
||||||
max: 0,
|
|
||||||
},
|
|
||||||
connectors_per_alert: {
|
|
||||||
min: 0,
|
|
||||||
avg: 0,
|
|
||||||
max: 0,
|
|
||||||
},
|
|
||||||
count_rules_namespaces: 0,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function getTotalCountInUse(
|
|
||||||
esClient: ElasticsearchClient,
|
|
||||||
kibanaIndex: string,
|
|
||||||
logger: Logger
|
|
||||||
) {
|
|
||||||
try {
|
|
||||||
const searchResult = await esClient.search({
|
|
||||||
index: kibanaIndex,
|
|
||||||
size: 0,
|
|
||||||
body: {
|
|
||||||
query: {
|
|
||||||
bool: {
|
|
||||||
filter: [{ term: { type: 'alert' } }, { term: { 'alert.enabled': true } }],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
aggs: {
|
|
||||||
byRuleTypeId: ruleTypeMetric,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
const aggregations = searchResult.aggregations as {
|
|
||||||
byRuleTypeId: {
|
|
||||||
value: { ruleTypes: Record<string, string>; namespaces: Record<string, string> };
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
return {
|
|
||||||
countTotal: Object.keys(aggregations.byRuleTypeId.value.ruleTypes).reduce(
|
|
||||||
(total: number, key: string) =>
|
|
||||||
parseInt(aggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
|
||||||
0
|
|
||||||
),
|
|
||||||
countByType: replaceDotSymbolsInRuleTypeIds(aggregations.byRuleTypeId.value.ruleTypes),
|
|
||||||
countNamespaces: Object.keys(aggregations.byRuleTypeId.value.namespaces).length,
|
|
||||||
};
|
|
||||||
} catch (err) {
|
|
||||||
logger.warn(
|
|
||||||
`Error executing alerting telemetry task: getTotalCountInUse - ${JSON.stringify(err)}`
|
|
||||||
);
|
|
||||||
return {
|
|
||||||
countTotal: 0,
|
|
||||||
countByType: {},
|
|
||||||
countNamespaces: 0,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function getExecutionsPerDayCount(
|
|
||||||
esClient: ElasticsearchClient,
|
|
||||||
eventLogIndex: string,
|
|
||||||
logger: Logger
|
|
||||||
) {
|
|
||||||
try {
|
|
||||||
const searchResult = await esClient.search({
|
|
||||||
index: eventLogIndex,
|
|
||||||
size: 0,
|
|
||||||
body: {
|
|
||||||
query: {
|
|
||||||
bool: {
|
|
||||||
filter: {
|
|
||||||
bool: {
|
|
||||||
must: [
|
|
||||||
{
|
|
||||||
term: { 'event.action': 'execute' },
|
|
||||||
},
|
|
||||||
{
|
|
||||||
term: { 'event.provider': 'alerting' },
|
|
||||||
},
|
|
||||||
{
|
|
||||||
range: {
|
|
||||||
'@timestamp': {
|
|
||||||
gte: 'now-1d',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
aggs: {
|
|
||||||
byRuleTypeId: ruleTypeExecutionsWithDurationMetric,
|
|
||||||
failuresByReason: ruleTypeFailureExecutionsMetric,
|
|
||||||
avgDuration: { avg: { field: 'event.duration' } },
|
|
||||||
avgEsSearchDuration: {
|
|
||||||
avg: { field: 'kibana.alert.rule.execution.metrics.es_search_duration_ms' },
|
|
||||||
},
|
|
||||||
avgTotalSearchDuration: {
|
|
||||||
avg: { field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms' },
|
|
||||||
},
|
|
||||||
percentileScheduledActions: generatedActionsPercentilesAgg,
|
|
||||||
percentileAlerts: alertsPercentilesAgg,
|
|
||||||
aggsByType: {
|
|
||||||
terms: {
|
|
||||||
field: 'rule.category',
|
|
||||||
size: NUM_ALERTING_RULE_TYPES,
|
|
||||||
},
|
|
||||||
aggs: {
|
|
||||||
percentileScheduledActions: generatedActionsPercentilesAgg,
|
|
||||||
percentileAlerts: alertsPercentilesAgg,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
const executionsAggregations = searchResult.aggregations as {
|
|
||||||
byRuleTypeId: {
|
|
||||||
value: {
|
|
||||||
ruleTypes: Record<string, string>;
|
|
||||||
ruleTypesDuration: Record<string, number>;
|
|
||||||
ruleTypesEsSearchDuration: Record<string, number>;
|
|
||||||
ruleTypesTotalSearchDuration: Record<string, number>;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
const aggsAvgExecutionTime = Math.round(
|
|
||||||
// @ts-expect-error aggegation type is not specified
|
|
||||||
// convert nanoseconds to milliseconds
|
|
||||||
searchResult.aggregations.avgDuration.value / (1000 * 1000)
|
|
||||||
);
|
|
||||||
|
|
||||||
const aggsAvgEsSearchDuration = Math.round(
|
|
||||||
// @ts-expect-error aggegation type is not specified
|
|
||||||
searchResult.aggregations.avgEsSearchDuration.value
|
|
||||||
);
|
|
||||||
const aggsAvgTotalSearchDuration = Math.round(
|
|
||||||
// @ts-expect-error aggegation type is not specified
|
|
||||||
searchResult.aggregations.avgTotalSearchDuration.value
|
|
||||||
);
|
|
||||||
|
|
||||||
const aggsGeneratedActionsPercentiles =
|
|
||||||
// @ts-expect-error aggegation type is not specified
|
|
||||||
searchResult.aggregations.percentileScheduledActions.values;
|
|
||||||
|
|
||||||
const aggsAlertsPercentiles =
|
|
||||||
// @ts-expect-error aggegation type is not specified
|
|
||||||
searchResult.aggregations.percentileAlerts.values;
|
|
||||||
|
|
||||||
const aggsByTypeBuckets =
|
|
||||||
// @ts-expect-error aggegation type is not specified
|
|
||||||
searchResult.aggregations.aggsByType.buckets;
|
|
||||||
|
|
||||||
const executionFailuresAggregations = searchResult.aggregations as {
|
|
||||||
failuresByReason: { value: { reasons: Record<string, Record<string, string>> } };
|
|
||||||
};
|
|
||||||
|
|
||||||
return {
|
|
||||||
countTotal: Object.keys(executionsAggregations.byRuleTypeId.value.ruleTypes).reduce(
|
|
||||||
(total: number, key: string) =>
|
|
||||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
|
||||||
0
|
|
||||||
),
|
|
||||||
countByType: replaceDotSymbolsInRuleTypeIds(
|
|
||||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
|
||||||
),
|
|
||||||
countTotalFailures: Object.keys(
|
|
||||||
executionFailuresAggregations.failuresByReason.value.reasons
|
|
||||||
).reduce((total: number, reason: string) => {
|
|
||||||
const byRuleTypesRefs =
|
|
||||||
executionFailuresAggregations.failuresByReason.value.reasons[reason];
|
|
||||||
const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce(
|
|
||||||
(totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10),
|
|
||||||
0
|
|
||||||
);
|
|
||||||
return countByRuleTypes + total;
|
|
||||||
}, 0),
|
|
||||||
countFailuresByReason: Object.keys(
|
|
||||||
executionFailuresAggregations.failuresByReason.value.reasons
|
|
||||||
).reduce(
|
|
||||||
// ES DSL aggregations are returned as `any` by esClient.search
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
(obj: any, reason: string) => {
|
|
||||||
const byRuleTypesRefs =
|
|
||||||
executionFailuresAggregations.failuresByReason.value.reasons[reason];
|
|
||||||
const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce(
|
|
||||||
(totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10),
|
|
||||||
0
|
|
||||||
);
|
|
||||||
return {
|
|
||||||
...obj,
|
|
||||||
[replaceDotSymbols(reason)]: countByRuleTypes,
|
|
||||||
};
|
|
||||||
},
|
|
||||||
{}
|
|
||||||
),
|
|
||||||
countFailuresByReasonByType: Object.keys(
|
|
||||||
executionFailuresAggregations.failuresByReason.value.reasons
|
|
||||||
).reduce(
|
|
||||||
// ES DSL aggregations are returned as `any` by esClient.search
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
(obj: any, key: string) => ({
|
|
||||||
...obj,
|
|
||||||
[key]: replaceDotSymbolsInRuleTypeIds(
|
|
||||||
executionFailuresAggregations.failuresByReason.value.reasons[key]
|
|
||||||
),
|
|
||||||
}),
|
|
||||||
{}
|
|
||||||
),
|
|
||||||
avgExecutionTime: aggsAvgExecutionTime,
|
|
||||||
avgExecutionTimeByType: Object.keys(
|
|
||||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
|
||||||
).reduce(
|
|
||||||
// ES DSL aggregations are returned as `any` by esClient.search
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
(obj: any, key: string) => ({
|
|
||||||
...obj,
|
|
||||||
[replaceDotSymbols(key)]: Math.round(
|
|
||||||
executionsAggregations.byRuleTypeId.value.ruleTypesDuration[key] /
|
|
||||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10)
|
|
||||||
),
|
|
||||||
}),
|
|
||||||
{}
|
|
||||||
),
|
|
||||||
avgEsSearchDuration: aggsAvgEsSearchDuration,
|
|
||||||
avgEsSearchDurationByType: Object.keys(
|
|
||||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
|
||||||
).reduce(
|
|
||||||
// ES DSL aggregations are returned as `any` by esClient.search
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
(obj: any, key: string) => ({
|
|
||||||
...obj,
|
|
||||||
[replaceDotSymbols(key)]: Math.round(
|
|
||||||
executionsAggregations.byRuleTypeId.value.ruleTypesEsSearchDuration[key] /
|
|
||||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10)
|
|
||||||
),
|
|
||||||
}),
|
|
||||||
{}
|
|
||||||
),
|
|
||||||
avgTotalSearchDuration: aggsAvgTotalSearchDuration,
|
|
||||||
avgTotalSearchDurationByType: Object.keys(
|
|
||||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
|
||||||
).reduce(
|
|
||||||
// ES DSL aggregations are returned as `any` by esClient.search
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
(obj: any, key: string) => ({
|
|
||||||
...obj,
|
|
||||||
[replaceDotSymbols(key)]: Math.round(
|
|
||||||
executionsAggregations.byRuleTypeId.value.ruleTypesTotalSearchDuration[key] /
|
|
||||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10)
|
|
||||||
),
|
|
||||||
}),
|
|
||||||
{}
|
|
||||||
),
|
|
||||||
generatedActionsPercentiles: Object.keys(aggsGeneratedActionsPercentiles).reduce(
|
|
||||||
// ES DSL aggregations are returned as `any` by esClient.search
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
(acc: any, curr: string) => ({
|
|
||||||
...acc,
|
|
||||||
...(percentileFieldNameMapping[curr]
|
|
||||||
? { [percentileFieldNameMapping[curr]]: aggsGeneratedActionsPercentiles[curr] }
|
|
||||||
: {}),
|
|
||||||
}),
|
|
||||||
{}
|
|
||||||
),
|
|
||||||
generatedActionsPercentilesByType: parsePercentileAggsByRuleType(
|
|
||||||
aggsByTypeBuckets,
|
|
||||||
'percentileScheduledActions.values'
|
|
||||||
),
|
|
||||||
alertsPercentiles: Object.keys(aggsAlertsPercentiles).reduce(
|
|
||||||
// ES DSL aggregations are returned as `any` by esClient.search
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
(acc: any, curr: string) => ({
|
|
||||||
...acc,
|
|
||||||
...(percentileFieldNameMapping[curr]
|
|
||||||
? { [percentileFieldNameMapping[curr]]: aggsAlertsPercentiles[curr] }
|
|
||||||
: {}),
|
|
||||||
}),
|
|
||||||
{}
|
|
||||||
),
|
|
||||||
alertsPercentilesByType: parsePercentileAggsByRuleType(
|
|
||||||
aggsByTypeBuckets,
|
|
||||||
'percentileAlerts.values'
|
|
||||||
),
|
|
||||||
};
|
|
||||||
} catch (err) {
|
|
||||||
logger.warn(
|
|
||||||
`Error executing alerting telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}`
|
|
||||||
);
|
|
||||||
return {
|
|
||||||
countTotal: 0,
|
|
||||||
countByType: {},
|
|
||||||
countTotalFailures: 0,
|
|
||||||
countFailuresByReason: {},
|
|
||||||
countFailuresByReasonByType: {},
|
|
||||||
avgExecutionTime: 0,
|
|
||||||
avgExecutionTimeByType: {},
|
|
||||||
avgEsSearchDuration: 0,
|
|
||||||
avgEsSearchDurationByType: {},
|
|
||||||
avgTotalSearchDuration: 0,
|
|
||||||
avgTotalSearchDurationByType: {},
|
|
||||||
generatedActionsPercentiles: {},
|
|
||||||
generatedActionsPercentilesByType: {},
|
|
||||||
alertsPercentiles: {},
|
|
||||||
alertsPercentilesByType: {},
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function getExecutionTimeoutsPerDayCount(
  esClient: ElasticsearchClient,
  eventLogIndex: string,
  logger: Logger
) {
  try {
    const searchResult = await esClient.search({
      index: eventLogIndex,
      size: 0,
      body: {
        query: {
          bool: {
            filter: {
              bool: {
                must: [
                  {
                    term: { 'event.action': 'execute-timeout' },
                  },
                  {
                    term: { 'event.provider': 'alerting' },
                  },
                  {
                    range: {
                      '@timestamp': {
                        gte: 'now-1d',
                      },
                    },
                  },
                ],
              },
            },
          },
        },
        aggs: {
          byRuleTypeId: ruleTypeExecutionsMetric,
        },
      },
    });

    const executionsAggregations = searchResult.aggregations as {
      byRuleTypeId: {
        value: { ruleTypes: Record<string, string>; ruleTypesDuration: Record<string, number> };
      };
    };

    return {
      countTotal: Object.keys(executionsAggregations.byRuleTypeId.value.ruleTypes).reduce(
        (total: number, key: string) =>
          parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
        0
      ),
      countByType: replaceDotSymbolsInRuleTypeIds(
        executionsAggregations.byRuleTypeId.value.ruleTypes
      ),
    };
  } catch (err) {
    logger.warn(
      `Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ${JSON.stringify(
        err
      )}`
    );
    return {
      countTotal: 0,
      countByType: {},
    };
  }
}
export async function getFailedAndUnrecognizedTasksPerDay(
  esClient: ElasticsearchClient,
  taskManagerIndex: string,
  logger: Logger
) {
  try {
    const searchResult = await esClient.search({
      index: taskManagerIndex,
      size: 0,
      body: {
        query: {
          bool: {
            must: [
              {
                bool: {
                  should: [
                    {
                      term: {
                        'task.status': 'unrecognized',
                      },
                    },
                    {
                      term: {
                        'task.status': 'failed',
                      },
                    },
                  ],
                },
              },
              {
                wildcard: {
                  'task.taskType': {
                    value: 'alerting:*',
                  },
                },
              },
              {
                range: {
                  'task.runAt': {
                    gte: 'now-1d',
                  },
                },
              },
            ],
          },
        },
        aggs: {
          byTaskTypeId: taskTypeExecutionsMetric,
        },
      },
    });

    const executionsAggregations = searchResult.aggregations as {
      byTaskTypeId: { value: { statuses: Record<string, Record<string, string>> } };
    };

    return {
      countTotal: Object.keys(executionsAggregations.byTaskTypeId.value.statuses).reduce(
        (total: number, status: string) => {
          const byRuleTypesRefs = executionsAggregations.byTaskTypeId.value.statuses[status];
          const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce(
            (totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10),
            0
          );
          return countByRuleTypes + total;
        },
        0
      ),
      countByStatus: Object.keys(executionsAggregations.byTaskTypeId.value.statuses).reduce(
        // ES DSL aggregations are returned as `any` by esClient.search
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        (obj: any, status: string) => {
          const byRuleTypesRefs = executionsAggregations.byTaskTypeId.value.statuses[status];
          const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce(
            (totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10),
            0
          );
          return {
            ...obj,
            [status]: countByRuleTypes,
          };
        },
        {}
      ),
      countByStatusByRuleType: Object.keys(
        executionsAggregations.byTaskTypeId.value.statuses
      ).reduce(
        // ES DSL aggregations are returned as `any` by esClient.search
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        (obj: any, key: string) => ({
          ...obj,
          [key]: replaceDotSymbolsInRuleTypeIds(
            executionsAggregations.byTaskTypeId.value.statuses[key]
          ),
        }),
        {}
      ),
    };
  } catch (err) {
    logger.warn(
      `Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify(
        err
      )}`
    );
    return {
      countTotal: 0,
      countByStatus: {},
      countByStatusByRuleType: {},
    };
  }
}
function replaceDotSymbols(strToReplace: string) {
  return strToReplace.replaceAll('.', '__');
}

function replaceDotSymbolsInRuleTypeIds(ruleTypeIdObj: Record<string, string>) {
  return Object.keys(ruleTypeIdObj).reduce(
    (obj, key) => ({ ...obj, [replaceDotSymbols(key)]: ruleTypeIdObj[key] }),
    {}
  );
}
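// Illustrative usage sketch (added for clarity; not part of the original file). The
// helpers above map rule type ids containing '.' to telemetry-safe keys:
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const exampleCountByType = replaceDotSymbolsInRuleTypeIds({
  '.index-threshold': '2',
  'logs.alert.document.count': '1',
});
// exampleCountByType is { '__index-threshold': '2', logs__alert__document__count: '1' }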
export function parsePercentileAggsByRuleType(
  aggsByType: estypes.AggregationsStringTermsBucketKeys[],
  path: string
) {
  return (aggsByType ?? []).reduce(
    (acc, curr) => {
      const percentiles = get(curr, path, {});
      return merge(
        acc,
        Object.keys(percentiles).reduce((pacc, pcurr) => {
          return {
            ...pacc,
            ...(percentileFieldNameMapping[pcurr]
              ? {
                  [percentileFieldNameMapping[pcurr]]: {
                    [replaceDotSymbols(curr.key)]: percentiles[pcurr] ?? 0,
                  },
                }
              : {}),
          };
        }, {})
      );
    },
    { p50: {}, p90: {}, p99: {} }
  );
}
@@ -68,6 +68,8 @@ const byReasonSchema: MakeSchemaFrom<AlertingUsage>['count_rules_executions_fail
  unknown: { type: 'long' },
};

export const NUM_ALERTING_EXECUTION_FAILURE_REASON_TYPES = Object.keys(byReasonSchema).length;

const byPercentileSchema: MakeSchemaFrom<AlertingUsage>['percentile_num_generated_actions_per_day'] =
  {
    p50: { type: 'long' },

File diff suppressed because it is too large
@@ -0,0 +1,583 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { flatMap, merge } from 'lodash';
import type {
  AggregationsKeyedPercentiles,
  AggregationsSingleBucketAggregateBase,
  AggregationsPercentilesAggregateBase,
  AggregationsSingleMetricAggregateBase,
  AggregationsTermsAggregateBase,
  AggregationsStringTermsBucketKeys,
  AggregationsBuckets,
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import {
  NUM_ALERTING_RULE_TYPES,
  NUM_ALERTING_EXECUTION_FAILURE_REASON_TYPES,
} from '../alerting_usage_collector';
import { replaceDotSymbols } from './replace_dots_with_underscores';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';

const Millis2Nanos = 1000 * 1000;
const percentileFieldNameMapping: Record<string, string> = {
  '50.0': 'p50',
  '90.0': 'p90',
  '99.0': 'p99',
};

interface Opts {
  esClient: ElasticsearchClient;
  eventLogIndex: string;
  logger: Logger;
}

interface GetExecutionsPerDayCountResults {
  countTotalRuleExecutions: number;
  countRuleExecutionsByType: Record<string, number>;
  countTotalFailedExecutions: number;
  countFailedExecutionsByReason: Record<string, number>;
  countFailedExecutionsByReasonByType: Record<string, Record<string, number>>;
  avgExecutionTime: number;
  avgExecutionTimeByType: Record<string, number>;
  avgEsSearchDuration: number;
  avgEsSearchDurationByType: Record<string, number>;
  avgTotalSearchDuration: number;
  avgTotalSearchDurationByType: Record<string, number>;
  generatedActionsPercentiles: Record<string, number>;
  generatedActionsPercentilesByType: Record<string, Record<string, number>>;
  alertsPercentiles: Record<string, number>;
  alertsPercentilesByType: Record<string, Record<string, number>>;
}

interface GetExecutionTimeoutsPerDayCountResults {
  countExecutionTimeouts: number;
  countExecutionTimeoutsByType: Record<string, number>;
}

interface GetExecutionCountsExecutionFailures extends AggregationsSingleBucketAggregateBase {
  by_reason: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
}
interface GetExecutionCountsAggregationBucket extends AggregationsStringTermsBucketKeys {
  avg_execution_time: AggregationsSingleMetricAggregateBase;
  avg_es_search_duration: AggregationsSingleMetricAggregateBase;
  avg_total_search_duration: AggregationsSingleMetricAggregateBase;
  execution_failures: GetExecutionCountsExecutionFailures;
  percentile_scheduled_actions: AggregationsPercentilesAggregateBase;
  percentile_alerts: AggregationsPercentilesAggregateBase;
}

interface IGetExecutionFailures extends AggregationsSingleBucketAggregateBase {
  by_reason: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
}

export async function getExecutionsPerDayCount({
  esClient,
  eventLogIndex,
  logger,
}: Opts): Promise<GetExecutionsPerDayCountResults> {
  try {
    const eventLogAggs = {
      avg_execution_time: {
        avg: {
          field: 'event.duration',
        },
      },
      avg_es_search_duration: {
        avg: {
          field: 'kibana.alert.rule.execution.metrics.es_search_duration_ms',
        },
      },
      avg_total_search_duration: {
        avg: {
          field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms',
        },
      },

      percentile_scheduled_actions: {
        percentiles: {
          field: 'kibana.alert.rule.execution.metrics.number_of_generated_actions',
          percents: [50, 90, 99],
        },
      },
      percentile_alerts: {
        percentiles: {
          field: 'kibana.alert.rule.execution.metrics.alert_counts.active',
          percents: [50, 90, 99],
        },
      },
      execution_failures: {
        filter: {
          term: {
            'event.outcome': 'failure',
          },
        },
        aggs: {
          by_reason: {
            terms: {
              field: 'event.reason',
              size: NUM_ALERTING_EXECUTION_FAILURE_REASON_TYPES,
            },
          },
        },
      },
    };

    const query = {
      index: eventLogIndex,
      size: 0,
      body: {
        query: getProviderAndActionFilterForTimeRange('execute'),
        aggs: {
          ...eventLogAggs,
          by_rule_type_id: {
            terms: {
              field: 'rule.category',
              size: NUM_ALERTING_RULE_TYPES,
            },
            aggs: eventLogAggs,
          },
        },
      },
    };

    logger.debug(`query for getExecutionsPerDayCount - ${JSON.stringify(query)}`);
    const results = await esClient.search(query);

    logger.debug(`results for getExecutionsPerDayCount query - ${JSON.stringify(results)}`);

    const totalRuleExecutions =
      typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;

    const aggregations = results.aggregations as {
      by_rule_type_id: AggregationsTermsAggregateBase<GetExecutionCountsAggregationBucket>;
      execution_failures: IGetExecutionFailures;
      percentile_scheduled_actions: AggregationsPercentilesAggregateBase;
      percentile_alerts: AggregationsPercentilesAggregateBase;
      avg_execution_time: AggregationsSingleMetricAggregateBase;
      avg_es_search_duration: AggregationsSingleMetricAggregateBase;
      avg_total_search_duration: AggregationsSingleMetricAggregateBase;
    };

    const aggregationsByRuleTypeId: AggregationsBuckets<GetExecutionCountsAggregationBucket> =
      aggregations.by_rule_type_id.buckets as GetExecutionCountsAggregationBucket[];

    return {
      ...parseRuleTypeBucket(aggregationsByRuleTypeId),
      ...parseExecutionFailureByRuleType(aggregationsByRuleTypeId),
      ...parseExecutionCountAggregationResults(aggregations),
      countTotalRuleExecutions: totalRuleExecutions ?? 0,
    };
  } catch (err) {
    logger.warn(
      `Error executing alerting telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err.stack },
      }
    );
    return {
      countTotalRuleExecutions: 0,
      countRuleExecutionsByType: {},
      countTotalFailedExecutions: 0,
      countFailedExecutionsByReason: {},
      countFailedExecutionsByReasonByType: {},
      avgExecutionTime: 0,
      avgExecutionTimeByType: {},
      avgEsSearchDuration: 0,
      avgEsSearchDurationByType: {},
      avgTotalSearchDuration: 0,
      avgTotalSearchDurationByType: {},
      generatedActionsPercentiles: {},
      generatedActionsPercentilesByType: {},
      alertsPercentiles: {},
      alertsPercentilesByType: {},
    };
  }
}
export async function getExecutionTimeoutsPerDayCount({
  esClient,
  eventLogIndex,
  logger,
}: Opts): Promise<GetExecutionTimeoutsPerDayCountResults> {
  try {
    const query = {
      index: eventLogIndex,
      size: 0,
      body: {
        query: getProviderAndActionFilterForTimeRange('execute-timeout'),
        aggs: {
          by_rule_type_id: {
            terms: {
              field: 'rule.category',
              size: NUM_ALERTING_RULE_TYPES,
            },
          },
        },
      },
    };

    logger.debug(`query for getExecutionTimeoutsPerDayCount - ${JSON.stringify(query)}`);
    const results = await esClient.search(query);

    logger.debug(`results for getExecutionTimeoutsPerDayCount query - ${JSON.stringify(results)}`);

    const aggregations = results.aggregations as {
      by_rule_type_id: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
    };

    const totalTimedoutExecutionsCount =
      typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;

    return {
      countExecutionTimeouts: totalTimedoutExecutionsCount ?? 0,
      countExecutionTimeoutsByType: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
    };
  } catch (err) {
    logger.warn(
      `Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ${JSON.stringify(
        err
      )}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err.stack },
      }
    );
    return {
      countExecutionTimeouts: 0,
      countExecutionTimeoutsByType: {},
    };
  }
}
/**
 * Bucket format:
 * {
 *   key: '.index-threshold', // rule type id
 *   doc_count: 78, // count of number of executions
 *   avg_es_search_duration: { // average es search duration across executions
 *     value: 40.76056338028169,
 *   },
 *   percentile_alerts: { // stats for number of alerts created across executions
 *     values: {
 *       '50.0': 1,
 *       '90.0': 1,
 *       '99.0': 1,
 *     },
 *   },
 *   execution_failures: {
 *     doc_count: 7, // count of number of failed executions
 *     by_reason: {
 *       doc_count_error_upper_bound: 0,
 *       sum_other_doc_count: 0,
 *       buckets: [
 *         {
 *           key: 'execute', // breakdown of reason for execution failures
 *           doc_count: 4,
 *         },
 *         {
 *           key: 'decrypt',
 *           doc_count: 3,
 *         },
 *       ],
 *     },
 *   },
 *   percentile_scheduled_actions: { // stats for number of actions generated across executions
 *     values: {
 *       '50.0': 0,
 *       '90.0': 0,
 *       '99.0': 0,
 *     },
 *   },
 *   avg_execution_time: { // average execution time in nanoseconds across executions
 *     value: 100576923.07692307,
 *   },
 *   avg_total_search_duration: { // average total search duration across executions
 *     value: 43.74647887323944,
 *   },
 * }
 */

export function parseRuleTypeBucket(
  buckets: GetExecutionCountsAggregationBucket[]
): Pick<
  GetExecutionsPerDayCountResults,
  | 'countRuleExecutionsByType'
  | 'avgExecutionTimeByType'
  | 'avgEsSearchDurationByType'
  | 'avgTotalSearchDurationByType'
  | 'generatedActionsPercentilesByType'
  | 'alertsPercentilesByType'
> {
  let summary = {
    countRuleExecutionsByType: {},
    avgExecutionTimeByType: {},
    avgEsSearchDurationByType: {},
    avgTotalSearchDurationByType: {},
    generatedActionsPercentilesByType: { p50: {}, p90: {}, p99: {} },
    alertsPercentilesByType: { p50: {}, p90: {}, p99: {} },
  };
  for (const bucket of buckets ?? []) {
    const ruleType: string = replaceDotSymbols(bucket?.key) ?? '';
    const numExecutions: number = bucket?.doc_count ?? 0;
    const avgExecutionTimeNanos = bucket?.avg_execution_time?.value ?? 0;
    const avgEsSearchTimeMillis = bucket?.avg_es_search_duration?.value ?? 0;
    const avgTotalSearchTimeMillis = bucket?.avg_total_search_duration?.value ?? 0;
    const actionPercentiles = bucket?.percentile_scheduled_actions?.values ?? {};
    const alertPercentiles = bucket?.percentile_alerts?.values ?? {};

    summary = {
      countRuleExecutionsByType: {
        ...summary.countRuleExecutionsByType,
        [ruleType]: numExecutions,
      },
      avgExecutionTimeByType: {
        ...summary.avgExecutionTimeByType,
        [ruleType]: Math.round(avgExecutionTimeNanos / Millis2Nanos),
      },
      avgEsSearchDurationByType: {
        ...summary.avgEsSearchDurationByType,
        [ruleType]: Math.round(avgEsSearchTimeMillis),
      },
      avgTotalSearchDurationByType: {
        ...summary.avgTotalSearchDurationByType,
        [ruleType]: Math.round(avgTotalSearchTimeMillis),
      },
      generatedActionsPercentilesByType: merge(
        summary.generatedActionsPercentilesByType,
        parsePercentileAggs(actionPercentiles as AggregationsKeyedPercentiles, ruleType)
      ),
      alertsPercentilesByType: merge(
        summary.alertsPercentilesByType,
        parsePercentileAggs(alertPercentiles as AggregationsKeyedPercentiles, ruleType)
      ),
    };
  }

  return summary;
}
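// Illustrative usage sketch (added for clarity; not part of the original diff). Feeding a
// bucket shaped like the doc comment above through parseRuleTypeBucket yields per-rule-type
// maps, with the average execution time converted from nanoseconds to milliseconds:
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const exampleParsedBucket = parseRuleTypeBucket([
  {
    key: '.index-threshold',
    doc_count: 78,
    avg_execution_time: { value: 100576923.07692307 },
    avg_es_search_duration: { value: 40.76056338028169 },
    avg_total_search_duration: { value: 43.74647887323944 },
    execution_failures: { doc_count: 7, by_reason: { buckets: [] } },
    percentile_scheduled_actions: { values: { '50.0': 0, '90.0': 0, '99.0': 0 } },
    percentile_alerts: { values: { '50.0': 1, '90.0': 1, '99.0': 1 } },
  },
] as GetExecutionCountsAggregationBucket[]);
// exampleParsedBucket.countRuleExecutionsByType is { '__index-threshold': 78 }
// exampleParsedBucket.avgExecutionTimeByType is { '__index-threshold': 101 } (nanos -> ms, rounded)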
interface FlattenedExecutionFailureBucket {
  ruleType: string;
  key: string;
  doc_count: number;
}

export function parseExecutionFailureByRuleType(
  buckets: GetExecutionCountsAggregationBucket[]
): Pick<GetExecutionsPerDayCountResults, 'countFailedExecutionsByReasonByType'> {
  const executionFailuresWithRuleTypeBuckets: FlattenedExecutionFailureBucket[] = flatMap(
    buckets ?? [],
    (bucket) => {
      const ruleType: string = replaceDotSymbols(bucket.key);

      /**
       * Execution failure bucket format
       * [
       *   {
       *     key: 'execute',
       *     doc_count: 4,
       *   },
       *   {
       *     key: 'decrypt',
       *     doc_count: 3,
       *   },
       * ]
       */

      const executionFailuresBuckets = bucket?.execution_failures?.by_reason
        ?.buckets as AggregationsStringTermsBucketKeys[];
      return (executionFailuresBuckets ?? []).map((b) => ({ ...b, ruleType }));
    }
  );

  const parsedFailures = (executionFailuresWithRuleTypeBuckets ?? []).reduce(
    (acc: Record<string, Record<string, number>>, bucket: FlattenedExecutionFailureBucket) => {
      const ruleType: string = bucket.ruleType;
      const reason: string = bucket.key;

      if (acc[reason]) {
        if (acc[reason][ruleType]) {
          return {
            ...acc,
            [reason]: {
              ...acc[reason],
              [ruleType]: acc[reason][ruleType] + bucket.doc_count,
            },
          };
        }
        return {
          ...acc,
          [reason]: {
            ...acc[reason],
            [ruleType]: bucket.doc_count,
          },
        };
      }
      return {
        ...acc,
        [reason]: {
          [ruleType]: bucket.doc_count,
        },
      };
    },
    {}
  );

  return {
    countFailedExecutionsByReasonByType: parsedFailures,
  };
}
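// Illustrative usage sketch (added for clarity; not part of the original diff). The function
// above pivots per-rule-type failure buckets into a reason -> rule type -> count map:
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const exampleFailuresByReason = parseExecutionFailureByRuleType([
  {
    key: '.index-threshold',
    doc_count: 78,
    execution_failures: {
      doc_count: 7,
      by_reason: {
        buckets: [
          { key: 'execute', doc_count: 4 },
          { key: 'decrypt', doc_count: 3 },
        ],
      },
    },
  },
] as unknown as GetExecutionCountsAggregationBucket[]);
// exampleFailuresByReason.countFailedExecutionsByReasonByType is
// { execute: { '__index-threshold': 4 }, decrypt: { '__index-threshold': 3 } }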
export function parsePercentileAggs(
  percentiles: AggregationsKeyedPercentiles,
  ruleTypeId?: string
) {
  return Object.keys(percentiles ?? {}).reduce((acc, percentileKey: string) => {
    let result = {};
    const percentileKeyMapped = percentileFieldNameMapping[percentileKey];
    if (percentileKeyMapped) {
      if (ruleTypeId) {
        result = {
          [percentileKeyMapped]: {
            [ruleTypeId]: percentiles[percentileKey] ?? 0,
          },
        };
      } else {
        result = {
          [percentileKeyMapped]: percentiles[percentileKey] ?? 0,
        };
      }
    }
    return {
      ...acc,
      ...result,
    };
  }, {});
}
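// Illustrative usage sketch (added for clarity; not part of the original diff). Keyed
// percentile values from ES are renamed via percentileFieldNameMapping, optionally nested
// under a rule type id; keys without a mapping are dropped:
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const examplePercentiles = parsePercentileAggs({ '50.0': 1, '90.0': 5, '99.0': 5 });
// examplePercentiles is { p50: 1, p90: 5, p99: 5 }
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const examplePercentilesByType = parsePercentileAggs({ '50.0': 1, '90.0': 5 }, '__index-threshold');
// examplePercentilesByType is { p50: { '__index-threshold': 1 }, p90: { '__index-threshold': 5 } }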
/**
 * Aggregation Result Format (minus rule type id agg buckets)
 * {
 *   avg_es_search_duration: {
 *     value: 26.246376811594203,
 *   },
 *   percentile_alerts: {
 *     values: {
 *       '50.0': 1,
 *       '90.0': 5,
 *       '99.0': 5,
 *     },
 *   },
 *   execution_failures: {
 *     doc_count: 10,
 *     by_reason: {
 *       doc_count_error_upper_bound: 0,
 *       sum_other_doc_count: 0,
 *       buckets: [
 *         {
 *           key: 'decrypt',
 *           doc_count: 6,
 *         },
 *         {
 *           key: 'execute',
 *           doc_count: 4,
 *         },
 *       ],
 *     },
 *   },
 *   percentile_scheduled_actions: {
 *     values: {
 *       '50.0': 0,
 *       '90.0': 5,
 *       '99.0': 5,
 *     },
 *   },
 *   avg_execution_time: {
 *     value: 288250000,
 *   },
 *   avg_total_search_duration: {
 *     value: 28.630434782608695,
 *   },
 * }
 */
export function parseExecutionCountAggregationResults(results: {
  execution_failures: IGetExecutionFailures;
  percentile_scheduled_actions: AggregationsPercentilesAggregateBase;
  percentile_alerts: AggregationsPercentilesAggregateBase;
  avg_execution_time: AggregationsSingleMetricAggregateBase;
  avg_es_search_duration: AggregationsSingleMetricAggregateBase;
  avg_total_search_duration: AggregationsSingleMetricAggregateBase;
}): Pick<
  GetExecutionsPerDayCountResults,
  | 'countTotalFailedExecutions'
  | 'countFailedExecutionsByReason'
  | 'avgExecutionTime'
  | 'avgEsSearchDuration'
  | 'avgTotalSearchDuration'
  | 'generatedActionsPercentiles'
  | 'alertsPercentiles'
> {
  const avgExecutionTimeNanos = results?.avg_execution_time?.value ?? 0;
  const avgEsSearchDurationMillis = results?.avg_es_search_duration?.value ?? 0;
  const avgTotalSearchDurationMillis = results?.avg_total_search_duration?.value ?? 0;
  const executionFailuresByReasonBuckets =
    (results?.execution_failures?.by_reason?.buckets as AggregationsStringTermsBucketKeys[]) ?? [];
  const actionPercentiles = results?.percentile_scheduled_actions?.values ?? {};
  const alertPercentiles = results?.percentile_alerts?.values ?? {};

  return {
    countTotalFailedExecutions: results?.execution_failures?.doc_count ?? 0,
    countFailedExecutionsByReason: executionFailuresByReasonBuckets.reduce(
      (acc: Record<string, number>, bucket: AggregationsStringTermsBucketKeys) => {
        const reason: string = bucket.key;
        return {
          ...acc,
          [reason]: bucket.doc_count ?? 0,
        };
      },
      {}
    ),
    avgExecutionTime: Math.round(avgExecutionTimeNanos / Millis2Nanos),
    avgEsSearchDuration: Math.round(avgEsSearchDurationMillis),
    avgTotalSearchDuration: Math.round(avgTotalSearchDurationMillis),
    generatedActionsPercentiles: parsePercentileAggs(
      actionPercentiles as AggregationsKeyedPercentiles
    ),
    alertsPercentiles: parsePercentileAggs(alertPercentiles as AggregationsKeyedPercentiles),
  };
}

function getProviderAndActionFilterForTimeRange(
  action: string,
  provider: string = 'alerting',
  range: string = '1d'
) {
  return {
    bool: {
      filter: {
        bool: {
          must: [
            {
              term: { 'event.action': action },
            },
            {
              term: { 'event.provider': provider },
            },
            {
              range: {
                '@timestamp': {
                  gte: `now-${range}`,
                },
              },
            },
          ],
        },
      },
    },
  };
}
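// Illustrative usage sketch (added for clarity; not part of the original diff). Both event
// log queries above reuse this filter builder; with the defaults it restricts documents to
// the given event.action, event.provider 'alerting', and @timestamp >= now-1d:
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const exampleExecuteFilter = getProviderAndActionFilterForTimeRange('execute');
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const exampleWeeklyTimeoutFilter = getProviderAndActionFilterForTimeRange(
  'execute-timeout',
  'alerting',
  '7d' // hypothetical wider range; the telemetry tasks above always use the 1d default
);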
@@ -0,0 +1,249 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { getTotalCountAggregations, getTotalCountInUse } from './get_telemetry_from_kibana';

const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();

describe('kibana index telemetry', () => {
  beforeEach(() => {
    jest.resetAllMocks();
  });

  describe('getTotalCountAggregations', () => {
    test('should return rule counts by rule type id, stats about schedule and throttle intervals and number of actions', async () => {
      esClient.search.mockResponseOnce({
        took: 4,
        timed_out: false,
        _shards: {
          total: 1,
          successful: 1,
          skipped: 0,
          failed: 0,
        },
        hits: {
          total: {
            value: 4,
            relation: 'eq',
          },
          max_score: null,
          hits: [],
        },
        aggregations: {
          by_rule_type_id: {
            doc_count_error_upper_bound: 0,
            sum_other_doc_count: 0,
            buckets: [
              {
                key: '.index-threshold',
                doc_count: 2,
              },
              {
                key: 'logs.alert.document.count',
                doc_count: 1,
              },
              {
                key: 'document.test.',
                doc_count: 1,
              },
            ],
          },
          max_throttle_time: { value: 60 },
          min_throttle_time: { value: 0 },
          avg_throttle_time: { value: 30 },
          max_interval_time: { value: 10 },
          min_interval_time: { value: 1 },
          avg_interval_time: { value: 4.5 },
          max_actions_count: { value: 4 },
          min_actions_count: { value: 0 },
          avg_actions_count: { value: 2.5 },
        },
      });

      const telemetry = await getTotalCountAggregations({
        esClient,
        kibanaIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);

      expect(telemetry).toEqual({
        connectors_per_alert: {
          avg: 2.5,
          max: 4,
          min: 0,
        },
        count_by_type: {
          '__index-threshold': 2,
          document__test__: 1,
          // eslint-disable-next-line @typescript-eslint/naming-convention
          logs__alert__document__count: 1,
        },
        count_total: 4,
        schedule_time: {
          avg: '4.5s',
          max: '10s',
          min: '1s',
        },
        schedule_time_number_s: {
          avg: 4.5,
          max: 10,
          min: 1,
        },
        throttle_time: {
          avg: '30s',
          max: '60s',
          min: '0s',
        },
        throttle_time_number_s: {
          avg: 30,
          max: 60,
          min: 0,
        },
      });
    });

    test('should return empty results and log warning if query throws error', async () => {
      esClient.search.mockRejectedValueOnce(new Error('oh no'));

      const telemetry = await getTotalCountAggregations({
        esClient,
        kibanaIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);
      const loggerCall = logger.warn.mock.calls[0][0];
      const loggerMeta = logger.warn.mock.calls[0][1];
      expect(loggerCall as string).toMatchInlineSnapshot(
        `"Error executing alerting telemetry task: getTotalCountAggregations - {}"`
      );
      expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
      expect(loggerMeta?.error?.stack_trace).toBeDefined();
      expect(telemetry).toEqual({
        connectors_per_alert: {
          avg: 0,
          max: 0,
          min: 0,
        },
        count_by_type: {},
        count_total: 0,
        schedule_time: {
          avg: '0s',
          max: '0s',
          min: '0s',
        },
        schedule_time_number_s: {
          avg: 0,
          max: 0,
          min: 0,
        },
        throttle_time: {
          avg: '0s',
          max: '0s',
          min: '0s',
        },
        throttle_time_number_s: {
          avg: 0,
          max: 0,
          min: 0,
        },
      });
    });
  });

  describe('getTotalCountInUse', () => {
    test('should return enabled rule counts by rule type id and number of namespaces', async () => {
      esClient.search.mockResponseOnce({
        took: 4,
        timed_out: false,
        _shards: {
          total: 1,
          successful: 1,
          skipped: 0,
          failed: 0,
        },
        hits: {
          total: {
            value: 4,
            relation: 'eq',
          },
          max_score: null,
          hits: [],
        },
        aggregations: {
          namespaces_count: { value: 1 },
          by_rule_type_id: {
            doc_count_error_upper_bound: 0,
            sum_other_doc_count: 0,
            buckets: [
              {
                key: '.index-threshold',
                doc_count: 2,
              },
              {
                key: 'logs.alert.document.count',
                doc_count: 1,
              },
              {
                key: 'document.test.',
                doc_count: 1,
              },
            ],
          },
        },
      });

      const telemetry = await getTotalCountInUse({
        esClient,
        kibanaIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);

      expect(telemetry).toStrictEqual({
        countByType: {
          '__index-threshold': 2,
          document__test__: 1,
          // eslint-disable-next-line @typescript-eslint/naming-convention
          logs__alert__document__count: 1,
        },
        countNamespaces: 1,
        countTotal: 4,
      });
    });

    test('should return empty results and log warning if query throws error', async () => {
      esClient.search.mockRejectedValueOnce(new Error('oh no'));

      const telemetry = await getTotalCountInUse({
        esClient,
        kibanaIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);
      const loggerCall = logger.warn.mock.calls[0][0];
      const loggerMeta = logger.warn.mock.calls[0][1];
      expect(loggerCall as string).toMatchInlineSnapshot(
        `"Error executing alerting telemetry task: getTotalCountInUse - {}"`
      );
      expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
      expect(loggerMeta?.error?.stack_trace).toBeDefined();
      expect(telemetry).toStrictEqual({
        countByType: {},
        countNamespaces: 0,
        countTotal: 0,
      });
    });
  });
});
@@ -0,0 +1,317 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import type {
  AggregationsSingleMetricAggregateBase,
  AggregationsCardinalityAggregate,
  AggregationsTermsAggregateBase,
  AggregationsStringTermsBucketKeys,
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { AlertingUsage } from '../types';
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';

interface Opts {
  esClient: ElasticsearchClient;
  kibanaIndex: string;
  logger: Logger;
}

type GetTotalCountsResults = Pick<
  AlertingUsage,
  | 'count_total'
  | 'count_by_type'
  | 'throttle_time'
  | 'schedule_time'
  | 'throttle_time_number_s'
  | 'schedule_time_number_s'
  | 'connectors_per_alert'
>;

interface GetTotalCountInUseResults {
  countTotal: number;
  countByType: Record<string, number>;
  countNamespaces: number;
}

export async function getTotalCountAggregations({
  esClient,
  kibanaIndex,
  logger,
}: Opts): Promise<GetTotalCountsResults> {
  try {
    const query = {
      index: kibanaIndex,
      size: 0,
      body: {
        query: {
          bool: {
            // Aggregate over all rule saved objects
            filter: [{ term: { type: 'alert' } }],
          },
        },
        runtime_mappings: {
          rule_action_count: {
            type: 'long',
            script: {
              source: `
                def alert = params._source['alert'];
                if (alert != null) {
                  def actions = alert.actions;
                  if (actions != null) {
                    emit(actions.length);
                  } else {
                    emit(0);
                  }
                }`,
            },
          },
          // Convert schedule interval duration string from rule saved object to interval in seconds
          rule_schedule_interval: {
            type: 'long',
            script: {
              source: `
                int parsed = 0;
                if (doc['alert.schedule.interval'].size() > 0) {
                  def interval = doc['alert.schedule.interval'].value;

                  if (interval.length() > 1) {
                    // get last char
                    String timeChar = interval.substring(interval.length() - 1);
                    // remove last char
                    interval = interval.substring(0, interval.length() - 1);

                    if (interval.chars().allMatch(Character::isDigit)) {
                      // using of regex is not allowed in painless language
                      parsed = Integer.parseInt(interval);

                      if (timeChar.equals("s")) {
                        parsed = parsed;
                      } else if (timeChar.equals("m")) {
                        parsed = parsed * 60;
                      } else if (timeChar.equals("h")) {
                        parsed = parsed * 60 * 60;
                      } else if (timeChar.equals("d")) {
                        parsed = parsed * 24 * 60 * 60;
                      }
                      emit(parsed);
                    }
                  }
                }
                emit(parsed);
              `,
            },
          },
          // Convert throttle interval duration string from rule saved object to interval in seconds
          rule_throttle_interval: {
            type: 'long',
            script: {
              source: `
                int parsed = 0;
                if (doc['alert.throttle'].size() > 0) {
                  def throttle = doc['alert.throttle'].value;

                  if (throttle.length() > 1) {
                    // get last char
                    String timeChar = throttle.substring(throttle.length() - 1);
                    // remove last char
                    throttle = throttle.substring(0, throttle.length() - 1);

                    if (throttle.chars().allMatch(Character::isDigit)) {
                      // using of regex is not allowed in painless language
                      parsed = Integer.parseInt(throttle);

                      if (timeChar.equals("s")) {
                        parsed = parsed;
                      } else if (timeChar.equals("m")) {
                        parsed = parsed * 60;
                      } else if (timeChar.equals("h")) {
                        parsed = parsed * 60 * 60;
                      } else if (timeChar.equals("d")) {
                        parsed = parsed * 24 * 60 * 60;
                      }
                      emit(parsed);
                    }
                  }
                }
                emit(parsed);
              `,
            },
          },
        },
        aggs: {
          by_rule_type_id: {
            terms: {
              field: 'alert.alertTypeId',
              size: NUM_ALERTING_RULE_TYPES,
            },
          },
          max_throttle_time: { max: { field: 'rule_throttle_interval' } },
          min_throttle_time: { min: { field: 'rule_throttle_interval' } },
          avg_throttle_time: { avg: { field: 'rule_throttle_interval' } },
          max_interval_time: { max: { field: 'rule_schedule_interval' } },
          min_interval_time: { min: { field: 'rule_schedule_interval' } },
          avg_interval_time: { avg: { field: 'rule_schedule_interval' } },
          max_actions_count: { max: { field: 'rule_action_count' } },
          min_actions_count: { min: { field: 'rule_action_count' } },
          avg_actions_count: { avg: { field: 'rule_action_count' } },
        },
      },
    };

    logger.debug(`query for getTotalCountAggregations - ${JSON.stringify(query)}`);
    const results = await esClient.search(query);

    logger.debug(`results for getTotalCountAggregations query - ${JSON.stringify(results)}`);

    const aggregations = results.aggregations as {
      by_rule_type_id: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
      max_throttle_time: AggregationsSingleMetricAggregateBase;
      min_throttle_time: AggregationsSingleMetricAggregateBase;
      avg_throttle_time: AggregationsSingleMetricAggregateBase;
      max_interval_time: AggregationsSingleMetricAggregateBase;
      min_interval_time: AggregationsSingleMetricAggregateBase;
      avg_interval_time: AggregationsSingleMetricAggregateBase;
      max_actions_count: AggregationsSingleMetricAggregateBase;
      min_actions_count: AggregationsSingleMetricAggregateBase;
      avg_actions_count: AggregationsSingleMetricAggregateBase;
    };

    const totalRulesCount =
      typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;

    return {
      count_total: totalRulesCount ?? 0,
      count_by_type: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
      throttle_time: {
        min: `${aggregations.min_throttle_time.value ?? 0}s`,
        avg: `${aggregations.avg_throttle_time.value ?? 0}s`,
        max: `${aggregations.max_throttle_time.value ?? 0}s`,
      },
      schedule_time: {
        min: `${aggregations.min_interval_time.value ?? 0}s`,
        avg: `${aggregations.avg_interval_time.value ?? 0}s`,
        max: `${aggregations.max_interval_time.value ?? 0}s`,
      },
      throttle_time_number_s: {
        min: aggregations.min_throttle_time.value ?? 0,
        avg: aggregations.avg_throttle_time.value ?? 0,
        max: aggregations.max_throttle_time.value ?? 0,
      },
      schedule_time_number_s: {
        min: aggregations.min_interval_time.value ?? 0,
        avg: aggregations.avg_interval_time.value ?? 0,
        max: aggregations.max_interval_time.value ?? 0,
      },
      connectors_per_alert: {
        min: aggregations.min_actions_count.value ?? 0,
        avg: aggregations.avg_actions_count.value ?? 0,
        max: aggregations.max_actions_count.value ?? 0,
      },
    };
  } catch (err) {
    logger.warn(
      `Error executing alerting telemetry task: getTotalCountAggregations - ${JSON.stringify(err)}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err.stack },
      }
    );
    return {
      count_total: 0,
      count_by_type: {},
      throttle_time: {
        min: '0s',
        avg: '0s',
        max: '0s',
      },
      schedule_time: {
        min: '0s',
        avg: '0s',
        max: '0s',
      },
      throttle_time_number_s: {
        min: 0,
        avg: 0,
        max: 0,
      },
      schedule_time_number_s: {
        min: 0,
        avg: 0,
        max: 0,
      },
      connectors_per_alert: {
        min: 0,
        avg: 0,
        max: 0,
      },
    };
  }
}
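// Illustrative sketch (added for clarity; not part of the original diff). The painless
// runtime fields above convert duration strings like '1m' or '2h' to seconds; the same
// logic in TypeScript, assuming only integer values with s/m/h/d suffixes occur:
// eslint-disable-next-line @typescript-eslint/no-unused-vars
function exampleDurationToSeconds(duration: string): number {
  const timeChar = duration.substring(duration.length - 1);
  const value = duration.substring(0, duration.length - 1);
  if (!/^\d+$/.test(value)) {
    return 0; // mirrors the painless fallback of emitting 0 for unparseable values
  }
  const multiplier =
    timeChar === 'm' ? 60 : timeChar === 'h' ? 60 * 60 : timeChar === 'd' ? 24 * 60 * 60 : 1;
  return parseInt(value, 10) * multiplier;
}
// exampleDurationToSeconds('1m') === 60; exampleDurationToSeconds('2h') === 7200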
export async function getTotalCountInUse({
  esClient,
  kibanaIndex,
  logger,
}: Opts): Promise<GetTotalCountInUseResults> {
  try {
    const query = {
      index: kibanaIndex,
      size: 0,
      body: {
        query: {
          bool: {
            // Aggregate over only enabled rule saved objects
            filter: [{ term: { type: 'alert' } }, { term: { 'alert.enabled': true } }],
          },
        },
        aggs: {
          namespaces_count: { cardinality: { field: 'namespaces' } },
          by_rule_type_id: {
            terms: {
              field: 'alert.alertTypeId',
              size: NUM_ALERTING_RULE_TYPES,
            },
          },
        },
      },
    };

    logger.debug(`query for getTotalCountInUse - ${JSON.stringify(query)}`);
    const results = await esClient.search(query);

    logger.debug(`results for getTotalCountInUse query - ${JSON.stringify(results)}`);

    const aggregations = results.aggregations as {
      by_rule_type_id: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
      namespaces_count: AggregationsCardinalityAggregate;
    };

    const totalEnabledRulesCount =
      typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;

    return {
      countTotal: totalEnabledRulesCount ?? 0,
      countByType: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
      countNamespaces: aggregations.namespaces_count.value ?? 0,
    };
  } catch (err) {
    logger.warn(
      `Error executing alerting telemetry task: getTotalCountInUse - ${JSON.stringify(err)}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err.stack },
      }
    );
    return {
      countTotal: 0,
      countByType: {},
      countNamespaces: 0,
    };
  }
}
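// Hypothetical sketch (added for clarity; not part of the original diff). The imported
// parseSimpleRuleTypeBucket helper is defined elsewhere in this PR; based on the expected
// test outputs above, it presumably reduces simple key/doc_count buckets like so:
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const exampleSimpleBuckets = parseSimpleRuleTypeBucket([
  { key: '.index-threshold', doc_count: 2 },
  { key: 'logs.alert.document.count', doc_count: 1 },
]);
// expected: { '__index-threshold': 2, logs__alert__document__count: 1 }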
@@ -0,0 +1,256 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import {
  getFailedAndUnrecognizedTasksPerDay,
  parseBucket,
} from './get_telemetry_from_task_manager';

const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();

describe('task manager telemetry', () => {
  beforeEach(() => {
    jest.resetAllMocks();
  });

  describe('parseBucket', () => {
    test('should correctly parse aggregation bucket results', () => {
      expect(
        parseBucket([
          {
            key: 'failed',
            doc_count: 36,
            by_task_type: {
              doc_count_error_upper_bound: 0,
              sum_other_doc_count: 0,
              buckets: [
                {
                  key: 'alerting:.index-threshold',
                  doc_count: 4,
                },
                {
                  key: 'alerting:document.test.',
                  doc_count: 32,
                },
              ],
            },
          },
          {
            key: 'unrecognized',
            doc_count: 4,
            by_task_type: {
              doc_count_error_upper_bound: 0,
              sum_other_doc_count: 0,
              buckets: [
                {
                  key: 'alerting:logs.alert.document.count',
                  doc_count: 4,
                },
              ],
            },
          },
        ])
      ).toEqual({
        countFailedAndUnrecognizedTasksByStatus: {
          failed: 36,
          unrecognized: 4,
        },
        countFailedAndUnrecognizedTasksByStatusByType: {
          failed: {
            '__index-threshold': 4,
            document__test__: 32,
          },
          unrecognized: {
            // eslint-disable-next-line @typescript-eslint/naming-convention
            logs__alert__document__count: 4,
          },
        },
      });
    });

    test('should handle missing values', () => {
      expect(
        parseBucket([
          {
            key: 'failed',
            by_task_type: {
              doc_count_error_upper_bound: 0,
              sum_other_doc_count: 0,
              buckets: [
                {
                  key: 'alerting:.index-threshold',
                  doc_count: 4,
                },
                // @ts-expect-error
                {
                  key: 'alerting:document.test.',
                },
              ],
            },
          },
          {
            key: 'unrecognized',
            doc_count: 4,
            // @ts-expect-error
            by_task_type: {
              doc_count_error_upper_bound: 0,
              sum_other_doc_count: 0,
            },
          },
          // @ts-expect-error
          {
            key: 'another_key',
          },
        ])
      ).toEqual({
        countFailedAndUnrecognizedTasksByStatus: {
          failed: 0,
          unrecognized: 4,
          another_key: 0,
        },
        countFailedAndUnrecognizedTasksByStatusByType: {
          failed: {
            '__index-threshold': 4,
            document__test__: 0,
          },
        },
      });
    });

    test('should handle empty input', () => {
      expect(parseBucket([])).toEqual({
        countFailedAndUnrecognizedTasksByStatus: {},
        countFailedAndUnrecognizedTasksByStatusByType: {},
      });
    });

    test('should handle undefined input', () => {
      // @ts-expect-error
      expect(parseBucket(undefined)).toEqual({
        countFailedAndUnrecognizedTasksByStatus: {},
        countFailedAndUnrecognizedTasksByStatusByType: {},
      });
    });
  });

  describe('getFailedAndUnrecognizedTasksPerDay', () => {
    test('should return counts of failed and unrecognized tasks broken down by status and rule type', async () => {
      esClient.search.mockResponse({
        took: 4,
        timed_out: false,
        _shards: {
          total: 1,
          successful: 1,
          skipped: 0,
          failed: 0,
        },
        hits: {
          total: {
            value: 40,
            relation: 'eq',
          },
          max_score: null,
          hits: [],
        },
        aggregations: {
          by_status: {
            doc_count_error_upper_bound: 0,
            sum_other_doc_count: 0,
            buckets: [
              {
                key: 'failed',
                doc_count: 36,
                by_task_type: {
                  doc_count_error_upper_bound: 0,
                  sum_other_doc_count: 0,
                  buckets: [
                    {
                      key: 'alerting:.index-threshold',
                      doc_count: 4,
                    },
                    {
                      key: 'alerting:document.test.',
                      doc_count: 32,
                    },
                  ],
                },
              },
              {
                key: 'unrecognized',
                doc_count: 4,
                by_task_type: {
                  doc_count_error_upper_bound: 0,
                  sum_other_doc_count: 0,
                  buckets: [
                    {
                      key: 'alerting:logs.alert.document.count',
                      doc_count: 4,
                    },
                  ],
                },
              },
            ],
          },
        },
      });

      const telemetry = await getFailedAndUnrecognizedTasksPerDay({
        esClient,
        taskManagerIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);

      expect(telemetry).toStrictEqual({
        countFailedAndUnrecognizedTasks: 40,
        countFailedAndUnrecognizedTasksByStatus: {
          failed: 36,
          unrecognized: 4,
        },
        countFailedAndUnrecognizedTasksByStatusByType: {
          failed: {
            '__index-threshold': 4,
            document__test__: 32,
          },
          unrecognized: {
            // eslint-disable-next-line @typescript-eslint/naming-convention
            logs__alert__document__count: 4,
          },
        },
      });
    });

    test('should return empty results and log warning if query throws error', async () => {
      esClient.search.mockRejectedValue(new Error('oh no'));

      const telemetry = await getFailedAndUnrecognizedTasksPerDay({
        esClient,
        taskManagerIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);

      const loggerCall = logger.warn.mock.calls[0][0];
      const loggerMeta = logger.warn.mock.calls[0][1];
      expect(loggerCall as string).toMatchInlineSnapshot(
        `"Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - {}"`
      );
      expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
      expect(loggerMeta?.error?.stack_trace).toBeDefined();
      expect(telemetry).toStrictEqual({
        countFailedAndUnrecognizedTasks: 0,
        countFailedAndUnrecognizedTasksByStatus: {},
        countFailedAndUnrecognizedTasksByStatusByType: {},
      });
    });
  });
});
@@ -0,0 +1,199 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { isEmpty, merge } from 'lodash';
import type {
  AggregationsTermsAggregateBase,
  AggregationsStringTermsBucketKeys,
  AggregationsBuckets,
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { replaceDotSymbols } from './replace_dots_with_underscores';
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';

interface Opts {
  esClient: ElasticsearchClient;
  taskManagerIndex: string;
  logger: Logger;
}

interface GetFailedAndUnrecognizedTasksAggregationBucket extends AggregationsStringTermsBucketKeys {
  by_task_type: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
}

interface GetFailedAndUnrecognizedTasksResults {
  countFailedAndUnrecognizedTasks: number;
  countFailedAndUnrecognizedTasksByStatus: Record<string, number>;
  countFailedAndUnrecognizedTasksByStatusByType: Record<string, Record<string, number>>;
}

export async function getFailedAndUnrecognizedTasksPerDay({
  esClient,
  taskManagerIndex,
  logger,
}: Opts): Promise<GetFailedAndUnrecognizedTasksResults> {
  try {
    const query = {
      index: taskManagerIndex,
      size: 0,
      body: {
        query: {
          bool: {
            must: [
              {
                bool: {
                  should: [
                    {
                      term: {
                        'task.status': 'unrecognized',
                      },
                    },
                    {
                      term: {
                        'task.status': 'failed',
                      },
                    },
                  ],
                },
              },
              {
                wildcard: {
                  'task.taskType': {
                    value: 'alerting:*',
                  },
                },
              },
              {
                range: {
                  'task.runAt': {
                    gte: 'now-1d',
                  },
                },
              },
            ],
          },
        },
        aggs: {
          by_status: {
            terms: {
              field: 'task.status',
              size: 10,
            },
            aggs: {
              by_task_type: {
                terms: {
                  field: 'task.taskType',
                  // Use number of alerting rule types because we're filtering by 'alerting:'
                  size: NUM_ALERTING_RULE_TYPES,
                },
              },
            },
          },
        },
      },
    };

    logger.debug(`query for getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify(query)}`);
    const results = await esClient.search(query);

    logger.debug(
      `results for getFailedAndUnrecognizedTasksPerDay query - ${JSON.stringify(results)}`
    );

    const aggregations = results.aggregations as {
      by_status: AggregationsTermsAggregateBase<GetFailedAndUnrecognizedTasksAggregationBucket>;
    };

    const totalFailedAndUnrecognizedTasks =
      typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;

    const aggregationsByStatus: AggregationsBuckets<GetFailedAndUnrecognizedTasksAggregationBucket> =
      aggregations.by_status.buckets as GetFailedAndUnrecognizedTasksAggregationBucket[];

    return {
      ...parseBucket(aggregationsByStatus),
      countFailedAndUnrecognizedTasks: totalFailedAndUnrecognizedTasks ?? 0,
    };
  } catch (err) {
    logger.warn(
      `Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify(
        err
      )}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err.stack },
      }
    );
    return {
      countFailedAndUnrecognizedTasks: 0,
      countFailedAndUnrecognizedTasksByStatus: {},
      countFailedAndUnrecognizedTasksByStatusByType: {},
    };
  }
}

/**
 * Bucket format:
 * {
 *   "key": "idle", // task status
 *   "doc_count": 28, // number of tasks with this status
 *   "by_task_type": {
 *     "doc_count_error_upper_bound": 0,
 *     "sum_other_doc_count": 0,
 *     "buckets": [
 *       {
 *         "key": "alerting:.es-query", // breakdown of task type for status
 *         "doc_count": 1
 *       },
 *       {
 *         "key": "alerting:.index-threshold",
 *         "doc_count": 1
 *       }
 *     ]
 *   }
 * }
 */

export function parseBucket(
  buckets: GetFailedAndUnrecognizedTasksAggregationBucket[]
): Pick<
  GetFailedAndUnrecognizedTasksResults,
  'countFailedAndUnrecognizedTasksByStatus' | 'countFailedAndUnrecognizedTasksByStatusByType'
> {
  return (buckets ?? []).reduce(
    (summary, bucket) => {
      const status: string = bucket.key;
      const taskTypeBuckets = bucket?.by_task_type?.buckets as AggregationsStringTermsBucketKeys[];

      const byTaskType = (taskTypeBuckets ?? []).reduce(
        (acc: Record<string, number>, taskTypeBucket: AggregationsStringTermsBucketKeys) => {
          const taskType: string = replaceDotSymbols(taskTypeBucket.key.replace('alerting:', ''));
          return {
            ...acc,
            [taskType]: taskTypeBucket.doc_count ?? 0,
          };
        },
        {}
      );
      return {
        ...summary,
        countFailedAndUnrecognizedTasksByStatus: {
          ...summary.countFailedAndUnrecognizedTasksByStatus,
          [status]: bucket?.doc_count ?? 0,
        },
        countFailedAndUnrecognizedTasksByStatusByType: merge(
|
||||||
|
summary.countFailedAndUnrecognizedTasksByStatusByType,
|
||||||
|
isEmpty(byTaskType) ? {} : { [status]: byTaskType }
|
||||||
|
),
|
||||||
|
};
|
||||||
|
},
|
||||||
|
{
|
||||||
|
countFailedAndUnrecognizedTasksByStatus: {},
|
||||||
|
countFailedAndUnrecognizedTasksByStatusByType: {},
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
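As a quick illustration of the transform, here is a sketch that feeds `parseBucket` a bucket shaped like the doc comment above (the counts are made up):

// Sketch only: one 'failed' status bucket with a per-task-type breakdown.
import { parseBucket } from './get_telemetry_from_task_manager';

const parsed = parseBucket([
  {
    key: 'failed',
    doc_count: 3,
    by_task_type: {
      buckets: [
        { key: 'alerting:.es-query', doc_count: 1 },
        { key: 'alerting:.index-threshold', doc_count: 2 },
      ],
    },
  },
]);
// parsed => {
//   countFailedAndUnrecognizedTasksByStatus: { failed: 3 },
//   countFailedAndUnrecognizedTasksByStatusByType: {
//     failed: { '__es-query': 1, '__index-threshold': 2 },
//   },
// }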
@@ -0,0 +1,67 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';

describe('parseSimpleRuleTypeBucket', () => {
  test('should correctly parse rule type bucket results', () => {
    expect(
      parseSimpleRuleTypeBucket([
        {
          key: '.index-threshold',
          doc_count: 78,
        },
        {
          key: 'document.test.',
          doc_count: 42,
        },
        {
          key: 'logs.alert.document.count',
          doc_count: 28,
        },
      ])
    ).toEqual({
      '__index-threshold': 78,
      document__test__: 42,
      // eslint-disable-next-line @typescript-eslint/naming-convention
      logs__alert__document__count: 28,
    });
  });

  test('should handle missing values', () => {
    expect(
      parseSimpleRuleTypeBucket([
        // @ts-expect-error
        {
          key: '.index-threshold',
        },
        {
          key: 'document.test.',
          doc_count: 42,
        },
        {
          key: 'logs.alert.document.count',
          doc_count: 28,
        },
      ])
    ).toEqual({
      '__index-threshold': 0,
      document__test__: 42,
      // eslint-disable-next-line @typescript-eslint/naming-convention
      logs__alert__document__count: 28,
    });
  });

  test('should handle empty input', () => {
    expect(parseSimpleRuleTypeBucket([])).toEqual({});
  });

  test('should handle undefined input', () => {
    // @ts-expect-error
    expect(parseSimpleRuleTypeBucket(undefined)).toEqual({});
  });
});
@@ -0,0 +1,25 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import {
  AggregationsBuckets,
  AggregationsStringTermsBucketKeys,
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { replaceDotSymbols } from './replace_dots_with_underscores';

export function parseSimpleRuleTypeBucket(
  ruleTypeBuckets: AggregationsBuckets<AggregationsStringTermsBucketKeys>
) {
  const buckets = ruleTypeBuckets as AggregationsStringTermsBucketKeys[];
  return (buckets ?? []).reduce((acc, bucket: AggregationsStringTermsBucketKeys) => {
    const ruleType: string = replaceDotSymbols(bucket.key);
    return {
      ...acc,
      [ruleType]: bucket.doc_count ?? 0,
    };
  }, {});
}
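For context, buckets in this shape come straight out of a terms aggregation. Below is a minimal sketch of a producing query; the index name, aggregated field, and size are illustrative placeholders, not the exact values used by the telemetry queries:

// Sketch only: a terms aggregation whose buckets feed parseSimpleRuleTypeBucket.
import type {
  AggregationsStringTermsBucketKeys,
  AggregationsTermsAggregateBase,
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { ElasticsearchClient } from '@kbn/core/server';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';

async function countRulesByType(esClient: ElasticsearchClient, index: string) {
  const results = await esClient.search({
    index,
    size: 0,
    body: {
      aggs: {
        by_rule_type_id: {
          // Placeholder field and size for illustration.
          terms: { field: 'alert.alertTypeId', size: 100 },
        },
      },
    },
  });
  const aggregations = results.aggregations as {
    by_rule_type_id: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
  };
  // e.g. { '__index-threshold': 78, logs__alert__document__count: 28 }
  return parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets);
}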
@@ -0,0 +1,14 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { replaceDotSymbols } from './replace_dots_with_underscores';

describe('replaceDotSymbols', () => {
  test('should replace "." symbols with "__" in string', async () => {
    expect(replaceDotSymbols('.index-threshold')).toEqual('__index-threshold');
  });
});
@@ -0,0 +1,10 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

export function replaceDotSymbols(strToReplace: string) {
  return strToReplace.replaceAll('.', '__');
}
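Note that `String.prototype.replaceAll` requires an ES2021 target; on older targets the same behavior is a one-line global-regex replace. A sketch of the equivalent (the `Compat` name is hypothetical):

// Equivalent without replaceAll: the g flag makes replace() substitute every
// '.' rather than only the first occurrence.
export function replaceDotSymbolsCompat(strToReplace: string) {
  return strToReplace.replace(/\./g, '__');
}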
@@ -13,13 +13,12 @@ import {
   TaskManagerStartContract,
 } from '@kbn/task-manager-plugin/server';
-import {
-  getTotalCountAggregations,
-  getTotalCountInUse,
-  getExecutionsPerDayCount,
-  getExecutionTimeoutsPerDayCount,
-  getFailedAndUnrecognizedTasksPerDay,
-} from './alerting_telemetry';
+import { getFailedAndUnrecognizedTasksPerDay } from './lib/get_telemetry_from_task_manager';
+import { getTotalCountAggregations, getTotalCountInUse } from './lib/get_telemetry_from_kibana';
+import {
+  getExecutionsPerDayCount,
+  getExecutionTimeoutsPerDayCount,
+} from './lib/get_telemetry_from_event_log';
 
 export const TELEMETRY_TASK_TYPE = 'alerting_telemetry';
@@ -98,11 +97,11 @@ export function telemetryTaskRunner(
     async run() {
       const esClient = await getEsClient();
       return Promise.all([
-        getTotalCountAggregations(esClient, kibanaIndex, logger),
-        getTotalCountInUse(esClient, kibanaIndex, logger),
-        getExecutionsPerDayCount(esClient, eventLogIndex, logger),
-        getExecutionTimeoutsPerDayCount(esClient, eventLogIndex, logger),
-        getFailedAndUnrecognizedTasksPerDay(esClient, taskManagerIndex, logger),
+        getTotalCountAggregations({ esClient, kibanaIndex, logger }),
+        getTotalCountInUse({ esClient, kibanaIndex, logger }),
+        getExecutionsPerDayCount({ esClient, eventLogIndex, logger }),
+        getExecutionTimeoutsPerDayCount({ esClient, eventLogIndex, logger }),
+        getFailedAndUnrecognizedTasksPerDay({ esClient, taskManagerIndex, logger }),
       ])
         .then(
           ([
@@ -120,22 +119,25 @@ export function telemetryTaskRunner(
             count_active_total: totalInUse.countTotal,
             count_disabled_total: totalCountAggregations.count_total - totalInUse.countTotal,
             count_rules_namespaces: totalInUse.countNamespaces,
-            count_rules_executions_per_day: dailyExecutionCounts.countTotal,
-            count_rules_executions_by_type_per_day: dailyExecutionCounts.countByType,
-            count_rules_executions_failured_per_day: dailyExecutionCounts.countTotalFailures,
+            count_rules_executions_per_day: dailyExecutionCounts.countTotalRuleExecutions,
+            count_rules_executions_by_type_per_day:
+              dailyExecutionCounts.countRuleExecutionsByType,
+            count_rules_executions_failured_per_day:
+              dailyExecutionCounts.countTotalFailedExecutions,
             count_rules_executions_failured_by_reason_per_day:
-              dailyExecutionCounts.countFailuresByReason,
+              dailyExecutionCounts.countFailedExecutionsByReason,
             count_rules_executions_failured_by_reason_by_type_per_day:
-              dailyExecutionCounts.countFailuresByReasonByType,
+              dailyExecutionCounts.countFailedExecutionsByReasonByType,
-            count_rules_executions_timeouts_per_day: dailyExecutionTimeoutCounts.countTotal,
+            count_rules_executions_timeouts_per_day:
+              dailyExecutionTimeoutCounts.countExecutionTimeouts,
             count_rules_executions_timeouts_by_type_per_day:
-              dailyExecutionTimeoutCounts.countByType,
+              dailyExecutionTimeoutCounts.countExecutionTimeoutsByType,
             count_failed_and_unrecognized_rule_tasks_per_day:
-              dailyFailedAndUnrecognizedTasks.countTotal,
+              dailyFailedAndUnrecognizedTasks.countFailedAndUnrecognizedTasks,
             count_failed_and_unrecognized_rule_tasks_by_status_per_day:
-              dailyFailedAndUnrecognizedTasks.countByStatus,
+              dailyFailedAndUnrecognizedTasks.countFailedAndUnrecognizedTasksByStatus,
             count_failed_and_unrecognized_rule_tasks_by_status_by_type_per_day:
-              dailyFailedAndUnrecognizedTasks.countByStatusByRuleType,
+              dailyFailedAndUnrecognizedTasks.countFailedAndUnrecognizedTasksByStatusByType,
             avg_execution_time_per_day: dailyExecutionCounts.avgExecutionTime,
             avg_execution_time_by_type_per_day: dailyExecutionCounts.avgExecutionTimeByType,
             avg_es_search_duration_per_day: dailyExecutionCounts.avgEsSearchDuration,
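The `.then` parameter list is elided between the two hunks above; as a sketch, the tuple destructuring implied by the `Promise.all` order (names taken from the usages in the mapping hunk) is:

// Sketch only: how the five query results line up with the names used when
// building the telemetry state; the order matches the Promise.all above.
const [
  totalCountAggregations,
  totalInUse,
  dailyExecutionCounts,
  dailyExecutionTimeoutCounts,
  dailyFailedAndUnrecognizedTasks,
] = await Promise.all([
  getTotalCountAggregations({ esClient, kibanaIndex, logger }),
  getTotalCountInUse({ esClient, kibanaIndex, logger }),
  getExecutionsPerDayCount({ esClient, eventLogIndex, logger }),
  getExecutionTimeoutsPerDayCount({ esClient, eventLogIndex, logger }),
  getFailedAndUnrecognizedTasksPerDay({ esClient, taskManagerIndex, logger }),
]);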