mirror of
https://github.com/elastic/kibana.git
synced 2025-04-24 09:48:58 -04:00
[Response Ops] Replace scripted metric aggs in alerting telemetry queries with terms aggregations (#134769)
* Updating getTotalCountAggregations query * Replacing scripted metric aggs with terms aggregations * Fixing task manager query * Updating replaceDotSymbols fn * Adding stack trace to logger meta * Reusing event log query * Adding fallback for bucket key and doc_count * Switch reduce for for loop * combining aggs * Fixing nulls issue Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
parent
8440cec9a6
commit
4b7b363e9c
14 changed files with 3267 additions and 1706 deletions
|
@ -1,725 +0,0 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
/* eslint-disable @typescript-eslint/naming-convention */
|
||||
|
||||
// eslint-disable-next-line @kbn/eslint/no-restricted-paths
|
||||
import { elasticsearchClientMock } from '@kbn/core/server/elasticsearch/client/mocks';
|
||||
import { loggingSystemMock } from '@kbn/core/server/mocks';
|
||||
import {
|
||||
getTotalCountAggregations,
|
||||
getTotalCountInUse,
|
||||
getExecutionsPerDayCount,
|
||||
getExecutionTimeoutsPerDayCount,
|
||||
getFailedAndUnrecognizedTasksPerDay,
|
||||
parsePercentileAggsByRuleType,
|
||||
} from './alerting_telemetry';
|
||||
|
||||
const mockLogger = loggingSystemMock.create().get();
|
||||
describe('alerting telemetry', () => {
|
||||
test('getTotalCountInUse should replace "." symbols with "__" in rule types names', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockResponse(
|
||||
// @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
|
||||
{
|
||||
aggregations: {
|
||||
byRuleTypeId: {
|
||||
value: {
|
||||
ruleTypes: {
|
||||
'.index-threshold': 2,
|
||||
'logs.alert.document.count': 1,
|
||||
'document.test.': 1,
|
||||
},
|
||||
namespaces: {
|
||||
default: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
hits: {
|
||||
hits: [],
|
||||
},
|
||||
}
|
||||
);
|
||||
|
||||
const telemetry = await getTotalCountInUse(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
|
||||
expect(telemetry).toMatchInlineSnapshot(`
|
||||
Object {
|
||||
"countByType": Object {
|
||||
"__index-threshold": 2,
|
||||
"document__test__": 1,
|
||||
"logs__alert__document__count": 1,
|
||||
},
|
||||
"countNamespaces": 1,
|
||||
"countTotal": 4,
|
||||
}
|
||||
`);
|
||||
});
|
||||
|
||||
test('getTotalCountInUse should return empty results if query throws error', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockRejectedValue(new Error('oh no'));
|
||||
|
||||
const telemetry = await getTotalCountInUse(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
expect(mockLogger.warn).toHaveBeenCalledWith(
|
||||
`Error executing alerting telemetry task: getTotalCountInUse - {}`
|
||||
);
|
||||
expect(telemetry).toMatchInlineSnapshot(`
|
||||
Object {
|
||||
"countByType": Object {},
|
||||
"countNamespaces": 0,
|
||||
"countTotal": 0,
|
||||
}
|
||||
`);
|
||||
});
|
||||
|
||||
test('getTotalCountAggregations should return min/max connectors in use', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockResponse(
|
||||
// @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
|
||||
{
|
||||
aggregations: {
|
||||
byRuleTypeId: {
|
||||
value: {
|
||||
ruleTypes: {
|
||||
'.index-threshold': 2,
|
||||
'logs.alert.document.count': 1,
|
||||
'document.test.': 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
max_throttle_time: { value: 60 },
|
||||
min_throttle_time: { value: 0 },
|
||||
avg_throttle_time: { value: 30 },
|
||||
max_interval_time: { value: 10 },
|
||||
min_interval_time: { value: 1 },
|
||||
avg_interval_time: { value: 4.5 },
|
||||
max_actions_count: { value: 4 },
|
||||
min_actions_count: { value: 0 },
|
||||
avg_actions_count: { value: 2.5 },
|
||||
},
|
||||
hits: {
|
||||
hits: [],
|
||||
},
|
||||
}
|
||||
);
|
||||
|
||||
const telemetry = await getTotalCountAggregations(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
|
||||
expect(telemetry).toMatchInlineSnapshot(`
|
||||
Object {
|
||||
"connectors_per_alert": Object {
|
||||
"avg": 2.5,
|
||||
"max": 4,
|
||||
"min": 0,
|
||||
},
|
||||
"count_by_type": Object {
|
||||
"__index-threshold": 2,
|
||||
"document__test__": 1,
|
||||
"logs__alert__document__count": 1,
|
||||
},
|
||||
"count_rules_namespaces": 0,
|
||||
"count_total": 4,
|
||||
"schedule_time": Object {
|
||||
"avg": "4.5s",
|
||||
"max": "10s",
|
||||
"min": "1s",
|
||||
},
|
||||
"schedule_time_number_s": Object {
|
||||
"avg": 4.5,
|
||||
"max": 10,
|
||||
"min": 1,
|
||||
},
|
||||
"throttle_time": Object {
|
||||
"avg": "30s",
|
||||
"max": "60s",
|
||||
"min": "0s",
|
||||
},
|
||||
"throttle_time_number_s": Object {
|
||||
"avg": 30,
|
||||
"max": 60,
|
||||
"min": 0,
|
||||
},
|
||||
}
|
||||
`);
|
||||
});
|
||||
|
||||
test('getTotalCountAggregations should return empty results if query throws error', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockRejectedValue(new Error('oh no'));
|
||||
|
||||
const telemetry = await getTotalCountAggregations(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
expect(mockLogger.warn).toHaveBeenCalledWith(
|
||||
`Error executing alerting telemetry task: getTotalCountAggregations - {}`
|
||||
);
|
||||
expect(telemetry).toMatchInlineSnapshot(`
|
||||
Object {
|
||||
"connectors_per_alert": Object {
|
||||
"avg": 0,
|
||||
"max": 0,
|
||||
"min": 0,
|
||||
},
|
||||
"count_by_type": Object {},
|
||||
"count_rules_namespaces": 0,
|
||||
"count_total": 0,
|
||||
"schedule_time": Object {
|
||||
"avg": "0s",
|
||||
"max": "0s",
|
||||
"min": "0s",
|
||||
},
|
||||
"schedule_time_number_s": Object {
|
||||
"avg": 0,
|
||||
"max": 0,
|
||||
"min": 0,
|
||||
},
|
||||
"throttle_time": Object {
|
||||
"avg": "0s",
|
||||
"max": "0s",
|
||||
"min": "0s",
|
||||
},
|
||||
"throttle_time_number_s": Object {
|
||||
"avg": 0,
|
||||
"max": 0,
|
||||
"min": 0,
|
||||
},
|
||||
}
|
||||
`);
|
||||
});
|
||||
|
||||
test('getExecutionsPerDayCount should return execution aggregations for total count, count by rule type and number of failed executions', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockResponse(
|
||||
// @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
|
||||
{
|
||||
aggregations: {
|
||||
byRuleTypeId: {
|
||||
value: {
|
||||
ruleTypes: {
|
||||
'.index-threshold': 2,
|
||||
'logs.alert.document.count': 1,
|
||||
'document.test.': 1,
|
||||
},
|
||||
ruleTypesDuration: {
|
||||
'.index-threshold': 2087868,
|
||||
'logs.alert.document.count': 1675765,
|
||||
'document.test.': 17687687,
|
||||
},
|
||||
ruleTypesEsSearchDuration: {
|
||||
'.index-threshold': 23,
|
||||
'logs.alert.document.count': 526,
|
||||
'document.test.': 534,
|
||||
},
|
||||
ruleTypesTotalSearchDuration: {
|
||||
'.index-threshold': 62,
|
||||
'logs.alert.document.count': 588,
|
||||
'document.test.': 637,
|
||||
},
|
||||
},
|
||||
},
|
||||
failuresByReason: {
|
||||
value: {
|
||||
reasons: {
|
||||
unknown: {
|
||||
'.index-threshold': 2,
|
||||
'logs.alert.document.count': 1,
|
||||
'document.test.': 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
avgDuration: { value: 10 },
|
||||
avgEsSearchDuration: {
|
||||
value: 25.785714285714285,
|
||||
},
|
||||
avgTotalSearchDuration: {
|
||||
value: 30.642857142857142,
|
||||
},
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': 4.0,
|
||||
'90.0': 26.0,
|
||||
'99.0': 26.0,
|
||||
},
|
||||
},
|
||||
percentileAlerts: {
|
||||
values: {
|
||||
'50.0': 10.0,
|
||||
'90.0': 22.0,
|
||||
'99.0': 22.0,
|
||||
},
|
||||
},
|
||||
aggsByType: {
|
||||
doc_count_error_upper_bound: 0,
|
||||
sum_other_doc_count: 0,
|
||||
buckets: [
|
||||
{
|
||||
key: '.index-threshold',
|
||||
doc_count: 149,
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': 4.0,
|
||||
'90.0': 26.0,
|
||||
'99.0': 26.0,
|
||||
},
|
||||
},
|
||||
percentileAlerts: {
|
||||
values: {
|
||||
'50.0': 10.0,
|
||||
'90.0': 22.0,
|
||||
'99.0': 22.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
key: 'logs.alert.document.count',
|
||||
doc_count: 1,
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': 10.0,
|
||||
'90.0': 10.0,
|
||||
'99.0': 10.0,
|
||||
},
|
||||
},
|
||||
percentileAlerts: {
|
||||
values: {
|
||||
'50.0': 5.0,
|
||||
'90.0': 13.0,
|
||||
'99.0': 13.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
hits: {
|
||||
hits: [],
|
||||
},
|
||||
}
|
||||
);
|
||||
|
||||
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
|
||||
expect(telemetry).toStrictEqual({
|
||||
avgEsSearchDuration: 26,
|
||||
avgEsSearchDurationByType: {
|
||||
'__index-threshold': 12,
|
||||
document__test__: 534,
|
||||
logs__alert__document__count: 526,
|
||||
},
|
||||
avgExecutionTime: 0,
|
||||
avgExecutionTimeByType: {
|
||||
'__index-threshold': 1043934,
|
||||
document__test__: 17687687,
|
||||
logs__alert__document__count: 1675765,
|
||||
},
|
||||
avgTotalSearchDuration: 31,
|
||||
avgTotalSearchDurationByType: {
|
||||
'__index-threshold': 31,
|
||||
document__test__: 637,
|
||||
logs__alert__document__count: 588,
|
||||
},
|
||||
countByType: {
|
||||
'__index-threshold': 2,
|
||||
document__test__: 1,
|
||||
logs__alert__document__count: 1,
|
||||
},
|
||||
countFailuresByReason: {
|
||||
unknown: 4,
|
||||
},
|
||||
countFailuresByReasonByType: {
|
||||
unknown: {
|
||||
'__index-threshold': 2,
|
||||
document__test__: 1,
|
||||
logs__alert__document__count: 1,
|
||||
},
|
||||
},
|
||||
countTotal: 4,
|
||||
countTotalFailures: 4,
|
||||
generatedActionsPercentiles: {
|
||||
p50: 4,
|
||||
p90: 26,
|
||||
p99: 26,
|
||||
},
|
||||
generatedActionsPercentilesByType: {
|
||||
p50: {
|
||||
'__index-threshold': 4,
|
||||
logs__alert__document__count: 10,
|
||||
},
|
||||
p90: {
|
||||
'__index-threshold': 26,
|
||||
logs__alert__document__count: 10,
|
||||
},
|
||||
p99: {
|
||||
'__index-threshold': 26,
|
||||
logs__alert__document__count: 10,
|
||||
},
|
||||
},
|
||||
alertsPercentiles: {
|
||||
p50: 10,
|
||||
p90: 22,
|
||||
p99: 22,
|
||||
},
|
||||
alertsPercentilesByType: {
|
||||
p50: {
|
||||
'__index-threshold': 10,
|
||||
logs__alert__document__count: 5,
|
||||
},
|
||||
p90: {
|
||||
'__index-threshold': 22,
|
||||
logs__alert__document__count: 13,
|
||||
},
|
||||
p99: {
|
||||
'__index-threshold': 22,
|
||||
logs__alert__document__count: 13,
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
test('getExecutionsPerDayCount should return empty results if query throws error', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockRejectedValue(new Error('oh no'));
|
||||
|
||||
const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
expect(mockLogger.warn).toHaveBeenCalledWith(
|
||||
`Error executing alerting telemetry task: getExecutionsPerDayCount - {}`
|
||||
);
|
||||
expect(telemetry).toStrictEqual({
|
||||
avgEsSearchDuration: 0,
|
||||
avgEsSearchDurationByType: {},
|
||||
avgExecutionTime: 0,
|
||||
avgExecutionTimeByType: {},
|
||||
avgTotalSearchDuration: 0,
|
||||
avgTotalSearchDurationByType: {},
|
||||
countByType: {},
|
||||
countFailuresByReason: {},
|
||||
countFailuresByReasonByType: {},
|
||||
countTotal: 0,
|
||||
countTotalFailures: 0,
|
||||
generatedActionsPercentiles: {},
|
||||
generatedActionsPercentilesByType: {},
|
||||
alertsPercentiles: {},
|
||||
alertsPercentilesByType: {},
|
||||
});
|
||||
});
|
||||
|
||||
test('getExecutionTimeoutsPerDayCount should return execution aggregations for total timeout count and count by rule type', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockResponse(
|
||||
// @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
|
||||
{
|
||||
aggregations: {
|
||||
byRuleTypeId: {
|
||||
value: {
|
||||
ruleTypes: {
|
||||
'.index-threshold': 2,
|
||||
'logs.alert.document.count': 1,
|
||||
'document.test.': 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
hits: {
|
||||
hits: [],
|
||||
},
|
||||
}
|
||||
);
|
||||
|
||||
const telemetry = await getExecutionTimeoutsPerDayCount(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
|
||||
expect(telemetry).toStrictEqual({
|
||||
countTotal: 4,
|
||||
countByType: {
|
||||
'__index-threshold': 2,
|
||||
document__test__: 1,
|
||||
logs__alert__document__count: 1,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
test('getExecutionTimeoutsPerDayCount should return empty results if query throws error', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockRejectedValue(new Error('oh no'));
|
||||
|
||||
const telemetry = await getExecutionTimeoutsPerDayCount(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
expect(mockLogger.warn).toHaveBeenCalledWith(
|
||||
`Error executing alerting telemetry task: getExecutionsPerDayCount - {}`
|
||||
);
|
||||
expect(telemetry).toStrictEqual({
|
||||
countTotal: 0,
|
||||
countByType: {},
|
||||
});
|
||||
});
|
||||
|
||||
test('getFailedAndUnrecognizedTasksPerDay should aggregations for total count, count by status and count by status and rule type for failed and unrecognized tasks', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockResponse(
|
||||
// @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
|
||||
{
|
||||
aggregations: {
|
||||
byTaskTypeId: {
|
||||
value: {
|
||||
statuses: {
|
||||
failed: {
|
||||
'.index-threshold': 2,
|
||||
'logs.alert.document.count': 1,
|
||||
'document.test.': 1,
|
||||
},
|
||||
unrecognized: {
|
||||
'o.l.d.task-type': 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
hits: {
|
||||
hits: [],
|
||||
},
|
||||
}
|
||||
);
|
||||
|
||||
const telemetry = await getFailedAndUnrecognizedTasksPerDay(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
|
||||
expect(telemetry).toStrictEqual({
|
||||
countByStatus: {
|
||||
failed: 4,
|
||||
unrecognized: 1,
|
||||
},
|
||||
countByStatusByRuleType: {
|
||||
failed: {
|
||||
'__index-threshold': 2,
|
||||
document__test__: 1,
|
||||
logs__alert__document__count: 1,
|
||||
},
|
||||
unrecognized: {
|
||||
'o__l__d__task-type': 1,
|
||||
},
|
||||
},
|
||||
countTotal: 5,
|
||||
});
|
||||
});
|
||||
|
||||
test('getFailedAndUnrecognizedTasksPerDay should return empty results if query throws error', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockRejectedValue(new Error('oh no'));
|
||||
|
||||
const telemetry = await getFailedAndUnrecognizedTasksPerDay(mockEsClient, 'test', mockLogger);
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
expect(mockLogger.warn).toHaveBeenCalledWith(
|
||||
`Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - {}`
|
||||
);
|
||||
expect(telemetry).toStrictEqual({
|
||||
countByStatus: {},
|
||||
countByStatusByRuleType: {},
|
||||
countTotal: 0,
|
||||
});
|
||||
});
|
||||
|
||||
test('parsePercentileAggsByRuleType', () => {
|
||||
const aggsByType = {
|
||||
doc_count_error_upper_bound: 0,
|
||||
sum_other_doc_count: 0,
|
||||
buckets: [
|
||||
{
|
||||
key: '.index-threshold',
|
||||
doc_count: 149,
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': 4.0,
|
||||
'90.0': 26.0,
|
||||
'99.0': 26.0,
|
||||
},
|
||||
},
|
||||
percentileAlerts: {
|
||||
values: {
|
||||
'50.0': 3.0,
|
||||
'90.0': 22.0,
|
||||
'99.0': 22.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
key: 'logs.alert.document.count',
|
||||
doc_count: 1,
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': 10.0,
|
||||
'90.0': 10.0,
|
||||
'99.0': 10.0,
|
||||
},
|
||||
},
|
||||
percentileAlerts: {
|
||||
values: {
|
||||
'50.0': 5.0,
|
||||
'90.0': 16.0,
|
||||
'99.0': 16.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
key: 'document.test.',
|
||||
doc_count: 1,
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': null,
|
||||
'90.0': null,
|
||||
'99.0': null,
|
||||
},
|
||||
},
|
||||
percentileAlerts: {
|
||||
values: {
|
||||
'50.0': null,
|
||||
'90.0': null,
|
||||
'99.0': null,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
expect(
|
||||
parsePercentileAggsByRuleType(aggsByType.buckets, 'percentileScheduledActions.values')
|
||||
).toEqual({
|
||||
p50: {
|
||||
'__index-threshold': 4,
|
||||
document__test__: 0,
|
||||
logs__alert__document__count: 10,
|
||||
},
|
||||
p90: {
|
||||
'__index-threshold': 26,
|
||||
document__test__: 0,
|
||||
logs__alert__document__count: 10,
|
||||
},
|
||||
p99: {
|
||||
'__index-threshold': 26,
|
||||
document__test__: 0,
|
||||
logs__alert__document__count: 10,
|
||||
},
|
||||
});
|
||||
expect(parsePercentileAggsByRuleType(aggsByType.buckets, 'percentileAlerts.values')).toEqual({
|
||||
p50: {
|
||||
'__index-threshold': 3,
|
||||
document__test__: 0,
|
||||
logs__alert__document__count: 5,
|
||||
},
|
||||
p90: {
|
||||
'__index-threshold': 22,
|
||||
document__test__: 0,
|
||||
logs__alert__document__count: 16,
|
||||
},
|
||||
p99: {
|
||||
'__index-threshold': 22,
|
||||
document__test__: 0,
|
||||
logs__alert__document__count: 16,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
test('parsePercentileAggsByRuleType handles unknown path', () => {
|
||||
const aggsByType = {
|
||||
doc_count_error_upper_bound: 0,
|
||||
sum_other_doc_count: 0,
|
||||
buckets: [
|
||||
{
|
||||
key: '.index-threshold',
|
||||
doc_count: 149,
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': 4.0,
|
||||
'90.0': 26.0,
|
||||
'99.0': 26.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
key: 'logs.alert.document.count',
|
||||
doc_count: 1,
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': 10.0,
|
||||
'90.0': 10.0,
|
||||
'99.0': 10.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
expect(parsePercentileAggsByRuleType(aggsByType.buckets, 'foo.values')).toEqual({
|
||||
p50: {},
|
||||
p90: {},
|
||||
p99: {},
|
||||
});
|
||||
});
|
||||
|
||||
test('parsePercentileAggsByRuleType handles unrecognized percentiles', () => {
|
||||
const aggsByType = {
|
||||
doc_count_error_upper_bound: 0,
|
||||
sum_other_doc_count: 0,
|
||||
buckets: [
|
||||
{
|
||||
key: '.index-threshold',
|
||||
doc_count: 149,
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': 4.0,
|
||||
'75.0': 8.0,
|
||||
'90.0': 26.0,
|
||||
'99.0': 26.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
key: 'logs.alert.document.count',
|
||||
doc_count: 1,
|
||||
percentileScheduledActions: {
|
||||
values: {
|
||||
'50.0': 10.0,
|
||||
'75.0': 10.0,
|
||||
'90.0': 10.0,
|
||||
'99.0': 10.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
expect(
|
||||
parsePercentileAggsByRuleType(aggsByType.buckets, 'percentileScheduledActions.values')
|
||||
).toEqual({
|
||||
p50: {
|
||||
'__index-threshold': 4,
|
||||
logs__alert__document__count: 10,
|
||||
},
|
||||
p90: {
|
||||
'__index-threshold': 26,
|
||||
logs__alert__document__count: 10,
|
||||
},
|
||||
p99: {
|
||||
'__index-threshold': 26,
|
||||
logs__alert__document__count: 10,
|
||||
},
|
||||
});
|
||||
});
|
||||
});
|
|
@ -1,962 +0,0 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import { ElasticsearchClient, Logger } from '@kbn/core/server';
|
||||
import { get, merge } from 'lodash';
|
||||
import { AlertingUsage } from './types';
|
||||
import { NUM_ALERTING_RULE_TYPES } from './alerting_usage_collector';
|
||||
|
||||
const percentileFieldNameMapping: Record<string, string> = {
|
||||
'50.0': 'p50',
|
||||
'90.0': 'p90',
|
||||
'99.0': 'p99',
|
||||
};
|
||||
|
||||
const ruleTypeMetric = {
|
||||
scripted_metric: {
|
||||
init_script: 'state.ruleTypes = [:]; state.namespaces = [:]',
|
||||
map_script: `
|
||||
String ruleType = doc['alert.alertTypeId'].value;
|
||||
String namespace = doc['namespaces'] !== null && doc['namespaces'].size() > 0 ? doc['namespaces'].value : 'default';
|
||||
state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1);
|
||||
if (state.namespaces.containsKey(namespace) === false) {
|
||||
state.namespaces.put(namespace, 1);
|
||||
}
|
||||
`,
|
||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
||||
// Despite docs that say this is optional, this script can't be blank.
|
||||
combine_script: 'return state',
|
||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
||||
// This also needs to account for having no data
|
||||
reduce_script: `
|
||||
HashMap result = new HashMap();
|
||||
HashMap combinedRuleTypes = new HashMap();
|
||||
HashMap combinedNamespaces = new HashMap();
|
||||
for (state in states) {
|
||||
for (String ruleType : state.ruleTypes.keySet()) {
|
||||
int ruleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + state.ruleTypes.get(ruleType) : state.ruleTypes.get(ruleType);
|
||||
combinedRuleTypes.put(ruleType, ruleTypeCount);
|
||||
}
|
||||
|
||||
for (String namespace : state.namespaces.keySet()) {
|
||||
combinedNamespaces.put(namespace, 1);
|
||||
}
|
||||
}
|
||||
|
||||
result.ruleTypes = combinedRuleTypes;
|
||||
result.namespaces = combinedNamespaces;
|
||||
return result;
|
||||
`,
|
||||
},
|
||||
};
|
||||
|
||||
const generatedActionsPercentilesAgg = {
|
||||
percentiles: {
|
||||
field: 'kibana.alert.rule.execution.metrics.number_of_generated_actions',
|
||||
percents: [50, 90, 99],
|
||||
},
|
||||
};
|
||||
|
||||
const alertsPercentilesAgg = {
|
||||
percentiles: {
|
||||
field: 'kibana.alert.rule.execution.metrics.alert_counts.active',
|
||||
percents: [50, 90, 99],
|
||||
},
|
||||
};
|
||||
|
||||
const ruleTypeExecutionsWithDurationMetric = {
|
||||
scripted_metric: {
|
||||
init_script:
|
||||
'state.ruleTypes = [:]; state.ruleTypesDuration = [:]; state.ruleTypesEsSearchDuration = [:]; state.ruleTypesTotalSearchDuration = [:];',
|
||||
map_script: `
|
||||
String ruleType = doc['rule.category'].value;
|
||||
long duration = doc['event.duration'].value / (1000 * 1000);
|
||||
long esSearchDuration = doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].value;
|
||||
long totalSearchDuration = doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].value;
|
||||
state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1);
|
||||
state.ruleTypesDuration.put(ruleType, state.ruleTypesDuration.containsKey(ruleType) ? state.ruleTypesDuration.get(ruleType) + duration : duration);
|
||||
state.ruleTypesEsSearchDuration.put(ruleType, state.ruleTypesEsSearchDuration.containsKey(ruleType) ? state.ruleTypesEsSearchDuration.get(ruleType) + esSearchDuration : esSearchDuration);
|
||||
state.ruleTypesTotalSearchDuration.put(ruleType, state.ruleTypesTotalSearchDuration.containsKey(ruleType) ? state.ruleTypesTotalSearchDuration.get(ruleType) + totalSearchDuration : totalSearchDuration);
|
||||
`,
|
||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
||||
// Despite docs that say this is optional, this script can't be blank.
|
||||
combine_script: 'return state',
|
||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
||||
// This also needs to account for having no data
|
||||
reduce_script: `
|
||||
HashMap result = new HashMap();
|
||||
HashMap combinedRuleTypes = new HashMap();
|
||||
HashMap combinedRuleTypeDurations = new HashMap();
|
||||
HashMap combinedRuleTypeEsSearchDurations = new HashMap();
|
||||
HashMap combinedRuleTypeTotalSearchDurations = new HashMap();
|
||||
for (state in states) {
|
||||
for (String ruleType : state.ruleTypes.keySet()) {
|
||||
int ruleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + state.ruleTypes.get(ruleType) : state.ruleTypes.get(ruleType);
|
||||
combinedRuleTypes.put(ruleType, ruleTypeCount);
|
||||
}
|
||||
|
||||
for (String ruleType : state.ruleTypesDuration.keySet()) {
|
||||
long ruleTypeDurationTotal = combinedRuleTypeDurations.containsKey(ruleType) ? combinedRuleTypeDurations.get(ruleType) + state.ruleTypesDuration.get(ruleType) : state.ruleTypesDuration.get(ruleType);
|
||||
combinedRuleTypeDurations.put(ruleType, ruleTypeDurationTotal);
|
||||
}
|
||||
|
||||
for (String ruleType : state.ruleTypesEsSearchDuration.keySet()) {
|
||||
long ruleTypeEsSearchDurationTotal = combinedRuleTypeEsSearchDurations.containsKey(ruleType) ? combinedRuleTypeEsSearchDurations.get(ruleType) + state.ruleTypesEsSearchDuration.get(ruleType) : state.ruleTypesEsSearchDuration.get(ruleType);
|
||||
combinedRuleTypeEsSearchDurations.put(ruleType, ruleTypeEsSearchDurationTotal);
|
||||
}
|
||||
|
||||
for (String ruleType : state.ruleTypesTotalSearchDuration.keySet()) {
|
||||
long ruleTypeTotalSearchDurationTotal = combinedRuleTypeTotalSearchDurations.containsKey(ruleType) ? combinedRuleTypeTotalSearchDurations.get(ruleType) + state.ruleTypesTotalSearchDuration.get(ruleType) : state.ruleTypesTotalSearchDuration.get(ruleType);
|
||||
combinedRuleTypeTotalSearchDurations.put(ruleType, ruleTypeTotalSearchDurationTotal);
|
||||
}
|
||||
}
|
||||
|
||||
result.ruleTypes = combinedRuleTypes;
|
||||
result.ruleTypesDuration = combinedRuleTypeDurations;
|
||||
result.ruleTypesEsSearchDuration = combinedRuleTypeEsSearchDurations;
|
||||
result.ruleTypesTotalSearchDuration = combinedRuleTypeTotalSearchDurations;
|
||||
return result;
|
||||
`,
|
||||
},
|
||||
};
|
||||
|
||||
const ruleTypeExecutionsMetric = {
|
||||
scripted_metric: {
|
||||
init_script: 'state.ruleTypes = [:]',
|
||||
map_script: `
|
||||
String ruleType = doc['rule.category'].value;
|
||||
state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1);
|
||||
`,
|
||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
||||
// Despite docs that say this is optional, this script can't be blank.
|
||||
combine_script: 'return state',
|
||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
||||
// This also needs to account for having no data
|
||||
reduce_script: `
|
||||
HashMap result = new HashMap();
|
||||
HashMap combinedRuleTypes = new HashMap();
|
||||
for (state in states) {
|
||||
for (String ruleType : state.ruleTypes.keySet()) {
|
||||
int ruleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + state.ruleTypes.get(ruleType) : state.ruleTypes.get(ruleType);
|
||||
combinedRuleTypes.put(ruleType, ruleTypeCount);
|
||||
}
|
||||
}
|
||||
|
||||
result.ruleTypes = combinedRuleTypes;
|
||||
return result;
|
||||
`,
|
||||
},
|
||||
};
|
||||
|
||||
const taskTypeExecutionsMetric = {
|
||||
scripted_metric: {
|
||||
init_script: 'state.statuses = [:]',
|
||||
map_script: `
|
||||
String status = doc['task.status'].value;
|
||||
String taskType = doc['task.taskType'].value.replace('alerting:', '');
|
||||
Map taskTypes = state.statuses.containsKey(status) ? state.statuses.get(status) : [:];
|
||||
taskTypes.put(taskType, taskTypes.containsKey(taskType) ? taskTypes.get(taskType) + 1 : 1);
|
||||
state.statuses.put(status, taskTypes);
|
||||
`,
|
||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
||||
// Despite docs that say this is optional, this script can't be blank.
|
||||
combine_script: 'return state',
|
||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
||||
// This also needs to account for having no data
|
||||
reduce_script: `
|
||||
HashMap result = new HashMap();
|
||||
HashMap combinedStatuses = new HashMap();
|
||||
for (state in states) {
|
||||
for (String status : state.statuses.keySet()) {
|
||||
HashMap combinedTaskTypes = new HashMap();
|
||||
Map statusTaskTypes = state.statuses.get(status);
|
||||
for (String taskType : statusTaskTypes.keySet()) {
|
||||
int statusByTaskTypeCount = combinedTaskTypes.containsKey(taskType) ? combinedTaskTypes.get(taskType) + statusTaskTypes.get(taskType) : statusTaskTypes.get(taskType);
|
||||
combinedTaskTypes.put(taskType, statusByTaskTypeCount);
|
||||
}
|
||||
|
||||
combinedStatuses.put(status, combinedTaskTypes);
|
||||
}
|
||||
}
|
||||
result.statuses = combinedStatuses;
|
||||
return result;
|
||||
`,
|
||||
},
|
||||
};
|
||||
|
||||
// Scripted metric aggregation that tallies failed rule executions from event
// log documents, grouped first by failure reason (`event.reason`) and then by
// rule type id (`rule.category`). The final shape is:
//   { reasons: { <reason>: { <ruleTypeId>: <count> } } }
const ruleTypeFailureExecutionsMetric = {
  scripted_metric: {
    // Each shard starts with an empty reason -> (ruleType -> count) map.
    init_script: 'state.reasons = [:]',
    // Per document: only failures contribute; increment the counter for the
    // document's (reason, ruleType) pair.
    map_script: `
      if (doc['event.outcome'].value == 'failure') {
        String reason = doc['event.reason'].value;
        String ruleType = doc['rule.category'].value;
        Map ruleTypes = state.reasons.containsKey(reason) ? state.reasons.get(reason) : [:];
        ruleTypes.put(ruleType, ruleTypes.containsKey(ruleType) ? ruleTypes.get(ruleType) + 1 : 1);
        state.reasons.put(reason, ruleTypes);
      }
    `,
    // Combine script is executed per cluster, but we already have a key-value pair per cluster.
    // Despite docs that say this is optional, this script can't be blank.
    combine_script: 'return state',
    // Reduce script is executed across all clusters, so we need to add up all the total from each cluster
    // This also needs to account for having no data
    reduce_script: `
      HashMap result = new HashMap();
      HashMap combinedReasons = new HashMap();
      for (state in states) {
        for (String reason : state.reasons.keySet()) {
          HashMap combinedRuleTypes = new HashMap();
          Map reasonRuleTypes = state.reasons.get(reason);
          for (String ruleType : state.reasons.get(reason).keySet()) {
            int reasonByRuleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + reasonRuleTypes.get(ruleType) : reasonRuleTypes.get(ruleType);
            combinedRuleTypes.put(ruleType, reasonByRuleTypeCount);
          }

          combinedReasons.put(reason, combinedRuleTypes);
        }
      }
      result.reasons = combinedReasons;
      return result;
    `,
  },
};
|
||||
|
||||
export async function getTotalCountAggregations(
|
||||
esClient: ElasticsearchClient,
|
||||
kibanaIndex: string,
|
||||
logger: Logger
|
||||
): Promise<
|
||||
Pick<
|
||||
AlertingUsage,
|
||||
| 'count_total'
|
||||
| 'count_by_type'
|
||||
| 'throttle_time'
|
||||
| 'schedule_time'
|
||||
| 'throttle_time_number_s'
|
||||
| 'schedule_time_number_s'
|
||||
| 'connectors_per_alert'
|
||||
| 'count_rules_namespaces'
|
||||
>
|
||||
> {
|
||||
try {
|
||||
const results = await esClient.search({
|
||||
index: kibanaIndex,
|
||||
body: {
|
||||
size: 0,
|
||||
query: {
|
||||
bool: {
|
||||
filter: [{ term: { type: 'alert' } }],
|
||||
},
|
||||
},
|
||||
runtime_mappings: {
|
||||
alert_action_count: {
|
||||
type: 'long',
|
||||
script: {
|
||||
source: `
|
||||
def alert = params._source['alert'];
|
||||
if (alert != null) {
|
||||
def actions = alert.actions;
|
||||
if (actions != null) {
|
||||
emit(actions.length);
|
||||
} else {
|
||||
emit(0);
|
||||
}
|
||||
}`,
|
||||
},
|
||||
},
|
||||
alert_interval: {
|
||||
type: 'long',
|
||||
script: {
|
||||
source: `
|
||||
int parsed = 0;
|
||||
if (doc['alert.schedule.interval'].size() > 0) {
|
||||
def interval = doc['alert.schedule.interval'].value;
|
||||
|
||||
if (interval.length() > 1) {
|
||||
// get last char
|
||||
String timeChar = interval.substring(interval.length() - 1);
|
||||
// remove last char
|
||||
interval = interval.substring(0, interval.length() - 1);
|
||||
|
||||
if (interval.chars().allMatch(Character::isDigit)) {
|
||||
// using of regex is not allowed in painless language
|
||||
parsed = Integer.parseInt(interval);
|
||||
|
||||
if (timeChar.equals("s")) {
|
||||
parsed = parsed;
|
||||
} else if (timeChar.equals("m")) {
|
||||
parsed = parsed * 60;
|
||||
} else if (timeChar.equals("h")) {
|
||||
parsed = parsed * 60 * 60;
|
||||
} else if (timeChar.equals("d")) {
|
||||
parsed = parsed * 24 * 60 * 60;
|
||||
}
|
||||
emit(parsed);
|
||||
}
|
||||
}
|
||||
}
|
||||
emit(parsed);
|
||||
`,
|
||||
},
|
||||
},
|
||||
alert_throttle: {
|
||||
type: 'long',
|
||||
script: {
|
||||
source: `
|
||||
int parsed = 0;
|
||||
if (doc['alert.throttle'].size() > 0) {
|
||||
def throttle = doc['alert.throttle'].value;
|
||||
|
||||
if (throttle.length() > 1) {
|
||||
// get last char
|
||||
String timeChar = throttle.substring(throttle.length() - 1);
|
||||
// remove last char
|
||||
throttle = throttle.substring(0, throttle.length() - 1);
|
||||
|
||||
if (throttle.chars().allMatch(Character::isDigit)) {
|
||||
// using of regex is not allowed in painless language
|
||||
parsed = Integer.parseInt(throttle);
|
||||
|
||||
if (timeChar.equals("s")) {
|
||||
parsed = parsed;
|
||||
} else if (timeChar.equals("m")) {
|
||||
parsed = parsed * 60;
|
||||
} else if (timeChar.equals("h")) {
|
||||
parsed = parsed * 60 * 60;
|
||||
} else if (timeChar.equals("d")) {
|
||||
parsed = parsed * 24 * 60 * 60;
|
||||
}
|
||||
emit(parsed);
|
||||
}
|
||||
}
|
||||
}
|
||||
emit(parsed);
|
||||
`,
|
||||
},
|
||||
},
|
||||
},
|
||||
aggs: {
|
||||
byRuleTypeId: ruleTypeMetric,
|
||||
max_throttle_time: { max: { field: 'alert_throttle' } },
|
||||
min_throttle_time: { min: { field: 'alert_throttle' } },
|
||||
avg_throttle_time: { avg: { field: 'alert_throttle' } },
|
||||
max_interval_time: { max: { field: 'alert_interval' } },
|
||||
min_interval_time: { min: { field: 'alert_interval' } },
|
||||
avg_interval_time: { avg: { field: 'alert_interval' } },
|
||||
max_actions_count: { max: { field: 'alert_action_count' } },
|
||||
min_actions_count: { min: { field: 'alert_action_count' } },
|
||||
avg_actions_count: { avg: { field: 'alert_action_count' } },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const aggregations = results.aggregations as {
|
||||
byRuleTypeId: { value: { ruleTypes: Record<string, string> } };
|
||||
max_throttle_time: { value: number };
|
||||
min_throttle_time: { value: number };
|
||||
avg_throttle_time: { value: number };
|
||||
max_interval_time: { value: number };
|
||||
min_interval_time: { value: number };
|
||||
avg_interval_time: { value: number };
|
||||
max_actions_count: { value: number };
|
||||
min_actions_count: { value: number };
|
||||
avg_actions_count: { value: number };
|
||||
};
|
||||
|
||||
const totalRulesCount = Object.keys(aggregations.byRuleTypeId.value.ruleTypes).reduce(
|
||||
(total: number, key: string) =>
|
||||
parseInt(aggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
||||
0
|
||||
);
|
||||
|
||||
return {
|
||||
count_total: totalRulesCount,
|
||||
count_by_type: replaceDotSymbolsInRuleTypeIds(aggregations.byRuleTypeId.value.ruleTypes),
|
||||
throttle_time: {
|
||||
min: `${aggregations.min_throttle_time.value}s`,
|
||||
avg: `${aggregations.avg_throttle_time.value}s`,
|
||||
max: `${aggregations.max_throttle_time.value}s`,
|
||||
},
|
||||
schedule_time: {
|
||||
min: `${aggregations.min_interval_time.value}s`,
|
||||
avg: `${aggregations.avg_interval_time.value}s`,
|
||||
max: `${aggregations.max_interval_time.value}s`,
|
||||
},
|
||||
throttle_time_number_s: {
|
||||
min: aggregations.min_throttle_time.value,
|
||||
avg: aggregations.avg_throttle_time.value,
|
||||
max: aggregations.max_throttle_time.value,
|
||||
},
|
||||
schedule_time_number_s: {
|
||||
min: aggregations.min_interval_time.value,
|
||||
avg: aggregations.avg_interval_time.value,
|
||||
max: aggregations.max_interval_time.value,
|
||||
},
|
||||
connectors_per_alert: {
|
||||
min: aggregations.min_actions_count.value,
|
||||
avg: aggregations.avg_actions_count.value,
|
||||
max: aggregations.max_actions_count.value,
|
||||
},
|
||||
count_rules_namespaces: 0,
|
||||
};
|
||||
} catch (err) {
|
||||
logger.warn(
|
||||
`Error executing alerting telemetry task: getTotalCountAggregations - ${JSON.stringify(err)}`
|
||||
);
|
||||
return {
|
||||
count_total: 0,
|
||||
count_by_type: {},
|
||||
throttle_time: {
|
||||
min: '0s',
|
||||
avg: '0s',
|
||||
max: '0s',
|
||||
},
|
||||
schedule_time: {
|
||||
min: '0s',
|
||||
avg: '0s',
|
||||
max: '0s',
|
||||
},
|
||||
throttle_time_number_s: {
|
||||
min: 0,
|
||||
avg: 0,
|
||||
max: 0,
|
||||
},
|
||||
schedule_time_number_s: {
|
||||
min: 0,
|
||||
avg: 0,
|
||||
max: 0,
|
||||
},
|
||||
connectors_per_alert: {
|
||||
min: 0,
|
||||
avg: 0,
|
||||
max: 0,
|
||||
},
|
||||
count_rules_namespaces: 0,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export async function getTotalCountInUse(
|
||||
esClient: ElasticsearchClient,
|
||||
kibanaIndex: string,
|
||||
logger: Logger
|
||||
) {
|
||||
try {
|
||||
const searchResult = await esClient.search({
|
||||
index: kibanaIndex,
|
||||
size: 0,
|
||||
body: {
|
||||
query: {
|
||||
bool: {
|
||||
filter: [{ term: { type: 'alert' } }, { term: { 'alert.enabled': true } }],
|
||||
},
|
||||
},
|
||||
aggs: {
|
||||
byRuleTypeId: ruleTypeMetric,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const aggregations = searchResult.aggregations as {
|
||||
byRuleTypeId: {
|
||||
value: { ruleTypes: Record<string, string>; namespaces: Record<string, string> };
|
||||
};
|
||||
};
|
||||
|
||||
return {
|
||||
countTotal: Object.keys(aggregations.byRuleTypeId.value.ruleTypes).reduce(
|
||||
(total: number, key: string) =>
|
||||
parseInt(aggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
||||
0
|
||||
),
|
||||
countByType: replaceDotSymbolsInRuleTypeIds(aggregations.byRuleTypeId.value.ruleTypes),
|
||||
countNamespaces: Object.keys(aggregations.byRuleTypeId.value.namespaces).length,
|
||||
};
|
||||
} catch (err) {
|
||||
logger.warn(
|
||||
`Error executing alerting telemetry task: getTotalCountInUse - ${JSON.stringify(err)}`
|
||||
);
|
||||
return {
|
||||
countTotal: 0,
|
||||
countByType: {},
|
||||
countNamespaces: 0,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export async function getExecutionsPerDayCount(
|
||||
esClient: ElasticsearchClient,
|
||||
eventLogIndex: string,
|
||||
logger: Logger
|
||||
) {
|
||||
try {
|
||||
const searchResult = await esClient.search({
|
||||
index: eventLogIndex,
|
||||
size: 0,
|
||||
body: {
|
||||
query: {
|
||||
bool: {
|
||||
filter: {
|
||||
bool: {
|
||||
must: [
|
||||
{
|
||||
term: { 'event.action': 'execute' },
|
||||
},
|
||||
{
|
||||
term: { 'event.provider': 'alerting' },
|
||||
},
|
||||
{
|
||||
range: {
|
||||
'@timestamp': {
|
||||
gte: 'now-1d',
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
aggs: {
|
||||
byRuleTypeId: ruleTypeExecutionsWithDurationMetric,
|
||||
failuresByReason: ruleTypeFailureExecutionsMetric,
|
||||
avgDuration: { avg: { field: 'event.duration' } },
|
||||
avgEsSearchDuration: {
|
||||
avg: { field: 'kibana.alert.rule.execution.metrics.es_search_duration_ms' },
|
||||
},
|
||||
avgTotalSearchDuration: {
|
||||
avg: { field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms' },
|
||||
},
|
||||
percentileScheduledActions: generatedActionsPercentilesAgg,
|
||||
percentileAlerts: alertsPercentilesAgg,
|
||||
aggsByType: {
|
||||
terms: {
|
||||
field: 'rule.category',
|
||||
size: NUM_ALERTING_RULE_TYPES,
|
||||
},
|
||||
aggs: {
|
||||
percentileScheduledActions: generatedActionsPercentilesAgg,
|
||||
percentileAlerts: alertsPercentilesAgg,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const executionsAggregations = searchResult.aggregations as {
|
||||
byRuleTypeId: {
|
||||
value: {
|
||||
ruleTypes: Record<string, string>;
|
||||
ruleTypesDuration: Record<string, number>;
|
||||
ruleTypesEsSearchDuration: Record<string, number>;
|
||||
ruleTypesTotalSearchDuration: Record<string, number>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
const aggsAvgExecutionTime = Math.round(
|
||||
// @ts-expect-error aggegation type is not specified
|
||||
// convert nanoseconds to milliseconds
|
||||
searchResult.aggregations.avgDuration.value / (1000 * 1000)
|
||||
);
|
||||
|
||||
const aggsAvgEsSearchDuration = Math.round(
|
||||
// @ts-expect-error aggegation type is not specified
|
||||
searchResult.aggregations.avgEsSearchDuration.value
|
||||
);
|
||||
const aggsAvgTotalSearchDuration = Math.round(
|
||||
// @ts-expect-error aggegation type is not specified
|
||||
searchResult.aggregations.avgTotalSearchDuration.value
|
||||
);
|
||||
|
||||
const aggsGeneratedActionsPercentiles =
|
||||
// @ts-expect-error aggegation type is not specified
|
||||
searchResult.aggregations.percentileScheduledActions.values;
|
||||
|
||||
const aggsAlertsPercentiles =
|
||||
// @ts-expect-error aggegation type is not specified
|
||||
searchResult.aggregations.percentileAlerts.values;
|
||||
|
||||
const aggsByTypeBuckets =
|
||||
// @ts-expect-error aggegation type is not specified
|
||||
searchResult.aggregations.aggsByType.buckets;
|
||||
|
||||
const executionFailuresAggregations = searchResult.aggregations as {
|
||||
failuresByReason: { value: { reasons: Record<string, Record<string, string>> } };
|
||||
};
|
||||
|
||||
return {
|
||||
countTotal: Object.keys(executionsAggregations.byRuleTypeId.value.ruleTypes).reduce(
|
||||
(total: number, key: string) =>
|
||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
||||
0
|
||||
),
|
||||
countByType: replaceDotSymbolsInRuleTypeIds(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
||||
),
|
||||
countTotalFailures: Object.keys(
|
||||
executionFailuresAggregations.failuresByReason.value.reasons
|
||||
).reduce((total: number, reason: string) => {
|
||||
const byRuleTypesRefs =
|
||||
executionFailuresAggregations.failuresByReason.value.reasons[reason];
|
||||
const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce(
|
||||
(totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10),
|
||||
0
|
||||
);
|
||||
return countByRuleTypes + total;
|
||||
}, 0),
|
||||
countFailuresByReason: Object.keys(
|
||||
executionFailuresAggregations.failuresByReason.value.reasons
|
||||
).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, reason: string) => {
|
||||
const byRuleTypesRefs =
|
||||
executionFailuresAggregations.failuresByReason.value.reasons[reason];
|
||||
const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce(
|
||||
(totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10),
|
||||
0
|
||||
);
|
||||
return {
|
||||
...obj,
|
||||
[replaceDotSymbols(reason)]: countByRuleTypes,
|
||||
};
|
||||
},
|
||||
{}
|
||||
),
|
||||
countFailuresByReasonByType: Object.keys(
|
||||
executionFailuresAggregations.failuresByReason.value.reasons
|
||||
).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[key]: replaceDotSymbolsInRuleTypeIds(
|
||||
executionFailuresAggregations.failuresByReason.value.reasons[key]
|
||||
),
|
||||
}),
|
||||
{}
|
||||
),
|
||||
avgExecutionTime: aggsAvgExecutionTime,
|
||||
avgExecutionTimeByType: Object.keys(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
||||
).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[replaceDotSymbols(key)]: Math.round(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypesDuration[key] /
|
||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10)
|
||||
),
|
||||
}),
|
||||
{}
|
||||
),
|
||||
avgEsSearchDuration: aggsAvgEsSearchDuration,
|
||||
avgEsSearchDurationByType: Object.keys(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
||||
).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[replaceDotSymbols(key)]: Math.round(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypesEsSearchDuration[key] /
|
||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10)
|
||||
),
|
||||
}),
|
||||
{}
|
||||
),
|
||||
avgTotalSearchDuration: aggsAvgTotalSearchDuration,
|
||||
avgTotalSearchDurationByType: Object.keys(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
||||
).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[replaceDotSymbols(key)]: Math.round(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypesTotalSearchDuration[key] /
|
||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10)
|
||||
),
|
||||
}),
|
||||
{}
|
||||
),
|
||||
generatedActionsPercentiles: Object.keys(aggsGeneratedActionsPercentiles).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(acc: any, curr: string) => ({
|
||||
...acc,
|
||||
...(percentileFieldNameMapping[curr]
|
||||
? { [percentileFieldNameMapping[curr]]: aggsGeneratedActionsPercentiles[curr] }
|
||||
: {}),
|
||||
}),
|
||||
{}
|
||||
),
|
||||
generatedActionsPercentilesByType: parsePercentileAggsByRuleType(
|
||||
aggsByTypeBuckets,
|
||||
'percentileScheduledActions.values'
|
||||
),
|
||||
alertsPercentiles: Object.keys(aggsAlertsPercentiles).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(acc: any, curr: string) => ({
|
||||
...acc,
|
||||
...(percentileFieldNameMapping[curr]
|
||||
? { [percentileFieldNameMapping[curr]]: aggsAlertsPercentiles[curr] }
|
||||
: {}),
|
||||
}),
|
||||
{}
|
||||
),
|
||||
alertsPercentilesByType: parsePercentileAggsByRuleType(
|
||||
aggsByTypeBuckets,
|
||||
'percentileAlerts.values'
|
||||
),
|
||||
};
|
||||
} catch (err) {
|
||||
logger.warn(
|
||||
`Error executing alerting telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}`
|
||||
);
|
||||
return {
|
||||
countTotal: 0,
|
||||
countByType: {},
|
||||
countTotalFailures: 0,
|
||||
countFailuresByReason: {},
|
||||
countFailuresByReasonByType: {},
|
||||
avgExecutionTime: 0,
|
||||
avgExecutionTimeByType: {},
|
||||
avgEsSearchDuration: 0,
|
||||
avgEsSearchDurationByType: {},
|
||||
avgTotalSearchDuration: 0,
|
||||
avgTotalSearchDurationByType: {},
|
||||
generatedActionsPercentiles: {},
|
||||
generatedActionsPercentilesByType: {},
|
||||
alertsPercentiles: {},
|
||||
alertsPercentilesByType: {},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export async function getExecutionTimeoutsPerDayCount(
|
||||
esClient: ElasticsearchClient,
|
||||
eventLogIndex: string,
|
||||
logger: Logger
|
||||
) {
|
||||
try {
|
||||
const searchResult = await esClient.search({
|
||||
index: eventLogIndex,
|
||||
size: 0,
|
||||
body: {
|
||||
query: {
|
||||
bool: {
|
||||
filter: {
|
||||
bool: {
|
||||
must: [
|
||||
{
|
||||
term: { 'event.action': 'execute-timeout' },
|
||||
},
|
||||
{
|
||||
term: { 'event.provider': 'alerting' },
|
||||
},
|
||||
{
|
||||
range: {
|
||||
'@timestamp': {
|
||||
gte: 'now-1d',
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
aggs: {
|
||||
byRuleTypeId: ruleTypeExecutionsMetric,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const executionsAggregations = searchResult.aggregations as {
|
||||
byRuleTypeId: {
|
||||
value: { ruleTypes: Record<string, string>; ruleTypesDuration: Record<string, number> };
|
||||
};
|
||||
};
|
||||
|
||||
return {
|
||||
countTotal: Object.keys(executionsAggregations.byRuleTypeId.value.ruleTypes).reduce(
|
||||
(total: number, key: string) =>
|
||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
||||
0
|
||||
),
|
||||
countByType: replaceDotSymbolsInRuleTypeIds(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
||||
),
|
||||
};
|
||||
} catch (err) {
|
||||
logger.warn(
|
||||
`Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ${JSON.stringify(
|
||||
err
|
||||
)}`
|
||||
);
|
||||
return {
|
||||
countTotal: 0,
|
||||
countByType: {},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export async function getFailedAndUnrecognizedTasksPerDay(
|
||||
esClient: ElasticsearchClient,
|
||||
taskManagerIndex: string,
|
||||
logger: Logger
|
||||
) {
|
||||
try {
|
||||
const searchResult = await esClient.search({
|
||||
index: taskManagerIndex,
|
||||
size: 0,
|
||||
body: {
|
||||
query: {
|
||||
bool: {
|
||||
must: [
|
||||
{
|
||||
bool: {
|
||||
should: [
|
||||
{
|
||||
term: {
|
||||
'task.status': 'unrecognized',
|
||||
},
|
||||
},
|
||||
{
|
||||
term: {
|
||||
'task.status': 'failed',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
{
|
||||
wildcard: {
|
||||
'task.taskType': {
|
||||
value: 'alerting:*',
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
range: {
|
||||
'task.runAt': {
|
||||
gte: 'now-1d',
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
aggs: {
|
||||
byTaskTypeId: taskTypeExecutionsMetric,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const executionsAggregations = searchResult.aggregations as {
|
||||
byTaskTypeId: { value: { statuses: Record<string, Record<string, string>> } };
|
||||
};
|
||||
|
||||
return {
|
||||
countTotal: Object.keys(executionsAggregations.byTaskTypeId.value.statuses).reduce(
|
||||
(total: number, status: string) => {
|
||||
const byRuleTypesRefs = executionsAggregations.byTaskTypeId.value.statuses[status];
|
||||
const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce(
|
||||
(totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10),
|
||||
0
|
||||
);
|
||||
return countByRuleTypes + total;
|
||||
},
|
||||
0
|
||||
),
|
||||
countByStatus: Object.keys(executionsAggregations.byTaskTypeId.value.statuses).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, status: string) => {
|
||||
const byRuleTypesRefs = executionsAggregations.byTaskTypeId.value.statuses[status];
|
||||
const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce(
|
||||
(totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10),
|
||||
0
|
||||
);
|
||||
return {
|
||||
...obj,
|
||||
[status]: countByRuleTypes,
|
||||
};
|
||||
},
|
||||
{}
|
||||
),
|
||||
countByStatusByRuleType: Object.keys(
|
||||
executionsAggregations.byTaskTypeId.value.statuses
|
||||
).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[key]: replaceDotSymbolsInRuleTypeIds(
|
||||
executionsAggregations.byTaskTypeId.value.statuses[key]
|
||||
),
|
||||
}),
|
||||
{}
|
||||
),
|
||||
};
|
||||
} catch (err) {
|
||||
logger.warn(
|
||||
`Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify(
|
||||
err
|
||||
)}`
|
||||
);
|
||||
return {
|
||||
countTotal: 0,
|
||||
countByStatus: {},
|
||||
countByStatusByRuleType: {},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
function replaceDotSymbols(strToReplace: string) {
|
||||
return strToReplace.replaceAll('.', '__');
|
||||
}
|
||||
|
||||
function replaceDotSymbolsInRuleTypeIds(ruleTypeIdObj: Record<string, string>) {
|
||||
return Object.keys(ruleTypeIdObj).reduce(
|
||||
(obj, key) => ({ ...obj, [replaceDotSymbols(key)]: ruleTypeIdObj[key] }),
|
||||
{}
|
||||
);
|
||||
}
|
||||
|
||||
export function parsePercentileAggsByRuleType(
|
||||
aggsByType: estypes.AggregationsStringTermsBucketKeys[],
|
||||
path: string
|
||||
) {
|
||||
return (aggsByType ?? []).reduce(
|
||||
(acc, curr) => {
|
||||
const percentiles = get(curr, path, {});
|
||||
return merge(
|
||||
acc,
|
||||
Object.keys(percentiles).reduce((pacc, pcurr) => {
|
||||
return {
|
||||
...pacc,
|
||||
...(percentileFieldNameMapping[pcurr]
|
||||
? {
|
||||
[percentileFieldNameMapping[pcurr]]: {
|
||||
[replaceDotSymbols(curr.key)]: percentiles[pcurr] ?? 0,
|
||||
},
|
||||
}
|
||||
: {}),
|
||||
};
|
||||
}, {})
|
||||
);
|
||||
},
|
||||
{ p50: {}, p90: {}, p99: {} }
|
||||
);
|
||||
}
|
|
@ -68,6 +68,8 @@ const byReasonSchema: MakeSchemaFrom<AlertingUsage>['count_rules_executions_fail
|
|||
unknown: { type: 'long' },
|
||||
};
|
||||
|
||||
export const NUM_ALERTING_EXECUTION_FAILURE_REASON_TYPES = Object.keys(byReasonSchema).length;
|
||||
|
||||
const byPercentileSchema: MakeSchemaFrom<AlertingUsage>['percentile_num_generated_actions_per_day'] =
|
||||
{
|
||||
p50: { type: 'long' },
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,583 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import { flatMap, merge } from 'lodash';
|
||||
import type {
|
||||
AggregationsKeyedPercentiles,
|
||||
AggregationsSingleBucketAggregateBase,
|
||||
AggregationsPercentilesAggregateBase,
|
||||
AggregationsSingleMetricAggregateBase,
|
||||
AggregationsTermsAggregateBase,
|
||||
AggregationsStringTermsBucketKeys,
|
||||
AggregationsBuckets,
|
||||
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import { ElasticsearchClient, Logger } from '@kbn/core/server';
|
||||
import {
|
||||
NUM_ALERTING_RULE_TYPES,
|
||||
NUM_ALERTING_EXECUTION_FAILURE_REASON_TYPES,
|
||||
} from '../alerting_usage_collector';
|
||||
import { replaceDotSymbols } from './replace_dots_with_underscores';
|
||||
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
|
||||
|
||||
// event.duration is recorded in nanoseconds; telemetry reports milliseconds.
const Millis2Nanos = 1000 * 1000;

// Maps percentile keys as returned by ES percentiles aggregations ('50.0', …)
// to the field names used in the telemetry payload ('p50', …).
const percentileFieldNameMapping: Record<string, string> = {
  '50.0': 'p50',
  '90.0': 'p90',
  '99.0': 'p99',
};
|
||||
|
||||
// Common dependencies for the per-day event-log telemetry queries.
interface Opts {
  esClient: ElasticsearchClient;
  eventLogIndex: string;
  logger: Logger;
}

// Result shape of getExecutionsPerDayCount. "ByType"/"ByReason" maps are keyed
// by rule type id / failure reason with '.' replaced by '__'.
interface GetExecutionsPerDayCountResults {
  countTotalRuleExecutions: number;
  countRuleExecutionsByType: Record<string, number>;
  countTotalFailedExecutions: number;
  countFailedExecutionsByReason: Record<string, number>;
  countFailedExecutionsByReasonByType: Record<string, Record<string, number>>;
  avgExecutionTime: number;
  avgExecutionTimeByType: Record<string, number>;
  avgEsSearchDuration: number;
  avgEsSearchDurationByType: Record<string, number>;
  avgTotalSearchDuration: number;
  avgTotalSearchDurationByType: Record<string, number>;
  generatedActionsPercentiles: Record<string, number>;
  generatedActionsPercentilesByType: Record<string, Record<string, number>>;
  alertsPercentiles: Record<string, number>;
  alertsPercentilesByType: Record<string, Record<string, number>>;
}

// Result shape of getExecutionTimeoutsPerDayCount.
interface GetExecutionTimeoutsPerDayCountResults {
  countExecutionTimeouts: number;
  countExecutionTimeoutsByType: Record<string, number>;
}

// `execution_failures` filter sub-aggregation with a nested terms agg on reason.
interface GetExecutionCountsExecutionFailures extends AggregationsSingleBucketAggregateBase {
  by_reason: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
}
// One `by_rule_type_id` terms bucket with its metric/percentile sub-aggregations.
interface GetExecutionCountsAggregationBucket extends AggregationsStringTermsBucketKeys {
  avg_execution_time: AggregationsSingleMetricAggregateBase;
  avg_es_search_duration: AggregationsSingleMetricAggregateBase;
  avg_total_search_duration: AggregationsSingleMetricAggregateBase;
  execution_failures: GetExecutionCountsExecutionFailures;
  percentile_scheduled_actions: AggregationsPercentilesAggregateBase;
  percentile_alerts: AggregationsPercentilesAggregateBase;
}

// Top-level `execution_failures` aggregation (same shape as the per-bucket one).
interface IGetExecutionFailures extends AggregationsSingleBucketAggregateBase {
  by_reason: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
}
||||
|
||||
/**
 * Queries the event log for rule executions over the last day using plain
 * terms/avg/percentiles aggregations (replacing the earlier scripted metric
 * aggs) and derives execution counts, failure counts, duration averages and
 * percentile stats, overall and per rule type.
 * Returns zeroed defaults and logs a tagged warning on query failure.
 */
export async function getExecutionsPerDayCount({
  esClient,
  eventLogIndex,
  logger,
}: Opts): Promise<GetExecutionsPerDayCountResults> {
  try {
    // Metric sub-aggregations applied both at the top level (overall stats)
    // and inside the by-rule-type terms agg (per-type stats).
    const eventLogAggs = {
      avg_execution_time: {
        avg: {
          field: 'event.duration',
        },
      },
      avg_es_search_duration: {
        avg: {
          field: 'kibana.alert.rule.execution.metrics.es_search_duration_ms',
        },
      },
      avg_total_search_duration: {
        avg: {
          field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms',
        },
      },

      percentile_scheduled_actions: {
        percentiles: {
          field: 'kibana.alert.rule.execution.metrics.number_of_generated_actions',
          percents: [50, 90, 99],
        },
      },
      percentile_alerts: {
        percentiles: {
          field: 'kibana.alert.rule.execution.metrics.alert_counts.active',
          percents: [50, 90, 99],
        },
      },
      // Failed executions only, broken down by failure reason.
      execution_failures: {
        filter: {
          term: {
            'event.outcome': 'failure',
          },
        },
        aggs: {
          by_reason: {
            terms: {
              field: 'event.reason',
              size: NUM_ALERTING_EXECUTION_FAILURE_REASON_TYPES,
            },
          },
        },
      },
    };

    const query = {
      index: eventLogIndex,
      size: 0,
      body: {
        // Shared filter: alerting 'execute' events from the last day.
        query: getProviderAndActionFilterForTimeRange('execute'),
        aggs: {
          ...eventLogAggs,
          by_rule_type_id: {
            terms: {
              field: 'rule.category',
              size: NUM_ALERTING_RULE_TYPES,
            },
            aggs: eventLogAggs,
          },
        },
      },
    };

    logger.debug(`query for getExecutionsPerDayCount - ${JSON.stringify(query)}`);
    const results = await esClient.search(query);

    logger.debug(`results for getExecutionsPerDayCount query - ${JSON.stringify(results)}`);

    // hits.total may be a bare number or a { value, relation } object
    // depending on client/track_total_hits settings.
    const totalRuleExecutions =
      typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;

    const aggregations = results.aggregations as {
      by_rule_type_id: AggregationsTermsAggregateBase<GetExecutionCountsAggregationBucket>;
      execution_failures: IGetExecutionFailures;
      percentile_scheduled_actions: AggregationsPercentilesAggregateBase;
      percentile_alerts: AggregationsPercentilesAggregateBase;
      avg_execution_time: AggregationsSingleMetricAggregateBase;
      avg_es_search_duration: AggregationsSingleMetricAggregateBase;
      avg_total_search_duration: AggregationsSingleMetricAggregateBase;
    };

    const aggregationsByRuleTypeId: AggregationsBuckets<GetExecutionCountsAggregationBucket> =
      aggregations.by_rule_type_id.buckets as GetExecutionCountsAggregationBucket[];

    // Each parse* helper extracts one slice of the result shape; totals come
    // from the hit count rather than an aggregation.
    return {
      ...parseRuleTypeBucket(aggregationsByRuleTypeId),
      ...parseExecutionFailureByRuleType(aggregationsByRuleTypeId),
      ...parseExecutionCountAggregationResults(aggregations),
      countTotalRuleExecutions: totalRuleExecutions ?? 0,
    };
  } catch (err) {
    // Never fail the telemetry task; degrade to zeroed stats and tag the
    // warning so failures are searchable in logs.
    logger.warn(
      `Error executing alerting telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err.stack },
      }
    );
    return {
      countTotalRuleExecutions: 0,
      countRuleExecutionsByType: {},
      countTotalFailedExecutions: 0,
      countFailedExecutionsByReason: {},
      countFailedExecutionsByReasonByType: {},
      avgExecutionTime: 0,
      avgExecutionTimeByType: {},
      avgEsSearchDuration: 0,
      avgEsSearchDurationByType: {},
      avgTotalSearchDuration: 0,
      avgTotalSearchDurationByType: {},
      generatedActionsPercentiles: {},
      generatedActionsPercentilesByType: {},
      alertsPercentiles: {},
      alertsPercentilesByType: {},
    };
  }
}
|
||||
|
||||
export async function getExecutionTimeoutsPerDayCount({
|
||||
esClient,
|
||||
eventLogIndex,
|
||||
logger,
|
||||
}: Opts): Promise<GetExecutionTimeoutsPerDayCountResults> {
|
||||
try {
|
||||
const query = {
|
||||
index: eventLogIndex,
|
||||
size: 0,
|
||||
body: {
|
||||
query: getProviderAndActionFilterForTimeRange('execute-timeout'),
|
||||
aggs: {
|
||||
by_rule_type_id: {
|
||||
terms: {
|
||||
field: 'rule.category',
|
||||
size: NUM_ALERTING_RULE_TYPES,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
logger.debug(`query for getExecutionTimeoutsPerDayCount - ${JSON.stringify(query)}`);
|
||||
const results = await esClient.search(query);
|
||||
|
||||
logger.debug(`results for getExecutionTimeoutsPerDayCount query - ${JSON.stringify(results)}`);
|
||||
|
||||
const aggregations = results.aggregations as {
|
||||
by_rule_type_id: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
|
||||
};
|
||||
|
||||
const totalTimedoutExecutionsCount =
|
||||
typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;
|
||||
|
||||
return {
|
||||
countExecutionTimeouts: totalTimedoutExecutionsCount ?? 0,
|
||||
countExecutionTimeoutsByType: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
|
||||
};
|
||||
} catch (err) {
|
||||
logger.warn(
|
||||
`Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ${JSON.stringify(
|
||||
err
|
||||
)}`,
|
||||
{
|
||||
tags: ['alerting', 'telemetry-failed'],
|
||||
error: { stack_trace: err.stack },
|
||||
}
|
||||
);
|
||||
return {
|
||||
countExecutionTimeouts: 0,
|
||||
countExecutionTimeoutsByType: {},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Bucket format:
|
||||
* {
|
||||
* key: '.index-threshold', // rule type id
|
||||
* doc_count: 78, // count of number of executions
|
||||
* avg_es_search_duration: { // average es search duration across executions
|
||||
* value: 40.76056338028169,
|
||||
* },
|
||||
* percentile_alerts: { // stats for number of alerts created across executions
|
||||
* values: {
|
||||
* '50.0': 1,
|
||||
* '95.0': 1,
|
||||
* '99.0': 1,
|
||||
* },
|
||||
* },
|
||||
* execution_failures: {
|
||||
* doc_count: 7, // count of number of failed executions
|
||||
* by_reason: {
|
||||
* doc_count_error_upper_bound: 0,
|
||||
* sum_other_doc_count: 0,
|
||||
* buckets: [
|
||||
* {
|
||||
* key: 'execute', // breakdown of reason for execution failures
|
||||
* doc_count: 4,
|
||||
* },
|
||||
* {
|
||||
* key: 'decrypt',
|
||||
* doc_count: 3,
|
||||
* },
|
||||
* ],
|
||||
* },
|
||||
* },
|
||||
* percentile_scheduled_actions: { // stats for number of actions generated across executions
|
||||
* values: {
|
||||
* '50.0': 0,
|
||||
* '95.0': 0,
|
||||
* '99.0': 0,
|
||||
* },
|
||||
* },
|
||||
* avg_execution_time: { // average execution time in nanoseconds across executions
|
||||
* value: 100576923.07692307,
|
||||
* },
|
||||
* avg_total_search_duration: { // average total search duration across executions
|
||||
* value: 43.74647887323944,
|
||||
* },
|
||||
* }
|
||||
*/
|
||||
|
||||
export function parseRuleTypeBucket(
|
||||
buckets: GetExecutionCountsAggregationBucket[]
|
||||
): Pick<
|
||||
GetExecutionsPerDayCountResults,
|
||||
| 'countRuleExecutionsByType'
|
||||
| 'avgExecutionTimeByType'
|
||||
| 'avgEsSearchDurationByType'
|
||||
| 'avgTotalSearchDurationByType'
|
||||
| 'generatedActionsPercentilesByType'
|
||||
| 'alertsPercentilesByType'
|
||||
> {
|
||||
let summary = {
|
||||
countRuleExecutionsByType: {},
|
||||
avgExecutionTimeByType: {},
|
||||
avgEsSearchDurationByType: {},
|
||||
avgTotalSearchDurationByType: {},
|
||||
generatedActionsPercentilesByType: { p50: {}, p90: {}, p99: {} },
|
||||
alertsPercentilesByType: { p50: {}, p90: {}, p99: {} },
|
||||
};
|
||||
for (const bucket of buckets ?? []) {
|
||||
const ruleType: string = replaceDotSymbols(bucket?.key) ?? '';
|
||||
const numExecutions: number = bucket?.doc_count ?? 0;
|
||||
const avgExecutionTimeNanos = bucket?.avg_execution_time?.value ?? 0;
|
||||
const avgEsSearchTimeMillis = bucket?.avg_es_search_duration?.value ?? 0;
|
||||
const avgTotalSearchTimeMillis = bucket?.avg_total_search_duration?.value ?? 0;
|
||||
const actionPercentiles = bucket?.percentile_scheduled_actions?.values ?? {};
|
||||
const alertPercentiles = bucket?.percentile_alerts?.values ?? {};
|
||||
|
||||
summary = {
|
||||
countRuleExecutionsByType: {
|
||||
...summary.countRuleExecutionsByType,
|
||||
[ruleType]: numExecutions,
|
||||
},
|
||||
avgExecutionTimeByType: {
|
||||
...summary.avgExecutionTimeByType,
|
||||
[ruleType]: Math.round(avgExecutionTimeNanos / Millis2Nanos),
|
||||
},
|
||||
avgEsSearchDurationByType: {
|
||||
...summary.avgEsSearchDurationByType,
|
||||
[ruleType]: Math.round(avgEsSearchTimeMillis),
|
||||
},
|
||||
avgTotalSearchDurationByType: {
|
||||
...summary.avgTotalSearchDurationByType,
|
||||
[ruleType]: Math.round(avgTotalSearchTimeMillis),
|
||||
},
|
||||
generatedActionsPercentilesByType: merge(
|
||||
summary.generatedActionsPercentilesByType,
|
||||
parsePercentileAggs(actionPercentiles as AggregationsKeyedPercentiles, ruleType)
|
||||
),
|
||||
alertsPercentilesByType: merge(
|
||||
summary.alertsPercentilesByType,
|
||||
parsePercentileAggs(alertPercentiles as AggregationsKeyedPercentiles, ruleType)
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
return summary;
|
||||
}
|
||||
|
||||
// A failure-reason sub-bucket flattened out of its parent rule type bucket,
// tagged with the (dot-sanitized) rule type id it came from.
interface FlattenedExecutionFailureBucket {
  ruleType: string;
  // Failure reason, e.g. 'execute' or 'decrypt' (the sub-bucket's terms key)
  key: string;
  // Number of failed executions for this (ruleType, reason) pair
  doc_count: number;
}
|
||||
|
||||
export function parseExecutionFailureByRuleType(
|
||||
buckets: GetExecutionCountsAggregationBucket[]
|
||||
): Pick<GetExecutionsPerDayCountResults, 'countFailedExecutionsByReasonByType'> {
|
||||
const executionFailuresWithRuleTypeBuckets: FlattenedExecutionFailureBucket[] = flatMap(
|
||||
buckets ?? [],
|
||||
(bucket) => {
|
||||
const ruleType: string = replaceDotSymbols(bucket.key);
|
||||
|
||||
/**
|
||||
* Execution failure bucket format
|
||||
* [
|
||||
* {
|
||||
* key: 'execute',
|
||||
* doc_count: 4,
|
||||
* },
|
||||
* {
|
||||
* key: 'decrypt',
|
||||
* doc_count: 3,
|
||||
* },
|
||||
* ]
|
||||
*/
|
||||
|
||||
const executionFailuresBuckets = bucket?.execution_failures?.by_reason
|
||||
?.buckets as AggregationsStringTermsBucketKeys[];
|
||||
return (executionFailuresBuckets ?? []).map((b) => ({ ...b, ruleType }));
|
||||
}
|
||||
);
|
||||
|
||||
const parsedFailures = (executionFailuresWithRuleTypeBuckets ?? []).reduce(
|
||||
(acc: Record<string, Record<string, number>>, bucket: FlattenedExecutionFailureBucket) => {
|
||||
const ruleType: string = bucket.ruleType;
|
||||
const reason: string = bucket.key;
|
||||
|
||||
if (acc[reason]) {
|
||||
if (acc[reason][ruleType]) {
|
||||
return {
|
||||
...acc,
|
||||
[reason]: {
|
||||
...acc[reason],
|
||||
[ruleType]: acc[reason][ruleType] + bucket.doc_count,
|
||||
},
|
||||
};
|
||||
}
|
||||
return {
|
||||
...acc,
|
||||
[reason]: {
|
||||
...acc[reason],
|
||||
[ruleType]: bucket.doc_count,
|
||||
},
|
||||
};
|
||||
}
|
||||
return {
|
||||
...acc,
|
||||
[reason]: {
|
||||
[ruleType]: bucket.doc_count,
|
||||
},
|
||||
};
|
||||
},
|
||||
{}
|
||||
);
|
||||
|
||||
return {
|
||||
countFailedExecutionsByReasonByType: parsedFailures,
|
||||
};
|
||||
}
|
||||
|
||||
export function parsePercentileAggs(
|
||||
percentiles: AggregationsKeyedPercentiles,
|
||||
ruleTypeId?: string
|
||||
) {
|
||||
return Object.keys(percentiles ?? {}).reduce((acc, percentileKey: string) => {
|
||||
let result = {};
|
||||
const percentileKeyMapped = percentileFieldNameMapping[percentileKey];
|
||||
if (percentileKeyMapped) {
|
||||
if (ruleTypeId) {
|
||||
result = {
|
||||
[percentileKeyMapped]: {
|
||||
[ruleTypeId]: percentiles[percentileKey] ?? 0,
|
||||
},
|
||||
};
|
||||
} else {
|
||||
result = {
|
||||
[percentileKeyMapped]: percentiles[percentileKey] ?? 0,
|
||||
};
|
||||
}
|
||||
}
|
||||
return {
|
||||
...acc,
|
||||
...result,
|
||||
};
|
||||
}, {});
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregation Result Format (minus rule type id agg buckets)
|
||||
* {
|
||||
* avg_es_search_duration: {
|
||||
* value: 26.246376811594203,
|
||||
* },
|
||||
* percentile_alerts: {
|
||||
* values: {
|
||||
* '50.0': 1,
|
||||
* '90.0': 5,
|
||||
* '99.0': 5,
|
||||
* },
|
||||
* },
|
||||
* execution_failures: {
|
||||
* doc_count: 10,
|
||||
* by_reason: {
|
||||
* doc_count_error_upper_bound: 0,
|
||||
* sum_other_doc_count: 0,
|
||||
* buckets: [
|
||||
* {
|
||||
* key: 'decrypt',
|
||||
* doc_count: 6,
|
||||
* },
|
||||
* {
|
||||
* key: 'execute',
|
||||
* doc_count: 4,
|
||||
* },
|
||||
* ],
|
||||
* },
|
||||
* },
|
||||
* percentile_scheduled_actions: {
|
||||
* values: {
|
||||
* '50.0': 0,
|
||||
* '95.0': 5,
|
||||
* '99.0': 5,
|
||||
* },
|
||||
* },
|
||||
* avg_execution_time: {
|
||||
* value: 288250000,
|
||||
* },
|
||||
* avg_total_search_duration: {
|
||||
* value: 28.630434782608695,
|
||||
* },
|
||||
*/
|
||||
export function parseExecutionCountAggregationResults(results: {
|
||||
execution_failures: IGetExecutionFailures;
|
||||
percentile_scheduled_actions: AggregationsPercentilesAggregateBase;
|
||||
percentile_alerts: AggregationsPercentilesAggregateBase;
|
||||
avg_execution_time: AggregationsSingleMetricAggregateBase;
|
||||
avg_es_search_duration: AggregationsSingleMetricAggregateBase;
|
||||
avg_total_search_duration: AggregationsSingleMetricAggregateBase;
|
||||
}): Pick<
|
||||
GetExecutionsPerDayCountResults,
|
||||
| 'countTotalFailedExecutions'
|
||||
| 'countFailedExecutionsByReason'
|
||||
| 'avgExecutionTime'
|
||||
| 'avgEsSearchDuration'
|
||||
| 'avgTotalSearchDuration'
|
||||
| 'generatedActionsPercentiles'
|
||||
| 'alertsPercentiles'
|
||||
> {
|
||||
const avgExecutionTimeNanos = results?.avg_execution_time?.value ?? 0;
|
||||
const avgEsSearchDurationMillis = results?.avg_es_search_duration?.value ?? 0;
|
||||
const avgTotalSearchDurationMillis = results?.avg_total_search_duration?.value ?? 0;
|
||||
const executionFailuresByReasonBuckets =
|
||||
(results?.execution_failures?.by_reason?.buckets as AggregationsStringTermsBucketKeys[]) ?? [];
|
||||
const actionPercentiles = results?.percentile_scheduled_actions?.values ?? {};
|
||||
const alertPercentiles = results?.percentile_alerts?.values ?? {};
|
||||
|
||||
return {
|
||||
countTotalFailedExecutions: results?.execution_failures?.doc_count ?? 0,
|
||||
countFailedExecutionsByReason: executionFailuresByReasonBuckets.reduce(
|
||||
(acc: Record<string, number>, bucket: AggregationsStringTermsBucketKeys) => {
|
||||
const reason: string = bucket.key;
|
||||
return {
|
||||
...acc,
|
||||
[reason]: bucket.doc_count ?? 0,
|
||||
};
|
||||
},
|
||||
{}
|
||||
),
|
||||
avgExecutionTime: Math.round(avgExecutionTimeNanos / Millis2Nanos),
|
||||
avgEsSearchDuration: Math.round(avgEsSearchDurationMillis),
|
||||
avgTotalSearchDuration: Math.round(avgTotalSearchDurationMillis),
|
||||
generatedActionsPercentiles: parsePercentileAggs(
|
||||
actionPercentiles as AggregationsKeyedPercentiles
|
||||
),
|
||||
alertsPercentiles: parsePercentileAggs(alertPercentiles as AggregationsKeyedPercentiles),
|
||||
};
|
||||
}
|
||||
|
||||
function getProviderAndActionFilterForTimeRange(
|
||||
action: string,
|
||||
provider: string = 'alerting',
|
||||
range: string = '1d'
|
||||
) {
|
||||
return {
|
||||
bool: {
|
||||
filter: {
|
||||
bool: {
|
||||
must: [
|
||||
{
|
||||
term: { 'event.action': action },
|
||||
},
|
||||
{
|
||||
term: { 'event.provider': provider },
|
||||
},
|
||||
{
|
||||
range: {
|
||||
'@timestamp': {
|
||||
gte: `now-${range}`,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
}
|
|
@ -0,0 +1,249 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
|
||||
import { getTotalCountAggregations, getTotalCountInUse } from './get_telemetry_from_kibana';
|
||||
|
||||
const elasticsearch = elasticsearchServiceMock.createStart();
|
||||
const esClient = elasticsearch.client.asInternalUser;
|
||||
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
|
||||
|
||||
// Tests for the kibana-index telemetry queries: verifies happy-path parsing of
// mocked search responses and the zeroed-fallback + tagged-warning behavior
// when the query rejects.
describe('kibana index telemetry', () => {
  beforeEach(() => {
    jest.resetAllMocks();
  });

  describe('getTotalCountAggregations', () => {
    test('should return rule counts by rule type id, stats about schedule and throttle intervals and number of actions', async () => {
      // Mocked ES response: 4 rules across 3 rule types, plus the
      // min/max/avg runtime-field aggregations for throttle, interval, actions.
      esClient.search.mockResponseOnce({
        took: 4,
        timed_out: false,
        _shards: {
          total: 1,
          successful: 1,
          skipped: 0,
          failed: 0,
        },
        hits: {
          total: {
            value: 4,
            relation: 'eq',
          },
          max_score: null,
          hits: [],
        },
        aggregations: {
          by_rule_type_id: {
            doc_count_error_upper_bound: 0,
            sum_other_doc_count: 0,
            buckets: [
              {
                key: '.index-threshold',
                doc_count: 2,
              },
              {
                key: 'logs.alert.document.count',
                doc_count: 1,
              },
              {
                key: 'document.test.',
                doc_count: 1,
              },
            ],
          },
          max_throttle_time: { value: 60 },
          min_throttle_time: { value: 0 },
          avg_throttle_time: { value: 30 },
          max_interval_time: { value: 10 },
          min_interval_time: { value: 1 },
          avg_interval_time: { value: 4.5 },
          max_actions_count: { value: 4 },
          min_actions_count: { value: 0 },
          avg_actions_count: { value: 2.5 },
        },
      });

      const telemetry = await getTotalCountAggregations({
        esClient,
        kibanaIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);

      // Rule type ids have '.' replaced with '__' in telemetry keys.
      expect(telemetry).toEqual({
        connectors_per_alert: {
          avg: 2.5,
          max: 4,
          min: 0,
        },
        count_by_type: {
          '__index-threshold': 2,
          document__test__: 1,
          // eslint-disable-next-line @typescript-eslint/naming-convention
          logs__alert__document__count: 1,
        },
        count_total: 4,
        schedule_time: {
          avg: '4.5s',
          max: '10s',
          min: '1s',
        },
        schedule_time_number_s: {
          avg: 4.5,
          max: 10,
          min: 1,
        },
        throttle_time: {
          avg: '30s',
          max: '60s',
          min: '0s',
        },
        throttle_time_number_s: {
          avg: 30,
          max: 60,
          min: 0,
        },
      });
    });

    test('should return empty results and log warning if query throws error', async () => {
      esClient.search.mockRejectedValueOnce(new Error('oh no'));

      const telemetry = await getTotalCountAggregations({
        esClient,
        kibanaIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);
      // The warning carries the failure tags and a stack trace in its metadata.
      const loggerCall = logger.warn.mock.calls[0][0];
      const loggerMeta = logger.warn.mock.calls[0][1];
      expect(loggerCall as string).toMatchInlineSnapshot(
        `"Error executing alerting telemetry task: getTotalCountAggregations - {}"`
      );
      expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
      expect(loggerMeta?.error?.stack_trace).toBeDefined();
      expect(telemetry).toEqual({
        connectors_per_alert: {
          avg: 0,
          max: 0,
          min: 0,
        },
        count_by_type: {},
        count_total: 0,
        schedule_time: {
          avg: '0s',
          max: '0s',
          min: '0s',
        },
        schedule_time_number_s: {
          avg: 0,
          max: 0,
          min: 0,
        },
        throttle_time: {
          avg: '0s',
          max: '0s',
          min: '0s',
        },
        throttle_time_number_s: {
          avg: 0,
          max: 0,
          min: 0,
        },
      });
    });
  });

  describe('getTotalCountInUse', () => {
    test('should return enabled rule counts by rule type id and number of namespaces', async () => {
      esClient.search.mockResponseOnce({
        took: 4,
        timed_out: false,
        _shards: {
          total: 1,
          successful: 1,
          skipped: 0,
          failed: 0,
        },
        hits: {
          total: {
            value: 4,
            relation: 'eq',
          },
          max_score: null,
          hits: [],
        },
        aggregations: {
          namespaces_count: { value: 1 },
          by_rule_type_id: {
            doc_count_error_upper_bound: 0,
            sum_other_doc_count: 0,
            buckets: [
              {
                key: '.index-threshold',
                doc_count: 2,
              },
              {
                key: 'logs.alert.document.count',
                doc_count: 1,
              },
              {
                key: 'document.test.',
                doc_count: 1,
              },
            ],
          },
        },
      });

      const telemetry = await getTotalCountInUse({
        esClient,
        kibanaIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);

      expect(telemetry).toStrictEqual({
        countByType: {
          '__index-threshold': 2,
          document__test__: 1,
          // eslint-disable-next-line @typescript-eslint/naming-convention
          logs__alert__document__count: 1,
        },
        countNamespaces: 1,
        countTotal: 4,
      });
    });

    test('should return empty results and log warning if query throws error', async () => {
      esClient.search.mockRejectedValueOnce(new Error('oh no'));

      const telemetry = await getTotalCountInUse({
        esClient,
        kibanaIndex: 'test',
        logger,
      });

      expect(esClient.search).toHaveBeenCalledTimes(1);
      const loggerCall = logger.warn.mock.calls[0][0];
      const loggerMeta = logger.warn.mock.calls[0][1];
      expect(loggerCall as string).toMatchInlineSnapshot(
        `"Error executing alerting telemetry task: getTotalCountInUse - {}"`
      );
      expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
      expect(loggerMeta?.error?.stack_trace).toBeDefined();
      expect(telemetry).toStrictEqual({
        countByType: {},
        countNamespaces: 0,
        countTotal: 0,
      });
    });
  });
});
|
|
@ -0,0 +1,317 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import type {
|
||||
AggregationsSingleMetricAggregateBase,
|
||||
AggregationsCardinalityAggregate,
|
||||
AggregationsTermsAggregateBase,
|
||||
AggregationsStringTermsBucketKeys,
|
||||
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import { ElasticsearchClient, Logger } from '@kbn/core/server';
|
||||
import { AlertingUsage } from '../types';
|
||||
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
|
||||
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
|
||||
|
||||
// Arguments shared by the kibana-index telemetry queries in this file.
interface Opts {
  esClient: ElasticsearchClient;
  // Kibana saved objects index; rules are stored as saved objects of type 'alert'
  kibanaIndex: string;
  logger: Logger;
}

// Subset of the AlertingUsage payload produced by getTotalCountAggregations.
type GetTotalCountsResults = Pick<
  AlertingUsage,
  | 'count_total'
  | 'count_by_type'
  | 'throttle_time'
  | 'schedule_time'
  | 'throttle_time_number_s'
  | 'schedule_time_number_s'
  | 'connectors_per_alert'
>;

// Result shape for getTotalCountInUse (enabled rules only).
interface GetTotalCountInUseResults {
  countTotal: number;
  countByType: Record<string, number>;
  countNamespaces: number;
}
|
||||
|
||||
/**
 * Aggregates over all rule saved objects in the Kibana index and returns:
 * total rule count, counts by rule type id, min/avg/max of throttle and
 * schedule intervals (normalized to seconds via painless runtime fields),
 * and min/avg/max number of actions per rule.
 *
 * Never throws: on failure it logs a tagged warning and returns zeroed values.
 */
export async function getTotalCountAggregations({
  esClient,
  kibanaIndex,
  logger,
}: Opts): Promise<GetTotalCountsResults> {
  try {
    const query = {
      index: kibanaIndex,
      size: 0,
      body: {
        query: {
          bool: {
            // Aggregate over all rule saved objects
            filter: [{ term: { type: 'alert' } }],
          },
        },
        runtime_mappings: {
          // Number of actions attached to each rule (0 if none); reads _source
          // because the actions array itself is not an indexed doc value.
          rule_action_count: {
            type: 'long',
            script: {
              source: `
                def alert = params._source['alert'];
                if (alert != null) {
                  def actions = alert.actions;
                  if (actions != null) {
                    emit(actions.length);
                  } else {
                    emit(0);
                  }
                }`,
            },
          },
          // Convert schedule interval duration string from rule saved object to interval in seconds
          // NOTE(review): when parsing succeeds this script emits twice - once
          // inside the innermost if and once unconditionally at the end - which
          // produces a two-valued field and can skew the avg aggregation;
          // confirm whether the inner emit is intended.
          rule_schedule_interval: {
            type: 'long',
            script: {
              source: `
                int parsed = 0;
                if (doc['alert.schedule.interval'].size() > 0) {
                  def interval = doc['alert.schedule.interval'].value;

                  if (interval.length() > 1) {
                    // get last char
                    String timeChar = interval.substring(interval.length() - 1);
                    // remove last char
                    interval = interval.substring(0, interval.length() - 1);

                    if (interval.chars().allMatch(Character::isDigit)) {
                      // using of regex is not allowed in painless language
                      parsed = Integer.parseInt(interval);

                      if (timeChar.equals("s")) {
                        parsed = parsed;
                      } else if (timeChar.equals("m")) {
                        parsed = parsed * 60;
                      } else if (timeChar.equals("h")) {
                        parsed = parsed * 60 * 60;
                      } else if (timeChar.equals("d")) {
                        parsed = parsed * 24 * 60 * 60;
                      }
                      emit(parsed);
                    }
                  }
                }
                emit(parsed);
              `,
            },
          },
          // Convert throttle interval duration string from rule saved object to interval in seconds
          // NOTE(review): same double-emit pattern as rule_schedule_interval above.
          rule_throttle_interval: {
            type: 'long',
            script: {
              source: `
                int parsed = 0;
                if (doc['alert.throttle'].size() > 0) {
                  def throttle = doc['alert.throttle'].value;

                  if (throttle.length() > 1) {
                    // get last char
                    String timeChar = throttle.substring(throttle.length() - 1);
                    // remove last char
                    throttle = throttle.substring(0, throttle.length() - 1);

                    if (throttle.chars().allMatch(Character::isDigit)) {
                      // using of regex is not allowed in painless language
                      parsed = Integer.parseInt(throttle);

                      if (timeChar.equals("s")) {
                        parsed = parsed;
                      } else if (timeChar.equals("m")) {
                        parsed = parsed * 60;
                      } else if (timeChar.equals("h")) {
                        parsed = parsed * 60 * 60;
                      } else if (timeChar.equals("d")) {
                        parsed = parsed * 24 * 60 * 60;
                      }
                      emit(parsed);
                    }
                  }
                }
                emit(parsed);
              `,
            },
          },
        },
        aggs: {
          by_rule_type_id: {
            terms: {
              field: 'alert.alertTypeId',
              size: NUM_ALERTING_RULE_TYPES,
            },
          },
          // min/avg/max over the runtime fields defined above
          max_throttle_time: { max: { field: 'rule_throttle_interval' } },
          min_throttle_time: { min: { field: 'rule_throttle_interval' } },
          avg_throttle_time: { avg: { field: 'rule_throttle_interval' } },
          max_interval_time: { max: { field: 'rule_schedule_interval' } },
          min_interval_time: { min: { field: 'rule_schedule_interval' } },
          avg_interval_time: { avg: { field: 'rule_schedule_interval' } },
          max_actions_count: { max: { field: 'rule_action_count' } },
          min_actions_count: { min: { field: 'rule_action_count' } },
          avg_actions_count: { avg: { field: 'rule_action_count' } },
        },
      },
    };

    logger.debug(`query for getTotalCountAggregations - ${JSON.stringify(query)}`);
    const results = await esClient.search(query);

    logger.debug(`results for getTotalCountAggregations query - ${JSON.stringify(results)}`);

    const aggregations = results.aggregations as {
      by_rule_type_id: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
      max_throttle_time: AggregationsSingleMetricAggregateBase;
      min_throttle_time: AggregationsSingleMetricAggregateBase;
      avg_throttle_time: AggregationsSingleMetricAggregateBase;
      max_interval_time: AggregationsSingleMetricAggregateBase;
      min_interval_time: AggregationsSingleMetricAggregateBase;
      avg_interval_time: AggregationsSingleMetricAggregateBase;
      max_actions_count: AggregationsSingleMetricAggregateBase;
      min_actions_count: AggregationsSingleMetricAggregateBase;
      avg_actions_count: AggregationsSingleMetricAggregateBase;
    };

    // hits.total may be a number or a { value, relation } object
    const totalRulesCount =
      typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;

    return {
      count_total: totalRulesCount ?? 0,
      count_by_type: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
      // *_time fields are human-readable strings; *_time_number_s are raw seconds
      throttle_time: {
        min: `${aggregations.min_throttle_time.value ?? 0}s`,
        avg: `${aggregations.avg_throttle_time.value ?? 0}s`,
        max: `${aggregations.max_throttle_time.value ?? 0}s`,
      },
      schedule_time: {
        min: `${aggregations.min_interval_time.value ?? 0}s`,
        avg: `${aggregations.avg_interval_time.value ?? 0}s`,
        max: `${aggregations.max_interval_time.value ?? 0}s`,
      },
      throttle_time_number_s: {
        min: aggregations.min_throttle_time.value ?? 0,
        avg: aggregations.avg_throttle_time.value ?? 0,
        max: aggregations.max_throttle_time.value ?? 0,
      },
      schedule_time_number_s: {
        min: aggregations.min_interval_time.value ?? 0,
        avg: aggregations.avg_interval_time.value ?? 0,
        max: aggregations.max_interval_time.value ?? 0,
      },
      connectors_per_alert: {
        min: aggregations.min_actions_count.value ?? 0,
        avg: aggregations.avg_actions_count.value ?? 0,
        max: aggregations.max_actions_count.value ?? 0,
      },
    };
  } catch (err) {
    // Swallow the error (telemetry must not break the task) but log it with
    // the failure tags and stack trace metadata, then return zeroed values.
    logger.warn(
      `Error executing alerting telemetry task: getTotalCountAggregations - ${JSON.stringify(err)}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err.stack },
      }
    );
    return {
      count_total: 0,
      count_by_type: {},
      throttle_time: {
        min: '0s',
        avg: '0s',
        max: '0s',
      },
      schedule_time: {
        min: '0s',
        avg: '0s',
        max: '0s',
      },
      throttle_time_number_s: {
        min: 0,
        avg: 0,
        max: 0,
      },
      schedule_time_number_s: {
        min: 0,
        avg: 0,
        max: 0,
      },
      connectors_per_alert: {
        min: 0,
        avg: 0,
        max: 0,
      },
    };
  }
}
|
||||
|
||||
export async function getTotalCountInUse({
|
||||
esClient,
|
||||
kibanaIndex,
|
||||
logger,
|
||||
}: Opts): Promise<GetTotalCountInUseResults> {
|
||||
try {
|
||||
const query = {
|
||||
index: kibanaIndex,
|
||||
size: 0,
|
||||
body: {
|
||||
query: {
|
||||
bool: {
|
||||
// Aggregate over only enabled rule saved objects
|
||||
filter: [{ term: { type: 'alert' } }, { term: { 'alert.enabled': true } }],
|
||||
},
|
||||
},
|
||||
aggs: {
|
||||
namespaces_count: { cardinality: { field: 'namespaces' } },
|
||||
by_rule_type_id: {
|
||||
terms: {
|
||||
field: 'alert.alertTypeId',
|
||||
size: NUM_ALERTING_RULE_TYPES,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
logger.debug(`query for getTotalCountInUse - ${JSON.stringify(query)}`);
|
||||
const results = await esClient.search(query);
|
||||
|
||||
logger.debug(`results for getTotalCountInUse query - ${JSON.stringify(results)}`);
|
||||
|
||||
const aggregations = results.aggregations as {
|
||||
by_rule_type_id: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
|
||||
namespaces_count: AggregationsCardinalityAggregate;
|
||||
};
|
||||
|
||||
const totalEnabledRulesCount =
|
||||
typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;
|
||||
|
||||
return {
|
||||
countTotal: totalEnabledRulesCount ?? 0,
|
||||
countByType: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets),
|
||||
countNamespaces: aggregations.namespaces_count.value ?? 0,
|
||||
};
|
||||
} catch (err) {
|
||||
logger.warn(
|
||||
`Error executing alerting telemetry task: getTotalCountInUse - ${JSON.stringify(err)}`,
|
||||
{
|
||||
tags: ['alerting', 'telemetry-failed'],
|
||||
error: { stack_trace: err.stack },
|
||||
}
|
||||
);
|
||||
return {
|
||||
countTotal: 0,
|
||||
countByType: {},
|
||||
countNamespaces: 0,
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,256 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
|
||||
import {
|
||||
getFailedAndUnrecognizedTasksPerDay,
|
||||
parseBucket,
|
||||
} from './get_telemetry_from_task_manager';
|
||||
|
||||
const elasticsearch = elasticsearchServiceMock.createStart();
|
||||
const esClient = elasticsearch.client.asInternalUser;
|
||||
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
|
||||
|
||||
describe('task manager telemetry', () => {
|
||||
beforeEach(() => {
|
||||
jest.resetAllMocks();
|
||||
});
|
||||
|
||||
  // Unit tests for parseBucket: parsing the by-status / by-task-type terms
  // aggregation into the failed/unrecognized task telemetry maps, including
  // tolerance of missing doc_count, missing sub-buckets, empty, and undefined input.
  describe('parseBucket', () => {
    test('should correctly parse aggregation bucket results', () => {
      expect(
        parseBucket([
          {
            key: 'failed',
            doc_count: 36,
            by_task_type: {
              doc_count_error_upper_bound: 0,
              sum_other_doc_count: 0,
              buckets: [
                {
                  key: 'alerting:.index-threshold',
                  doc_count: 4,
                },
                {
                  key: 'alerting:document.test.',
                  doc_count: 32,
                },
              ],
            },
          },
          {
            key: 'unrecognized',
            doc_count: 4,
            by_task_type: {
              doc_count_error_upper_bound: 0,
              sum_other_doc_count: 0,
              buckets: [
                {
                  key: 'alerting:logs.alert.document.count',
                  doc_count: 4,
                },
              ],
            },
          },
        ])
      ).toEqual({
        countFailedAndUnrecognizedTasksByStatus: {
          failed: 36,
          unrecognized: 4,
        },
        // Task type keys have the 'alerting:' prefix stripped and '.' -> '__'
        countFailedAndUnrecognizedTasksByStatusByType: {
          failed: {
            '__index-threshold': 4,
            document__test__: 32,
          },
          unrecognized: {
            // eslint-disable-next-line @typescript-eslint/naming-convention
            logs__alert__document__count: 4,
          },
        },
      });
    });

    test('should handle missing values', () => {
      expect(
        parseBucket([
          {
            key: 'failed',
            by_task_type: {
              doc_count_error_upper_bound: 0,
              sum_other_doc_count: 0,
              buckets: [
                {
                  key: 'alerting:.index-threshold',
                  doc_count: 4,
                },
                // @ts-expect-error
                {
                  key: 'alerting:document.test.',
                },
              ],
            },
          },
          {
            key: 'unrecognized',
            doc_count: 4,
            // @ts-expect-error
            by_task_type: {
              doc_count_error_upper_bound: 0,
              sum_other_doc_count: 0,
            },
          },
          // @ts-expect-error
          {
            key: 'another_key',
          },
        ])
      ).toEqual({
        // Missing doc_count values fall back to 0
        countFailedAndUnrecognizedTasksByStatus: {
          failed: 0,
          unrecognized: 4,
          another_key: 0,
        },
        countFailedAndUnrecognizedTasksByStatusByType: {
          failed: {
            '__index-threshold': 4,
            document__test__: 0,
          },
        },
      });
    });

    test('should handle empty input', () => {
      expect(parseBucket([])).toEqual({
        countFailedAndUnrecognizedTasksByStatus: {},
        countFailedAndUnrecognizedTasksByStatusByType: {},
      });
    });

    test('should handle undefined input', () => {
      // @ts-expect-error
      expect(parseBucket(undefined)).toEqual({
        countFailedAndUnrecognizedTasksByStatus: {},
        countFailedAndUnrecognizedTasksByStatusByType: {},
      });
    });
  });
|
||||
|
||||
describe('getFailedAndUnrecognizedTasksPerDay', () => {
  // NOTE(review): `esClient` and `logger` are presumably mocks created in the
  // enclosing test setup above this block — confirm against the file header.
  test('should return counts of failed and unrecognized tasks broken down by status and rule type', async () => {
    // Canned ES response: 40 total hits, a `by_status` terms aggregation with
    // `failed` (36 docs) and `unrecognized` (4 docs) buckets, each carrying a
    // nested `by_task_type` breakdown keyed by `alerting:<ruleType>`.
    esClient.search.mockResponse({
      took: 4,
      timed_out: false,
      _shards: {
        total: 1,
        successful: 1,
        skipped: 0,
        failed: 0,
      },
      hits: {
        total: {
          value: 40,
          relation: 'eq',
        },
        max_score: null,
        hits: [],
      },
      aggregations: {
        by_status: {
          doc_count_error_upper_bound: 0,
          sum_other_doc_count: 0,
          buckets: [
            {
              key: 'failed',
              doc_count: 36,
              by_task_type: {
                doc_count_error_upper_bound: 0,
                sum_other_doc_count: 0,
                buckets: [
                  {
                    key: 'alerting:.index-threshold',
                    doc_count: 4,
                  },
                  {
                    key: 'alerting:document.test.',
                    doc_count: 32,
                  },
                ],
              },
            },
            {
              key: 'unrecognized',
              doc_count: 4,
              by_task_type: {
                doc_count_error_upper_bound: 0,
                sum_other_doc_count: 0,
                buckets: [
                  {
                    key: 'alerting:logs.alert.document.count',
                    doc_count: 4,
                  },
                ],
              },
            },
          ],
        },
      },
    });

    const telemetry = await getFailedAndUnrecognizedTasksPerDay({
      esClient,
      taskManagerIndex: 'test',
      logger,
    });

    expect(esClient.search).toHaveBeenCalledTimes(1);

    // The `alerting:` prefix is stripped and '.' is rewritten to '__' in the
    // rule type keys of the by-status-by-type breakdown.
    expect(telemetry).toStrictEqual({
      countFailedAndUnrecognizedTasks: 40,
      countFailedAndUnrecognizedTasksByStatus: {
        failed: 36,
        unrecognized: 4,
      },
      countFailedAndUnrecognizedTasksByStatusByType: {
        failed: {
          '__index-threshold': 4,
          document__test__: 32,
        },
        unrecognized: {
          // eslint-disable-next-line @typescript-eslint/naming-convention
          logs__alert__document__count: 4,
        },
      },
    });
  });

  test('should return empty results and log warning if query throws error', async () => {
    esClient.search.mockRejectedValue(new Error('oh no'));

    const telemetry = await getFailedAndUnrecognizedTasksPerDay({
      esClient,
      taskManagerIndex: 'test',
      logger,
    });

    expect(esClient.search).toHaveBeenCalledTimes(1);

    // The failure must be logged as a warning whose meta carries the
    // telemetry tags and a stack trace — and the function must not throw.
    const loggerCall = logger.warn.mock.calls[0][0];
    const loggerMeta = logger.warn.mock.calls[0][1];
    expect(loggerCall as string).toMatchInlineSnapshot(
      `"Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - {}"`
    );
    expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
    expect(loggerMeta?.error?.stack_trace).toBeDefined();
    // Zeroed-out fallback payload on error.
    expect(telemetry).toStrictEqual({
      countFailedAndUnrecognizedTasks: 0,
      countFailedAndUnrecognizedTasksByStatus: {},
      countFailedAndUnrecognizedTasksByStatusByType: {},
    });
  });
});
|
||||
});
|
|
@ -0,0 +1,199 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import { isEmpty, merge } from 'lodash';
|
||||
import type {
|
||||
AggregationsTermsAggregateBase,
|
||||
AggregationsStringTermsBucketKeys,
|
||||
AggregationsBuckets,
|
||||
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import { ElasticsearchClient, Logger } from '@kbn/core/server';
|
||||
import { replaceDotSymbols } from './replace_dots_with_underscores';
|
||||
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
|
||||
|
||||
// Dependencies for querying failed/unrecognized task telemetry.
interface Opts {
  esClient: ElasticsearchClient; // ES client used to run the aggregation query
  taskManagerIndex: string; // name of the task manager index to search
  logger: Logger; // logger for debug output and failure warnings
}

// Shape of a `by_status` terms bucket, extended with the nested
// `by_task_type` sub-aggregation requested by the query below.
interface GetFailedAndUnrecognizedTasksAggregationBucket extends AggregationsStringTermsBucketKeys {
  by_task_type: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
}

// Telemetry payload produced by getFailedAndUnrecognizedTasksPerDay.
interface GetFailedAndUnrecognizedTasksResults {
  countFailedAndUnrecognizedTasks: number; // total hits matching the query
  countFailedAndUnrecognizedTasksByStatus: Record<string, number>; // task status -> count
  countFailedAndUnrecognizedTasksByStatusByType: Record<string, Record<string, number>>; // status -> rule type -> count
}
|
||||
|
||||
/**
 * Queries the task manager index for alerting tasks that ran in the last
 * day and ended in a `failed` or `unrecognized` status, returning counts
 * broken down by status and by rule type (via a nested terms aggregation).
 *
 * On any error the failure is logged as a warning (with the stack trace in
 * the log meta) and a zeroed-out result is returned, so the telemetry task
 * itself never throws.
 */
export async function getFailedAndUnrecognizedTasksPerDay({
  esClient,
  taskManagerIndex,
  logger,
}: Opts): Promise<GetFailedAndUnrecognizedTasksResults> {
  try {
    const query = {
      index: taskManagerIndex,
      // size 0: we only need the hit total and the aggregations, no documents
      size: 0,
      body: {
        query: {
          bool: {
            must: [
              {
                // status is either 'unrecognized' or 'failed'
                bool: {
                  should: [
                    {
                      term: {
                        'task.status': 'unrecognized',
                      },
                    },
                    {
                      term: {
                        'task.status': 'failed',
                      },
                    },
                  ],
                },
              },
              {
                // only alerting rule tasks (task types are 'alerting:<ruleTypeId>')
                wildcard: {
                  'task.taskType': {
                    value: 'alerting:*',
                  },
                },
              },
              {
                // restrict to tasks scheduled to run within the last day
                range: {
                  'task.runAt': {
                    gte: 'now-1d',
                  },
                },
              },
            ],
          },
        },
        aggs: {
          // bucket by status, then by task type within each status
          by_status: {
            terms: {
              field: 'task.status',
              // assumes task.status has a small, fixed set of values — TODO confirm
              size: 10,
            },
            aggs: {
              by_task_type: {
                terms: {
                  field: 'task.taskType',
                  // Use number of alerting rule types because we're filtering by 'alerting:'
                  size: NUM_ALERTING_RULE_TYPES,
                },
              },
            },
          },
        },
      },
    };

    logger.debug(`query for getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify(query)}`);
    const results = await esClient.search(query);

    logger.debug(
      `results for getFailedAndUnrecognizedTasksPerDay query - ${JSON.stringify(results)}`
    );

    const aggregations = results.aggregations as {
      by_status: AggregationsTermsAggregateBase<GetFailedAndUnrecognizedTasksAggregationBucket>;
    };

    // hits.total may be a bare number or a { value, relation } object
    // depending on client/track_total_hits settings.
    const totalFailedAndUnrecognizedTasks =
      typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value;

    const aggregationsByStatus: AggregationsBuckets<GetFailedAndUnrecognizedTasksAggregationBucket> =
      aggregations.by_status.buckets as GetFailedAndUnrecognizedTasksAggregationBucket[];

    return {
      ...parseBucket(aggregationsByStatus),
      countFailedAndUnrecognizedTasks: totalFailedAndUnrecognizedTasks ?? 0,
    };
  } catch (err) {
    // Swallow the error by design: telemetry collection must not break the
    // task runner. Log a warning with tags and the stack trace instead.
    logger.warn(
      `Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify(
        err
      )}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err.stack },
      }
    );
    return {
      countFailedAndUnrecognizedTasks: 0,
      countFailedAndUnrecognizedTasksByStatus: {},
      countFailedAndUnrecognizedTasksByStatusByType: {},
    };
  }
}
|
||||
|
||||
/**
|
||||
* Bucket format:
|
||||
* {
|
||||
* "key": "idle", // task status
|
||||
* "doc_count": 28, // number of tasks with this status
|
||||
* "by_task_type": {
|
||||
* "doc_count_error_upper_bound": 0,
|
||||
* "sum_other_doc_count": 0,
|
||||
* "buckets": [
|
||||
* {
|
||||
* "key": "alerting:.es-query", // breakdown of task type for status
|
||||
* "doc_count": 1
|
||||
* },
|
||||
* {
|
||||
* "key": "alerting:.index-threshold",
|
||||
* "doc_count": 1
|
||||
* }
|
||||
* ]
|
||||
* }
|
||||
* }
|
||||
*/
|
||||
|
||||
export function parseBucket(
|
||||
buckets: GetFailedAndUnrecognizedTasksAggregationBucket[]
|
||||
): Pick<
|
||||
GetFailedAndUnrecognizedTasksResults,
|
||||
'countFailedAndUnrecognizedTasksByStatus' | 'countFailedAndUnrecognizedTasksByStatusByType'
|
||||
> {
|
||||
return (buckets ?? []).reduce(
|
||||
(summary, bucket) => {
|
||||
const status: string = bucket.key;
|
||||
const taskTypeBuckets = bucket?.by_task_type?.buckets as AggregationsStringTermsBucketKeys[];
|
||||
|
||||
const byTaskType = (taskTypeBuckets ?? []).reduce(
|
||||
(acc: Record<string, number>, taskTypeBucket: AggregationsStringTermsBucketKeys) => {
|
||||
const taskType: string = replaceDotSymbols(taskTypeBucket.key.replace('alerting:', ''));
|
||||
return {
|
||||
...acc,
|
||||
[taskType]: taskTypeBucket.doc_count ?? 0,
|
||||
};
|
||||
},
|
||||
{}
|
||||
);
|
||||
return {
|
||||
...summary,
|
||||
countFailedAndUnrecognizedTasksByStatus: {
|
||||
...summary.countFailedAndUnrecognizedTasksByStatus,
|
||||
[status]: bucket?.doc_count ?? 0,
|
||||
},
|
||||
countFailedAndUnrecognizedTasksByStatusByType: merge(
|
||||
summary.countFailedAndUnrecognizedTasksByStatusByType,
|
||||
isEmpty(byTaskType) ? {} : { [status]: byTaskType }
|
||||
),
|
||||
};
|
||||
},
|
||||
{
|
||||
countFailedAndUnrecognizedTasksByStatus: {},
|
||||
countFailedAndUnrecognizedTasksByStatusByType: {},
|
||||
}
|
||||
);
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
|
||||
|
||||
describe('parseSimpleRuleTypeBucket', () => {
  // Rule type keys have '.' rewritten to '__' so they are safe telemetry
  // field names; doc_count carries over unchanged.
  test('should correctly parse rule type bucket results', () => {
    expect(
      parseSimpleRuleTypeBucket([
        {
          key: '.index-threshold',
          doc_count: 78,
        },
        {
          key: 'document.test.',
          doc_count: 42,
        },
        {
          key: 'logs.alert.document.count',
          doc_count: 28,
        },
      ])
    ).toEqual({
      '__index-threshold': 78,
      document__test__: 42,
      // eslint-disable-next-line @typescript-eslint/naming-convention
      logs__alert__document__count: 28,
    });
  });

  // A bucket with no doc_count is counted as 0 rather than undefined.
  test('should handle missing values', () => {
    expect(
      parseSimpleRuleTypeBucket([
        // @ts-expect-error
        {
          key: '.index-threshold',
        },
        {
          key: 'document.test.',
          doc_count: 42,
        },
        {
          key: 'logs.alert.document.count',
          doc_count: 28,
        },
      ])
    ).toEqual({
      '__index-threshold': 0,
      document__test__: 42,
      // eslint-disable-next-line @typescript-eslint/naming-convention
      logs__alert__document__count: 28,
    });
  });

  // Empty and nullish bucket lists both yield an empty map.
  test('should handle empty input', () => {
    expect(parseSimpleRuleTypeBucket([])).toEqual({});
  });

  test('should handle undefined input', () => {
    // @ts-expect-error
    expect(parseSimpleRuleTypeBucket(undefined)).toEqual({});
  });
});
|
|
@ -0,0 +1,25 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import {
|
||||
AggregationsBuckets,
|
||||
AggregationsStringTermsBucketKeys,
|
||||
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import { replaceDotSymbols } from './replace_dots_with_underscores';
|
||||
|
||||
export function parseSimpleRuleTypeBucket(
|
||||
ruleTypeBuckets: AggregationsBuckets<AggregationsStringTermsBucketKeys>
|
||||
) {
|
||||
const buckets = ruleTypeBuckets as AggregationsStringTermsBucketKeys[];
|
||||
return (buckets ?? []).reduce((acc, bucket: AggregationsStringTermsBucketKeys) => {
|
||||
const ruleType: string = replaceDotSymbols(bucket.key);
|
||||
return {
|
||||
...acc,
|
||||
[ruleType]: bucket.doc_count ?? 0,
|
||||
};
|
||||
}, {});
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import { replaceDotSymbols } from './replace_dots_with_underscores';
|
||||
|
||||
describe('replaceDotSymbols', () => {
|
||||
test('should replace "." symbols with "__" in string', async () => {
|
||||
expect(replaceDotSymbols('.index-threshold')).toEqual('__index-threshold');
|
||||
});
|
||||
});
|
|
@ -0,0 +1,10 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
export function replaceDotSymbols(strToReplace: string) {
|
||||
return strToReplace.replaceAll('.', '__');
|
||||
}
|
|
@ -13,13 +13,12 @@ import {
|
|||
TaskManagerStartContract,
|
||||
} from '@kbn/task-manager-plugin/server';
|
||||
|
||||
import { getFailedAndUnrecognizedTasksPerDay } from './lib/get_telemetry_from_task_manager';
|
||||
import { getTotalCountAggregations, getTotalCountInUse } from './lib/get_telemetry_from_kibana';
|
||||
import {
|
||||
getTotalCountAggregations,
|
||||
getTotalCountInUse,
|
||||
getExecutionsPerDayCount,
|
||||
getExecutionTimeoutsPerDayCount,
|
||||
getFailedAndUnrecognizedTasksPerDay,
|
||||
} from './alerting_telemetry';
|
||||
} from './lib/get_telemetry_from_event_log';
|
||||
|
||||
export const TELEMETRY_TASK_TYPE = 'alerting_telemetry';
|
||||
|
||||
|
@ -98,11 +97,11 @@ export function telemetryTaskRunner(
|
|||
async run() {
|
||||
const esClient = await getEsClient();
|
||||
return Promise.all([
|
||||
getTotalCountAggregations(esClient, kibanaIndex, logger),
|
||||
getTotalCountInUse(esClient, kibanaIndex, logger),
|
||||
getExecutionsPerDayCount(esClient, eventLogIndex, logger),
|
||||
getExecutionTimeoutsPerDayCount(esClient, eventLogIndex, logger),
|
||||
getFailedAndUnrecognizedTasksPerDay(esClient, taskManagerIndex, logger),
|
||||
getTotalCountAggregations({ esClient, kibanaIndex, logger }),
|
||||
getTotalCountInUse({ esClient, kibanaIndex, logger }),
|
||||
getExecutionsPerDayCount({ esClient, eventLogIndex, logger }),
|
||||
getExecutionTimeoutsPerDayCount({ esClient, eventLogIndex, logger }),
|
||||
getFailedAndUnrecognizedTasksPerDay({ esClient, taskManagerIndex, logger }),
|
||||
])
|
||||
.then(
|
||||
([
|
||||
|
@ -120,22 +119,25 @@ export function telemetryTaskRunner(
|
|||
count_active_total: totalInUse.countTotal,
|
||||
count_disabled_total: totalCountAggregations.count_total - totalInUse.countTotal,
|
||||
count_rules_namespaces: totalInUse.countNamespaces,
|
||||
count_rules_executions_per_day: dailyExecutionCounts.countTotal,
|
||||
count_rules_executions_by_type_per_day: dailyExecutionCounts.countByType,
|
||||
count_rules_executions_failured_per_day: dailyExecutionCounts.countTotalFailures,
|
||||
count_rules_executions_per_day: dailyExecutionCounts.countTotalRuleExecutions,
|
||||
count_rules_executions_by_type_per_day:
|
||||
dailyExecutionCounts.countRuleExecutionsByType,
|
||||
count_rules_executions_failured_per_day:
|
||||
dailyExecutionCounts.countTotalFailedExecutions,
|
||||
count_rules_executions_failured_by_reason_per_day:
|
||||
dailyExecutionCounts.countFailuresByReason,
|
||||
dailyExecutionCounts.countFailedExecutionsByReason,
|
||||
count_rules_executions_failured_by_reason_by_type_per_day:
|
||||
dailyExecutionCounts.countFailuresByReasonByType,
|
||||
count_rules_executions_timeouts_per_day: dailyExecutionTimeoutCounts.countTotal,
|
||||
dailyExecutionCounts.countFailedExecutionsByReasonByType,
|
||||
count_rules_executions_timeouts_per_day:
|
||||
dailyExecutionTimeoutCounts.countExecutionTimeouts,
|
||||
count_rules_executions_timeouts_by_type_per_day:
|
||||
dailyExecutionTimeoutCounts.countByType,
|
||||
dailyExecutionTimeoutCounts.countExecutionTimeoutsByType,
|
||||
count_failed_and_unrecognized_rule_tasks_per_day:
|
||||
dailyFailedAndUnrecognizedTasks.countTotal,
|
||||
dailyFailedAndUnrecognizedTasks.countFailedAndUnrecognizedTasks,
|
||||
count_failed_and_unrecognized_rule_tasks_by_status_per_day:
|
||||
dailyFailedAndUnrecognizedTasks.countByStatus,
|
||||
dailyFailedAndUnrecognizedTasks.countFailedAndUnrecognizedTasksByStatus,
|
||||
count_failed_and_unrecognized_rule_tasks_by_status_by_type_per_day:
|
||||
dailyFailedAndUnrecognizedTasks.countByStatusByRuleType,
|
||||
dailyFailedAndUnrecognizedTasks.countFailedAndUnrecognizedTasksByStatusByType,
|
||||
avg_execution_time_per_day: dailyExecutionCounts.avgExecutionTime,
|
||||
avg_execution_time_by_type_per_day: dailyExecutionCounts.avgExecutionTimeByType,
|
||||
avg_es_search_duration_per_day: dailyExecutionCounts.avgEsSearchDuration,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue