diff --git a/x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts b/x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts deleted file mode 100644 index 21767ef1f3d1..000000000000 --- a/x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts +++ /dev/null @@ -1,725 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. - */ - -/* eslint-disable @typescript-eslint/naming-convention */ - -// eslint-disable-next-line @kbn/eslint/no-restricted-paths -import { elasticsearchClientMock } from '@kbn/core/server/elasticsearch/client/mocks'; -import { loggingSystemMock } from '@kbn/core/server/mocks'; -import { - getTotalCountAggregations, - getTotalCountInUse, - getExecutionsPerDayCount, - getExecutionTimeoutsPerDayCount, - getFailedAndUnrecognizedTasksPerDay, - parsePercentileAggsByRuleType, -} from './alerting_telemetry'; - -const mockLogger = loggingSystemMock.create().get(); -describe('alerting telemetry', () => { - test('getTotalCountInUse should replace "." 
symbols with "__" in rule types names', async () => { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockResponse( - // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values - { - aggregations: { - byRuleTypeId: { - value: { - ruleTypes: { - '.index-threshold': 2, - 'logs.alert.document.count': 1, - 'document.test.': 1, - }, - namespaces: { - default: 1, - }, - }, - }, - }, - hits: { - hits: [], - }, - } - ); - - const telemetry = await getTotalCountInUse(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - - expect(telemetry).toMatchInlineSnapshot(` -Object { - "countByType": Object { - "__index-threshold": 2, - "document__test__": 1, - "logs__alert__document__count": 1, - }, - "countNamespaces": 1, - "countTotal": 4, -} -`); - }); - - test('getTotalCountInUse should return empty results if query throws error', async () => { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockRejectedValue(new Error('oh no')); - - const telemetry = await getTotalCountInUse(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - expect(mockLogger.warn).toHaveBeenCalledWith( - `Error executing alerting telemetry task: getTotalCountInUse - {}` - ); - expect(telemetry).toMatchInlineSnapshot(` -Object { - "countByType": Object {}, - "countNamespaces": 0, - "countTotal": 0, -} -`); - }); - - test('getTotalCountAggregations should return min/max connectors in use', async () => { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockResponse( - // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values - { - aggregations: { - byRuleTypeId: { - value: { - ruleTypes: { - '.index-threshold': 2, - 'logs.alert.document.count': 1, - 'document.test.': 1, - }, - }, - 
}, - max_throttle_time: { value: 60 }, - min_throttle_time: { value: 0 }, - avg_throttle_time: { value: 30 }, - max_interval_time: { value: 10 }, - min_interval_time: { value: 1 }, - avg_interval_time: { value: 4.5 }, - max_actions_count: { value: 4 }, - min_actions_count: { value: 0 }, - avg_actions_count: { value: 2.5 }, - }, - hits: { - hits: [], - }, - } - ); - - const telemetry = await getTotalCountAggregations(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - - expect(telemetry).toMatchInlineSnapshot(` -Object { - "connectors_per_alert": Object { - "avg": 2.5, - "max": 4, - "min": 0, - }, - "count_by_type": Object { - "__index-threshold": 2, - "document__test__": 1, - "logs__alert__document__count": 1, - }, - "count_rules_namespaces": 0, - "count_total": 4, - "schedule_time": Object { - "avg": "4.5s", - "max": "10s", - "min": "1s", - }, - "schedule_time_number_s": Object { - "avg": 4.5, - "max": 10, - "min": 1, - }, - "throttle_time": Object { - "avg": "30s", - "max": "60s", - "min": "0s", - }, - "throttle_time_number_s": Object { - "avg": 30, - "max": 60, - "min": 0, - }, -} -`); - }); - - test('getTotalCountAggregations should return empty results if query throws error', async () => { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockRejectedValue(new Error('oh no')); - - const telemetry = await getTotalCountAggregations(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - expect(mockLogger.warn).toHaveBeenCalledWith( - `Error executing alerting telemetry task: getTotalCountAggregations - {}` - ); - expect(telemetry).toMatchInlineSnapshot(` -Object { - "connectors_per_alert": Object { - "avg": 0, - "max": 0, - "min": 0, - }, - "count_by_type": Object {}, - "count_rules_namespaces": 0, - "count_total": 0, - "schedule_time": Object { - "avg": "0s", - "max": "0s", - "min": "0s", - }, - 
"schedule_time_number_s": Object { - "avg": 0, - "max": 0, - "min": 0, - }, - "throttle_time": Object { - "avg": "0s", - "max": "0s", - "min": "0s", - }, - "throttle_time_number_s": Object { - "avg": 0, - "max": 0, - "min": 0, - }, -} -`); - }); - - test('getExecutionsPerDayCount should return execution aggregations for total count, count by rule type and number of failed executions', async () => { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockResponse( - // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values - { - aggregations: { - byRuleTypeId: { - value: { - ruleTypes: { - '.index-threshold': 2, - 'logs.alert.document.count': 1, - 'document.test.': 1, - }, - ruleTypesDuration: { - '.index-threshold': 2087868, - 'logs.alert.document.count': 1675765, - 'document.test.': 17687687, - }, - ruleTypesEsSearchDuration: { - '.index-threshold': 23, - 'logs.alert.document.count': 526, - 'document.test.': 534, - }, - ruleTypesTotalSearchDuration: { - '.index-threshold': 62, - 'logs.alert.document.count': 588, - 'document.test.': 637, - }, - }, - }, - failuresByReason: { - value: { - reasons: { - unknown: { - '.index-threshold': 2, - 'logs.alert.document.count': 1, - 'document.test.': 1, - }, - }, - }, - }, - avgDuration: { value: 10 }, - avgEsSearchDuration: { - value: 25.785714285714285, - }, - avgTotalSearchDuration: { - value: 30.642857142857142, - }, - percentileScheduledActions: { - values: { - '50.0': 4.0, - '90.0': 26.0, - '99.0': 26.0, - }, - }, - percentileAlerts: { - values: { - '50.0': 10.0, - '90.0': 22.0, - '99.0': 22.0, - }, - }, - aggsByType: { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, - buckets: [ - { - key: '.index-threshold', - doc_count: 149, - percentileScheduledActions: { - values: { - '50.0': 4.0, - '90.0': 26.0, - '99.0': 26.0, - }, - }, - percentileAlerts: { - values: { - '50.0': 10.0, - '90.0': 22.0, - '99.0': 22.0, - }, - }, - }, - 
{ - key: 'logs.alert.document.count', - doc_count: 1, - percentileScheduledActions: { - values: { - '50.0': 10.0, - '90.0': 10.0, - '99.0': 10.0, - }, - }, - percentileAlerts: { - values: { - '50.0': 5.0, - '90.0': 13.0, - '99.0': 13.0, - }, - }, - }, - ], - }, - }, - hits: { - hits: [], - }, - } - ); - - const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - - expect(telemetry).toStrictEqual({ - avgEsSearchDuration: 26, - avgEsSearchDurationByType: { - '__index-threshold': 12, - document__test__: 534, - logs__alert__document__count: 526, - }, - avgExecutionTime: 0, - avgExecutionTimeByType: { - '__index-threshold': 1043934, - document__test__: 17687687, - logs__alert__document__count: 1675765, - }, - avgTotalSearchDuration: 31, - avgTotalSearchDurationByType: { - '__index-threshold': 31, - document__test__: 637, - logs__alert__document__count: 588, - }, - countByType: { - '__index-threshold': 2, - document__test__: 1, - logs__alert__document__count: 1, - }, - countFailuresByReason: { - unknown: 4, - }, - countFailuresByReasonByType: { - unknown: { - '__index-threshold': 2, - document__test__: 1, - logs__alert__document__count: 1, - }, - }, - countTotal: 4, - countTotalFailures: 4, - generatedActionsPercentiles: { - p50: 4, - p90: 26, - p99: 26, - }, - generatedActionsPercentilesByType: { - p50: { - '__index-threshold': 4, - logs__alert__document__count: 10, - }, - p90: { - '__index-threshold': 26, - logs__alert__document__count: 10, - }, - p99: { - '__index-threshold': 26, - logs__alert__document__count: 10, - }, - }, - alertsPercentiles: { - p50: 10, - p90: 22, - p99: 22, - }, - alertsPercentilesByType: { - p50: { - '__index-threshold': 10, - logs__alert__document__count: 5, - }, - p90: { - '__index-threshold': 22, - logs__alert__document__count: 13, - }, - p99: { - '__index-threshold': 22, - logs__alert__document__count: 13, - }, - }, - }); - }); - - 
test('getExecutionsPerDayCount should return empty results if query throws error', async () => { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockRejectedValue(new Error('oh no')); - - const telemetry = await getExecutionsPerDayCount(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - expect(mockLogger.warn).toHaveBeenCalledWith( - `Error executing alerting telemetry task: getExecutionsPerDayCount - {}` - ); - expect(telemetry).toStrictEqual({ - avgEsSearchDuration: 0, - avgEsSearchDurationByType: {}, - avgExecutionTime: 0, - avgExecutionTimeByType: {}, - avgTotalSearchDuration: 0, - avgTotalSearchDurationByType: {}, - countByType: {}, - countFailuresByReason: {}, - countFailuresByReasonByType: {}, - countTotal: 0, - countTotalFailures: 0, - generatedActionsPercentiles: {}, - generatedActionsPercentilesByType: {}, - alertsPercentiles: {}, - alertsPercentilesByType: {}, - }); - }); - - test('getExecutionTimeoutsPerDayCount should return execution aggregations for total timeout count and count by rule type', async () => { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockResponse( - // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values - { - aggregations: { - byRuleTypeId: { - value: { - ruleTypes: { - '.index-threshold': 2, - 'logs.alert.document.count': 1, - 'document.test.': 1, - }, - }, - }, - }, - hits: { - hits: [], - }, - } - ); - - const telemetry = await getExecutionTimeoutsPerDayCount(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - - expect(telemetry).toStrictEqual({ - countTotal: 4, - countByType: { - '__index-threshold': 2, - document__test__: 1, - logs__alert__document__count: 1, - }, - }); - }); - - test('getExecutionTimeoutsPerDayCount should return empty results if query throws error', async () 
=> { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockRejectedValue(new Error('oh no')); - - const telemetry = await getExecutionTimeoutsPerDayCount(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - expect(mockLogger.warn).toHaveBeenCalledWith( - `Error executing alerting telemetry task: getExecutionsPerDayCount - {}` - ); - expect(telemetry).toStrictEqual({ - countTotal: 0, - countByType: {}, - }); - }); - - test('getFailedAndUnrecognizedTasksPerDay should aggregations for total count, count by status and count by status and rule type for failed and unrecognized tasks', async () => { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockResponse( - // @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values - { - aggregations: { - byTaskTypeId: { - value: { - statuses: { - failed: { - '.index-threshold': 2, - 'logs.alert.document.count': 1, - 'document.test.': 1, - }, - unrecognized: { - 'o.l.d.task-type': 1, - }, - }, - }, - }, - }, - hits: { - hits: [], - }, - } - ); - - const telemetry = await getFailedAndUnrecognizedTasksPerDay(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - - expect(telemetry).toStrictEqual({ - countByStatus: { - failed: 4, - unrecognized: 1, - }, - countByStatusByRuleType: { - failed: { - '__index-threshold': 2, - document__test__: 1, - logs__alert__document__count: 1, - }, - unrecognized: { - 'o__l__d__task-type': 1, - }, - }, - countTotal: 5, - }); - }); - - test('getFailedAndUnrecognizedTasksPerDay should return empty results if query throws error', async () => { - const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser; - mockEsClient.search.mockRejectedValue(new Error('oh no')); - - const telemetry = await 
getFailedAndUnrecognizedTasksPerDay(mockEsClient, 'test', mockLogger); - - expect(mockEsClient.search).toHaveBeenCalledTimes(1); - expect(mockLogger.warn).toHaveBeenCalledWith( - `Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - {}` - ); - expect(telemetry).toStrictEqual({ - countByStatus: {}, - countByStatusByRuleType: {}, - countTotal: 0, - }); - }); - - test('parsePercentileAggsByRuleType', () => { - const aggsByType = { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, - buckets: [ - { - key: '.index-threshold', - doc_count: 149, - percentileScheduledActions: { - values: { - '50.0': 4.0, - '90.0': 26.0, - '99.0': 26.0, - }, - }, - percentileAlerts: { - values: { - '50.0': 3.0, - '90.0': 22.0, - '99.0': 22.0, - }, - }, - }, - { - key: 'logs.alert.document.count', - doc_count: 1, - percentileScheduledActions: { - values: { - '50.0': 10.0, - '90.0': 10.0, - '99.0': 10.0, - }, - }, - percentileAlerts: { - values: { - '50.0': 5.0, - '90.0': 16.0, - '99.0': 16.0, - }, - }, - }, - { - key: 'document.test.', - doc_count: 1, - percentileScheduledActions: { - values: { - '50.0': null, - '90.0': null, - '99.0': null, - }, - }, - percentileAlerts: { - values: { - '50.0': null, - '90.0': null, - '99.0': null, - }, - }, - }, - ], - }; - expect( - parsePercentileAggsByRuleType(aggsByType.buckets, 'percentileScheduledActions.values') - ).toEqual({ - p50: { - '__index-threshold': 4, - document__test__: 0, - logs__alert__document__count: 10, - }, - p90: { - '__index-threshold': 26, - document__test__: 0, - logs__alert__document__count: 10, - }, - p99: { - '__index-threshold': 26, - document__test__: 0, - logs__alert__document__count: 10, - }, - }); - expect(parsePercentileAggsByRuleType(aggsByType.buckets, 'percentileAlerts.values')).toEqual({ - p50: { - '__index-threshold': 3, - document__test__: 0, - logs__alert__document__count: 5, - }, - p90: { - '__index-threshold': 22, - document__test__: 0, - logs__alert__document__count: 16, - }, - 
p99: { - '__index-threshold': 22, - document__test__: 0, - logs__alert__document__count: 16, - }, - }); - }); - - test('parsePercentileAggsByRuleType handles unknown path', () => { - const aggsByType = { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, - buckets: [ - { - key: '.index-threshold', - doc_count: 149, - percentileScheduledActions: { - values: { - '50.0': 4.0, - '90.0': 26.0, - '99.0': 26.0, - }, - }, - }, - { - key: 'logs.alert.document.count', - doc_count: 1, - percentileScheduledActions: { - values: { - '50.0': 10.0, - '90.0': 10.0, - '99.0': 10.0, - }, - }, - }, - ], - }; - expect(parsePercentileAggsByRuleType(aggsByType.buckets, 'foo.values')).toEqual({ - p50: {}, - p90: {}, - p99: {}, - }); - }); - - test('parsePercentileAggsByRuleType handles unrecognized percentiles', () => { - const aggsByType = { - doc_count_error_upper_bound: 0, - sum_other_doc_count: 0, - buckets: [ - { - key: '.index-threshold', - doc_count: 149, - percentileScheduledActions: { - values: { - '50.0': 4.0, - '75.0': 8.0, - '90.0': 26.0, - '99.0': 26.0, - }, - }, - }, - { - key: 'logs.alert.document.count', - doc_count: 1, - percentileScheduledActions: { - values: { - '50.0': 10.0, - '75.0': 10.0, - '90.0': 10.0, - '99.0': 10.0, - }, - }, - }, - ], - }; - expect( - parsePercentileAggsByRuleType(aggsByType.buckets, 'percentileScheduledActions.values') - ).toEqual({ - p50: { - '__index-threshold': 4, - logs__alert__document__count: 10, - }, - p90: { - '__index-threshold': 26, - logs__alert__document__count: 10, - }, - p99: { - '__index-threshold': 26, - logs__alert__document__count: 10, - }, - }); - }); -}); diff --git a/x-pack/plugins/alerting/server/usage/alerting_telemetry.ts b/x-pack/plugins/alerting/server/usage/alerting_telemetry.ts deleted file mode 100644 index 716dc890b0b6..000000000000 --- a/x-pack/plugins/alerting/server/usage/alerting_telemetry.ts +++ /dev/null @@ -1,962 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. - */ - -import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey'; -import { ElasticsearchClient, Logger } from '@kbn/core/server'; -import { get, merge } from 'lodash'; -import { AlertingUsage } from './types'; -import { NUM_ALERTING_RULE_TYPES } from './alerting_usage_collector'; - -const percentileFieldNameMapping: Record = { - '50.0': 'p50', - '90.0': 'p90', - '99.0': 'p99', -}; - -const ruleTypeMetric = { - scripted_metric: { - init_script: 'state.ruleTypes = [:]; state.namespaces = [:]', - map_script: ` - String ruleType = doc['alert.alertTypeId'].value; - String namespace = doc['namespaces'] !== null && doc['namespaces'].size() > 0 ? doc['namespaces'].value : 'default'; - state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1); - if (state.namespaces.containsKey(namespace) === false) { - state.namespaces.put(namespace, 1); - } - `, - // Combine script is executed per cluster, but we already have a key-value pair per cluster. - // Despite docs that say this is optional, this script can't be blank. - combine_script: 'return state', - // Reduce script is executed across all clusters, so we need to add up all the total from each cluster - // This also needs to account for having no data - reduce_script: ` - HashMap result = new HashMap(); - HashMap combinedRuleTypes = new HashMap(); - HashMap combinedNamespaces = new HashMap(); - for (state in states) { - for (String ruleType : state.ruleTypes.keySet()) { - int ruleTypeCount = combinedRuleTypes.containsKey(ruleType) ? 
combinedRuleTypes.get(ruleType) + state.ruleTypes.get(ruleType) : state.ruleTypes.get(ruleType); - combinedRuleTypes.put(ruleType, ruleTypeCount); - } - - for (String namespace : state.namespaces.keySet()) { - combinedNamespaces.put(namespace, 1); - } - } - - result.ruleTypes = combinedRuleTypes; - result.namespaces = combinedNamespaces; - return result; - `, - }, -}; - -const generatedActionsPercentilesAgg = { - percentiles: { - field: 'kibana.alert.rule.execution.metrics.number_of_generated_actions', - percents: [50, 90, 99], - }, -}; - -const alertsPercentilesAgg = { - percentiles: { - field: 'kibana.alert.rule.execution.metrics.alert_counts.active', - percents: [50, 90, 99], - }, -}; - -const ruleTypeExecutionsWithDurationMetric = { - scripted_metric: { - init_script: - 'state.ruleTypes = [:]; state.ruleTypesDuration = [:]; state.ruleTypesEsSearchDuration = [:]; state.ruleTypesTotalSearchDuration = [:];', - map_script: ` - String ruleType = doc['rule.category'].value; - long duration = doc['event.duration'].value / (1000 * 1000); - long esSearchDuration = doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].value; - long totalSearchDuration = doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].value; - state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1); - state.ruleTypesDuration.put(ruleType, state.ruleTypesDuration.containsKey(ruleType) ? state.ruleTypesDuration.get(ruleType) + duration : duration); - state.ruleTypesEsSearchDuration.put(ruleType, state.ruleTypesEsSearchDuration.containsKey(ruleType) ? state.ruleTypesEsSearchDuration.get(ruleType) + esSearchDuration : esSearchDuration); - state.ruleTypesTotalSearchDuration.put(ruleType, state.ruleTypesTotalSearchDuration.containsKey(ruleType) ? 
state.ruleTypesTotalSearchDuration.get(ruleType) + totalSearchDuration : totalSearchDuration); - `, - // Combine script is executed per cluster, but we already have a key-value pair per cluster. - // Despite docs that say this is optional, this script can't be blank. - combine_script: 'return state', - // Reduce script is executed across all clusters, so we need to add up all the total from each cluster - // This also needs to account for having no data - reduce_script: ` - HashMap result = new HashMap(); - HashMap combinedRuleTypes = new HashMap(); - HashMap combinedRuleTypeDurations = new HashMap(); - HashMap combinedRuleTypeEsSearchDurations = new HashMap(); - HashMap combinedRuleTypeTotalSearchDurations = new HashMap(); - for (state in states) { - for (String ruleType : state.ruleTypes.keySet()) { - int ruleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + state.ruleTypes.get(ruleType) : state.ruleTypes.get(ruleType); - combinedRuleTypes.put(ruleType, ruleTypeCount); - } - - for (String ruleType : state.ruleTypesDuration.keySet()) { - long ruleTypeDurationTotal = combinedRuleTypeDurations.containsKey(ruleType) ? combinedRuleTypeDurations.get(ruleType) + state.ruleTypesDuration.get(ruleType) : state.ruleTypesDuration.get(ruleType); - combinedRuleTypeDurations.put(ruleType, ruleTypeDurationTotal); - } - - for (String ruleType : state.ruleTypesEsSearchDuration.keySet()) { - long ruleTypeEsSearchDurationTotal = combinedRuleTypeEsSearchDurations.containsKey(ruleType) ? combinedRuleTypeEsSearchDurations.get(ruleType) + state.ruleTypesEsSearchDuration.get(ruleType) : state.ruleTypesEsSearchDuration.get(ruleType); - combinedRuleTypeEsSearchDurations.put(ruleType, ruleTypeEsSearchDurationTotal); - } - - for (String ruleType : state.ruleTypesTotalSearchDuration.keySet()) { - long ruleTypeTotalSearchDurationTotal = combinedRuleTypeTotalSearchDurations.containsKey(ruleType) ? 
combinedRuleTypeTotalSearchDurations.get(ruleType) + state.ruleTypesTotalSearchDuration.get(ruleType) : state.ruleTypesTotalSearchDuration.get(ruleType); - combinedRuleTypeTotalSearchDurations.put(ruleType, ruleTypeTotalSearchDurationTotal); - } - } - - result.ruleTypes = combinedRuleTypes; - result.ruleTypesDuration = combinedRuleTypeDurations; - result.ruleTypesEsSearchDuration = combinedRuleTypeEsSearchDurations; - result.ruleTypesTotalSearchDuration = combinedRuleTypeTotalSearchDurations; - return result; - `, - }, -}; - -const ruleTypeExecutionsMetric = { - scripted_metric: { - init_script: 'state.ruleTypes = [:]', - map_script: ` - String ruleType = doc['rule.category'].value; - state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1); - `, - // Combine script is executed per cluster, but we already have a key-value pair per cluster. - // Despite docs that say this is optional, this script can't be blank. - combine_script: 'return state', - // Reduce script is executed across all clusters, so we need to add up all the total from each cluster - // This also needs to account for having no data - reduce_script: ` - HashMap result = new HashMap(); - HashMap combinedRuleTypes = new HashMap(); - for (state in states) { - for (String ruleType : state.ruleTypes.keySet()) { - int ruleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + state.ruleTypes.get(ruleType) : state.ruleTypes.get(ruleType); - combinedRuleTypes.put(ruleType, ruleTypeCount); - } - } - - result.ruleTypes = combinedRuleTypes; - return result; - `, - }, -}; - -const taskTypeExecutionsMetric = { - scripted_metric: { - init_script: 'state.statuses = [:]', - map_script: ` - String status = doc['task.status'].value; - String taskType = doc['task.taskType'].value.replace('alerting:', ''); - Map taskTypes = state.statuses.containsKey(status) ? 
state.statuses.get(status) : [:]; - taskTypes.put(taskType, taskTypes.containsKey(taskType) ? taskTypes.get(taskType) + 1 : 1); - state.statuses.put(status, taskTypes); - `, - // Combine script is executed per cluster, but we already have a key-value pair per cluster. - // Despite docs that say this is optional, this script can't be blank. - combine_script: 'return state', - // Reduce script is executed across all clusters, so we need to add up all the total from each cluster - // This also needs to account for having no data - reduce_script: ` - HashMap result = new HashMap(); - HashMap combinedStatuses = new HashMap(); - for (state in states) { - for (String status : state.statuses.keySet()) { - HashMap combinedTaskTypes = new HashMap(); - Map statusTaskTypes = state.statuses.get(status); - for (String taskType : statusTaskTypes.keySet()) { - int statusByTaskTypeCount = combinedTaskTypes.containsKey(taskType) ? combinedTaskTypes.get(taskType) + statusTaskTypes.get(taskType) : statusTaskTypes.get(taskType); - combinedTaskTypes.put(taskType, statusByTaskTypeCount); - } - - combinedStatuses.put(status, combinedTaskTypes); - } - } - result.statuses = combinedStatuses; - return result; - `, - }, -}; - -const ruleTypeFailureExecutionsMetric = { - scripted_metric: { - init_script: 'state.reasons = [:]', - map_script: ` - if (doc['event.outcome'].value == 'failure') { - String reason = doc['event.reason'].value; - String ruleType = doc['rule.category'].value; - Map ruleTypes = state.reasons.containsKey(reason) ? state.reasons.get(reason) : [:]; - ruleTypes.put(ruleType, ruleTypes.containsKey(ruleType) ? ruleTypes.get(ruleType) + 1 : 1); - state.reasons.put(reason, ruleTypes); - } - `, - // Combine script is executed per cluster, but we already have a key-value pair per cluster. - // Despite docs that say this is optional, this script can't be blank. 
- combine_script: 'return state', - // Reduce script is executed across all clusters, so we need to add up all the total from each cluster - // This also needs to account for having no data - reduce_script: ` - HashMap result = new HashMap(); - HashMap combinedReasons = new HashMap(); - for (state in states) { - for (String reason : state.reasons.keySet()) { - HashMap combinedRuleTypes = new HashMap(); - Map reasonRuleTypes = state.reasons.get(reason); - for (String ruleType : state.reasons.get(reason).keySet()) { - int reasonByRuleTypeCount = combinedRuleTypes.containsKey(ruleType) ? combinedRuleTypes.get(ruleType) + reasonRuleTypes.get(ruleType) : reasonRuleTypes.get(ruleType); - combinedRuleTypes.put(ruleType, reasonByRuleTypeCount); - } - - combinedReasons.put(reason, combinedRuleTypes); - } - } - result.reasons = combinedReasons; - return result; - `, - }, -}; - -export async function getTotalCountAggregations( - esClient: ElasticsearchClient, - kibanaIndex: string, - logger: Logger -): Promise< - Pick< - AlertingUsage, - | 'count_total' - | 'count_by_type' - | 'throttle_time' - | 'schedule_time' - | 'throttle_time_number_s' - | 'schedule_time_number_s' - | 'connectors_per_alert' - | 'count_rules_namespaces' - > -> { - try { - const results = await esClient.search({ - index: kibanaIndex, - body: { - size: 0, - query: { - bool: { - filter: [{ term: { type: 'alert' } }], - }, - }, - runtime_mappings: { - alert_action_count: { - type: 'long', - script: { - source: ` - def alert = params._source['alert']; - if (alert != null) { - def actions = alert.actions; - if (actions != null) { - emit(actions.length); - } else { - emit(0); - } - }`, - }, - }, - alert_interval: { - type: 'long', - script: { - source: ` - int parsed = 0; - if (doc['alert.schedule.interval'].size() > 0) { - def interval = doc['alert.schedule.interval'].value; - - if (interval.length() > 1) { - // get last char - String timeChar = interval.substring(interval.length() - 1); - // remove last char - 
interval = interval.substring(0, interval.length() - 1); - - if (interval.chars().allMatch(Character::isDigit)) { - // using of regex is not allowed in painless language - parsed = Integer.parseInt(interval); - - if (timeChar.equals("s")) { - parsed = parsed; - } else if (timeChar.equals("m")) { - parsed = parsed * 60; - } else if (timeChar.equals("h")) { - parsed = parsed * 60 * 60; - } else if (timeChar.equals("d")) { - parsed = parsed * 24 * 60 * 60; - } - emit(parsed); - } - } - } - emit(parsed); - `, - }, - }, - alert_throttle: { - type: 'long', - script: { - source: ` - int parsed = 0; - if (doc['alert.throttle'].size() > 0) { - def throttle = doc['alert.throttle'].value; - - if (throttle.length() > 1) { - // get last char - String timeChar = throttle.substring(throttle.length() - 1); - // remove last char - throttle = throttle.substring(0, throttle.length() - 1); - - if (throttle.chars().allMatch(Character::isDigit)) { - // using of regex is not allowed in painless language - parsed = Integer.parseInt(throttle); - - if (timeChar.equals("s")) { - parsed = parsed; - } else if (timeChar.equals("m")) { - parsed = parsed * 60; - } else if (timeChar.equals("h")) { - parsed = parsed * 60 * 60; - } else if (timeChar.equals("d")) { - parsed = parsed * 24 * 60 * 60; - } - emit(parsed); - } - } - } - emit(parsed); - `, - }, - }, - }, - aggs: { - byRuleTypeId: ruleTypeMetric, - max_throttle_time: { max: { field: 'alert_throttle' } }, - min_throttle_time: { min: { field: 'alert_throttle' } }, - avg_throttle_time: { avg: { field: 'alert_throttle' } }, - max_interval_time: { max: { field: 'alert_interval' } }, - min_interval_time: { min: { field: 'alert_interval' } }, - avg_interval_time: { avg: { field: 'alert_interval' } }, - max_actions_count: { max: { field: 'alert_action_count' } }, - min_actions_count: { min: { field: 'alert_action_count' } }, - avg_actions_count: { avg: { field: 'alert_action_count' } }, - }, - }, - }); - - const aggregations = results.aggregations 
as { - byRuleTypeId: { value: { ruleTypes: Record } }; - max_throttle_time: { value: number }; - min_throttle_time: { value: number }; - avg_throttle_time: { value: number }; - max_interval_time: { value: number }; - min_interval_time: { value: number }; - avg_interval_time: { value: number }; - max_actions_count: { value: number }; - min_actions_count: { value: number }; - avg_actions_count: { value: number }; - }; - - const totalRulesCount = Object.keys(aggregations.byRuleTypeId.value.ruleTypes).reduce( - (total: number, key: string) => - parseInt(aggregations.byRuleTypeId.value.ruleTypes[key], 10) + total, - 0 - ); - - return { - count_total: totalRulesCount, - count_by_type: replaceDotSymbolsInRuleTypeIds(aggregations.byRuleTypeId.value.ruleTypes), - throttle_time: { - min: `${aggregations.min_throttle_time.value}s`, - avg: `${aggregations.avg_throttle_time.value}s`, - max: `${aggregations.max_throttle_time.value}s`, - }, - schedule_time: { - min: `${aggregations.min_interval_time.value}s`, - avg: `${aggregations.avg_interval_time.value}s`, - max: `${aggregations.max_interval_time.value}s`, - }, - throttle_time_number_s: { - min: aggregations.min_throttle_time.value, - avg: aggregations.avg_throttle_time.value, - max: aggregations.max_throttle_time.value, - }, - schedule_time_number_s: { - min: aggregations.min_interval_time.value, - avg: aggregations.avg_interval_time.value, - max: aggregations.max_interval_time.value, - }, - connectors_per_alert: { - min: aggregations.min_actions_count.value, - avg: aggregations.avg_actions_count.value, - max: aggregations.max_actions_count.value, - }, - count_rules_namespaces: 0, - }; - } catch (err) { - logger.warn( - `Error executing alerting telemetry task: getTotalCountAggregations - ${JSON.stringify(err)}` - ); - return { - count_total: 0, - count_by_type: {}, - throttle_time: { - min: '0s', - avg: '0s', - max: '0s', - }, - schedule_time: { - min: '0s', - avg: '0s', - max: '0s', - }, - throttle_time_number_s: { - min: 
0, - avg: 0, - max: 0, - }, - schedule_time_number_s: { - min: 0, - avg: 0, - max: 0, - }, - connectors_per_alert: { - min: 0, - avg: 0, - max: 0, - }, - count_rules_namespaces: 0, - }; - } -} - -export async function getTotalCountInUse( - esClient: ElasticsearchClient, - kibanaIndex: string, - logger: Logger -) { - try { - const searchResult = await esClient.search({ - index: kibanaIndex, - size: 0, - body: { - query: { - bool: { - filter: [{ term: { type: 'alert' } }, { term: { 'alert.enabled': true } }], - }, - }, - aggs: { - byRuleTypeId: ruleTypeMetric, - }, - }, - }); - - const aggregations = searchResult.aggregations as { - byRuleTypeId: { - value: { ruleTypes: Record; namespaces: Record }; - }; - }; - - return { - countTotal: Object.keys(aggregations.byRuleTypeId.value.ruleTypes).reduce( - (total: number, key: string) => - parseInt(aggregations.byRuleTypeId.value.ruleTypes[key], 10) + total, - 0 - ), - countByType: replaceDotSymbolsInRuleTypeIds(aggregations.byRuleTypeId.value.ruleTypes), - countNamespaces: Object.keys(aggregations.byRuleTypeId.value.namespaces).length, - }; - } catch (err) { - logger.warn( - `Error executing alerting telemetry task: getTotalCountInUse - ${JSON.stringify(err)}` - ); - return { - countTotal: 0, - countByType: {}, - countNamespaces: 0, - }; - } -} - -export async function getExecutionsPerDayCount( - esClient: ElasticsearchClient, - eventLogIndex: string, - logger: Logger -) { - try { - const searchResult = await esClient.search({ - index: eventLogIndex, - size: 0, - body: { - query: { - bool: { - filter: { - bool: { - must: [ - { - term: { 'event.action': 'execute' }, - }, - { - term: { 'event.provider': 'alerting' }, - }, - { - range: { - '@timestamp': { - gte: 'now-1d', - }, - }, - }, - ], - }, - }, - }, - }, - aggs: { - byRuleTypeId: ruleTypeExecutionsWithDurationMetric, - failuresByReason: ruleTypeFailureExecutionsMetric, - avgDuration: { avg: { field: 'event.duration' } }, - avgEsSearchDuration: { - avg: { field: 
'kibana.alert.rule.execution.metrics.es_search_duration_ms' }, - }, - avgTotalSearchDuration: { - avg: { field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms' }, - }, - percentileScheduledActions: generatedActionsPercentilesAgg, - percentileAlerts: alertsPercentilesAgg, - aggsByType: { - terms: { - field: 'rule.category', - size: NUM_ALERTING_RULE_TYPES, - }, - aggs: { - percentileScheduledActions: generatedActionsPercentilesAgg, - percentileAlerts: alertsPercentilesAgg, - }, - }, - }, - }, - }); - - const executionsAggregations = searchResult.aggregations as { - byRuleTypeId: { - value: { - ruleTypes: Record; - ruleTypesDuration: Record; - ruleTypesEsSearchDuration: Record; - ruleTypesTotalSearchDuration: Record; - }; - }; - }; - - const aggsAvgExecutionTime = Math.round( - // @ts-expect-error aggegation type is not specified - // convert nanoseconds to milliseconds - searchResult.aggregations.avgDuration.value / (1000 * 1000) - ); - - const aggsAvgEsSearchDuration = Math.round( - // @ts-expect-error aggegation type is not specified - searchResult.aggregations.avgEsSearchDuration.value - ); - const aggsAvgTotalSearchDuration = Math.round( - // @ts-expect-error aggegation type is not specified - searchResult.aggregations.avgTotalSearchDuration.value - ); - - const aggsGeneratedActionsPercentiles = - // @ts-expect-error aggegation type is not specified - searchResult.aggregations.percentileScheduledActions.values; - - const aggsAlertsPercentiles = - // @ts-expect-error aggegation type is not specified - searchResult.aggregations.percentileAlerts.values; - - const aggsByTypeBuckets = - // @ts-expect-error aggegation type is not specified - searchResult.aggregations.aggsByType.buckets; - - const executionFailuresAggregations = searchResult.aggregations as { - failuresByReason: { value: { reasons: Record> } }; - }; - - return { - countTotal: Object.keys(executionsAggregations.byRuleTypeId.value.ruleTypes).reduce( - (total: number, key: string) => - 
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + total, - 0 - ), - countByType: replaceDotSymbolsInRuleTypeIds( - executionsAggregations.byRuleTypeId.value.ruleTypes - ), - countTotalFailures: Object.keys( - executionFailuresAggregations.failuresByReason.value.reasons - ).reduce((total: number, reason: string) => { - const byRuleTypesRefs = - executionFailuresAggregations.failuresByReason.value.reasons[reason]; - const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce( - (totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10), - 0 - ); - return countByRuleTypes + total; - }, 0), - countFailuresByReason: Object.keys( - executionFailuresAggregations.failuresByReason.value.reasons - ).reduce( - // ES DSL aggregations are returned as `any` by esClient.search - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (obj: any, reason: string) => { - const byRuleTypesRefs = - executionFailuresAggregations.failuresByReason.value.reasons[reason]; - const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce( - (totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10), - 0 - ); - return { - ...obj, - [replaceDotSymbols(reason)]: countByRuleTypes, - }; - }, - {} - ), - countFailuresByReasonByType: Object.keys( - executionFailuresAggregations.failuresByReason.value.reasons - ).reduce( - // ES DSL aggregations are returned as `any` by esClient.search - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (obj: any, key: string) => ({ - ...obj, - [key]: replaceDotSymbolsInRuleTypeIds( - executionFailuresAggregations.failuresByReason.value.reasons[key] - ), - }), - {} - ), - avgExecutionTime: aggsAvgExecutionTime, - avgExecutionTimeByType: Object.keys( - executionsAggregations.byRuleTypeId.value.ruleTypes - ).reduce( - // ES DSL aggregations are returned as `any` by esClient.search - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (obj: any, key: string) => ({ - 
...obj, - [replaceDotSymbols(key)]: Math.round( - executionsAggregations.byRuleTypeId.value.ruleTypesDuration[key] / - parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) - ), - }), - {} - ), - avgEsSearchDuration: aggsAvgEsSearchDuration, - avgEsSearchDurationByType: Object.keys( - executionsAggregations.byRuleTypeId.value.ruleTypes - ).reduce( - // ES DSL aggregations are returned as `any` by esClient.search - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (obj: any, key: string) => ({ - ...obj, - [replaceDotSymbols(key)]: Math.round( - executionsAggregations.byRuleTypeId.value.ruleTypesEsSearchDuration[key] / - parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) - ), - }), - {} - ), - avgTotalSearchDuration: aggsAvgTotalSearchDuration, - avgTotalSearchDurationByType: Object.keys( - executionsAggregations.byRuleTypeId.value.ruleTypes - ).reduce( - // ES DSL aggregations are returned as `any` by esClient.search - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (obj: any, key: string) => ({ - ...obj, - [replaceDotSymbols(key)]: Math.round( - executionsAggregations.byRuleTypeId.value.ruleTypesTotalSearchDuration[key] / - parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) - ), - }), - {} - ), - generatedActionsPercentiles: Object.keys(aggsGeneratedActionsPercentiles).reduce( - // ES DSL aggregations are returned as `any` by esClient.search - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (acc: any, curr: string) => ({ - ...acc, - ...(percentileFieldNameMapping[curr] - ? 
{ [percentileFieldNameMapping[curr]]: aggsGeneratedActionsPercentiles[curr] } - : {}), - }), - {} - ), - generatedActionsPercentilesByType: parsePercentileAggsByRuleType( - aggsByTypeBuckets, - 'percentileScheduledActions.values' - ), - alertsPercentiles: Object.keys(aggsAlertsPercentiles).reduce( - // ES DSL aggregations are returned as `any` by esClient.search - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (acc: any, curr: string) => ({ - ...acc, - ...(percentileFieldNameMapping[curr] - ? { [percentileFieldNameMapping[curr]]: aggsAlertsPercentiles[curr] } - : {}), - }), - {} - ), - alertsPercentilesByType: parsePercentileAggsByRuleType( - aggsByTypeBuckets, - 'percentileAlerts.values' - ), - }; - } catch (err) { - logger.warn( - `Error executing alerting telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}` - ); - return { - countTotal: 0, - countByType: {}, - countTotalFailures: 0, - countFailuresByReason: {}, - countFailuresByReasonByType: {}, - avgExecutionTime: 0, - avgExecutionTimeByType: {}, - avgEsSearchDuration: 0, - avgEsSearchDurationByType: {}, - avgTotalSearchDuration: 0, - avgTotalSearchDurationByType: {}, - generatedActionsPercentiles: {}, - generatedActionsPercentilesByType: {}, - alertsPercentiles: {}, - alertsPercentilesByType: {}, - }; - } -} - -export async function getExecutionTimeoutsPerDayCount( - esClient: ElasticsearchClient, - eventLogIndex: string, - logger: Logger -) { - try { - const searchResult = await esClient.search({ - index: eventLogIndex, - size: 0, - body: { - query: { - bool: { - filter: { - bool: { - must: [ - { - term: { 'event.action': 'execute-timeout' }, - }, - { - term: { 'event.provider': 'alerting' }, - }, - { - range: { - '@timestamp': { - gte: 'now-1d', - }, - }, - }, - ], - }, - }, - }, - }, - aggs: { - byRuleTypeId: ruleTypeExecutionsMetric, - }, - }, - }); - - const executionsAggregations = searchResult.aggregations as { - byRuleTypeId: { - value: { ruleTypes: Record; 
ruleTypesDuration: Record }; - }; - }; - - return { - countTotal: Object.keys(executionsAggregations.byRuleTypeId.value.ruleTypes).reduce( - (total: number, key: string) => - parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + total, - 0 - ), - countByType: replaceDotSymbolsInRuleTypeIds( - executionsAggregations.byRuleTypeId.value.ruleTypes - ), - }; - } catch (err) { - logger.warn( - `Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ${JSON.stringify( - err - )}` - ); - return { - countTotal: 0, - countByType: {}, - }; - } -} - -export async function getFailedAndUnrecognizedTasksPerDay( - esClient: ElasticsearchClient, - taskManagerIndex: string, - logger: Logger -) { - try { - const searchResult = await esClient.search({ - index: taskManagerIndex, - size: 0, - body: { - query: { - bool: { - must: [ - { - bool: { - should: [ - { - term: { - 'task.status': 'unrecognized', - }, - }, - { - term: { - 'task.status': 'failed', - }, - }, - ], - }, - }, - { - wildcard: { - 'task.taskType': { - value: 'alerting:*', - }, - }, - }, - { - range: { - 'task.runAt': { - gte: 'now-1d', - }, - }, - }, - ], - }, - }, - aggs: { - byTaskTypeId: taskTypeExecutionsMetric, - }, - }, - }); - - const executionsAggregations = searchResult.aggregations as { - byTaskTypeId: { value: { statuses: Record> } }; - }; - - return { - countTotal: Object.keys(executionsAggregations.byTaskTypeId.value.statuses).reduce( - (total: number, status: string) => { - const byRuleTypesRefs = executionsAggregations.byTaskTypeId.value.statuses[status]; - const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce( - (totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10), - 0 - ); - return countByRuleTypes + total; - }, - 0 - ), - countByStatus: Object.keys(executionsAggregations.byTaskTypeId.value.statuses).reduce( - // ES DSL aggregations are returned as `any` by esClient.search - // eslint-disable-next-line 
@typescript-eslint/no-explicit-any - (obj: any, status: string) => { - const byRuleTypesRefs = executionsAggregations.byTaskTypeId.value.statuses[status]; - const countByRuleTypes = Object.keys(byRuleTypesRefs).reduce( - (totalByType, ruleType) => parseInt(byRuleTypesRefs[ruleType] + totalByType, 10), - 0 - ); - return { - ...obj, - [status]: countByRuleTypes, - }; - }, - {} - ), - countByStatusByRuleType: Object.keys( - executionsAggregations.byTaskTypeId.value.statuses - ).reduce( - // ES DSL aggregations are returned as `any` by esClient.search - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (obj: any, key: string) => ({ - ...obj, - [key]: replaceDotSymbolsInRuleTypeIds( - executionsAggregations.byTaskTypeId.value.statuses[key] - ), - }), - {} - ), - }; - } catch (err) { - logger.warn( - `Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify( - err - )}` - ); - return { - countTotal: 0, - countByStatus: {}, - countByStatusByRuleType: {}, - }; - } -} - -function replaceDotSymbols(strToReplace: string) { - return strToReplace.replaceAll('.', '__'); -} - -function replaceDotSymbolsInRuleTypeIds(ruleTypeIdObj: Record) { - return Object.keys(ruleTypeIdObj).reduce( - (obj, key) => ({ ...obj, [replaceDotSymbols(key)]: ruleTypeIdObj[key] }), - {} - ); -} - -export function parsePercentileAggsByRuleType( - aggsByType: estypes.AggregationsStringTermsBucketKeys[], - path: string -) { - return (aggsByType ?? []).reduce( - (acc, curr) => { - const percentiles = get(curr, path, {}); - return merge( - acc, - Object.keys(percentiles).reduce((pacc, pcurr) => { - return { - ...pacc, - ...(percentileFieldNameMapping[pcurr] - ? { - [percentileFieldNameMapping[pcurr]]: { - [replaceDotSymbols(curr.key)]: percentiles[pcurr] ?? 
0, - }, - } - : {}), - }; - }, {}) - ); - }, - { p50: {}, p90: {}, p99: {} } - ); -} diff --git a/x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts b/x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts index c3bca512a2bb..2c7f4db1d9fa 100644 --- a/x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts +++ b/x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts @@ -68,6 +68,8 @@ const byReasonSchema: MakeSchemaFrom['count_rules_executions_fail unknown: { type: 'long' }, }; +export const NUM_ALERTING_EXECUTION_FAILURE_REASON_TYPES = Object.keys(byReasonSchema).length; + const byPercentileSchema: MakeSchemaFrom['percentile_num_generated_actions_per_day'] = { p50: { type: 'long' }, diff --git a/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_event_log.test.ts b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_event_log.test.ts new file mode 100644 index 000000000000..754cab335f8e --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_event_log.test.ts @@ -0,0 +1,1524 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks'; +import { + getExecutionsPerDayCount, + parseExecutionFailureByRuleType, + parseRuleTypeBucket, + parsePercentileAggs, + parseExecutionCountAggregationResults, + getExecutionTimeoutsPerDayCount, +} from './get_telemetry_from_event_log'; + +const elasticsearch = elasticsearchServiceMock.createStart(); +const esClient = elasticsearch.client.asInternalUser; +const logger: ReturnType = loggingSystemMock.createLogger(); + +describe('event log telemetry', () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + + describe('parseRuleTypeBucket', () => { + test('should correctly parse rule type bucket results', () => { + expect( + parseRuleTypeBucket([ + { + key: '.index-threshold', + doc_count: 78, + avg_es_search_duration: { + value: 40.76056338028169, + }, + percentile_alerts: { + values: { + '50.0': 1, + '90.0': 1, + '99.0': 1, + }, + }, + execution_failures: { + doc_count: 7, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'execute', + doc_count: 4, + }, + { + key: 'decrypt', + doc_count: 3, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '90.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 100576923.07692307, + }, + avg_total_search_duration: { + value: 43.74647887323944, + }, + }, + { + key: 'document.test.', + doc_count: 42, + avg_es_search_duration: { + value: 0, + }, + percentile_alerts: { + values: { + '50.0': 5, + '90.0': 5, + '99.0': 5, + }, + }, + execution_failures: { + doc_count: 2, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 2, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 5, + '90.0': 5, + '99.0': 5, + }, + }, + avg_execution_time: { + value: 770071428.5714285, + }, + avg_total_search_duration: { + value: 0, + }, + }, + { + key: 
'logs.alert.document.count', + doc_count: 28, + avg_es_search_duration: { + value: 26.962962962962962, + }, + percentile_alerts: { + values: { + '50.0': 0, + '90.0': 0, + '99.0': 0, + }, + }, + execution_failures: { + doc_count: 1, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 1, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '90.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 88321428.57142857, + }, + avg_total_search_duration: { + value: 31.296296296296298, + }, + }, + ]) + ).toEqual({ + countRuleExecutionsByType: { + '__index-threshold': 78, + document__test__: 42, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 28, + }, + avgExecutionTimeByType: { + '__index-threshold': 101, + document__test__: 770, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 88, + }, + avgEsSearchDurationByType: { + '__index-threshold': 41, + document__test__: 0, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 27, + }, + avgTotalSearchDurationByType: { + '__index-threshold': 44, + document__test__: 0, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 31, + }, + generatedActionsPercentilesByType: { + p50: { + '__index-threshold': 0, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + p90: { + '__index-threshold': 0, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + p99: { + '__index-threshold': 0, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + }, + alertsPercentilesByType: { + p50: { + '__index-threshold': 1, + 
document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + p90: { + '__index-threshold': 1, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + p99: { + '__index-threshold': 1, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + }, + }); + }); + + test('should handle missing values', () => { + expect( + parseRuleTypeBucket([ + { + key: '.index-threshold', + doc_count: 78, + percentile_alerts: { + values: { + '50.0': 1, + '90.0': 1, + '99.0': 1, + }, + }, + execution_failures: { + doc_count: 7, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + // @ts-expect-error + { + key: 'execute', + }, + { + key: 'decrypt', + doc_count: 3, + }, + ], + }, + }, + // @ts-expect-error + percentile_scheduled_actions: {}, + avg_execution_time: { + value: 100576923.07692307, + }, + avg_total_search_duration: { + value: 43.74647887323944, + }, + }, + { + key: 'document.test.', + avg_es_search_duration: { + value: 0, + }, + percentile_alerts: { + values: { + '50.0': 5, + '90.0': 5, + '99.0': 5, + }, + }, + execution_failures: { + doc_count: 2, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 2, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 5, + '90.0': 5, + '99.0': 5, + }, + }, + // @ts-expect-error + avg_execution_time: {}, + avg_total_search_duration: { + value: 0, + }, + }, + // @ts-expect-error + { + key: 'logs.alert.document.count', + }, + ]) + ).toEqual({ + countRuleExecutionsByType: { + '__index-threshold': 78, + document__test__: 0, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + avgExecutionTimeByType: { + '__index-threshold': 101, + document__test__: 
0, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + avgEsSearchDurationByType: { + '__index-threshold': 0, + document__test__: 0, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + avgTotalSearchDurationByType: { + '__index-threshold': 44, + document__test__: 0, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + generatedActionsPercentilesByType: { + p50: { + document__test__: 5, + }, + p90: { + document__test__: 5, + }, + p99: { + document__test__: 5, + }, + }, + alertsPercentilesByType: { + p50: { + '__index-threshold': 1, + document__test__: 5, + }, + p90: { + '__index-threshold': 1, + document__test__: 5, + }, + p99: { + '__index-threshold': 1, + document__test__: 5, + }, + }, + }); + }); + + test('should handle empty input', () => { + expect(parseRuleTypeBucket([])).toEqual({ + countRuleExecutionsByType: {}, + avgExecutionTimeByType: {}, + avgEsSearchDurationByType: {}, + avgTotalSearchDurationByType: {}, + generatedActionsPercentilesByType: { + p50: {}, + p90: {}, + p99: {}, + }, + alertsPercentilesByType: { + p50: {}, + p90: {}, + p99: {}, + }, + }); + }); + + test('should handle undefined input', () => { + // @ts-expect-error + expect(parseRuleTypeBucket(undefined)).toEqual({ + countRuleExecutionsByType: {}, + avgExecutionTimeByType: {}, + avgEsSearchDurationByType: {}, + avgTotalSearchDurationByType: {}, + generatedActionsPercentilesByType: { + p50: {}, + p90: {}, + p99: {}, + }, + alertsPercentilesByType: { + p50: {}, + p90: {}, + p99: {}, + }, + }); + }); + }); + + describe('parsePercentileAggs', () => { + test('should correctly format percentile aggregation output', () => { + expect( + parsePercentileAggs( + { + '50.0': 1, + '90.0': 2, + '99.0': 3, + }, + 'ruleTypeId' + ) + ).toEqual({ + p50: { + ruleTypeId: 1, + }, + p90: { + ruleTypeId: 2, + }, + p99: { + ruleTypeId: 3, + }, + 
}); + }); + + test('should correctly format percentile aggregation output when no rule type is specified', () => { + expect( + parsePercentileAggs({ + '50.0': 1, + '90.0': 2, + '99.0': 3, + }) + ).toEqual({ + p50: 1, + p90: 2, + p99: 3, + }); + }); + + test('should handle unknown keys', () => { + expect( + parsePercentileAggs( + { + '50.0': 1, + '70.0': 2, + '99.0': 3, + }, + 'ruleTypeId' + ) + ).toEqual({ + p50: { + ruleTypeId: 1, + }, + p99: { + ruleTypeId: 3, + }, + }); + }); + + test('should handle empty input', () => { + expect(parsePercentileAggs({}, 'ruleTypeId')).toEqual({}); + }); + + test('should handle undefined input', () => { + expect( + parsePercentileAggs(undefined as unknown as Record, 'ruleTypeId') + ).toEqual({}); + }); + }); + + describe('parseExecutionFailureByRuleType', () => { + test('should format execution failures by rule type', () => { + expect( + parseExecutionFailureByRuleType([ + { + key: '.index-threshold', + doc_count: 78, + avg_es_search_duration: { + value: 40.76056338028169, + }, + percentile_alerts: { + values: { + '50.0': 1, + '95.0': 1, + '99.0': 1, + }, + }, + execution_failures: { + doc_count: 7, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'execute', + doc_count: 4, + }, + { + key: 'decrypt', + doc_count: 3, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 100576923.07692307, + }, + avg_total_search_duration: { + value: 43.74647887323944, + }, + }, + { + key: 'document.test.', + doc_count: 42, + avg_es_search_duration: { + value: 0, + }, + percentile_alerts: { + values: { + '50.0': 5, + '95.0': 5, + '99.0': 5, + }, + }, + execution_failures: { + doc_count: 2, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 2, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 5, + '95.0': 5, + 
'99.0': 5, + }, + }, + avg_execution_time: { + value: 770071428.5714285, + }, + avg_total_search_duration: { + value: 0, + }, + }, + { + key: 'logs.alert.document.count', + doc_count: 28, + avg_es_search_duration: { + value: 26.962962962962962, + }, + percentile_alerts: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + execution_failures: { + doc_count: 1, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 1, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 88321428.57142857, + }, + avg_total_search_duration: { + value: 31.296296296296298, + }, + }, + ]) + ).toEqual({ + countFailedExecutionsByReasonByType: { + decrypt: { + '__index-threshold': 3, + document__test__: 2, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 1, + }, + execute: { + '__index-threshold': 4, + }, + }, + }); + }); + + test('should handle results with some empty execution failures', () => { + expect( + parseExecutionFailureByRuleType([ + { + key: '.index-threshold', + doc_count: 78, + avg_es_search_duration: { + value: 40.76056338028169, + }, + percentile_alerts: { + values: { + '50.0': 1, + '95.0': 1, + '99.0': 1, + }, + }, + execution_failures: { + doc_count: 9, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 100576923.07692307, + }, + avg_total_search_duration: { + value: 43.74647887323944, + }, + }, + { + key: 'document.test.', + doc_count: 42, + avg_es_search_duration: { + value: 0, + }, + percentile_alerts: { + values: { + '50.0': 5, + '95.0': 5, + '99.0': 5, + }, + }, + execution_failures: { + doc_count: 2, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, 
+ buckets: [ + { + key: 'decrypt', + doc_count: 2, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 5, + '95.0': 5, + '99.0': 5, + }, + }, + avg_execution_time: { + value: 770071428.5714285, + }, + avg_total_search_duration: { + value: 0, + }, + }, + { + key: 'logs.alert.document.count', + doc_count: 28, + avg_es_search_duration: { + value: 26.962962962962962, + }, + percentile_alerts: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + execution_failures: { + doc_count: 1, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 1, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 88321428.57142857, + }, + avg_total_search_duration: { + value: 31.296296296296298, + }, + }, + ]) + ).toEqual({ + countFailedExecutionsByReasonByType: { + decrypt: { + document__test__: 2, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 1, + }, + }, + }); + }); + + test('should handle results with empty execution failures', () => { + expect( + parseExecutionFailureByRuleType([ + { + key: '.index-threshold', + doc_count: 78, + avg_es_search_duration: { + value: 40.76056338028169, + }, + percentile_alerts: { + values: { + '50.0': 1, + '95.0': 1, + '99.0': 1, + }, + }, + execution_failures: { + doc_count: 0, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 100576923.07692307, + }, + avg_total_search_duration: { + value: 43.74647887323944, + }, + }, + { + key: 'document.test.', + doc_count: 42, + avg_es_search_duration: { + value: 0, + }, + percentile_alerts: { + values: { + '50.0': 5, + '95.0': 5, + '99.0': 5, + }, + }, + execution_failures: { + doc_count: 0, + 
by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 5, + '95.0': 5, + '99.0': 5, + }, + }, + avg_execution_time: { + value: 770071428.5714285, + }, + avg_total_search_duration: { + value: 0, + }, + }, + { + key: 'logs.alert.document.count', + doc_count: 28, + avg_es_search_duration: { + value: 26.962962962962962, + }, + percentile_alerts: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + execution_failures: { + doc_count: 0, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 88321428.57142857, + }, + avg_total_search_duration: { + value: 31.296296296296298, + }, + }, + ]) + ).toEqual({ countFailedExecutionsByReasonByType: {} }); + }); + + test('should handle results with no execution failures', () => { + expect( + parseExecutionFailureByRuleType([ + // @ts-expect-error + { + key: '.index-threshold', + doc_count: 78, + avg_es_search_duration: { + value: 40.76056338028169, + }, + percentile_alerts: { + values: { + '50.0': 1, + '95.0': 1, + '99.0': 1, + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 100576923.07692307, + }, + avg_total_search_duration: { + value: 43.74647887323944, + }, + }, + // @ts-expect-error + { + key: 'document.test.', + doc_count: 42, + avg_es_search_duration: { + value: 0, + }, + percentile_alerts: { + values: { + '50.0': 5, + '95.0': 5, + '99.0': 5, + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 5, + '95.0': 5, + '99.0': 5, + }, + }, + avg_execution_time: { + value: 770071428.5714285, + }, + avg_total_search_duration: { + value: 0, + }, + }, + // @ts-expect-error + { + key: 'logs.alert.document.count', + doc_count: 28, + avg_es_search_duration: { + 
value: 26.962962962962962, + }, + percentile_alerts: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '95.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 88321428.57142857, + }, + avg_total_search_duration: { + value: 31.296296296296298, + }, + }, + ]) + ).toEqual({ countFailedExecutionsByReasonByType: {} }); + }); + + test('should handle empty input', () => { + expect(parseExecutionFailureByRuleType([])).toEqual({ + countFailedExecutionsByReasonByType: {}, + }); + }); + + test('should handle undefined input', () => { + // @ts-expect-error + expect(parseExecutionFailureByRuleType(undefined)).toEqual({ + countFailedExecutionsByReasonByType: {}, + }); + }); + }); + + describe('parseExecutionCountAggregationResults', () => { + test('should correctly format aggregation results', () => { + expect( + parseExecutionCountAggregationResults({ + avg_es_search_duration: { + value: 26.246376811594203, + }, + percentile_alerts: { + values: { + '50.0': 1, + '90.0': 5, + '99.0': 5, + }, + }, + execution_failures: { + doc_count: 10, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 6, + }, + { + key: 'execute', + doc_count: 4, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '90.0': 5, + '99.0': 5, + }, + }, + avg_execution_time: { + value: 288250000, + }, + avg_total_search_duration: { + value: 28.630434782608695, + }, + }) + ).toEqual({ + countTotalFailedExecutions: 10, + countFailedExecutionsByReason: { + decrypt: 6, + execute: 4, + }, + avgExecutionTime: 288, + avgEsSearchDuration: 26, + avgTotalSearchDuration: 29, + generatedActionsPercentiles: { + p50: 0, + p90: 5, + p99: 5, + }, + alertsPercentiles: { + p50: 1, + p90: 5, + p99: 5, + }, + }); + }); + + test('should handle missing values', () => { + expect( + parseExecutionCountAggregationResults({ + // @ts-expect-error + 
avg_es_search_duration: {}, + percentile_alerts: { + values: { + '50.0': 1, + '70.0': 5, + '99.0': 5, + }, + }, + execution_failures: { + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + // @ts-expect-error + { + key: 'decrypt', + }, + { + key: 'execute', + doc_count: 4, + }, + ], + }, + }, + // @ts-expect-error + percentile_scheduled_actions: {}, + avg_total_search_duration: { + value: 28.630434782608695, + }, + }) + ).toEqual({ + countTotalFailedExecutions: 0, + countFailedExecutionsByReason: { + decrypt: 0, + execute: 4, + }, + avgExecutionTime: 0, + avgEsSearchDuration: 0, + avgTotalSearchDuration: 29, + generatedActionsPercentiles: {}, + alertsPercentiles: { + p50: 1, + p99: 5, + }, + }); + }); + + test('should handle empty input', () => { + // @ts-expect-error + expect(parseExecutionCountAggregationResults({})).toEqual({ + countTotalFailedExecutions: 0, + countFailedExecutionsByReason: {}, + avgExecutionTime: 0, + avgEsSearchDuration: 0, + avgTotalSearchDuration: 0, + generatedActionsPercentiles: {}, + alertsPercentiles: {}, + }); + }); + + test('should handle undefined input', () => { + // @ts-expect-error + expect(parseExecutionCountAggregationResults(undefined)).toEqual({ + countTotalFailedExecutions: 0, + countFailedExecutionsByReason: {}, + avgExecutionTime: 0, + avgEsSearchDuration: 0, + avgTotalSearchDuration: 0, + generatedActionsPercentiles: {}, + alertsPercentiles: {}, + }); + }); + }); + + describe('getExecutionsPerDayCount', () => { + test('should return counts of executions, failed executions and stats about execution durations', async () => { + esClient.search.mockResponse({ + took: 4, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 0, + failed: 0, + }, + hits: { + total: { + value: 148, + relation: 'eq', + }, + max_score: null, + hits: [], + }, + aggregations: { + by_rule_type_id: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 
'.index-threshold', + doc_count: 78, + avg_es_search_duration: { + value: 40.76056338028169, + }, + percentile_alerts: { + values: { + '50.0': 1, + '90.0': 1, + '99.0': 1, + }, + }, + execution_failures: { + doc_count: 7, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'execute', + doc_count: 4, + }, + { + key: 'decrypt', + doc_count: 3, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '90.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 100576923.07692307, + }, + avg_total_search_duration: { + value: 43.74647887323944, + }, + }, + { + key: 'document.test.', + doc_count: 42, + avg_es_search_duration: { + value: 0, + }, + percentile_alerts: { + values: { + '50.0': 5, + '90.0': 5, + '99.0': 5, + }, + }, + execution_failures: { + doc_count: 2, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 2, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 5, + '90.0': 5, + '99.0': 5, + }, + }, + avg_execution_time: { + value: 770071428.5714285, + }, + avg_total_search_duration: { + value: 0, + }, + }, + { + key: 'logs.alert.document.count', + doc_count: 28, + avg_es_search_duration: { + value: 26.962962962962962, + }, + percentile_alerts: { + values: { + '50.0': 0, + '90.0': 0, + '99.0': 0, + }, + }, + execution_failures: { + doc_count: 1, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 1, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '90.0': 0, + '99.0': 0, + }, + }, + avg_execution_time: { + value: 88321428.57142857, + }, + avg_total_search_duration: { + value: 31.296296296296298, + }, + }, + ], + }, + avg_es_search_duration: { + value: 26.246376811594203, + }, + percentile_alerts: { + values: { + '50.0': 1, + '90.0': 5, + '99.0': 5, + }, + }, + execution_failures: { + 
doc_count: 10, + by_reason: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'decrypt', + doc_count: 6, + }, + { + key: 'execute', + doc_count: 4, + }, + ], + }, + }, + percentile_scheduled_actions: { + values: { + '50.0': 0, + '90.0': 5, + '99.0': 5, + }, + }, + avg_execution_time: { + value: 288250000, + }, + avg_total_search_duration: { + value: 28.630434782608695, + }, + }, + }); + + const telemetry = await getExecutionsPerDayCount({ + esClient, + eventLogIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + + expect(telemetry).toStrictEqual({ + countTotalRuleExecutions: 148, + countRuleExecutionsByType: { + '__index-threshold': 78, + document__test__: 42, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 28, + }, + countTotalFailedExecutions: 10, + countFailedExecutionsByReason: { + decrypt: 6, + execute: 4, + }, + countFailedExecutionsByReasonByType: { + decrypt: { + '__index-threshold': 3, + document__test__: 2, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 1, + }, + execute: { + '__index-threshold': 4, + }, + }, + avgExecutionTime: 288, + avgEsSearchDuration: 26, + avgTotalSearchDuration: 29, + avgExecutionTimeByType: { + '__index-threshold': 101, + document__test__: 770, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 88, + }, + avgEsSearchDurationByType: { + '__index-threshold': 41, + document__test__: 0, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 27, + }, + avgTotalSearchDurationByType: { + '__index-threshold': 44, + document__test__: 0, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 31, + }, + generatedActionsPercentiles: { + p50: 0, + p90: 5, + p99: 5, + }, + alertsPercentiles: { + p50: 1, + p90: 5, + p99: 5, + }, + 
generatedActionsPercentilesByType: { + p50: { + '__index-threshold': 0, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + p90: { + '__index-threshold': 0, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + p99: { + '__index-threshold': 0, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + }, + + alertsPercentilesByType: { + p50: { + '__index-threshold': 1, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + p90: { + '__index-threshold': 1, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + p99: { + '__index-threshold': 1, + document__test__: 5, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 0, + }, + }, + }); + }); + + test('should return empty results and log warning if query throws error', async () => { + esClient.search.mockRejectedValue(new Error('oh no')); + + const telemetry = await getExecutionsPerDayCount({ + esClient, + eventLogIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + const loggerCall = logger.warn.mock.calls[0][0]; + const loggerMeta = logger.warn.mock.calls[0][1]; + expect(loggerCall as string).toMatchInlineSnapshot( + `"Error executing alerting telemetry task: getExecutionsPerDayCount - {}"` + ); + expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']); + expect(loggerMeta?.error?.stack_trace).toBeDefined(); + expect(telemetry).toStrictEqual({ + countTotalRuleExecutions: 0, + countRuleExecutionsByType: {}, + countTotalFailedExecutions: 0, + countFailedExecutionsByReason: {}, + countFailedExecutionsByReasonByType: {}, + avgExecutionTime: 0, + 
avgExecutionTimeByType: {}, + avgEsSearchDuration: 0, + avgEsSearchDurationByType: {}, + avgTotalSearchDuration: 0, + avgTotalSearchDurationByType: {}, + generatedActionsPercentiles: {}, + generatedActionsPercentilesByType: {}, + alertsPercentiles: {}, + alertsPercentilesByType: {}, + }); + }); + }); + + describe('getExecutionTimeoutsPerDayCount', () => { + test('should return counts of timed out executions and counts by type', async () => { + esClient.search.mockResponse({ + took: 4, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 0, + failed: 0, + }, + hits: { + total: { + value: 4, + relation: 'eq', + }, + max_score: null, + hits: [], + }, + aggregations: { + by_rule_type_id: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: '.index-threshold', + doc_count: 2, + }, + { + key: 'logs.alert.document.count', + doc_count: 1, + }, + { + key: 'document.test.', + doc_count: 1, + }, + ], + }, + }, + }); + + const telemetry = await getExecutionTimeoutsPerDayCount({ + esClient, + eventLogIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + + expect(telemetry).toStrictEqual({ + countExecutionTimeouts: 4, + countExecutionTimeoutsByType: { + '__index-threshold': 2, + document__test__: 1, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 1, + }, + }); + }); + + test('should return empty results and log warning if query throws error', async () => { + esClient.search.mockRejectedValue(new Error('oh no')); + + const telemetry = await getExecutionTimeoutsPerDayCount({ + esClient, + eventLogIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + const loggerCall = logger.warn.mock.calls[0][0]; + const loggerMeta = logger.warn.mock.calls[0][1]; + expect(loggerCall as string).toMatchInlineSnapshot( + `"Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - {}"` + ); + 
expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']); + expect(loggerMeta?.error?.stack_trace).toBeDefined(); + expect(telemetry).toStrictEqual({ + countExecutionTimeouts: 0, + countExecutionTimeoutsByType: {}, + }); + }); + }); +}); diff --git a/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_event_log.ts b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_event_log.ts new file mode 100644 index 000000000000..a6c3e90cfc1c --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_event_log.ts @@ -0,0 +1,583 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { flatMap, merge } from 'lodash'; +import type { + AggregationsKeyedPercentiles, + AggregationsSingleBucketAggregateBase, + AggregationsPercentilesAggregateBase, + AggregationsSingleMetricAggregateBase, + AggregationsTermsAggregateBase, + AggregationsStringTermsBucketKeys, + AggregationsBuckets, +} from '@elastic/elasticsearch/lib/api/typesWithBodyKey'; +import { ElasticsearchClient, Logger } from '@kbn/core/server'; +import { + NUM_ALERTING_RULE_TYPES, + NUM_ALERTING_EXECUTION_FAILURE_REASON_TYPES, +} from '../alerting_usage_collector'; +import { replaceDotSymbols } from './replace_dots_with_underscores'; +import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket'; + +const Millis2Nanos = 1000 * 1000; +const percentileFieldNameMapping: Record = { + '50.0': 'p50', + '90.0': 'p90', + '99.0': 'p99', +}; + +interface Opts { + esClient: ElasticsearchClient; + eventLogIndex: string; + logger: Logger; +} + +interface GetExecutionsPerDayCountResults { + countTotalRuleExecutions: number; + countRuleExecutionsByType: Record; + countTotalFailedExecutions: number; + countFailedExecutionsByReason: Record; + 
countFailedExecutionsByReasonByType: Record>; + avgExecutionTime: number; + avgExecutionTimeByType: Record; + avgEsSearchDuration: number; + avgEsSearchDurationByType: Record; + avgTotalSearchDuration: number; + avgTotalSearchDurationByType: Record; + generatedActionsPercentiles: Record; + generatedActionsPercentilesByType: Record>; + alertsPercentiles: Record; + alertsPercentilesByType: Record>; +} + +interface GetExecutionTimeoutsPerDayCountResults { + countExecutionTimeouts: number; + countExecutionTimeoutsByType: Record; +} + +interface GetExecutionCountsExecutionFailures extends AggregationsSingleBucketAggregateBase { + by_reason: AggregationsTermsAggregateBase; +} +interface GetExecutionCountsAggregationBucket extends AggregationsStringTermsBucketKeys { + avg_execution_time: AggregationsSingleMetricAggregateBase; + avg_es_search_duration: AggregationsSingleMetricAggregateBase; + avg_total_search_duration: AggregationsSingleMetricAggregateBase; + execution_failures: GetExecutionCountsExecutionFailures; + percentile_scheduled_actions: AggregationsPercentilesAggregateBase; + percentile_alerts: AggregationsPercentilesAggregateBase; +} + +interface IGetExecutionFailures extends AggregationsSingleBucketAggregateBase { + by_reason: AggregationsTermsAggregateBase; +} + +export async function getExecutionsPerDayCount({ + esClient, + eventLogIndex, + logger, +}: Opts): Promise { + try { + const eventLogAggs = { + avg_execution_time: { + avg: { + field: 'event.duration', + }, + }, + avg_es_search_duration: { + avg: { + field: 'kibana.alert.rule.execution.metrics.es_search_duration_ms', + }, + }, + avg_total_search_duration: { + avg: { + field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms', + }, + }, + + percentile_scheduled_actions: { + percentiles: { + field: 'kibana.alert.rule.execution.metrics.number_of_generated_actions', + percents: [50, 90, 99], + }, + }, + percentile_alerts: { + percentiles: { + field: 
'kibana.alert.rule.execution.metrics.alert_counts.active', + percents: [50, 90, 99], + }, + }, + execution_failures: { + filter: { + term: { + 'event.outcome': 'failure', + }, + }, + aggs: { + by_reason: { + terms: { + field: 'event.reason', + size: NUM_ALERTING_EXECUTION_FAILURE_REASON_TYPES, + }, + }, + }, + }, + }; + + const query = { + index: eventLogIndex, + size: 0, + body: { + query: getProviderAndActionFilterForTimeRange('execute'), + aggs: { + ...eventLogAggs, + by_rule_type_id: { + terms: { + field: 'rule.category', + size: NUM_ALERTING_RULE_TYPES, + }, + aggs: eventLogAggs, + }, + }, + }, + }; + + logger.debug(`query for getExecutionsPerDayCount - ${JSON.stringify(query)}`); + const results = await esClient.search(query); + + logger.debug(`results for getExecutionsPerDayCount query - ${JSON.stringify(results)}`); + + const totalRuleExecutions = + typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value; + + const aggregations = results.aggregations as { + by_rule_type_id: AggregationsTermsAggregateBase; + execution_failures: IGetExecutionFailures; + percentile_scheduled_actions: AggregationsPercentilesAggregateBase; + percentile_alerts: AggregationsPercentilesAggregateBase; + avg_execution_time: AggregationsSingleMetricAggregateBase; + avg_es_search_duration: AggregationsSingleMetricAggregateBase; + avg_total_search_duration: AggregationsSingleMetricAggregateBase; + }; + + const aggregationsByRuleTypeId: AggregationsBuckets = + aggregations.by_rule_type_id.buckets as GetExecutionCountsAggregationBucket[]; + + return { + ...parseRuleTypeBucket(aggregationsByRuleTypeId), + ...parseExecutionFailureByRuleType(aggregationsByRuleTypeId), + ...parseExecutionCountAggregationResults(aggregations), + countTotalRuleExecutions: totalRuleExecutions ?? 
0, + }; + } catch (err) { + logger.warn( + `Error executing alerting telemetry task: getExecutionsPerDayCount - ${JSON.stringify(err)}`, + { + tags: ['alerting', 'telemetry-failed'], + error: { stack_trace: err.stack }, + } + ); + return { + countTotalRuleExecutions: 0, + countRuleExecutionsByType: {}, + countTotalFailedExecutions: 0, + countFailedExecutionsByReason: {}, + countFailedExecutionsByReasonByType: {}, + avgExecutionTime: 0, + avgExecutionTimeByType: {}, + avgEsSearchDuration: 0, + avgEsSearchDurationByType: {}, + avgTotalSearchDuration: 0, + avgTotalSearchDurationByType: {}, + generatedActionsPercentiles: {}, + generatedActionsPercentilesByType: {}, + alertsPercentiles: {}, + alertsPercentilesByType: {}, + }; + } +} + +export async function getExecutionTimeoutsPerDayCount({ + esClient, + eventLogIndex, + logger, +}: Opts): Promise { + try { + const query = { + index: eventLogIndex, + size: 0, + body: { + query: getProviderAndActionFilterForTimeRange('execute-timeout'), + aggs: { + by_rule_type_id: { + terms: { + field: 'rule.category', + size: NUM_ALERTING_RULE_TYPES, + }, + }, + }, + }, + }; + + logger.debug(`query for getExecutionTimeoutsPerDayCount - ${JSON.stringify(query)}`); + const results = await esClient.search(query); + + logger.debug(`results for getExecutionTimeoutsPerDayCount query - ${JSON.stringify(results)}`); + + const aggregations = results.aggregations as { + by_rule_type_id: AggregationsTermsAggregateBase; + }; + + const totalTimedoutExecutionsCount = + typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value; + + return { + countExecutionTimeouts: totalTimedoutExecutionsCount ?? 
0, + countExecutionTimeoutsByType: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets), + }; + } catch (err) { + logger.warn( + `Error executing alerting telemetry task: getExecutionsTimeoutsPerDayCount - ${JSON.stringify( + err + )}`, + { + tags: ['alerting', 'telemetry-failed'], + error: { stack_trace: err.stack }, + } + ); + return { + countExecutionTimeouts: 0, + countExecutionTimeoutsByType: {}, + }; + } +} + +/** + * Bucket format: + * { + * key: '.index-threshold', // rule type id + * doc_count: 78, // count of number of executions + * avg_es_search_duration: { // average es search duration across executions + * value: 40.76056338028169, + * }, + * percentile_alerts: { // stats for number of alerts created across executions + * values: { + * '50.0': 1, + * '90.0': 1, + * '99.0': 1, + * }, + * }, + * execution_failures: { + * doc_count: 7, // count of number of failed executions + * by_reason: { + * doc_count_error_upper_bound: 0, + * sum_other_doc_count: 0, + * buckets: [ + * { + * key: 'execute', // breakdown of reason for execution failures + * doc_count: 4, + * }, + * { + * key: 'decrypt', + * doc_count: 3, + * }, + * ], + * }, + * }, + * percentile_scheduled_actions: { // stats for number of actions generated across executions + * values: { + * '50.0': 0, + * '90.0': 0, + * '99.0': 0, + * }, + * }, + * avg_execution_time: { // average execution time in nanoseconds across executions + * value: 100576923.07692307, + * }, + * avg_total_search_duration: { // average total search duration across executions + * value: 43.74647887323944, + * }, + * } + */ + +export function parseRuleTypeBucket( + buckets: GetExecutionCountsAggregationBucket[] +): Pick< + GetExecutionsPerDayCountResults, + | 'countRuleExecutionsByType' + | 'avgExecutionTimeByType' + | 'avgEsSearchDurationByType' + | 'avgTotalSearchDurationByType' + | 'generatedActionsPercentilesByType' + | 'alertsPercentilesByType' +> { + let summary = { + countRuleExecutionsByType: {}, +
avgExecutionTimeByType: {}, + avgEsSearchDurationByType: {}, + avgTotalSearchDurationByType: {}, + generatedActionsPercentilesByType: { p50: {}, p90: {}, p99: {} }, + alertsPercentilesByType: { p50: {}, p90: {}, p99: {} }, + }; + for (const bucket of buckets ?? []) { + const ruleType: string = replaceDotSymbols(bucket?.key) ?? ''; + const numExecutions: number = bucket?.doc_count ?? 0; + const avgExecutionTimeNanos = bucket?.avg_execution_time?.value ?? 0; + const avgEsSearchTimeMillis = bucket?.avg_es_search_duration?.value ?? 0; + const avgTotalSearchTimeMillis = bucket?.avg_total_search_duration?.value ?? 0; + const actionPercentiles = bucket?.percentile_scheduled_actions?.values ?? {}; + const alertPercentiles = bucket?.percentile_alerts?.values ?? {}; + + summary = { + countRuleExecutionsByType: { + ...summary.countRuleExecutionsByType, + [ruleType]: numExecutions, + }, + avgExecutionTimeByType: { + ...summary.avgExecutionTimeByType, + [ruleType]: Math.round(avgExecutionTimeNanos / Millis2Nanos), + }, + avgEsSearchDurationByType: { + ...summary.avgEsSearchDurationByType, + [ruleType]: Math.round(avgEsSearchTimeMillis), + }, + avgTotalSearchDurationByType: { + ...summary.avgTotalSearchDurationByType, + [ruleType]: Math.round(avgTotalSearchTimeMillis), + }, + generatedActionsPercentilesByType: merge( + summary.generatedActionsPercentilesByType, + parsePercentileAggs(actionPercentiles as AggregationsKeyedPercentiles, ruleType) + ), + alertsPercentilesByType: merge( + summary.alertsPercentilesByType, + parsePercentileAggs(alertPercentiles as AggregationsKeyedPercentiles, ruleType) + ), + }; + } + + return summary; +} + +interface FlattenedExecutionFailureBucket { + ruleType: string; + key: string; + doc_count: number; +} + +export function parseExecutionFailureByRuleType( + buckets: GetExecutionCountsAggregationBucket[] +): Pick { + const executionFailuresWithRuleTypeBuckets: FlattenedExecutionFailureBucket[] = flatMap( + buckets ?? 
[], + (bucket) => { + const ruleType: string = replaceDotSymbols(bucket.key); + + /** + * Execution failure bucket format + * [ + * { + * key: 'execute', + * doc_count: 4, + * }, + * { + * key: 'decrypt', + * doc_count: 3, + * }, + * ] + */ + + const executionFailuresBuckets = bucket?.execution_failures?.by_reason + ?.buckets as AggregationsStringTermsBucketKeys[]; + return (executionFailuresBuckets ?? []).map((b) => ({ ...b, ruleType })); + } + ); + + const parsedFailures = (executionFailuresWithRuleTypeBuckets ?? []).reduce( + (acc: Record>, bucket: FlattenedExecutionFailureBucket) => { + const ruleType: string = bucket.ruleType; + const reason: string = bucket.key; + + if (acc[reason]) { + if (acc[reason][ruleType]) { + return { + ...acc, + [reason]: { + ...acc[reason], + [ruleType]: acc[reason][ruleType] + bucket.doc_count, + }, + }; + } + return { + ...acc, + [reason]: { + ...acc[reason], + [ruleType]: bucket.doc_count, + }, + }; + } + return { + ...acc, + [reason]: { + [ruleType]: bucket.doc_count, + }, + }; + }, + {} + ); + + return { + countFailedExecutionsByReasonByType: parsedFailures, + }; +} + +export function parsePercentileAggs( + percentiles: AggregationsKeyedPercentiles, + ruleTypeId?: string +) { + return Object.keys(percentiles ?? {}).reduce((acc, percentileKey: string) => { + let result = {}; + const percentileKeyMapped = percentileFieldNameMapping[percentileKey]; + if (percentileKeyMapped) { + if (ruleTypeId) { + result = { + [percentileKeyMapped]: { + [ruleTypeId]: percentiles[percentileKey] ?? 0, + }, + }; + } else { + result = { + [percentileKeyMapped]: percentiles[percentileKey] ?? 
0, + }; + } + } + return { + ...acc, + ...result, + }; + }, {}); +} + +/** + * Aggregation Result Format (minus rule type id agg buckets) + * { + * avg_es_search_duration: { + * value: 26.246376811594203, + * }, + * percentile_alerts: { + * values: { + * '50.0': 1, + * '90.0': 5, + * '99.0': 5, + * }, + * }, + * execution_failures: { + * doc_count: 10, + * by_reason: { + * doc_count_error_upper_bound: 0, + * sum_other_doc_count: 0, + * buckets: [ + * { + * key: 'decrypt', + * doc_count: 6, + * }, + * { + * key: 'execute', + * doc_count: 4, + * }, + * ], + * }, + * }, + * percentile_scheduled_actions: { + * values: { + * '50.0': 0, + * '90.0': 5, + * '99.0': 5, + * }, + * }, + * avg_execution_time: { + * value: 288250000, + * }, + * avg_total_search_duration: { + * value: 28.630434782608695, + * }, + */ +export function parseExecutionCountAggregationResults(results: { + execution_failures: IGetExecutionFailures; + percentile_scheduled_actions: AggregationsPercentilesAggregateBase; + percentile_alerts: AggregationsPercentilesAggregateBase; + avg_execution_time: AggregationsSingleMetricAggregateBase; + avg_es_search_duration: AggregationsSingleMetricAggregateBase; + avg_total_search_duration: AggregationsSingleMetricAggregateBase; +}): Pick< + GetExecutionsPerDayCountResults, + | 'countTotalFailedExecutions' + | 'countFailedExecutionsByReason' + | 'avgExecutionTime' + | 'avgEsSearchDuration' + | 'avgTotalSearchDuration' + | 'generatedActionsPercentiles' + | 'alertsPercentiles' +> { + const avgExecutionTimeNanos = results?.avg_execution_time?.value ?? 0; + const avgEsSearchDurationMillis = results?.avg_es_search_duration?.value ?? 0; + const avgTotalSearchDurationMillis = results?.avg_total_search_duration?.value ?? 0; + const executionFailuresByReasonBuckets = + (results?.execution_failures?.by_reason?.buckets as AggregationsStringTermsBucketKeys[]) ?? []; + const actionPercentiles = results?.percentile_scheduled_actions?.values ??
{}; + const alertPercentiles = results?.percentile_alerts?.values ?? {}; + + return { + countTotalFailedExecutions: results?.execution_failures?.doc_count ?? 0, + countFailedExecutionsByReason: executionFailuresByReasonBuckets.reduce( + (acc: Record, bucket: AggregationsStringTermsBucketKeys) => { + const reason: string = bucket.key; + return { + ...acc, + [reason]: bucket.doc_count ?? 0, + }; + }, + {} + ), + avgExecutionTime: Math.round(avgExecutionTimeNanos / Millis2Nanos), + avgEsSearchDuration: Math.round(avgEsSearchDurationMillis), + avgTotalSearchDuration: Math.round(avgTotalSearchDurationMillis), + generatedActionsPercentiles: parsePercentileAggs( + actionPercentiles as AggregationsKeyedPercentiles + ), + alertsPercentiles: parsePercentileAggs(alertPercentiles as AggregationsKeyedPercentiles), + }; +} + +function getProviderAndActionFilterForTimeRange( + action: string, + provider: string = 'alerting', + range: string = '1d' +) { + return { + bool: { + filter: { + bool: { + must: [ + { + term: { 'event.action': action }, + }, + { + term: { 'event.provider': provider }, + }, + { + range: { + '@timestamp': { + gte: `now-${range}`, + }, + }, + }, + ], + }, + }, + }, + }; +} diff --git a/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_kibana.test.ts b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_kibana.test.ts new file mode 100644 index 000000000000..4b17875bc8b6 --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_kibana.test.ts @@ -0,0 +1,249 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks'; +import { getTotalCountAggregations, getTotalCountInUse } from './get_telemetry_from_kibana'; + +const elasticsearch = elasticsearchServiceMock.createStart(); +const esClient = elasticsearch.client.asInternalUser; +const logger: ReturnType = loggingSystemMock.createLogger(); + +describe('kibana index telemetry', () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + + describe('getTotalCountAggregations', () => { + test('should return rule counts by rule type id, stats about schedule and throttle intervals and number of actions', async () => { + esClient.search.mockResponseOnce({ + took: 4, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 0, + failed: 0, + }, + hits: { + total: { + value: 4, + relation: 'eq', + }, + max_score: null, + hits: [], + }, + aggregations: { + by_rule_type_id: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: '.index-threshold', + doc_count: 2, + }, + { + key: 'logs.alert.document.count', + doc_count: 1, + }, + { + key: 'document.test.', + doc_count: 1, + }, + ], + }, + max_throttle_time: { value: 60 }, + min_throttle_time: { value: 0 }, + avg_throttle_time: { value: 30 }, + max_interval_time: { value: 10 }, + min_interval_time: { value: 1 }, + avg_interval_time: { value: 4.5 }, + max_actions_count: { value: 4 }, + min_actions_count: { value: 0 }, + avg_actions_count: { value: 2.5 }, + }, + }); + + const telemetry = await getTotalCountAggregations({ + esClient, + kibanaIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + + expect(telemetry).toEqual({ + connectors_per_alert: { + avg: 2.5, + max: 4, + min: 0, + }, + count_by_type: { + '__index-threshold': 2, + document__test__: 1, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 1, + }, + count_total: 4, + schedule_time: { + avg: '4.5s', + max: '10s', 
+ min: '1s', + }, + schedule_time_number_s: { + avg: 4.5, + max: 10, + min: 1, + }, + throttle_time: { + avg: '30s', + max: '60s', + min: '0s', + }, + throttle_time_number_s: { + avg: 30, + max: 60, + min: 0, + }, + }); + }); + + test('should return empty results and log warning if query throws error', async () => { + esClient.search.mockRejectedValueOnce(new Error('oh no')); + + const telemetry = await getTotalCountAggregations({ + esClient, + kibanaIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + const loggerCall = logger.warn.mock.calls[0][0]; + const loggerMeta = logger.warn.mock.calls[0][1]; + expect(loggerCall as string).toMatchInlineSnapshot( + `"Error executing alerting telemetry task: getTotalCountAggregations - {}"` + ); + expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']); + expect(loggerMeta?.error?.stack_trace).toBeDefined(); + expect(telemetry).toEqual({ + connectors_per_alert: { + avg: 0, + max: 0, + min: 0, + }, + count_by_type: {}, + count_total: 0, + schedule_time: { + avg: '0s', + max: '0s', + min: '0s', + }, + schedule_time_number_s: { + avg: 0, + max: 0, + min: 0, + }, + throttle_time: { + avg: '0s', + max: '0s', + min: '0s', + }, + throttle_time_number_s: { + avg: 0, + max: 0, + min: 0, + }, + }); + }); + }); + + describe('getTotalCountInUse', () => { + test('should return enabled rule counts by rule type id and number of namespaces', async () => { + esClient.search.mockResponseOnce({ + took: 4, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 0, + failed: 0, + }, + hits: { + total: { + value: 4, + relation: 'eq', + }, + max_score: null, + hits: [], + }, + aggregations: { + namespaces_count: { value: 1 }, + by_rule_type_id: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: '.index-threshold', + doc_count: 2, + }, + { + key: 'logs.alert.document.count', + doc_count: 1, + }, + { + key: 'document.test.', + doc_count: 1, + }, + ], + }, 
+ }, + }); + + const telemetry = await getTotalCountInUse({ + esClient, + kibanaIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + + expect(telemetry).toStrictEqual({ + countByType: { + '__index-threshold': 2, + document__test__: 1, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 1, + }, + countNamespaces: 1, + countTotal: 4, + }); + }); + + test('should return empty results and log warning if query throws error', async () => { + esClient.search.mockRejectedValueOnce(new Error('oh no')); + + const telemetry = await getTotalCountInUse({ + esClient, + kibanaIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + const loggerCall = logger.warn.mock.calls[0][0]; + const loggerMeta = logger.warn.mock.calls[0][1]; + expect(loggerCall as string).toMatchInlineSnapshot( + `"Error executing alerting telemetry task: getTotalCountInUse - {}"` + ); + expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']); + expect(loggerMeta?.error?.stack_trace).toBeDefined(); + expect(telemetry).toStrictEqual({ + countByType: {}, + countNamespaces: 0, + countTotal: 0, + }); + }); + }); +}); diff --git a/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_kibana.ts b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_kibana.ts new file mode 100644 index 000000000000..f2ef27374263 --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_kibana.ts @@ -0,0 +1,317 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import type { + AggregationsSingleMetricAggregateBase, + AggregationsCardinalityAggregate, + AggregationsTermsAggregateBase, + AggregationsStringTermsBucketKeys, +} from '@elastic/elasticsearch/lib/api/typesWithBodyKey'; +import { ElasticsearchClient, Logger } from '@kbn/core/server'; +import { AlertingUsage } from '../types'; +import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector'; +import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket'; + +interface Opts { + esClient: ElasticsearchClient; + kibanaIndex: string; + logger: Logger; +} + +type GetTotalCountsResults = Pick< + AlertingUsage, + | 'count_total' + | 'count_by_type' + | 'throttle_time' + | 'schedule_time' + | 'throttle_time_number_s' + | 'schedule_time_number_s' + | 'connectors_per_alert' +>; + +interface GetTotalCountInUseResults { + countTotal: number; + countByType: Record; + countNamespaces: number; +} + +export async function getTotalCountAggregations({ + esClient, + kibanaIndex, + logger, +}: Opts): Promise { + try { + const query = { + index: kibanaIndex, + size: 0, + body: { + query: { + bool: { + // Aggregate over all rule saved objects + filter: [{ term: { type: 'alert' } }], + }, + }, + runtime_mappings: { + rule_action_count: { + type: 'long', + script: { + source: ` + def alert = params._source['alert']; + if (alert != null) { + def actions = alert.actions; + if (actions != null) { + emit(actions.length); + } else { + emit(0); + } + }`, + }, + }, + // Convert schedule interval duration string from rule saved object to interval in seconds + rule_schedule_interval: { + type: 'long', + script: { + source: ` + int parsed = 0; + if (doc['alert.schedule.interval'].size() > 0) { + def interval = doc['alert.schedule.interval'].value; + + if (interval.length() > 1) { + // get last char + String timeChar = interval.substring(interval.length() - 1); + // remove last char + interval = interval.substring(0, interval.length() - 1); + + if 
(interval.chars().allMatch(Character::isDigit)) { + // using of regex is not allowed in painless language + parsed = Integer.parseInt(interval); + + if (timeChar.equals("s")) { + parsed = parsed; + } else if (timeChar.equals("m")) { + parsed = parsed * 60; + } else if (timeChar.equals("h")) { + parsed = parsed * 60 * 60; + } else if (timeChar.equals("d")) { + parsed = parsed * 24 * 60 * 60; + } + emit(parsed); + } + } + } + emit(parsed); + `, + }, + }, + // Convert throttle interval duration string from rule saved object to interval in seconds + rule_throttle_interval: { + type: 'long', + script: { + source: ` + int parsed = 0; + if (doc['alert.throttle'].size() > 0) { + def throttle = doc['alert.throttle'].value; + + if (throttle.length() > 1) { + // get last char + String timeChar = throttle.substring(throttle.length() - 1); + // remove last char + throttle = throttle.substring(0, throttle.length() - 1); + + if (throttle.chars().allMatch(Character::isDigit)) { + // using of regex is not allowed in painless language + parsed = Integer.parseInt(throttle); + + if (timeChar.equals("s")) { + parsed = parsed; + } else if (timeChar.equals("m")) { + parsed = parsed * 60; + } else if (timeChar.equals("h")) { + parsed = parsed * 60 * 60; + } else if (timeChar.equals("d")) { + parsed = parsed * 24 * 60 * 60; + } + emit(parsed); + } + } + } + emit(parsed); + `, + }, + }, + }, + aggs: { + by_rule_type_id: { + terms: { + field: 'alert.alertTypeId', + size: NUM_ALERTING_RULE_TYPES, + }, + }, + max_throttle_time: { max: { field: 'rule_throttle_interval' } }, + min_throttle_time: { min: { field: 'rule_throttle_interval' } }, + avg_throttle_time: { avg: { field: 'rule_throttle_interval' } }, + max_interval_time: { max: { field: 'rule_schedule_interval' } }, + min_interval_time: { min: { field: 'rule_schedule_interval' } }, + avg_interval_time: { avg: { field: 'rule_schedule_interval' } }, + max_actions_count: { max: { field: 'rule_action_count' } }, + min_actions_count: { min: { 
field: 'rule_action_count' } }, + avg_actions_count: { avg: { field: 'rule_action_count' } }, + }, + }, + }; + + logger.debug(`query for getTotalCountAggregations - ${JSON.stringify(query)}`); + const results = await esClient.search(query); + + logger.debug(`results for getTotalCountAggregations query - ${JSON.stringify(results)}`); + + const aggregations = results.aggregations as { + by_rule_type_id: AggregationsTermsAggregateBase; + max_throttle_time: AggregationsSingleMetricAggregateBase; + min_throttle_time: AggregationsSingleMetricAggregateBase; + avg_throttle_time: AggregationsSingleMetricAggregateBase; + max_interval_time: AggregationsSingleMetricAggregateBase; + min_interval_time: AggregationsSingleMetricAggregateBase; + avg_interval_time: AggregationsSingleMetricAggregateBase; + max_actions_count: AggregationsSingleMetricAggregateBase; + min_actions_count: AggregationsSingleMetricAggregateBase; + avg_actions_count: AggregationsSingleMetricAggregateBase; + }; + + const totalRulesCount = + typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value; + + return { + count_total: totalRulesCount ?? 0, + count_by_type: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets), + throttle_time: { + min: `${aggregations.min_throttle_time.value ?? 0}s`, + avg: `${aggregations.avg_throttle_time.value ?? 0}s`, + max: `${aggregations.max_throttle_time.value ?? 0}s`, + }, + schedule_time: { + min: `${aggregations.min_interval_time.value ?? 0}s`, + avg: `${aggregations.avg_interval_time.value ?? 0}s`, + max: `${aggregations.max_interval_time.value ?? 0}s`, + }, + throttle_time_number_s: { + min: aggregations.min_throttle_time.value ?? 0, + avg: aggregations.avg_throttle_time.value ?? 0, + max: aggregations.max_throttle_time.value ?? 0, + }, + schedule_time_number_s: { + min: aggregations.min_interval_time.value ?? 0, + avg: aggregations.avg_interval_time.value ?? 0, + max: aggregations.max_interval_time.value ?? 
0, + }, + connectors_per_alert: { + min: aggregations.min_actions_count.value ?? 0, + avg: aggregations.avg_actions_count.value ?? 0, + max: aggregations.max_actions_count.value ?? 0, + }, + }; + } catch (err) { + logger.warn( + `Error executing alerting telemetry task: getTotalCountAggregations - ${JSON.stringify(err)}`, + { + tags: ['alerting', 'telemetry-failed'], + error: { stack_trace: err.stack }, + } + ); + return { + count_total: 0, + count_by_type: {}, + throttle_time: { + min: '0s', + avg: '0s', + max: '0s', + }, + schedule_time: { + min: '0s', + avg: '0s', + max: '0s', + }, + throttle_time_number_s: { + min: 0, + avg: 0, + max: 0, + }, + schedule_time_number_s: { + min: 0, + avg: 0, + max: 0, + }, + connectors_per_alert: { + min: 0, + avg: 0, + max: 0, + }, + }; + } +} + +export async function getTotalCountInUse({ + esClient, + kibanaIndex, + logger, +}: Opts): Promise { + try { + const query = { + index: kibanaIndex, + size: 0, + body: { + query: { + bool: { + // Aggregate over only enabled rule saved objects + filter: [{ term: { type: 'alert' } }, { term: { 'alert.enabled': true } }], + }, + }, + aggs: { + namespaces_count: { cardinality: { field: 'namespaces' } }, + by_rule_type_id: { + terms: { + field: 'alert.alertTypeId', + size: NUM_ALERTING_RULE_TYPES, + }, + }, + }, + }, + }; + + logger.debug(`query for getTotalCountInUse - ${JSON.stringify(query)}`); + const results = await esClient.search(query); + + logger.debug(`results for getTotalCountInUse query - ${JSON.stringify(results)}`); + + const aggregations = results.aggregations as { + by_rule_type_id: AggregationsTermsAggregateBase; + namespaces_count: AggregationsCardinalityAggregate; + }; + + const totalEnabledRulesCount = + typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value; + + return { + countTotal: totalEnabledRulesCount ?? 
0, + countByType: parseSimpleRuleTypeBucket(aggregations.by_rule_type_id.buckets), + countNamespaces: aggregations.namespaces_count.value ?? 0, + }; + } catch (err) { + logger.warn( + `Error executing alerting telemetry task: getTotalCountInUse - ${JSON.stringify(err)}`, + { + tags: ['alerting', 'telemetry-failed'], + error: { stack_trace: err.stack }, + } + ); + return { + countTotal: 0, + countByType: {}, + countNamespaces: 0, + }; + } +} diff --git a/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_task_manager.test.ts b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_task_manager.test.ts new file mode 100644 index 000000000000..6d58da12ca27 --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_task_manager.test.ts @@ -0,0 +1,256 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks'; +import { + getFailedAndUnrecognizedTasksPerDay, + parseBucket, +} from './get_telemetry_from_task_manager'; + +const elasticsearch = elasticsearchServiceMock.createStart(); +const esClient = elasticsearch.client.asInternalUser; +const logger: ReturnType = loggingSystemMock.createLogger(); + +describe('task manager telemetry', () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + + describe('parseBucket', () => { + test('should correctly parse aggregation bucket results', () => { + expect( + parseBucket([ + { + key: 'failed', + doc_count: 36, + by_task_type: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'alerting:.index-threshold', + doc_count: 4, + }, + { + key: 'alerting:document.test.', + doc_count: 32, + }, + ], + }, + }, + { + key: 'unrecognized', + doc_count: 4, + by_task_type: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'alerting:logs.alert.document.count', + doc_count: 4, + }, + ], + }, + }, + ]) + ).toEqual({ + countFailedAndUnrecognizedTasksByStatus: { + failed: 36, + unrecognized: 4, + }, + countFailedAndUnrecognizedTasksByStatusByType: { + failed: { + '__index-threshold': 4, + document__test__: 32, + }, + unrecognized: { + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 4, + }, + }, + }); + }); + + test('should handle missing values', () => { + expect( + parseBucket([ + { + key: 'failed', + by_task_type: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'alerting:.index-threshold', + doc_count: 4, + }, + // @ts-expect-error + { + key: 'alerting:document.test.', + }, + ], + }, + }, + { + key: 'unrecognized', + doc_count: 4, + // @ts-expect-error + by_task_type: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + }, + }, + // @ts-expect-error + { + key: 'another_key', + }, + ]) + 
).toEqual({ + countFailedAndUnrecognizedTasksByStatus: { + failed: 0, + unrecognized: 4, + another_key: 0, + }, + countFailedAndUnrecognizedTasksByStatusByType: { + failed: { + '__index-threshold': 4, + document__test__: 0, + }, + }, + }); + }); + + test('should handle empty input', () => { + expect(parseBucket([])).toEqual({ + countFailedAndUnrecognizedTasksByStatus: {}, + countFailedAndUnrecognizedTasksByStatusByType: {}, + }); + }); + + test('should handle undefined input', () => { + // @ts-expect-error + expect(parseBucket(undefined)).toEqual({ + countFailedAndUnrecognizedTasksByStatus: {}, + countFailedAndUnrecognizedTasksByStatusByType: {}, + }); + }); + }); + + describe('getFailedAndUnrecognizedTasksPerDay', () => { + test('should return counts of failed and unrecognized tasks broken down by status and rule type', async () => { + esClient.search.mockResponse({ + took: 4, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 0, + failed: 0, + }, + hits: { + total: { + value: 40, + relation: 'eq', + }, + max_score: null, + hits: [], + }, + aggregations: { + by_status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'failed', + doc_count: 36, + by_task_type: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'alerting:.index-threshold', + doc_count: 4, + }, + { + key: 'alerting:document.test.', + doc_count: 32, + }, + ], + }, + }, + { + key: 'unrecognized', + doc_count: 4, + by_task_type: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: 'alerting:logs.alert.document.count', + doc_count: 4, + }, + ], + }, + }, + ], + }, + }, + }); + + const telemetry = await getFailedAndUnrecognizedTasksPerDay({ + esClient, + taskManagerIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + + expect(telemetry).toStrictEqual({ + countFailedAndUnrecognizedTasks: 40, + countFailedAndUnrecognizedTasksByStatus: { + failed: 
36, + unrecognized: 4, + }, + countFailedAndUnrecognizedTasksByStatusByType: { + failed: { + '__index-threshold': 4, + document__test__: 32, + }, + unrecognized: { + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 4, + }, + }, + }); + }); + + test('should return empty results and log warning if query throws error', async () => { + esClient.search.mockRejectedValue(new Error('oh no')); + + const telemetry = await getFailedAndUnrecognizedTasksPerDay({ + esClient, + taskManagerIndex: 'test', + logger, + }); + + expect(esClient.search).toHaveBeenCalledTimes(1); + + const loggerCall = logger.warn.mock.calls[0][0]; + const loggerMeta = logger.warn.mock.calls[0][1]; + expect(loggerCall as string).toMatchInlineSnapshot( + `"Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - {}"` + ); + expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']); + expect(loggerMeta?.error?.stack_trace).toBeDefined(); + expect(telemetry).toStrictEqual({ + countFailedAndUnrecognizedTasks: 0, + countFailedAndUnrecognizedTasksByStatus: {}, + countFailedAndUnrecognizedTasksByStatusByType: {}, + }); + }); + }); +}); diff --git a/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_task_manager.ts b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_task_manager.ts new file mode 100644 index 000000000000..b13f300bcd4a --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/get_telemetry_from_task_manager.ts @@ -0,0 +1,199 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { isEmpty, merge } from 'lodash'; +import type { + AggregationsTermsAggregateBase, + AggregationsStringTermsBucketKeys, + AggregationsBuckets, +} from '@elastic/elasticsearch/lib/api/typesWithBodyKey'; +import { ElasticsearchClient, Logger } from '@kbn/core/server'; +import { replaceDotSymbols } from './replace_dots_with_underscores'; +import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector'; + +interface Opts { + esClient: ElasticsearchClient; + taskManagerIndex: string; + logger: Logger; +} + +interface GetFailedAndUnrecognizedTasksAggregationBucket extends AggregationsStringTermsBucketKeys { + by_task_type: AggregationsTermsAggregateBase; +} + +interface GetFailedAndUnrecognizedTasksResults { + countFailedAndUnrecognizedTasks: number; + countFailedAndUnrecognizedTasksByStatus: Record; + countFailedAndUnrecognizedTasksByStatusByType: Record>; +} + +export async function getFailedAndUnrecognizedTasksPerDay({ + esClient, + taskManagerIndex, + logger, +}: Opts): Promise { + try { + const query = { + index: taskManagerIndex, + size: 0, + body: { + query: { + bool: { + must: [ + { + bool: { + should: [ + { + term: { + 'task.status': 'unrecognized', + }, + }, + { + term: { + 'task.status': 'failed', + }, + }, + ], + }, + }, + { + wildcard: { + 'task.taskType': { + value: 'alerting:*', + }, + }, + }, + { + range: { + 'task.runAt': { + gte: 'now-1d', + }, + }, + }, + ], + }, + }, + aggs: { + by_status: { + terms: { + field: 'task.status', + size: 10, + }, + aggs: { + by_task_type: { + terms: { + field: 'task.taskType', + // Use number of alerting rule types because we're filtering by 'alerting:' + size: NUM_ALERTING_RULE_TYPES, + }, + }, + }, + }, + }, + }, + }; + + logger.debug(`query for getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify(query)}`); + const results = await esClient.search(query); + + logger.debug( + `results for getFailedAndUnrecognizedTasksPerDay query - ${JSON.stringify(results)}` + ); + + const aggregations = 
results.aggregations as { + by_status: AggregationsTermsAggregateBase; + }; + + const totalFailedAndUnrecognizedTasks = + typeof results.hits.total === 'number' ? results.hits.total : results.hits.total?.value; + + const aggregationsByStatus: AggregationsBuckets = + aggregations.by_status.buckets as GetFailedAndUnrecognizedTasksAggregationBucket[]; + + return { + ...parseBucket(aggregationsByStatus), + countFailedAndUnrecognizedTasks: totalFailedAndUnrecognizedTasks ?? 0, + }; + } catch (err) { + logger.warn( + `Error executing alerting telemetry task: getFailedAndUnrecognizedTasksPerDay - ${JSON.stringify( + err + )}`, + { + tags: ['alerting', 'telemetry-failed'], + error: { stack_trace: err.stack }, + } + ); + return { + countFailedAndUnrecognizedTasks: 0, + countFailedAndUnrecognizedTasksByStatus: {}, + countFailedAndUnrecognizedTasksByStatusByType: {}, + }; + } +} + +/** + * Bucket format: + * { + * "key": "idle", // task status + * "doc_count": 28, // number of tasks with this status + * "by_task_type": { + * "doc_count_error_upper_bound": 0, + * "sum_other_doc_count": 0, + * "buckets": [ + * { + * "key": "alerting:.es-query", // breakdown of task type for status + * "doc_count": 1 + * }, + * { + * "key": "alerting:.index-threshold", + * "doc_count": 1 + * } + * ] + * } + * } + */ + +export function parseBucket( + buckets: GetFailedAndUnrecognizedTasksAggregationBucket[] +): Pick< + GetFailedAndUnrecognizedTasksResults, + 'countFailedAndUnrecognizedTasksByStatus' | 'countFailedAndUnrecognizedTasksByStatusByType' +> { + return (buckets ?? []).reduce( + (summary, bucket) => { + const status: string = bucket.key; + const taskTypeBuckets = bucket?.by_task_type?.buckets as AggregationsStringTermsBucketKeys[]; + + const byTaskType = (taskTypeBuckets ?? 
[]).reduce( + (acc: Record, taskTypeBucket: AggregationsStringTermsBucketKeys) => { + const taskType: string = replaceDotSymbols(taskTypeBucket.key.replace('alerting:', '')); + return { + ...acc, + [taskType]: taskTypeBucket.doc_count ?? 0, + }; + }, + {} + ); + return { + ...summary, + countFailedAndUnrecognizedTasksByStatus: { + ...summary.countFailedAndUnrecognizedTasksByStatus, + [status]: bucket?.doc_count ?? 0, + }, + countFailedAndUnrecognizedTasksByStatusByType: merge( + summary.countFailedAndUnrecognizedTasksByStatusByType, + isEmpty(byTaskType) ? {} : { [status]: byTaskType } + ), + }; + }, + { + countFailedAndUnrecognizedTasksByStatus: {}, + countFailedAndUnrecognizedTasksByStatusByType: {}, + } + ); +} diff --git a/x-pack/plugins/alerting/server/usage/lib/parse_simple_rule_type_bucket.test.ts b/x-pack/plugins/alerting/server/usage/lib/parse_simple_rule_type_bucket.test.ts new file mode 100644 index 000000000000..7b6f758ec121 --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/parse_simple_rule_type_bucket.test.ts @@ -0,0 +1,67 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket'; + +describe('parseSimpleRuleTypeBucket', () => { + test('should correctly parse rule type bucket results', () => { + expect( + parseSimpleRuleTypeBucket([ + { + key: '.index-threshold', + doc_count: 78, + }, + { + key: 'document.test.', + doc_count: 42, + }, + { + key: 'logs.alert.document.count', + doc_count: 28, + }, + ]) + ).toEqual({ + '__index-threshold': 78, + document__test__: 42, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 28, + }); + }); + + test('should handle missing values', () => { + expect( + parseSimpleRuleTypeBucket([ + // @ts-expect-error + { + key: '.index-threshold', + }, + { + key: 'document.test.', + doc_count: 42, + }, + { + key: 'logs.alert.document.count', + doc_count: 28, + }, + ]) + ).toEqual({ + '__index-threshold': 0, + document__test__: 42, + // eslint-disable-next-line @typescript-eslint/naming-convention + logs__alert__document__count: 28, + }); + }); + + test('should handle empty input', () => { + expect(parseSimpleRuleTypeBucket([])).toEqual({}); + }); + + test('should handle undefined input', () => { + // @ts-expect-error + expect(parseSimpleRuleTypeBucket(undefined)).toEqual({}); + }); +}); diff --git a/x-pack/plugins/alerting/server/usage/lib/parse_simple_rule_type_bucket.ts b/x-pack/plugins/alerting/server/usage/lib/parse_simple_rule_type_bucket.ts new file mode 100644 index 000000000000..3cbae0651708 --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/parse_simple_rule_type_bucket.ts @@ -0,0 +1,25 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { + AggregationsBuckets, + AggregationsStringTermsBucketKeys, +} from '@elastic/elasticsearch/lib/api/typesWithBodyKey'; +import { replaceDotSymbols } from './replace_dots_with_underscores'; + +export function parseSimpleRuleTypeBucket( + ruleTypeBuckets: AggregationsBuckets +) { + const buckets = ruleTypeBuckets as AggregationsStringTermsBucketKeys[]; + return (buckets ?? []).reduce((acc, bucket: AggregationsStringTermsBucketKeys) => { + const ruleType: string = replaceDotSymbols(bucket.key); + return { + ...acc, + [ruleType]: bucket.doc_count ?? 0, + }; + }, {}); +} diff --git a/x-pack/plugins/alerting/server/usage/lib/replace_dots_with_underscores.test.ts b/x-pack/plugins/alerting/server/usage/lib/replace_dots_with_underscores.test.ts new file mode 100644 index 000000000000..23fe3ca3c85e --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/replace_dots_with_underscores.test.ts @@ -0,0 +1,14 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { replaceDotSymbols } from './replace_dots_with_underscores'; + +describe('replaceDotSymbols', () => { + test('should replace "." symbols with "__" in string', async () => { + expect(replaceDotSymbols('.index-threshold')).toEqual('__index-threshold'); + }); +}); diff --git a/x-pack/plugins/alerting/server/usage/lib/replace_dots_with_underscores.ts b/x-pack/plugins/alerting/server/usage/lib/replace_dots_with_underscores.ts new file mode 100644 index 000000000000..84cb7c2daa57 --- /dev/null +++ b/x-pack/plugins/alerting/server/usage/lib/replace_dots_with_underscores.ts @@ -0,0 +1,10 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +export function replaceDotSymbols(strToReplace: string) { + return strToReplace.replaceAll('.', '__'); +} diff --git a/x-pack/plugins/alerting/server/usage/task.ts b/x-pack/plugins/alerting/server/usage/task.ts index f6b9a1be3814..9d01ac21e845 100644 --- a/x-pack/plugins/alerting/server/usage/task.ts +++ b/x-pack/plugins/alerting/server/usage/task.ts @@ -13,13 +13,12 @@ import { TaskManagerStartContract, } from '@kbn/task-manager-plugin/server'; +import { getFailedAndUnrecognizedTasksPerDay } from './lib/get_telemetry_from_task_manager'; +import { getTotalCountAggregations, getTotalCountInUse } from './lib/get_telemetry_from_kibana'; import { - getTotalCountAggregations, - getTotalCountInUse, getExecutionsPerDayCount, getExecutionTimeoutsPerDayCount, - getFailedAndUnrecognizedTasksPerDay, -} from './alerting_telemetry'; +} from './lib/get_telemetry_from_event_log'; export const TELEMETRY_TASK_TYPE = 'alerting_telemetry'; @@ -98,11 +97,11 @@ export function telemetryTaskRunner( async run() { const esClient = await getEsClient(); return Promise.all([ - getTotalCountAggregations(esClient, kibanaIndex, logger), - getTotalCountInUse(esClient, kibanaIndex, logger), - getExecutionsPerDayCount(esClient, eventLogIndex, logger), - getExecutionTimeoutsPerDayCount(esClient, eventLogIndex, logger), - getFailedAndUnrecognizedTasksPerDay(esClient, taskManagerIndex, logger), + getTotalCountAggregations({ esClient, kibanaIndex, logger }), + getTotalCountInUse({ esClient, kibanaIndex, logger }), + getExecutionsPerDayCount({ esClient, eventLogIndex, logger }), + getExecutionTimeoutsPerDayCount({ esClient, eventLogIndex, logger }), + getFailedAndUnrecognizedTasksPerDay({ esClient, taskManagerIndex, logger }), ]) .then( ([ @@ -120,22 +119,25 @@ export function telemetryTaskRunner( count_active_total: totalInUse.countTotal, count_disabled_total: 
totalCountAggregations.count_total - totalInUse.countTotal, count_rules_namespaces: totalInUse.countNamespaces, - count_rules_executions_per_day: dailyExecutionCounts.countTotal, - count_rules_executions_by_type_per_day: dailyExecutionCounts.countByType, - count_rules_executions_failured_per_day: dailyExecutionCounts.countTotalFailures, + count_rules_executions_per_day: dailyExecutionCounts.countTotalRuleExecutions, + count_rules_executions_by_type_per_day: + dailyExecutionCounts.countRuleExecutionsByType, + count_rules_executions_failured_per_day: + dailyExecutionCounts.countTotalFailedExecutions, count_rules_executions_failured_by_reason_per_day: - dailyExecutionCounts.countFailuresByReason, + dailyExecutionCounts.countFailedExecutionsByReason, count_rules_executions_failured_by_reason_by_type_per_day: - dailyExecutionCounts.countFailuresByReasonByType, - count_rules_executions_timeouts_per_day: dailyExecutionTimeoutCounts.countTotal, + dailyExecutionCounts.countFailedExecutionsByReasonByType, + count_rules_executions_timeouts_per_day: + dailyExecutionTimeoutCounts.countExecutionTimeouts, count_rules_executions_timeouts_by_type_per_day: - dailyExecutionTimeoutCounts.countByType, + dailyExecutionTimeoutCounts.countExecutionTimeoutsByType, count_failed_and_unrecognized_rule_tasks_per_day: - dailyFailedAndUnrecognizedTasks.countTotal, + dailyFailedAndUnrecognizedTasks.countFailedAndUnrecognizedTasks, count_failed_and_unrecognized_rule_tasks_by_status_per_day: - dailyFailedAndUnrecognizedTasks.countByStatus, + dailyFailedAndUnrecognizedTasks.countFailedAndUnrecognizedTasksByStatus, count_failed_and_unrecognized_rule_tasks_by_status_by_type_per_day: - dailyFailedAndUnrecognizedTasks.countByStatusByRuleType, + dailyFailedAndUnrecognizedTasks.countFailedAndUnrecognizedTasksByStatusByType, avg_execution_time_per_day: dailyExecutionCounts.avgExecutionTime, avg_execution_time_by_type_per_day: dailyExecutionCounts.avgExecutionTimeByType, avg_es_search_duration_per_day: 
dailyExecutionCounts.avgEsSearchDuration,