[Response Ops][Alerting] Add telemetry for backfill rule runs and gap information (#221847)

Resolves https://github.com/elastic/kibana/issues/212091

## Summary

Updates alerting snapshot telemetry to capture data about number of
backfill executions and gap durations from the event log index.

## To Verify

1. Shorten the usage collector task interval so the telemetry task runs more frequently

```
--- a/x-pack/platform/plugins/shared/alerting/server/usage/task.ts
+++ b/x-pack/platform/plugins/shared/alerting/server/usage/task.ts
@@ -31,7 +31,7 @@ import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../common';
 export const TELEMETRY_TASK_TYPE = 'alerting_telemetry';

 export const TASK_ID = `Alerting-${TELEMETRY_TASK_TYPE}`;
-export const SCHEDULE: IntervalSchedule = { interval: '1d' };
+export const SCHEDULE: IntervalSchedule = { interval: '5m' };

```

2. Enable the gap detection feature flag

```
--- a/x-pack/solutions/security/plugins/security_solution/common/experimental_features.ts
+++ b/x-pack/solutions/security/plugins/security_solution/common/experimental_features.ts
@@ -191,7 +191,7 @@ export const allowedExperimentalValues = Object.freeze({
   /**
    * Enables the storing of gaps in the event log
    */
-  storeGapsInEventLogEnabled: false,
+  storeGapsInEventLogEnabled: true,

```

3. Start Kibana and create a detection rule and let it run once.
4. Stop Kibana for a period of time (at least 3 times the rule
interval).
5. Restart Kibana and navigate to
`https://localhost:5601/app/security/rules/id/<ruleId>`. Under the
`Execution Results` tab, you should see a section for `Gaps` and `Manual
runs`. When the rule runs again, you should see an entry under `Gaps`
with an action to `Fill gaps`. Click the action to fill the gaps.

<img width="2250" alt="Screenshot 2025-05-29 at 5 41 24 PM"
src="https://github.com/user-attachments/assets/a08455d0-8c54-4170-831b-3dedf6932fe7"
/>

6. Verify that the next time the usage collection task runs, data for
backfill executions and gaps is reported. You can inspect this in the Dev
Console using:

```
POST kbn:/internal/telemetry/clusters/_stats?apiVersion=2
{ "unencrypted": true, "refreshCache": true }
```

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
This commit is contained in:
Ying Mao 2025-06-24 15:12:21 -04:00 committed by GitHub
parent 92df41a533
commit 38a76c9034
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 477 additions and 3 deletions

View file

@ -3153,6 +3153,31 @@
"type": "long"
}
}
},
"count_backfill_executions": {
"type": "long"
},
"count_backfills_by_execution_status_per_day": {
"properties": {
"success": {
"type": "long"
},
"failure": {
"type": "long"
},
"unknown": {
"type": "long"
}
}
},
"count_gaps": {
"type": "long"
},
"total_unfilled_gap_duration_ms": {
"type": "long"
},
"total_filled_gap_duration_ms": {
"type": "long"
}
}
},

View file

@ -244,6 +244,11 @@ export function createAlertingUsageCollector(
count_rules_snoozed_by_type: {},
count_rules_muted_by_type: {},
count_ignored_fields_by_rule_type: {},
count_backfill_executions: 0,
count_backfills_by_execution_status_per_day: {},
count_gaps: 0,
total_unfilled_gap_duration_ms: 0,
total_filled_gap_duration_ms: 0,
};
}
},
@ -322,6 +327,11 @@ export function createAlertingUsageCollector(
count_rules_snoozed_by_type: byTypeSchema,
count_rules_muted_by_type: byTypeSchema,
count_ignored_fields_by_rule_type: byTypeSchema,
count_backfill_executions: { type: 'long' },
count_backfills_by_execution_status_per_day: byStatusPerDaySchema,
count_gaps: { type: 'long' },
total_unfilled_gap_duration_ms: { type: 'long' },
total_filled_gap_duration_ms: { type: 'long' },
},
});
}

View file

@ -0,0 +1,233 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import type { MockedLogger } from '@kbn/logging-mocks';
import { loggerMock } from '@kbn/logging-mocks';
import { errors } from '@elastic/elasticsearch';
import { getBackfillTelemetryPerDay } from './get_backfill_telemetry';
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
let logger: MockedLogger;
// Unit tests for getBackfillTelemetryPerDay, which issues two event-log
// searches (backfill executions first, then gap info) and merges the results.
// mockResponseOnce/mockRejectedValueOnce ordering below mirrors that call order.
describe('backfill telemetry', () => {
beforeEach(() => {
jest.resetAllMocks();
logger = loggerMock.create();
});
describe('getBackfillTelemetryPerDay', () => {
test('should return counts of backfill executions and gap stats', async () => {
// First search response: backfill executions bucketed by event.outcome.
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 8, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_execution_status: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{ key: 'success', doc_count: 6 },
{ key: 'failure', doc_count: 2 },
],
},
},
});
// Second search response: gap documents with summed filled/unfilled durations.
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 1, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
total_unfilled_duration_ms: { value: 2203673 },
total_filled_duration_ms: { value: 454245 },
},
});
const telemetry = await getBackfillTelemetryPerDay({
esClient,
eventLogIndex: 'test',
logger,
});
expect(esClient.search).toHaveBeenCalledTimes(2);
expect(telemetry).toStrictEqual({
countExecutions: 8,
countBackfillsByExecutionStatus: {
success: 6,
failure: 2,
},
countGaps: 1,
totalUnfilledGapDurationMs: 2203673,
totalFilledGapDurationMs: 454245,
hasErrors: false,
});
});
test('should handle empty results', async () => {
// Both queries succeed but match no documents; all counts fall back to 0.
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 0, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_execution_status: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [],
},
},
});
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 0, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
total_unfilled_duration_ms: { value: 0 },
total_filled_duration_ms: { value: 0 },
},
});
const telemetry = await getBackfillTelemetryPerDay({
esClient,
eventLogIndex: 'test',
logger,
});
expect(esClient.search).toHaveBeenCalledTimes(2);
expect(telemetry).toStrictEqual({
countExecutions: 0,
countBackfillsByExecutionStatus: {},
countGaps: 0,
totalUnfilledGapDurationMs: 0,
totalFilledGapDurationMs: 0,
hasErrors: false,
});
});
test('should return empty results and log warning if query throws error', async () => {
// First query (backfill executions) fails; second (gap info) succeeds,
// so gap stats are kept while backfill counts are zeroed.
esClient.search.mockRejectedValueOnce(new Error('oh no'));
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 1, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
total_unfilled_duration_ms: { value: 2203673 },
total_filled_duration_ms: { value: 454245 },
},
});
const telemetry = await getBackfillTelemetryPerDay({
esClient,
eventLogIndex: 'test',
logger,
});
expect(esClient.search).toHaveBeenCalledTimes(2);
// Generic errors are logged at warn level with telemetry tags.
const loggerCall = logger.warn.mock.calls[0][0];
const loggerMeta = logger.warn.mock.calls[0][1];
expect(loggerCall as string).toMatchInlineSnapshot(
`"Error executing alerting telemetry task: getBackfillExecutionCount - Error: oh no"`
);
expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerMeta?.error?.stack_trace).toBeDefined();
expect(telemetry).toStrictEqual({
hasErrors: true,
errorMessage: 'oh no',
countExecutions: 0,
countBackfillsByExecutionStatus: {},
countGaps: 1,
totalUnfilledGapDurationMs: 2203673,
totalFilledGapDurationMs: 454245,
});
});
it('should return empty results and log debug log if query throws search_phase_execution_exception error', async () => {
// First query succeeds; second fails with a search_phase_execution_exception,
// which is logged at debug (not warn) level.
esClient.search.mockResponseOnce({
took: 4,
timed_out: false,
_shards: { total: 1, successful: 1, skipped: 0, failed: 0 },
hits: { total: { value: 8, relation: 'eq' }, max_score: null, hits: [] },
aggregations: {
by_execution_status: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{ key: 'success', doc_count: 6 },
{ key: 'failure', doc_count: 2 },
],
},
},
});
esClient.search.mockRejectedValueOnce(
new errors.ResponseError({
warnings: [],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
meta: {} as any,
body: {
error: {
root_cause: [],
type: 'search_phase_execution_exception',
reason: 'no_shard_available_action_exception',
phase: 'fetch',
grouped: true,
failed_shards: [],
caused_by: {
type: 'no_shard_available_action_exception',
reason: 'This is the nested reason',
},
},
},
statusCode: 503,
headers: {},
})
);
const telemetry = await getBackfillTelemetryPerDay({
esClient,
eventLogIndex: 'test',
logger,
});
expect(esClient.search).toHaveBeenCalledTimes(2);
const loggerCalls = loggingSystemMock.collect(logger);
expect(loggerCalls.debug).toHaveLength(1);
expect(loggerCalls.debug[0][0]).toMatchInlineSnapshot(`
"Error executing alerting telemetry task: getGapInfo - ResponseError: search_phase_execution_exception
Caused by:
no_shard_available_action_exception: This is the nested reason"
`);
// logger meta
expect(loggerCalls.debug[0][1]?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerCalls.debug[0][1]?.error?.stack_trace).toBeDefined();
expect(loggerCalls.warn).toHaveLength(0);
// Gap stats fall back to zeros while the successful backfill counts are kept.
expect(telemetry).toStrictEqual({
hasErrors: true,
errorMessage: 'no_shard_available_action_exception',
countExecutions: 8,
countBackfillsByExecutionStatus: {
success: 6,
failure: 2,
},
countGaps: 0,
totalUnfilledGapDurationMs: 0,
totalFilledGapDurationMs: 0,
});
});
});
});

View file

@ -0,0 +1,165 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type {
AggregationsSingleMetricAggregateBase,
AggregationsTermsAggregateBase,
AggregationsStringTermsBucketKeys,
} from '@elastic/elasticsearch/lib/api/types';
import type { ElasticsearchClient, Logger } from '@kbn/core/server';
import { getProviderAndActionFilterForTimeRange } from './get_telemetry_from_event_log';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { parseAndLogError } from './parse_and_log_error';
/** Dependencies for querying backfill/gap telemetry from the event log index. */
interface Opts {
  esClient: ElasticsearchClient;
  eventLogIndex: string;
  logger: Logger;
}
/** Combined backfill-execution and gap telemetry for the last day. */
interface GetBackfillTelemetryPerDayCountResults {
  hasErrors: boolean;
  // Comma-joined messages from any failed queries; omitted when none failed.
  errorMessage?: string;
  countExecutions: number;
  countBackfillsByExecutionStatus: Record<string, number>;
  countGaps: number;
  totalUnfilledGapDurationMs: number;
  totalFilledGapDurationMs: number;
}
/** Result of the backfill-execution count query (zeroed on failure). */
interface GetBackfillExecutionsPerDayCountResults {
  hasErrors: boolean;
  errorMessage?: string;
  countExecutions: number;
  // Keyed by event.outcome bucket (e.g. success/failure).
  countBackfillsByExecutionStatus: Record<string, number>;
}
/** Result of the gap statistics query (zeroed on failure). */
interface GetGapDataPerDayCountResults {
  hasErrors: boolean;
  errorMessage?: string;
  countGaps: number;
  totalUnfilledGapDurationMs: number;
  totalFilledGapDurationMs: number;
}
/**
 * Queries the event log for `execute-backfill` events from the last day and
 * buckets them by `event.outcome`.
 *
 * Never rejects: query failures are logged via `parseAndLogError` and
 * surfaced through `hasErrors`/`errorMessage` with zeroed counts.
 */
async function getBackfillExecutionCount({
  esClient,
  eventLogIndex,
  logger,
}: Opts): Promise<GetBackfillExecutionsPerDayCountResults> {
  try {
    const response = await esClient.search({
      index: eventLogIndex,
      size: 0,
      track_total_hits: true,
      query: getProviderAndActionFilterForTimeRange('execute-backfill'),
      aggs: {
        by_execution_status: {
          terms: {
            field: 'event.outcome',
          },
        },
      },
    });

    // hits.total is a plain number when totals are tracked as an exact count,
    // otherwise an object carrying the count in `value`.
    let executionTotal: number | undefined;
    if (typeof response.hits.total === 'number') {
      executionTotal = response.hits.total;
    } else {
      executionTotal = response.hits.total?.value;
    }

    const aggs = response.aggregations as {
      by_execution_status: AggregationsTermsAggregateBase<AggregationsStringTermsBucketKeys>;
    };

    return {
      hasErrors: false,
      countExecutions: executionTotal ?? 0,
      countBackfillsByExecutionStatus: parseSimpleRuleTypeBucket(aggs.by_execution_status.buckets),
    };
  } catch (err) {
    const errorMessage = parseAndLogError(err, `getBackfillExecutionCount`, logger);
    return {
      hasErrors: true,
      errorMessage,
      countExecutions: 0,
      countBackfillsByExecutionStatus: {},
    };
  }
}
/**
 * Queries the event log for `gap` events from the last day and sums the
 * filled and unfilled gap durations.
 *
 * Never rejects: query failures are logged via `parseAndLogError` and
 * surfaced through `hasErrors`/`errorMessage` with zeroed stats.
 */
async function getGapInfo({
  esClient,
  eventLogIndex,
  logger,
}: Opts): Promise<GetGapDataPerDayCountResults> {
  try {
    const response = await esClient.search({
      index: eventLogIndex,
      track_total_hits: true,
      size: 0,
      query: getProviderAndActionFilterForTimeRange('gap'),
      aggs: {
        total_unfilled_duration_ms: {
          sum: { field: 'kibana.alert.rule.gap.unfilled_duration_ms' },
        },
        total_filled_duration_ms: {
          sum: { field: 'kibana.alert.rule.gap.filled_duration_ms' },
        },
      },
    });

    // hits.total is a plain number when totals are tracked as an exact count,
    // otherwise an object carrying the count in `value`.
    const gapTotal =
      typeof response.hits.total === 'number' ? response.hits.total : response.hits.total?.value;

    const aggs = response.aggregations as {
      total_unfilled_duration_ms: AggregationsSingleMetricAggregateBase;
      total_filled_duration_ms: AggregationsSingleMetricAggregateBase;
    };

    return {
      hasErrors: false,
      countGaps: gapTotal ?? 0,
      totalUnfilledGapDurationMs: aggs.total_unfilled_duration_ms.value ?? 0,
      totalFilledGapDurationMs: aggs.total_filled_duration_ms.value ?? 0,
    };
  } catch (err) {
    const errorMessage = parseAndLogError(err, `getGapInfo`, logger);
    return {
      hasErrors: true,
      errorMessage,
      countGaps: 0,
      totalUnfilledGapDurationMs: 0,
      totalFilledGapDurationMs: 0,
    };
  }
}
/**
 * Collects daily backfill telemetry from the event log: the number of
 * backfill executions (bucketed by execution status) and gap statistics
 * (count plus total filled/unfilled durations).
 *
 * Each underlying query handles its own errors, so this function never
 * rejects; failures are reported via `hasErrors` and a comma-joined
 * `errorMessage`.
 */
export async function getBackfillTelemetryPerDay(
  opts: Opts
): Promise<GetBackfillTelemetryPerDayCountResults> {
  // The two queries are independent, so run them concurrently instead of
  // sequentially. Neither promise can reject because each helper catches its
  // own errors, and the backfill search is still issued before the gap search
  // (the first helper runs synchronously up to its await before the second is
  // invoked), so mocked response ordering in tests is unaffected.
  const [backfillResults, gapResults] = await Promise.all([
    getBackfillExecutionCount(opts),
    getGapInfo(opts),
  ]);

  // Collapse per-query error messages (if any) into a single string.
  const errorMessage = [backfillResults.errorMessage, gapResults.errorMessage]
    .filter((message) => !!message)
    .join(',');

  return {
    hasErrors: backfillResults.hasErrors || gapResults.hasErrors,
    // Only include errorMessage when at least one query failed.
    ...(errorMessage ? { errorMessage } : {}),
    countExecutions: backfillResults.countExecutions,
    countBackfillsByExecutionStatus: backfillResults.countBackfillsByExecutionStatus,
    countGaps: gapResults.countGaps,
    totalUnfilledGapDurationMs: gapResults.totalUnfilledGapDurationMs,
    totalFilledGapDurationMs: gapResults.totalFilledGapDurationMs,
  };
}

View file

@ -564,7 +564,7 @@ export function parseExecutionCountAggregationResults(results: {
};
}
function getProviderAndActionFilterForTimeRange(
export function getProviderAndActionFilterForTimeRange(
action: string,
provider = 'alerting',
range = '1d'

View file

@ -23,6 +23,7 @@ import {
getExecutionsPerDayCount,
getExecutionTimeoutsPerDayCount,
} from './lib/get_telemetry_from_event_log';
import { getBackfillTelemetryPerDay } from './lib/get_backfill_telemetry';
import { stateSchemaByVersion, emptyState, type LatestTaskStateSchema } from './task_state';
import { RULE_SAVED_OBJECT_TYPE } from '../saved_objects';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../common';
@ -119,6 +120,7 @@ export function telemetryTaskRunner(
getFailedAndUnrecognizedTasksPerDay({ esClient, taskManagerIndex, logger }),
getMWTelemetry({ logger, savedObjectsClient }),
getTotalAlertsCountAggregations({ esClient, logger }),
getBackfillTelemetryPerDay({ esClient, eventLogIndex, logger }),
])
.then(
([
@ -129,6 +131,7 @@ export function telemetryTaskRunner(
dailyFailedAndUnrecognizedTasks,
MWTelemetry,
totalAlertsCountAggregations,
dailyBackfillCounts,
]) => {
const hasErrors =
totalCountAggregations.hasErrors ||
@ -137,7 +140,8 @@ export function telemetryTaskRunner(
dailyExecutionTimeoutCounts.hasErrors ||
dailyFailedAndUnrecognizedTasks.hasErrors ||
MWTelemetry.hasErrors ||
totalAlertsCountAggregations.hasErrors;
totalAlertsCountAggregations.hasErrors ||
dailyBackfillCounts.hasErrors;
const errorMessages = [
totalCountAggregations.errorMessage,
@ -147,6 +151,7 @@ export function telemetryTaskRunner(
dailyFailedAndUnrecognizedTasks.errorMessage,
MWTelemetry.errorMessage,
totalAlertsCountAggregations.errorMessage,
dailyBackfillCounts.errorMessage,
].filter((message) => message !== undefined);
const updatedState: LatestTaskStateSchema = {
@ -220,6 +225,12 @@ export function telemetryTaskRunner(
percentile_num_alerts_by_type_per_day: dailyExecutionCounts.alertsPercentilesByType,
count_alerts_total: totalAlertsCountAggregations.count_alerts_total,
count_alerts_by_rule_type: totalAlertsCountAggregations.count_alerts_by_rule_type,
count_backfill_executions: dailyBackfillCounts.countExecutions,
count_backfills_by_execution_status_per_day:
dailyBackfillCounts.countBackfillsByExecutionStatus,
count_gaps: dailyBackfillCounts.countGaps,
total_unfilled_gap_duration_ms: dailyBackfillCounts.totalUnfilledGapDurationMs,
total_filled_gap_duration_ms: dailyBackfillCounts.totalFilledGapDurationMs,
count_ignored_fields_by_rule_type:
totalAlertsCountAggregations.count_ignored_fields_by_rule_type,
};

View file

@ -135,6 +135,14 @@ const stateSchemaV5 = stateSchemaV4.extends({
count_ignored_fields_by_rule_type: schema.recordOf(schema.string(), schema.number()),
});
// v6 adds backfill execution counts (total and per execution status) and
// gap telemetry (count plus filled/unfilled durations) to the task state.
const stateSchemaV6 = stateSchemaV5.extends({
  count_backfill_executions: schema.number(),
  count_backfills_by_execution_status_per_day: schema.recordOf(schema.string(), schema.number()),
  count_gaps: schema.number(),
  total_unfilled_gap_duration_ms: schema.number(),
  total_filled_gap_duration_ms: schema.number(),
});
export const stateSchemaByVersion = {
1: {
// A task that was created < 8.10 will go through this "up" migration
@ -256,9 +264,21 @@ export const stateSchemaByVersion = {
}),
schema: stateSchemaV5,
},
6: {
up: (state: Record<string, unknown>) => ({
...stateSchemaByVersion[5].up(state),
count_backfill_executions: state.count_backfill_executions || 0,
count_backfills_by_execution_status_per_day:
state.count_backfills_by_execution_status_per_day || {},
count_gaps: state.count_gaps || 0,
total_unfilled_gap_duration_ms: state.total_unfilled_gap_duration_ms || 0,
total_filled_gap_duration_ms: state.total_filled_gap_duration_ms || 0,
}),
schema: stateSchemaV6,
},
};
const latestTaskStateSchema = stateSchemaByVersion[5].schema;
const latestTaskStateSchema = stateSchemaByVersion[6].schema;
export type LatestTaskStateSchema = TypeOf<typeof latestTaskStateSchema>;
export const emptyState: LatestTaskStateSchema = {
@ -339,6 +359,11 @@ export const emptyState: LatestTaskStateSchema = {
percentile_num_alerts_by_type_per_day: {},
count_alerts_total: 0,
count_alerts_by_rule_type: {},
count_backfill_executions: 0,
count_backfills_by_execution_status_per_day: {},
count_gaps: 0,
total_unfilled_gap_duration_ms: 0,
total_filled_gap_duration_ms: 0,
count_ignored_fields_by_rule_type: {},
count_rules_with_linked_dashboards: 0,
count_rules_with_investigation_guide: 0,

View file

@ -104,4 +104,9 @@ export interface AlertingUsage {
count_rules_snoozed_by_type: Record<string, number>;
count_rules_muted_by_type: Record<string, number>;
count_ignored_fields_by_rule_type: Record<string, number>;
count_backfill_executions: number;
count_backfills_by_execution_status_per_day: Record<string, number>;
count_gaps: number;
total_unfilled_gap_duration_ms: number;
total_filled_gap_duration_ms: number;
}