[ResponseOps][MW] Add telemetry for the maintenance window (#192483)

Resolve: https://github.com/elastic/kibana/issues/184088

In this PR add telemetry collection of these metrics:

- total number of MW in deployments
- number of active MW with "repeat" toggle on (time based)
- number of active MW with "filter alerts" toggle on (KQL based)

## Testing

Create several MW with different settings (toggles on and off)
To test that the changes are reflected in the telemetry object,
modify this file: `x-pack/plugins/alerting/server/usage/task.ts`

With:

```
async function scheduleTasks(logger: Logger, taskManager: TaskManagerStartContract) {
  try {
    await taskManager.ensureScheduled({
      id: TASK_ID,
      taskType: TELEMETRY_TASK_TYPE,
      state: emptyState,
      params: {},
      schedule: SCHEDULE,
    });
  } catch (e) {
    logger.error(`Error scheduling ${TASK_ID}, received ${e.message}`);
  }
  await taskManager.runSoon(TASK_ID);
}
```

This will cause the telemetry to be sent as soon as the server is
restarted.

**Run Telemetry usage payload API in your browser console to verify
telemetry object:**

https://docs.elastic.dev/telemetry/collection/snapshot-telemetry#telemetry-usage-payload-api
P.S.: When pasting the API call into the browser console, add a space at the beginning of the URL so the console does not treat it as navigation.


### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
This commit is contained in:
Julia 2024-09-19 10:28:48 +02:00 committed by GitHub
parent 210f5527a0
commit eabb102281
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 364 additions and 13 deletions

View file

@ -205,6 +205,9 @@ export function createAlertingUsageCollector(
count_rules_with_tags: 0,
count_rules_snoozed: 0,
count_rules_muted: 0,
count_mw_total: 0,
count_mw_with_repeat_toggle_on: 0,
count_mw_with_filter_alert_toggle_on: 0,
count_rules_with_muted_alerts: 0,
count_connector_types_by_consumers: {},
count_rules_by_execution_status_per_day: {},
@ -289,6 +292,9 @@ export function createAlertingUsageCollector(
count_rules_by_notify_when: byNotifyWhenSchema,
count_rules_snoozed: { type: 'long' },
count_rules_muted: { type: 'long' },
count_mw_total: { type: 'long' },
count_mw_with_repeat_toggle_on: { type: 'long' },
count_mw_with_filter_alert_toggle_on: { type: 'long' },
count_rules_with_muted_alerts: { type: 'long' },
count_connector_types_by_consumers: { DYNAMIC_KEY: { DYNAMIC_KEY: { type: 'long' } } },
count_rules_by_execution_status_per_day: byStatusPerDaySchema,

View file

@ -6,11 +6,97 @@
*/
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { getTotalCountAggregations, getTotalCountInUse } from './get_telemetry_from_kibana';
import {
getTotalCountAggregations,
getTotalCountInUse,
getMWTelemetry,
} from './get_telemetry_from_kibana';
import { savedObjectsClientMock } from '@kbn/core/server/mocks';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../../common';
import { ISavedObjectsRepository } from '@kbn/core/server';
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
const savedObjectsClient = savedObjectsClientMock.create() as unknown as ISavedObjectsRepository;
const thrownError = new Error('Fail');
// Mocked page of maintenance-window saved objects, shaped like a single
// `find()` result from a point-in-time finder. Covers the three cases the
// telemetry counters distinguish:
//   - id '1': no repeat (rRule has no `interval`), no scoped query
//   - id '2': repeat on (`interval` present) AND scoped query set
//   - id '3': repeat on (`interval` present), no scoped query
const mockedResponse = {
  saved_objects: [
    {
      id: '1',
      type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
      attributes: {
        title: 'test_rule_1',
        enabled: true,
        duration: 1800000,
        expirationDate: '2025-09-09T13:13:07.824Z',
        events: [],
        // No `interval` key: "Repeat" toggle off for this MW.
        rRule: {
          dtstart: '2024-09-09T13:13:02.054Z',
          tzid: 'Europe/Stockholm',
          freq: 0,
          count: 1,
        },
        createdBy: null,
        updatedBy: null,
        createdAt: '2024-09-09T13:13:07.825Z',
        updatedAt: '2024-09-09T13:13:07.825Z',
        // null scopedQuery: "Filter alerts" toggle off.
        scopedQuery: null,
      },
    },
    {
      id: '2',
      type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
      attributes: {
        title: 'test_rule_2',
        enabled: true,
        duration: 1800000,
        expirationDate: '2025-09-09T13:13:07.824Z',
        events: [],
        // `interval` present: "Repeat" toggle on.
        rRule: {
          dtstart: '2024-09-09T13:13:02.054Z',
          tzid: 'Europe/Stockholm',
          freq: 3,
          interval: 1,
          byweekday: ['SU'],
        },
        createdBy: null,
        updatedBy: null,
        createdAt: '2024-09-09T13:13:07.825Z',
        updatedAt: '2024-09-09T13:13:07.825Z',
        // Non-null scopedQuery: "Filter alerts" toggle on.
        scopedQuery: {
          filters: [],
          kql: 'kibana.alert.job_errors_results.job_id : * ',
          dsl: '{"bool":{"must":[],"filter":[{"bool":{"should":[{"exists":{"field":"kibana.alert.job_errors_results.job_id"}}],"minimum_should_match":1}}],"should":[],"must_not":[]}}',
        },
      },
    },
    {
      id: '3',
      type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
      attributes: {
        title: 'test_rule_3',
        enabled: true,
        duration: 1800000,
        expirationDate: '2025-09-09T13:13:07.824Z',
        events: [],
        // `interval` present: "Repeat" toggle on.
        rRule: {
          dtstart: '2024-09-09T13:13:02.054Z',
          tzid: 'Europe/Stockholm',
          freq: 3,
          interval: 1,
          byweekday: ['TU'],
        },
        createdBy: null,
        updatedBy: null,
        createdAt: '2024-09-09T13:13:07.825Z',
        updatedAt: '2024-09-09T13:13:07.825Z',
        scopedQuery: null,
      },
    },
  ],
};
describe('kibana index telemetry', () => {
beforeEach(() => {
@ -420,4 +506,94 @@ describe('kibana index telemetry', () => {
});
});
});
describe('getMWTelemetry', () => {
test('should return MW telemetry', async () => {
  // Stub the PIT finder so one page (mockedResponse) is yielded and the
  // finder can be closed without touching a real saved-objects index.
  savedObjectsClient.createPointInTimeFinder = jest.fn().mockReturnValue({
    close: jest.fn(),
    find: jest.fn().mockImplementation(async function* () {
      yield mockedResponse;
    }),
  });
  const telemetry = await getMWTelemetry({
    savedObjectsClient,
    logger,
  });
  // The finder must query all namespaces and fetch only the two fields the
  // counters inspect.
  expect(savedObjectsClient.createPointInTimeFinder).toHaveBeenCalledWith({
    type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
    namespaces: ['*'],
    perPage: 100,
    fields: ['rRule', 'scopedQuery'],
  });
  // mockedResponse: 3 MWs total, 2 with `interval` (repeat on), 1 with a
  // non-null scopedQuery (filter alerts on).
  expect(telemetry).toStrictEqual({
    count_mw_total: 3,
    count_mw_with_repeat_toggle_on: 2,
    count_mw_with_filter_alert_toggle_on: 1,
    hasErrors: false,
  });
});
});
// NOTE(review): this test (and the one after it) sits OUTSIDE the
// describe('getMWTelemetry') block, which closes just above — that looks like
// a misplaced `});`. Confirm the intended nesting; the tests still run either
// way, but they report under the wrong suite name.
test('should throw the error', async () => {
  // The finder's generator throws on first iteration, exercising the
  // catch branch of getMWTelemetry.
  savedObjectsClient.createPointInTimeFinder = jest.fn().mockReturnValue({
    close: jest.fn(),
    find: jest.fn().mockImplementation(async function* () {
      throw thrownError;
    }),
  });
  const telemetry = await getMWTelemetry({
    savedObjectsClient,
    logger,
  });
  expect(savedObjectsClient.createPointInTimeFinder).toHaveBeenCalledWith({
    type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
    namespaces: ['*'],
    perPage: 100,
    fields: ['rRule', 'scopedQuery'],
  });
  // On failure every counter is zeroed and the error message is surfaced.
  expect(telemetry).toStrictEqual({
    count_mw_total: 0,
    count_mw_with_repeat_toggle_on: 0,
    count_mw_with_filter_alert_toggle_on: 0,
    hasErrors: true,
    errorMessage: 'Fail',
  });
  expect(logger.warn).toHaveBeenCalled();
  const loggerCall = logger.warn.mock.calls[0][0];
  const loggerMeta = logger.warn.mock.calls[0][1];
  // '{}' because JSON.stringify of an Error has no enumerable own properties;
  // the stack trace travels in the meta object instead.
  expect(loggerCall).toBe('Error executing alerting telemetry task: getTotalMWCount - {}');
  expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
  expect(loggerMeta?.error?.stack_trace).toBeDefined();
});
test('should stop on MW max limit count', async () => {
  savedObjectsClient.createPointInTimeFinder = jest.fn().mockReturnValue({
    close: jest.fn(),
    find: jest.fn().mockImplementation(async function* () {
      yield mockedResponse;
    }),
  });
  const telemetry = await getMWTelemetry({
    savedObjectsClient,
    logger,
    maxDocuments: 1,
  });
  expect(savedObjectsClient.createPointInTimeFinder).toHaveBeenCalledWith({
    type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
    namespaces: ['*'],
    perPage: 100,
    fields: ['rRule', 'scopedQuery'],
  });
  // NOTE(review): with maxDocuments = 1 the loop counts 2 documents, because
  // the `countMWTotal > maxDocuments` guard runs before the increment — i.e.
  // the cap is effectively maxDocuments + 1. This expectation codifies that
  // off-by-one; confirm whether it is intentional before "fixing" either side.
  expect(telemetry).toStrictEqual({
    count_mw_total: 2,
    count_mw_with_repeat_toggle_on: 1,
    count_mw_with_filter_alert_toggle_on: 1,
    hasErrors: false,
  });
});
});

View file

@ -11,7 +11,7 @@ import type {
AggregationsTermsAggregateBase,
AggregationsStringTermsBucketKeys,
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { ElasticsearchClient, Logger, ISavedObjectsRepository } from '@kbn/core/server';
import {
ConnectorsByConsumersBucket,
@ -23,6 +23,8 @@ import { AlertingUsage } from '../types';
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { groupRulesBySearchType } from './group_rules_by_search_type';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../../common';
import { MaintenanceWindowAttributes } from '../../data/maintenance_window/types';
interface Opts {
esClient: ElasticsearchClient;
@ -30,6 +32,12 @@ interface Opts {
logger: Logger;
}
interface MWOpts {
savedObjectsClient: ISavedObjectsRepository;
logger: Logger;
maxDocuments?: number;
}
type GetTotalCountsResults = Pick<
AlertingUsage,
| 'count_total'
@ -48,6 +56,14 @@ type GetTotalCountsResults = Pick<
| 'connectors_per_alert'
> & { errorMessage?: string; hasErrors: boolean };
type GetMWTelemetryResults = Pick<
AlertingUsage,
'count_mw_total' | 'count_mw_with_repeat_toggle_on' | 'count_mw_with_filter_alert_toggle_on'
> & {
errorMessage?: string;
hasErrors: boolean;
};
interface GetTotalCountInUseResults {
countTotal: number;
countByType: Record<string, number>;
@ -56,6 +72,8 @@ interface GetTotalCountInUseResults {
hasErrors: boolean;
}
const TELEMETRY_MW_COUNT_LIMIT = 10000;
export async function getTotalCountAggregations({
esClient,
alertIndex,
@ -490,3 +508,60 @@ export async function getTotalCountInUse({
};
}
}
/**
 * Collects maintenance-window (MW) usage telemetry from saved objects across
 * all spaces.
 *
 * Counts:
 *  - count_mw_total: total MW saved objects (capped, see note on maxDocuments)
 *  - count_mw_with_repeat_toggle_on: MWs whose rRule has an `interval` key
 *    ("Repeat" toggle on in the UI)
 *  - count_mw_with_filter_alert_toggle_on: MWs with a non-null `scopedQuery`
 *    ("Filter alerts" toggle on in the UI)
 *
 * Never rejects: on any error it logs a warning and returns zeroed counters
 * with `hasErrors: true` and the error message, so the telemetry task can
 * aggregate failures instead of crashing.
 */
export async function getMWTelemetry({
  savedObjectsClient,
  logger,
  maxDocuments = TELEMETRY_MW_COUNT_LIMIT,
}: MWOpts): Promise<GetMWTelemetryResults> {
  try {
    // Page through MW saved objects in every namespace, fetching only the
    // two attributes the counters inspect.
    const mwFinder = savedObjectsClient.createPointInTimeFinder<MaintenanceWindowAttributes>({
      type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
      namespaces: ['*'],
      perPage: 100,
      fields: ['rRule', 'scopedQuery'],
    });

    let countMWTotal = 0;
    let countMWWithRepeatToggleON = 0;
    let countMWWithFilterAlertToggleON = 0;
    // NOTE(review): the `>` guard runs before the increment, so up to
    // maxDocuments + 1 documents are actually counted (the unit test pins
    // this: maxDocuments = 1 yields count_mw_total = 2). Confirm whether the
    // off-by-one is intentional before changing either side.
    mwLoop: for await (const response of mwFinder.find()) {
      for (const mwSavedObject of response.saved_objects) {
        if (countMWTotal > maxDocuments) break mwLoop;
        countMWTotal = countMWTotal + 1;
        // scopedQuery is null when the "Filter alerts" toggle is off.
        if (mwSavedObject.attributes.scopedQuery) {
          countMWWithFilterAlertToggleON = countMWWithFilterAlertToggleON + 1;
        }
        // The rRule `interval` key is absent when the "Repeat" toggle is off.
        if (Object.hasOwn(mwSavedObject.attributes.rRule, 'interval')) {
          countMWWithRepeatToggleON = countMWWithRepeatToggleON + 1;
        }
      }
    }
    await mwFinder.close();
    return {
      hasErrors: false,
      count_mw_total: countMWTotal,
      count_mw_with_repeat_toggle_on: countMWWithRepeatToggleON,
      count_mw_with_filter_alert_toggle_on: countMWWithFilterAlertToggleON,
    };
  } catch (err) {
    const errorMessage = err?.message ? err.message : err.toString();
    // NOTE(review): the message names "getTotalMWCount" but this function is
    // getMWTelemetry; the unit test asserts this exact string, so renaming it
    // requires updating the test too. JSON.stringify(err) renders '{}' for
    // plain Errors — the stack travels via the meta `error` field instead.
    logger.warn(
      `Error executing alerting telemetry task: getTotalMWCount - ${JSON.stringify(err)}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err?.stack },
      }
    );
    return {
      hasErrors: true,
      errorMessage,
      count_mw_total: 0,
      count_mw_with_repeat_toggle_on: 0,
      count_mw_with_filter_alert_toggle_on: 0,
    };
  }
}

View file

@ -12,15 +12,19 @@ import {
TaskManagerStartContract,
IntervalSchedule,
} from '@kbn/task-manager-plugin/server';
import { getFailedAndUnrecognizedTasksPerDay } from './lib/get_telemetry_from_task_manager';
import { getTotalCountAggregations, getTotalCountInUse } from './lib/get_telemetry_from_kibana';
import {
getTotalCountAggregations,
getTotalCountInUse,
getMWTelemetry,
} from './lib/get_telemetry_from_kibana';
import {
getExecutionsPerDayCount,
getExecutionTimeoutsPerDayCount,
} from './lib/get_telemetry_from_event_log';
import { stateSchemaByVersion, emptyState, type LatestTaskStateSchema } from './task_state';
import { RULE_SAVED_OBJECT_TYPE } from '../saved_objects';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../common';
export const TELEMETRY_TASK_TYPE = 'alerting_telemetry';
@ -36,12 +40,6 @@ export function initializeAlertingTelemetry(
registerAlertingTelemetryTask(logger, core, taskManager, eventLogIndex);
}
export function scheduleAlertingTelemetry(logger: Logger, taskManager?: TaskManagerStartContract) {
if (taskManager) {
scheduleTasks(logger, taskManager).catch(() => {}); // it shouldn't reject, but just in case
}
}
function registerAlertingTelemetryTask(
logger: Logger,
core: CoreSetup,
@ -58,6 +56,12 @@ function registerAlertingTelemetryTask(
});
}
export function scheduleAlertingTelemetry(logger: Logger, taskManager?: TaskManagerStartContract) {
if (taskManager) {
scheduleTasks(logger, taskManager).catch(() => {}); // it shouldn't reject, but just in case
}
}
async function scheduleTasks(logger: Logger, taskManager: TaskManagerStartContract) {
try {
await taskManager.ensureScheduled({
@ -93,16 +97,26 @@ export function telemetryTaskRunner(
.getStartServices()
.then(([coreStart]) => coreStart.savedObjects.getIndexForType(RULE_SAVED_OBJECT_TYPE));
const getSavedObjectClient = () =>
core
.getStartServices()
.then(([coreStart]) =>
coreStart.savedObjects.createInternalRepository([MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE])
);
return {
async run() {
const esClient = await getEsClient();
const alertIndex = await getAlertIndex();
const savedObjectsClient = await getSavedObjectClient();
return Promise.all([
getTotalCountAggregations({ esClient, alertIndex, logger }),
getTotalCountInUse({ esClient, alertIndex, logger }),
getExecutionsPerDayCount({ esClient, eventLogIndex, logger }),
getExecutionTimeoutsPerDayCount({ esClient, eventLogIndex, logger }),
getFailedAndUnrecognizedTasksPerDay({ esClient, taskManagerIndex, logger }),
getMWTelemetry({ logger, savedObjectsClient }),
])
.then(
([
@ -111,13 +125,15 @@ export function telemetryTaskRunner(
dailyExecutionCounts,
dailyExecutionTimeoutCounts,
dailyFailedAndUnrecognizedTasks,
MWTelemetry,
]) => {
const hasErrors =
totalCountAggregations.hasErrors ||
totalInUse.hasErrors ||
dailyExecutionCounts.hasErrors ||
dailyExecutionTimeoutCounts.hasErrors ||
dailyFailedAndUnrecognizedTasks.hasErrors;
dailyFailedAndUnrecognizedTasks.hasErrors ||
MWTelemetry.hasErrors;
const errorMessages = [
totalCountAggregations.errorMessage,
@ -125,6 +141,7 @@ export function telemetryTaskRunner(
dailyExecutionCounts.errorMessage,
dailyExecutionTimeoutCounts.errorMessage,
dailyFailedAndUnrecognizedTasks.errorMessage,
MWTelemetry.errorMessage,
].filter((message) => message !== undefined);
const updatedState: LatestTaskStateSchema = {
@ -147,6 +164,10 @@ export function telemetryTaskRunner(
count_rules_by_notify_when: totalCountAggregations.count_rules_by_notify_when,
count_rules_snoozed: totalCountAggregations.count_rules_snoozed,
count_rules_muted: totalCountAggregations.count_rules_muted,
count_mw_total: MWTelemetry.count_mw_total,
count_mw_with_repeat_toggle_on: MWTelemetry.count_mw_with_repeat_toggle_on,
count_mw_with_filter_alert_toggle_on:
MWTelemetry.count_mw_with_filter_alert_toggle_on,
count_rules_with_muted_alerts: totalCountAggregations.count_rules_with_muted_alerts,
count_connector_types_by_consumers:
totalCountAggregations.count_connector_types_by_consumers,

View file

@ -146,6 +146,9 @@ export const stateSchemaByVersion = {
}),
count_rules_snoozed: schema.number(),
count_rules_muted: schema.number(),
count_mw_total: schema.number(),
count_mw_with_repeat_toggle_on: schema.number(),
count_mw_with_filter_alert_toggle_on: schema.number(),
count_rules_with_muted_alerts: schema.number(),
count_connector_types_by_consumers: schema.recordOf(
schema.string(),
@ -248,6 +251,9 @@ export const emptyState: LatestTaskStateSchema = {
},
count_rules_snoozed: 0,
count_rules_muted: 0,
count_mw_total: 0,
count_mw_with_repeat_toggle_on: 0,
count_mw_with_filter_alert_toggle_on: 0,
count_rules_with_muted_alerts: 0,
count_connector_types_by_consumers: {},
count_rules_namespaces: 0,

View file

@ -41,6 +41,9 @@ export interface AlertingUsage {
count_connector_types_by_consumers: Record<string, Record<string, number>>;
count_rules_snoozed: number;
count_rules_muted: number;
count_mw_total: number;
count_mw_with_repeat_toggle_on: number;
count_mw_with_filter_alert_toggle_on: number;
count_rules_with_muted_alerts: number;
count_rules_by_execution_status_per_day: Record<string, number>;
percentile_num_generated_actions_per_day: {

View file

@ -1724,6 +1724,15 @@
"count_rules_muted": {
"type": "long"
},
"count_mw_total": {
"type": "long"
},
"count_mw_with_repeat_toggle_on": {
"type": "long"
},
"count_mw_with_filter_alert_toggle_on": {
"type": "long"
},
"count_rules_with_muted_alerts": {
"type": "long"
},

View file

@ -90,6 +90,44 @@ export default function createAlertingAndActionsTelemetryTests({ getService }: F
return ruleResponse.body.id;
}
// Creates a maintenance window in the given space through the internal
// alerting API, registers it with the object remover for cleanup, and
// returns its id. Passing `interval` turns the "Repeat" toggle on; passing
// `scopedQuery` turns the "Filter alerts" toggle on.
async function createMaintenanceWindow({
  spaceId,
  interval,
  scopedQuery = null,
}: {
  spaceId: string;
  interval?: number;
  scopedQuery?: {
    filters: string[];
    kql: string;
    dsl: string;
  } | null;
}) {
  // Build the recurrence rule separately; `interval` is only included when
  // the caller asked for a repeating window.
  const rRule: Record<string, unknown> = {
    dtstart: new Date().toISOString(),
    tzid: 'UTC',
    freq: 0,
    count: 1,
  };
  if (interval) {
    rRule.interval = interval;
  }

  const requestBody = {
    title: 'test-maintenance-window',
    duration: 60 * 60 * 1000, // 1 hr
    r_rule: rRule,
    category_ids: ['management'],
    scoped_query: scopedQuery,
  };

  const response = await supertestWithoutAuth
    .post(`${getUrlPrefix(spaceId)}/internal/alerting/rules/maintenance_window`)
    .set('kbn-xsrf', 'foo')
    .auth(Superuser.username, Superuser.password)
    .send(requestBody);

  expect(response.status).to.equal(200);
  objectRemover.add(spaceId, response.body.id, 'rules/maintenance_window', 'alerting', true);
  return response.body.id;
}
async function setup() {
// Create rules and connectors in multiple spaces
for (const space of Spaces) {
@ -216,6 +254,18 @@ export default function createAlertingAndActionsTelemetryTests({ getService }: F
actions: [],
},
});
// MW with both toggles off
await createMaintenanceWindow({ spaceId: space.id });
// MW with 'Repeat' toggle on and 'Filter alerts' toggle on
await createMaintenanceWindow({
spaceId: space.id,
interval: 1,
scopedQuery: {
filters: [],
kql: 'kibana.alert.job_errors_results.job_id : * ',
dsl: '{"bool":{"must":[],"filter":[{"bool":{"should":[{"exists":{"field":"kibana.alert.job_errors_results.job_id"}}],"minimum_should_match":1}}],"should":[],"must_not":[]}}',
},
});
}
}
@ -500,6 +550,11 @@ export default function createAlertingAndActionsTelemetryTests({ getService }: F
expect(telemetry.count_rules_by_execution_status_per_day.failure > 0).to.be(true);
expect(telemetry.count_rules_by_execution_status_per_day.success > 0).to.be(true);
// maintenance window telemetry
expect(telemetry.count_mw_total).to.equal(6);
expect(telemetry.count_mw_with_filter_alert_toggle_on).to.equal(3);
expect(telemetry.count_mw_with_repeat_toggle_on).to.equal(3);
}
it('should retrieve telemetry data in the expected format', async () => {
@ -527,7 +582,7 @@ export default function createAlertingAndActionsTelemetryTests({ getService }: F
let actionsTelemetry: any;
await retry.try(async () => {
const telemetryTask = await es.get<TaskManagerDoc>({
id: `task:Actions-actions_telemetry`,
id: 'task:Actions-actions_telemetry',
index: '.kibana_task_manager',
});
expect(telemetryTask!._source!.task?.status).to.be('idle');
@ -550,7 +605,7 @@ export default function createAlertingAndActionsTelemetryTests({ getService }: F
let alertingTelemetry: any;
await retry.try(async () => {
const telemetryTask = await es.get<TaskManagerDoc>({
id: `task:Alerting-alerting_telemetry`,
id: 'task:Alerting-alerting_telemetry',
index: '.kibana_task_manager',
});
expect(telemetryTask!._source!.task?.status).to.be('idle');