[Alerting] Telemetry for long-running/cancelled rules (#123291)
* Renaming alerting telemetry files
* Adding daily counts for execution timeouts
* Threading in usageCounter
* Adding usage counter for alerts after cancellation
* Updating telemetry mappings
* Adding tests
* Adding tests
* Cleanup
* Cleanup
* Adding rule type id to counter name
* Adding new siem rule types
* Replacing all dots with underscores

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
Parent: 740ce6c7aa
Commit: 83fee75692
14 changed files with 601 additions and 121 deletions
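At a glance, the change threads a `UsageCounter` from the usage_collection plugin through alerting plugin setup and into the task runner, so that action executions skipped after a rule run is cancelled can be counted per rule type. A minimal sketch of the pattern, using the names that appear in the diff below (the surrounding plugin wiring is elided):

```ts
import { UsageCounter } from 'src/plugins/usage_collection/server';

// During plugin setup, one counter is created per feature id and kept on the
// plugin instance (ALERTS_FEATURE_ID in the diff):
//   this.usageCounter = plugins.usageCollection?.createUsageCounter(ALERTS_FEATURE_ID);

// In the task runner, when a cancelled rule run would still schedule actions,
// a daily counter keyed by rule type id is incremented:
function countSkippedActions(usageCounter: UsageCounter | undefined, ruleTypeId: string) {
  usageCounter?.incrementCounter({
    counterName: `alertsSkippedDueToRuleExecutionTimeout_${ruleTypeId}`,
    incrementBy: 1,
  });
}
```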
|
@ -8,7 +8,7 @@
|
|||
import type { PublicMethodsOf } from '@kbn/utility-types';
|
||||
import { first } from 'rxjs/operators';
|
||||
import { BehaviorSubject } from 'rxjs';
|
||||
import { UsageCollectionSetup } from 'src/plugins/usage_collection/server';
|
||||
import { UsageCollectionSetup, UsageCounter } from 'src/plugins/usage_collection/server';
|
||||
import { SecurityPluginSetup, SecurityPluginStart } from '../../security/server';
|
||||
import {
|
||||
EncryptedSavedObjectsPluginSetup,
|
||||
|
@ -51,7 +51,7 @@ import {
|
|||
AlertTypeState,
|
||||
Services,
|
||||
} from './types';
|
||||
import { registerAlertsUsageCollector } from './usage';
|
||||
import { registerAlertingUsageCollector } from './usage';
|
||||
import { initializeAlertingTelemetry, scheduleAlertingTelemetry } from './usage/task';
|
||||
import { IEventLogger, IEventLogService, IEventLogClientService } from '../../event_log/server';
|
||||
import { PluginStartContract as FeaturesPluginStart } from '../../features/server';
|
||||
|
@ -153,6 +153,7 @@ export class AlertingPlugin {
|
|||
private eventLogService?: IEventLogService;
|
||||
private eventLogger?: IEventLogger;
|
||||
private kibanaBaseUrl: string | undefined;
|
||||
private usageCounter: UsageCounter | undefined;
|
||||
|
||||
constructor(initializerContext: PluginInitializerContext) {
|
||||
this.config = initializerContext.config.create<AlertsConfig>().pipe(first()).toPromise();
|
||||
|
@ -208,7 +209,7 @@ export class AlertingPlugin {
|
|||
|
||||
const usageCollection = plugins.usageCollection;
|
||||
if (usageCollection) {
|
||||
registerAlertsUsageCollector(
|
||||
registerAlertingUsageCollector(
|
||||
usageCollection,
|
||||
core.getStartServices().then(([_, { taskManager }]) => taskManager)
|
||||
);
|
||||
|
@ -223,7 +224,7 @@ export class AlertingPlugin {
|
|||
}
|
||||
|
||||
// Usage counter for telemetry
|
||||
const usageCounter = plugins.usageCollection?.createUsageCounter(ALERTS_FEATURE_ID);
|
||||
this.usageCounter = plugins.usageCollection?.createUsageCounter(ALERTS_FEATURE_ID);
|
||||
|
||||
setupSavedObjects(
|
||||
core.savedObjects,
|
||||
|
@ -259,7 +260,7 @@ export class AlertingPlugin {
|
|||
defineRoutes({
|
||||
router,
|
||||
licenseState: this.licenseState,
|
||||
usageCounter,
|
||||
usageCounter: this.usageCounter,
|
||||
encryptedSavedObjects: plugins.encryptedSavedObjects,
|
||||
});
|
||||
|
||||
|
@ -393,6 +394,7 @@ export class AlertingPlugin {
|
|||
supportsEphemeralTasks: plugins.taskManager.supportsEphemeralTasks(),
|
||||
maxEphemeralActionsPerRule: config.maxEphemeralActionsPerAlert,
|
||||
cancelAlertsOnRuleTimeout: config.cancelAlertsOnRuleTimeout,
|
||||
usageCounter: this.usageCounter,
|
||||
});
|
||||
});
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
import sinon from 'sinon';
|
||||
import { schema } from '@kbn/config-schema';
|
||||
import { usageCountersServiceMock } from 'src/plugins/usage_collection/server/usage_counters/usage_counters_service.mock';
|
||||
import {
|
||||
AlertExecutorOptions,
|
||||
AlertTypeParams,
|
||||
|
@ -59,6 +60,9 @@ const ruleType: jest.Mocked<UntypedNormalizedRuleType> = {
|
|||
|
||||
let fakeTimer: sinon.SinonFakeTimers;
|
||||
|
||||
const mockUsageCountersSetup = usageCountersServiceMock.createSetupContract();
|
||||
const mockUsageCounter = mockUsageCountersSetup.createUsageCounter('test');
|
||||
|
||||
describe('Task Runner', () => {
|
||||
let mockedTaskInstance: ConcreteTaskInstance;
|
||||
|
||||
|
@ -113,6 +117,7 @@ describe('Task Runner', () => {
|
|||
supportsEphemeralTasks: false,
|
||||
maxEphemeralActionsPerRule: 10,
|
||||
cancelAlertsOnRuleTimeout: true,
|
||||
usageCounter: mockUsageCounter,
|
||||
};
|
||||
|
||||
function testAgainstEphemeralSupport(
|
||||
|
@ -397,6 +402,7 @@ describe('Task Runner', () => {
|
|||
},
|
||||
expect.any(Function)
|
||||
);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
testAgainstEphemeralSupport(
|
||||
|
@ -683,6 +689,7 @@ describe('Task Runner', () => {
|
|||
ruleset: 'alerts',
|
||||
},
|
||||
});
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
}
|
||||
);
|
||||
|
||||
|
@ -899,6 +906,7 @@ describe('Task Runner', () => {
|
|||
ruleset: 'alerts',
|
||||
},
|
||||
});
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
testAgainstEphemeralSupport(
|
||||
|
@ -965,6 +973,7 @@ describe('Task Runner', () => {
|
|||
4,
|
||||
'ruleExecutionStatus for test:1: {"lastExecutionDate":"1970-01-01T00:00:00.000Z","status":"active"}'
|
||||
);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
}
|
||||
);
|
||||
|
||||
|
@ -1157,6 +1166,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
testAgainstEphemeralSupport(
|
||||
|
@ -1218,6 +1228,7 @@ describe('Task Runner', () => {
|
|||
});
|
||||
await taskRunner.run();
|
||||
expect(enqueueFunction).toHaveBeenCalledTimes(1);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
}
|
||||
);
|
||||
|
||||
|
@ -1287,6 +1298,7 @@ describe('Task Runner', () => {
|
|||
});
|
||||
await taskRunner.run();
|
||||
expect(enqueueFunction).toHaveBeenCalledTimes(1);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
}
|
||||
);
|
||||
|
||||
|
@ -1607,6 +1619,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
}
|
||||
);
|
||||
|
||||
|
@ -2013,6 +2026,7 @@ describe('Task Runner', () => {
|
|||
},
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
}
|
||||
);
|
||||
|
||||
|
@ -2112,6 +2126,7 @@ describe('Task Runner', () => {
|
|||
expect(enqueueFunction).toHaveBeenCalledTimes(2);
|
||||
expect((enqueueFunction as jest.Mock).mock.calls[1][0].id).toEqual('1');
|
||||
expect((enqueueFunction as jest.Mock).mock.calls[0][0].id).toEqual('2');
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
}
|
||||
);
|
||||
|
||||
|
@ -2246,6 +2261,7 @@ describe('Task Runner', () => {
|
|||
},
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
}
|
||||
);
|
||||
|
||||
|
@ -2501,6 +2517,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('validates params before executing the alert type', async () => {
|
||||
|
@ -2557,6 +2574,7 @@ describe('Task Runner', () => {
|
|||
expect(taskRunnerFactoryInitializerParams.logger.error).toHaveBeenCalledWith(
|
||||
`Executing Rule foo:test:1 has resulted in Error: params invalid: [param1]: expected value of type [string] but got [undefined]`
|
||||
);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('uses API key when provided', async () => {
|
||||
|
@ -2591,6 +2609,7 @@ describe('Task Runner', () => {
|
|||
request,
|
||||
'/'
|
||||
);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test(`doesn't use API key when not provided`, async () => {
|
||||
|
@ -2623,6 +2642,7 @@ describe('Task Runner', () => {
|
|||
request,
|
||||
'/'
|
||||
);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('rescheduled the Alert if the schedule has update during a task run', async () => {
|
||||
|
@ -2673,6 +2693,7 @@ describe('Task Runner', () => {
|
|||
},
|
||||
}
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('recovers gracefully when the RuleType executor throws an exception', async () => {
|
||||
|
@ -2826,6 +2847,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('recovers gracefully when the Alert Task Runner throws an exception when fetching the encrypted attributes', async () => {
|
||||
|
@ -2960,6 +2982,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('recovers gracefully when the Alert Task Runner throws an exception when license is higher than supported', async () => {
|
||||
|
@ -3103,6 +3126,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('recovers gracefully when the Alert Task Runner throws an exception when getting internal Services', async () => {
|
||||
|
@ -3246,6 +3270,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('recovers gracefully when the Alert Task Runner throws an exception when fetching attributes', async () => {
|
||||
|
@ -3388,6 +3413,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('recovers gracefully when the Runner of a legacy Alert task which has no schedule throws an exception when fetching attributes', async () => {
|
||||
|
@ -3438,6 +3464,7 @@ describe('Task Runner', () => {
|
|||
"state": Object {},
|
||||
}
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test(`doesn't change previousStartedAt when it fails to run`, async () => {
|
||||
|
@ -3484,6 +3511,7 @@ describe('Task Runner', () => {
|
|||
expect(runnerResult.state.previousStartedAt).toEqual(
|
||||
new Date(originalAlertSate.previousStartedAt)
|
||||
);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('avoids rescheduling a failed Alert Task Runner when it throws due to failing to fetch the alert', async () => {
|
||||
|
@ -3525,6 +3553,7 @@ describe('Task Runner', () => {
|
|||
`Unable to execute rule "1" in the "foo" space because Saved object [alert/1] not found - this rule will not be rescheduled. To restart rule execution, try disabling and re-enabling this rule.`
|
||||
);
|
||||
expect(isUnrecoverableError(ex)).toBeTruthy();
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -3566,6 +3595,7 @@ describe('Task Runner', () => {
|
|||
1,
|
||||
`Unable to execute rule "1" in the "test space" space because Saved object [alert/1] not found - this rule will not be rescheduled. To restart rule execution, try disabling and re-enabling this rule.`
|
||||
);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -3877,6 +3907,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('duration is updated for active alerts when alert state contains start time', async () => {
|
||||
|
@ -4118,6 +4149,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('duration is not calculated for active alerts when alert state does not contain start time', async () => {
|
||||
|
@ -4347,6 +4379,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('end is logged for active alerts when alert state contains start time and alert recovers', async () => {
|
||||
|
@ -4575,6 +4608,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('end calculation is skipped for active alerts when alert state does not contain start time and alert recovers', async () => {
|
||||
|
@ -4799,6 +4833,7 @@ describe('Task Runner', () => {
|
|||
],
|
||||
]
|
||||
`);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('successfully executes the task with ephemeral tasks enabled', async () => {
|
||||
|
@ -4989,6 +5024,7 @@ describe('Task Runner', () => {
|
|||
},
|
||||
{ refresh: false, namespace: undefined }
|
||||
);
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('successfully bails on execution if the rule is disabled', async () => {
|
||||
|
@ -5083,6 +5119,7 @@ describe('Task Runner', () => {
|
|||
},
|
||||
message: 'test:1: execution failed',
|
||||
});
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('successfully stores successful runs', async () => {
|
||||
|
|
|
@ -8,6 +8,7 @@ import apm from 'elastic-apm-node';
|
|||
import type { PublicMethodsOf } from '@kbn/utility-types';
|
||||
import { Dictionary, pickBy, mapValues, without, cloneDeep } from 'lodash';
|
||||
import type { Request } from '@hapi/hapi';
|
||||
import { UsageCounter } from 'src/plugins/usage_collection/server';
|
||||
import uuid from 'uuid';
|
||||
import { addSpaceIdToPath } from '../../../spaces/server';
|
||||
import { Logger, KibanaRequest } from '../../../../../src/core/server';
|
||||
|
@ -109,6 +110,7 @@ export class TaskRunner<
|
|||
>;
|
||||
private readonly executionId: string;
|
||||
private readonly ruleTypeRegistry: RuleTypeRegistry;
|
||||
private usageCounter?: UsageCounter;
|
||||
private searchAbortController: AbortController;
|
||||
private cancelled: boolean;
|
||||
|
||||
|
@ -127,6 +129,7 @@ export class TaskRunner<
|
|||
) {
|
||||
this.context = context;
|
||||
this.logger = context.logger;
|
||||
this.usageCounter = context.usageCounter;
|
||||
this.ruleType = ruleType;
|
||||
this.ruleName = null;
|
||||
this.taskInstance = taskInstanceToAlertTaskInstance(taskInstance);
|
||||
|
@ -256,6 +259,18 @@ export class TaskRunner<
|
|||
return !this.context.cancelAlertsOnRuleTimeout || !this.ruleType.cancelAlertsOnRuleTimeout;
|
||||
}
|
||||
|
||||
private countUsageOfActionExecutionAfterRuleCancellation() {
|
||||
if (this.cancelled && this.usageCounter) {
|
||||
if (this.context.cancelAlertsOnRuleTimeout && this.ruleType.cancelAlertsOnRuleTimeout) {
|
||||
// Increment usage counter for skipped actions
|
||||
this.usageCounter.incrementCounter({
|
||||
counterName: `alertsSkippedDueToRuleExecutionTimeout_${this.ruleType.id}`,
|
||||
incrementBy: 1,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async executeAlert(
|
||||
alertId: string,
|
||||
alert: AlertInstance<InstanceState, InstanceContext>,
|
||||
|
@ -378,6 +393,7 @@ export class TaskRunner<
|
|||
event.error.message = err.message;
|
||||
event.event = event.event || {};
|
||||
event.event.outcome = 'failure';
|
||||
|
||||
throw new ErrorWithReason(AlertExecutionStatusErrorReasons.Execute, err);
|
||||
}
|
||||
|
||||
|
@ -483,6 +499,12 @@ export class TaskRunner<
|
|||
this.logger.debug(
|
||||
`no scheduling of actions for rule ${ruleLabel}: rule execution has been cancelled.`
|
||||
);
|
||||
// Usage counter for telemetry
|
||||
// This keeps track of how many times action executions were skipped after rule
|
||||
// execution completed successfully after the execution timeout
|
||||
// This can occur when rule executors do not short circuit execution in response
|
||||
// to timeout
|
||||
this.countUsageOfActionExecutionAfterRuleCancellation();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
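The counting added to task_runner.ts above is gated: a skipped action is only recorded when the run was actually cancelled and both the Kibana config flag and the rule type itself opt in to cancelling alerts on timeout. A condensed restatement of that condition (function and parameter names here are illustrative, not from the source):

```ts
// Condensed from countUsageOfActionExecutionAfterRuleCancellation() in the diff above.
// In the source, the presence of a usageCounter is also checked before incrementing.
function shouldCountSkippedActions(
  cancelled: boolean,
  configCancelAlertsOnRuleTimeout: boolean,
  ruleTypeCancelAlertsOnRuleTimeout: boolean
): boolean {
  return cancelled && configCancelAlertsOnRuleTimeout && ruleTypeCancelAlertsOnRuleTimeout;
}
```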
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
import sinon from 'sinon';
|
||||
import { usageCountersServiceMock } from 'src/plugins/usage_collection/server/usage_counters/usage_counters_service.mock';
|
||||
import {
|
||||
AlertExecutorOptions,
|
||||
AlertTypeParams,
|
||||
|
@ -52,6 +53,9 @@ const ruleType: jest.Mocked<UntypedNormalizedRuleType> = {
|
|||
|
||||
let fakeTimer: sinon.SinonFakeTimers;
|
||||
|
||||
const mockUsageCountersSetup = usageCountersServiceMock.createSetupContract();
|
||||
const mockUsageCounter = mockUsageCountersSetup.createUsageCounter('test');
|
||||
|
||||
describe('Task Runner Cancel', () => {
|
||||
let mockedTaskInstance: ConcreteTaskInstance;
|
||||
|
||||
|
@ -106,6 +110,7 @@ describe('Task Runner Cancel', () => {
|
|||
supportsEphemeralTasks: false,
|
||||
maxEphemeralActionsPerRule: 10,
|
||||
cancelAlertsOnRuleTimeout: true,
|
||||
usageCounter: mockUsageCounter,
|
||||
};
|
||||
|
||||
const mockDate = new Date('2019-02-12T21:01:22.479Z');
|
||||
|
@ -333,6 +338,11 @@ describe('Task Runner Cancel', () => {
|
|||
},
|
||||
{ refresh: false, namespace: undefined }
|
||||
);
|
||||
expect(mockUsageCounter.incrementCounter).toHaveBeenCalledTimes(1);
|
||||
expect(mockUsageCounter.incrementCounter).toHaveBeenCalledWith({
|
||||
counterName: 'alertsSkippedDueToRuleExecutionTimeout_test',
|
||||
incrementBy: 1,
|
||||
});
|
||||
});
|
||||
|
||||
test('actionsPlugin.execute is called if rule execution is cancelled but cancelAlertsOnRuleTimeout from config is false', async () => {
|
||||
|
@ -361,6 +371,8 @@ describe('Task Runner Cancel', () => {
|
|||
await promise;
|
||||
|
||||
testActionsExecute();
|
||||
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('actionsPlugin.execute is called if rule execution is cancelled but cancelAlertsOnRuleTimeout for ruleType is false', async () => {
|
||||
|
@ -397,6 +409,8 @@ describe('Task Runner Cancel', () => {
|
|||
await promise;
|
||||
|
||||
testActionsExecute();
|
||||
|
||||
expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('actionsPlugin.execute is skipped if rule execution is cancelled and cancelAlertsOnRuleTimeout for both config and ruleType are true', async () => {
|
||||
|
@ -563,6 +577,12 @@ describe('Task Runner Cancel', () => {
|
|||
ruleset: 'alerts',
|
||||
},
|
||||
});
|
||||
|
||||
expect(mockUsageCounter.incrementCounter).toHaveBeenCalledTimes(1);
|
||||
expect(mockUsageCounter.incrementCounter).toHaveBeenCalledWith({
|
||||
counterName: 'alertsSkippedDueToRuleExecutionTimeout_test',
|
||||
incrementBy: 1,
|
||||
});
|
||||
});
|
||||
|
||||
function testActionsExecute() {
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
import sinon from 'sinon';
|
||||
import { usageCountersServiceMock } from 'src/plugins/usage_collection/server/usage_counters/usage_counters_service.mock';
|
||||
import { ConcreteTaskInstance, TaskStatus } from '../../../task_manager/server';
|
||||
import { TaskRunnerContext, TaskRunnerFactory } from './task_runner_factory';
|
||||
import { encryptedSavedObjectsMock } from '../../../encrypted_saved_objects/server/mocks';
|
||||
|
@ -22,7 +23,8 @@ import { ruleTypeRegistryMock } from '../rule_type_registry.mock';
|
|||
import { executionContextServiceMock } from '../../../../../src/core/server/mocks';
|
||||
|
||||
const executionContext = executionContextServiceMock.createSetupContract();
|
||||
|
||||
const mockUsageCountersSetup = usageCountersServiceMock.createSetupContract();
|
||||
const mockUsageCounter = mockUsageCountersSetup.createUsageCounter('test');
|
||||
const ruleType: UntypedNormalizedRuleType = {
|
||||
id: 'test',
|
||||
name: 'My test alert',
|
||||
|
@ -86,6 +88,7 @@ describe('Task Runner Factory', () => {
|
|||
maxEphemeralActionsPerRule: 10,
|
||||
cancelAlertsOnRuleTimeout: true,
|
||||
executionContext,
|
||||
usageCounter: mockUsageCounter,
|
||||
};
|
||||
|
||||
beforeEach(() => {
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
import type { PublicMethodsOf } from '@kbn/utility-types';
|
||||
import { UsageCounter } from 'src/plugins/usage_collection/server';
|
||||
import type {
|
||||
Logger,
|
||||
KibanaRequest,
|
||||
|
@ -46,6 +47,7 @@ export interface TaskRunnerContext {
|
|||
supportsEphemeralTasks: boolean;
|
||||
maxEphemeralActionsPerRule: number;
|
||||
cancelAlertsOnRuleTimeout: boolean;
|
||||
usageCounter?: UsageCounter;
|
||||
}
|
||||
|
||||
export class TaskRunnerFactory {
|
||||
|
|
|
@ -5,22 +5,25 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
/* eslint-disable @typescript-eslint/naming-convention */
|
||||
|
||||
// eslint-disable-next-line @kbn/eslint/no-restricted-paths
|
||||
import { elasticsearchClientMock } from '../../../../../src/core/server/elasticsearch/client/mocks';
|
||||
import {
|
||||
getTotalCountAggregations,
|
||||
getTotalCountInUse,
|
||||
getExecutionsPerDayCount,
|
||||
} from './alerts_telemetry';
|
||||
getExecutionTimeoutsPerDayCount,
|
||||
} from './alerting_telemetry';
|
||||
|
||||
describe('alerts telemetry', () => {
|
||||
test('getTotalCountInUse should replace first "." symbol to "__" in alert types names', async () => {
|
||||
describe('alerting telemetry', () => {
|
||||
test('getTotalCountInUse should replace "." symbols with "__" in rule types names', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockReturnValue(
|
||||
// @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
|
||||
elasticsearchClientMock.createSuccessTransportRequestPromise({
|
||||
aggregations: {
|
||||
byAlertTypeId: {
|
||||
byRuleTypeId: {
|
||||
value: {
|
||||
ruleTypes: {
|
||||
'.index-threshold': 2,
|
||||
|
@ -47,8 +50,8 @@ describe('alerts telemetry', () => {
|
|||
Object {
|
||||
"countByType": Object {
|
||||
"__index-threshold": 2,
|
||||
"document.test__": 1,
|
||||
"logs.alert.document.count": 1,
|
||||
"document__test__": 1,
|
||||
"logs__alert__document__count": 1,
|
||||
},
|
||||
"countNamespaces": 1,
|
||||
"countTotal": 4,
|
||||
|
@ -62,7 +65,7 @@ Object {
|
|||
// @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
|
||||
elasticsearchClientMock.createSuccessTransportRequestPromise({
|
||||
aggregations: {
|
||||
byAlertTypeId: {
|
||||
byRuleTypeId: {
|
||||
value: {
|
||||
ruleTypes: {
|
||||
'.index-threshold': 2,
|
||||
|
@ -100,8 +103,8 @@ Object {
|
|||
},
|
||||
"count_by_type": Object {
|
||||
"__index-threshold": 2,
|
||||
"document.test__": 1,
|
||||
"logs.alert.document.count": 1,
|
||||
"document__test__": 1,
|
||||
"logs__alert__document__count": 1,
|
||||
},
|
||||
"count_rules_namespaces": 0,
|
||||
"count_total": 4,
|
||||
|
@ -129,7 +132,7 @@ Object {
|
|||
`);
|
||||
});
|
||||
|
||||
test('getTotalExecutionsCount should return execution aggregations for total count, count by rule type and number of failed executions', async () => {
|
||||
test('getExecutionsPerDayCount should return execution aggregations for total count, count by rule type and number of failed executions', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockReturnValue(
|
||||
// @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
|
||||
|
@ -176,26 +179,62 @@ Object {
|
|||
avgExecutionTime: 0,
|
||||
avgExecutionTimeByType: {
|
||||
'__index-threshold': 1043934,
|
||||
'document.test__': 17687687,
|
||||
'logs.alert.document.count': 1675765,
|
||||
document__test__: 17687687,
|
||||
logs__alert__document__count: 1675765,
|
||||
},
|
||||
countByType: {
|
||||
'__index-threshold': 2,
|
||||
'document.test__': 1,
|
||||
'logs.alert.document.count': 1,
|
||||
document__test__: 1,
|
||||
logs__alert__document__count: 1,
|
||||
},
|
||||
countFailuresByReason: {
|
||||
unknown: 4,
|
||||
},
|
||||
countFailuresByReasonByType: {
|
||||
unknown: {
|
||||
'.index-threshold': 2,
|
||||
'document.test.': 1,
|
||||
'logs.alert.document.count': 1,
|
||||
'__index-threshold': 2,
|
||||
document__test__: 1,
|
||||
logs__alert__document__count: 1,
|
||||
},
|
||||
},
|
||||
countTotal: 4,
|
||||
countTotalFailures: 4,
|
||||
});
|
||||
});
|
||||
|
||||
test('getExecutionTimeoutsPerDayCount should return execution aggregations for total timeout count and count by rule type', async () => {
|
||||
const mockEsClient = elasticsearchClientMock.createClusterClient().asScoped().asInternalUser;
|
||||
mockEsClient.search.mockReturnValue(
|
||||
// @ts-expect-error @elastic/elasticsearch Aggregate only allows unknown values
|
||||
elasticsearchClientMock.createSuccessTransportRequestPromise({
|
||||
aggregations: {
|
||||
byRuleTypeId: {
|
||||
value: {
|
||||
ruleTypes: {
|
||||
'.index-threshold': 2,
|
||||
'logs.alert.document.count': 1,
|
||||
'document.test.': 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
hits: {
|
||||
hits: [],
|
||||
},
|
||||
})
|
||||
);
|
||||
|
||||
const telemetry = await getExecutionTimeoutsPerDayCount(mockEsClient, 'test');
|
||||
|
||||
expect(mockEsClient.search).toHaveBeenCalledTimes(1);
|
||||
|
||||
expect(telemetry).toStrictEqual({
|
||||
countTotal: 4,
|
||||
countByType: {
|
||||
'__index-threshold': 2,
|
||||
document__test__: 1,
|
||||
logs__alert__document__count: 1,
|
||||
},
|
||||
});
|
||||
});
|
||||
});
|
|
@ -6,15 +6,15 @@
|
|||
*/
|
||||
|
||||
import { ElasticsearchClient } from 'kibana/server';
|
||||
import { AlertsUsage } from './types';
|
||||
import { AlertingUsage } from './types';
|
||||
|
||||
const alertTypeMetric = {
|
||||
const ruleTypeMetric = {
|
||||
scripted_metric: {
|
||||
init_script: 'state.ruleTypes = [:]; state.namespaces = [:]',
|
||||
map_script: `
|
||||
String alertType = doc['alert.alertTypeId'].value;
|
||||
String ruleType = doc['alert.alertTypeId'].value;
|
||||
String namespace = doc['namespaces'] !== null && doc['namespaces'].size() > 0 ? doc['namespaces'].value : 'default';
|
||||
state.ruleTypes.put(alertType, state.ruleTypes.containsKey(alertType) ? state.ruleTypes.get(alertType) + 1 : 1);
|
||||
state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1);
|
||||
if (state.namespaces.containsKey(namespace) === false) {
|
||||
state.namespaces.put(namespace, 1);
|
||||
}
|
||||
|
@ -38,7 +38,7 @@ const alertTypeMetric = {
|
|||
},
|
||||
};
|
||||
|
||||
const ruleTypeExecutionsMetric = {
|
||||
const ruleTypeExecutionsWithDurationMetric = {
|
||||
scripted_metric: {
|
||||
init_script: 'state.ruleTypes = [:]; state.ruleTypesDuration = [:];',
|
||||
map_script: `
|
||||
|
@ -66,6 +66,32 @@ const ruleTypeExecutionsMetric = {
|
|||
},
|
||||
};
|
||||
|
||||
const ruleTypeExecutionsMetric = {
|
||||
scripted_metric: {
|
||||
init_script: 'state.ruleTypes = [:]',
|
||||
map_script: `
|
||||
String ruleType = doc['rule.category'].value;
|
||||
state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1);
|
||||
`,
|
||||
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
|
||||
// Despite docs that say this is optional, this script can't be blank.
|
||||
combine_script: 'return state',
|
||||
// Reduce script is executed across all clusters, so we need to add up all the total from each cluster
|
||||
// This also needs to account for having no data
|
||||
reduce_script: `
|
||||
Map result = [:];
|
||||
for (Map m : states.toArray()) {
|
||||
if (m !== null) {
|
||||
for (String k : m.keySet()) {
|
||||
result.put(k, result.containsKey(k) ? result.get(k) + m.get(k) : m.get(k));
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
`,
|
||||
},
|
||||
};
|
||||
|
||||
const ruleTypeFailureExecutionsMetric = {
|
||||
scripted_metric: {
|
||||
init_script: 'state.reasons = [:]',
|
||||
|
@ -99,10 +125,10 @@ const ruleTypeFailureExecutionsMetric = {
|
|||
|
||||
export async function getTotalCountAggregations(
|
||||
esClient: ElasticsearchClient,
|
||||
kibanaInex: string
|
||||
kibanaIndex: string
|
||||
): Promise<
|
||||
Pick<
|
||||
AlertsUsage,
|
||||
AlertingUsage,
|
||||
| 'count_total'
|
||||
| 'count_by_type'
|
||||
| 'throttle_time'
|
||||
|
@ -114,7 +140,7 @@ export async function getTotalCountAggregations(
|
|||
>
|
||||
> {
|
||||
const { body: results } = await esClient.search({
|
||||
index: kibanaInex,
|
||||
index: kibanaIndex,
|
||||
body: {
|
||||
size: 0,
|
||||
query: {
|
||||
|
@ -210,7 +236,7 @@ export async function getTotalCountAggregations(
|
|||
},
|
||||
},
|
||||
aggs: {
|
||||
byAlertTypeId: alertTypeMetric,
|
||||
byRuleTypeId: ruleTypeMetric,
|
||||
max_throttle_time: { max: { field: 'alert_throttle' } },
|
||||
min_throttle_time: { min: { field: 'alert_throttle' } },
|
||||
avg_throttle_time: { avg: { field: 'alert_throttle' } },
|
||||
|
@ -225,7 +251,7 @@ export async function getTotalCountAggregations(
|
|||
});
|
||||
|
||||
const aggregations = results.aggregations as {
|
||||
byAlertTypeId: { value: { ruleTypes: Record<string, string> } };
|
||||
byRuleTypeId: { value: { ruleTypes: Record<string, string> } };
|
||||
max_throttle_time: { value: number };
|
||||
min_throttle_time: { value: number };
|
||||
avg_throttle_time: { value: number };
|
||||
|
@ -237,23 +263,15 @@ export async function getTotalCountAggregations(
|
|||
avg_actions_count: { value: number };
|
||||
};
|
||||
|
||||
const totalAlertsCount = Object.keys(aggregations.byAlertTypeId.value.ruleTypes).reduce(
|
||||
const totalRulesCount = Object.keys(aggregations.byRuleTypeId.value.ruleTypes).reduce(
|
||||
(total: number, key: string) =>
|
||||
parseInt(aggregations.byAlertTypeId.value.ruleTypes[key], 10) + total,
|
||||
parseInt(aggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
||||
0
|
||||
);
|
||||
|
||||
return {
|
||||
count_total: totalAlertsCount,
|
||||
count_by_type: Object.keys(aggregations.byAlertTypeId.value.ruleTypes).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[replaceFirstAndLastDotSymbols(key)]: aggregations.byAlertTypeId.value.ruleTypes[key],
|
||||
}),
|
||||
{}
|
||||
),
|
||||
count_total: totalRulesCount,
|
||||
count_by_type: replaceDotSymbolsInRuleTypeIds(aggregations.byRuleTypeId.value.ruleTypes),
|
||||
throttle_time: {
|
||||
min: `${aggregations.min_throttle_time.value}s`,
|
||||
avg: `${aggregations.avg_throttle_time.value}s`,
|
||||
|
@ -283,9 +301,9 @@ export async function getTotalCountAggregations(
|
|||
};
|
||||
}
|
||||
|
||||
export async function getTotalCountInUse(esClient: ElasticsearchClient, kibanaInex: string) {
|
||||
export async function getTotalCountInUse(esClient: ElasticsearchClient, kibanaIndex: string) {
|
||||
const { body: searchResult } = await esClient.search({
|
||||
index: kibanaInex,
|
||||
index: kibanaIndex,
|
||||
size: 0,
|
||||
body: {
|
||||
query: {
|
||||
|
@ -294,43 +312,28 @@ export async function getTotalCountInUse(esClient: ElasticsearchClient, kibanaIn
|
|||
},
|
||||
},
|
||||
aggs: {
|
||||
byAlertTypeId: alertTypeMetric,
|
||||
byRuleTypeId: ruleTypeMetric,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const aggregations = searchResult.aggregations as {
|
||||
byAlertTypeId: {
|
||||
byRuleTypeId: {
|
||||
value: { ruleTypes: Record<string, string>; namespaces: Record<string, string> };
|
||||
};
|
||||
};
|
||||
|
||||
return {
|
||||
countTotal: Object.keys(aggregations.byAlertTypeId.value.ruleTypes).reduce(
|
||||
countTotal: Object.keys(aggregations.byRuleTypeId.value.ruleTypes).reduce(
|
||||
(total: number, key: string) =>
|
||||
parseInt(aggregations.byAlertTypeId.value.ruleTypes[key], 10) + total,
|
||||
parseInt(aggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
||||
0
|
||||
),
|
||||
countByType: Object.keys(aggregations.byAlertTypeId.value.ruleTypes).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[replaceFirstAndLastDotSymbols(key)]: aggregations.byAlertTypeId.value.ruleTypes[key],
|
||||
}),
|
||||
{}
|
||||
),
|
||||
countNamespaces: Object.keys(aggregations.byAlertTypeId.value.namespaces).length,
|
||||
countByType: replaceDotSymbolsInRuleTypeIds(aggregations.byRuleTypeId.value.ruleTypes),
|
||||
countNamespaces: Object.keys(aggregations.byRuleTypeId.value.namespaces).length,
|
||||
};
|
||||
}
|
||||
|
||||
function replaceFirstAndLastDotSymbols(strToReplace: string) {
|
||||
const hasFirstSymbolDot = strToReplace.startsWith('.');
|
||||
const appliedString = hasFirstSymbolDot ? strToReplace.replace('.', '__') : strToReplace;
|
||||
const hasLastSymbolDot = strToReplace.endsWith('.');
|
||||
return hasLastSymbolDot ? `${appliedString.slice(0, -1)}__` : appliedString;
|
||||
}
|
||||
|
||||
export async function getExecutionsPerDayCount(
|
||||
esClient: ElasticsearchClient,
|
||||
eventLogIndex: string
|
||||
|
@ -363,7 +366,7 @@ export async function getExecutionsPerDayCount(
|
|||
},
|
||||
},
|
||||
aggs: {
|
||||
byRuleTypeId: ruleTypeExecutionsMetric,
|
||||
byRuleTypeId: ruleTypeExecutionsWithDurationMetric,
|
||||
failuresByReason: ruleTypeFailureExecutionsMetric,
|
||||
avgDuration: { avg: { field: 'event.duration' } },
|
||||
},
|
||||
|
@ -392,15 +395,8 @@ export async function getExecutionsPerDayCount(
|
|||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
||||
0
|
||||
),
|
||||
countByType: Object.keys(executionsAggregations.byRuleTypeId.value.ruleTypes).reduce(
|
||||
// ES DSL aggregations are returned as `any` by esClient.search
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[replaceFirstAndLastDotSymbols(key)]:
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypes[key],
|
||||
}),
|
||||
{}
|
||||
countByType: replaceDotSymbolsInRuleTypeIds(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
||||
),
|
||||
countTotalFailures: Object.keys(
|
||||
executionFailuresAggregations.failuresByReason.value.reasons
|
||||
|
@ -426,7 +422,7 @@ export async function getExecutionsPerDayCount(
|
|||
);
|
||||
return {
|
||||
...obj,
|
||||
[replaceFirstAndLastDotSymbols(reason)]: countByRuleTypes,
|
||||
[replaceDotSymbols(reason)]: countByRuleTypes,
|
||||
};
|
||||
},
|
||||
{}
|
||||
|
@ -438,8 +434,9 @@ export async function getExecutionsPerDayCount(
|
|||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[replaceFirstAndLastDotSymbols(key)]:
|
||||
executionFailuresAggregations.failuresByReason.value.reasons[key],
|
||||
[key]: replaceDotSymbolsInRuleTypeIds(
|
||||
executionFailuresAggregations.failuresByReason.value.reasons[key]
|
||||
),
|
||||
}),
|
||||
{}
|
||||
),
|
||||
|
@ -449,7 +446,7 @@ export async function getExecutionsPerDayCount(
|
|||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(obj: any, key: string) => ({
|
||||
...obj,
|
||||
[replaceFirstAndLastDotSymbols(key)]: Math.round(
|
||||
[replaceDotSymbols(key)]: Math.round(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypesDuration[key] /
|
||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10)
|
||||
),
|
||||
|
@ -458,3 +455,69 @@ export async function getExecutionsPerDayCount(
|
|||
),
|
||||
};
|
||||
}
|
||||
|
||||
export async function getExecutionTimeoutsPerDayCount(
|
||||
esClient: ElasticsearchClient,
|
||||
eventLogIndex: string
|
||||
) {
|
||||
const { body: searchResult } = await esClient.search({
|
||||
index: eventLogIndex,
|
||||
size: 0,
|
||||
body: {
|
||||
query: {
|
||||
bool: {
|
||||
filter: {
|
||||
bool: {
|
||||
must: [
|
||||
{
|
||||
term: { 'event.action': 'execute-timeout' },
|
||||
},
|
||||
{
|
||||
term: { 'event.provider': 'alerting' },
|
||||
},
|
||||
{
|
||||
range: {
|
||||
'@timestamp': {
|
||||
gte: 'now-1d',
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
aggs: {
|
||||
byRuleTypeId: ruleTypeExecutionsMetric,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const executionsAggregations = searchResult.aggregations as {
|
||||
byRuleTypeId: {
|
||||
value: { ruleTypes: Record<string, string>; ruleTypesDuration: Record<string, number> };
|
||||
};
|
||||
};
|
||||
|
||||
return {
|
||||
countTotal: Object.keys(executionsAggregations.byRuleTypeId.value.ruleTypes).reduce(
|
||||
(total: number, key: string) =>
|
||||
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + total,
|
||||
0
|
||||
),
|
||||
countByType: replaceDotSymbolsInRuleTypeIds(
|
||||
executionsAggregations.byRuleTypeId.value.ruleTypes
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
function replaceDotSymbols(strToReplace: string) {
|
||||
return strToReplace.replaceAll('.', '__');
|
||||
}
|
||||
|
||||
function replaceDotSymbolsInRuleTypeIds(ruleTypeIdObj: Record<string, string>) {
|
||||
return Object.keys(ruleTypeIdObj).reduce(
|
||||
(obj, key) => ({ ...obj, [replaceDotSymbols(key)]: ruleTypeIdObj[key] }),
|
||||
{}
|
||||
);
|
||||
}
|
|
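Rule type ids contain dots (for example `.index-threshold` and `logs.alert.document.count`), so the new helpers replace every dot with a double underscore before the ids are used as keys, presumably to keep them as single flat keys in the telemetry mappings. A small usage sketch, with expected values taken from the updated tests above:

```ts
// Mirrors replaceDotSymbols / replaceDotSymbolsInRuleTypeIds from the diff above.
const replaceDotSymbols = (s: string) => s.replaceAll('.', '__');

const sanitized = Object.fromEntries(
  Object.entries({
    '.index-threshold': 2,
    'document.test.': 1,
    'logs.alert.document.count': 1,
  }).map(([key, count]) => [replaceDotSymbols(key), count])
);
// => { '__index-threshold': 2, document__test__: 1, logs__alert__document__count: 1 }
```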
@ -6,13 +6,13 @@
|
|||
*/
|
||||
|
||||
import { UsageCollectionSetup } from 'src/plugins/usage_collection/server';
|
||||
import { registerAlertsUsageCollector } from './alerts_usage_collector';
|
||||
import { registerAlertingUsageCollector } from './alerting_usage_collector';
|
||||
import { taskManagerMock } from '../../../task_manager/server/mocks';
|
||||
const taskManagerStart = taskManagerMock.createStart();
|
||||
|
||||
beforeEach(() => jest.resetAllMocks());
|
||||
|
||||
describe('registerAlertsUsageCollector', () => {
|
||||
describe('registerAlertingUsageCollector', () => {
|
||||
let usageCollectionMock: jest.Mocked<UsageCollectionSetup>;
|
||||
|
||||
beforeEach(() => {
|
||||
|
@ -23,7 +23,7 @@ describe('registerAlertsUsageCollector', () => {
|
|||
});
|
||||
|
||||
it('should call registerCollector', () => {
|
||||
registerAlertsUsageCollector(
|
||||
registerAlertingUsageCollector(
|
||||
usageCollectionMock as UsageCollectionSetup,
|
||||
new Promise(() => taskManagerStart)
|
||||
);
|
||||
|
@ -31,7 +31,7 @@ describe('registerAlertsUsageCollector', () => {
|
|||
});
|
||||
|
||||
it('should call makeUsageCollector with type = alerts', () => {
|
||||
registerAlertsUsageCollector(
|
||||
registerAlertingUsageCollector(
|
||||
usageCollectionMock as UsageCollectionSetup,
|
||||
new Promise(() => taskManagerStart)
|
||||
);
|
|
@ -8,12 +8,12 @@
|
|||
import { MakeSchemaFrom, UsageCollectionSetup } from 'src/plugins/usage_collection/server';
|
||||
import { get } from 'lodash';
|
||||
import { TaskManagerStartContract } from '../../../task_manager/server';
|
||||
import { AlertsUsage } from './types';
|
||||
import { AlertingUsage } from './types';
|
||||
|
||||
const byTypeSchema: MakeSchemaFrom<AlertsUsage>['count_by_type'] = {
|
||||
const byTypeSchema: MakeSchemaFrom<AlertingUsage>['count_by_type'] = {
|
||||
// TODO: Find out an automated way to populate the keys or reformat these into an array (and change the Remote Telemetry indexer accordingly)
|
||||
DYNAMIC_KEY: { type: 'long' },
|
||||
// Known alerts (searching the use of the alerts API `registerType`:
|
||||
// Known rule types (searching the use of the rules API `registerType`:
|
||||
// Built-in
|
||||
'__index-threshold': { type: 'long' },
|
||||
'__es-query': { type: 'long' },
|
||||
|
@ -39,6 +39,12 @@ const byTypeSchema: MakeSchemaFrom<AlertsUsage>['count_by_type'] = {
|
|||
// Security Solution
|
||||
siem__signals: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
siem__notifications: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
siem__eqlRule: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
siem__indicatorRule: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
siem__mlRule: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
siem__queryRule: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
siem__savedQueryRule: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
siem__thresholdRule: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
// Uptime
|
||||
xpack__uptime__alerts__monitorStatus: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
xpack__uptime__alerts__tls: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
|
@ -50,7 +56,7 @@ const byTypeSchema: MakeSchemaFrom<AlertsUsage>['count_by_type'] = {
|
|||
xpack__ml__anomaly_detection_jobs_health: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
|
||||
};
|
||||
|
||||
const byReasonSchema: MakeSchemaFrom<AlertsUsage>['count_rules_executions_failured_by_reason_per_day'] =
|
||||
const byReasonSchema: MakeSchemaFrom<AlertingUsage>['count_rules_executions_failured_by_reason_per_day'] =
|
||||
{
|
||||
// TODO: Find out an automated way to populate the keys or reformat these into an array (and change the Remote Telemetry indexer accordingly)
|
||||
DYNAMIC_KEY: { type: 'long' },
|
||||
|
@ -60,7 +66,7 @@ const byReasonSchema: MakeSchemaFrom<AlertsUsage>['count_rules_executions_failur
|
|||
unknown: { type: 'long' },
|
||||
};
|
||||
|
||||
const byReasonSchemaByType: MakeSchemaFrom<AlertsUsage>['count_rules_executions_failured_by_reason_by_type_per_day'] =
|
||||
const byReasonSchemaByType: MakeSchemaFrom<AlertingUsage>['count_rules_executions_failured_by_reason_by_type_per_day'] =
|
||||
{
|
||||
// TODO: Find out an automated way to populate the keys or reformat these into an array (and change the Remote Telemetry indexer accordingly)
|
||||
DYNAMIC_KEY: byTypeSchema,
|
||||
|
@ -70,11 +76,11 @@ const byReasonSchemaByType: MakeSchemaFrom<AlertsUsage>['count_rules_executions_
|
|||
unknown: byTypeSchema,
|
||||
};
|
||||
|
||||
export function createAlertsUsageCollector(
|
||||
export function createAlertingUsageCollector(
|
||||
usageCollection: UsageCollectionSetup,
|
||||
taskManager: Promise<TaskManagerStartContract>
|
||||
) {
|
||||
return usageCollection.makeUsageCollector<AlertsUsage>({
|
||||
return usageCollection.makeUsageCollector<AlertingUsage>({
|
||||
type: 'alerts',
|
||||
isReady: async () => {
|
||||
await taskManager;
|
||||
|
@ -84,7 +90,7 @@ export function createAlertsUsageCollector(
|
|||
try {
|
||||
const doc = await getLatestTaskState(await taskManager);
|
||||
// get the accumulated state from the recurring task
|
||||
const { runs, ...state } = get(doc, 'state') as AlertsUsage & { runs: number };
|
||||
const { runs, ...state } = get(doc, 'state') as AlertingUsage & { runs: number };
|
||||
|
||||
return {
|
||||
...state,
|
||||
|
@ -127,6 +133,8 @@ export function createAlertsUsageCollector(
|
|||
count_rules_executions_failured_per_day: 0,
|
||||
count_rules_executions_failured_by_reason_per_day: {},
|
||||
count_rules_executions_failured_by_reason_by_type_per_day: {},
|
||||
count_rules_executions_timeouts_per_day: 0,
|
||||
count_rules_executions_timeouts_by_type_per_day: {},
|
||||
avg_execution_time_per_day: 0,
|
||||
avg_execution_time_by_type_per_day: {},
|
||||
};
|
||||
|
@ -169,6 +177,8 @@ export function createAlertsUsageCollector(
|
|||
count_rules_executions_failured_per_day: { type: 'long' },
|
||||
count_rules_executions_failured_by_reason_per_day: byReasonSchema,
|
||||
count_rules_executions_failured_by_reason_by_type_per_day: byReasonSchemaByType,
|
||||
count_rules_executions_timeouts_per_day: { type: 'long' },
|
||||
count_rules_executions_timeouts_by_type_per_day: byTypeSchema,
|
||||
avg_execution_time_per_day: { type: 'long' },
|
||||
avg_execution_time_by_type_per_day: byTypeSchema,
|
||||
},
|
||||
|
@ -194,10 +204,10 @@ async function getLatestTaskState(taskManager: TaskManagerStartContract) {
|
|||
return null;
|
||||
}
|
||||
|
||||
export function registerAlertsUsageCollector(
|
||||
export function registerAlertingUsageCollector(
|
||||
usageCollection: UsageCollectionSetup,
|
||||
taskManager: Promise<TaskManagerStartContract>
|
||||
) {
|
||||
const collector = createAlertsUsageCollector(usageCollection, taskManager);
|
||||
const collector = createAlertingUsageCollector(usageCollection, taskManager);
|
||||
usageCollection.registerCollector(collector);
|
||||
}
|
|
@ -5,4 +5,4 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
export { registerAlertsUsageCollector } from './alerts_usage_collector';
|
||||
export { registerAlertingUsageCollector } from './alerting_usage_collector';
|
||||
|
|
|
@ -17,7 +17,8 @@ import {
|
|||
getTotalCountAggregations,
|
||||
getTotalCountInUse,
|
||||
getExecutionsPerDayCount,
|
||||
} from './alerts_telemetry';
|
||||
getExecutionTimeoutsPerDayCount,
|
||||
} from './alerting_telemetry';
|
||||
|
||||
export const TELEMETRY_TASK_TYPE = 'alerting_telemetry';
|
||||
|
||||
|
@ -92,29 +93,40 @@ export function telemetryTaskRunner(
|
|||
getTotalCountAggregations(esClient, kibanaIndex),
|
||||
getTotalCountInUse(esClient, kibanaIndex),
|
||||
getExecutionsPerDayCount(esClient, eventLogIndex),
|
||||
getExecutionTimeoutsPerDayCount(esClient, eventLogIndex),
|
||||
])
|
||||
.then(([totalCountAggregations, totalInUse, totalExecutions]) => {
|
||||
return {
|
||||
state: {
|
||||
runs: (state.runs || 0) + 1,
|
||||
...totalCountAggregations,
|
||||
count_active_by_type: totalInUse.countByType,
|
||||
count_active_total: totalInUse.countTotal,
|
||||
count_disabled_total: totalCountAggregations.count_total - totalInUse.countTotal,
|
||||
count_rules_namespaces: totalInUse.countNamespaces,
|
||||
count_rules_executions_per_day: totalExecutions.countTotal,
|
||||
count_rules_executions_by_type_per_day: totalExecutions.countByType,
|
||||
count_rules_executions_failured_per_day: totalExecutions.countTotalFailures,
|
||||
count_rules_executions_failured_by_reason_per_day:
|
||||
totalExecutions.countFailuresByReason,
|
||||
count_rules_executions_failured_by_reason_by_type_per_day:
|
||||
totalExecutions.countFailuresByReasonByType,
|
||||
avg_execution_time_per_day: totalExecutions.avgExecutionTime,
|
||||
avg_execution_time_by_type_per_day: totalExecutions.avgExecutionTimeByType,
|
||||
},
|
||||
runAt: getNextMidnight(),
|
||||
};
|
||||
})
|
||||
.then(
|
||||
([
|
||||
totalCountAggregations,
|
||||
totalInUse,
|
||||
dailyExecutionCounts,
|
||||
dailyExecutionTimeoutCounts,
|
||||
]) => {
|
||||
return {
|
||||
state: {
|
||||
runs: (state.runs || 0) + 1,
|
||||
...totalCountAggregations,
|
||||
count_active_by_type: totalInUse.countByType,
|
||||
count_active_total: totalInUse.countTotal,
|
||||
count_disabled_total: totalCountAggregations.count_total - totalInUse.countTotal,
|
||||
count_rules_namespaces: totalInUse.countNamespaces,
|
||||
count_rules_executions_per_day: dailyExecutionCounts.countTotal,
|
||||
count_rules_executions_by_type_per_day: dailyExecutionCounts.countByType,
|
||||
count_rules_executions_failured_per_day: dailyExecutionCounts.countTotalFailures,
|
||||
count_rules_executions_failured_by_reason_per_day:
|
||||
dailyExecutionCounts.countFailuresByReason,
|
||||
count_rules_executions_failured_by_reason_by_type_per_day:
|
||||
dailyExecutionCounts.countFailuresByReasonByType,
|
||||
count_rules_executions_timeouts_per_day: dailyExecutionTimeoutCounts.countTotal,
|
||||
count_rules_executions_timeouts_by_type_per_day:
|
||||
dailyExecutionTimeoutCounts.countByType,
|
||||
avg_execution_time_per_day: dailyExecutionCounts.avgExecutionTime,
|
||||
avg_execution_time_by_type_per_day: dailyExecutionCounts.avgExecutionTimeByType,
|
||||
},
|
||||
runAt: getNextMidnight(),
|
||||
};
|
||||
}
|
||||
)
|
||||
.catch((errMsg) => {
|
||||
logger.warn(`Error executing alerting telemetry task: ${errMsg}`);
|
||||
return {
|
||||
|
|
|
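The telemetry task now also awaits `getExecutionTimeoutsPerDayCount` alongside the existing aggregations, and its results land in two new usage fields (added to `AlertingUsage` in types.ts below). A minimal excerpt of that mapping; the interface name here is illustrative, in the source these are members of `AlertingUsage`:

```ts
// Populated from getExecutionTimeoutsPerDayCount():
//   countTotal  -> count_rules_executions_timeouts_per_day
//   countByType -> count_rules_executions_timeouts_by_type_per_day
interface ExecutionTimeoutUsage {
  count_rules_executions_timeouts_per_day: number;
  count_rules_executions_timeouts_by_type_per_day: Record<string, number>;
}
```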
@ -5,7 +5,7 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
export interface AlertsUsage {
|
||||
export interface AlertingUsage {
|
||||
count_total: number;
|
||||
count_active_total: number;
|
||||
count_disabled_total: number;
|
||||
|
@ -17,6 +17,8 @@ export interface AlertsUsage {
|
|||
count_rules_executions_failured_per_day: number;
|
||||
count_rules_executions_failured_by_reason_per_day: Record<string, number>;
|
||||
count_rules_executions_failured_by_reason_by_type_per_day: Record<string, Record<string, number>>;
|
||||
count_rules_executions_timeouts_per_day: number;
|
||||
count_rules_executions_timeouts_by_type_per_day: Record<string, number>;
|
||||
avg_execution_time_per_day: number;
|
||||
avg_execution_time_by_type_per_day: Record<string, number>;
|
||||
throttle_time: {
|
||||
|
|
|
@ -400,6 +400,24 @@
|
|||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
|
@ -485,6 +503,24 @@
|
|||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
|
@ -576,6 +612,24 @@
|
|||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
|
@ -685,6 +739,24 @@
|
|||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
|
@ -770,6 +842,24 @@
|
|||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
|
@ -855,6 +945,24 @@
|
|||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
|
@ -940,6 +1048,24 @@
|
|||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
|
@ -1025,6 +1151,24 @@
|
|||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
|
@ -1047,6 +1191,112 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"count_rules_executions_timeouts_per_day": {
|
||||
"type": "long"
|
||||
},
|
||||
"count_rules_executions_timeouts_by_type_per_day": {
|
||||
"properties": {
|
||||
"DYNAMIC_KEY": {
|
||||
"type": "long"
|
||||
},
|
||||
"__index-threshold": {
|
||||
"type": "long"
|
||||
},
|
||||
"__es-query": {
|
||||
"type": "long"
|
||||
},
|
||||
"transform_health": {
|
||||
"type": "long"
|
||||
},
|
||||
"apm__error_rate": {
|
||||
"type": "long"
|
||||
},
|
||||
"apm__transaction_error_rate": {
|
||||
"type": "long"
|
||||
},
|
||||
"apm__transaction_duration": {
|
||||
"type": "long"
|
||||
},
|
||||
"apm__transaction_duration_anomaly": {
|
||||
"type": "long"
|
||||
},
|
||||
"metrics__alert__threshold": {
|
||||
"type": "long"
|
||||
},
|
||||
"metrics__alert__inventory__threshold": {
|
||||
"type": "long"
|
||||
},
|
||||
"logs__alert__document__count": {
|
||||
"type": "long"
|
||||
},
|
||||
"monitoring_alert_cluster_health": {
|
||||
"type": "long"
|
||||
},
|
||||
"monitoring_alert_cpu_usage": {
|
||||
"type": "long"
|
||||
},
|
||||
"monitoring_alert_disk_usage": {
|
||||
"type": "long"
|
||||
},
|
||||
"monitoring_alert_elasticsearch_version_mismatch": {
|
||||
"type": "long"
|
||||
},
|
||||
"monitoring_alert_kibana_version_mismatch": {
|
||||
"type": "long"
|
||||
},
|
||||
"monitoring_alert_license_expiration": {
|
||||
"type": "long"
|
||||
},
|
||||
"monitoring_alert_logstash_version_mismatch": {
|
||||
"type": "long"
|
||||
},
|
||||
"monitoring_alert_nodes_changed": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__signals": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__tls": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__durationAnomaly": {
|
||||
"type": "long"
|
||||
},
|
||||
"__geo-containment": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__ml__anomaly_detection_alert": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__ml__anomaly_detection_jobs_health": {
|
||||
"type": "long"
|
||||
}
|
||||
}
|
||||
},
|
||||
"avg_execution_time_per_day": {
|
||||
"type": "long"
|
||||
},
|
||||
|
@ -1115,6 +1365,24 @@
|
|||
"siem__notifications": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__eqlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__indicatorRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__mlRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__queryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__savedQueryRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"siem__thresholdRule": {
|
||||
"type": "long"
|
||||
},
|
||||
"xpack__uptime__alerts__monitorStatus": {
|
||||
"type": "long"
|
||||
},
|
||||
|
|