mirror of
https://github.com/elastic/kibana.git
synced 2025-04-23 17:28:26 -04:00
[Observability] [Rules] Enable recovery context for APM Anomaly rule (#213252)
## Summary

Fixes #212014 by adding a recovery context to the APM Anomaly rule.

### Checklist

- [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios

## How to test the PR

- Run Kibana locally and use oblt-edge.
- From the Service Inventory, create an anomaly detection job for the `elastic-co-frontend` and `packetbeat` services.
- Create an APM Anomaly rule for all services with severity warning.
- Add a Log action to the rule and configure it to run when the alert is recovered.

<img width="1188" alt="Screenshot 2025-03-10 at 12 45 36" src="https://github.com/user-attachments/assets/4b356981-f5dc-4493-ac7a-061b0a1a98f8" />

- Once the rule fires an alert, edit the rule and change the severity level to something else to recover the alert.
- Watch the logs where Kibana is running; you should see the recovery message printed with all the variable values.
This commit is contained in:
parent
df78cbbedd
commit
cf06906ab1
3 changed files with 170 additions and 0 deletions
|
@ -12,6 +12,7 @@ import type { ObservabilityRuleTypeRegistry } from '@kbn/observability-plugin/pu
|
|||
import { getAlertUrlErrorCount, getAlertUrlTransaction } from '../../../../common/utils/formatters';
|
||||
import {
|
||||
anomalyMessage,
|
||||
anomalyRecoveryMessage,
|
||||
errorCountMessage,
|
||||
errorCountRecoveryMessage,
|
||||
transactionDurationMessage,
|
||||
|
@ -140,6 +141,7 @@ export function registerApmRuleTypes(observabilityRuleTypeRegistry: Observabilit
|
|||
validate: validateAnomalyRule,
|
||||
requiresAppContext: false,
|
||||
defaultActionMessage: anomalyMessage,
|
||||
defaultRecoveryMessage: anomalyRecoveryMessage,
|
||||
priority: 90,
|
||||
});
|
||||
}
|
||||
|
|
|
@ -244,3 +244,130 @@ describe('Transaction duration anomaly alert', () => {
|
|||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('recovered alerts', () => {
  // Reason text stored on the recovered alert document; the rule is expected
  // to copy it verbatim into the recovery context.
  const recoveredReason =
    'minor throughput anomaly with a score of 42.801142973792565, was detected in the last 30 mins for packetbeat.';

  // Alert-as-data document that the alerts client hands back for the recovered alert.
  const recoveredHit = {
    'service.name': 'packetbeat',
    'service.environment': 'production',
    'transaction.type': 'output',
    'processor.event': 'transaction',
    'kibana.alert.severity': 'minor',
    'kibana.alert.evaluation.value': 42.801142973792565,
    'kibana.alert.evaluation.threshold': 3,
    'kibana.alert.reason': recoveredReason,
    'agent.name': 'go',
    labels: { worker: 'netclient' },
    'service.language.name': 'go',
    'kibana.alert.rule.category': 'APM Anomaly',
    'kibana.alert.rule.consumer': 'alerts',
    'kibana.alert.rule.execution.uuid': '46b2b08d-3373-48c1-9b93-00026b882042',
    'kibana.alert.rule.name': 'APM Anomaly rule',
    'kibana.alert.rule.parameters': {
      windowSize: 30,
      windowUnit: 'm',
      anomalySeverityType: 'warning',
      anomalyDetectorTypes: ['txLatency', 'txThroughput', 'txFailureRate'],
      environment: 'ENVIRONMENT_ALL',
    },
    'kibana.alert.rule.producer': 'apm',
    'kibana.alert.rule.revision': 12,
    'kibana.alert.rule.rule_type_id': 'apm.anomaly',
    'kibana.alert.rule.tags': [],
    'kibana.alert.rule.uuid': 'e3aa20a8-25cb-49b8-94c8-3d930bde1219',
    'kibana.space_ids': ['default'],
    '@timestamp': '2025-03-05T14:37:29.661Z',
    'event.action': 'active',
    'event.kind': 'signal',
    'kibana.alert.rule.execution.timestamp': '2025-03-05T14:37:29.661Z',
    'kibana.alert.action_group': 'threshold_met',
    'kibana.alert.flapping': false,
    'kibana.alert.flapping_history': [],
    'kibana.alert.instance.id': 'packetbeat_output_apm-production-3c88-apm_tx_metrics_1',
    'kibana.alert.maintenance_window_ids': [],
    'kibana.alert.consecutive_matches': 3,
    'kibana.alert.status': 'active',
    'kibana.alert.uuid': 'fe7fbfe4-4b26-4264-b0e7-28e69ce21376',
    'kibana.alert.workflow_status': 'open',
    'kibana.alert.duration.us': 120043000,
    'kibana.alert.start': '2025-03-05T14:35:29.618Z',
    'kibana.alert.time_range': { gte: '2025-03-05T14:35:29.618Z' },
    'kibana.version': '9.1.0',
    tags: [],
    'kibana.alert.previous_action_group': 'threshold_met',
  };

  it('should returns the recovered alerts', async () => {
    // Two ML jobs, one per environment.
    const mlJobs = [
      { jobId: '1', environment: 'development' },
      { jobId: '2', environment: 'production' },
    ] as unknown as ApmMlJob[];
    jest.spyOn(GetServiceAnomalies, 'getMLJobs').mockResolvedValue(mlJobs);

    const { services, dependencies, executor } = createRuleTypeMocks();

    services.alertsClient.report.mockReturnValue({ uuid: 'test-uuid' });

    // Single recovered alert: a stubbed alert handle plus its stored document.
    const recoveredAlert = {
      alert: {
        getId: jest.fn(() => 'test-id'),
        getUuid: jest.fn(() => 'test-uuid'),
        scheduledExecutionOptions: undefined,
        meta: {},
        state: {},
        context: {},
        id: 'synthtrace-high-cardinality-0_Synthtrace: many_errors_request',
        alertAsData: undefined,
      },
      hit: recoveredHit,
    };
    services.alertsClient.getRecoveredAlerts.mockReturnValue([recoveredAlert]);

    // The anomaly search returns no buckets, so this run reports no active anomalies.
    const emptyAnomalySearchResponse = {
      aggregations: {
        anomaly_groups: {
          buckets: [],
        },
      },
    };
    const mlPluginStub = {
      mlSystemProvider: () => ({
        mlAnomalySearch: () => emptyAnomalySearchResponse,
      }),
      anomalyDetectorsProvider: jest.fn(),
    } as unknown as MlPluginSetup;

    registerAnomalyRuleType({
      ...dependencies,
      ml: mlPluginStub,
    });

    const ruleParams = {
      anomalySeverityType: ML_ANOMALY_SEVERITY.MINOR,
      anomalyDetectorTypes: [AnomalyDetectorType.txLatency],
      windowSize: 5,
      windowUnit: 'm',
    };

    await executor({ params: ruleParams });

    // The recovery context must be populated from the recovered alert's document.
    const expectedRecoveryContext = {
      alertDetailsUrl: 'http://localhost:5601/eyr/app/observability/alerts/test-uuid',
      environment: 'production',
      reason: recoveredReason,
      serviceName: 'packetbeat',
      threshold: 'minor',
      transactionType: 'output',
      triggerValue: 'minor',
      viewInAppUrl:
        'http://localhost:5601/eyr/app/apm/services/packetbeat?transactionType=output&environment=production',
    };
    expect(services.alertsClient.setAlertData).toHaveBeenCalledWith({
      context: expectedRecoveryContext,
      id: 'test-id',
    });
  });
});
|
||||
|
|
|
@ -51,6 +51,7 @@ import {
|
|||
import type {
|
||||
THRESHOLD_MET_GROUP,
|
||||
ApmRuleParamsType,
|
||||
AdditionalContext,
|
||||
} from '../../../../../common/rules/apm_rule_types';
|
||||
import {
|
||||
ANOMALY_ALERT_SEVERITY_TYPES,
|
||||
|
@ -96,6 +97,7 @@ export function registerAnomalyRuleType({
|
|||
id: ApmRuleType.Anomaly,
|
||||
name: ruleTypeConfig.name,
|
||||
actionGroups: ruleTypeConfig.actionGroups,
|
||||
doesSetRecoveryContext: true,
|
||||
defaultActionGroupId: ruleTypeConfig.defaultActionGroupId,
|
||||
validate: { params: anomalyParamsSchema },
|
||||
schemas: {
|
||||
|
@ -352,6 +354,45 @@ export function registerAnomalyRuleType({
|
|||
context,
|
||||
});
|
||||
});
|
||||
// Handle recovered alerts context
|
||||
const recoveredAlerts = alertsClient.getRecoveredAlerts() ?? [];
|
||||
for (const recoveredAlert of recoveredAlerts) {
|
||||
const alertHits = recoveredAlert.hit as AdditionalContext;
|
||||
const recoveredAlertId = recoveredAlert.alert.getId();
|
||||
const alertUuid = recoveredAlert.alert.getUuid();
|
||||
const alertDetailsUrl = await getAlertDetailsUrl(basePath, spaceId, alertUuid);
|
||||
|
||||
const environment = alertHits?.[SERVICE_ENVIRONMENT];
|
||||
const serviceName = alertHits?.[SERVICE_NAME];
|
||||
const transactionType = alertHits?.[TRANSACTION_TYPE];
|
||||
const severityLevel = alertHits?.[ALERT_SEVERITY];
|
||||
const reasonMessage = alertHits?.[ALERT_REASON];
|
||||
|
||||
const relativeViewInAppUrl = getAlertUrlTransaction(
|
||||
serviceName,
|
||||
environment,
|
||||
transactionType
|
||||
);
|
||||
const viewInAppUrl = addSpaceIdToPath(
|
||||
basePath.publicBaseUrl,
|
||||
spaceId,
|
||||
relativeViewInAppUrl
|
||||
);
|
||||
const recoveredContext = {
|
||||
alertDetailsUrl,
|
||||
environment: getEnvironmentLabel(environment),
|
||||
reason: reasonMessage,
|
||||
serviceName,
|
||||
threshold: selectedOption?.label,
|
||||
transactionType,
|
||||
triggerValue: severityLevel,
|
||||
viewInAppUrl,
|
||||
};
|
||||
alertsClient.setAlertData({
|
||||
id: recoveredAlertId,
|
||||
context: recoveredContext,
|
||||
});
|
||||
}
|
||||
|
||||
return { state: {} };
|
||||
},
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue