[Observability] [Rules] Enable recovery context for APM Anomaly rule (#213252)

## Summary

Fixes #212014 by adding a recovery context to the APM Anomaly rule, so that recovery action messages can reference the alert's variables.

### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios

## How to test the PR:

- Run Kibana locally and use oblt-edge
- From the Service Inventory, create an anomaly detection job for
`elastic-co-frontend` and `packetbeat` services
- Create an APM Anomaly rule for all services with the severity set to warning
- Add an action that uses the Log connector and configure it to run when the alert is recovered
<img width="1188" alt="Screenshot 2025-03-10 at 12 45 36"
src="https://github.com/user-attachments/assets/4b356981-f5dc-4493-ac7a-061b0a1a98f8"
/>

- Once the rule fires an alert, edit the rule and change the severity
level to something else to recover the alert.

- Watch the logs where Kibana is running; you should see the recovery
message printed with all the variable values.
This commit is contained in:
Faisal Kanout 2025-03-10 18:32:30 +03:00 committed by GitHub
parent df78cbbedd
commit cf06906ab1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 170 additions and 0 deletions

View file

@ -12,6 +12,7 @@ import type { ObservabilityRuleTypeRegistry } from '@kbn/observability-plugin/pu
import { getAlertUrlErrorCount, getAlertUrlTransaction } from '../../../../common/utils/formatters';
import {
anomalyMessage,
anomalyRecoveryMessage,
errorCountMessage,
errorCountRecoveryMessage,
transactionDurationMessage,
@ -140,6 +141,7 @@ export function registerApmRuleTypes(observabilityRuleTypeRegistry: Observabilit
validate: validateAnomalyRule,
requiresAppContext: false,
defaultActionMessage: anomalyMessage,
defaultRecoveryMessage: anomalyRecoveryMessage,
priority: 90,
});
}

View file

@ -244,3 +244,130 @@ describe('Transaction duration anomaly alert', () => {
});
});
});
// Verifies that the anomaly rule executor attaches a recovery context to
// alerts that the alerts client reports as recovered.
describe('recovered alerts', () => {
  it('should returns the recovered alerts', async () => {
    // One ML job per environment, returned by the stubbed job lookup.
    const mlJobs = [
      { jobId: '1', environment: 'development' },
      { jobId: '2', environment: 'production' },
    ] as unknown as ApmMlJob[];
    jest.spyOn(GetServiceAnomalies, 'getMLJobs').mockReturnValue(Promise.resolve(mlJobs));

    // Reason stored on the recovered alert document; the executor is expected
    // to surface it unchanged in the recovery context.
    const storedReason =
      'minor throughput anomaly with a score of 42.801142973792565, was detected in the last 30 mins for packetbeat.';

    // The anomaly search finds nothing, so no alert is active in this run.
    const searchWithoutAnomalies = () => ({
      aggregations: {
        anomaly_groups: {
          buckets: [],
        },
      },
    });
    const ml = {
      mlSystemProvider: () => ({
        mlAnomalySearch: searchWithoutAnomalies,
      }),
      anomalyDetectorsProvider: jest.fn(),
    } as unknown as MlPluginSetup;

    const { services, dependencies, executor } = createRuleTypeMocks();
    services.alertsClient.report.mockReturnValue({ uuid: 'test-uuid' });
    // A single previously-active alert is reported as recovered.
    services.alertsClient.getRecoveredAlerts.mockReturnValue([
      {
        alert: {
          getId: jest.fn().mockReturnValue('test-id'),
          getUuid: jest.fn().mockReturnValue('test-uuid'),
          scheduledExecutionOptions: undefined,
          meta: {},
          state: {},
          context: {},
          id: 'synthtrace-high-cardinality-0_Synthtrace: many_errors_request',
          alertAsData: undefined,
        },
        hit: {
          'service.name': 'packetbeat',
          'service.environment': 'production',
          'transaction.type': 'output',
          'processor.event': 'transaction',
          'kibana.alert.severity': 'minor',
          'kibana.alert.evaluation.value': 42.801142973792565,
          'kibana.alert.evaluation.threshold': 3,
          'kibana.alert.reason': storedReason,
          'agent.name': 'go',
          labels: { worker: 'netclient' },
          'service.language.name': 'go',
          'kibana.alert.rule.category': 'APM Anomaly',
          'kibana.alert.rule.consumer': 'alerts',
          'kibana.alert.rule.execution.uuid': '46b2b08d-3373-48c1-9b93-00026b882042',
          'kibana.alert.rule.name': 'APM Anomaly rule',
          'kibana.alert.rule.parameters': {
            windowSize: 30,
            windowUnit: 'm',
            anomalySeverityType: 'warning',
            anomalyDetectorTypes: ['txLatency', 'txThroughput', 'txFailureRate'],
            environment: 'ENVIRONMENT_ALL',
          },
          'kibana.alert.rule.producer': 'apm',
          'kibana.alert.rule.revision': 12,
          'kibana.alert.rule.rule_type_id': 'apm.anomaly',
          'kibana.alert.rule.tags': [],
          'kibana.alert.rule.uuid': 'e3aa20a8-25cb-49b8-94c8-3d930bde1219',
          'kibana.space_ids': ['default'],
          '@timestamp': '2025-03-05T14:37:29.661Z',
          'event.action': 'active',
          'event.kind': 'signal',
          'kibana.alert.rule.execution.timestamp': '2025-03-05T14:37:29.661Z',
          'kibana.alert.action_group': 'threshold_met',
          'kibana.alert.flapping': false,
          'kibana.alert.flapping_history': [],
          'kibana.alert.instance.id': 'packetbeat_output_apm-production-3c88-apm_tx_metrics_1',
          'kibana.alert.maintenance_window_ids': [],
          'kibana.alert.consecutive_matches': 3,
          'kibana.alert.status': 'active',
          'kibana.alert.uuid': 'fe7fbfe4-4b26-4264-b0e7-28e69ce21376',
          'kibana.alert.workflow_status': 'open',
          'kibana.alert.duration.us': 120043000,
          'kibana.alert.start': '2025-03-05T14:35:29.618Z',
          'kibana.alert.time_range': { gte: '2025-03-05T14:35:29.618Z' },
          'kibana.version': '9.1.0',
          tags: [],
          'kibana.alert.previous_action_group': 'threshold_met',
        },
      },
    ]);

    registerAnomalyRuleType({
      ...dependencies,
      ml,
    });

    const ruleParams = {
      anomalySeverityType: ML_ANOMALY_SEVERITY.MINOR,
      anomalyDetectorTypes: [AnomalyDetectorType.txLatency],
      windowSize: 5,
      windowUnit: 'm',
    };
    await executor({ params: ruleParams });

    // Context the executor should derive from the recovered alert's document.
    const expectedRecoveryContext = {
      alertDetailsUrl: 'http://localhost:5601/eyr/app/observability/alerts/test-uuid',
      environment: 'production',
      reason: storedReason,
      serviceName: 'packetbeat',
      threshold: 'minor',
      transactionType: 'output',
      triggerValue: 'minor',
      viewInAppUrl:
        'http://localhost:5601/eyr/app/apm/services/packetbeat?transactionType=output&environment=production',
    };
    expect(services.alertsClient.setAlertData).toHaveBeenCalledWith({
      context: expectedRecoveryContext,
      id: 'test-id',
    });
  });
});

View file

@ -51,6 +51,7 @@ import {
import type {
THRESHOLD_MET_GROUP,
ApmRuleParamsType,
AdditionalContext,
} from '../../../../../common/rules/apm_rule_types';
import {
ANOMALY_ALERT_SEVERITY_TYPES,
@ -96,6 +97,7 @@ export function registerAnomalyRuleType({
id: ApmRuleType.Anomaly,
name: ruleTypeConfig.name,
actionGroups: ruleTypeConfig.actionGroups,
doesSetRecoveryContext: true,
defaultActionGroupId: ruleTypeConfig.defaultActionGroupId,
validate: { params: anomalyParamsSchema },
schemas: {
@ -352,6 +354,45 @@ export function registerAnomalyRuleType({
context,
});
});
// Handle recovered alerts context.
// For each alert the alerts client reports as recovered, rebuild an action
// context from the fields stored on the alert document (`hit`) and attach it
// to the alert with `setAlertData`.
const recoveredAlerts = alertsClient.getRecoveredAlerts() ?? [];
for (const recoveredAlert of recoveredAlerts) {
// The alert document is viewed as AdditionalContext so its fields can be read by key.
const alertHits = recoveredAlert.hit as AdditionalContext;
const recoveredAlertId = recoveredAlert.alert.getId();
const alertUuid = recoveredAlert.alert.getUuid();
const alertDetailsUrl = await getAlertDetailsUrl(basePath, spaceId, alertUuid);
// Optional chaining: each lookup yields undefined when `hit` is absent.
const environment = alertHits?.[SERVICE_ENVIRONMENT];
const serviceName = alertHits?.[SERVICE_NAME];
const transactionType = alertHits?.[TRANSACTION_TYPE];
const severityLevel = alertHits?.[ALERT_SEVERITY];
const reasonMessage = alertHits?.[ALERT_REASON];
// Relative in-app link for the service/environment/transaction type read above.
const relativeViewInAppUrl = getAlertUrlTransaction(
serviceName,
environment,
transactionType
);
// Combine the public base URL, the space id and the relative link.
const viewInAppUrl = addSpaceIdToPath(
basePath.publicBaseUrl,
spaceId,
relativeViewInAppUrl
);
const recoveredContext = {
alertDetailsUrl,
environment: getEnvironmentLabel(environment),
reason: reasonMessage,
serviceName,
// NOTE(review): `selectedOption` is defined outside this excerpt — presumably the
// severity option chosen by the rule's current params rather than a value stored
// on the alert document; confirm.
threshold: selectedOption?.label,
transactionType,
// The severity stored on the alert document is reported as the trigger value.
triggerValue: severityLevel,
viewInAppUrl,
};
// Attach the context to the recovered alert, identified by its alert id.
alertsClient.setAlertData({
id: recoveredAlertId,
context: recoveredContext,
});
}
return { state: {} };
},