Add error.grouping_name to group alerts in Error Count rule (#161810)

Resolves https://github.com/elastic/actionable-observability/issues/70

For the APM Error Count rule:
- Added `error.grouping_name` to the index mapping of the AAD index
- Added `error.grouping_name` to the alert document in the AAD index
- Added `errorGroupingName` to the list of action variables

I discussed with @simianhacker whether the alert instance ID containing
spaces/quotes would be a problem with the introduction of
`errorGroupingName`. It appears that using `errorGroupingName` as-is
should not be an issue, so we don't need to modify or hash it.

## Group by dropdown
<img width="604" alt="Screenshot 2023-07-13 at 17 27 44"
src="d9ab1a8d-3272-4c36-8c71-a7163a024249">

## Reason message
<img width="755" alt="Screenshot 2023-07-13 at 17 38 31"
src="dbe4a86b-812b-4068-abea-4b96fa5fb38b">

## Index mapping
<img width="1514" alt="Screenshot 2023-07-13 at 17 40 32"
src="f1e48045-a7a8-4044-bc33-f4d34dc1c8cc">

## Alert document
<img width="681" alt="Screenshot 2023-07-13 at 17 39 46"
src="985cf003-ac32-4c7e-9f2a-5bda033c194b">

## Action variable
<img width="612" alt="Screenshot 2023-07-13 at 17 43 13"
src="2edfb388-f99d-4cae-98ef-3e9b275bb848">

## Alert notification
<img width="650" alt="Screenshot 2023-07-13 at 17 41 37"
src="c057a3a1-dc6e-4fee-97ad-5790ab3c531b">
This commit is contained in:
Bena Kansara 2023-07-14 18:34:21 +02:00 committed by GitHub
parent 8bb85ae594
commit 73ce87a0a9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 153 additions and 4 deletions

View file

@ -75,6 +75,7 @@ const ObservabilityApmAlertOptional = rt.partial({
}),
error: rt.partial({
grouping_key: schemaString,
grouping_name: schemaString,
}),
kibana: rt.partial({
alert: rt.partial({

View file

@ -82,6 +82,8 @@ Array [
exports[`Error ERROR_GROUP_ID 1`] = `"grouping key"`;
exports[`Error ERROR_GROUP_NAME 1`] = `undefined`;
exports[`Error ERROR_ID 1`] = `"error id"`;
exports[`Error ERROR_LOG_LEVEL 1`] = `undefined`;
@ -411,6 +413,8 @@ exports[`Span ERROR_EXCEPTION 1`] = `undefined`;
exports[`Span ERROR_GROUP_ID 1`] = `undefined`;
exports[`Span ERROR_GROUP_NAME 1`] = `undefined`;
exports[`Span ERROR_ID 1`] = `undefined`;
exports[`Span ERROR_LOG_LEVEL 1`] = `undefined`;
@ -736,6 +740,8 @@ exports[`Transaction ERROR_EXCEPTION 1`] = `undefined`;
exports[`Transaction ERROR_GROUP_ID 1`] = `undefined`;
exports[`Transaction ERROR_GROUP_NAME 1`] = `undefined`;
exports[`Transaction ERROR_ID 1`] = `undefined`;
exports[`Transaction ERROR_LOG_LEVEL 1`] = `undefined`;

View file

@ -101,6 +101,7 @@ export const PARENT_ID = 'parent.id';
export const ERROR_ID = 'error.id';
export const ERROR_GROUP_ID = 'error.grouping_key';
export const ERROR_GROUP_NAME = 'error.grouping_name';
export const ERROR_CULPRIT = 'error.culprit';
export const ERROR_LOG_LEVEL = 'error.log.level';
export const ERROR_LOG_MESSAGE = 'error.log.message';

View file

@ -18,6 +18,7 @@ import { ML_ANOMALY_SEVERITY } from '@kbn/ml-anomaly-utils/anomaly_severity';
import { ML_ANOMALY_THRESHOLD } from '@kbn/ml-anomaly-utils/anomaly_threshold';
import {
ERROR_GROUP_ID,
ERROR_GROUP_NAME,
SERVICE_ENVIRONMENT,
SERVICE_NAME,
TRANSACTION_NAME,
@ -61,6 +62,8 @@ const getFieldNameLabel = (field: string): string => {
return 'name';
case ERROR_GROUP_ID:
return 'error key';
case ERROR_GROUP_NAME:
return 'error name';
default:
return field;
}

View file

@ -39,6 +39,7 @@ import {
SERVICE_NAME,
TRANSACTION_NAME,
ERROR_GROUP_ID,
ERROR_GROUP_NAME,
} from '../../../../../common/es_fields/apm';
import {
ErrorState,
@ -218,7 +219,7 @@ export function ErrorCountRuleType(props: Props) {
<APMRuleGroupBy
onChange={onGroupByChange}
options={{ groupBy: ruleParams.groupBy }}
fields={[TRANSACTION_NAME, ERROR_GROUP_ID]}
fields={[TRANSACTION_NAME, ERROR_GROUP_ID, ERROR_GROUP_NAME]}
preSelectedOptions={[SERVICE_NAME, SERVICE_ENVIRONMENT]}
/>
</EuiFormRow>

View file

@ -102,4 +102,13 @@ export const apmActionVariables = {
),
name: 'errorGroupingKey' as const,
},
errorGroupingName: {
description: i18n.translate(
'xpack.apm.alerts.action_variables.errorGroupingName',
{
defaultMessage: 'The error grouping name the alert is created for',
}
),
name: 'errorGroupingName' as const,
},
};

View file

@ -20,6 +20,7 @@ import { legacyExperimentalFieldMap } from '@kbn/alerts-as-data-utils';
import {
AGENT_NAME,
ERROR_GROUP_ID,
ERROR_GROUP_NAME,
PROCESSOR_EVENT,
SERVICE_ENVIRONMENT,
SERVICE_LANGUAGE_NAME,
@ -57,6 +58,10 @@ export const apmRuleTypeAlertFieldMap = {
type: 'keyword',
required: false,
},
[ERROR_GROUP_NAME]: {
type: 'keyword',
required: false,
},
[PROCESSOR_EVENT]: {
type: 'keyword',
required: false,

View file

@ -584,4 +584,117 @@ describe('Error count alert', () => {
alertDetailsUrl: 'mockedAlertsLocator > getLocation',
});
});
// Verifies that grouping the Error Count rule by both `error.grouping_key` and
// `error.grouping_name` creates one alert per group and that each scheduled
// action receives `errorGroupingKey` and `errorGroupingName` in its context.
it('sends alert when rule is configured with group by on error.grouping_key and error.grouping_name', async () => {
const { services, dependencies, executor, scheduleActions } =
createRuleTypeMocks();
registerErrorCountRuleType(dependencies);
// Rule parameters: threshold of 2 over a 5 minute window, grouped by four fields.
const params = {
threshold: 2,
windowSize: 5,
windowUnit: 'm',
groupBy: [
'service.name',
'service.environment',
'error.grouping_key',
'error.grouping_name',
],
};
// Mocked search response: each bucket key lists one value per `groupBy` field,
// in the same order as `params.groupBy`.
services.scopedClusterClient.asCurrentUser.search.mockResponse({
hits: {
hits: [],
total: {
relation: 'eq',
value: 2,
},
},
aggregations: {
error_counts: {
buckets: [
{
key: ['foo', 'env-foo', 'error-key-foo', 'error-name-foo'],
doc_count: 5,
},
{
key: ['foo', 'env-foo-2', 'error-key-foo-2', 'error-name-foo2'],
doc_count: 4,
},
{
key: ['bar', 'env-bar', 'error-key-bar', 'error-name-bar'],
doc_count: 3,
},
// doc_count of 1 does not exceed the threshold of 2; the assertions below
// expect only three scheduled actions, so no alert is expected for this bucket.
{
key: ['bar', 'env-bar-2', 'error-key-bar-2', 'error-name-bar2'],
doc_count: 1,
},
],
},
},
took: 0,
timed_out: false,
_shards: {
failed: 0,
skipped: 0,
successful: 1,
total: 1,
},
});
await executor({ params });
// Expected alert instance IDs match the bucket key parts joined with '_'.
[
'foo_env-foo_error-key-foo_error-name-foo',
'foo_env-foo-2_error-key-foo-2_error-name-foo2',
'bar_env-bar_error-key-bar_error-name-bar',
].forEach((instanceName) =>
expect(services.alertFactory.create).toHaveBeenCalledWith(instanceName)
);
// One scheduled action per bucket whose count is above the threshold.
expect(scheduleActions).toHaveBeenCalledTimes(3);
// Context for the first bucket (doc_count 5); the reason message includes the
// "error key" and "error name" labels.
expect(scheduleActions).toHaveBeenCalledWith('threshold_met', {
serviceName: 'foo',
environment: 'env-foo',
threshold: 2,
triggerValue: 5,
reason:
'Error count is 5 in the last 5 mins for service: foo, env: env-foo, error key: error-key-foo, error name: error-name-foo. Alert when > 2.',
interval: '5 mins',
viewInAppUrl:
'http://localhost:5601/eyr/app/apm/services/foo/errors?environment=env-foo',
errorGroupingKey: 'error-key-foo',
errorGroupingName: 'error-name-foo',
alertDetailsUrl: 'mockedAlertsLocator > getLocation',
});
// Context for the second bucket (doc_count 4).
expect(scheduleActions).toHaveBeenCalledWith('threshold_met', {
serviceName: 'foo',
environment: 'env-foo-2',
threshold: 2,
triggerValue: 4,
reason:
'Error count is 4 in the last 5 mins for service: foo, env: env-foo-2, error key: error-key-foo-2, error name: error-name-foo2. Alert when > 2.',
interval: '5 mins',
viewInAppUrl:
'http://localhost:5601/eyr/app/apm/services/foo/errors?environment=env-foo-2',
errorGroupingKey: 'error-key-foo-2',
errorGroupingName: 'error-name-foo2',
alertDetailsUrl: 'mockedAlertsLocator > getLocation',
});
// Context for the third bucket (doc_count 3).
expect(scheduleActions).toHaveBeenCalledWith('threshold_met', {
serviceName: 'bar',
environment: 'env-bar',
reason:
'Error count is 3 in the last 5 mins for service: bar, env: env-bar, error key: error-key-bar, error name: error-name-bar. Alert when > 2.',
threshold: 2,
triggerValue: 3,
interval: '5 mins',
viewInAppUrl:
'http://localhost:5601/eyr/app/apm/services/bar/errors?environment=env-bar',
errorGroupingKey: 'error-key-bar',
errorGroupingName: 'error-name-bar',
alertDetailsUrl: 'mockedAlertsLocator > getLocation',
});
});
});

View file

@ -83,6 +83,7 @@ export function registerErrorCountRuleType({
apmActionVariables.serviceName,
apmActionVariables.transactionName,
apmActionVariables.errorGroupingKey,
apmActionVariables.errorGroupingName,
apmActionVariables.threshold,
apmActionVariables.triggerValue,
apmActionVariables.viewInAppUrl,

View file

@ -15,11 +15,13 @@ describe('getGroupByActionVariables', () => {
'transaction.type': 'request',
'transaction.name': 'tx-java',
'error.grouping_key': 'error-key-0',
'error.grouping_name': 'error-name-0',
});
expect(result).toMatchInlineSnapshot(`
Object {
"environment": "development",
"errorGroupingKey": "error-key-0",
"errorGroupingName": "error-name-0",
"serviceName": "opbeans-java",
"transactionName": "tx-java",
"transactionType": "request",

View file

@ -8,6 +8,7 @@
import { getFieldValueLabel } from '../../../../../common/rules/apm_rule_types';
import {
ERROR_GROUP_ID,
ERROR_GROUP_NAME,
SERVICE_ENVIRONMENT,
SERVICE_NAME,
TRANSACTION_NAME,
@ -26,6 +27,8 @@ const renameActionVariable = (field: string): string => {
return 'transactionName';
case ERROR_GROUP_ID:
return 'errorGroupingKey';
case ERROR_GROUP_NAME:
return 'errorGroupingName';
default:
return field;
}

View file

@ -109,6 +109,7 @@ export default function ApiTest({ getService }: FtrProviderContext) {
'service.environment',
'transaction.name',
'error.grouping_key',
'error.grouping_name',
],
},
actions: [
@ -120,7 +121,8 @@ export default function ApiTest({ getService }: FtrProviderContext) {
{
message: `${errorCountMessage}
- Transaction name: {{context.transactionName}}
- Error grouping key: {{context.errorGroupingKey}}`,
- Error grouping key: {{context.errorGroupingKey}}
- Error grouping name: {{context.errorGroupingName}}`,
},
],
},
@ -158,6 +160,7 @@ export default function ApiTest({ getService }: FtrProviderContext) {
expect(resp.hits.hits[0]._source).property('service.environment', 'production');
expect(resp.hits.hits[0]._source).property('transaction.name', 'tx-java');
expect(resp.hits.hits[0]._source).property('error.grouping_key', errorGroupingKey);
expect(resp.hits.hits[0]._source).property('error.grouping_name', errorMessage);
});
it('returns correct message', async () => {
@ -168,7 +171,7 @@ export default function ApiTest({ getService }: FtrProviderContext) {
});
expect(resp.hits.hits[0]._source?.message).eql(
`Error count is 15 in the last 1 hr for service: opbeans-java, env: production, name: tx-java, error key: ${errorGroupingKey}. Alert when > 1.
`Error count is 15 in the last 1 hr for service: opbeans-java, env: production, name: tx-java, error key: ${errorGroupingKey}, error name: ${errorMessage}. Alert when > 1.
Apm error count is active with the following conditions:
@ -180,7 +183,8 @@ Apm error count is active with the following conditions:
[View alert details](http://mockedpublicbaseurl/app/observability/alerts?_a=(kuery:%27kibana.alert.uuid:%20%22${alertId}%22%27%2CrangeFrom:%27${rangeFrom}%27%2CrangeTo:now%2Cstatus:all))
- Transaction name: tx-java
- Error grouping key: ${errorGroupingKey}`
- Error grouping key: ${errorGroupingKey}
- Error grouping name: ${errorMessage}`
);
});