feat(slo): Avoid false positive burn rate alerting with partial rolled-up data (#203279)

This commit is contained in:
Kevin Delemme 2025-01-07 15:21:22 -05:00 committed by GitHub
parent b30210929b
commit 0e13d86fc7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 167 additions and 101 deletions

View file

@ -5,7 +5,7 @@
* 2.0.
*/
import { SanitizedRuleConfig } from '@kbn/alerting-plugin/common';
import { Rule, SanitizedRuleConfig } from '@kbn/alerting-plugin/common';
import { DEFAULT_FLAPPING_SETTINGS } from '@kbn/alerting-plugin/common/rules_settings';
import { RuleExecutorServices } from '@kbn/alerting-plugin/server';
import { publicAlertsClientMock } from '@kbn/alerting-plugin/server/alerts_client/alerts_client.mock';
@ -25,7 +25,13 @@ import {
import { ISearchStartSearchSource } from '@kbn/data-plugin/public';
import { dataViewPluginMocks } from '@kbn/data-views-plugin/public/mocks';
import { MockedLogger } from '@kbn/logging-mocks';
import { Rule } from '@kbn/alerting-plugin/common';
import {
ALERT_EVALUATION_THRESHOLD,
ALERT_EVALUATION_VALUE,
ALERT_GROUP,
ALERT_REASON,
SLO_BURN_RATE_RULE_TYPE_ID,
} from '@kbn/rule-registry-plugin/common/technical_rule_data_field_names';
import { SharePluginStart } from '@kbn/share-plugin/server';
import { sloDefinitionSchema } from '@kbn/slo-schema';
import { get } from 'lodash';
@ -41,25 +47,18 @@ import {
SLO_INSTANCE_ID_FIELD,
SLO_REVISION_FIELD,
} from '../../../../common/field_names/slo';
import {
ALERT_EVALUATION_THRESHOLD,
ALERT_EVALUATION_VALUE,
ALERT_GROUP,
ALERT_REASON,
SLO_BURN_RATE_RULE_TYPE_ID,
} from '@kbn/rule-registry-plugin/common/technical_rule_data_field_names';
import { SLODefinition, StoredSLODefinition } from '../../../domain/models';
import { SLONotFound } from '../../../errors';
import { SO_SLO_TYPE } from '../../../saved_objects';
import { createSLO } from '../../../services/fixtures/slo';
import { BurnRateAlert, getRuleExecutor } from './executor';
import {
LONG_WINDOW,
SHORT_WINDOW,
generateAboveThresholdKey,
generateBurnRateKey,
generateStatsKey,
generateWindowId,
LONG_WINDOW,
SHORT_WINDOW,
} from './lib/build_query';
import { EvaluationBucket } from './lib/evaluate';
import {
@ -188,7 +187,7 @@ describe('BurnRateRuleExecutor', () => {
describe('multi-window', () => {
it('throws when the slo is not found', async () => {
soClientMock.find.mockRejectedValue(new SLONotFound('SLO [non-existent] not found'));
const executor = getRuleExecutor({ basePath: basePathMock });
const executor = getRuleExecutor(basePathMock);
await expect(
executor({
@ -212,7 +211,7 @@ describe('BurnRateRuleExecutor', () => {
it('returns early when the slo is disabled', async () => {
const slo = createSLO({ objective: { target: 0.9 }, enabled: false });
soClientMock.find.mockResolvedValueOnce(createFindResponse([slo]));
const executor = getRuleExecutor({ basePath: basePathMock });
const executor = getRuleExecutor(basePathMock);
const result = await executor({
params: someRuleParamsWithWindows({ sloId: slo.id }),
@ -264,7 +263,7 @@ describe('BurnRateRuleExecutor', () => {
generateEsResponse(ruleParams, [], { instanceId: 'bar' })
);
const executor = getRuleExecutor({ basePath: basePathMock });
const executor = getRuleExecutor(basePathMock);
await executor({
params: ruleParams,
startedAt: new Date(),
@ -312,7 +311,7 @@ describe('BurnRateRuleExecutor', () => {
generateEsResponse(ruleParams, [], { instanceId: 'bar' })
);
const executor = getRuleExecutor({ basePath: basePathMock });
const executor = getRuleExecutor(basePathMock);
await executor({
params: ruleParams,
startedAt: new Date(),
@ -369,9 +368,7 @@ describe('BurnRateRuleExecutor', () => {
start: new Date().toISOString(),
}));
const executor = getRuleExecutor({
basePath: basePathMock,
});
const executor = getRuleExecutor(basePathMock);
await executor({
params: ruleParams,
@ -519,9 +516,7 @@ describe('BurnRateRuleExecutor', () => {
start: new Date().toISOString(),
}));
const executor = getRuleExecutor({
basePath: basePathMock,
});
const executor = getRuleExecutor(basePathMock);
await executor({
params: ruleParams,
@ -643,7 +638,7 @@ describe('BurnRateRuleExecutor', () => {
start: new Date().toISOString(),
}));
const executor = getRuleExecutor({ basePath: basePathMock });
const executor = getRuleExecutor(basePathMock);
await executor({
params: ruleParams,
startedAt: new Date(),

View file

@ -5,25 +5,29 @@
* 2.0.
*/
import { i18n } from '@kbn/i18n';
import numeral from '@elastic/numeral';
import { AlertsClientError, ExecutorType, RuleExecutorOptions } from '@kbn/alerting-plugin/server';
import { ObservabilitySloAlert } from '@kbn/alerts-as-data-utils';
import { IBasePath } from '@kbn/core/server';
import { i18n } from '@kbn/i18n';
import { getEcsGroups } from '@kbn/observability-alerting-rule-utils';
import { getAlertDetailsUrl } from '@kbn/observability-plugin/common';
import {
ALERT_EVALUATION_THRESHOLD,
ALERT_EVALUATION_VALUE,
ALERT_GROUP,
ALERT_REASON,
} from '@kbn/rule-data-utils';
import { AlertsClientError, RuleExecutorOptions } from '@kbn/alerting-plugin/server';
import { IBasePath } from '@kbn/core/server';
import { LocatorPublic } from '@kbn/share-plugin/common';
import { upperCase } from 'lodash';
import { addSpaceIdToPath } from '@kbn/spaces-plugin/server';
import { ALL_VALUE } from '@kbn/slo-schema';
import { AlertsLocatorParams, getAlertDetailsUrl } from '@kbn/observability-plugin/common';
import { ObservabilitySloAlert } from '@kbn/alerts-as-data-utils';
import { ExecutorType } from '@kbn/alerting-plugin/server';
import { addSpaceIdToPath } from '@kbn/spaces-plugin/server';
import { upperCase } from 'lodash';
import {
ALERT_ACTION,
HIGH_PRIORITY_ACTION,
LOW_PRIORITY_ACTION,
MEDIUM_PRIORITY_ACTION,
SUPPRESSED_PRIORITY_ACTION,
} from '../../../../common/constants';
import {
SLO_ID_FIELD,
SLO_INSTANCE_ID_FIELD,
@ -31,6 +35,10 @@ import {
} from '../../../../common/field_names/slo';
import { Duration } from '../../../domain/models';
import { KibanaSavedObjectsSLORepository } from '../../../services';
import { evaluate } from './lib/evaluate';
import { evaluateDependencies } from './lib/evaluate_dependencies';
import { shouldSuppressInstanceId } from './lib/should_suppress_instance_id';
import { getSloSummary } from './lib/summary_repository';
import {
AlertStates,
BurnRateAlertContext,
@ -41,29 +49,12 @@ import {
Group,
WindowSchema,
} from './types';
import {
ALERT_ACTION,
HIGH_PRIORITY_ACTION,
MEDIUM_PRIORITY_ACTION,
LOW_PRIORITY_ACTION,
SUPPRESSED_PRIORITY_ACTION,
} from '../../../../common/constants';
import { evaluate } from './lib/evaluate';
import { evaluateDependencies } from './lib/evaluate_dependencies';
import { shouldSuppressInstanceId } from './lib/should_suppress_instance_id';
import { getSloSummary } from './lib/summary_repository';
/**
 * Alert shape used by the burn rate rule: `ObservabilitySloAlert` with its
 * `'kibana.alert.group'` property removed and replaced by an optional
 * `ALERT_GROUP` property typed as a list of `Group`.
 */
export type BurnRateAlert = Omit<ObservabilitySloAlert, 'kibana.alert.group'> & {
[ALERT_GROUP]?: Group[];
};
export const getRuleExecutor = ({
basePath,
alertsLocator,
}: {
basePath: IBasePath;
alertsLocator?: LocatorPublic<AlertsLocatorParams>;
}) =>
export const getRuleExecutor = (basePath: IBasePath) =>
async function executor(
options: RuleExecutorOptions<
BurnRateRuleParams,

View file

@ -506,8 +506,9 @@ Object {
"script": Object {
"params": Object {
"target": 0.98,
"totalSlices": 30,
},
"source": "params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0",
"source": "params.total != null && params.total > 0 && params.totalSlices > 0 ? ((params.total - params.good) / params.totalSlices) / (1 - params.target) : 0",
},
},
},
@ -555,8 +556,9 @@ Object {
"script": Object {
"params": Object {
"target": 0.98,
"totalSlices": 3,
},
"source": "params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0",
"source": "params.total != null && params.total > 0 && params.totalSlices > 0 ? ((params.total - params.good) / params.totalSlices) / (1 - params.target) : 0",
},
},
},
@ -604,8 +606,9 @@ Object {
"script": Object {
"params": Object {
"target": 0.98,
"totalSlices": 180,
},
"source": "params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0",
"source": "params.total != null && params.total > 0 && params.totalSlices > 0 ? ((params.total - params.good) / params.totalSlices) / (1 - params.target) : 0",
},
},
},
@ -653,8 +656,9 @@ Object {
"script": Object {
"params": Object {
"target": 0.98,
"totalSlices": 15,
},
"source": "params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0",
"source": "params.total != null && params.total > 0 && params.totalSlices > 0 ? ((params.total - params.good) / params.totalSlices) / (1 - params.target) : 0",
},
},
},
@ -702,8 +706,9 @@ Object {
"script": Object {
"params": Object {
"target": 0.98,
"totalSlices": 720,
},
"source": "params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0",
"source": "params.total != null && params.total > 0 && params.totalSlices > 0 ? ((params.total - params.good) / params.totalSlices) / (1 - params.target) : 0",
},
},
},
@ -751,8 +756,9 @@ Object {
"script": Object {
"params": Object {
"target": 0.98,
"totalSlices": 60,
},
"source": "params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0",
"source": "params.total != null && params.total > 0 && params.totalSlices > 0 ? ((params.total - params.good) / params.totalSlices) / (1 - params.target) : 0",
},
},
},
@ -800,8 +806,9 @@ Object {
"script": Object {
"params": Object {
"target": 0.98,
"totalSlices": 2160,
},
"source": "params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0",
"source": "params.total != null && params.total > 0 && params.totalSlices > 0 ? ((params.total - params.good) / params.totalSlices) / (1 - params.target) : 0",
},
},
},
@ -849,8 +856,9 @@ Object {
"script": Object {
"params": Object {
"target": 0.98,
"totalSlices": 180,
},
"source": "params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0",
"source": "params.total != null && params.total > 0 && params.totalSlices > 0 ? ((params.total - params.good) / params.totalSlices) / (1 - params.target) : 0",
},
},
},

View file

@ -24,6 +24,7 @@ describe('buildQuery()', () => {
const rule = createBurnRateRule(slo);
expect(buildQuery(STARTED_AT, slo, rule)).toMatchSnapshot();
});
it('should return a valid query with afterKey', () => {
const slo = createSLO({
id: 'test-slo',
@ -32,6 +33,7 @@ describe('buildQuery()', () => {
const rule = createBurnRateRule(slo);
expect(buildQuery(STARTED_AT, slo, rule, { instanceId: 'example' })).toMatchSnapshot();
});
it('should return a valid query for timeslices', () => {
const slo = createSLOWithTimeslicesBudgetingMethod({
id: 'test-slo',

View file

@ -7,9 +7,10 @@
import { timeslicesBudgetingMethodSchema } from '@kbn/slo-schema';
import { Duration, SLODefinition, toDurationUnit } from '../../../../domain/models';
import { BurnRateRuleParams, WindowSchema } from '../types';
import { getDelayInSecondsFromSLO } from '../../../../domain/services/get_delay_in_seconds_from_slo';
import { getLookbackDateRange } from '../../../../domain/services/get_lookback_date_range';
import { getSlicesFromDateRange } from '../../../../services/utils/get_slices_from_date_range';
import { BurnRateRuleParams, WindowSchema } from '../types';
type BurnRateWindowWithDuration = WindowSchema & {
longDuration: Duration;
@ -47,6 +48,7 @@ const TIMESLICE_AGGS = {
good: { sum: { field: 'slo.isGoodSlice' } },
total: { value_count: { field: 'slo.isGoodSlice' } },
};
const OCCURRENCE_AGGS = {
good: { sum: { field: 'slo.numerator' } },
total: { sum: { field: 'slo.denominator' } },
@ -59,12 +61,45 @@ function buildWindowAgg(
slo: SLODefinition,
dateRange: { from: Date; to: Date }
) {
const aggs = timeslicesBudgetingMethodSchema.is(slo.budgetingMethod)
? TIMESLICE_AGGS
: OCCURRENCE_AGGS;
const isTimesliceBudgetingMethod = timeslicesBudgetingMethodSchema.is(slo.budgetingMethod);
const aggs = isTimesliceBudgetingMethod ? TIMESLICE_AGGS : OCCURRENCE_AGGS;
// For the timeslices budgeting method, the burn rate is always computed from the observed bad slices, i.e. total observed - good observed = bad slices observed,
// and this is compared to the number of slices expected over the whole window duration.
const burnRateAgg = isTimesliceBudgetingMethod
? {
bucket_script: {
buckets_path: {
good: `${generateStatsKey(id, type)}>good`,
total: `${generateStatsKey(id, type)}>total`,
},
script: {
source:
'params.total != null && params.total > 0 && params.totalSlices > 0 ? ((params.total - params.good) / params.totalSlices) / (1 - params.target) : 0',
params: {
target: slo.objective.target,
totalSlices: getSlicesFromDateRange(dateRange, slo.objective.timesliceWindow!),
},
},
},
}
: {
bucket_script: {
buckets_path: {
good: `${generateStatsKey(id, type)}>good`,
total: `${generateStatsKey(id, type)}>total`,
},
script: {
source:
'params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0',
params: { target: slo.objective.target },
},
},
};
return {
[`${id}_${type}`]: {
[generateStatsKey(id, type)]: {
filter: {
range: {
'@timestamp': {
@ -75,19 +110,7 @@ function buildWindowAgg(
},
aggs,
},
[generateBurnRateKey(id, type)]: {
bucket_script: {
buckets_path: {
good: `${id}_${type}>good`,
total: `${id}_${type}>total`,
},
script: {
source:
'params.total != null && params.total > 0 ? (1 - (params.good / params.total)) / (1 - params.target) : 0',
params: { target: slo.objective.target },
},
},
},
[generateBurnRateKey(id, type)]: burnRateAgg,
[generateAboveThresholdKey(id, type)]: {
bucket_script: {
buckets_path: { burnRate: generateBurnRateKey(id, type) },
@ -134,14 +157,15 @@ function buildEvaluation(burnRateWindows: BurnRateWindowWithDuration[]) {
};
}, {});
const source = burnRateWindows.reduce((acc, _windDef, index) => {
const windowId = `${WINDOW}_${index}`;
const OP = acc ? ' || ' : '';
return `${acc}${OP}(params.${generateAboveThresholdKey(
windowId,
SHORT_WINDOW
)} == 1 && params.${generateAboveThresholdKey(windowId, LONG_WINDOW)} == 1)`;
}, '');
const source = burnRateWindows
.map((_windDef, index) => {
const windowId = `${WINDOW}_${index}`;
return `(params.${generateAboveThresholdKey(
windowId,
SHORT_WINDOW
)} == 1 && params.${generateAboveThresholdKey(windowId, LONG_WINDOW)} == 1)`;
})
.join(' || ');
return {
evaluation: {

View file

@ -75,12 +75,14 @@ async function queryAllResults(
index: SLO_DESTINATION_INDEX_PATTERN,
...queryAndAggs,
});
if (!results.aggregations) {
throw new Error('Elasticsearch query failed to return a valid aggregation');
}
if (results.aggregations.instances.buckets.length === 0) {
return buckets;
}
return queryAllResults(
esClient,
slo,

View file

@ -22,12 +22,9 @@ export async function getSloSummary(
query: {
bool: {
filter: [
{
term: { 'slo.id': slo.id },
},
{
term: { 'slo.instanceId': instanceId },
},
{ term: { 'slo.id': slo.id } },
{ term: { 'slo.revision': slo.revision } },
{ term: { 'slo.instanceId': instanceId } },
],
},
},

View file

@ -85,7 +85,7 @@ export function sloBurnRateRuleType(
producer: sloFeatureId,
minimumLicenseRequired: 'platinum' as LicenseType,
isExportable: true,
executor: getRuleExecutor({ basePath, alertsLocator }),
executor: getRuleExecutor(basePath),
doesSetRecoveryContext: true,
actionVariables: {
context: [

View file

@ -25,7 +25,7 @@ import { computeBurnRate, computeSLI } from '../domain/services';
import { getDelayInSecondsFromSLO } from '../domain/services/get_delay_in_seconds_from_slo';
import { getLookbackDateRange } from '../domain/services/get_lookback_date_range';
import { InternalQueryError } from '../errors';
import { computeTotalSlicesFromDateRange } from './utils/compute_total_slices_from_date_range';
import { getSlicesFromDateRange } from './utils/get_slices_from_date_range';
type WindowName = string;
export interface BurnRatesClient {
@ -224,10 +224,8 @@ function handleWindowedResult(
from: new Date(bucket.from_as_string!),
to: new Date(bucket.to_as_string!),
};
const totalSlices = computeTotalSlicesFromDateRange(
dateRange,
slo.objective.timesliceWindow!
);
const totalSlices = getSlicesFromDateRange(dateRange, slo.objective.timesliceWindow!);
sliValue = computeSLI(good, total, totalSlices);
} else {

View file

@ -46,3 +46,11 @@ export function twoMinute(): Duration {
/**
 * Test fixture: a `Duration` of 5 expressed in `DurationUnit.Minute`.
 */
export function fiveMinute(): Duration {
  const minutes = 5;
  return new Duration(minutes, DurationUnit.Minute);
}
/**
 * Test fixture: a `Duration` of 7 expressed in `DurationUnit.Minute`.
 */
export function sevenMinutes(): Duration {
  const minutes = 7;
  return new Duration(minutes, DurationUnit.Minute);
}
/**
 * Test fixture: a two-hour span expressed as a `Duration` of 120 in
 * `DurationUnit.Minute` (deliberately not in hours).
 */
export function twoHoursInMinutes(): Duration {
  const minutes = 120;
  return new Duration(minutes, DurationUnit.Minute);
}

View file

@ -33,7 +33,7 @@ import {
toCalendarAlignedTimeWindowMomentUnit,
} from '../domain/models';
import { computeSLI, computeSummaryStatus, toErrorBudget } from '../domain/services';
import { computeTotalSlicesFromDateRange } from './utils/compute_total_slices_from_date_range';
import { getSlicesFromDateRange } from './utils/get_slices_from_date_range';
interface DailyAggBucket {
key_as_string: string;
@ -194,7 +194,7 @@ function handleResultForCalendarAlignedAndTimeslices(
dateRange: { range: DateRange; queryRange: DateRange }
): HistoricalSummary[] {
const initialErrorBudget = 1 - objective.target;
const totalSlices = computeTotalSlicesFromDateRange(dateRange.range, objective.timesliceWindow!);
const totalSlices = getSlicesFromDateRange(dateRange.range, objective.timesliceWindow!);
return buckets.map((bucket: DailyAggBucket): HistoricalSummary => {
const good = bucket.cumulative_good?.value ?? 0;

View file

@ -25,7 +25,7 @@ import { computeSLI, computeSummaryStatus, toErrorBudget } from '../domain/servi
import { toDateRange } from '../domain/services/date_range';
import { BurnRatesClient } from './burn_rates_client';
import { getFlattenedGroupings } from './utils';
import { computeTotalSlicesFromDateRange } from './utils/compute_total_slices_from_date_range';
import { getSlicesFromDateRange } from './utils/get_slices_from_date_range';
interface Params {
slo: SLODefinition;
@ -190,7 +190,7 @@ function computeSliValue(
const total = bucket?.total?.value ?? 0;
if (timeslicesBudgetingMethodSchema.is(slo.budgetingMethod)) {
const totalSlices = computeTotalSlicesFromDateRange(dateRange, slo.objective.timesliceWindow!);
const totalSlices = getSlicesFromDateRange(dateRange, slo.objective.timesliceWindow!);
return computeSLI(good, total, totalSlices);
}

View file

@ -0,0 +1,41 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { oneMinute, sevenMinutes, twoHoursInMinutes, twoMinute } from '../fixtures/duration';
import { createSLOWithTimeslicesBudgetingMethod } from '../fixtures/slo';
import { getSlicesFromDateRange } from './get_slices_from_date_range';
describe('utils', () => {
  describe('GetSlicesFromDateRange', () => {
    // Each row: [label used in the test title, timeslice window, expected slice count].
    // Every case is evaluated against the same fixed one-hour date range.
    it.each([
      ['1min', oneMinute(), 60],
      ['2min', twoMinute(), 30],
      // 60 minutes is not a multiple of 7; the expected count is 9
      // (presumably rounded up by the implementation; confirm there).
      ['7min', sevenMinutes(), 9],
      // Window longer than the range itself still counts as one slice.
      ['120min', twoHoursInMinutes(), 1],
    ])(
      'returns the correct number of slices for %s timeslice window',
      (_label, timesliceWindow, expected) => {
        const oneHourRange = {
          from: new Date('2022-01-01T14:46:12.643Z'),
          to: new Date('2022-01-01T15:46:12.643Z'),
        };
        const objective = {
          target: 0.98,
          timesliceTarget: 0.9,
          timesliceWindow,
        };
        const slo = createSLOWithTimeslicesBudgetingMethod({ objective });

        const slices = getSlicesFromDateRange(oneHourRange, slo.objective.timesliceWindow!);

        expect(slices).toBe(expected);
      }
    );
  });
});

View file

@ -8,7 +8,7 @@
import moment from 'moment';
import { DateRange, Duration, toMomentUnitOfTime } from '../../domain/models';
export function computeTotalSlicesFromDateRange(dateRange: DateRange, timesliceWindow: Duration) {
export function getSlicesFromDateRange(dateRange: DateRange, timesliceWindow: Duration) {
const dateRangeDurationInUnit = moment(dateRange.to).diff(
dateRange.from,
toMomentUnitOfTime(timesliceWindow.unit)