[Logs UI] Respect and report on log threshold alert limits (#142263)

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
Felix Stürmer 2022-10-12 12:27:58 +02:00 committed by GitHub
parent 757ab767eb
commit 6792b7c9c6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 374 additions and 32 deletions

View file

@ -14,6 +14,8 @@ import {
getGroupedESQuery,
processUngroupedResults,
processGroupByResults,
LogThresholdAlertFactory,
LogThresholdAlertLimit,
} from './log_threshold_executor';
import {
Comparator,
@ -406,9 +408,14 @@ describe('Log threshold executor', () => {
});
describe('Results processors', () => {
describe('Can process ungrouped results', () => {
test('It handles the ALERT state correctly', () => {
const alertFactoryMock = jest.fn();
describe('for ungrouped results', () => {
it('handles the ALERT state correctly', () => {
const alertFactoryMock: jest.MockedFunction<LogThresholdAlertFactory> = jest.fn();
const alertLimitMock: jest.Mocked<LogThresholdAlertLimit> = {
getValue: jest.fn().mockReturnValue(10),
setLimitReached: jest.fn(),
};
const ruleParams = {
...baseRuleParams,
criteria: [positiveCriteria[0]],
@ -421,7 +428,7 @@ describe('Log threshold executor', () => {
},
} as UngroupedSearchQueryResponse;
processUngroupedResults(results, ruleParams, alertFactoryMock);
processUngroupedResults(results, ruleParams, alertFactoryMock, alertLimitMock);
// first call, fifth argument
expect(alertFactoryMock.mock.calls[0][4]).toEqual([
@ -437,11 +444,91 @@ describe('Log threshold executor', () => {
},
]);
});
// An ungrouped rule produces at most one alert per execution, so with a limit
// of 1 the single alert is expected to be reported as reaching the limit.
it('reports reaching a low limit when alerting', () => {
const alertFactoryMock: jest.MockedFunction<LogThresholdAlertFactory> = jest.fn();
// Mocked alert limit: only one alert is allowed for this execution.
const alertLimitMock: jest.Mocked<LogThresholdAlertLimit> = {
getValue: jest.fn().mockReturnValue(1),
setLimitReached: jest.fn(),
};
const ruleParams = {
...baseRuleParams,
criteria: [positiveCriteria[0]],
};
// 10 matching documents; the assertions below expect this to trigger the alert.
const results = {
hits: {
total: {
value: 10,
},
},
} as UngroupedSearchQueryResponse;
processUngroupedResults(results, ruleParams, alertFactoryMock, alertLimitMock);
// The alert is still created, and the limit is flagged as reached.
expect(alertFactoryMock).toBeCalledTimes(1);
expect(alertLimitMock.setLimitReached).toHaveBeenCalledWith(true);
});
// With a limit well above the single alert an ungrouped rule can create, the
// alert is expected to fire without the limit being reported as reached.
it('reports not reaching a higher limit when alerting', () => {
const alertFactoryMock: jest.MockedFunction<LogThresholdAlertFactory> = jest.fn();
// Mocked alert limit: up to 10 alerts are allowed for this execution.
const alertLimitMock: jest.Mocked<LogThresholdAlertLimit> = {
getValue: jest.fn().mockReturnValue(10),
setLimitReached: jest.fn(),
};
const ruleParams = {
...baseRuleParams,
criteria: [positiveCriteria[0]],
};
// 10 matching documents; the assertions below expect this to trigger the alert.
const results = {
hits: {
total: {
value: 10,
},
},
} as UngroupedSearchQueryResponse;
processUngroupedResults(results, ruleParams, alertFactoryMock, alertLimitMock);
// One alert is created and the limit is explicitly reported as not reached.
expect(alertFactoryMock).toBeCalledTimes(1);
expect(alertLimitMock.setLimitReached).toHaveBeenCalledWith(false);
});
// When nothing fires, the limit is expected to be reported as not reached,
// even though the mocked limit itself is 0.
it('reports not reaching the limit without any alerts', () => {
const alertFactoryMock: jest.MockedFunction<LogThresholdAlertFactory> = jest.fn();
// Mocked alert limit of 0: no alerts are allowed at all.
const alertLimitMock: jest.Mocked<LogThresholdAlertLimit> = {
getValue: jest.fn().mockReturnValue(0),
setLimitReached: jest.fn(),
};
const ruleParams = {
...baseRuleParams,
criteria: [positiveCriteria[0]],
};
// No matching documents; the assertions below expect no alert to be created.
const results = {
hits: {
total: {
value: 0,
},
},
} as UngroupedSearchQueryResponse;
processUngroupedResults(results, ruleParams, alertFactoryMock, alertLimitMock);
// The processor must still report on the limit when no alert was created.
expect(alertFactoryMock).not.toHaveBeenCalled();
expect(alertLimitMock.setLimitReached).toHaveBeenCalledWith(false);
});
});
describe('Can process grouped results', () => {
test('It handles the ALERT state correctly', () => {
const alertFactoryMock = jest.fn();
describe('for grouped results', () => {
it('handles the ALERT state correctly', () => {
const alertFactoryMock: jest.MockedFunction<LogThresholdAlertFactory> = jest.fn();
const alertLimitMock: jest.Mocked<LogThresholdAlertLimit> = {
getValue: jest.fn().mockReturnValue(2),
setLimitReached: jest.fn(),
};
const ruleParams = {
...baseRuleParams,
criteria: [positiveCriteria[0]],
@ -481,7 +568,7 @@ describe('Log threshold executor', () => {
},
] as GroupedSearchQueryResponse['aggregations']['groups']['buckets'];
processGroupByResults(results, ruleParams, alertFactoryMock);
processGroupByResults(results, ruleParams, alertFactoryMock, alertLimitMock);
expect(alertFactoryMock.mock.calls.length).toBe(2);
// First call, fifth argument
@ -514,6 +601,110 @@ describe('Log threshold executor', () => {
},
]);
});
// With a limit of 1, group processing is expected to stop creating alerts after
// the first one and to report the limit as reached.
it('respects and reports reaching a low limit when alerting', () => {
const alertFactoryMock: jest.MockedFunction<LogThresholdAlertFactory> = jest.fn();
// Mocked alert limit: only one alert is allowed for this execution.
const alertLimitMock: jest.Mocked<LogThresholdAlertLimit> = {
getValue: jest.fn().mockReturnValue(1),
setLimitReached: jest.fn(),
};
const ruleParams = {
...baseRuleParams,
criteria: [positiveCriteria[0]],
groupBy: ['host.name', 'event.dataset'],
};
// Two of these groups would fire on their own and one wouldn't, but the alert
// limit of 1 is expected to cut this down to a single alert.
const results = [
{
key: {
'host.name': 'i-am-a-host-name-1',
'event.dataset': 'i-am-a-dataset-1',
},
doc_count: 100,
filtered_results: {
doc_count: 10,
},
},
{
key: {
'host.name': 'i-am-a-host-name-2',
'event.dataset': 'i-am-a-dataset-2',
},
doc_count: 100,
filtered_results: {
doc_count: 2,
},
},
{
key: {
'host.name': 'i-am-a-host-name-3',
'event.dataset': 'i-am-a-dataset-3',
},
doc_count: 100,
filtered_results: {
doc_count: 20,
},
},
] as GroupedSearchQueryResponse['aggregations']['groups']['buckets'];
processGroupByResults(results, ruleParams, alertFactoryMock, alertLimitMock);
// Only one alert is created and the limit is flagged as reached.
expect(alertFactoryMock).toHaveBeenCalledTimes(1);
expect(alertLimitMock.setLimitReached).toHaveBeenCalledWith(true);
});
// With a limit above the number of firing groups, every firing group is
// expected to get an alert and the limit is reported as not reached.
it('reports not reaching a higher limit when alerting', () => {
const alertFactoryMock: jest.MockedFunction<LogThresholdAlertFactory> = jest.fn();
// Mocked alert limit: up to 10 alerts are allowed for this execution.
const alertLimitMock: jest.Mocked<LogThresholdAlertLimit> = {
getValue: jest.fn().mockReturnValue(10),
setLimitReached: jest.fn(),
};
const ruleParams = {
...baseRuleParams,
criteria: [positiveCriteria[0]],
groupBy: ['host.name', 'event.dataset'],
};
// Two groups should fire, one shouldn't
const results = [
{
key: {
'host.name': 'i-am-a-host-name-1',
'event.dataset': 'i-am-a-dataset-1',
},
doc_count: 100,
filtered_results: {
doc_count: 10,
},
},
{
key: {
'host.name': 'i-am-a-host-name-2',
'event.dataset': 'i-am-a-dataset-2',
},
doc_count: 100,
filtered_results: {
doc_count: 2,
},
},
{
key: {
'host.name': 'i-am-a-host-name-3',
'event.dataset': 'i-am-a-dataset-3',
},
doc_count: 100,
filtered_results: {
doc_count: 20,
},
},
] as GroupedSearchQueryResponse['aggregations']['groups']['buckets'];
processGroupByResults(results, ruleParams, alertFactoryMock, alertLimitMock);
// Both firing groups get an alert; 2 alerts stay below the limit of 10.
expect(alertFactoryMock).toHaveBeenCalledTimes(2);
expect(alertLimitMock.setLimitReached).toHaveBeenCalledWith(false);
});
});
});
});

View file

@ -19,6 +19,7 @@ import {
Alert,
AlertInstanceContext as AlertContext,
AlertInstanceState as AlertState,
RuleExecutorServices,
RuleTypeState,
} from '@kbn/alerting-plugin/server';
@ -60,18 +61,23 @@ export type LogThresholdRuleTypeState = RuleTypeState; // no specific state used
export type LogThresholdAlertState = AlertState; // no specific state used
export type LogThresholdAlertContext = AlertContext; // no specific instance context used
type LogThresholdAlert = Alert<
export type LogThresholdAlert = Alert<
LogThresholdAlertState,
LogThresholdAlertContext,
LogThresholdActionGroups
>;
type LogThresholdAlertFactory = (
export type LogThresholdAlertFactory = (
id: string,
reason: string,
value: number,
threshold: number,
actions?: Array<{ actionGroup: LogThresholdActionGroups; context: AlertContext }>
) => LogThresholdAlert;
/**
 * Type of the `alertLimit` object found on `services.alertFactory` of the rule
 * executor services, specialised to the log threshold alert state, alert
 * context and action group types.
 *
 * The executor and the result processors read the maximum number of alerts via
 * `getValue()` and report back via `setLimitReached(...)`.
 */
export type LogThresholdAlertLimit = RuleExecutorServices<
LogThresholdAlertState,
LogThresholdAlertContext,
LogThresholdActionGroups
>['alertFactory']['alertLimit'];
const COMPOSITE_GROUP_SIZE = 2000;
@ -96,8 +102,13 @@ export const createLogThresholdExecutor = (libs: InfraBackendLibs) =>
LogThresholdAlertContext,
LogThresholdActionGroups
>(async ({ services, params, startedAt }) => {
const { alertWithLifecycle, savedObjectsClient, scopedClusterClient, getAlertStartedDate } =
services;
const {
alertFactory: { alertLimit },
alertWithLifecycle,
savedObjectsClient,
scopedClusterClient,
getAlertStartedDate,
} = services;
const { basePath } = libs;
const alertFactory: LogThresholdAlertFactory = (id, reason, value, threshold, actions) => {
@ -150,6 +161,7 @@ export const createLogThresholdExecutor = (libs: InfraBackendLibs) =>
runtimeMappings,
scopedClusterClient.asCurrentUser,
alertFactory,
alertLimit,
startedAt.valueOf()
);
} else {
@ -160,6 +172,7 @@ export const createLogThresholdExecutor = (libs: InfraBackendLibs) =>
runtimeMappings,
scopedClusterClient.asCurrentUser,
alertFactory,
alertLimit,
startedAt.valueOf()
);
}
@ -185,6 +198,7 @@ export async function executeAlert(
runtimeMappings: estypes.MappingRuntimeFields,
esClient: ElasticsearchClient,
alertFactory: LogThresholdAlertFactory,
alertLimit: LogThresholdAlertLimit,
executionTimestamp: number
) {
const query = getESQuery(
@ -200,9 +214,19 @@ export async function executeAlert(
}
if (hasGroupBy(ruleParams)) {
processGroupByResults(await getGroupedResults(query, esClient), ruleParams, alertFactory);
processGroupByResults(
await getGroupedResults(query, esClient),
ruleParams,
alertFactory,
alertLimit
);
} else {
processUngroupedResults(await getUngroupedResults(query, esClient), ruleParams, alertFactory);
processUngroupedResults(
await getUngroupedResults(query, esClient),
ruleParams,
alertFactory,
alertLimit
);
}
}
@ -213,6 +237,7 @@ export async function executeRatioAlert(
runtimeMappings: estypes.MappingRuntimeFields,
esClient: ElasticsearchClient,
alertFactory: LogThresholdAlertFactory,
alertLimit: LogThresholdAlertLimit,
executionTimestamp: number
) {
// Ratio alert params are separated out into two standard sets of alert params
@ -254,7 +279,8 @@ export async function executeRatioAlert(
numeratorGroupedResults,
denominatorGroupedResults,
ruleParams,
alertFactory
alertFactory,
alertLimit
);
} else {
const [numeratorUngroupedResults, denominatorUngroupedResults] = await Promise.all([
@ -265,7 +291,8 @@ export async function executeRatioAlert(
numeratorUngroupedResults,
denominatorUngroupedResults,
ruleParams,
alertFactory
alertFactory,
alertLimit
);
}
}
@ -297,7 +324,8 @@ const getESQuery = (
export const processUngroupedResults = (
results: UngroupedSearchQueryResponse,
params: CountRuleParams,
alertFactory: LogThresholdAlertFactory
alertFactory: LogThresholdAlertFactory,
alertLimit: LogThresholdAlertLimit
) => {
const { count, criteria, timeSize, timeUnit } = params;
const documentCount = results.hits.total.value;
@ -323,6 +351,9 @@ export const processUngroupedResults = (
},
];
alertFactory(UNGROUPED_FACTORY_KEY, reasonMessage, documentCount, count.value, actions);
alertLimit.setLimitReached(alertLimit.getValue() <= 1);
} else {
alertLimit.setLimitReached(false);
}
};
@ -330,7 +361,8 @@ export const processUngroupedRatioResults = (
numeratorResults: UngroupedSearchQueryResponse,
denominatorResults: UngroupedSearchQueryResponse,
params: RatioRuleParams,
alertFactory: LogThresholdAlertFactory
alertFactory: LogThresholdAlertFactory,
alertLimit: LogThresholdAlertLimit
) => {
const { count, criteria, timeSize, timeUnit } = params;
@ -360,6 +392,9 @@ export const processUngroupedRatioResults = (
},
];
alertFactory(UNGROUPED_FACTORY_KEY, reasonMessage, ratio, count.value, actions);
alertLimit.setLimitReached(alertLimit.getValue() <= 1);
} else {
alertLimit.setLimitReached(false);
}
};
@ -405,16 +440,24 @@ const getReducedGroupByResults = (
export const processGroupByResults = (
results: GroupedSearchQueryResponse['aggregations']['groups']['buckets'],
params: CountRuleParams,
alertFactory: LogThresholdAlertFactory
alertFactory: LogThresholdAlertFactory,
alertLimit: LogThresholdAlertLimit
) => {
const { count, criteria, timeSize, timeUnit } = params;
const groupResults = getReducedGroupByResults(results);
groupResults.forEach((group) => {
let remainingAlertCount = alertLimit.getValue();
for (const group of groupResults) {
if (remainingAlertCount <= 0) {
break;
}
const documentCount = group.documentCount;
if (checkValueAgainstComparatorMap[count.comparator](documentCount, count.value)) {
remainingAlertCount -= 1;
const reasonMessage = getReasonMessageForGroupedCountAlert(
documentCount,
count.value,
@ -437,21 +480,30 @@ export const processGroupByResults = (
];
alertFactory(group.name, reasonMessage, documentCount, count.value, actions);
}
});
}
alertLimit.setLimitReached(remainingAlertCount <= 0);
};
export const processGroupByRatioResults = (
numeratorResults: GroupedSearchQueryResponse['aggregations']['groups']['buckets'],
denominatorResults: GroupedSearchQueryResponse['aggregations']['groups']['buckets'],
params: RatioRuleParams,
alertFactory: LogThresholdAlertFactory
alertFactory: LogThresholdAlertFactory,
alertLimit: LogThresholdAlertLimit
) => {
const { count, criteria, timeSize, timeUnit } = params;
const numeratorGroupResults = getReducedGroupByResults(numeratorResults);
const denominatorGroupResults = getReducedGroupByResults(denominatorResults);
numeratorGroupResults.forEach((numeratorGroup) => {
let remainingAlertCount = alertLimit.getValue();
for (const numeratorGroup of numeratorGroupResults) {
if (remainingAlertCount <= 0) {
break;
}
const numeratorDocumentCount = numeratorGroup.documentCount;
const denominatorGroup = denominatorGroupResults.find(
(_group) => _group.name === numeratorGroup.name
@ -464,6 +516,7 @@ export const processGroupByRatioResults = (
ratio !== undefined &&
checkValueAgainstComparatorMap[count.comparator](ratio, count.value)
) {
remainingAlertCount -= 1;
const reasonMessage = getReasonMessageForGroupedRatioAlert(
ratio,
count.value,
@ -487,7 +540,9 @@ export const processGroupByRatioResults = (
];
alertFactory(numeratorGroup.name, reasonMessage, ratio, count.value, actions);
}
});
}
alertLimit.setLimitReached(remainingAlertCount <= 0);
};
export const buildFiltersFromCriteria = (

View file

@ -10,6 +10,8 @@ import sinon from 'sinon';
import {
executeAlert,
executeRatioAlert,
LogThresholdAlertFactory,
LogThresholdAlertLimit,
} from '@kbn/infra-plugin/server/lib/alerting/log_threshold/log_threshold_executor';
import {
Comparator,
@ -28,9 +30,13 @@ export default function ({ getService }: FtrProviderContext) {
after(() => esArchiver.unload('x-pack/test/functional/es_archives/infra/alerts_test_data'));
describe('without group by', () => {
it('should work', async () => {
it('should trigger alerts below the alert limit', async () => {
const timestamp = new Date(DATES['alert-test-data'].gauge.max);
const alertFactory = sinon.fake();
const alertFactory = sinon.fake() as SinonSpyOf<LogThresholdAlertFactory>;
const alertLimit = {
getValue: sinon.fake.returns(10),
setLimitReached: sinon.fake(),
} as SinonSpiesOf<LogThresholdAlertLimit>;
const ruleParams = {
count: {
comparator: Comparator.GT_OR_EQ,
@ -46,6 +52,7 @@ export default function ({ getService }: FtrProviderContext) {
},
],
};
await executeAlert(
ruleParams,
'@timestamp',
@ -53,8 +60,10 @@ export default function ({ getService }: FtrProviderContext) {
{},
esClient,
alertFactory,
alertLimit,
timestamp.valueOf()
);
expect(alertFactory.callCount).to.equal(1);
expect(alertFactory.getCall(0).args).to.eql([
'*',
@ -74,13 +83,18 @@ export default function ({ getService }: FtrProviderContext) {
},
],
]);
expect(alertLimit.setLimitReached.calledOnceWith(false)).to.be(true);
});
});
describe('with group by', () => {
it('should work', async () => {
it('should trigger alerts up to the alert limit', async () => {
const timestamp = new Date(DATES['alert-test-data'].gauge.max);
const alertFactory = sinon.fake();
const alertFactory = sinon.fake() as SinonSpyOf<LogThresholdAlertFactory>;
const alertLimit = {
getValue: sinon.fake.returns(2),
setLimitReached: sinon.fake(),
} as SinonSpiesOf<LogThresholdAlertLimit>;
const ruleParams = {
count: {
comparator: Comparator.GT_OR_EQ,
@ -97,6 +111,7 @@ export default function ({ getService }: FtrProviderContext) {
},
],
};
await executeAlert(
ruleParams,
'@timestamp',
@ -104,8 +119,10 @@ export default function ({ getService }: FtrProviderContext) {
{},
esClient,
alertFactory,
alertLimit,
timestamp.valueOf()
);
expect(alertFactory.callCount).to.equal(2);
expect(alertFactory.getCall(0).args).to.eql([
'dev',
@ -125,6 +142,64 @@ export default function ({ getService }: FtrProviderContext) {
},
],
]);
expect(alertLimit.setLimitReached.calledOnceWith(true)).to.be(true);
});
// Runs a grouped count rule with an alert limit of 1: exactly one alert (for
// the `dev` group) is expected, and the limit must be reported as reached.
it('should limit alerts to the alert limit', async () => {
const timestamp = new Date(DATES['alert-test-data'].gauge.max);
const alertFactory = sinon.fake() as SinonSpyOf<LogThresholdAlertFactory>;
// Fake alert limit: only one alert is allowed for this execution.
const alertLimit = {
getValue: sinon.fake.returns(1),
setLimitReached: sinon.fake(),
} as SinonSpiesOf<LogThresholdAlertLimit>;
const ruleParams = {
count: {
comparator: Comparator.GT_OR_EQ,
value: 1,
},
timeUnit: 'm' as TimeUnit,
timeSize: 5,
groupBy: ['env'],
criteria: [
{
field: 'env',
comparator: Comparator.NOT_EQ,
value: 'test',
},
],
};
await executeAlert(
ruleParams,
'@timestamp',
'alerts-test-data',
{},
esClient,
alertFactory,
alertLimit,
timestamp.valueOf()
);
// Only the first firing group may produce an alert under a limit of 1.
expect(alertFactory.callCount).to.equal(1);
expect(alertFactory.getCall(0).args).to.eql([
'dev',
'2 log entries in the last 5 mins for dev. Alert when ≥ 1.',
2,
1,
[
{
actionGroup: 'logs.threshold.fired',
context: {
conditions: 'env does not equal test',
group: 'dev',
isRatio: false,
matchingDocuments: 2,
reason: '2 log entries in the last 5 mins for dev. Alert when ≥ 1.',
},
},
],
]);
// The limit must be reported exactly once, as reached.
expect(alertLimit.setLimitReached.calledOnceWith(true)).to.be(true);
});
});
});
@ -134,9 +209,13 @@ export default function ({ getService }: FtrProviderContext) {
after(() => esArchiver.unload('x-pack/test/functional/es_archives/infra/ten_thousand_plus'));
describe('without group by', () => {
it('should work', async () => {
it('should trigger alerts below the alert limit', async () => {
const timestamp = new Date(DATES.ten_thousand_plus.max);
const alertFactory = sinon.fake();
const alertFactory = sinon.fake() as SinonSpyOf<LogThresholdAlertFactory>;
const alertLimit = {
getValue: sinon.fake.returns(2),
setLimitReached: sinon.fake(),
} as SinonSpiesOf<LogThresholdAlertLimit>;
const ruleParams = {
count: {
comparator: Comparator.GT_OR_EQ,
@ -156,6 +235,7 @@ export default function ({ getService }: FtrProviderContext) {
{},
esClient,
alertFactory,
alertLimit,
timestamp.valueOf()
);
expect(alertFactory.callCount).to.equal(1);
@ -179,13 +259,18 @@ export default function ({ getService }: FtrProviderContext) {
},
],
]);
expect(alertLimit.setLimitReached.calledOnceWith(false)).to.be(true);
});
});
describe('with group by', () => {
it('should work', async () => {
it('should trigger alerts below the alert limit', async () => {
const timestamp = new Date(DATES.ten_thousand_plus.max);
const alertFactory = sinon.fake();
const alertFactory = sinon.fake() as SinonSpyOf<LogThresholdAlertFactory>;
const alertLimit = {
getValue: sinon.fake.returns(2),
setLimitReached: sinon.fake(),
} as SinonSpiesOf<LogThresholdAlertLimit>;
const ruleParams = {
count: {
comparator: Comparator.GT_OR_EQ,
@ -206,6 +291,7 @@ export default function ({ getService }: FtrProviderContext) {
{},
esClient,
alertFactory,
alertLimit,
timestamp.valueOf()
);
expect(alertFactory.callCount).to.equal(1);
@ -229,8 +315,18 @@ export default function ({ getService }: FtrProviderContext) {
},
],
]);
expect(alertLimit.setLimitReached.calledOnceWith(false)).to.be(true);
});
});
});
});
}
/**
 * A sinon spy whose argument tuple and return type are taken from the function
 * type `Fn`, so a fake can stand in for `Fn` while keeping typed call records.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- generic function constraint
type SinonSpyOf<Fn extends (...args: any[]) => any> = sinon.SinonSpy<Parameters<Fn>, ReturnType<Fn>>;
/**
 * Maps an object type made up of functions to the same shape in which every
 * member is replaced by the matching `SinonSpyOf` spy type.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- generic function constraint
type SinonSpiesOf<Target extends Record<string, (...args: any[]) => any>> = {
[Method in keyof Target]: SinonSpyOf<Target[Method]>;
};