[ML] AIOps: Adds log rate analysis to alert details page contextual insight. (#187690)

## Summary

Part of #178501.

This adds log rate analysis results to the prompt used for contextual
insights on alert details pages:

<img width="1149" alt="image"
src="https://github.com/user-attachments/assets/80b0f8e6-1ea1-4dbf-86ff-82c2ef175aa6">

<img width="1129" alt="image"
src="https://github.com/user-attachments/assets/a538cc3c-6f13-43e4-ad7d-8a93a779d349">


### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios
- [x] This was checked for breaking API changes and was [labeled
appropriately](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
Walter Rafelsberger 2024-08-28 13:37:15 +02:00 committed by GitHub
parent 1344d3b238
commit 3c2ce3c839
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 387 additions and 44 deletions

View file

@ -11,7 +11,7 @@ export interface GetLogRateAnalysisParametersFromAlertArgs {
alertStartedAt: string;
alertEndedAt?: string;
timeSize?: number;
timeUnit?: moment.unitOfTime.DurationConstructor;
timeUnit?: string;
}
export const getLogRateAnalysisParametersFromAlert = ({
@ -20,12 +20,7 @@ export const getLogRateAnalysisParametersFromAlert = ({
timeSize,
timeUnit,
}: GetLogRateAnalysisParametersFromAlertArgs) => {
// Identify `intervalFactor` to adjust time ranges based on alert settings.
// The default time ranges for `initialAnalysisStart` are suitable for a `1m` lookback.
// If an alert would have a `5m` lookback, this would result in a factor of `5`.
const lookbackDuration =
timeSize && timeUnit ? moment.duration(timeSize, timeUnit) : moment.duration(1, 'm');
const intervalFactor = Math.max(1, lookbackDuration.asSeconds() / 60);
const intervalFactor = getIntervalFactor(timeSize, timeUnit);
const alertStart = moment(alertStartedAt);
const alertEnd = alertEndedAt ? moment(alertEndedAt) : undefined;
@ -43,6 +38,21 @@ export const getLogRateAnalysisParametersFromAlert = ({
};
};
// Identify `intervalFactor` to adjust time ranges based on alert settings.
// The default time ranges for `initialAnalysisStart` are suitable for a `1m` lookback.
// If an alert would have a `5m` lookback, this would result in a factor of `5`.
export const getIntervalFactor = (timeSize?: number, timeUnit?: string) => {
  const lookbackDuration =
    timeSize && timeUnit
      ? moment.duration(
          timeSize,
          // Cast the string based time unit to moment's union of unit literals.
          // A direct assertion is sufficient since `string` overlaps the literal
          // union; the previous `as unknown as ... | undefined` double-cast was
          // unnecessarily wide (the ternary already guarantees `timeUnit` is set).
          timeUnit as moment.unitOfTime.DurationConstructor
        )
      : moment.duration(1, 'm');
  // `Math.max` guards against unexpected units resolving to a zero/short duration.
  return Math.max(1, lookbackDuration.asSeconds() / 60);
};
interface GetParameterHelperArgs {
alertStart: Moment;
intervalFactor: number;

View file

@ -0,0 +1,213 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import moment from 'moment';
import { queue } from 'async';
import { chunk } from 'lodash';
import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { withSpan } from '@kbn/apm-utils';
import type { ElasticsearchClient } from '@kbn/core/server';
import type { SignificantItem } from '@kbn/ml-agg-utils';
import { getSampleProbability } from '@kbn/ml-random-sampler-utils';
import type { AiopsLogRateAnalysisSchema } from '../api/schema';
import { getIntervalFactor } from '../get_log_rate_analysis_parameters_from_alert';
import { getSwappedWindowParameters } from '../get_swapped_window_parameters';
import { getLogRateChange } from '../get_log_rate_change';
import { getBaselineAndDeviationRates } from '../get_baseline_and_deviation_rates';
import { getLogRateAnalysisTypeForCounts } from '../get_log_rate_analysis_type_for_counts';
import { LOG_RATE_ANALYSIS_TYPE } from '../log_rate_analysis_type';
import { fetchIndexInfo } from './fetch_index_info';
import { fetchSignificantCategories } from './fetch_significant_categories';
import { fetchSignificantTermPValues } from './fetch_significant_term_p_values';
const MAX_CONCURRENT_QUERIES = 5;
const CHUNK_SIZE = 50;
interface QueueItem {
fn: typeof fetchSignificantCategories | typeof fetchSignificantTermPValues;
fieldNames: string[];
}
/**
 * Runs log rate analysis data on an index given some alert metadata.
 *
 * The analysis compares a "deviation" time range (one lookback duration before
 * the alert start) against a "baseline" time range (the two lookback durations
 * before that) and returns statistically significant keyword field/value pairs
 * and text field log message categories, adapted for contextual insights.
 *
 * @param esClient Elasticsearch client used to run the analysis queries.
 * @param abortSignal Optional signal to cancel in-flight queries.
 * @param arguments.index Index (pattern) to run the analysis on.
 * @param arguments.alertStartedAt Timestamp of the alert start.
 * @param arguments.alertRuleParameterTimeSize Optional alert rule lookback size.
 * @param arguments.alertRuleParameterTimeUnit Optional alert rule lookback unit.
 * @param arguments.timefield Time field name, defaults to `@timestamp`.
 * @param arguments.searchQuery Optional query to scope the analysis, defaults
 *        to `match_all`.
 * @returns The detected log rate analysis type (spike/dip) and the significant
 *          items sorted by their deviation/baseline doc count ratio.
 */
export async function fetchLogRateAnalysisForAlert({
  esClient,
  abortSignal,
  arguments: args,
}: {
  esClient: ElasticsearchClient;
  abortSignal?: AbortSignal;
  arguments: {
    index: string;
    alertStartedAt: string;
    alertRuleParameterTimeSize?: number;
    alertRuleParameterTimeUnit?: string;
    timefield?: string;
    searchQuery?: estypes.QueryDslQueryContainer;
  };
}) {
  const { alertStartedAt, timefield = '@timestamp' } = args;
  const alertStart = moment(alertStartedAt);
  // Scale the default `1m` lookback window ranges by the alert rule's settings.
  const intervalFactor = getIntervalFactor(
    args.alertRuleParameterTimeSize,
    args.alertRuleParameterTimeUnit
  );
  // The deviation time range is 1 lookback duration before the alert start.
  // The baseline time range is 2 lookback durations before the deviation time range.
  const windowParameters = {
    baselineMin: alertStart
      .clone()
      .subtract(3 * intervalFactor, 'minutes')
      .valueOf(),
    baselineMax: alertStart
      .clone()
      .subtract(1 * intervalFactor, 'minutes')
      .valueOf(),
    deviationMin: alertStart
      .clone()
      .subtract(1 * intervalFactor, 'minutes')
      .valueOf(),
    deviationMax: alertStart.valueOf(),
  };
  const { searchQuery = { match_all: {} } } = args;
  // Step 1: Get field candidates and total doc counts.
  const indexInfoParams: AiopsLogRateAnalysisSchema = {
    index: args.index,
    start: windowParameters.baselineMin,
    end: windowParameters.deviationMax,
    searchQuery: JSON.stringify(searchQuery),
    timeFieldName: timefield,
    ...windowParameters,
  };
  const indexInfo = await withSpan(
    { name: 'fetch_index_info', type: 'aiops-log-rate-analysis-for-alert' },
    () =>
      fetchIndexInfo({
        esClient,
        abortSignal,
        arguments: {
          ...indexInfoParams,
          // Always treat these well-known text fields as candidates.
          textFieldCandidatesOverrides: ['message', 'error.message'],
        },
      })
  );
  const { textFieldCandidates, keywordFieldCandidates } = indexInfo;
  const logRateAnalysisType = getLogRateAnalysisTypeForCounts({
    baselineCount: indexInfo.baselineTotalDocCount,
    deviationCount: indexInfo.deviationTotalDocCount,
    windowParameters,
  });
  // Just in case the log rate analysis type is 'dip', we need to swap
  // the window parameters for the analysis.
  const analysisWindowParameters =
    logRateAnalysisType === LOG_RATE_ANALYSIS_TYPE.SPIKE
      ? windowParameters
      : getSwappedWindowParameters(windowParameters);
  // Step 2: Identify significant items.
  // The following code will fetch significant categories and term p-values
  // using an async queue. The field candidates will be passed on as chunks
  // of 50 fields with up to 5 concurrent queries. This is to prevent running
  // into bucket limit issues if we'd throw possibly hundreds of field candidates
  // into a single query.
  const significantItems: SignificantItem[] = [];
  // Set up the queue: A queue item is an object with the function to call and
  // the field names to be passed to the function. This is done so we can push
  // queries for both keyword fields (using significant_terms/p-values) and
  // text fields (using categorize_text + custom code to identify significance)
  // into the same queue.
  const significantItemsQueue = queue(async function ({ fn, fieldNames }: QueueItem) {
    significantItems.push(
      ...(await fn({
        esClient,
        abortSignal,
        arguments: {
          ...indexInfoParams,
          ...analysisWindowParameters,
          fieldNames,
          sampleProbability: getSampleProbability(
            indexInfo.deviationTotalDocCount + indexInfo.baselineTotalDocCount
          ),
        },
      }))
    );
  }, MAX_CONCURRENT_QUERIES);
  // Push the actual items to the queue. We don't need to chunk the text fields
  // since they are just `message` and `error.message`.
  significantItemsQueue.push(
    [
      { fn: fetchSignificantCategories, fieldNames: textFieldCandidates },
      ...chunk(keywordFieldCandidates, CHUNK_SIZE).map((fieldNames) => ({
        fn: fetchSignificantTermPValues,
        fieldNames,
      })),
    ],
    (err) => {
      // Stop processing the remaining queued items on the first error.
      if (err) significantItemsQueue.kill();
    }
  );
  // Wait for the queue to finish.
  await withSpan(
    { name: 'fetch_significant_items', type: 'aiops-log-rate-analysis-for-alert' },
    () => significantItemsQueue.drain()
  );
  // RETURN DATA
  // Adapt the raw significant items data for contextual insights.
  return {
    logRateAnalysisType,
    significantItems: significantItems
      .map(({ fieldName, fieldValue, type, doc_count: docCount, bg_count: bgCount }) => {
        const { baselineBucketRate, deviationBucketRate } = getBaselineAndDeviationRates(
          logRateAnalysisType,
          // Normalize the amount of baseline buckets based on treating the
          // deviation duration as 1 bucket.
          (windowParameters.baselineMax - windowParameters.baselineMin) /
            (windowParameters.deviationMax - windowParameters.deviationMin),
          1,
          docCount,
          bgCount
        );
        const fieldType = type === 'keyword' ? 'metadata' : 'log message pattern';
        const data = {
          fieldType,
          fieldName,
          // Truncate field values to 140 characters.
          fieldValue: String(fieldValue).substring(0, 140),
          logRateChange: getLogRateChange(
            logRateAnalysisType,
            baselineBucketRate,
            deviationBucketRate
          ).message,
        };
        return {
          // Deviation/baseline doc count ratio, used to sort items below.
          logRateChangeSort: bgCount > 0 ? docCount / bgCount : docCount,
          data,
        };
      })
      .sort((a, b) => b.logRateChangeSort - a.logRateChangeSort)
      .map((d) => d.data),
  };
}

View file

@ -31,5 +31,6 @@
"@kbn/ml-string-hash",
"@kbn/ml-response-stream",
"@kbn/i18n",
"@kbn/apm-utils",
]
}

View file

@ -14,6 +14,7 @@ import { APMEventClient } from '../../../lib/helpers/create_es_client/create_apm
import { PROCESSOR_EVENT, TRACE_ID } from '../../../../common/es_fields/apm';
import { getTypedSearch } from '../../../utils/create_typed_es_client';
import { getDownstreamServiceResource } from '../get_observability_alert_details_context/get_downstream_dependency_name';
import { getShouldMatchOrNotExistFilter } from '../utils/get_should_match_or_not_exist_filter';
export interface LogCategory {
errorCategory: string;
@ -101,7 +102,7 @@ export async function getLogCategories({
categories: {
categorize_text: {
field: 'message',
size: 500,
size: 10,
},
aggs: {
sample: {
@ -147,37 +148,3 @@ export async function getLogCategories({
entities: flattenObject(sampleDoc),
};
}
// field/value pairs should match, or the field should not exist
// Each returned clause is an ES `bool` query satisfied when either the field
// matches the given value exactly, or the field is absent from the document.
// Pairs without a value are skipped entirely.
export function getShouldMatchOrNotExistFilter(
  keyValuePairs: Array<{
    field: string;
    value?: string;
  }>
) {
  return keyValuePairs
    // Drop pairs with a missing (or empty) value — no clause is generated.
    .filter(({ value }) => value)
    .map(({ field, value }) => {
      return {
        bool: {
          should: [
            // Option 1: the field exists and matches the value.
            {
              bool: {
                filter: [{ term: { [field]: value } }],
              },
            },
            // Option 2: the field does not exist on the document.
            {
              bool: {
                must_not: {
                  bool: {
                    filter: [{ exists: { field } }],
                  },
                },
              },
            },
          ],
          minimum_should_match: 1,
        },
      };
    });
}

View file

@ -0,0 +1,63 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import type { CoreRequestHandlerContext } from '@kbn/core/server';
import { aiAssistantLogsIndexPattern } from '@kbn/observability-ai-assistant-plugin/server';
import { fetchLogRateAnalysisForAlert } from '@kbn/aiops-log-rate-analysis/queries/fetch_log_rate_analysis_for_alert';
import { PROCESSOR_EVENT } from '../../../../common/es_fields/apm';
import { getShouldMatchOrNotExistFilter } from '../utils/get_should_match_or_not_exist_filter';
/**
 * Runs log rate analysis for an alert against the logs index pattern
 * configured for the AI assistant, scoped to the alert's entities and
 * excluding APM error documents.
 */
export async function getLogRateAnalysisForAlert({
  esClient,
  coreContext,
  arguments: args,
}: {
  esClient: ElasticsearchClient;
  coreContext: Pick<CoreRequestHandlerContext, 'uiSettings'>;
  arguments: {
    alertStartedAt: string;
    alertRuleParameterTimeSize?: number;
    alertRuleParameterTimeUnit?: string;
    entities: {
      'service.name'?: string;
      'host.name'?: string;
      'container.id'?: string;
      'kubernetes.pod.name'?: string;
    };
  };
}): ReturnType<typeof fetchLogRateAnalysisForAlert> {
  // Resolve the logs index pattern from the AI assistant's UI settings.
  const index = await coreContext.uiSettings.client.get<string>(aiAssistantLogsIndexPattern);

  // Build one "match or not exist" clause per entity field that has a value.
  const entityFilters = getShouldMatchOrNotExistFilter(
    Object.entries(args.entities).map(([field, value]) => ({ field, value }))
  );

  const searchQuery = {
    bool: {
      must_not: [
        // exclude APM errors
        { term: { [PROCESSOR_EVENT]: 'error' } },
      ],
      filter: [...entityFilters],
    },
  };

  return fetchLogRateAnalysisForAlert({
    esClient,
    arguments: {
      index,
      alertStartedAt: args.alertStartedAt,
      alertRuleParameterTimeSize: args.alertRuleParameterTimeSize,
      alertRuleParameterTimeUnit: args.alertRuleParameterTimeUnit,
      searchQuery,
    },
  });
}

View file

@ -22,6 +22,7 @@ import {
APMDownstreamDependency,
getAssistantDownstreamDependencies,
} from '../get_apm_downstream_dependencies';
import { getLogRateAnalysisForAlert } from '../get_log_rate_analysis_for_alert';
import { getLogCategories, LogCategory } from '../get_log_categories';
import { getAnomalies } from '../get_apm_service_summary/get_anomalies';
import { getServiceNameFromSignals } from './get_service_name_from_signals';
@ -160,6 +161,42 @@ export const getAlertDetailsContextHandler = (
});
}
// log rate analysis
dataFetchers.push(async () => {
const { logRateAnalysisType, significantItems } = await getLogRateAnalysisForAlert({
esClient,
coreContext,
arguments: {
alertStartedAt: moment(alertStartedAt).toISOString(),
alertRuleParameterTimeSize: query.alert_rule_parameter_time_size
? parseInt(query.alert_rule_parameter_time_size, 10)
: undefined,
alertRuleParameterTimeUnit: query.alert_rule_parameter_time_unit,
entities: {
'service.name': serviceName,
'host.name': hostName,
'container.id': containerId,
'kubernetes.pod.name': kubernetesPodName,
},
},
});
if (logRateAnalysisType !== 'spike' || significantItems.length === 0) {
return {
key: 'logRateAnalysis',
description:
'Log rate analysis did not identify any significant metadata or log patterns.',
data: [],
};
}
return {
key: 'logRateAnalysis',
description: `Statistically significant log metadata and log message patterns occurring in the lookback period before the alert was triggered.`,
data: significantItems,
};
});
// log categories
dataFetchers.push(async () => {
const downstreamDependencies = await downstreamDependenciesPromise;

View file

@ -5,7 +5,7 @@
* 2.0.
*/
import { getShouldMatchOrNotExistFilter } from '.';
import { getShouldMatchOrNotExistFilter } from './get_should_match_or_not_exist_filter';
describe('getShouldMatchOrNotExistFilter', () => {
describe('when all fields are provided', () => {

View file

@ -0,0 +1,40 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
// field/value pairs should match, or the field should not exist
// For every pair that carries a value, emit an ES `bool` clause that accepts a
// document when the field matches the value OR the field is missing entirely.
// Pairs without a value produce no clause.
export function getShouldMatchOrNotExistFilter(
  keyValuePairs: Array<{
    field: string;
    value?: string;
  }>
) {
  return keyValuePairs.flatMap(({ field, value }) => {
    if (!value) {
      return [];
    }

    const matchesValue = {
      bool: {
        filter: [{ term: { [field]: value } }],
      },
    };

    const fieldIsAbsent = {
      bool: {
        must_not: {
          bool: {
            filter: [{ exists: { field } }],
          },
        },
      },
    };

    return [
      {
        bool: {
          should: [matchesValue, fieldIsAbsent],
          minimum_should_match: 1,
        },
      },
    ];
  });
}

View file

@ -127,7 +127,8 @@
"@kbn/server-route-repository-utils",
"@kbn/core-analytics-browser",
"@kbn/apm-types",
"@kbn/entities-schema"
"@kbn/entities-schema",
"@kbn/aiops-log-rate-analysis"
],
"exclude": [
"target/**/*"

View file

@ -9,6 +9,7 @@ import { EuiFlexGroup, EuiFlexItem } from '@elastic/eui';
import React, { useCallback } from 'react';
import { i18n } from '@kbn/i18n';
import { ALERT_RULE_PARAMETERS } from '@kbn/rule-data-utils';
import dedent from 'dedent';
import { type AlertDetailsContextualInsight } from '../../../server/services';
import { useKibana } from '../../utils/kibana_react';
@ -35,6 +36,12 @@ export function AlertDetailContextualInsights({ alert }: { alert: AlertData | nu
query: {
alert_started_at: new Date(alert.formatted.start).toISOString(),
// alert fields used for log rate analysis
alert_rule_parameter_time_size: alert.formatted.fields[ALERT_RULE_PARAMETERS]
?.timeSize as string | undefined,
alert_rule_parameter_time_unit: alert.formatted.fields[ALERT_RULE_PARAMETERS]
?.timeUnit as string | undefined,
// service fields
'service.name': fields['service.name'],
'service.environment': fields['service.environment'],

View file

@ -20,6 +20,10 @@ export const alertDetailsContextRt = t.intersection([
alert_started_at: t.string,
}),
t.partial({
// alert fields used for log rate analysis
alert_rule_parameter_time_size: t.string,
alert_rule_parameter_time_unit: t.string,
// apm fields
'service.name': t.string,
'service.environment': t.string,