[ML] AIOps: Adds log rate analysis to alert details page contextual insight. (#187690)
## Summary

Part of #178501. This adds log rate analysis results to the prompt used for contextual insights on alert details pages:

<img width="1149" alt="image" src="https://github.com/user-attachments/assets/80b0f8e6-1ea1-4dbf-86ff-82c2ef175aa6">
<img width="1129" alt="image" src="https://github.com/user-attachments/assets/a538cc3c-6f13-43e4-ad7d-8a93a779d349">

### Checklist

- [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios
- [x] This was checked for breaking API changes and was [labeled appropriately](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in: parent 1344d3b238, commit 3c2ce3c839.

11 changed files with 387 additions and 44 deletions
```diff
@@ -11,7 +11,7 @@ export interface GetLogRateAnalysisParametersFromAlertArgs {
   alertStartedAt: string;
   alertEndedAt?: string;
   timeSize?: number;
-  timeUnit?: moment.unitOfTime.DurationConstructor;
+  timeUnit?: string;
 }
 
 export const getLogRateAnalysisParametersFromAlert = ({
@@ -20,12 +20,7 @@ export const getLogRateAnalysisParametersFromAlert = ({
   timeSize,
   timeUnit,
 }: GetLogRateAnalysisParametersFromAlertArgs) => {
-  // Identify `intervalFactor` to adjust time ranges based on alert settings.
-  // The default time ranges for `initialAnalysisStart` are suitable for a `1m` lookback.
-  // If an alert would have a `5m` lookback, this would result in a factor of `5`.
-  const lookbackDuration =
-    timeSize && timeUnit ? moment.duration(timeSize, timeUnit) : moment.duration(1, 'm');
-  const intervalFactor = Math.max(1, lookbackDuration.asSeconds() / 60);
+  const intervalFactor = getIntervalFactor(timeSize, timeUnit);
 
   const alertStart = moment(alertStartedAt);
   const alertEnd = alertEndedAt ? moment(alertEndedAt) : undefined;
@@ -43,6 +38,21 @@ export const getLogRateAnalysisParametersFromAlert = ({
   };
 };
 
+// Identify `intervalFactor` to adjust time ranges based on alert settings.
+// The default time ranges for `initialAnalysisStart` are suitable for a `1m` lookback.
+// If an alert would have a `5m` lookback, this would result in a factor of `5`.
+export const getIntervalFactor = (timeSize?: number, timeUnit?: string) => {
+  const lookbackDuration =
+    timeSize && timeUnit
+      ? moment.duration(
+          timeSize,
+          // workaround to cast the string based time unit to moment's format.
+          timeUnit as unknown as moment.unitOfTime.DurationConstructor | undefined
+        )
+      : moment.duration(1, 'm');
+  return Math.max(1, lookbackDuration.asSeconds() / 60);
+};
+
 interface GetParameterHelperArgs {
   alertStart: Moment;
   intervalFactor: number;
```
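To make the scaling concrete, here is a minimal sketch of how the extracted `getIntervalFactor` behaves for a few rule lookback settings (a re-statement for illustration, not part of the diff; the expected values follow from normalizing the lookback to minutes and clamping at 1):

```ts
import moment from 'moment';

// Illustration-only restatement of the extracted helper above.
const getIntervalFactor = (timeSize?: number, timeUnit?: string) => {
  const lookbackDuration =
    timeSize && timeUnit
      ? moment.duration(timeSize, timeUnit as moment.unitOfTime.DurationConstructor)
      : moment.duration(1, 'm');
  return Math.max(1, lookbackDuration.asSeconds() / 60);
};

getIntervalFactor(5, 'm'); // => 5: a 5m lookback scales the default 1m ranges by 5
getIntervalFactor(1, 'h'); // => 60: a 1h lookback scales by 60
getIntervalFactor(30, 's'); // => 1: sub-minute lookbacks are clamped to the 1m default
getIntervalFactor(undefined, undefined); // => 1: falls back to the 1m default
```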
New file, `@@ -0,0 +1,213 @@`:

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import moment from 'moment';
import { queue } from 'async';
import { chunk } from 'lodash';

import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';

import { withSpan } from '@kbn/apm-utils';
import type { ElasticsearchClient } from '@kbn/core/server';
import type { SignificantItem } from '@kbn/ml-agg-utils';
import { getSampleProbability } from '@kbn/ml-random-sampler-utils';

import type { AiopsLogRateAnalysisSchema } from '../api/schema';
import { getIntervalFactor } from '../get_log_rate_analysis_parameters_from_alert';
import { getSwappedWindowParameters } from '../get_swapped_window_parameters';
import { getLogRateChange } from '../get_log_rate_change';
import { getBaselineAndDeviationRates } from '../get_baseline_and_deviation_rates';
import { getLogRateAnalysisTypeForCounts } from '../get_log_rate_analysis_type_for_counts';
import { LOG_RATE_ANALYSIS_TYPE } from '../log_rate_analysis_type';

import { fetchIndexInfo } from './fetch_index_info';
import { fetchSignificantCategories } from './fetch_significant_categories';
import { fetchSignificantTermPValues } from './fetch_significant_term_p_values';

const MAX_CONCURRENT_QUERIES = 5;
const CHUNK_SIZE = 50;

interface QueueItem {
  fn: typeof fetchSignificantCategories | typeof fetchSignificantTermPValues;
  fieldNames: string[];
}

/**
 * Runs log rate analysis on an index given some alert metadata.
 */
export async function fetchLogRateAnalysisForAlert({
  esClient,
  abortSignal,
  arguments: args,
}: {
  esClient: ElasticsearchClient;
  abortSignal?: AbortSignal;
  arguments: {
    index: string;
    alertStartedAt: string;
    alertRuleParameterTimeSize?: number;
    alertRuleParameterTimeUnit?: string;
    timefield?: string;
    searchQuery?: estypes.QueryDslQueryContainer;
  };
}) {
  const { alertStartedAt, timefield = '@timestamp' } = args;
  const alertStart = moment(alertStartedAt);

  const intervalFactor = getIntervalFactor(
    args.alertRuleParameterTimeSize,
    args.alertRuleParameterTimeUnit
  );

  // The deviation time range is 1 lookback duration before the alert start.
  // The baseline time range is 2 lookback durations before the deviation time range.
  const windowParameters = {
    baselineMin: alertStart
      .clone()
      .subtract(3 * intervalFactor, 'minutes')
      .valueOf(),
    baselineMax: alertStart
      .clone()
      .subtract(1 * intervalFactor, 'minutes')
      .valueOf(),
    deviationMin: alertStart
      .clone()
      .subtract(1 * intervalFactor, 'minutes')
      .valueOf(),
    deviationMax: alertStart.valueOf(),
  };

  const { searchQuery = { match_all: {} } } = args;

  // Step 1: Get field candidates and total doc counts.
  const indexInfoParams: AiopsLogRateAnalysisSchema = {
    index: args.index,
    start: windowParameters.baselineMin,
    end: windowParameters.deviationMax,
    searchQuery: JSON.stringify(searchQuery),
    timeFieldName: timefield,
    ...windowParameters,
  };

  const indexInfo = await withSpan(
    { name: 'fetch_index_info', type: 'aiops-log-rate-analysis-for-alert' },
    () =>
      fetchIndexInfo({
        esClient,
        abortSignal,
        arguments: {
          ...indexInfoParams,
          textFieldCandidatesOverrides: ['message', 'error.message'],
        },
      })
  );
  const { textFieldCandidates, keywordFieldCandidates } = indexInfo;

  const logRateAnalysisType = getLogRateAnalysisTypeForCounts({
    baselineCount: indexInfo.baselineTotalDocCount,
    deviationCount: indexInfo.deviationTotalDocCount,
    windowParameters,
  });

  // Just in case the log rate analysis type is 'dip', we need to swap
  // the window parameters for the analysis.
  const analysisWindowParameters =
    logRateAnalysisType === LOG_RATE_ANALYSIS_TYPE.SPIKE
      ? windowParameters
      : getSwappedWindowParameters(windowParameters);

  // Step 2: Identify significant items.
  // The following code will fetch significant categories and term p-values
  // using an async queue. The field candidates will be passed on as chunks
  // of 50 fields with up to 5 concurrent queries. This is to prevent running
  // into bucket limit issues if we'd throw possibly hundreds of field candidates
  // into a single query.

  const significantItems: SignificantItem[] = [];

  // Set up the queue: A queue item is an object with the function to call and
  // the field names to be passed to the function. This is done so we can push
  // queries for both keyword fields (using significant_terms/p-values) and
  // text fields (using categorize_text + custom code to identify significance)
  // into the same queue.
  const significantItemsQueue = queue(async function ({ fn, fieldNames }: QueueItem) {
    significantItems.push(
      ...(await fn({
        esClient,
        abortSignal,
        arguments: {
          ...indexInfoParams,
          ...analysisWindowParameters,
          fieldNames,
          sampleProbability: getSampleProbability(
            indexInfo.deviationTotalDocCount + indexInfo.baselineTotalDocCount
          ),
        },
      }))
    );
  }, MAX_CONCURRENT_QUERIES);

  // Push the actual items to the queue. We don't need to chunk the text fields
  // since they are just `message` and `error.message`.
  significantItemsQueue.push(
    [
      { fn: fetchSignificantCategories, fieldNames: textFieldCandidates },
      ...chunk(keywordFieldCandidates, CHUNK_SIZE).map((fieldNames) => ({
        fn: fetchSignificantTermPValues,
        fieldNames,
      })),
    ],
    (err) => {
      if (err) significantItemsQueue.kill();
    }
  );

  // Wait for the queue to finish.
  await withSpan(
    { name: 'fetch_significant_items', type: 'aiops-log-rate-analysis-for-alert' },
    () => significantItemsQueue.drain()
  );

  // RETURN DATA
  // Adapt the raw significant items data for contextual insights.
  return {
    logRateAnalysisType,
    significantItems: significantItems
      .map(({ fieldName, fieldValue, type, doc_count: docCount, bg_count: bgCount }) => {
        const { baselineBucketRate, deviationBucketRate } = getBaselineAndDeviationRates(
          logRateAnalysisType,
          // Normalize the amount of baseline buckets based on treating the
          // deviation duration as 1 bucket.
          (windowParameters.baselineMax - windowParameters.baselineMin) /
            (windowParameters.deviationMax - windowParameters.deviationMin),
          1,
          docCount,
          bgCount
        );

        const fieldType = type === 'keyword' ? 'metadata' : 'log message pattern';

        const data = {
          fieldType,
          fieldName,
          fieldValue: String(fieldValue).substring(0, 140),
          logRateChange: getLogRateChange(
            logRateAnalysisType,
            baselineBucketRate,
            deviationBucketRate
          ).message,
        };

        return {
          logRateChangeSort: bgCount > 0 ? docCount / bgCount : docCount,
          data,
        };
      })
      .sort((a, b) => b.logRateChangeSort - a.logRateChangeSort)
      .map((d) => d.data),
  };
}
```
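As a worked example of the window parameter arithmetic above (a sketch with hypothetical timestamps, not part of the diff): with a `5m` rule lookback, `getIntervalFactor` returns 5, so for an alert starting at `T` the deviation window is `[T - 5m, T]` and the baseline window covers the two lookbacks before that, `[T - 15m, T - 5m]`. The baseline-bucket normalization in the return mapping then resolves to 2 baseline buckets against 1 deviation bucket:

```ts
import moment from 'moment';

const intervalFactor = 5; // e.g. a 5m lookback
const alertStart = moment('2024-07-08T12:00:00Z');

const windowParameters = {
  baselineMin: alertStart.clone().subtract(3 * intervalFactor, 'minutes').valueOf(), // 11:45
  baselineMax: alertStart.clone().subtract(1 * intervalFactor, 'minutes').valueOf(), // 11:55
  deviationMin: alertStart.clone().subtract(1 * intervalFactor, 'minutes').valueOf(), // 11:55
  deviationMax: alertStart.valueOf(), // 12:00
};

// Baseline covers 10 minutes, deviation covers 5 minutes, so the baseline
// counts as 2 buckets relative to a single deviation bucket:
const baselineBuckets =
  (windowParameters.baselineMax - windowParameters.baselineMin) /
  (windowParameters.deviationMax - windowParameters.deviationMin); // => 2
```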
```diff
@@ -31,5 +31,6 @@
     "@kbn/ml-string-hash",
     "@kbn/ml-response-stream",
     "@kbn/i18n",
+    "@kbn/apm-utils",
   ]
 }
```
```diff
@@ -14,6 +14,7 @@ import { APMEventClient } from '../../../lib/helpers/create_es_client/create_apm
 import { PROCESSOR_EVENT, TRACE_ID } from '../../../../common/es_fields/apm';
 import { getTypedSearch } from '../../../utils/create_typed_es_client';
 import { getDownstreamServiceResource } from '../get_observability_alert_details_context/get_downstream_dependency_name';
+import { getShouldMatchOrNotExistFilter } from '../utils/get_should_match_or_not_exist_filter';
 
 export interface LogCategory {
   errorCategory: string;
@@ -101,7 +102,7 @@ export async function getLogCategories({
       categories: {
         categorize_text: {
           field: 'message',
-          size: 500,
+          size: 10,
         },
         aggs: {
           sample: {
@@ -147,37 +148,3 @@ export async function getLogCategories({
     entities: flattenObject(sampleDoc),
   };
 }
-
-// field/value pairs should match, or the field should not exist
-export function getShouldMatchOrNotExistFilter(
-  keyValuePairs: Array<{
-    field: string;
-    value?: string;
-  }>
-) {
-  return keyValuePairs
-    .filter(({ value }) => value)
-    .map(({ field, value }) => {
-      return {
-        bool: {
-          should: [
-            {
-              bool: {
-                filter: [{ term: { [field]: value } }],
-              },
-            },
-            {
-              bool: {
-                must_not: {
-                  bool: {
-                    filter: [{ exists: { field } }],
-                  },
-                },
-              },
-            },
-          ],
-          minimum_should_match: 1,
-        },
-      };
-    });
-}
```
New file, `@@ -0,0 +1,63 @@`:

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import type { CoreRequestHandlerContext } from '@kbn/core/server';
import { aiAssistantLogsIndexPattern } from '@kbn/observability-ai-assistant-plugin/server';
import { fetchLogRateAnalysisForAlert } from '@kbn/aiops-log-rate-analysis/queries/fetch_log_rate_analysis_for_alert';
import { PROCESSOR_EVENT } from '../../../../common/es_fields/apm';
import { getShouldMatchOrNotExistFilter } from '../utils/get_should_match_or_not_exist_filter';

/**
 * Runs log rate analysis on an index given some alert metadata.
 */
export async function getLogRateAnalysisForAlert({
  esClient,
  coreContext,
  arguments: args,
}: {
  esClient: ElasticsearchClient;
  coreContext: Pick<CoreRequestHandlerContext, 'uiSettings'>;
  arguments: {
    alertStartedAt: string;
    alertRuleParameterTimeSize?: number;
    alertRuleParameterTimeUnit?: string;
    entities: {
      'service.name'?: string;
      'host.name'?: string;
      'container.id'?: string;
      'kubernetes.pod.name'?: string;
    };
  };
}): ReturnType<typeof fetchLogRateAnalysisForAlert> {
  const index = await coreContext.uiSettings.client.get<string>(aiAssistantLogsIndexPattern);

  const keyValueFilters = getShouldMatchOrNotExistFilter(
    Object.entries(args.entities).map(([key, value]) => ({ field: key, value }))
  );

  const searchQuery = {
    bool: {
      must_not: [
        // exclude APM errors
        { term: { [PROCESSOR_EVENT]: 'error' } },
      ],
      filter: [...keyValueFilters],
    },
  };

  return fetchLogRateAnalysisForAlert({
    esClient,
    arguments: {
      index,
      alertStartedAt: args.alertStartedAt,
      alertRuleParameterTimeSize: args.alertRuleParameterTimeSize,
      alertRuleParameterTimeUnit: args.alertRuleParameterTimeUnit,
      searchQuery,
    },
  });
}
```
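To illustrate the query composition (a sketch with hypothetical values, assuming `PROCESSOR_EVENT` resolves to `'processor.event'`): for an alert with `service.name: 'checkout'` and the other entity fields unset, the composed search query would look roughly like this:

```ts
// Hypothetical result for entities = { 'service.name': 'checkout' }.
const searchQuery = {
  bool: {
    // excludes APM error documents (assuming PROCESSOR_EVENT === 'processor.event')
    must_not: [{ term: { 'processor.event': 'error' } }],
    filter: [
      {
        bool: {
          should: [
            { bool: { filter: [{ term: { 'service.name': 'checkout' } }] } },
            { bool: { must_not: { bool: { filter: [{ exists: { field: 'service.name' } }] } } } },
          ],
          minimum_should_match: 1,
        },
      },
    ],
  },
};
```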
```diff
@@ -22,6 +22,7 @@ import {
   APMDownstreamDependency,
   getAssistantDownstreamDependencies,
 } from '../get_apm_downstream_dependencies';
+import { getLogRateAnalysisForAlert } from '../get_log_rate_analysis_for_alert';
 import { getLogCategories, LogCategory } from '../get_log_categories';
 import { getAnomalies } from '../get_apm_service_summary/get_anomalies';
 import { getServiceNameFromSignals } from './get_service_name_from_signals';
@@ -160,6 +161,42 @@ export const getAlertDetailsContextHandler = (
     });
   }
 
+  // log rate analysis
+  dataFetchers.push(async () => {
+    const { logRateAnalysisType, significantItems } = await getLogRateAnalysisForAlert({
+      esClient,
+      coreContext,
+      arguments: {
+        alertStartedAt: moment(alertStartedAt).toISOString(),
+        alertRuleParameterTimeSize: query.alert_rule_parameter_time_size
+          ? parseInt(query.alert_rule_parameter_time_size, 10)
+          : undefined,
+        alertRuleParameterTimeUnit: query.alert_rule_parameter_time_unit,
+        entities: {
+          'service.name': serviceName,
+          'host.name': hostName,
+          'container.id': containerId,
+          'kubernetes.pod.name': kubernetesPodName,
+        },
+      },
+    });
+
+    if (logRateAnalysisType !== 'spike' || significantItems.length === 0) {
+      return {
+        key: 'logRateAnalysis',
+        description:
+          'Log rate analysis did not identify any significant metadata or log patterns.',
+        data: [],
+      };
+    }
+
+    return {
+      key: 'logRateAnalysis',
+      description: `Statistically significant log metadata and log message patterns occurring in the lookback period before the alert was triggered.`,
+      data: significantItems,
+    };
+  });
+
   // log categories
   dataFetchers.push(async () => {
     const downstreamDependencies = await downstreamDependenciesPromise;
```
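For reference, a sketch of the contextual insight entry this fetcher resolves to when the analysis finds a spike (field values are hypothetical; the shape follows the return mapping in `fetchLogRateAnalysisForAlert` above):

```ts
const exampleInsightEntry = {
  key: 'logRateAnalysis',
  description:
    'Statistically significant log metadata and log message patterns occurring in the lookback period before the alert was triggered.',
  data: [
    {
      fieldType: 'metadata', // 'metadata' for keyword fields, 'log message pattern' for text fields
      fieldName: 'service.name',
      fieldValue: 'checkout', // truncated to 140 characters
      logRateChange: '2x increase', // hypothetical message produced by getLogRateChange
    },
  ],
};
```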
```diff
@@ -5,7 +5,7 @@
  * 2.0.
  */
 
-import { getShouldMatchOrNotExistFilter } from '.';
+import { getShouldMatchOrNotExistFilter } from './get_should_match_or_not_exist_filter';
 
 describe('getShouldMatchOrNotExistFilter', () => {
   describe('when all fields are provided', () => {
```
New file, `@@ -0,0 +1,40 @@`:

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

// field/value pairs should match, or the field should not exist
export function getShouldMatchOrNotExistFilter(
  keyValuePairs: Array<{
    field: string;
    value?: string;
  }>
) {
  return keyValuePairs
    .filter(({ value }) => value)
    .map(({ field, value }) => {
      return {
        bool: {
          should: [
            {
              bool: {
                filter: [{ term: { [field]: value } }],
              },
            },
            {
              bool: {
                must_not: {
                  bool: {
                    filter: [{ exists: { field } }],
                  },
                },
              },
            },
          ],
          minimum_should_match: 1,
        },
      };
    });
}
```
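A usage sketch for the extracted helper: pairs with an undefined value are dropped, and each remaining pair becomes a `bool.should` clause matching either the exact term or documents where the field is absent entirely.

```ts
getShouldMatchOrNotExistFilter([
  { field: 'service.name', value: 'checkout' },
  { field: 'host.name', value: undefined }, // filtered out before mapping
]);
// => [
//   {
//     bool: {
//       should: [
//         { bool: { filter: [{ term: { 'service.name': 'checkout' } }] } },
//         { bool: { must_not: { bool: { filter: [{ exists: { field: 'service.name' } }] } } } },
//       ],
//       minimum_should_match: 1,
//     },
//   },
// ]
```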
```diff
@@ -127,7 +127,8 @@
     "@kbn/server-route-repository-utils",
     "@kbn/core-analytics-browser",
     "@kbn/apm-types",
-    "@kbn/entities-schema"
+    "@kbn/entities-schema",
+    "@kbn/aiops-log-rate-analysis"
   ],
   "exclude": [
     "target/**/*"
```
```diff
@@ -9,6 +9,7 @@ import { EuiFlexGroup, EuiFlexItem } from '@elastic/eui';
 
 import React, { useCallback } from 'react';
 import { i18n } from '@kbn/i18n';
+import { ALERT_RULE_PARAMETERS } from '@kbn/rule-data-utils';
 import dedent from 'dedent';
 import { type AlertDetailsContextualInsight } from '../../../server/services';
 import { useKibana } from '../../utils/kibana_react';
@@ -35,6 +36,12 @@ export function AlertDetailContextualInsights({ alert }: { alert: AlertData | nu
           query: {
             alert_started_at: new Date(alert.formatted.start).toISOString(),
 
+            // alert fields used for log rate analysis
+            alert_rule_parameter_time_size: alert.formatted.fields[ALERT_RULE_PARAMETERS]
+              ?.timeSize as string | undefined,
+            alert_rule_parameter_time_unit: alert.formatted.fields[ALERT_RULE_PARAMETERS]
+              ?.timeUnit as string | undefined,
+
             // service fields
             'service.name': fields['service.name'],
             'service.environment': fields['service.environment'],
```
```diff
@@ -20,6 +20,10 @@ export const alertDetailsContextRt = t.intersection([
     alert_started_at: t.string,
   }),
   t.partial({
+    // alert fields used for log rate analysis
+    alert_rule_parameter_time_size: t.string,
+    alert_rule_parameter_time_unit: t.string,
+
     // apm fields
     'service.name': t.string,
     'service.environment': t.string,
```
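Since the new fields sit in the `t.partial` branch, requests that omit them still validate; a quick decode sketch using standard io-ts/fp-ts (the import path for `alertDetailsContextRt` is abbreviated here, as it is not shown in the diff):

```ts
import { isRight } from 'fp-ts/Either';

// Both decode successfully: the log-rate-analysis fields are optional.
isRight(alertDetailsContextRt.decode({ alert_started_at: '2024-07-08T12:00:00Z' })); // true
isRight(
  alertDetailsContextRt.decode({
    alert_started_at: '2024-07-08T12:00:00Z',
    alert_rule_parameter_time_size: '5',
    alert_rule_parameter_time_unit: 'm',
  })
); // true
```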