mirror of
https://github.com/elastic/kibana.git
synced 2025-04-24 17:59:23 -04:00
[Obs AI Assistant] Improve context for contextual insights components on alert details page (#180766)
This adds an API call to the contextual insights component which should improve the relevance of the responses by providing more context to the LLM. --------- Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com> Co-authored-by: Cauê Marcondes <55978943+cauemarcondes@users.noreply.github.com>
This commit is contained in:
parent
91c8270aea
commit
abec38e337
12 changed files with 670 additions and 405 deletions
|
@ -1,122 +0,0 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import { EuiFlexGroup, EuiFlexItem } from '@elastic/eui';
|
||||
import {
|
||||
createFunctionRequestMessage,
|
||||
createFunctionResponseMessage,
|
||||
} from '@kbn/observability-ai-assistant-plugin/public';
|
||||
import React, { useCallback } from 'react';
|
||||
import dedent from 'dedent';
|
||||
|
||||
import { i18n } from '@kbn/i18n';
|
||||
import { callApmApi } from '../../../../services/rest/create_call_apm_api';
|
||||
import {
|
||||
SERVICE_ENVIRONMENT,
|
||||
SERVICE_NAME,
|
||||
TRANSACTION_NAME,
|
||||
TRANSACTION_TYPE,
|
||||
} from '../../../../../common/es_fields/apm';
|
||||
import { useKibana } from '../../../../context/kibana_context/use_kibana';
|
||||
import { AlertDetailsAppSectionProps } from './types';
|
||||
|
||||
/**
 * Renders the "Help me understand this alert" contextual insight on the alert
 * details page.
 *
 * When the insight is opened, the APM alert-details context is fetched for the
 * alert's service/transaction fields and handed to the AI Assistant as a
 * simulated `get_apm_alert_details_context` function request/response pair.
 *
 * Returns `null` when the Observability AI Assistant plugin does not expose the
 * contextual insight component.
 */
export function AlertDetailContextualInsights({
  alert,
}: {
  alert: AlertDetailsAppSectionProps['alert'];
}) {
  const {
    services: { observabilityAIAssistant },
  } = useKibana();

  const ObservabilityAIAssistantContextualInsight =
    observabilityAIAssistant?.ObservabilityAIAssistantContextualInsight;

  const getPromptMessages = useCallback(async () => {
    const {
      serviceSummary,
      downstreamDependencies,
      logCategories,
      serviceChangePoints,
      exitSpanChangePoints,
      anomalies,
    } = await callApmApi(
      'GET /internal/apm/assistant/get_apm_alert_details_context',
      {
        signal: null,
        params: {
          query: {
            [SERVICE_NAME]: alert.fields[SERVICE_NAME],
            [SERVICE_ENVIRONMENT]: alert.fields[SERVICE_ENVIRONMENT],
            [TRANSACTION_TYPE]: alert.fields[TRANSACTION_TYPE],
            [TRANSACTION_NAME]: alert.fields[TRANSACTION_NAME],
            alert_started_at: new Date(alert.start).toISOString(),
          },
        },
      }
    );

    const serviceName = alert.fields[SERVICE_NAME];
    const serviceEnvironment = alert.fields[SERVICE_ENVIRONMENT];

    // Every payload is serialized with JSON.stringify. Interpolating an
    // object or array directly would coerce it to "[object Object]" and the
    // model would receive no usable data.
    const content = {
      apmAlertContext: dedent(
        `High level information about the service where the alert occurred. Use this as background but do not repeat this information to the user.
        ${JSON.stringify(serviceSummary)}

        Downstream dependencies from the service "${serviceName}". Problems in these services can negatively affect the performance of "${serviceName}":
        ${JSON.stringify(downstreamDependencies)}

        Significant change points for "${serviceName}". Use this to spot dips or spikes in throughput, latency and failure rate.
        ${JSON.stringify(serviceChangePoints)}

        Significant change points for the dependencies of "${serviceName}". Use this to spot dips or spikes in throughput, latency and failure rate for downstream dependencies:
        ${JSON.stringify(exitSpanChangePoints)}

        Log events occurring around the time of the alert. The log messages can sometimes diagnose the root cause of the alert:
        ${JSON.stringify(logCategories)}

        Anomalies for services running in the environment "${serviceEnvironment}"
        ${JSON.stringify(anomalies)}

        Help the user understand the root cause of the alert by using the above information. Suggest actions the user should take to investigate further.
        `
      ),
    };

    return [
      createFunctionRequestMessage({
        name: 'get_apm_alert_details_context',
        args: {},
      }).message,

      createFunctionResponseMessage({
        name: 'get_apm_alert_details_context',
        content,
        data: content,
      }).message,
    ];
  }, [alert]);

  if (!ObservabilityAIAssistantContextualInsight) {
    return null;
  }

  return (
    <EuiFlexGroup direction="column" gutterSize="m">
      <EuiFlexItem grow={false}>
        <ObservabilityAIAssistantContextualInsight
          title={i18n.translate(
            'xpack.apm.alertDetailContextualInsights.InsightButtonLabel',
            { defaultMessage: 'Help me understand this alert' }
          )}
          messages={getPromptMessages}
        />
      </EuiFlexItem>
    </EuiFlexGroup>
  );
}
|
|
@ -1,240 +0,0 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import type { ScopedAnnotationsClient } from '@kbn/observability-plugin/server';
|
||||
import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
|
||||
import type { CoreRequestHandlerContext, Logger } from '@kbn/core/server';
|
||||
import moment from 'moment';
|
||||
import * as t from 'io-ts';
|
||||
import { LatencyAggregationType } from '../../../../common/latency_aggregation_types';
|
||||
import type { MlClient } from '../../../lib/helpers/get_ml_client';
|
||||
import type { APMEventClient } from '../../../lib/helpers/create_es_client/create_apm_event_client';
|
||||
import type { ApmAlertsClient } from '../../../lib/helpers/get_apm_alerts_client';
|
||||
import { getApmServiceSummary } from '../get_apm_service_summary';
|
||||
import { getAssistantDownstreamDependencies } from '../get_apm_downstream_dependencies';
|
||||
import { getLogCategories } from '../get_log_categories';
|
||||
import {
|
||||
ApmTimeseriesType,
|
||||
getApmTimeseries,
|
||||
TimeseriesChangePoint,
|
||||
} from '../get_apm_timeseries';
|
||||
import { getAnomalies } from '../get_apm_service_summary/get_anomalies';
|
||||
|
||||
/** Query fields that must always be supplied for an APM alert. */
const requiredApmAlertFieldsRt = t.type({
  'service.name': t.string,
  alert_started_at: t.string,
});

/** Query fields that narrow the context when they are present on the alert. */
const optionalApmAlertFieldsRt = t.partial({
  'service.environment': t.string,
  'transaction.type': t.string,
  'transaction.name': t.string,

  // alert fields
  'host.name': t.string,
  'container.id': t.string,
});

/**
 * Runtime type for the query parameters accepted by the APM alert details
 * context lookup: the required fields intersected with the optional ones.
 */
export const apmAlertDetailsContextRt = t.intersection([
  requiredApmAlertFieldsRt,
  optionalApmAlertFieldsRt,
]);
|
||||
|
||||
/**
 * Collects the APM signals surrounding an alert so they can be passed to the
 * AI Assistant: service summary, downstream dependencies, log categories,
 * change points for the service and its exit spans, and anomalies.
 *
 * All lookups are started up front and awaited together. Each one ends at
 * `alertStartedAt` and starts a fixed interval before it (5 minutes, 30
 * minutes, 1 hour or 12 hours depending on the signal).
 */
export async function getApmAlertDetailsContext({
  coreContext,
  alertStartedAt,
  annotationsClient,
  apmAlertsClient,
  apmEventClient,
  esClient,
  logger,
  mlClient,
  query,
}: {
  coreContext: CoreRequestHandlerContext;
  annotationsClient?: ScopedAnnotationsClient;
  apmAlertsClient: ApmAlertsClient;
  alertStartedAt: string;
  apmEventClient: APMEventClient;
  esClient: ElasticsearchClient;
  logger: Logger;
  mlClient?: MlClient;
  query: t.TypeOf<typeof apmAlertDetailsContextRt>;
}) {
  // A fresh moment is created on every call because `subtract` mutates.
  const before = (
    amount: number,
    unit: moment.unitOfTime.DurationConstructor
  ) => moment(alertStartedAt).subtract(amount, unit);

  const service = {
    'service.name': query['service.name'],
    'service.environment': query['service.environment'],
  };
  const transaction = {
    'transaction.type': query['transaction.type'],
    'transaction.name': query['transaction.name'],
  };
  const lastFiveMinutes = {
    start: before(5, 'minute').toISOString(),
    end: alertStartedAt,
  };

  const pendingServiceSummary = getApmServiceSummary({
    apmEventClient,
    annotationsClient,
    esClient,
    apmAlertsClient,
    mlClient,
    logger,
    arguments: { ...service, ...lastFiveMinutes },
  });

  const pendingDownstreamDependencies = getAssistantDownstreamDependencies({
    apmEventClient,
    arguments: { ...service, ...lastFiveMinutes },
  });

  const pendingLogCategories = getLogCategories({
    esClient,
    coreContext,
    arguments: {
      ...lastFiveMinutes,
      'service.name': query['service.name'],
      'host.name': query['host.name'],
      'container.id': query['container.id'],
    },
  });

  const pendingServiceTimeseries = getApmTimeseries({
    apmEventClient,
    arguments: {
      start: before(12, 'hours').toISOString(),
      end: alertStartedAt,
      stats: [
        {
          title: 'Latency',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.transactionLatency,
            function: LatencyAggregationType.p95,
            ...transaction,
          },
        },
        {
          title: 'Throughput',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.transactionThroughput,
            ...transaction,
          },
        },
        {
          title: 'Failure rate',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.transactionFailureRate,
            ...transaction,
          },
        },
        {
          title: 'Error events',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.errorEventRate,
          },
        },
      ],
    },
  });

  const pendingExitSpanTimeseries = getApmTimeseries({
    apmEventClient,
    arguments: {
      start: before(30, 'minute').toISOString(),
      end: alertStartedAt,
      stats: [
        {
          title: 'Exit span latency',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.exitSpanLatency,
          },
        },
        {
          title: 'Exit span failure rate',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.exitSpanFailureRate,
          },
        },
      ],
    },
  });

  const pendingAnomalies = getAnomalies({
    start: before(1, 'hour').valueOf(),
    end: moment(alertStartedAt).valueOf(),
    environment: query['service.environment'],
    mlClient,
    logger,
  });

  const [
    serviceSummary,
    downstreamDependencies,
    logCategories,
    serviceTimeseries,
    exitSpanTimeseries,
    anomalies,
  ] = await Promise.all([
    pendingServiceSummary,
    pendingDownstreamDependencies,
    pendingLogCategories,
    pendingServiceTimeseries,
    pendingExitSpanTimeseries,
    pendingAnomalies,
  ]);

  // Reduce each timeseries to the fields that are returned to the caller.
  const toChangePoints = (
    series: Awaited<ReturnType<typeof getApmTimeseries>>
  ): Array<{
    title: string;
    grouping: string;
    changes: TimeseriesChangePoint[];
  }> =>
    series.map(({ stat, id, changes }) => ({
      title: stat.title,
      grouping: id,
      changes,
    }));

  return {
    serviceSummary,
    downstreamDependencies,
    logCategories,
    serviceChangePoints: toChangePoints(serviceTimeseries),
    exitSpanChangePoints: toChangePoints(exitSpanTimeseries),
    anomalies,
  };
}
|
|
@ -140,17 +140,15 @@ export async function getApmServiceSummary({
|
|||
apmAlertsClient.search({
|
||||
size: 100,
|
||||
track_total_hits: false,
|
||||
body: {
|
||||
query: {
|
||||
bool: {
|
||||
filter: [
|
||||
...termQuery(ALERT_RULE_PRODUCER, 'apm'),
|
||||
...termQuery(ALERT_STATUS, ALERT_STATUS_ACTIVE),
|
||||
...rangeQuery(start, end),
|
||||
...termQuery(SERVICE_NAME, serviceName),
|
||||
...environmentQuery(environment),
|
||||
],
|
||||
},
|
||||
query: {
|
||||
bool: {
|
||||
filter: [
|
||||
...termQuery(ALERT_RULE_PRODUCER, 'apm'),
|
||||
...termQuery(ALERT_STATUS, ALERT_STATUS_ACTIVE),
|
||||
...rangeQuery(start, end),
|
||||
...termQuery(SERVICE_NAME, serviceName),
|
||||
...environmentQuery(environment),
|
||||
],
|
||||
},
|
||||
},
|
||||
}),
|
||||
|
|
|
@ -221,7 +221,9 @@ export async function getApmTimeseries({
|
|||
end,
|
||||
unit: statResult.unit,
|
||||
changes: [
|
||||
...(changePointType && changePointType !== 'indeterminable'
|
||||
...(changePointType &&
|
||||
changePointType !== 'indeterminable' &&
|
||||
changePointType !== 'stationary'
|
||||
? [
|
||||
{
|
||||
date: statResult.change_point.bucket?.key,
|
||||
|
|
|
@ -18,7 +18,7 @@ import { getTypedSearch } from '../../../utils/create_typed_es_client';
|
|||
|
||||
export type LogCategories =
|
||||
| Array<{
|
||||
key: string;
|
||||
errorCategory: string;
|
||||
docCount: number;
|
||||
sampleMessage: string;
|
||||
}>
|
||||
|
@ -79,7 +79,9 @@ export async function getLogCategories({
|
|||
query,
|
||||
});
|
||||
const totalDocCount = hitCountRes.hits.total.value;
|
||||
const samplingProbability = Math.min(100_000 / totalDocCount, 1);
|
||||
const rawSamplingProbability = Math.min(100_000 / totalDocCount, 1);
|
||||
const samplingProbability =
|
||||
rawSamplingProbability < 0.5 ? rawSamplingProbability : 1;
|
||||
|
||||
const categorizedLogsRes = await search({
|
||||
index,
|
||||
|
@ -116,7 +118,7 @@ export async function getLogCategories({
|
|||
({ doc_count: docCount, key, sample }) => {
|
||||
const sampleMessage = (sample.hits.hits[0]._source as { message: string })
|
||||
.message;
|
||||
return { key: key as string, docCount, sampleMessage };
|
||||
return { errorCategory: key as string, docCount, sampleMessage };
|
||||
}
|
||||
);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
|
||||
import { CoreRequestHandlerContext } from '@kbn/core-http-request-handler-context-server';
|
||||
import { aiAssistantLogsIndexPattern } from '@kbn/observability-ai-assistant-plugin/common';
|
||||
import {
|
||||
rangeQuery,
|
||||
termQuery,
|
||||
typedSearch,
|
||||
} from '@kbn/observability-plugin/server/utils/queries';
|
||||
import * as t from 'io-ts';
|
||||
import moment from 'moment';
|
||||
import { ApmDocumentType } from '../../../../common/document_type';
|
||||
import { APMEventClient } from '../../../lib/helpers/create_es_client/create_apm_event_client';
|
||||
import { observabilityAlertDetailsContextRt } from '.';
|
||||
import { RollupInterval } from '../../../../common/rollup';
|
||||
|
||||
/**
 * Resolves the container id for an alert.
 *
 * Resolution order:
 *  1. `container.id` from the query, when present.
 *  2. Otherwise, if `service.name` is present, the container id found on trace
 *     data for that service, falling back to log data.
 *
 * Resolves to `undefined` when neither field is available on the query.
 */
export async function getContainerIdFromSignals({
  query,
  esClient,
  coreContext,
  apmEventClient,
}: {
  query: t.TypeOf<typeof observabilityAlertDetailsContextRt>;
  esClient: ElasticsearchClient;
  coreContext: CoreRequestHandlerContext;
  apmEventClient: APMEventClient;
}) {
  const containerIdFromQuery = query['container.id'];
  if (containerIdFromQuery) {
    return containerIdFromQuery;
  }

  if (!query['service.name']) {
    return;
  }

  const containerIdFromTrace = await getContainerIdFromTrace({
    query,
    apmEventClient,
  });

  return containerIdFromTrace
    ? containerIdFromTrace
    : getContainerIdFromLogs({ query, esClient, coreContext });
}
|
||||
|
||||
/**
 * Looks up a `container.id` in log documents emitted by the queried service
 * during the 30 minutes before the alert started.
 *
 * The index pattern is read from the `aiAssistantLogsIndexPattern` UI setting
 * and defaults to `logs-*`.
 *
 * @returns the container id of the first matching document, or `undefined`
 * when nothing matches.
 */
async function getContainerIdFromLogs({
  query,
  esClient,
  coreContext,
}: {
  query: t.TypeOf<typeof observabilityAlertDetailsContextRt>;
  esClient: ElasticsearchClient;
  coreContext: CoreRequestHandlerContext;
}) {
  const index =
    (await coreContext.uiSettings.client.get<string>(
      aiAssistantLogsIndexPattern
    )) ?? 'logs-*';

  // Use epoch milliseconds (`valueOf`), not epoch seconds (`unix`).
  // NOTE(review): `rangeQuery` is understood to filter with
  // `format: 'epoch_millis'`, so second-based bounds would place the window in
  // January 1970 and match nothing — confirm against the helper.
  const start = moment(query.alert_started_at)
    .subtract(30, 'minutes')
    .valueOf();
  const end = moment(query.alert_started_at).valueOf();

  const res = await typedSearch<{ container: { id: string } }, any>(esClient, {
    index,
    _source: ['container.id'],
    // Only one hit is needed, so stop as early as possible.
    terminate_after: 1,
    size: 1,
    track_total_hits: false,
    query: {
      bool: {
        filter: [
          { exists: { field: 'container.id' } },
          ...termQuery('service.name', query['service.name']),
          ...rangeQuery(start, end),
        ],
      },
    },
  });

  return res.hits.hits[0]?._source?.container?.id;
}
|
||||
|
||||
/**
 * Looks up a `container.id` on transaction events of the queried service
 * during the 30 minutes before the alert started.
 *
 * @returns the container id of the first matching transaction, or `undefined`
 * when nothing matches.
 */
async function getContainerIdFromTrace({
  query,
  apmEventClient,
}: {
  query: t.TypeOf<typeof observabilityAlertDetailsContextRt>;
  apmEventClient: APMEventClient;
}) {
  // Use epoch milliseconds (`valueOf`), not epoch seconds (`unix`).
  // NOTE(review): `rangeQuery` is understood to filter with
  // `format: 'epoch_millis'`, so second-based bounds would place the window in
  // January 1970 and match nothing — confirm against the helper.
  const start = moment(query.alert_started_at)
    .subtract(30, 'minutes')
    .valueOf();
  const end = moment(query.alert_started_at).valueOf();

  const res = await apmEventClient.search('get_container_id', {
    apm: {
      sources: [
        {
          documentType: ApmDocumentType.TransactionEvent,
          rollupInterval: RollupInterval.None,
        },
      ],
    },
    body: {
      _source: ['container.id'],
      // Only one hit is needed, so stop as early as possible.
      terminate_after: 1,
      size: 1,
      track_total_hits: false,
      query: {
        bool: {
          filter: [
            { exists: { field: 'container.id' } },
            ...termQuery('service.name', query['service.name']),
            ...rangeQuery(start, end),
          ],
        },
      },
    },
  });

  return res.hits.hits[0]?._source.container?.id;
}
|
|
@ -0,0 +1,126 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
|
||||
import { CoreRequestHandlerContext } from '@kbn/core-http-request-handler-context-server';
|
||||
import { aiAssistantLogsIndexPattern } from '@kbn/observability-ai-assistant-plugin/common';
|
||||
import {
|
||||
rangeQuery,
|
||||
termQuery,
|
||||
typedSearch,
|
||||
} from '@kbn/observability-plugin/server/utils/queries';
|
||||
import * as t from 'io-ts';
|
||||
import moment from 'moment';
|
||||
import { ApmDocumentType } from '../../../../common/document_type';
|
||||
import { APMEventClient } from '../../../lib/helpers/create_es_client/create_apm_event_client';
|
||||
import { observabilityAlertDetailsContextRt } from '.';
|
||||
import { RollupInterval } from '../../../../common/rollup';
|
||||
|
||||
/**
 * Resolves the service name for an alert.
 *
 * Resolution order:
 *  1. `service.name` from the query, when present.
 *  2. Otherwise, if `container.id` is present, the service name found on trace
 *     data for that container, falling back to log data.
 *
 * Resolves to `undefined` when neither field is available on the query.
 */
export async function getServiceNameFromSignals({
  query,
  esClient,
  coreContext,
  apmEventClient,
}: {
  query: t.TypeOf<typeof observabilityAlertDetailsContextRt>;
  esClient: ElasticsearchClient;
  coreContext: CoreRequestHandlerContext;
  apmEventClient: APMEventClient;
}) {
  const serviceNameFromQuery = query['service.name'];
  if (serviceNameFromQuery) {
    return serviceNameFromQuery;
  }

  if (!query['container.id']) {
    return;
  }

  const serviceNameFromTraces = await getServiceNameFromTraces({
    query,
    apmEventClient,
  });

  return serviceNameFromTraces
    ? serviceNameFromTraces
    : getServiceNameFromLogs({ query, esClient, coreContext });
}
|
||||
|
||||
/**
 * Looks up a `service.name` in log documents emitted by the queried container
 * during the 30 minutes before the alert started.
 *
 * The index pattern is read from the `aiAssistantLogsIndexPattern` UI setting
 * and defaults to `logs-*`.
 *
 * @returns the service name of the first matching document, or `undefined`
 * when nothing matches.
 */
async function getServiceNameFromLogs({
  query,
  esClient,
  coreContext,
}: {
  query: t.TypeOf<typeof observabilityAlertDetailsContextRt>;
  esClient: ElasticsearchClient;
  coreContext: CoreRequestHandlerContext;
}) {
  const index =
    (await coreContext.uiSettings.client.get<string>(
      aiAssistantLogsIndexPattern
    )) ?? 'logs-*';

  // Use epoch milliseconds (`valueOf`), not epoch seconds (`unix`).
  // NOTE(review): `rangeQuery` is understood to filter with
  // `format: 'epoch_millis'`, so second-based bounds would place the window in
  // January 1970 and match nothing — confirm against the helper.
  const start = moment(query.alert_started_at)
    .subtract(30, 'minutes')
    .valueOf();
  const end = moment(query.alert_started_at).valueOf();

  const res = await typedSearch<{ service: { name: string } }, any>(esClient, {
    index,
    _source: ['service.name'],
    // Only one hit is needed, so stop as early as possible.
    terminate_after: 1,
    size: 1,
    track_total_hits: false,
    query: {
      bool: {
        filter: [
          { exists: { field: 'service.name' } },
          ...termQuery('container.id', query['container.id']),
          ...rangeQuery(start, end),
        ],
      },
    },
  });

  return res.hits.hits[0]?._source?.service?.name;
}
|
||||
|
||||
/**
 * Looks up a `service.name` on transaction events of the queried container
 * during the 30 minutes before the alert started.
 *
 * @returns the service name of the first matching transaction, or `undefined`
 * when nothing matches.
 */
async function getServiceNameFromTraces({
  query,
  apmEventClient,
}: {
  query: t.TypeOf<typeof observabilityAlertDetailsContextRt>;
  apmEventClient: APMEventClient;
}) {
  // Use epoch milliseconds (`valueOf`), not epoch seconds (`unix`).
  // NOTE(review): `rangeQuery` is understood to filter with
  // `format: 'epoch_millis'`, so second-based bounds would place the window in
  // January 1970 and match nothing — confirm against the helper.
  const start = moment(query.alert_started_at)
    .subtract(30, 'minutes')
    .valueOf();
  const end = moment(query.alert_started_at).valueOf();

  const res = await apmEventClient.search('get_service_name', {
    apm: {
      sources: [
        {
          documentType: ApmDocumentType.TransactionEvent,
          rollupInterval: RollupInterval.None,
        },
      ],
    },
    body: {
      _source: ['service.name'],
      // Only one hit is needed, so stop as early as possible.
      terminate_after: 1,
      size: 1,
      track_total_hits: false,
      query: {
        bool: {
          filter: [
            { exists: { field: 'service.name' } },
            ...termQuery('container.id', query['container.id']),
            ...rangeQuery(start, end),
          ],
        },
      },
    },
  });

  return res.hits.hits[0]?._source.service.name;
}
|
|
@ -0,0 +1,297 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import type { ScopedAnnotationsClient } from '@kbn/observability-plugin/server';
|
||||
import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
|
||||
import type { CoreRequestHandlerContext, Logger } from '@kbn/core/server';
|
||||
import moment from 'moment';
|
||||
import * as t from 'io-ts';
|
||||
import { LatencyAggregationType } from '../../../../common/latency_aggregation_types';
|
||||
import type { MlClient } from '../../../lib/helpers/get_ml_client';
|
||||
import type { APMEventClient } from '../../../lib/helpers/create_es_client/create_apm_event_client';
|
||||
import type { ApmAlertsClient } from '../../../lib/helpers/get_apm_alerts_client';
|
||||
import { getApmServiceSummary } from '../get_apm_service_summary';
|
||||
import { getAssistantDownstreamDependencies } from '../get_apm_downstream_dependencies';
|
||||
import { getLogCategories } from '../get_log_categories';
|
||||
import { ApmTimeseriesType, getApmTimeseries } from '../get_apm_timeseries';
|
||||
import { getAnomalies } from '../get_apm_service_summary/get_anomalies';
|
||||
import { getServiceNameFromSignals } from './get_service_name_from_signals';
|
||||
import { getContainerIdFromSignals } from './get_container_id_from_signals';
|
||||
|
||||
/** The alert start time is the only field every alert must provide. */
const requiredObservabilityAlertFieldsRt = t.type({
  alert_started_at: t.string,
});

/** Signal-specific fields; any subset may be present on a given alert. */
const optionalObservabilityAlertFieldsRt = t.partial({
  // apm fields
  'service.name': t.string,
  'service.environment': t.string,
  'transaction.type': t.string,
  'transaction.name': t.string,

  // infrastructure fields
  'host.name': t.string,
  'container.id': t.string,
});

/**
 * Runtime type for the query parameters accepted by the observability alert
 * details context lookup: the required fields intersected with the optional
 * ones.
 */
export const observabilityAlertDetailsContextRt = t.intersection([
  requiredObservabilityAlertFieldsRt,
  optionalObservabilityAlertFieldsRt,
]);
|
||||
|
||||
/**
 * Collects the observability signals surrounding an alert so they can be
 * passed to the AI Assistant.
 *
 * The service name and container id are resolved first, from the query or from
 * trace/log data. The individual lookups are then started and awaited
 * together. The service summary and downstream dependencies are only requested
 * when a service name could be resolved; otherwise they are `undefined` in the
 * result.
 */
export async function getObservabilityAlertDetailsContext({
  coreContext,
  annotationsClient,
  apmAlertsClient,
  apmEventClient,
  esClient,
  logger,
  mlClient,
  query,
}: {
  coreContext: CoreRequestHandlerContext;
  annotationsClient?: ScopedAnnotationsClient;
  apmAlertsClient: ApmAlertsClient;
  apmEventClient: APMEventClient;
  esClient: ElasticsearchClient;
  logger: Logger;
  mlClient?: MlClient;
  query: t.TypeOf<typeof observabilityAlertDetailsContextRt>;
}) {
  const {
    alert_started_at: alertStartedAt,
    'service.environment': serviceEnvironment,
    'host.name': hostName,
  } = query;

  const signalLookupArgs = { query, esClient, coreContext, apmEventClient };
  const [serviceName, containerId] = await Promise.all([
    getServiceNameFromSignals(signalLookupArgs),
    getContainerIdFromSignals(signalLookupArgs),
  ]);

  const lastFiveMinutes = {
    start: moment(alertStartedAt).subtract(5, 'minute').toISOString(),
    end: alertStartedAt,
  };

  let pendingServiceSummary;
  if (serviceName) {
    pendingServiceSummary = getApmServiceSummary({
      apmEventClient,
      annotationsClient,
      esClient,
      apmAlertsClient,
      mlClient,
      logger,
      arguments: {
        'service.name': serviceName,
        'service.environment': serviceEnvironment,
        ...lastFiveMinutes,
      },
    });
  }

  let pendingDownstreamDependencies;
  if (serviceName) {
    pendingDownstreamDependencies = getAssistantDownstreamDependencies({
      apmEventClient,
      arguments: {
        'service.name': serviceName,
        'service.environment': serviceEnvironment,
        ...lastFiveMinutes,
      },
    });
  }

  const pendingLogCategories = getLogCategories({
    esClient,
    coreContext,
    arguments: {
      ...lastFiveMinutes,
      'service.name': serviceName,
      'host.name': hostName,
      'container.id': containerId,
    },
  });

  const pendingServiceChangePoints = getServiceChangePoints({
    apmEventClient,
    alertStartedAt,
    serviceName,
    serviceEnvironment,
    transactionType: query['transaction.type'],
    transactionName: query['transaction.name'],
  });

  const pendingExitSpanChangePoints = getExitSpanChangePoints({
    apmEventClient,
    alertStartedAt,
    serviceName,
    serviceEnvironment,
  });

  const pendingAnomalies = getAnomalies({
    start: moment(alertStartedAt).subtract(1, 'hour').valueOf(),
    end: moment(alertStartedAt).valueOf(),
    environment: serviceEnvironment,
    mlClient,
    logger,
  });

  const [
    serviceSummary,
    downstreamDependencies,
    logCategories,
    serviceChangePoints,
    exitSpanChangePoints,
    anomalies,
  ] = await Promise.all([
    pendingServiceSummary,
    pendingDownstreamDependencies,
    pendingLogCategories,
    pendingServiceChangePoints,
    pendingExitSpanChangePoints,
    pendingAnomalies,
  ]);

  return {
    serviceSummary,
    downstreamDependencies,
    logCategories,
    serviceChangePoints,
    exitSpanChangePoints,
    anomalies,
  };
}
|
||||
|
||||
/**
 * Fetches latency (p95), throughput, failure rate and error-event timeseries
 * for the service over the 12 hours before the alert and returns only those
 * series that contain at least one change point.
 *
 * Resolves to an empty array when no service name is known.
 */
async function getServiceChangePoints({
  apmEventClient,
  alertStartedAt,
  serviceName,
  serviceEnvironment,
  transactionType,
  transactionName,
}: {
  apmEventClient: APMEventClient;
  alertStartedAt: string;
  serviceName: string | undefined;
  serviceEnvironment: string | undefined;
  transactionType: string | undefined;
  transactionName: string | undefined;
}) {
  if (!serviceName) {
    return [];
  }

  const service = {
    'service.name': serviceName,
    'service.environment': serviceEnvironment,
  };
  const transaction = {
    'transaction.type': transactionType,
    'transaction.name': transactionName,
  };

  const allTimeseries = await getApmTimeseries({
    apmEventClient,
    arguments: {
      start: moment(alertStartedAt).subtract(12, 'hours').toISOString(),
      end: alertStartedAt,
      stats: [
        {
          title: 'Latency',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.transactionLatency,
            function: LatencyAggregationType.p95,
            ...transaction,
          },
        },
        {
          title: 'Throughput',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.transactionThroughput,
            ...transaction,
          },
        },
        {
          title: 'Failure rate',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.transactionFailureRate,
            ...transaction,
          },
        },
        {
          title: 'Error events',
          ...service,
          timeseries: {
            name: ApmTimeseriesType.errorEventRate,
          },
        },
      ],
    },
  });

  // Drop series without change points and keep only the summary fields.
  return allTimeseries.flatMap(({ stat, id, changes }) =>
    changes.length > 0 ? [{ title: stat.title, grouping: id, changes }] : []
  );
}
|
||||
|
||||
/**
 * Fetches exit-span latency and failure-rate timeseries for the service over
 * the 30 minutes before the alert and returns only those series that contain
 * at least one change point.
 *
 * Resolves to an empty array when no service name is known.
 */
async function getExitSpanChangePoints({
  apmEventClient,
  alertStartedAt,
  serviceName,
  serviceEnvironment,
}: {
  apmEventClient: APMEventClient;
  alertStartedAt: string;
  serviceName: string | undefined;
  serviceEnvironment: string | undefined;
}) {
  if (!serviceName) {
    return [];
  }

  const service = {
    'service.name': serviceName,
    'service.environment': serviceEnvironment,
  };

  const allTimeseries = await getApmTimeseries({
    apmEventClient,
    arguments: {
      start: moment(alertStartedAt).subtract(30, 'minute').toISOString(),
      end: alertStartedAt,
      stats: [
        {
          title: 'Exit span latency',
          ...service,
          timeseries: { name: ApmTimeseriesType.exitSpanLatency },
        },
        {
          title: 'Exit span failure rate',
          ...service,
          timeseries: { name: ApmTimeseriesType.exitSpanFailureRate },
        },
      ],
    },
  });

  // Drop series without change points and keep only the summary fields.
  return allTimeseries.flatMap(({ stat, id, changes }) =>
    changes.length > 0 ? [{ title: stat.title, grouping: id, changes }] : []
  );
}
|
|
@ -12,9 +12,9 @@ import { getMlClient } from '../../lib/helpers/get_ml_client';
|
|||
import { getRandomSampler } from '../../lib/helpers/get_random_sampler';
|
||||
import { createApmServerRoute } from '../apm_routes/create_apm_server_route';
|
||||
import {
|
||||
apmAlertDetailsContextRt,
|
||||
getApmAlertDetailsContext,
|
||||
} from './get_apm_alert_details_context';
|
||||
observabilityAlertDetailsContextRt,
|
||||
getObservabilityAlertDetailsContext,
|
||||
} from './get_observability_alert_details_context';
|
||||
|
||||
import {
|
||||
downstreamDependenciesRouteRt,
|
||||
|
@ -31,20 +31,20 @@ import {
|
|||
} from './get_apm_timeseries';
|
||||
import { LogCategories } from './get_log_categories';
|
||||
|
||||
const getApmAlertDetailsContextRoute = createApmServerRoute({
|
||||
endpoint: 'GET /internal/apm/assistant/get_apm_alert_details_context',
|
||||
const getObservabilityAlertDetailsContextRoute = createApmServerRoute({
|
||||
endpoint: 'GET /internal/apm/assistant/get_obs_alert_details_context',
|
||||
options: {
|
||||
tags: ['access:apm', 'access:ai_assistant'],
|
||||
},
|
||||
|
||||
params: t.type({
|
||||
query: apmAlertDetailsContextRt,
|
||||
query: observabilityAlertDetailsContextRt,
|
||||
}),
|
||||
handler: async (
|
||||
resources
|
||||
): Promise<{
|
||||
serviceSummary: ServiceSummary;
|
||||
downstreamDependencies: APMDownstreamDependency[];
|
||||
serviceSummary?: ServiceSummary;
|
||||
downstreamDependencies?: APMDownstreamDependency[];
|
||||
logCategories: LogCategories;
|
||||
serviceChangePoints: Array<{
|
||||
title: string;
|
||||
|
@ -59,8 +59,6 @@ const getApmAlertDetailsContextRoute = createApmServerRoute({
|
|||
const { context, request, plugins, logger, params } = resources;
|
||||
const { query } = params;
|
||||
|
||||
const alertStartedAt = query.alert_started_at;
|
||||
|
||||
const [
|
||||
apmEventClient,
|
||||
annotationsClient,
|
||||
|
@ -81,9 +79,8 @@ const getApmAlertDetailsContextRoute = createApmServerRoute({
|
|||
]);
|
||||
const esClient = coreContext.elasticsearch.client.asCurrentUser;
|
||||
|
||||
return getApmAlertDetailsContext({
|
||||
return getObservabilityAlertDetailsContext({
|
||||
coreContext,
|
||||
alertStartedAt,
|
||||
annotationsClient,
|
||||
apmAlertsClient,
|
||||
apmEventClient,
|
||||
|
@ -152,6 +149,6 @@ const getDownstreamDependenciesRoute = createApmServerRoute({
|
|||
|
||||
export const assistantRouteRepository = {
|
||||
...getApmTimeSeriesRoute,
|
||||
...getApmAlertDetailsContextRoute,
|
||||
...getObservabilityAlertDetailsContextRoute,
|
||||
...getDownstreamDependenciesRoute,
|
||||
};
|
||||
|
|
|
@ -112,7 +112,8 @@
|
|||
"@kbn/shared-svg",
|
||||
"@kbn/shared-ux-utility",
|
||||
"@kbn/management-settings-components-field-row",
|
||||
"@kbn/shared-ux-markdown"
|
||||
"@kbn/shared-ux-markdown",
|
||||
"@kbn/core-http-request-handler-context-server"
|
||||
],
|
||||
"exclude": ["target/**/*"]
|
||||
}
|
||||
|
|
|
@ -274,14 +274,10 @@ export function getScreenDescription(alertDetail: AlertData) {
|
|||
: ''
|
||||
}
|
||||
|
||||
The alert details are:
|
||||
Use the following alert fields as background information for generating a response. Do not list them as bullet points in the response.
|
||||
${Object.entries(getRelevantAlertFields(alertDetail))
|
||||
.map(([key, value]) => `${key}: ${JSON.stringify(value)}`)
|
||||
.join('\n')}
|
||||
|
||||
Do not repeat this information to the user, unless it is relevant for them to know.
|
||||
Please suggestion root causes if possible.
|
||||
Suggest next steps for the user to take.
|
||||
`);
|
||||
}
|
||||
|
||||
|
|
|
@ -7,34 +7,116 @@
|
|||
|
||||
import { EuiFlexGroup, EuiFlexItem } from '@elastic/eui';
|
||||
|
||||
import React, { useMemo } from 'react';
|
||||
import React, { useCallback } from 'react';
|
||||
import { i18n } from '@kbn/i18n';
|
||||
import dedent from 'dedent';
|
||||
import { isEmpty } from 'lodash';
|
||||
import { useKibana } from '../../utils/kibana_react';
|
||||
import { AlertData } from '../../hooks/use_fetch_alert_detail';
|
||||
|
||||
export function AlertDetailContextualInsights({ alert }: { alert: AlertData | null }) {
|
||||
const {
|
||||
services: { observabilityAIAssistant },
|
||||
services: { observabilityAIAssistant, http },
|
||||
} = useKibana();
|
||||
|
||||
const ObservabilityAIAssistantContextualInsight =
|
||||
observabilityAIAssistant?.ObservabilityAIAssistantContextualInsight;
|
||||
|
||||
const messages = useMemo(() => {
|
||||
if (!observabilityAIAssistant) {
|
||||
return null;
|
||||
const getPromptMessages = useCallback(async () => {
|
||||
const fields = alert?.formatted.fields as Record<string, string> | undefined;
|
||||
if (!observabilityAIAssistant || !fields || !alert) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const res = await http.get('/internal/apm/assistant/get_obs_alert_details_context', {
|
||||
query: {
|
||||
alert_started_at: new Date(alert.formatted.start).toISOString(),
|
||||
|
||||
// service fields
|
||||
'service.name': fields['service.name'],
|
||||
'service.environment': fields['service.environment'],
|
||||
'transaction.type': fields['transaction.type'],
|
||||
'transaction.name': fields['transaction.name'],
|
||||
|
||||
// infra fields
|
||||
'host.name': fields['host.name'],
|
||||
'container.id': fields['container.id'],
|
||||
},
|
||||
});
|
||||
|
||||
const {
|
||||
serviceSummary,
|
||||
downstreamDependencies,
|
||||
logCategories,
|
||||
serviceChangePoints,
|
||||
exitSpanChangePoints,
|
||||
anomalies,
|
||||
} = res as any;
|
||||
|
||||
const serviceName = fields['service.name'];
|
||||
const serviceEnvironment = fields['service.environment'];
|
||||
|
||||
const obsAlertContext = `${
|
||||
!isEmpty(serviceSummary)
|
||||
? `Metadata for the service where the alert occurred:
|
||||
${JSON.stringify(serviceSummary, null, 2)}`
|
||||
: ''
|
||||
}
|
||||
|
||||
${
|
||||
!isEmpty(downstreamDependencies)
|
||||
? `Downstream dependencies from the service "${serviceName}". Problems in these services can negatively affect the performance of "${serviceName}":
|
||||
${JSON.stringify(downstreamDependencies, null, 2)}`
|
||||
: ''
|
||||
}
|
||||
|
||||
${
|
||||
!isEmpty(serviceChangePoints)
|
||||
? `Significant change points for "${serviceName}". Use this to spot dips and spikes in throughput, latency and failure rate:
|
||||
${JSON.stringify(serviceChangePoints, null, 2)}`
|
||||
: ''
|
||||
}
|
||||
|
||||
${
|
||||
!isEmpty(exitSpanChangePoints)
|
||||
? `Significant change points for the dependencies of "${serviceName}". Use this to spot dips or spikes in throughput, latency and failure rate for downstream dependencies:
|
||||
${JSON.stringify(exitSpanChangePoints, null, 2)}`
|
||||
: ''
|
||||
}
|
||||
|
||||
${
|
||||
!isEmpty(logCategories)
|
||||
? `Log events occurring around the time of the alert:
|
||||
${JSON.stringify(logCategories, null, 2)}`
|
||||
: ''
|
||||
}
|
||||
|
||||
${
|
||||
!isEmpty(anomalies)
|
||||
? `Anomalies for services running in the environment "${serviceEnvironment}":
|
||||
${anomalies}`
|
||||
: ''
|
||||
}
|
||||
`;
|
||||
|
||||
return observabilityAIAssistant.getContextualInsightMessages({
|
||||
message: `I'm looking at an alert and trying to understand why it was triggered`,
|
||||
instructions: dedent(
|
||||
`I'm an SRE. I am looking at an alert that was triggered. I want to understand why it was triggered, what it means, and what I should do next.`
|
||||
`I'm an SRE. I am looking at an alert that was triggered. I want to understand why it was triggered, what it means, and what I should do next.
|
||||
|
||||
The following contextual information is available to help me understand the alert:
|
||||
${obsAlertContext}
|
||||
|
||||
Be brief and to the point.
|
||||
Do not list the alert details as bullet points.
|
||||
Do refer to the contextual information provided above when relevant.
|
||||
Pay specific attention to why the alert happened and what may have contributed to it.
|
||||
`
|
||||
),
|
||||
});
|
||||
}, [observabilityAIAssistant]);
|
||||
}, [alert, http, observabilityAIAssistant]);
|
||||
|
||||
if (!ObservabilityAIAssistantContextualInsight || !messages) {
|
||||
if (!ObservabilityAIAssistantContextualInsight) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
@ -46,7 +128,7 @@ export function AlertDetailContextualInsights({ alert }: { alert: AlertData | nu
|
|||
'xpack.observability.alertDetailContextualInsights.InsightButtonLabel',
|
||||
{ defaultMessage: 'Help me understand this alert' }
|
||||
)}
|
||||
messages={messages}
|
||||
messages={getPromptMessages}
|
||||
/>
|
||||
</EuiFlexItem>
|
||||
</EuiFlexGroup>
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue