[APM] Co-locate data fetcher and return format for AI Assistant Alert insights (#186971)

This co-locates data fetchers with their return values. Previously, all data fetching happened at once and the results were only later joined with their return values. This makes it easier to make changes and add new data fetchers.
This commit is contained in:
Søren Louv-Jansen 2024-07-02 10:00:23 +02:00 committed by GitHub
parent 1fa094b7ac
commit e5c1b2596b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 182 additions and 167 deletions

View file

@ -8,7 +8,7 @@ import { castArray } from 'lodash';
import React, { TableHTMLAttributes } from 'react';
import { EuiTable, EuiTableProps, EuiTableBody, EuiTableRow, EuiTableRowCell } from '@elastic/eui';
import { FormattedValue } from './formatted_value';
import { KeyValuePair } from '../../../utils/flatten_object';
import { KeyValuePair } from '../../../../common/utils/flatten_object';
export function KeyValueTable({
keyValuePairs,

View file

@ -11,7 +11,7 @@ import React from 'react';
import { euiStyled } from '@kbn/kibana-react-plugin/common';
import { Stackframe } from '../../../../typings/es_schemas/raw/fields/stackframe';
import { KeyValueTable } from '../key_value_table';
import { flattenObject } from '../../../utils/flatten_object';
import { flattenObject } from '../../../../common/utils/flatten_object';
const VariablesContainer = euiStyled.div`
background: ${({ theme }) => theme.eui.euiColorEmptyShade};

View file

@ -9,15 +9,9 @@ import datemath from '@elastic/datemath';
import { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import type { CoreRequestHandlerContext } from '@kbn/core/server';
import { aiAssistantLogsIndexPattern } from '@kbn/observability-ai-assistant-plugin/server';
import { flattenObject, KeyValuePair } from '../../../../common/utils/flatten_object';
import { APMEventClient } from '../../../lib/helpers/create_es_client/create_apm_event_client';
import {
SERVICE_NAME,
CONTAINER_ID,
HOST_NAME,
KUBERNETES_POD_NAME,
PROCESSOR_EVENT,
TRACE_ID,
} from '../../../../common/es_fields/apm';
import { PROCESSOR_EVENT, TRACE_ID } from '../../../../common/es_fields/apm';
import { getTypedSearch } from '../../../utils/create_typed_es_client';
import { getDownstreamServiceResource } from '../get_observability_alert_details_context/get_downstream_dependency_name';
@ -40,24 +34,25 @@ export async function getLogCategories({
arguments: {
start: string;
end: string;
'service.name'?: string;
'host.name'?: string;
'container.id'?: string;
'kubernetes.pod.name'?: string;
entities: {
'service.name'?: string;
'host.name'?: string;
'container.id'?: string;
'kubernetes.pod.name'?: string;
};
};
}): Promise<LogCategory[] | undefined> {
}): Promise<{
logCategories: LogCategory[];
entities: KeyValuePair[];
}> {
const start = datemath.parse(args.start)?.valueOf()!;
const end = datemath.parse(args.end)?.valueOf()!;
const keyValueFilters = getShouldMatchOrNotExistFilter([
{ field: SERVICE_NAME, value: args[SERVICE_NAME] },
{ field: CONTAINER_ID, value: args[CONTAINER_ID] },
{ field: HOST_NAME, value: args[HOST_NAME] },
{ field: KUBERNETES_POD_NAME, value: args[KUBERNETES_POD_NAME] },
]);
const keyValueFilters = getShouldMatchOrNotExistFilter(
Object.entries(args.entities).map(([key, value]) => ({ field: key, value }))
);
const index = await coreContext.uiSettings.client.get<string>(aiAssistantLogsIndexPattern);
const search = getTypedSearch(esClient);
const query = {
@ -93,7 +88,8 @@ export async function getLogCategories({
const categorizedLogsRes = await search({
index,
size: 0,
size: 1,
_source: Object.keys(args.entities),
track_total_hits: 0,
query,
aggs: {
@ -144,7 +140,12 @@ export async function getLogCategories({
}
);
return Promise.all(promises ?? []);
const sampleDoc = categorizedLogsRes.hits.hits?.[0]?._source as Record<string, string>;
return {
logCategories: await Promise.all(promises ?? []),
entities: flattenObject(sampleDoc),
};
}
// field/value pairs should match, or the field should not exist

View file

@ -6,9 +6,9 @@
*/
import { Logger } from '@kbn/core/server';
import {
AlertDetailsContextualInsightsHandlerQuery,
AlertDetailsContextualInsightsRequestContext,
import type {
AlertDetailsContextualInsight,
AlertDetailsContextualInsightsHandler,
} from '@kbn/observability-plugin/server/services';
import moment from 'moment';
import { isEmpty } from 'lodash';
@ -18,8 +18,11 @@ import { getApmEventClient } from '../../../lib/helpers/get_apm_event_client';
import { getMlClient } from '../../../lib/helpers/get_ml_client';
import { getRandomSampler } from '../../../lib/helpers/get_random_sampler';
import { getApmServiceSummary } from '../get_apm_service_summary';
import { getAssistantDownstreamDependencies } from '../get_apm_downstream_dependencies';
import { getLogCategories } from '../get_log_categories';
import {
APMDownstreamDependency,
getAssistantDownstreamDependencies,
} from '../get_apm_downstream_dependencies';
import { getLogCategories, LogCategory } from '../get_log_categories';
import { getAnomalies } from '../get_apm_service_summary/get_anomalies';
import { getServiceNameFromSignals } from './get_service_name_from_signals';
import { getContainerIdFromSignals } from './get_container_id_from_signals';
@ -30,11 +33,8 @@ import { getApmErrors } from './get_apm_errors';
export const getAlertDetailsContextHandler = (
resourcePlugins: APMRouteHandlerResources['plugins'],
logger: Logger
) => {
return async (
requestContext: AlertDetailsContextualInsightsRequestContext,
query: AlertDetailsContextualInsightsHandlerQuery
) => {
): AlertDetailsContextualInsightsHandler => {
return async (requestContext, query) => {
const resources = {
getApmIndices: async () => {
const coreContext = await requestContext.core;
@ -91,6 +91,7 @@ export const getAlertDetailsContextHandler = (
const serviceEnvironment = query['service.environment'];
const hostName = query['host.name'];
const kubernetesPodName = query['kubernetes.pod.name'];
const [serviceName, containerId] = await Promise.all([
getServiceNameFromSignals({
query,
@ -106,80 +107,132 @@ export const getAlertDetailsContextHandler = (
}),
]);
async function handleError<T>(cb: () => Promise<T>): Promise<T | undefined> {
try {
return await cb();
} catch (error) {
logger.error('Error while fetching observability alert details context');
logger.error(error);
return;
}
const downstreamDependenciesPromise = serviceName
? getAssistantDownstreamDependencies({
apmEventClient,
arguments: {
'service.name': serviceName,
'service.environment': serviceEnvironment,
start: moment(alertStartedAt).subtract(24, 'hours').toISOString(),
end: alertStartedAt,
},
randomSampler,
})
: undefined;
const dataFetchers: Array<() => Promise<AlertDetailsContextualInsight>> = [];
// service summary
if (serviceName) {
dataFetchers.push(async () => {
const serviceSummary = await getApmServiceSummary({
apmEventClient,
annotationsClient,
esClient,
apmAlertsClient,
mlClient,
logger,
arguments: {
'service.name': serviceName,
'service.environment': serviceEnvironment,
start: moment(alertStartedAt).subtract(5, 'minute').toISOString(),
end: alertStartedAt,
},
});
return {
key: 'serviceSummary',
description: `Metadata for the service "${serviceName}" that produced the alert. The alert might be caused by an issue in the service itself or one of its dependencies.`,
data: serviceSummary,
};
});
}
const serviceSummaryPromise = serviceName
? handleError(() =>
getApmServiceSummary({
apmEventClient,
annotationsClient,
esClient,
apmAlertsClient,
mlClient,
logger,
arguments: {
'service.name': serviceName,
'service.environment': serviceEnvironment,
start: moment(alertStartedAt).subtract(5, 'minute').toISOString(),
end: alertStartedAt,
},
})
)
: undefined;
// downstream dependencies
if (serviceName) {
dataFetchers.push(async () => {
const downstreamDependencies = await downstreamDependenciesPromise;
return {
key: 'downstreamDependencies',
description: `Downstream dependencies from the service "${serviceName}". Problems in these services can negatively affect the performance of "${serviceName}"`,
data: downstreamDependencies,
};
});
}
const downstreamDependenciesPromise = serviceName
? handleError(() =>
getAssistantDownstreamDependencies({
apmEventClient,
arguments: {
'service.name': serviceName,
'service.environment': serviceEnvironment,
start: moment(alertStartedAt).subtract(24, 'hours').toISOString(),
end: alertStartedAt,
},
randomSampler,
})
)
: undefined;
const logCategoriesPromise = handleError(() =>
getLogCategories({
// log categories
dataFetchers.push(async () => {
const downstreamDependencies = await downstreamDependenciesPromise;
const { logCategories, entities } = await getLogCategories({
apmEventClient,
esClient,
coreContext,
arguments: {
start: moment(alertStartedAt).subtract(15, 'minute').toISOString(),
end: alertStartedAt,
'service.name': serviceName,
'host.name': hostName,
'container.id': containerId,
'kubernetes.pod.name': kubernetesPodName,
entities: {
'service.name': serviceName,
'host.name': hostName,
'container.id': containerId,
'kubernetes.pod.name': kubernetesPodName,
},
},
})
);
});
const apmErrorsPromise = serviceName
? handleError(() =>
getApmErrors({
apmEventClient,
start: moment(alertStartedAt).subtract(15, 'minute').toISOString(),
end: alertStartedAt,
serviceName,
serviceEnvironment,
})
)
: undefined;
const entitiesAsString = entities.map(({ key, value }) => `${key}:${value}`).join(', ');
const serviceChangePointsPromise = handleError(() =>
getServiceChangePoints({
return {
key: 'logCategories',
description: `Log events occurring up to 15 minutes before the alert was triggered. Filtered by the entities: ${entitiesAsString}`,
data: logCategoriesWithDownstreamServiceName(logCategories, downstreamDependencies),
};
});
// apm errors
if (serviceName) {
dataFetchers.push(async () => {
const apmErrors = await getApmErrors({
apmEventClient,
start: moment(alertStartedAt).subtract(15, 'minute').toISOString(),
end: alertStartedAt,
serviceName,
serviceEnvironment,
});
const downstreamDependencies = await downstreamDependenciesPromise;
const errorsWithDownstreamServiceName = getApmErrorsWithDownstreamServiceName(
apmErrors,
downstreamDependencies
);
return {
key: 'apmErrors',
description: `Exceptions (errors) thrown by the service "${serviceName}". If an error contains a downstream service name this could be a possible root cause. If relevant please describe what the error means and what it could be caused by.`,
data: errorsWithDownstreamServiceName,
};
});
}
// exit span change points
dataFetchers.push(async () => {
const exitSpanChangePoints = await getExitSpanChangePoints({
apmEventClient,
start: moment(alertStartedAt).subtract(6, 'hours').toISOString(),
end: alertStartedAt,
serviceName,
serviceEnvironment,
});
return {
key: 'exitSpanChangePoints',
description: `Significant change points for the dependencies of "${serviceName}". Use this to spot dips or spikes in throughput, latency and failure rate for downstream dependencies`,
data: exitSpanChangePoints,
};
});
// service change points
dataFetchers.push(async () => {
const serviceChangePoints = await getServiceChangePoints({
apmEventClient,
start: moment(alertStartedAt).subtract(6, 'hours').toISOString(),
end: alertStartedAt,
@ -187,88 +240,49 @@ export const getAlertDetailsContextHandler = (
serviceEnvironment,
transactionType: query['transaction.type'],
transactionName: query['transaction.name'],
})
);
});
const exitSpanChangePointsPromise = handleError(() =>
getExitSpanChangePoints({
apmEventClient,
start: moment(alertStartedAt).subtract(6, 'hours').toISOString(),
end: alertStartedAt,
serviceName,
serviceEnvironment,
})
);
return {
key: 'serviceChangePoints',
description: `Significant change points for "${serviceName}". Use this to spot dips and spikes in throughput, latency and failure rate`,
data: serviceChangePoints,
};
});
const anomaliesPromise = handleError(() =>
getAnomalies({
// Anomalies
dataFetchers.push(async () => {
const anomalies = await getAnomalies({
start: moment(alertStartedAt).subtract(1, 'hour').valueOf(),
end: moment(alertStartedAt).valueOf(),
environment: serviceEnvironment,
mlClient,
logger,
})
);
});
const [
serviceSummary,
downstreamDependencies,
logCategories,
apmErrors,
serviceChangePoints,
exitSpanChangePoints,
anomalies,
] = await Promise.all([
serviceSummaryPromise,
downstreamDependenciesPromise,
logCategoriesPromise,
apmErrorsPromise,
serviceChangePointsPromise,
exitSpanChangePointsPromise,
anomaliesPromise,
]);
return [
{
key: 'serviceSummary',
description: `Metadata for the service "${serviceName}" that produced the alert. The alert might be caused by an issue in the service itself or one of its dependencies.`,
data: serviceSummary,
},
{
key: 'downstreamDependencies',
description: `Downstream dependencies from the service "${serviceName}". Problems in these services can negatively affect the performance of "${serviceName}"`,
data: downstreamDependencies,
},
{
key: 'serviceChangePoints',
description: `Significant change points for "${serviceName}". Use this to spot dips and spikes in throughput, latency and failure rate`,
data: serviceChangePoints,
},
{
key: 'exitSpanChangePoints',
description: `Significant change points for the dependencies of "${serviceName}". Use this to spot dips or spikes in throughput, latency and failure rate for downstream dependencies`,
data: exitSpanChangePoints,
},
{
key: 'logCategories',
description: `Related log events occurring shortly before the alert was triggered.`,
data: logCategoriesWithDownstreamServiceName(logCategories, downstreamDependencies),
},
{
key: 'apmErrors',
description: `Exceptions for the service "${serviceName}". If a downstream service name is included this could be a possible root cause. If relevant please describe what the error means and what it could be caused by.`,
data: apmErrorsWithDownstreamServiceName(apmErrors, downstreamDependencies),
},
{
return {
key: 'anomalies',
description: `Anomalies for services running in the environment "${serviceEnvironment}". Anomalies are detected using machine learning and can help you spot unusual patterns in your data.`,
data: anomalies,
},
].filter(({ data }) => !isEmpty(data));
};
});
const items = await Promise.all(
dataFetchers.map(async (dataFetcher) => {
try {
return await dataFetcher();
} catch (error) {
logger.error('Error while fetching observability alert details context');
logger.error(error);
return;
}
})
);
return items.filter((item) => item && !isEmpty(item.data)) as AlertDetailsContextualInsight[];
};
};
function apmErrorsWithDownstreamServiceName(
function getApmErrorsWithDownstreamServiceName(
apmErrors?: Awaited<ReturnType<typeof getApmErrors>>,
downstreamDependencies?: Awaited<ReturnType<typeof getAssistantDownstreamDependencies>>
) {
@ -290,8 +304,8 @@ function apmErrorsWithDownstreamServiceName(
}
function logCategoriesWithDownstreamServiceName(
logCategories?: Awaited<ReturnType<typeof getLogCategories>>,
downstreamDependencies?: Awaited<ReturnType<typeof getAssistantDownstreamDependencies>>
logCategories?: LogCategory[],
downstreamDependencies?: APMDownstreamDependency[]
) {
return logCategories?.map(
({ errorCategory, docCount, sampleMessage, downstreamServiceResource }) => {

View file

@ -57,7 +57,7 @@ export interface AlertDetailsContextualInsightsRequestContext {
}>;
licensing: Promise<LicensingApiRequestHandlerContext>;
}
type AlertDetailsContextualInsightsHandler = (
export type AlertDetailsContextualInsightsHandler = (
context: AlertDetailsContextualInsightsRequestContext,
query: AlertDetailsContextualInsightsHandlerQuery
) => Promise<AlertDetailsContextualInsight[]>;