[8.x] [RCA] AI-assisted root cause analysis (#197200) (#203767)

# Backport

This will backport the following commits from `main` to `8.x`:
- [[RCA] AI-assisted root cause analysis
(#197200)](https://github.com/elastic/kibana/pull/197200)

<!--- Backport version: 7.3.2 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT {commits} BACKPORT-->
Dario Gieselaar 2024-12-12 15:00:12 +01:00 committed by GitHub
parent 10e01b4be4
commit b3ba62a972
144 changed files with 27287 additions and 358 deletions


@@ -919,6 +919,7 @@ module.exports = {
'x-pack/plugins/observability_solution/exploratory_view/**/*.{js,mjs,ts,tsx}',
'x-pack/plugins/observability_solution/ux/**/*.{js,mjs,ts,tsx}',
'x-pack/plugins/observability_solution/slo/**/*.{js,mjs,ts,tsx}',
'x-pack/packages/observability/**/*.{js,mjs,ts,tsx}',
],
rules: {
'no-console': ['warn', { allow: ['error'] }],
@@ -938,6 +939,7 @@ module.exports = {
'x-pack/plugins/observability_solution/observability/**/*.stories.*',
'x-pack/plugins/observability_solution/exploratory_view/**/*.stories.*',
'x-pack/plugins/observability_solution/slo/**/*.stories.*',
'x-pack/packages/observability/**/*.{js,mjs,ts,tsx}',
],
rules: {
'react/function-component-definition': [

.github/CODEOWNERS

@@ -662,6 +662,8 @@ packages/kbn-object-versioning-utils @elastic/appex-sharedux
x-pack/plugins/observability_solution/observability_ai_assistant_app @elastic/obs-ai-assistant
x-pack/plugins/observability_solution/observability_ai_assistant_management @elastic/obs-ai-assistant
x-pack/plugins/observability_solution/observability_ai_assistant @elastic/obs-ai-assistant
x-pack/packages/observability/observability_ai/observability_ai_common @elastic/obs-ai-assistant
x-pack/packages/observability/observability_ai/observability_ai_server @elastic/obs-ai-assistant
x-pack/packages/observability/alert_details @elastic/obs-ux-management-team
x-pack/packages/observability/alerting_rule_utils @elastic/obs-ux-management-team
x-pack/packages/observability/alerting_test_data @elastic/obs-ux-management-team


@@ -693,6 +693,8 @@
"@kbn/observability-ai-assistant-app-plugin": "link:x-pack/plugins/observability_solution/observability_ai_assistant_app",
"@kbn/observability-ai-assistant-management-plugin": "link:x-pack/plugins/observability_solution/observability_ai_assistant_management",
"@kbn/observability-ai-assistant-plugin": "link:x-pack/plugins/observability_solution/observability_ai_assistant",
"@kbn/observability-ai-common": "link:x-pack/packages/observability/observability_ai/observability_ai_common",
"@kbn/observability-ai-server": "link:x-pack/packages/observability/observability_ai/observability_ai_server",
"@kbn/observability-alert-details": "link:x-pack/packages/observability/alert_details",
"@kbn/observability-alerting-rule-utils": "link:x-pack/packages/observability/alerting_rule_utils",
"@kbn/observability-alerting-test-data": "link:x-pack/packages/observability/alerting_test_data",
@@ -1143,6 +1145,7 @@
"fnv-plus": "^1.3.1",
"formik": "^2.4.6",
"fp-ts": "^2.3.1",
"fuse.js": "^7.0.0",
"get-port": "^5.0.0",
"getopts": "^2.2.5",
"getos": "^3.1.0",


@@ -13,6 +13,7 @@ export type {
SearchHit,
ESSearchResponse,
ESSearchRequest,
ESSearchRequestWithoutBody,
ESSourceOptions,
InferSearchResponseOf,
AggregationResultOf,


@@ -8,6 +8,7 @@
*/
import * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import * as estypesWithoutBody from '@elastic/elasticsearch/lib/api/types';
import type {
Field,
QueryDslFieldAndFormat,
@@ -26,6 +27,7 @@ import {
export type ESFilter = estypes.QueryDslQueryContainer;
export type ESSearchRequest = estypes.SearchRequest;
export type ESSearchRequestWithoutBody = estypesWithoutBody.SearchRequest;
export type AggregationOptionsByType = Required<estypes.AggregationsAggregationContainer>;
// Typings for Elasticsearch queries and aggregations. These are intended to be
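For context, the two `SearchRequest` typings imported above differ only in where the search body lives; a minimal sketch (the types come straight from the `@elastic/elasticsearch` v8 client, as in the imports in this hunk):

```ts
import type { SearchRequest as WithBodyKey } from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import type { SearchRequest as WithoutBodyKey } from '@elastic/elasticsearch/lib/api/types';

// Legacy shape: the query is nested under `body`.
const legacy: WithBodyKey = { index: 'logs-*', body: { query: { match_all: {} } } };

// Body-less shape: the same fields are inlined at the top level.
const inline: WithoutBodyKey = { index: 'logs-*', query: { match_all: {} } };
```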


@@ -23,20 +23,15 @@ type InvalidAggregationRequest = unknown;
// Union keys are not included in keyof, but extends iterates over the types in a union.
type ValidAggregationKeysOf<T extends Record<string, any>> = T extends T ? keyof T : never;
type KeyOfSource<T> = Record<
keyof T,
(T extends Record<string, { terms: { missing_bucket: true } }> ? null : never) | string | number
>;
type KeyOfSource<T> = {
[key in keyof T]:
| (T[key] extends Record<string, { terms: { missing_bucket: true } }> ? null : never)
| string
| number;
};
type KeysOfSources<T extends any[]> = T extends [any]
? KeyOfSource<T[0]>
: T extends [any, any]
? KeyOfSource<T[0]> & KeyOfSource<T[1]>
: T extends [any, any, any]
? KeyOfSource<T[0]> & KeyOfSource<T[1]> & KeyOfSource<T[2]>
: T extends [any, any, any, any]
? KeyOfSource<T[0]> & KeyOfSource<T[1]> & KeyOfSource<T[2]> & KeyOfSource<T[3]>
: Record<string, null | string | number>;
// convert to intersection to be able to get all the keys
type KeysOfSources<T extends any[]> = UnionToIntersection<KeyOfSource<ValuesType<Pick<T, number>>>>;
type CompositeKeysOf<TAggregationContainer extends AggregationsAggregationContainer> =
TAggregationContainer extends {
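For context, the rewrite above swaps the hand-unrolled tuple cases for the `UnionToIntersection`/`ValuesType` helpers from the `utility-types` package. A minimal sketch of why the intersection trick recovers every key; the local alias below is illustrative, not the package source:

```ts
// Illustrative re-definition of utility-types' UnionToIntersection.
type UnionToIntersection<U> = (U extends any ? (x: U) => void : never) extends (
  x: infer I
) => void
  ? I
  : never;

type A = { 'service.name': string | null };
type B = { 'host.name': string | null };

// ValuesType<Pick<[A, B], number>> yields A | B, a union of the tuple's
// element types...
type Union = A | B;
// ...and the intersection merges all composite keys into one record:
type AllKeys = UnionToIntersection<Union>; // A & B
```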


@@ -24,6 +24,9 @@ const updateInvestigationParamsSchema = z.object({
}),
tags: z.array(z.string()),
externalIncidentUrl: z.string().nullable(),
rootCauseAnalysis: z.object({
events: z.array(z.any()),
}),
})
.partial(),
});


@@ -35,6 +35,11 @@ const investigationSchema = z.object({
notes: z.array(investigationNoteSchema),
items: z.array(investigationItemSchema),
externalIncidentUrl: z.string().nullable(),
rootCauseAnalysis: z
.object({
events: z.array(z.any()),
})
.optional(),
});
type Status = z.infer<typeof statusSchema>;
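A small sketch of how the new optional field behaves at the schema boundary (reduced, hypothetical schema containing only the shape added in this hunk):

```ts
import { z } from 'zod';

const schema = z.object({
  rootCauseAnalysis: z
    .object({ events: z.array(z.any()) })
    .optional(),
});

schema.parse({}); // ok: the field may be absent on existing investigations
schema.parse({ rootCauseAnalysis: { events: [{ type: 'data' }] } }); // ok
```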


@@ -98,8 +98,15 @@ export function registerRoutes<TDependencies extends Record<string, any>>({
if (isKibanaResponse(result)) {
return result;
} else if (isObservable(result)) {
const controller = new AbortController();
request.events.aborted$.subscribe(() => {
controller.abort();
});
return response.ok({
body: observableIntoEventSourceStream(result as Observable<ServerSentEvent>),
body: observableIntoEventSourceStream(result as Observable<ServerSentEvent>, {
logger,
signal: controller.signal,
}),
});
} else {
const body = result || {};


@@ -0,0 +1,198 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/
import { Logger } from '@kbn/logging';
import { observableIntoEventSourceStream } from './observable_into_event_source_stream';
import { PassThrough } from 'node:stream';
import { Subject } from 'rxjs';
import { ServerSentEvent, ServerSentEventType } from '@kbn/sse-utils/src/events';
import {
ServerSentEventErrorCode,
createSSEInternalError,
createSSERequestError,
} from '@kbn/sse-utils/src/errors';
describe('observableIntoEventSourceStream', () => {
let logger: jest.Mocked<Logger>;
let controller: AbortController;
let stream: PassThrough;
let source$: Subject<ServerSentEvent>;
let data: string[];
beforeEach(() => {
jest.useFakeTimers();
logger = {
debug: jest.fn(),
error: jest.fn(),
} as unknown as jest.Mocked<Logger>;
controller = new AbortController();
source$ = new Subject();
data = [];
stream = observableIntoEventSourceStream(source$, { logger, signal: controller.signal });
stream.on('data', (chunk) => {
data.push(chunk.toString());
});
});
afterEach(() => {
jest.clearAllTimers();
});
it('writes events into the stream in SSE format', () => {
source$.next({ type: ServerSentEventType.data, data: { foo: 'bar' } });
source$.complete();
jest.runAllTimers();
expect(data).toEqual(['event: data\ndata: {"data":{"foo":"bar"}}\n\n']);
});
it('handles SSE errors', () => {
const sseError = createSSEInternalError('Invalid input');
source$.error(sseError);
jest.runAllTimers();
expect(logger.error).toHaveBeenCalledWith(sseError);
expect(logger.debug).toHaveBeenCalled();
const debugFn = logger.debug.mock.calls[0][0] as () => string;
const loggedError = JSON.parse(debugFn());
expect(loggedError).toEqual({
type: 'error',
error: {
code: ServerSentEventErrorCode.internalError,
message: 'Invalid input',
meta: {},
},
});
expect(data).toEqual([
`event: error\ndata: ${JSON.stringify({
error: {
code: ServerSentEventErrorCode.internalError,
message: 'Invalid input',
meta: {},
},
})}\n\n`,
]);
});
it('handles SSE errors with metadata', () => {
const sseError = createSSERequestError('Invalid request', 400);
source$.error(sseError);
jest.runAllTimers();
expect(logger.error).toHaveBeenCalledWith(sseError);
expect(logger.debug).toHaveBeenCalled();
const debugFn = logger.debug.mock.calls[0][0] as () => string;
const loggedError = JSON.parse(debugFn());
expect(loggedError).toEqual({
type: 'error',
error: {
code: ServerSentEventErrorCode.requestError,
message: 'Invalid request',
meta: {
status: 400,
},
},
});
expect(data).toEqual([
`event: error\ndata: ${JSON.stringify({
error: {
code: ServerSentEventErrorCode.requestError,
message: 'Invalid request',
meta: {
status: 400,
},
},
})}\n\n`,
]);
});
it('handles non-SSE errors', () => {
const error = new Error('Non-SSE Error');
source$.error(error);
jest.runAllTimers();
expect(logger.error).toHaveBeenCalledWith(error);
expect(data).toEqual([
`event: error\ndata: ${JSON.stringify({
error: {
code: ServerSentEventErrorCode.internalError,
message: 'Non-SSE Error',
},
})}\n\n`,
]);
});
it('should send keep-alive comments every 10 seconds', () => {
jest.advanceTimersByTime(10000);
expect(data).toContain(': keep-alive');
jest.advanceTimersByTime(10000);
expect(data.filter((d) => d === ': keep-alive')).toHaveLength(2);
});
describe('without fake timers', () => {
beforeEach(() => {
jest.useFakeTimers({ doNotFake: ['nextTick'] });
});
it('should end the stream when the observable completes', async () => {
jest.useFakeTimers({ doNotFake: ['nextTick'] });
const endSpy = jest.fn();
stream.on('end', endSpy);
source$.complete();
await new Promise((resolve) => process.nextTick(resolve));
expect(endSpy).toHaveBeenCalled();
});
it('should end stream when signal is aborted', async () => {
const endSpy = jest.fn();
stream.on('end', endSpy);
// Emit some data
source$.next({ type: ServerSentEventType.data, data: { initial: 'data' } });
// Abort the signal
controller.abort();
// Emit more data after abort
source$.next({ type: ServerSentEventType.data, data: { after: 'abort' } });
await new Promise((resolve) => process.nextTick(resolve));
expect(endSpy).toHaveBeenCalled();
// Data after abort should not be received
expect(data).toEqual([
`event: data\ndata: ${JSON.stringify({ data: { initial: 'data' } })}\n\n`,
]);
});
afterEach(() => {
jest.useFakeTimers();
});
});
});


@@ -7,12 +7,51 @@
* License v3.0 only", or the "Server Side Public License, v 1".
*/
import { map, Observable } from 'rxjs';
import { Logger } from '@kbn/logging';
import {
isSSEError,
ServerSentErrorEvent,
ServerSentEventErrorCode,
} from '@kbn/sse-utils/src/errors';
import { ServerSentEvent, ServerSentEventType } from '@kbn/sse-utils/src/events';
import { catchError, map, Observable, of } from 'rxjs';
import { PassThrough } from 'stream';
import { ServerSentEvent } from '@kbn/sse-utils';
export function observableIntoEventSourceStream(source$: Observable<ServerSentEvent>): PassThrough {
const withSerializedEvents$ = source$.pipe(
export function observableIntoEventSourceStream(
source$: Observable<ServerSentEvent>,
{
logger,
signal,
}: {
logger: Pick<Logger, 'debug' | 'error'>;
signal: AbortSignal;
}
) {
const withSerializedErrors$ = source$.pipe(
catchError((error): Observable<ServerSentErrorEvent> => {
if (isSSEError(error)) {
logger.error(error);
logger.debug(() => JSON.stringify(error));
return of({
type: ServerSentEventType.error,
error: {
code: error.code,
message: error.message,
meta: error.meta,
},
});
}
logger.error(error);
return of({
type: ServerSentEventType.error,
error: {
code: ServerSentEventErrorCode.internalError,
message: error.message as string,
},
});
}),
map((event) => {
const { type, ...rest } = event;
return `event: ${type}\ndata: ${JSON.stringify(rest)}\n\n`;
@@ -21,18 +60,38 @@ export function observableIntoEventSourceStream(source$: Observable<ServerSentEv
const stream = new PassThrough();
withSerializedEvents$.subscribe({
const intervalId = setInterval(() => {
// `:` denotes a comment - this is to keep the connection open
// it will be ignored by the SSE parser on the client
stream.write(': keep-alive');
}, 10000);
const subscription = withSerializedErrors$.subscribe({
next: (line) => {
stream.write(line);
},
complete: () => {
stream.end();
clearTimeout(intervalId);
},
error: (error) => {
stream.write(`event: error\ndata: ${JSON.stringify(error)}\n\n`);
clearTimeout(intervalId);
stream.write(
`event:error\ndata: ${JSON.stringify({
error: {
code: ServerSentEventErrorCode.internalError,
message: error.message,
},
})}\n\n`
);
stream.end();
},
});
signal.addEventListener('abort', () => {
subscription.unsubscribe();
stream.end();
});
return stream;
}
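To illustrate the wire format this produces, a minimal browser-side consumer (the endpoint path is hypothetical; `EventSource` is the standard browser API, and it silently skips the `: keep-alive` comment lines):

```ts
// Hypothetical route that returns observableIntoEventSourceStream(...).
const source = new EventSource('/internal/observability/rca/stream');

// Frames serialized as `event: data\ndata: {...}\n\n` arrive as 'data' events.
source.addEventListener('data', (event) => {
  const payload = JSON.parse((event as MessageEvent).data);
  console.log('received', payload);
});

// Serialized failures arrive as `event: error` frames before the stream ends;
// the instanceof check separates them from plain connection errors.
source.addEventListener('error', (event) => {
  if (event instanceof MessageEvent) {
    console.error('stream error', JSON.parse(event.data));
  }
});
```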


@@ -15,5 +15,6 @@
],
"kbn_references": [
"@kbn/sse-utils",
"@kbn/logging",
]
}


@@ -21,7 +21,8 @@ function myRequestHandler(
data: {
anyData: {},
},
})
}),
logger
),
});
}


@@ -1318,6 +1318,10 @@
"@kbn/observability-ai-assistant-management-plugin/*": ["x-pack/plugins/observability_solution/observability_ai_assistant_management/*"],
"@kbn/observability-ai-assistant-plugin": ["x-pack/plugins/observability_solution/observability_ai_assistant"],
"@kbn/observability-ai-assistant-plugin/*": ["x-pack/plugins/observability_solution/observability_ai_assistant/*"],
"@kbn/observability-ai-common": ["x-pack/packages/observability/observability_ai/observability_ai_common"],
"@kbn/observability-ai-common/*": ["x-pack/packages/observability/observability_ai/observability_ai_common/*"],
"@kbn/observability-ai-server": ["x-pack/packages/observability/observability_ai/observability_ai_server"],
"@kbn/observability-ai-server/*": ["x-pack/packages/observability/observability_ai/observability_ai_server/*"],
"@kbn/observability-alert-details": ["x-pack/packages/observability/alert_details"],
"@kbn/observability-alert-details/*": ["x-pack/packages/observability/alert_details/*"],
"@kbn/observability-alerting-rule-utils": ["x-pack/packages/observability/alerting_rule_utils"],


@@ -0,0 +1,15 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
module.exports = {
preset: '@kbn/test',
rootDir: '../../../../..',
roots: [
'<rootDir>/x-pack/packages/observability/observability_ai/observability_ai_common',
'<rootDir>/x-pack/packages/observability/observability_ai/observability_ai_server',
],
};


@@ -0,0 +1,7 @@
{
"type": "shared-common",
"id": "@kbn/observability-ai-common",
"owner": "@elastic/obs-ai-assistant",
"group": "observability",
"visibility": "private"
}


@@ -0,0 +1,6 @@
{
"name": "@kbn/observability-ai-common",
"private": true,
"version": "1.0.0",
"license": "Elastic License 2.0"
}


@@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
export {
RCA_END_PROCESS_TOOL_NAME,
RCA_INVESTIGATE_ENTITY_TOOL_NAME,
RCA_OBSERVE_TOOL_NAME,
} from './tool_names';


@@ -0,0 +1,10 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
export const RCA_OBSERVE_TOOL_NAME = 'observe';
export const RCA_END_PROCESS_TOOL_NAME = 'endProcessAndWriteReport';
export const RCA_INVESTIGATE_ENTITY_TOOL_NAME = 'investigateEntity';
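These shared names later key the tool definitions in the server package (see `RCA_TOOLS` in `run_root_cause_analysis` below). A hypothetical sketch of the pattern; the real schemas live in the RCA server package:

```ts
import { RCA_OBSERVE_TOOL_NAME } from '@kbn/observability-ai-common/root_cause_analysis';

// Hypothetical tool map keyed by the shared constant, so client and server
// agree on tool names without importing server code.
const tools = {
  [RCA_OBSERVE_TOOL_NAME]: {
    description: 'Write an observation based on the evidence collected so far',
    schema: { type: 'object', properties: {} },
  },
} as const;
```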


@@ -0,0 +1,20 @@
{
"extends": "../../../../../tsconfig.base.json",
"compilerOptions": {
"outDir": "target/types",
"types": [
"jest",
"node",
"react"
]
},
"include": [
"**/*.ts",
"**/*.tsx",
],
"exclude": [
"target/**/*"
],
"kbn_references": [
]
}


@@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
module.exports = {
preset: '@kbn/test',
rootDir: '../../../../..',
roots: ['<rootDir>/x-pack/packages/observability/observability_ai/observability_ai_server'],
};


@@ -0,0 +1,7 @@
{
"type": "shared-server",
"id": "@kbn/observability-ai-server",
"owner": "@elastic/obs-ai-assistant",
"group": "observability",
"visibility": "private"
}


@@ -0,0 +1,6 @@
{
"name": "@kbn/observability-ai-server",
"private": true,
"version": "1.0.0",
"license": "Elastic License 2.0"
}


@@ -0,0 +1,51 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { from, Observable, of, switchMap } from 'rxjs';
import { RCA_END_PROCESS_TOOL_NAME } from '@kbn/observability-ai-common/root_cause_analysis';
import { AssistantMessage, MessageRole } from '@kbn/inference-common';
import { writeFinalReport } from './tasks/write_final_report';
import { EndProcessToolMessage, RootCauseAnalysisContext } from './types';
import { generateSignificantEventsTimeline } from './tasks/generate_timeline';
import { EMPTY_ASSISTANT_MESSAGE } from './empty_assistant_message';
export function callEndRcaProcessTool({
rcaContext,
toolCallId,
}: {
rcaContext: RootCauseAnalysisContext;
toolCallId: string;
}): Observable<EndProcessToolMessage | AssistantMessage> {
return from(
writeFinalReport({
rcaContext,
})
).pipe(
switchMap((report) => {
return from(
generateSignificantEventsTimeline({
rcaContext,
report,
}).then((timeline) => {
return { timeline, report };
})
);
}),
switchMap(({ report, timeline }) => {
const toolMessage: EndProcessToolMessage = {
name: RCA_END_PROCESS_TOOL_NAME,
role: MessageRole.Tool,
toolCallId,
response: {
report,
timeline,
},
};
return of(toolMessage, EMPTY_ASSISTANT_MESSAGE);
})
);
}


@@ -0,0 +1,80 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { from, Observable, of, switchMap } from 'rxjs';
import { MessageRole } from '@kbn/inference-common';
import { RCA_INVESTIGATE_ENTITY_TOOL_NAME } from '@kbn/observability-ai-common/root_cause_analysis';
import { InvestigateEntityToolMessage, RootCauseAnalysisContext, ToolErrorMessage } from './types';
import { investigateEntity } from './tasks/investigate_entity';
import { formatEntity } from './util/format_entity';
export function callInvestigateEntityTool({
field,
value,
context,
toolCallId,
rcaContext,
}: {
field: string;
value: string;
context: string;
toolCallId: string;
rcaContext: RootCauseAnalysisContext;
}): Observable<InvestigateEntityToolMessage | ToolErrorMessage> {
const nextEntity = {
[field]: value,
};
return from(
investigateEntity({
rcaContext,
entity: nextEntity,
context,
})
).pipe(
switchMap((entityInvestigation) => {
if (!entityInvestigation) {
const entityNotFoundToolMessage: ToolErrorMessage = {
name: 'error',
role: MessageRole.Tool,
response: {
error: {
message: `Entity ${formatEntity(nextEntity)} not found, have
you verified it exists and if the field and value you are using
are correct?`,
},
},
toolCallId,
};
return of(entityNotFoundToolMessage);
}
const {
attachments,
relatedEntities,
entity: investigatedEntity,
summary,
} = entityInvestigation;
const toolMessage: InvestigateEntityToolMessage = {
name: RCA_INVESTIGATE_ENTITY_TOOL_NAME,
role: MessageRole.Tool as const,
toolCallId,
response: {
entity: investigatedEntity,
relatedEntities,
summary,
},
data: {
attachments,
},
};
return of(toolMessage);
})
);
}


@@ -0,0 +1,91 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { AssistantMessage, MessageRole } from '@kbn/inference-common';
import {
RCA_INVESTIGATE_ENTITY_TOOL_NAME,
RCA_OBSERVE_TOOL_NAME,
} from '@kbn/observability-ai-common/root_cause_analysis';
import { compact, findLast } from 'lodash';
import { from, Observable, of, switchMap } from 'rxjs';
import { observeInvestigationResults } from './tasks/observe_investigation_results';
import {
InvestigateEntityToolMessage,
ObservationToolMessage,
RootCauseAnalysisContext,
RootCauseAnalysisEvent,
} from './types';
export function callObserveTool({
rcaContext,
toolCallId,
}: {
rcaContext: RootCauseAnalysisContext;
toolCallId: string;
}): Observable<ObservationToolMessage> {
const { events } = rcaContext;
const lastAssistantMessage = findLast(
events.slice(0, -1),
(event): event is Extract<RootCauseAnalysisEvent, AssistantMessage> =>
event.role === MessageRole.Assistant
);
const toolMessagesByToolCallId = Object.fromEntries(
compact(
events.map((message) =>
'toolCallId' in message &&
(message.name === RCA_INVESTIGATE_ENTITY_TOOL_NAME || message.name === 'error')
? [message.toolCallId, message]
: undefined
)
)
);
const investigationToolMessages =
lastAssistantMessage && lastAssistantMessage.toolCalls
? compact(
lastAssistantMessage.toolCalls.map((investigateEntityToolCall) => {
if (investigateEntityToolCall.function.name !== RCA_INVESTIGATE_ENTITY_TOOL_NAME) {
return undefined;
}
return {
toolCall: investigateEntityToolCall,
toolResponse: toolMessagesByToolCallId[investigateEntityToolCall.toolCallId],
};
})
)
: [];
const investigations = investigationToolMessages
.map((toolMessage) => toolMessage.toolResponse)
.filter(
(toolResponse): toolResponse is InvestigateEntityToolMessage =>
toolResponse.name === RCA_INVESTIGATE_ENTITY_TOOL_NAME
)
.map((toolResponse) => ({ ...toolResponse.data, ...toolResponse.response }));
return from(
observeInvestigationResults({
rcaContext,
investigations,
})
).pipe(
switchMap((summary) => {
const observationToolMessage: ObservationToolMessage = {
name: RCA_OBSERVE_TOOL_NAME,
response: {
content: summary.content,
},
data: summary,
role: MessageRole.Tool,
toolCallId,
};
return of(observationToolMessage);
})
);
}


@@ -0,0 +1,15 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { AssistantMessage, MessageRole } from '@kbn/inference-common';
import { RootCauseAnalysisEvent } from './types';
export const EMPTY_ASSISTANT_MESSAGE: Extract<RootCauseAnalysisEvent, AssistantMessage> = {
content: '',
role: MessageRole.Assistant,
toolCalls: [],
};


@@ -0,0 +1,20 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
export type {
RootCauseAnalysisEvent,
InvestigateEntityToolMessage,
EndProcessToolMessage,
ObservationToolMessage,
RootCauseAnalysisToolMessage,
ToolErrorMessage,
RootCauseAnalysisToolRequest,
} from './types';
export type { SignificantEventsTimeline, SignificantEvent } from './tasks/generate_timeline';
export type { EntityInvestigation } from './tasks/investigate_entity';
export { runRootCauseAnalysis } from './run_root_cause_analysis';


@@ -0,0 +1,345 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
export const RCA_SYSTEM_PROMPT_BASE = `You are a helpful assistant for Elastic Observability.
You are a distinguished SRE, who has an established career, working in both
small shops and FAANG-level companies. You have worked with Elasticsearch
since the beginning and expertly use it in your analysis of incidents.
You use an evidence-based strategy to determine the root cause of
an incident. You thoroughly analyze Observability data. You use your
understanding of different architectures like microservices, monoliths,
event-driven systems, and environments like Kubernetes to discover
patterns and correlations in the data ingested into the user's system.
Your sizable experience with monitoring software systems has taught
you how to investigate issues and correlate symptoms of the investigated
service with its dependencies.
## Capabilities
You are highly skilled at inspecting logs, traces, alerts, and SLOs to uncover
the root cause of incidents, with a special emphasis on detecting log patterns
that reveal system behavior. You can identify related entities, such as upstream
services or the specific pod a service is running on, by searching through logs
and traces for relationships using metadata like IP addresses, session IDs, or
distributed tracing data. While you can analyze alerts and SLO-derived metrics,
you do not directly analyze other system metrics, inspect files, or execute
commands that modify the system.
## Non-capabilities
You lack the capabilities to analyze metrics or connect to external systems.`;
export const RCA_PROMPT_ENTITIES = `# Entities
In an Observability system, entities are distinct components or resources within
the infrastructure, each representing points of interest for monitoring and
troubleshooting. These entities form the backbone of log-based analysis and
allow teams to track behavior, detect anomalies, and investigate issues across
different layers of the system. Here's a breakdown of common entities in
observability:
1. Services: Core units of functionality in an application ecosystem,
representing individual processes or applications (e.g., user-authentication,
payment processing). Services typically expose APIs or endpoints, and logs from
these entities often capture requests, responses, and error events, which are
critical for understanding application behavior.
2. Kubernetes (K8s) Entities:
- Pods: The smallest deployable units in Kubernetes, usually containing one
or more containers. Logs from pods provide insight into container operations,
errors, and application states.
- Namespaces: Logical groupings within a cluster for organizing and isolating
resources, helping in filtering logs by domain or responsibility.
- Nodes: Worker machines (either physical or virtual) where pods run. Node
logs often cover hardware resource events, errors, and other system-level events
relevant to pod health and performance.
- Deployments and ReplicaSets: Define and manage the desired state of pod
replication and rolling updates. Logs from these components can reveal changes
in application versions, scaling events, and configuration updates.
3. Virtual Machines (VMs): Virtualized computing resources that generate
operating system-level logs capturing events such as application crashes,
network issues, and OS-related errors.
4. Applications: Software systems or packages running across the infrastructure,
which may encompass multiple services. Logs from applications track user flows,
application states, and error messages, providing context for user interactions
and system events.
5. Serverless Functions (e.g., AWS Lambda): Code executions triggered by
specific events. Logs from serverless functions capture invocation details,
execution paths, and error traces, which are useful for understanding specific
function behaviors and pinpointing execution anomalies.
6. Databases and Data Stores: Includes SQL/NoSQL databases, caches, and storage
solutions. Logs from these entities cover query executions, connection issues,
and transaction errors, essential for tracking data layer issues.
7. Containers: Portable environments running individual services or processes.
Container logs capture application and system events within the containerized
environment, helping track process-level errors and status changes.
8. Load Balancers and API Gateways: Components responsible for managing and
routing traffic. Logs from these entities include request paths, status codes,
and errors encountered, which can indicate connectivity issues or
misconfigurations.
9. Networking Components: Entities like virtual private clouds (VPCs),
firewalls, VPNs, and network interfaces. Logs from these components track
traffic flows, connectivity issues, and security events, crucial for identifying
network-related anomalies.
10. Clusters and Regions: Groupings of infrastructure either physically or
logically, such as across data centers or cloud regions. Cluster and region logs
help capture high-level events and error messages, useful for understanding
system-wide issues and region-specific disruptions.
Each of these entities is typically identified by fields such as
\`service.name\`, \`kubernetes.pod.name\`, \`container.id\`, or similar fields
in log records. Observability systems use these identifiers to connect entities,
creating a map of relationships and dependencies that helps teams diagnose
issues, understand cross-entity impacts, and uncover root causes in distributed
architectures.`;
export const RCA_PROMPT_DEPENDENCIES = `## Understanding the Flow: Upstream vs. Downstream
- Upstream dependencies: These are the services that your service
depends on. They supply data, perform tasks, or provide resources that
your service consumes.
- Downstream dependencies: These are the services that depend on your
service. They consume the data or resources your service generates.
When diagnosing issues, distinguishing the direction of dependency can
clarify whether a problem originates from your service's reliance on an
external input or whether your service is causing issues for other systems.
---
## When to Investigate Upstream Dependencies
Upstream issues typically occur when your service is failing due to problems
with the responses it receives from external systems.
1. Timeouts and Latency
- Symptoms: Slow response times, retries, or timeouts.
- Errors: HTTP 504, retrying connection, exceeded timeout threshold.
- Focus: Check the performance and availability of upstream services
(e.g., APIs, databases) and network latency.
2. **Data Integrity Issues**
- Symptoms: Inconsistent or corrupted data.
- Errors: unexpected data format, deserialization errors.
- Focus: Verify data received from upstream services, and investigate
schema or data format changes.
3. Connection Failures
- Symptoms: Your service cannot connect to upstream services.
- Errors: DNS lookup failed, connection refused, socket timeout.
- Focus: Check upstream service health, DNS, and networking components.
4. **Authentication/Authorization Failures**
- Symptoms: Failed access to upstream resources.
- Errors: 401 Unauthorized, 403 Forbidden, token issues.
- Focus: Validate credentials or tokens and investigate upstream access
policies.
---
## When to Investigate Downstream Dependencies
Downstream issues occur when your service is functioning but its outputs cause
failures in other services that depend on it.
1. Data or API Response Issues
- Symptoms: Downstream services receive bad or invalid data.
- Errors: data type mismatch, invalid JSON format.
- Focus: Ensure your service is returning correct data and check for API
changes.
2. **Rate-Limiting and Resource Exhaustion**
- Symptoms: Downstream services are overwhelmed.
- Errors: 429 Too Many Requests, throttling or resource exhaustion.
- Focus: Check your service's request rates and resource usage (e.g., memory, CPU).
3. Unexpected Behavior or Regression
- Symptoms: Downstream failures after a recent deployment.
- Errors: New downstream errors after your service changes.
- Focus: Review recent updates, API contracts, or integration points.
4. Eventual Consistency or Queue Backlogs
- Symptoms: Delayed processing in downstream systems.
- Errors: message queue full, backlog warnings.
- Focus: Check event production rates and queue statuses in downstream services.`;
export const RCA_PROMPT_CHANGES = `## Reasoning about Correlating Changes in Incident Investigations
In a root cause analysis, understanding the types and timing of changes is key
to linking symptoms with underlying causes. Changes can broadly be classified
into **symptomatic changes** (indicators of system issues like elevated error
rates or degraded throughput) and **system changes** (events that modify system
configuration or structure, such as scale-downs, new version rollouts, or
significant configuration adjustments). By correlating these changes, we can
assess whether observed symptoms are likely related to specific system
modifications.
### Identifying Correlations Between Symptomatic and System Changes
When investigating a sudden issue, such as a 5x increase in latency, it's
essential to evaluate both the **timing** and **nature** of associated changes
in upstream dependencies, resource utilization, and configuration events. For
instance:
- Consistent Symptomatic Behavior: If an upstream dependency exhibits a
similar, sustained latency spike around the same time and shows log entries
indicating CPU throttling, this would suggest a correlated, persistent issue
that may directly impact the observed symptom. A scale-down event preceding the
latency increase might indicate that reduced resources are stressing the
dependency.
- Transient vs. Persistent Issues: Another upstream dependency that
experiences a brief latency increase but recovers quickly is less likely
related. Short-lived changes that self-correct without intervention typically
have different root causes or may be unrelated noise.
### Types of Changes to Consider in Correlation
1. Log Pattern Changes: A shift in log patterns, especially around error
levels, provides significant insight. If there's an increase in critical or
warning log patterns for a dependency during the latency spike, it could
indicate that the issue stems from this entity. Compare these log patterns to
past behavior to assess whether they represent an anomaly that might warrant
further investigation.
2. Event-Driven System Changes:
- Scale Events: Scale-up or scale-down events can directly impact
performance. If a latency increase aligns with a scale-down, it may suggest that
resource reduction is straining the system.
- Release or Deployment Events: A new version rollout or config change is
a frequent source of correlated issues. Compare the timing of the latency
increase to the deployment to see if the change directly impacts the system.
Correlate with alerts or SLO breaches on endpoints to understand the immediate
effects of the release.
3. SLO and Alert-Based Changes: SLO breaches and alerts can provide concrete
timestamps for when symptoms begin. For instance, a breach on error rates for a
specific service endpoint following a dependency's scale-down event suggests a
possible causal link. An alert indicating sustained latency increase in a
dependency that remains unresolved points to a high-priority area for deeper
investigation.
4. Dependency Health and Behavior:
- Related vs. Unrelated Dependencies: Similar to the latency example,
observe if multiple dependencies experience symptomatic changes simultaneously.
Related dependencies should show consistent, similar issues, while unrelated
dependencies may exhibit brief, unrelated spikes. Persistent issues across key
dependencies likely indicate a systemic cause, while isolated changes are less
likely to be relevant.
### Examples of Reasoning Through Changes
Consider these scenarios:
- Increase in Error Rates and a Recent Deployment: Suppose error rates for
an endpoint increase sharply post-deployment. If related logs show new error
patterns, this aligns the symptom with a deployment change. Investigate specific
changes in the deployment (e.g., code changes or resource allocation).
- Throughput Decrease and Scaling Events: If throughput dips shortly after a
scale-down event, it might suggest resource constraints. Analyze CPU or memory
throttling logs from this period in upstream dependencies to confirm.
- Cross-Service Latency Spikes: If multiple services along a call path
experience latency spikes, with CPU throttling logs, this suggests a resource
bottleneck. Trace logs and alerts related to autoscaling decisions may provide
insights into whether the system configuration caused cascading delays.
By carefully mapping these changes and analyzing their timing, you can
distinguish between causally related events and incidental changes, allowing for
a targeted and effective investigation.`;
export const RCA_PROMPT_CHANGE_POINTS = `## Change points
Change points can be defined as the following type:
- \`dip\`: a significant dip occurs at this change point
- \`distribution_change\`: the overall distribution of the values has changed
significantly
- \`non_stationary\`: there is no change point, but the values are not from a
stationary distribution
- \`spike\`: a significant spike occurs at this point
- \`stationary\`: no change point found
- \`step_change\`: the change indicates a statistically significant step up or
down in value distribution
- \`trend_change\`: there is an overall trend change occurring at this point
For \`spike\`, and \`dip\`, this means: a short-lived spike or dip that then again
stabilizes. For persisted changes, you'd see a \`step_change\` (if the values
before and after the change point are stable), or a \`trend_change\` when the
values show an upward or downward trend after the change.`;
export const RCA_PROMPT_SIGNIFICANT_EVENTS = `## Significant events
Generate a timeline of significant events. These events should capture
significant observed changes in the system that can be extracted from the
analyzed data. This timeline is absolutely critical to the investigation,
and close attention has to be paid to the data, and the instructions.
The timeline should focus on key events as captured in log patterns, including
both notable changes and unusual/critical messages. This data-driven timeline
should help establish a chain of causality, pinpointing when anomalies began,
what system behaviors were observed, and how these patterns relate to the overall incident.
- Use ISO timestamps to ensure precision and clarity.
- Include alerts that are part of the investigation. For these, use the start
time of the alert, and mention critical information about the alert, such as
reason and grouping fields.
- Focus on log entries that signal significant system behavior (e.g., errors,
retries, anomalies).
- Highlight critical log messages or changes in patterns that may correlate
with the issue.
- Include notable anomalies, such as spikes in error rates, unexpected system
responses, or any log entries suggesting failure or degradation.
Do not include:
- Events that are indicative of normal operations.
- Events that are unlikely to be related to the investigated issue.
Key Elements to Include:
- Log Patterns: Capture log messages that show unusual events or
abnormalities such as error codes, failed retries, or changes in log frequency.
- Timestamps: Ensure every entry in the timeline is time-stamped
with an accurate ISO 8601 timestamp.
- Event Description: Provide a clear, concise, and objective description of
what was observed in the logs.
- Corroborating Data: Link log anomalies to other relevant data points such
as traffic shifts, request patterns, or upstream/downstream service impacts.`;
export const RCA_PROMPT_TIMELINE_GUIDE = `
The timeline should focus on key events as
captured in log patterns, including both notable changes and unusual/critical
messages. This data-driven timeline should help establish a chain of causality,
pinpointing when anomalies began, what system behaviors were observed, and how
these patterns relate to the overall incident.
- **Use ISO timestamps** to ensure precision and clarity.
- **Focus on log entries** that signal significant system behavior (e.g.,
errors, retries, anomalies).
- **Highlight critical log messages** or changes in patterns that may correlate
with the issue.
- Include notable anomalies, such as spikes in error rates, unexpected
system responses, or any log entries suggesting failure or degradation.
Key Elements to Include:
Log Patterns: Capture log messages that show unusual events or
abnormalities such as error codes, failed retries, or changes in log frequency.
Timestamps: Ensure every entry in the timeline is time-stamped
with an accurate ISO 8601 timestamp.
Event Description: Provide a clear, concise description of what was
observed in the logs.
Corroborating Data: Link log anomalies to other relevant data points such
as traffic shifts, request patterns, or upstream/downstream service impacts.`;


@@ -0,0 +1,305 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { RulesClient } from '@kbn/alerting-plugin/server';
import { calculateAuto } from '@kbn/calculate-auto';
import { MessageRole, AssistantMessage, ToolMessage, ToolChoiceType } from '@kbn/inference-common';
import { InferenceClient } from '@kbn/inference-plugin/server';
import { Logger } from '@kbn/logging';
import { AlertsClient } from '@kbn/rule-registry-plugin/server';
import { findLast, pick } from 'lodash';
import moment from 'moment';
import { catchError, filter, from, map, mergeMap, Observable, of, switchMap } from 'rxjs';
import { ObservabilityAIAssistantClient } from '@kbn/observability-ai-assistant-plugin/server';
import { ObservabilityElasticsearchClient } from '@kbn/observability-utils-server/es/client/create_observability_es_client';
import {
RCA_END_PROCESS_TOOL_NAME,
RCA_INVESTIGATE_ENTITY_TOOL_NAME,
RCA_OBSERVE_TOOL_NAME,
} from '@kbn/observability-ai-common/root_cause_analysis';
import { callEndRcaProcessTool } from './call_end_rca_process_tool';
import { callInvestigateEntityTool } from './call_investigate_entity_tool';
import { callObserveTool } from './call_observe_tool';
import { RCA_PROMPT_CHANGES, RCA_PROMPT_ENTITIES, RCA_SYSTEM_PROMPT_BASE } from './prompts';
import { RCA_TOOLS } from './tools';
import {
EndProcessToolMessage,
InvestigateEntityToolMessage,
ObservationToolMessage,
RootCauseAnalysisContext,
RootCauseAnalysisEvent,
ToolErrorMessage,
} from './types';
import { callTools } from './util/call_tools';
import { formatEntity } from './util/format_entity';
import { validateInvestigateEntityToolCalls } from './util/validate_investigate_entity_tool_call';
const SYSTEM_PROMPT_WITH_OBSERVE_INSTRUCTIONS = `${RCA_SYSTEM_PROMPT_BASE}
Your next step is to request an observation from another agent based
on the initial context or the results of previous investigations.`;
const SYSTEM_PROMPT_WITH_DECISION_INSTRUCTIONS = `${RCA_SYSTEM_PROMPT_BASE}
${RCA_PROMPT_ENTITIES}
${RCA_PROMPT_CHANGES}
To determine whether to end the process or continue analyzing another entity,
follow the advice from the previous observation, and these tips:
Continuing the process:
- Do not investigate an entity twice. This will result in a failure.
- Logs, traces, or observability data that suggest upstream or downstream
issues (such as connection failures, timeouts, or authentication errors)
indicate further investigation is required.
Ending the process:
- No further entities to investigate: If there are no unexplored upstream or
downstream dependencies, and all related entities have been investigated without
discovering new anomalies, it may be appropriate to end the process.
- If all investigated entities (e.g., services, hosts, containers) are
functioning normally, with no relevant issues found, and there are no signs of
dependencies being affected, you may consider ending the process.
- Avoid concluding the investigation based solely on symptoms or the absence
of immediate errors in the data. Unless a system change has been connected to
the incident, it is important to continue investigating dependencies to ensure
the root cause has been accurately identified.`;
export function runRootCauseAnalysis({
serviceName,
start: requestedStart,
end: requestedEnd,
esClient,
alertsClient,
rulesClient,
observabilityAIAssistantClient,
spaceId,
indices,
connectorId,
inferenceClient,
context: initialContext,
logger: incomingLogger,
prevEvents,
}: {
context: string;
serviceName: string;
logger: Logger;
inferenceClient: InferenceClient;
start: number;
end: number;
alertsClient: AlertsClient;
rulesClient: RulesClient;
esClient: ObservabilityElasticsearchClient;
observabilityAIAssistantClient: ObservabilityAIAssistantClient;
indices: {
logs: string[];
traces: string[];
sloSummaries: string[];
};
connectorId: string;
spaceId: string;
prevEvents?: RootCauseAnalysisEvent[];
}): Observable<RootCauseAnalysisEvent> {
const logger = incomingLogger.get('rca');
const entity = { 'service.name': serviceName };
const bucketSize = calculateAuto
.atLeast(30, moment.duration(requestedEnd - requestedStart))!
.asMilliseconds();
const start = Math.floor(requestedStart / bucketSize) * bucketSize;
const end = Math.floor(requestedEnd / bucketSize) * bucketSize;
const initialMessage = {
role: MessageRole.User as const,
content: `Investigate the health status of ${formatEntity(entity)}.
The context given for this investigation is:
${initialContext}`,
};
const nextEvents = [initialMessage, ...(prevEvents ?? [])];
const initialRcaContext: RootCauseAnalysisContext = {
connectorId,
start,
end,
esClient,
events: nextEvents,
indices,
inferenceClient,
initialContext,
alertsClient,
observabilityAIAssistantClient,
logger,
rulesClient,
spaceId,
tokenLimit: 32_000,
};
const investigationTimeRangePrompt = `## Time range
The time range of the investigation is ${new Date(start).toISOString()} until ${new Date(
end
).toISOString()}`;
initialContext = `${initialContext}
${investigationTimeRangePrompt}
`;
const next$ = callTools(
{
system: RCA_SYSTEM_PROMPT_BASE,
connectorId,
inferenceClient,
messages: nextEvents,
logger,
},
({ messages }) => {
const lastSuccessfulToolResponse = findLast(
messages,
(message) => message.role === MessageRole.Tool && message.name !== 'error'
) as Exclude<ToolMessage, ToolErrorMessage> | undefined;
const shouldWriteObservationNext =
!lastSuccessfulToolResponse || lastSuccessfulToolResponse.name !== RCA_OBSERVE_TOOL_NAME;
const nextTools = shouldWriteObservationNext
? pick(RCA_TOOLS, RCA_OBSERVE_TOOL_NAME)
: pick(RCA_TOOLS, RCA_END_PROCESS_TOOL_NAME, RCA_INVESTIGATE_ENTITY_TOOL_NAME);
const nextSystem = shouldWriteObservationNext
? SYSTEM_PROMPT_WITH_OBSERVE_INSTRUCTIONS
: SYSTEM_PROMPT_WITH_DECISION_INSTRUCTIONS;
return {
messages,
system: `${nextSystem}
${investigationTimeRangePrompt}`,
tools: nextTools,
toolChoice: shouldWriteObservationNext
? { function: RCA_OBSERVE_TOOL_NAME }
: ToolChoiceType.required,
};
},
({
toolCalls,
messages,
}): Observable<
| ObservationToolMessage
| ToolErrorMessage
| InvestigateEntityToolMessage
| EndProcessToolMessage
| AssistantMessage
> => {
const nextRcaContext = {
...initialRcaContext,
events: messages as RootCauseAnalysisEvent[],
};
return of(undefined).pipe(
switchMap(() => {
return from(
validateInvestigateEntityToolCalls({ rcaContext: nextRcaContext, toolCalls })
);
}),
switchMap((errors) => {
if (errors.length) {
return of(
...toolCalls.map((toolCall) => {
const toolCallErrorMessage: ToolErrorMessage = {
role: MessageRole.Tool,
name: 'error',
response: {
error: {
message: `Some ${RCA_INVESTIGATE_ENTITY_TOOL_NAME} calls were not valid:
${errors.map((error) => `- ${error}`).join('\n')}`,
},
},
toolCallId: toolCall.toolCallId,
};
return toolCallErrorMessage;
})
);
}
return of(...toolCalls).pipe(
mergeMap((toolCall) => {
function executeToolCall(): Observable<
| EndProcessToolMessage
| InvestigateEntityToolMessage
| ObservationToolMessage
| ToolErrorMessage
| AssistantMessage
> {
switch (toolCall.function.name) {
case RCA_END_PROCESS_TOOL_NAME:
return callEndRcaProcessTool({
rcaContext: nextRcaContext,
toolCallId: toolCall.toolCallId,
});
case RCA_INVESTIGATE_ENTITY_TOOL_NAME:
return callInvestigateEntityTool({
context: toolCall.function.arguments.context,
field: toolCall.function.arguments.entity.field,
value: toolCall.function.arguments.entity.value,
rcaContext: nextRcaContext,
toolCallId: toolCall.toolCallId,
});
case RCA_OBSERVE_TOOL_NAME:
return callObserveTool({
rcaContext: nextRcaContext,
toolCallId: toolCall.toolCallId,
});
}
}
return executeToolCall().pipe(
catchError((error) => {
logger.error(`Failed executing task: ${error.message}`);
logger.error(error);
const toolErrorMessage: ToolErrorMessage = {
name: 'error',
role: MessageRole.Tool,
response: {
error: {
...('toJSON' in error && typeof error.toJSON === 'function'
? error.toJSON()
: {}),
message: error.message,
},
},
toolCallId: toolCall.toolCallId,
};
return of(toolErrorMessage);
})
);
}, 3)
);
})
);
}
);
return next$.pipe(
filter((event) =>
Boolean(event.role !== MessageRole.Assistant || event.content || event.toolCalls?.length)
),
map((event) => {
if (event.role === MessageRole.Assistant) {
return event as Extract<RootCauseAnalysisEvent, AssistantMessage>;
}
return event;
})
);
}


@@ -0,0 +1,402 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { getEntityKuery } from '@kbn/observability-utils-common/entities/get_entity_kuery';
import { formatValueForKql } from '@kbn/observability-utils-common/es/format_value_for_kql';
import type { TruncatedDocumentAnalysis } from '@kbn/observability-utils-common/llm/log_analysis/document_analysis';
import { ShortIdTable } from '@kbn/observability-utils-common/llm/short_id_table';
import {
P_VALUE_SIGNIFICANCE_HIGH,
P_VALUE_SIGNIFICANCE_MEDIUM,
} from '@kbn/observability-utils-common/ml/p_value_to_label';
import {
FieldPatternResultWithChanges,
getLogPatterns,
} from '@kbn/observability-utils-server/entities/get_log_patterns';
import { castArray, compact, groupBy, orderBy } from 'lodash';
import { RCA_PROMPT_CHANGES, RCA_PROMPT_ENTITIES } from '../../prompts';
import { RootCauseAnalysisContext } from '../../types';
import { formatEntity } from '../../util/format_entity';
import { serializeKnowledgeBaseEntries } from '../../util/serialize_knowledge_base_entries';
import { ScoredKnowledgeBaseEntry } from '../get_knowledge_base_entries';
type LogPatternRelevance = 'normal' | 'unusual' | 'warning' | 'critical';
export type AnalyzedLogPattern = FieldPatternResultWithChanges & {
relevance: LogPatternRelevance;
interesting: boolean;
};
export interface AnalyzeLogPatternOutput {
ownPatterns: AnalyzedLogPattern[];
patternsFromOtherEntities: AnalyzedLogPattern[];
}
const normalDescription = `normal operations, such as access logs`;
const unusualDescription = `something unusual and/or
rare, such as startup or shutdown messages or
other rare events`;
const warningDescription = `something being in an unexpected state,
such as error messages, rate limiting or disk usage warnings`;
const criticalDescription = `something being in a critical state,
such as startup failure messages, out-of-memory errors or crashloopbackoff
events`;
interface LogPatternCutOff {
significance?: 'high' | 'medium' | 'low';
pValue?: number;
}
export async function analyzeLogPatterns({
entity,
allAnalysis,
system,
rcaContext: { logger: parentLogger, inferenceClient, connectorId, esClient, start, end, indices },
cutoff,
kbEntries,
}: {
entity: Record<string, string>;
allAnalysis: Array<{ index: string | string[]; analysis: TruncatedDocumentAnalysis }>;
system: string;
cutoff?: LogPatternCutOff;
kbEntries: ScoredKnowledgeBaseEntry[];
rcaContext: Pick<
RootCauseAnalysisContext,
'indices' | 'logger' | 'inferenceClient' | 'connectorId' | 'esClient' | 'start' | 'end'
>;
}): Promise<AnalyzeLogPatternOutput> {
const kuery = getEntityKuery(entity);
const logger = parentLogger.get('analyzeLogPatterns');
const fields = ['message', 'error.exception.message'];
logger.debug(() => `Analyzing log patterns for ${JSON.stringify(entity)}`);
const systemPrompt = `You are a helpful assistant for Elastic Observability.
You are an expert in analyzing log messages for software
systems, and you use your extensive experience as an SRE
to thoroughly analyze log patterns for things that require
attention from the user.
${RCA_PROMPT_CHANGES}
${RCA_PROMPT_ENTITIES}
## Entity
The following entity is being analyzed:
${formatEntity(entity)}
${serializeKnowledgeBaseEntries(kbEntries)}
### Entity analysis
${allAnalysis.map(({ index: analyzedIndex, analysis }) => {
return `#### Indices: ${castArray(analyzedIndex).join(',')}
${JSON.stringify(analysis)}`;
})}
${system}`;
const kueryForOtherEntities = `NOT (${kuery}) AND ${Object.values(entity)
.map(
(val) =>
`(${fields.map((field) => `(${[field, formatValueForKql(val)].join(':')})`).join(' OR ')})`
)
.join(' AND ')}`;
const [logPatternsFromEntity, logPatternsFromElsewhere] = await Promise.all([
getLogPatterns({
esClient,
index: [...indices.logs, ...indices.traces],
start,
end,
kuery,
includeChanges: true,
fields,
metadata: [],
}),
getLogPatterns({
esClient,
index: [...indices.logs],
start,
end,
kuery: kueryForOtherEntities,
metadata: Object.keys(entity),
includeChanges: true,
fields,
}),
]);
const patternIdLookupTable = new ShortIdTable();
logger.debug(
() =>
`Found ${logPatternsFromEntity.length} own log patterns and ${logPatternsFromElsewhere.length} from others`
);
logger.trace(
() =>
`Found log patterns${JSON.stringify({
entity,
logPatternsFromEntity,
logPatternsFromElsewhere,
})}`
);
const patternsWithIds = [...logPatternsFromEntity, ...logPatternsFromElsewhere].map((pattern) => {
return {
...pattern,
shortId: patternIdLookupTable.take(pattern.regex),
};
});
const patternsByRegex = new Map(patternsWithIds.map((pattern) => [pattern.regex, pattern]));
const serializedOwnEntity = formatEntity(entity);
const [ownPatterns, patternsFromOtherEntities] = await Promise.all([
logPatternsFromEntity.length ? categorizeOwnPatterns() : [],
logPatternsFromElsewhere.length ? selectRelevantPatternsFromOtherEntities() : [],
]);
logger.trace(
() =>
`Classified log patterns ${JSON.stringify([entity, ownPatterns, patternsFromOtherEntities])}`
);
const allPatterns = [...ownPatterns, ...patternsFromOtherEntities];
const sortedByPValueAsc = orderBy(
allPatterns.filter((pattern) => pattern.change && pattern.change.p_value),
(pattern) => {
return pattern.change.p_value;
},
'asc'
);
const pValueCutOff = getPValueCutoff({ cutoff, max: sortedByPValueAsc[0]?.change.p_value });
return {
ownPatterns: ownPatterns.map((pattern) => ({
...pattern,
interesting: isInterestingPattern(pattern, pValueCutOff),
})),
patternsFromOtherEntities: patternsFromOtherEntities.map((pattern) => ({
...pattern,
interesting: isInterestingPattern(pattern, pValueCutOff),
})),
};
function categorizeOwnPatterns() {
return inferenceClient
.output({
id: 'analyze_log_patterns',
connectorId,
system: systemPrompt,
input: `Based on the following log patterns from
${formatEntity(entity)}, group these patterns into
the following categories:
- normal (patterns that are indicative of ${normalDescription})
- unusual (patterns that are indicative of ${unusualDescription})
- warning (patterns that are indicative of ${warningDescription})
- critical (patterns that are indicative of ${criticalDescription})
## Log patterns:
${preparePatternsForLlm(logPatternsFromEntity)}
`,
schema: {
type: 'object',
properties: {
categories: {
type: 'array',
items: {
type: 'object',
properties: {
relevance: {
type: 'string',
enum: ['normal', 'unusual', 'warning', 'critical'],
},
shortIds: {
type: 'array',
description:
'The pattern IDs you want to group here. Use the pattern short ID.',
items: {
type: 'string',
},
},
},
required: ['relevance', 'shortIds'],
},
},
},
required: ['categories'],
} as const,
})
.then((outputEvent) => {
return outputEvent.output.categories.flatMap((category) => {
return mapIdsBackToPatterns(category.shortIds).map((pattern) => {
return {
...pattern,
relevance: category.relevance,
};
});
});
});
}
function selectRelevantPatternsFromOtherEntities() {
return inferenceClient
.output({
id: 'select_relevant_patterns_from_other_entities',
connectorId,
system: systemPrompt,
input: `Based on the following log patterns that
are NOT from ${serializedOwnEntity}, group these
patterns into the following categories:
- irrelevant (patterns that are not relevant for
${serializedOwnEntity})
- normal (patterns that relevant for
${serializedOwnEntity} and are indicative of ${normalDescription})
- unusual (patterns that are relevant for
${serializedOwnEntity} and are indicative of ${unusualDescription})
- warning (patterns that are relevant for
${serializedOwnEntity} and are indicative of ${warningDescription})
- critical (patterns that are relevant for
${serializedOwnEntity} and are indicative of ${criticalDescription})
Relevant patterns are messages that mention the
investigated entity, or things that are indicative
of critical failures or changes in the entity
that owns the log pattern.
## Log patterns:
${preparePatternsForLlm(logPatternsFromElsewhere)}
`,
schema: {
type: 'object',
properties: {
categories: {
type: 'array',
items: {
type: 'object',
properties: {
relevance: {
type: 'string',
enum: ['irrelevant', 'normal', 'unusual', 'warning', 'critical'],
},
shortIds: {
type: 'array',
description:
'The pattern IDs you want to group here. Use the pattern short ID.',
items: {
type: 'string',
},
},
},
required: ['relevance', 'shortIds'],
},
},
},
required: ['categories'],
} as const,
})
.then((outputEvent) => {
return outputEvent.output.categories.flatMap((category) => {
return mapIdsBackToPatterns(category.shortIds).flatMap((pattern) => {
if (category.relevance === 'irrelevant') {
return [];
}
return [
{
...pattern,
relevance: category.relevance,
},
];
});
});
});
}
function preparePatternsForLlm(patterns: FieldPatternResultWithChanges[]): string {
const groupedByField = groupBy(patterns, (pattern) => pattern.field);
return Object.entries(groupedByField)
.map(([field, patternsForField]) => {
return `### \`${field}\`
#### Patterns
${JSON.stringify(
patternsForField.map((pattern) => {
return {
shortId: patternIdLookupTable.take(pattern.regex),
regex: pattern.regex,
sample: pattern.sample,
highlight: pattern.highlight,
change: pattern.change,
};
})
)}
`;
})
.join('\n\n');
}
function mapIdsBackToPatterns(ids?: string[]) {
return compact(
ids?.map((shortId) => {
const lookupId = patternIdLookupTable.lookup(shortId);
if (!lookupId) {
return undefined;
}
const pattern = patternsByRegex.get(lookupId);
return pattern;
})
);
}
}
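The short-ID round trip above is the contract between this code and the model: each pattern is keyed by its regex, handed to the LLM under a compact ID, and mapped back through `patternsByRegex` afterwards. A minimal sketch of that flow, assuming `ShortIdTable` behaves as it is used here (`take` hands out a short ID for a value, `lookup` reverses it):

```ts
// Sketch only; assumes the ShortIdTable semantics implied by the code above.
const table = new ShortIdTable();

const regex = '^Connection refused to .*$';
const shortId = table.take(regex); // compact ID embedded in the LLM prompt

// ...the model groups patterns and returns short IDs in its structured output...

const original = table.lookup(shortId); // -> the regex, or undefined if unknown
// patternsByRegex.get(original!) then recovers the full pattern object, which
// is exactly what mapIdsBackToPatterns() does while dropping unknown IDs.
```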
function isInterestingPattern(
pattern: Omit<AnalyzedLogPattern, 'interesting'>,
pValueCutOff: number
) {
return (pattern.change.p_value ?? 1) <= pValueCutOff || pattern.relevance !== 'normal';
}
function getPValueCutoff({ max, cutoff }: { max?: number; cutoff?: LogPatternCutOff }) {
if (cutoff?.pValue) {
return cutoff?.pValue;
}
if (cutoff?.significance === 'high') {
return P_VALUE_SIGNIFICANCE_HIGH;
}
if (cutoff?.significance === 'medium') {
return P_VALUE_SIGNIFICANCE_MEDIUM;
}
if (max === undefined) {
return Number.MAX_VALUE;
}
if (max <= P_VALUE_SIGNIFICANCE_HIGH) {
return P_VALUE_SIGNIFICANCE_HIGH;
}
if (max <= P_VALUE_SIGNIFICANCE_MEDIUM) {
return P_VALUE_SIGNIFICANCE_MEDIUM;
}
return Number.MAX_VALUE;
}
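A quick sketch of how the cutoff tiers above resolve. The numeric values of the significance constants are assumptions, chosen only to make the tiers concrete:

```ts
// Assumed for illustration: P_VALUE_SIGNIFICANCE_HIGH = 1e-6,
// P_VALUE_SIGNIFICANCE_MEDIUM = 1e-3 (the real constants live elsewhere).
getPValueCutoff({ cutoff: { pValue: 0.01 } }); // -> 0.01 (an explicit cutoff wins)
getPValueCutoff({ cutoff: { significance: 'high' } }); // -> 1e-6
getPValueCutoff({ max: 5e-7 }); // -> 1e-6 (the most significant pattern fits the high tier)
getPValueCutoff({ max: 5e-4 }); // -> 1e-3 (falls through to the medium tier)
getPValueCutoff({}); // -> Number.MAX_VALUE (no p-value filtering at all)
```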


@@ -0,0 +1,74 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { InferenceClient } from '@kbn/inference-plugin/server';
import { TruncatedDocumentAnalysis } from '@kbn/observability-utils-common/llm/log_analysis/document_analysis';
import { FieldPatternResultWithChanges } from '@kbn/observability-utils-server/entities/get_log_patterns';
import { RCA_SYSTEM_PROMPT_BASE } from '../../prompts';
import { formatEntity } from '../../util/format_entity';
import { serializeKnowledgeBaseEntries } from '../../util/serialize_knowledge_base_entries';
import { ScoredKnowledgeBaseEntry } from '../get_knowledge_base_entries';
import { getInvestigateEntityTaskPrompt } from '../investigate_entity/prompts';
export async function describeEntity({
inferenceClient,
connectorId,
entity,
contextForEntityInvestigation,
analysis,
ownPatterns,
kbEntries,
}: {
inferenceClient: InferenceClient;
connectorId: string;
entity: Record<string, string>;
analysis: TruncatedDocumentAnalysis;
contextForEntityInvestigation: string;
ownPatterns: FieldPatternResultWithChanges[];
kbEntries: ScoredKnowledgeBaseEntry[];
}) {
const system = RCA_SYSTEM_PROMPT_BASE;
const input = `${getInvestigateEntityTaskPrompt({ entity, contextForEntityInvestigation })}
## Context for investigating ${formatEntity(entity)}
${contextForEntityInvestigation}
${serializeKnowledgeBaseEntries(kbEntries)}
## Data samples
${JSON.stringify(analysis)}
## Log patterns
${JSON.stringify(ownPatterns.map(({ regex, sample }) => ({ regex, sample })))}
## Current task
Describe the entity characteristics based on the sample documents and log
patterns. Put it in the context of the investigation process. Mention the reason
why it's being investigated, and how it is related to other entities that were
previously investigated. Mention these three things:
- infrastructure & environment
- communication characteristics (protocols and endpoints)
- context of entity in investigation
You shouldn't mention the log patterns; they will be analyzed elsewhere.
`;
const response = await inferenceClient.output({
id: 'describe_entity',
connectorId,
system,
input,
});
return response.content;
}


@@ -0,0 +1,189 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { InferenceClient } from '@kbn/inference-plugin/server';
import { TruncatedDocumentAnalysis } from '@kbn/observability-utils-common/llm/log_analysis/document_analysis';
import { omit, partition, sumBy } from 'lodash';
import { RCA_SYSTEM_PROMPT_BASE } from '../../prompts';
import { formatEntity } from '../../util/format_entity';
import { serializeKnowledgeBaseEntries } from '../../util/serialize_knowledge_base_entries';
import { AnalyzedLogPattern } from '../analyze_log_patterns';
import { ScoredKnowledgeBaseEntry } from '../get_knowledge_base_entries';
import { getInvestigateEntityTaskPrompt } from '../investigate_entity/prompts';
export interface LogPatternDescription {
content: string;
docCount: number;
interestingPatternCount: number;
ignoredPatternCount: number;
ignoredDocCount: number;
}
export async function describeLogPatterns({
inferenceClient,
connectorId,
entity,
contextForEntityInvestigation,
analysis,
ownPatterns: allOwnPatterns,
patternsFromOtherEntities,
kbEntries,
}: {
inferenceClient: InferenceClient;
connectorId: string;
entity: Record<string, string>;
analysis: TruncatedDocumentAnalysis;
contextForEntityInvestigation: string;
ownPatterns: AnalyzedLogPattern[];
patternsFromOtherEntities: AnalyzedLogPattern[];
kbEntries: ScoredKnowledgeBaseEntry[];
}): Promise<LogPatternDescription> {
const system = RCA_SYSTEM_PROMPT_BASE;
const [ownInterestingPatterns, ignoredOwnPatterns] = partition(
allOwnPatterns,
(pattern) => pattern.interesting
);
const stats = {
docCount: sumBy(allOwnPatterns, (pattern) => pattern.count),
interestingPatternCount: ownInterestingPatterns.length,
otherInterestingPatternCount: patternsFromOtherEntities.length,
ignoredPatternCount: ignoredOwnPatterns.length,
ignoredDocCount: sumBy(ignoredOwnPatterns, (pattern) => pattern.count),
};
const header = `## Log analysis
### Stats for own log patterns:
- ${stats.docCount} documents analyzed
- ${stats.interestingPatternCount} interesting patterns
- ${stats.ignoredPatternCount} ignored patterns, accounting for
${stats.ignoredDocCount} out of ${stats.docCount} documents
- ${stats.otherInterestingPatternCount} relevant patterns from
other entities`;
if (!stats.interestingPatternCount && !stats.otherInterestingPatternCount) {
return {
...stats,
content: `${header}\n\nNo interesting log patterns`,
};
}
const ownLogPatternsPrompt = ownInterestingPatterns.length
? JSON.stringify(
ownInterestingPatterns.map(({ regex, sample, change, count, timeseries }) => ({
regex,
sample,
change,
count,
timeseries: timeseries.map(({ x, y }, index) => {
if (index === change.change_point) {
return `${change.type} at ${new Date(x).toISOString()}: ${y}`;
}
return `${new Date(x).toISOString()}: ${y}`;
}),
}))
)
: 'No own log patterns found';
const otherLogPatternsPrompt = patternsFromOtherEntities.length
? JSON.stringify(
patternsFromOtherEntities.map(
({ regex, sample, change, count, timeseries, metadata, field, highlight }) => ({
regex,
sample,
change,
count,
timeseries: timeseries.map(({ x, y }, index) => {
if (index === change.change_point) {
return `${change.type} at ${new Date(x).toISOString()}: ${y}`;
}
return `${new Date(x).toISOString()}: ${y}`;
}),
entity: omit(metadata, field),
highlight,
})
)
)
: 'No relevant log patterns found from other entities';
const input = `${getInvestigateEntityTaskPrompt({ entity, contextForEntityInvestigation })}
## Context for investigating ${formatEntity(entity)}
${contextForEntityInvestigation}
${serializeKnowledgeBaseEntries(kbEntries)}
## Data samples
${JSON.stringify(analysis)}
## Log patterns from ${formatEntity(entity)}
${ownLogPatternsPrompt}
## Possibly relevant log patterns from other entities
${otherLogPatternsPrompt}
### Interpreting log patterns and samples
The pattern itself is what is consistent across all messages. The values from these parts
are separately given in "constants". There's also a single (random) _sample_ included, with
the variable part being given as well. E.g., if the failure in the sample is not part of the pattern
itself, you should mention that in your analysis.
## Task
Using only the log patterns, describe your observations about the entity.
Group these patterns together based on topic. Some examples of these topics:
- normal operations such as request logs
- connection issues to an upstream dependency
- startup messages
- garbage collection messages
For patterns with change points, describe the trend before and after the change point based
on the data points. E.g.:
- A persisted drop to near-zero after 2020-01-01T05:00:00.000Z
- A spike from 10 to 100 at 2020-01-01T05:00:00.000Z, which went back down
to the average after 2020-01-01T05:02:00.000Z
- A trend change after 2020-01-01T05:00:00.000Z. The values ranged from 10
to 20 before, but then after increased from 20 to 100 until
2020-01-01T05:02:00.000Z.
Do not:
- repeat the variables; instead, repeat the constants.
- repeat the timeseries as a whole, verbatim, in full. However, you can use individual data points + timestamps to illustrate the magnitude of the change, as in the example previously given.
- make up timestamps.
- separately list individual events if you have already mentioned
the pattern.
Statistics:
- ${stats.interestingPatternCount} patterns from ${formatEntity(entity)}
were collected
- ${stats.docCount} logs were categorized
- ${stats.ignoredPatternCount} patterns were deemed uninteresting and accounted
for ${stats.ignoredDocCount} out of the total amount of logs
`;
const response = await inferenceClient.output({
id: 'describe_log_patterns',
connectorId,
system,
input,
});
return {
...stats,
content: response.content,
};
}
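Both prompts above serialize each pattern's time series as one ISO-timestamp line per bucket, annotating the change point inline so the model can anchor its trend description. A toy rendering of that mapping, with made-up data:

```ts
// Made-up data showing what the timeseries.map(...) calls above produce.
const change = { type: 'dip', change_point: 1 };
const timeseries = [
  { x: 1577854800000, y: 120 },
  { x: 1577854860000, y: 3 },
  { x: 1577854920000, y: 2 },
];

const lines = timeseries.map(({ x, y }, index) =>
  index === change.change_point
    ? `${change.type} at ${new Date(x).toISOString()}: ${y}`
    : `${new Date(x).toISOString()}: ${y}`
);
// -> [ '2020-01-01T05:00:00.000Z: 120',
//      'dip at 2020-01-01T05:01:00.000Z: 3',
//      '2020-01-01T05:02:00.000Z: 2' ]
```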


@@ -0,0 +1,438 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { InferenceClient } from '@kbn/inference-plugin/server';
import { Logger } from '@kbn/logging';
import { getEntityKuery } from '@kbn/observability-utils-common/entities/get_entity_kuery';
import {
DocumentAnalysis,
TruncatedDocumentAnalysis,
} from '@kbn/observability-utils-common/llm/log_analysis/document_analysis';
import { sortAndTruncateAnalyzedFields } from '@kbn/observability-utils-common/llm/log_analysis/sort_and_truncate_analyzed_fields';
import { analyzeDocuments } from '@kbn/observability-utils-server/entities/analyze_documents';
import { FieldPatternResultWithChanges } from '@kbn/observability-utils-server/entities/get_log_patterns';
import { ObservabilityElasticsearchClient } from '@kbn/observability-utils-server/es/client/create_observability_es_client';
import { kqlQuery } from '@kbn/observability-utils-server/es/queries/kql_query';
import { rangeQuery } from '@kbn/observability-utils-server/es/queries/range_query';
import { chunk, isEmpty, isEqual } from 'lodash';
import pLimit from 'p-limit';
import {
RCA_PROMPT_DEPENDENCIES,
RCA_PROMPT_ENTITIES,
RCA_SYSTEM_PROMPT_BASE,
} from '../../prompts';
import { chunkOutputCalls } from '../../util/chunk_output_calls';
import { formatEntity } from '../../util/format_entity';
import { serializeKnowledgeBaseEntries } from '../../util/serialize_knowledge_base_entries';
import { toBlockquote } from '../../util/to_blockquote';
import { ScoredKnowledgeBaseEntry } from '../get_knowledge_base_entries';
import { RelatedEntityKeywordSearch } from './write_keyword_searches_for_related_entities';
export interface RelatedEntityFromSearchResults {
entity: { [x: string]: string };
highlight: Record<string, string[]>;
analysis: TruncatedDocumentAnalysis;
}
function getPromptForFoundEntity({ entity, analysis, highlight }: RelatedEntityFromSearchResults) {
return `## Entity: ${formatEntity(entity)}
${toBlockquote(`### Search highlights for ${formatEntity(entity)}
${JSON.stringify(highlight)}`)}
`;
}
function getInputPromptBase({
entity,
analysis,
ownPatterns,
patternsFromOtherEntities,
searches,
context,
kbEntries,
}: {
entity: Record<string, string>;
analysis: TruncatedDocumentAnalysis;
ownPatterns: FieldPatternResultWithChanges[];
patternsFromOtherEntities: FieldPatternResultWithChanges[];
searches: RelatedEntityKeywordSearch[];
context: string;
kbEntries: ScoredKnowledgeBaseEntry[];
}) {
const otherPatternsPrompt = patternsFromOtherEntities.length
? JSON.stringify(
patternsFromOtherEntities.map((pattern) => ({
sample: pattern.sample,
regex: pattern.regex,
}))
)
: 'No relevant log patterns from other entities found';
const logPatternsPrompt = ownPatterns.length
? JSON.stringify(
ownPatterns.map((pattern) => {
return { sample: pattern.sample, regex: pattern.regex };
})
)
: 'No log patterns found';
return `Describe possible relationships to the investigated entity ${formatEntity(entity)}.
## Context
${toBlockquote(context)}
${serializeKnowledgeBaseEntries(kbEntries)}
## Data analysis
${JSON.stringify(analysis)}
## Log patterns for ${formatEntity(entity)}
${logPatternsPrompt}
## Patterns from other entities
${otherPatternsPrompt}
## Search keywords
${searches
.map(({ fragments, appearsAs }) => {
return `## Appears as: ${appearsAs}
### Fragments:
${fragments.map((fragment) => `- \`${fragment}\``).join('\n')}`;
})
.join('\n')}`;
}
function getInputPromptInstructions({ entity }: { entity: Record<string, any> }) {
return `### Indicator strength
In an Observability system, indicators of relationships between entities like
services, hosts, users, or requests can vary in strength. Some indicators
clearly define relationships, while others only suggest correlations. Here's a
breakdown of these indicators into strong, average, and weak categories, with an
additional look at how weak indicators can become strong when combined.
Strong indicators provide definitive links between entities. Distributed tracing
IDs (trace, span, and parent) are among the strongest indicators, as they map
the complete request path across services, showing exact service interactions.
Session or user IDs are also strong indicators, capturing a user's actions
across services or hosts and revealing issues specific to particular users.
Average indicators give helpful context but may require supporting data to
clarify relationships. IP addresses, for instance, are moderately strong for
tracking inter-service calls within controlled environments but are weaker
across public or shared networks where IP reuse is common. URL paths also fall
in this category; they link entities to specific endpoints or service functions
and are moderately strong for tracking interactions between microservices with
known APIs. Port numbers are another average indicator. While they suggest the
service interaction type (HTTP, database), they generally need pairing with IP
addresses or URLs for more accuracy, as port numbers alone are often shared
across different services.
Weak indicators are often too generic to imply a direct relationship but can
suggest possible correlations. Host names, for example, are broad and typically
cover a range of services or applications, especially in large clusters.
Time-based indicators, such as timestamps or TTL values, suggest possible timing
correlations but don't establish a definitive link on their own. Status codes,
like HTTP 500 errors, indicate issues but don't specify causality, often
requiring corroboration with stronger indicators like trace or session IDs.
However, weak indicators can become strong when they appear together. For
instance, a combination of IP address, port, and timestamp can strongly suggest
a direct interaction between services, especially when the same combination is
seen repeatedly or in conjunction with related URLs. Similarly, a host name
combined with a unique URL path can strongly suggest that a specific service or
pod is generating particular request patterns, even if each alone is too
general.
## Relevance to the investigation
Given the context of the investigation, some entities might be very relevant
even if there is no strong evidence of them being a direct dependency of
${formatEntity(entity)}. For instance, the related entity might be an
orchestrating entity, or it might be involved in a specific operation related
to the ongoing issue.
## Identifying entity relationships
Your current task is to identify possible entity relationships for the
investigated entity ${formatEntity(entity)}. You will get some context, document
analysis for the investigated entity, and results from keyword searches that were
extracted from the entity. Based on this data, list entities that could possibly
be related to the given entity and/or the initial context. List the highly
relevant entities first.
## Output
For each possible relationship, describe the following things:
- The related entity (as a key-value pair)
- The indicators you have observed as evidence of the relationship. Include the
strength of the indicator, and the exact pieces of data that are related to it
(field names and values, in both the investigated entity, and the possibly
related entity).
- Reason how the related entity is related to both ${formatEntity(entity)} as a
dependency and the context. For instance, describe who is the caller and callee
or whether that is unclear, based on the data, or explain how it might be
related to the context.
- The overall likelihood of it being a relevant entity.`;
}
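As a toy illustration of the reasoning this prompt asks the model to apply (not code the pipeline runs): individually weak or average indicators upgrade to a strong signal once enough of them co-occur.

```ts
// Illustrative heuristic only; it mirrors the prose above, nothing more.
type IndicatorStrength = 'weak' | 'average' | 'strong';

function combinedStrength(observed: IndicatorStrength[]): IndicatorStrength {
  if (observed.includes('strong')) return 'strong'; // e.g. a shared trace ID
  const averages = observed.filter((s) => s === 'average').length;
  // e.g. IP address + port + matching timestamp seen together
  if (averages >= 2 || observed.length >= 3) return 'strong';
  return averages === 1 ? 'average' : 'weak';
}

combinedStrength(['average', 'average', 'weak']); // -> 'strong'
combinedStrength(['weak']); // -> 'weak'
```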
export async function analyzeFetchedRelatedEntities({
connectorId,
inferenceClient,
esClient,
start,
end,
searches,
groupingFields,
index,
entity,
ownPatterns,
analysis,
patternsFromOtherEntities,
logger: parentLogger,
context,
kbEntries,
}: {
connectorId: string;
inferenceClient: InferenceClient;
esClient: ObservabilityElasticsearchClient;
start: number;
end: number;
searches: RelatedEntityKeywordSearch[];
groupingFields: string[];
index: string | string[];
entity: Record<string, string>;
analysis: {
truncated: TruncatedDocumentAnalysis;
full: DocumentAnalysis;
};
ownPatterns: FieldPatternResultWithChanges[];
patternsFromOtherEntities: FieldPatternResultWithChanges[];
context: string;
logger: Logger;
kbEntries: ScoredKnowledgeBaseEntry[];
}): Promise<{
summaries: string[];
foundEntities: RelatedEntityFromSearchResults[];
}> {
const entityFields = Object.keys(entity);
const logger = parentLogger.get('findRelatedEntities');
logger.debug(
() => `Finding related entities: ${JSON.stringify({ entity, groupingFields, searches })}`
);
const allValuesFromEntity = Array.from(
new Set(analysis.full.fields.flatMap((field) => field.values))
);
const foundEntities = (
await Promise.all(
groupingFields.map((groupingField) => getResultsForGroupingField(groupingField))
)
).flat();
logger.debug(() => `Found ${foundEntities.length} entities via keyword searches`);
const system = `${RCA_SYSTEM_PROMPT_BASE}
${RCA_PROMPT_ENTITIES}
${RCA_PROMPT_DEPENDENCIES}`;
const inputPromptBase = getInputPromptBase({
entity,
analysis: analysis.truncated,
ownPatterns,
patternsFromOtherEntities,
searches,
context,
kbEntries,
});
const foundEntityPrompts = foundEntities.map((foundEntity) => {
return {
text: getPromptForFoundEntity(foundEntity),
id: formatEntity(foundEntity.entity),
};
});
const inputPromptInstructions = getInputPromptInstructions({ entity });
// don't do more than 10 entities in a response, or we'll run out of
// tokens
const requests = chunk(foundEntityPrompts, 10).flatMap((texts) =>
chunkOutputCalls({
system,
input: `${inputPromptBase} ${inputPromptInstructions}`,
texts,
tokenLimit: 32_000 - 6_000,
})
);
const allRelevantEntityDescriptions = await Promise.all(
requests.map(async (request) => {
const outputCompleteEvent = await inferenceClient.output({
id: 'describe_relevant_entities',
connectorId,
system: request.system,
input: `${inputPromptBase}
# Found entities
${request.texts.map((text) => text.text).join('\n\n')}
${inputPromptInstructions}`,
});
return outputCompleteEvent.content;
})
);
return {
summaries: allRelevantEntityDescriptions,
foundEntities,
};
async function getResultsForGroupingField(
groupingField: string
): Promise<RelatedEntityFromSearchResults[]> {
const excludeQuery = isEqual([groupingField], entityFields)
? `NOT (${groupingField}:"${entity[groupingField]}")`
: ``;
const fieldCaps = await esClient.fieldCaps('check_if_grouping_field_exists', {
fields: [groupingField],
index,
index_filter: {
bool: {
filter: [...rangeQuery(start, end)],
},
},
});
if (isEmpty(fieldCaps.fields[groupingField])) {
return [];
}
const keywordSearchResults = await esClient.search(
'find_related_entities_via_keyword_searches',
{
track_total_hits: false,
index,
query: {
bool: {
must: [...rangeQuery(start, end), ...kqlQuery(excludeQuery)],
should: [
{
multi_match: {
query: searches.flatMap((search) => search.fragments).join(' '),
fields: '*',
},
},
],
minimum_should_match: 1,
},
},
fields: [groupingField],
collapse: {
field: groupingField,
},
highlight: {
fields: {
'*': {},
},
},
_source: false,
size: 1_000,
}
);
if (!keywordSearchResults.hits.hits.length) {
logger.debug(() => `No hits: ${JSON.stringify({ entity, groupingField, searches })}`);
return [];
}
logger.trace(
() =>
`Hits: ${JSON.stringify({
entity,
groupingField,
searches,
count: keywordSearchResults.hits.hits.length,
hits: keywordSearchResults.hits.hits,
})}`
);
const limiter = pLimit(20);
const groupingFieldAnalysis = await Promise.all(
keywordSearchResults.hits.hits.map(async (hit) => {
return limiter(async () => {
const groupValue = hit.fields![groupingField][0] as string;
const analysisForGroupingField = await analyzeDocuments({
esClient,
start,
end,
index,
kuery: getEntityKuery({
[groupingField]: groupValue,
}),
});
const analysisWithRelevantValues = {
...analysisForGroupingField,
fields: analysisForGroupingField.fields
.filter((field) => {
return !field.empty;
})
.map((field) => {
const valuesFoundInEntity = field.values.filter((value) => {
return (
allValuesFromEntity.includes(value) ||
allValuesFromEntity.some((valueFromEntity) => {
return (
typeof valueFromEntity === 'string' &&
typeof value === 'string' &&
(value.includes(valueFromEntity) || valueFromEntity.includes(value))
);
})
);
});
return {
...field,
values: valuesFoundInEntity,
};
}),
};
return {
groupingField,
key: groupValue,
highlight: hit.highlight!,
analysis: sortAndTruncateAnalyzedFields(analysisWithRelevantValues),
};
});
})
);
return groupingFieldAnalysis.map(({ key, highlight, analysis: analysisForGroupingField }) => {
return {
entity: {
[groupingField]: key,
},
highlight,
analysis: analysisForGroupingField,
};
});
}
}
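The field-value filter near the end of `getResultsForGroupingField` keeps only candidate values that overlap with values already seen on the investigated entity, counting substring containment in either direction. A standalone sketch of that predicate, for string values:

```ts
// Standalone sketch of the overlap check used above.
const allValuesFromEntity = ['10.44.0.11:8080', 'cartservice'];

const overlapsEntityValues = (value: string) =>
  allValuesFromEntity.includes(value) ||
  allValuesFromEntity.some(
    (entityValue) => value.includes(entityValue) || entityValue.includes(value)
  );

overlapsEntityValues('10.44.0.11'); // true: substring of '10.44.0.11:8080'
overlapsEntityValues('cartservice-7d9f'); // true: contains 'cartservice'
overlapsEntityValues('paymentservice'); // false: no overlap in either direction
```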


@@ -0,0 +1,159 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import stringify from 'json-stable-stringify';
import pLimit from 'p-limit';
import { RelatedEntityFromSearchResults } from '.';
import {
RCA_PROMPT_DEPENDENCIES,
RCA_PROMPT_ENTITIES,
RCA_SYSTEM_PROMPT_BASE,
} from '../../prompts';
import { RootCauseAnalysisContext } from '../../types';
import { formatEntity } from '../../util/format_entity';
import { getPreviouslyInvestigatedEntities } from '../../util/get_previously_investigated_entities';
import { toBlockquote } from '../../util/to_blockquote';
export interface RelatedEntityDescription {
entity: Record<string, string>;
reason: string;
confidence: string;
}
export async function extractRelatedEntities({
entity,
entityReport,
summaries,
foundEntities,
context,
rcaContext: { events, connectorId, inferenceClient },
}: {
foundEntities: RelatedEntityFromSearchResults[];
entity: Record<string, string>;
entityReport: string;
summaries: string[];
context: string;
rcaContext: Pick<RootCauseAnalysisContext, 'events' | 'connectorId' | 'inferenceClient'>;
}): Promise<{ relatedEntities: RelatedEntityDescription[] }> {
const system = `${RCA_SYSTEM_PROMPT_BASE}
${RCA_PROMPT_ENTITIES}
${RCA_PROMPT_DEPENDENCIES}`;
const previouslyInvestigatedEntities = getPreviouslyInvestigatedEntities({ events });
const previouslyInvestigatedEntitiesPrompt = previouslyInvestigatedEntities.length
? `## Previously investigated entities
${previouslyInvestigatedEntities
.map((prevEntity) => `- ${formatEntity(prevEntity)}`)
.join('\n')}`
: '';
const prompts = summaries.map((summary) => {
return `
# Investigated entity
${formatEntity(entity)}
# Report
${toBlockquote(entityReport)}
# Related entities report
${toBlockquote(summary)}
${previouslyInvestigatedEntitiesPrompt}
# Context
${context}
# Task
Your current task is to extract relevant entities as a data structure from the
related entities report. Order them by relevance to the investigation, put the
most relevant ones first.
`;
});
const limiter = pLimit(5);
const allEvents = await Promise.all(
prompts.map(async (input) => {
const completeEvent = await limiter(() =>
inferenceClient.output({
id: 'get_entity_relationships',
connectorId,
system,
input,
schema: {
type: 'object',
properties: {
related_entities: {
type: 'array',
items: {
type: 'object',
properties: {
entity: {
type: 'object',
properties: {
field: {
type: 'string',
},
value: {
type: 'string',
},
},
required: ['field', 'value'],
},
reason: {
type: 'string',
description: 'Describe why this entity might be relevant. Provide evidence.',
},
confidence: {
type: 'string',
description:
'Describe how confident you are in your conclusion about this relationship: low, moderate, high',
},
},
required: ['entity', 'reason', 'confidence'],
},
},
},
required: ['related_entities'],
} as const,
})
);
return completeEvent.output;
})
);
const foundEntityIds = foundEntities.map(({ entity: foundEntity }) => stringify(foundEntity));
const relatedEntities = allEvents
.flat()
.flatMap((event) => {
return event.related_entities.map((item) => {
return {
entity: { [item.entity.field]: item.entity.value },
reason: item.reason,
confidence: item.confidence,
};
});
})
.filter((item) => {
return foundEntityIds.includes(stringify(item.entity));
});
return {
relatedEntities,
};
}
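The final filter compares entities by their serialized form, and `json-stable-stringify` sorts keys before serializing, so key order can never break the match between what the model returned and what the searches actually found. A small sketch of the difference:

```ts
import stringify from 'json-stable-stringify';

const fromSearch = { 'service.name': 'cart', 'service.environment': 'prod' };
const fromLlm = { 'service.environment': 'prod', 'service.name': 'cart' };

// Plain JSON.stringify preserves insertion order, so these differ:
JSON.stringify(fromSearch) === JSON.stringify(fromLlm); // false
// Stable stringify sorts keys first, so both serialize identically:
stringify(fromSearch) === stringify(fromLlm); // true
```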


@@ -0,0 +1,97 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { InferenceClient } from '@kbn/inference-plugin/server';
import { Logger } from '@kbn/logging';
import {
DocumentAnalysis,
TruncatedDocumentAnalysis,
} from '@kbn/observability-utils-common/llm/log_analysis/document_analysis';
import { ObservabilityElasticsearchClient } from '@kbn/observability-utils-server/es/client/create_observability_es_client';
import { FieldPatternResultWithChanges } from '@kbn/observability-utils-server/entities/get_log_patterns';
import {
analyzeFetchedRelatedEntities,
RelatedEntityFromSearchResults,
} from './analyze_fetched_related_entities';
import {
RelatedEntityKeywordSearch,
writeKeywordSearchForRelatedEntities,
} from './write_keyword_searches_for_related_entities';
import { ScoredKnowledgeBaseEntry } from '../get_knowledge_base_entries';
export type { RelatedEntityFromSearchResults };
export async function findRelatedEntities({
connectorId,
inferenceClient,
start,
end,
index,
esClient,
entity,
analysis,
logger,
context,
ownPatterns,
patternsFromOtherEntities,
kbEntries,
}: {
connectorId: string;
inferenceClient: InferenceClient;
start: number;
end: number;
index: string | string[];
esClient: ObservabilityElasticsearchClient;
entity: Record<string, string>;
analysis: {
truncated: TruncatedDocumentAnalysis;
full: DocumentAnalysis;
};
logger: Logger;
context: string;
ownPatterns: FieldPatternResultWithChanges[];
patternsFromOtherEntities: FieldPatternResultWithChanges[];
kbEntries: ScoredKnowledgeBaseEntry[];
}): Promise<{
searches: RelatedEntityKeywordSearch[];
summaries: string[];
foundEntities: RelatedEntityFromSearchResults[];
}> {
const { groupingFields, searches } = await writeKeywordSearchForRelatedEntities({
connectorId,
inferenceClient,
entity,
analysis: analysis.truncated,
ownPatterns,
context,
kbEntries,
});
const { summaries, foundEntities } = await analyzeFetchedRelatedEntities({
entity,
connectorId,
start,
end,
esClient,
index,
inferenceClient,
searches,
groupingFields,
logger,
analysis,
ownPatterns,
patternsFromOtherEntities,
context,
kbEntries,
});
return {
searches,
summaries,
foundEntities,
};
}


@@ -0,0 +1,199 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { InferenceClient } from '@kbn/inference-plugin/server';
import { TruncatedDocumentAnalysis } from '@kbn/observability-utils-common/llm/log_analysis/document_analysis';
import { FieldPatternResultWithChanges } from '@kbn/observability-utils-server/entities/get_log_patterns';
import { RCA_PROMPT_ENTITIES, RCA_SYSTEM_PROMPT_BASE } from '../../prompts';
import { formatEntity } from '../../util/format_entity';
import { serializeKnowledgeBaseEntries } from '../../util/serialize_knowledge_base_entries';
import { toBlockquote } from '../../util/to_blockquote';
import { ScoredKnowledgeBaseEntry } from '../get_knowledge_base_entries';
const SYSTEM_PROMPT_ADDENDUM = `# Guide: Constructing Keyword Searches to Find Related Entities
When investigating issues like elevated failure rates for a
specific endpoint, you can use the metadata at hand (IP addresses,
URLs, session IDs, tracing IDs, etc.) to build targeted keyword searches.
By extracting meaningful fragments from the data, you can correlate
related services or hosts across distributed systems. Here's how
you can break down the metadata and format your searches.
## Grouping fields
Define grouping fields for the entities you want to extract. For
instance, "service.name" if you are looking for services, or
"kubernetes.pod.name" if you are looking for pods. Focus
on services, unless you are looking for deployment or
configuration changes.
---
## Key Metadata and Search Format
### Example: Investigating a service failure for \`/api/products\`
You can break down various pieces of metadata into searchable
fragments. For each value, include a short description of its
relationship to the investigation. This value will be used
by the system to determine the relevance of a given entity
that matches the search request.
### 1. **IP Address and Port**
- **Fragments:**
- \`"10.44.0.11:8080"\`: Full address.
- \`"10.44.0.11"\`: IP address only.
- \`"8080"\`: Port number.
- **Appears as:** This IP address and port are referenced as
<ip-field-name> and <port-field-name> in the investigated entity
<entity-name>.
### 2. **Outgoing Request URL**
- **Fragments:**
- \`"http://called-service/api/product"\`: Full outgoing URL.
- \`"/api/product*"\`: Endpoint path.
- \`"called-service"\`: Service name of the upstream dependency.
- **Appears as:** These URL fragments appear as attributes.request.url
in the investigated entity <entity-name>. They could appear as referer
in the upstream dependency.
### 3. **Parent and Span IDs**
- **Fragments:**
- \`"000aa"\`: Parent ID.
- \`"000bbb"\`: Span ID.
- **Appears as:** These IDs appear as span.id and parent.id in the
investigated entity <entity-name>. They could be referring to spans
found on upstream or downstream services.
---
## Example Search Format in JSON
To structure your keyword search, format the fragments and their
relationships in a JSON array like this:
\`\`\`json
{
"groupingFields": [ "service.name" ],
"values": [
{
"fragments": [
"10.44.0.11:8080",
"10.44.0.11",
"8080"
],
"appearsAs": "This IP address and port are referenced as <ip-field-name> and <port-field-name> in the investigated entity <entity-name>."
},
{
"fragments": [
"http://<upstream-service>/api/product",
"/api/product",
"<upstream-service>"
],
"relationship": "These URL fragments appear as attributes.request.url in the investigated entity <entity-name>."
},
{
"fragments": [
"000aa",
"000bbb"
],
"relationship": " These ids appear as span.id and parent.id in the investigated entity <entity-name>. They could be referring to spans found on upstream or downstream services"
}
]
}`;
export interface RelatedEntityKeywordSearch {
fragments: string[];
appearsAs: string;
}
export async function writeKeywordSearchForRelatedEntities({
connectorId,
inferenceClient,
entity,
analysis,
ownPatterns,
context,
kbEntries,
}: {
connectorId: string;
inferenceClient: InferenceClient;
entity: Record<string, string>;
analysis: TruncatedDocumentAnalysis;
ownPatterns: FieldPatternResultWithChanges[];
context: string;
kbEntries: ScoredKnowledgeBaseEntry[];
}): Promise<{
groupingFields: string[];
searches: RelatedEntityKeywordSearch[];
}> {
const logPatternsPrompt = ownPatterns.length
? JSON.stringify(
ownPatterns.map((pattern) => ({ regex: pattern.regex, sample: pattern.sample }))
)
: 'No log patterns found';
return inferenceClient
.output({
id: 'extract_keyword_searches',
connectorId,
system: `${RCA_SYSTEM_PROMPT_BASE}
${RCA_PROMPT_ENTITIES}`,
input: `Your current task is to extract keyword searches
to find related entities to the entity ${formatEntity(entity)},
based on the following context:
## Investigation context
${toBlockquote(context)}
${serializeKnowledgeBaseEntries(kbEntries)}
## Data analysis
${JSON.stringify(analysis)}
## Log patterns
${logPatternsPrompt}
## Instructions
${SYSTEM_PROMPT_ADDENDUM}`,
schema: {
type: 'object',
properties: {
groupingFields: {
type: 'array',
items: {
type: 'string',
},
},
searches: {
type: 'array',
items: {
type: 'object',
properties: {
fragments: {
type: 'array',
items: {
type: 'string',
},
},
appearsAs: {
type: 'string',
description:
'Describe in which fields these values appear on the investigated entity. You can mention multiple fields if applicable.',
},
},
required: ['fragments', 'appearsAs'],
},
},
},
required: ['searches', 'groupingFields'],
} as const,
})
.then((event) => event.output);
}


@@ -0,0 +1,96 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { RCA_SYSTEM_PROMPT_BASE } from '../../prompts';
import { RootCauseAnalysisContext } from '../../types';
import { stringifySummaries } from '../../util/stringify_summaries';
type SignificantEventSeverity = 'info' | 'unusual' | 'warning' | 'critical';
type SignificantEventType = 'alert' | 'slo' | 'event';
export interface SignificantEvent {
severity: SignificantEventSeverity;
'@timestamp'?: string;
description: string;
type: SignificantEventType;
}
export interface SignificantEventsTimeline {
events: SignificantEvent[];
}
export async function generateSignificantEventsTimeline({
report,
rcaContext,
}: {
report: string;
rcaContext: RootCauseAnalysisContext;
}): Promise<SignificantEventsTimeline> {
const { connectorId, inferenceClient } = rcaContext;
return await inferenceClient
.output({
id: 'generate_timeline',
system: RCA_SYSTEM_PROMPT_BASE,
connectorId,
input: `Your current task is to generate a timeline
of significant events, based on the given RCA report,
according to a structured schema. This timeline will
be presented to the user as a visualization.
${stringifySummaries(rcaContext)}
# Report
${report}
`,
schema: {
type: 'object',
properties: {
events: {
type: 'array',
items: {
type: 'object',
properties: {
timestamp: {
type: 'string',
description: 'The ISO timestamp of when the event occurred',
},
severity: {
type: 'string',
enum: ['info', 'unusual', 'warning', 'critical'],
},
type: {
type: 'string',
enum: ['alert', 'slo', 'event'],
},
description: {
type: 'string',
description: 'A description of the event',
},
},
required: ['severity', 'description'],
},
},
},
required: ['events'],
} as const,
})
.then((timelineCompleteEvent) => {
return {
events: timelineCompleteEvent.output.events.map((event) => {
return {
'@timestamp': event.timestamp,
severity: event.severity,
type: event.type ?? 'event',
description: event.description,
};
}),
};
});
}
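Only `severity` and `description` are required by the schema above, so the mapping step backfills `type` with `'event'` and passes the optional `timestamp` through as `@timestamp`. A made-up model output and its mapped result:

```ts
// Hypothetical model output, shaped like the schema above.
interface RawTimelineEvent {
  timestamp?: string;
  severity: 'info' | 'unusual' | 'warning' | 'critical';
  type?: 'alert' | 'slo' | 'event';
  description: string;
}

const output: { events: RawTimelineEvent[] } = {
  events: [
    {
      timestamp: '2020-01-01T05:00:00.000Z',
      severity: 'critical',
      type: 'alert',
      description: 'Error rate spiked on the investigated service',
    },
    // The model omitted timestamp and type here:
    { severity: 'info', description: 'Deployment rolled out' },
  ],
};

const timeline = output.events.map((event) => ({
  '@timestamp': event.timestamp,
  severity: event.severity,
  type: event.type ?? 'event',
  description: event.description,
}));
// -> the second event maps to { '@timestamp': undefined, type: 'event', ... }
```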


@@ -0,0 +1,185 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { ShortIdTable } from '@kbn/observability-ai-assistant-plugin/common';
import { decode, encode } from 'gpt-tokenizer';
import { orderBy, sumBy } from 'lodash';
import { RCA_SYSTEM_PROMPT_BASE } from '../../prompts';
import { RootCauseAnalysisContext } from '../../types';
import { formatEntity } from '../../util/format_entity';
import { toBlockquote } from '../../util/to_blockquote';
export interface ScoredKnowledgeBaseEntry {
id: string;
text: string;
tokens: number;
score: number;
truncated?: {
tokens: number;
text: string;
};
}
export async function getKnowledgeBaseEntries({
entity,
context,
rcaContext,
maxTokens: maxTokensForEntries,
}: {
entity: Record<string, string>;
context: string;
rcaContext: RootCauseAnalysisContext;
maxTokens: number;
}): Promise<ScoredKnowledgeBaseEntry[]> {
const response = await rcaContext.observabilityAIAssistantClient.recall({
queries: [
...Object.values(entity).map((value) => ({ text: value, boost: 3 })),
{ text: context },
],
limit: {
tokenCount: Number.MAX_VALUE,
},
});
const { inferenceClient, connectorId } = rcaContext;
const shortIdTable = new ShortIdTable();
const system = RCA_SYSTEM_PROMPT_BASE;
const input = `Re-order the attached documents, based on relevance to the context.
Score them between 1 and 5, based on their relative relevance to each other. The
most relevant doc should be scored 5, and the least relevant doc should be scored
1.
# Entity
${formatEntity(entity)}
# Context
${toBlockquote(context)}
`;
const maxTokensForScoring = rcaContext.tokenLimit - encode(system + input).length - 1_000;
const entriesWithTokens = response.map((entry) => {
return {
id: entry.id,
text: entry.text,
tokens: encode(entry.text),
};
});
const totalTokenCount = sumBy(entriesWithTokens, (entry) => entry.tokens.length);
const truncatedEntriesWithShortIds = entriesWithTokens.map((entry) => {
const tokensForEntry = Math.floor(
(entry.tokens.length / totalTokenCount) * maxTokensForScoring
);
const truncatedText = decode(entry.tokens.slice(0, tokensForEntry));
const isTruncated = tokensForEntry < entry.tokens.length;
return {
id: entry.id,
tokens: entry.tokens,
shortId: shortIdTable.take(entry.id),
text: entry.text,
truncatedText,
isTruncated,
};
});
const scoredEntries = await inferenceClient.output({
id: 'score_entries',
connectorId,
system: RCA_SYSTEM_PROMPT_BASE,
input: `${input}
${truncatedEntriesWithShortIds
.map((entry) => {
return `# ID: ${entry.shortId}
## Text (${entry.isTruncated ? `truncated` : `not truncated`})
${toBlockquote(entry.truncatedText)}
`;
})
.join('\n\n')}
`,
stream: false,
schema: {
type: 'object',
properties: {
docs: {
type: 'array',
items: {
type: 'object',
properties: {
score: {
type: 'number',
description:
'A score between 1 and 5, with 5 being most relevant, and 1 being least relevant',
},
id: {
type: 'string',
},
},
required: ['score', 'id'],
},
},
},
required: ['docs'],
},
} as const);
const scoresById = new Map(scoredEntries.output.docs.map((doc) => [doc.id, doc.score]));
const entriesWithScore = truncatedEntriesWithShortIds.map((entry) => {
const score = scoresById.get(entry.shortId) ?? 0;
return {
...entry,
score,
};
});
const sortedEntries = orderBy(entriesWithScore, (entry) => entry.score, 'desc');
const returnedEntries: ScoredKnowledgeBaseEntry[] = [];
// Track the remaining budget as entries are added; without decrementing it,
// every entry would be compared against the full budget.
let tokensLeft = maxTokensForEntries;
sortedEntries.forEach((entry) => {
  if (entry.tokens.length <= tokensLeft) {
    returnedEntries.push({
      id: entry.id,
      text: entry.text,
      tokens: entry.tokens.length,
      score: entry.score,
    });
    tokensLeft -= entry.tokens.length;
    return;
  }
  const tokensToTake = tokensLeft;
  if (tokensToTake > 0) {
    const tookTokens = entry.tokens.slice(0, tokensToTake);
    tokensLeft -= tookTokens.length;
    returnedEntries.push({
      id: entry.id,
      text: entry.text,
      tokens: entry.tokens.length,
      score: entry.score,
      truncated: {
        text: decode(tookTokens),
        tokens: tookTokens.length,
      },
    });
  }
});
return returnedEntries;
}
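The scoring prompt stays under the connector's token limit by giving each recalled entry a slice of the scoring budget proportional to its length (the `tokensForEntry` computation above). A toy allocation with made-up numbers:

```ts
// Toy numbers: three entries totalling 10,000 tokens, 2,000-token budget.
const entryTokenCounts = [6_000, 3_000, 1_000];
const totalTokenCount = 10_000;
const maxTokensForScoring = 2_000;

const perEntryBudget = entryTokenCounts.map((tokens) =>
  Math.floor((tokens / totalTokenCount) * maxTokensForScoring)
);
// -> [1200, 600, 200]: every entry is truncated, but proportionally, so the
// model still sees a representative slice of each document while ranking.
```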


@@ -0,0 +1,268 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { getEntityKuery } from '@kbn/observability-utils-common/entities/get_entity_kuery';
import { sortAndTruncateAnalyzedFields } from '@kbn/observability-utils-common/llm/log_analysis/sort_and_truncate_analyzed_fields';
import { analyzeDocuments } from '@kbn/observability-utils-server/entities/analyze_documents';
import { getDataStreamsForEntity } from '@kbn/observability-utils-server/entities/get_data_streams_for_entity';
import { getAlertsForEntity } from '@kbn/observability-utils-server/entities/signals/get_alerts_for_entity';
import { getSlosForEntity } from '@kbn/observability-utils-server/entities/signals/get_slos_for_entity';
import { ObservabilityElasticsearchClient } from '@kbn/observability-utils-server/es/client/create_observability_es_client';
import { RootCauseAnalysisContext } from '../../types';
import { stringifySummaries } from '../../util/stringify_summaries';
import { analyzeLogPatterns } from '../analyze_log_patterns';
import { describeEntity } from '../describe_entity';
import { describeLogPatterns } from '../describe_log_patterns';
import { findRelatedEntities } from '../find_related_entities';
import { extractRelatedEntities } from '../find_related_entities/extract_related_entities';
import { writeEntityInvestigationReport } from '../write_entity_investigation_report';
import { EntityInvestigation } from './types';
import { getKnowledgeBaseEntries } from '../get_knowledge_base_entries';
export type { EntityInvestigation };
export interface EntityInvestigationParameters {
entity: Record<string, string>;
rcaContext: RootCauseAnalysisContext;
context: string;
}
export async function investigateEntity(
parameters: EntityInvestigationParameters
): Promise<EntityInvestigation | undefined> {
const {
entity,
rcaContext,
rcaContext: {
inferenceClient,
connectorId,
start,
end,
esClient,
logger: parentLogger,
indices,
},
context,
} = parameters;
const kuery = getEntityKuery(entity);
const logger = parentLogger.get('investigateEntity');
logger.debug(() => `Investigating entity: ${JSON.stringify(parameters.entity)}`);
const kbPromise = getKnowledgeBaseEntries({
entity,
context,
rcaContext,
maxTokens: 4_000,
}).catch((error) => {
logger.error(`Could not fetch entries from knowledge base`);
logger.error(error);
return [];
});
const [{ dataStreams }, alerts, slos] = await getSignals({ ...parameters, kuery });
logger.debug(
() =>
`Signals for entity ${JSON.stringify(entity)}: ${dataStreams.length} data streams, ${
alerts.length
} alerts, ${slos.length} slos`
);
if (!dataStreams.length) {
return undefined;
}
const fullAnalysis = await analyzeDataStreamsForEntity({
start,
end,
esClient,
kuery,
dataStreams,
});
const truncatedAnalysis = sortAndTruncateAnalyzedFields(fullAnalysis);
const kbEntries = await kbPromise;
const { ownPatterns, patternsFromOtherEntities } = await analyzeLogPatterns({
allAnalysis: [{ index: dataStreams, analysis: truncatedAnalysis }],
entity,
system: stringifySummaries(rcaContext),
cutoff: {
significance: 'high',
},
rcaContext,
kbEntries,
});
logger.trace(
() => `Analyzed log patterns: ${JSON.stringify({ ownPatterns, patternsFromOtherEntities })}`
);
const entityReportPromise = Promise.all([
describeEntity({
inferenceClient,
analysis: truncatedAnalysis,
connectorId,
contextForEntityInvestigation: context,
entity,
ownPatterns,
kbEntries,
}),
describeLogPatterns({
analysis: truncatedAnalysis,
connectorId,
contextForEntityInvestigation: context,
entity,
inferenceClient,
ownPatterns,
patternsFromOtherEntities,
kbEntries,
}),
]).then(([entityDescription, logPatternDescription]) => {
return writeEntityInvestigationReport({
connectorId,
inferenceClient,
entityDescription,
logPatternDescription,
contextForEntityInvestigation: context,
entity,
}).then((report) => {
return {
description: entityDescription,
logPatternDescription,
report,
};
});
});
const [entityReport, relatedEntitiesResults] = await Promise.all([
entityReportPromise,
findRelatedEntities({
connectorId,
end,
entity,
esClient,
index: indices.logs,
inferenceClient,
logger,
start,
context,
analysis: {
full: fullAnalysis,
truncated: truncatedAnalysis,
},
ownPatterns,
patternsFromOtherEntities,
kbEntries,
}).then(async ({ searches, summaries, foundEntities }) => {
const report = await entityReportPromise;
const { relatedEntities } = await extractRelatedEntities({
entityReport: report.report,
summaries,
entity,
foundEntities,
context,
rcaContext,
});
return {
relatedEntities,
foundEntities,
searches,
summaries,
};
}),
]);
return {
entity,
summary: [
entityReport.description,
entityReport.logPatternDescription.content,
entityReport.report,
].join('\n\n'),
relatedEntities: relatedEntitiesResults.relatedEntities,
attachments: {
alerts,
slos,
analysis: truncatedAnalysis,
ownPatterns,
patternsFromOtherEntities,
searches: relatedEntitiesResults.searches,
relatedEntitiesSummaries: relatedEntitiesResults.summaries,
kbEntries,
},
};
}
async function getSignals({
entity,
kuery,
rcaContext: { start, end, esClient, rulesClient, alertsClient, indices, spaceId },
}: {
kuery: string;
entity: Record<string, unknown>;
rcaContext: Pick<
RootCauseAnalysisContext,
'start' | 'end' | 'esClient' | 'rulesClient' | 'alertsClient' | 'indices' | 'spaceId'
>;
}) {
return await Promise.all([
getDataStreamsForEntity({
esClient,
kuery,
index: indices.logs.concat(indices.traces),
}),
getAlertsForEntity({ entity, rulesClient, alertsClient, start, end, size: 10 }).then(
(alertsResponse) => {
return alertsResponse.hits.hits.map((hit) => hit._source!);
}
),
getSlosForEntity({
entity,
start,
end,
esClient,
size: 1000,
sloSummaryIndices: indices.sloSummaries,
spaceId,
}).then((slosResponse) => {
return slosResponse.hits.hits.map((hit) => hit._source);
}),
]);
}
async function analyzeDataStreamsForEntity({
start,
end,
dataStreams,
esClient,
kuery,
}: {
start: number;
end: number;
kuery: string;
dataStreams: string[];
esClient: ObservabilityElasticsearchClient;
}) {
const analysis = await analyzeDocuments({
esClient,
start,
end,
index: dataStreams,
kuery,
});
return {
...analysis,
fields: analysis.fields.filter((field) => !field.empty),
};
}
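Worth seeing in isolation is the concurrency shape of `investigateEntity`: the knowledge-base recall starts first and is only awaited once the document analysis is done, and the related-entities branch re-awaits the already in-flight report promise instead of producing a second report. A stripped-down sketch, where every function is a hypothetical stand-in and only the await structure mirrors the code above:

```ts
// Hypothetical stand-ins; only the await structure mirrors investigateEntity.
const fetchKnowledgeBase = async () => ['kb entry'];
const analyzeData = async () => ({ fields: [] as string[] });
const writeReport = async (_analysis: unknown, _kb: string[]) => 'report';
const findRelated = async (_analysis: unknown) => ['found entity'];
const extractRelated = async (_report: string, found: string[]) => found;

async function investigateSketch() {
  const kbPromise = fetchKnowledgeBase().catch(() => []); // started early, never throws
  const analysis = await analyzeData(); // runs while the recall is in flight
  const kbEntries = await kbPromise;

  const reportPromise = writeReport(analysis, kbEntries);
  const [report, relatedEntities] = await Promise.all([
    reportPromise,
    findRelated(analysis).then(async (found) => {
      const finishedReport = await reportPromise; // reused, not recomputed
      return extractRelated(finishedReport, found);
    }),
  ]);
  return { report, relatedEntities };
}
```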


@@ -0,0 +1,22 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { formatEntity } from '../../util/format_entity';
import { toBlockquote } from '../../util/to_blockquote';
export const getInvestigateEntityTaskPrompt = ({
entity,
contextForEntityInvestigation,
}: {
entity: Record<string, string>;
contextForEntityInvestigation: string;
}) => `## Entity-Based Investigation: Task Guide
In the investigation process, you are currently investigating the entity
${formatEntity(entity)}. The context given for this investigation is:
${toBlockquote(contextForEntityInvestigation)}`;


@@ -0,0 +1,31 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { ParsedTechnicalFields } from '@kbn/rule-registry-plugin/common';
import type { TruncatedDocumentAnalysis } from '@kbn/observability-utils-common/llm/log_analysis/document_analysis';
import type { AnalyzeLogPatternOutput } from '../analyze_log_patterns';
import type { RelatedEntityDescription } from '../find_related_entities/extract_related_entities';
import type { RelatedEntityKeywordSearch } from '../find_related_entities/write_keyword_searches_for_related_entities';
import type { ScoredKnowledgeBaseEntry } from '../get_knowledge_base_entries';
export interface EntityInvestigation {
entity: Record<string, string>;
summary: string;
relatedEntities: RelatedEntityDescription[];
attachments: {
analysis: TruncatedDocumentAnalysis;
slos: Array<
Record<string, any> & {
status: 'VIOLATED' | 'DEGRADED' | 'HEALTHY' | 'NO_DATA';
}
>;
alerts: ParsedTechnicalFields[];
searches: RelatedEntityKeywordSearch[];
relatedEntitiesSummaries: string[];
kbEntries: ScoredKnowledgeBaseEntry[];
} & AnalyzeLogPatternOutput;
}


@@ -0,0 +1,239 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { MessageRole } from '@kbn/inference-common';
import { RCA_OBSERVE_TOOL_NAME } from '@kbn/observability-ai-common/root_cause_analysis';
import { RCA_PROMPT_CHANGES, RCA_PROMPT_ENTITIES, RCA_SYSTEM_PROMPT_BASE } from '../../prompts';
import { ObservationToolMessage, RootCauseAnalysisContext } from '../../types';
import { formatEntity } from '../../util/format_entity';
import { getPreviouslyInvestigatedEntities } from '../../util/get_previously_investigated_entities';
import { stringifySummaries } from '../../util/stringify_summaries';
import { toBlockquote } from '../../util/to_blockquote';
import { EntityInvestigation } from '../investigate_entity/types';
const INITIAL_OBSERVATION_TASK_GUIDE = `Your current task is to write observations based on the initial context. You
should acknowledge the context briefly, and mention key observations from the
initial context.
Then, briefly describe what change you are looking for. Are the symptoms:
- rapid, or gradual onset?
- subtle or pronounced?
If possible, mention the time of the change.
When considering the initial context, reason about relevant changes to observe,
such as short-lived versus persistent changes or singular events, like scale
events, rollouts, or configuration changes.
After, taking into account the capabilities you have, plan for next steps.
Describe the next step, which is to investigate the entity found in the initial
context. Only mention the entity (as a field/value). Do not mention any
additional filters.
Be brief, accurate, and critical.`;
const INVESTIGATION_ADDENDUM = `
**Task Guide: Observe the investigation results**
You will receive one or more investigations. These investigations mention:
- a general characterization of the entity based on its data
- relevant log patterns
- other signals, like SLOs or alerts
- possibly related entities, and investigation suggestions
First, you should briefly acknowledge the initial context of the investigation
and where it stands.
Next, you should note key observations from the investigations, and how they relate
to the ongoing investigation.
After, you should generate a timeline of significant events. For this timeline,
include events from previous observations. Additionally, include significant
events from the inspected investigations. Group events together in a topic
if needed. Significant events are things like: an increase in errors, deployment
events, a drop to zero for access logs, etc. In most cases, you do not want to
mention individual log messages, unless it is a particularly significant event
by itself.
For each event, mention:
- the timestamp of the event
- the nature of the change, if applicable
- data from the event, such as specific log patterns, alerts or slos
- the meaning of the event and how it is related to the initial context
Do not include:
- the time range from the investigation itself (start/end)
- other events that occurred during the investigation itself, like running
log analysis or other patterns
## Correlating significant events
When correlating significant events, pay close attention to the timestamp of
the mentioned change, and how it correlates to the timestamp of the change you
want to correlate it to, such as the start time of an alert. An alert might be
delayed, but if you see many changes clustered around a specific timestamp and
some that are significantly earlier or later, the latter group is likely not
relevant.
## Context and reasoning
Next, use the timeline of events and the new observations to revise your
analysis of the initial context and the ongoing investigation. Reason about
how changes could be related: are they close in time, or far removed, compared
to others? Is the type of change similar? Is the magnitude of the change similar?`;
const SUGGEST_NEXT_STEPS_PROMPT = `
Next, consider next steps. It's always important to contextualize the significant
events in the initial context of the investigation. Focus on your strongest pieces of
evidence. Your observations should be related to finding out the cause of the
initial context of the investigation - you should not concern yourself with the
impact on _other_ entities.
Suggest concluding the process when:
- there is a clear and obvious root cause
- you have investigated more than 10 entities
- OR you cannot find any unhealthy entities
- there are no more entities to investigate
If the conclusion is you need to continue your investigation, mention the entities
that should be investigated. Do this only if there is a significant chance that one of
the related entities will give you new insights into the root cause (instead of
just the impact). DO NOT investigate an entity more than once.`;
const CONCLUDE_PROCESS_PROMPT = `
You must suggest concluding the process and writing the final report, as your
capabilities do not allow you to investigate more entities.`;
function getInitialPrompts(initialContext: string) {
return {
system: `${RCA_SYSTEM_PROMPT_BASE}
${RCA_PROMPT_ENTITIES}
${RCA_PROMPT_CHANGES}`,
input: `## Context
${initialContext}
${INITIAL_OBSERVATION_TASK_GUIDE}`,
};
}
function getObserveInvestigationsPrompts({
investigations,
summaries,
rcaContext,
}: {
investigations: EntityInvestigation[];
summaries: ObservationStepSummary[];
rcaContext: RootCauseAnalysisContext;
}) {
const previouslyInvestigatedEntities = getPreviouslyInvestigatedEntities(rcaContext);
const canContinue =
summaries.length <= 5 &&
investigations.filter((investigation) => 'summary' in investigation).length <= 10;
const investigationsPrompt = `Observe the following investigations that recently concluded:
${investigations
.map((investigation, index) => {
return `## ${index + 1}: investigation of ${formatEntity(investigation.entity)}
${toBlockquote(investigation.summary)}
${
investigation.relatedEntities.length
? `### Relationships to ${formatEntity(investigation.entity)}
${toBlockquote(JSON.stringify(investigation.relatedEntities))}
`
: ``
}
`;
})
.join('\n\n')}
${INVESTIGATION_ADDENDUM}
${
canContinue
? `${SUGGEST_NEXT_STEPS_PROMPT}
${
previouslyInvestigatedEntities.length
? `The following entities have been investigated previously.
Do not investigate them again:
${previouslyInvestigatedEntities.map((entity) => `- ${JSON.stringify(entity)}`).join('\n')}`
: ``
}
`
: CONCLUDE_PROCESS_PROMPT
}
`;
const systemPrompt = `${RCA_SYSTEM_PROMPT_BASE}
${RCA_PROMPT_ENTITIES}
${stringifySummaries(rcaContext)}`;
return {
system: systemPrompt,
input: investigationsPrompt,
};
}
export interface ObservationStepSummary {
investigations: EntityInvestigation[];
content: string;
}
export function observeInvestigationResults({
rcaContext,
rcaContext: { logger, events, initialContext, inferenceClient, connectorId },
investigations,
}: {
rcaContext: RootCauseAnalysisContext;
investigations: EntityInvestigation[];
}): Promise<ObservationStepSummary> {
const summaries = events
.filter((event): event is ObservationToolMessage => {
return event.role === MessageRole.Tool && event.name === RCA_OBSERVE_TOOL_NAME;
})
.map((event) => event.data);
logger.debug(
() =>
`Observing ${investigations.length} investigations (${summaries.length} previous summaries)`
);
const { system, input } = investigations.length
? getObserveInvestigationsPrompts({ summaries, investigations, rcaContext })
: getInitialPrompts(initialContext);
return inferenceClient
.output({
id: 'observe',
system,
input,
connectorId,
})
.then((outputCompleteEvent) => {
return {
content: outputCompleteEvent.content,
investigations,
};
});
}

View file

@ -0,0 +1,84 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { InferenceClient } from '@kbn/inference-plugin/server';
import { RCA_PROMPT_SIGNIFICANT_EVENTS, RCA_SYSTEM_PROMPT_BASE } from '../../prompts';
import { formatEntity } from '../../util/format_entity';
import { toBlockquote } from '../../util/to_blockquote';
import { LogPatternDescription } from '../describe_log_patterns';
import { getInvestigateEntityTaskPrompt } from '../investigate_entity/prompts';
export async function writeEntityInvestigationReport({
inferenceClient,
connectorId,
entity,
contextForEntityInvestigation,
entityDescription,
logPatternDescription,
}: {
inferenceClient: InferenceClient;
connectorId: string;
entity: Record<string, string>;
contextForEntityInvestigation: string;
entityDescription: string;
logPatternDescription: LogPatternDescription;
}): Promise<string> {
const system = RCA_SYSTEM_PROMPT_BASE;
const shouldGenerateTimeline = logPatternDescription.interestingPatternCount > 0;
let input = `${getInvestigateEntityTaskPrompt({ entity, contextForEntityInvestigation })}
## Entity description
${toBlockquote(entityDescription)}
## Log pattern analysis
${toBlockquote(logPatternDescription.content)}
# Current task
Your current task is to write a report of the investigation into ${formatEntity(entity)}.
The log pattern analysis and entity description will be added to your report (at the
top), so you don't need to repeat anything in it.`;
if (shouldGenerateTimeline) {
input += `\n\n${RCA_PROMPT_SIGNIFICANT_EVENTS}`;
}
input += `\n\n## Context and reasoning
Reason about the role that the entity plays in the investigation, given the context.
Mention evidence (hard pieces of data) when reasoning.
Do not suggest next steps - this will happen in a follow-up task.`;
if (shouldGenerateTimeline) {
input += `\n\n## Format
Your reply should only contain two sections:
- Timeline of significant events
- Context and reasoning
`;
} else {
input += `\n\n## Format
Your reply should only contain one section:
- Context and reasoning
`;
}
const response = await inferenceClient.output({
id: 'generate_entity_report',
connectorId,
input,
system,
});
return response.content;
}

View file

@ -0,0 +1,191 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { RCA_PROMPT_TIMELINE_GUIDE, RCA_SYSTEM_PROMPT_BASE } from '../../prompts';
import { RootCauseAnalysisContext } from '../../types';
import { stringifySummaries } from '../../util/stringify_summaries';
const SYSTEM_PROMPT_ADDENDUM = `
# Guide: Writing a Root Cause Analysis (RCA) Report
A Root Cause Analysis (RCA) report is the final step in a thorough
investigation. Its purpose is to provide a clear, evidence-backed explanation of
the underlying cause of an issue, as well as the impact. Even if no definitive
root cause is identified, the report should reflect the findings, the hypotheses
considered, and why certain assumptions were rejected. This guide will help
structure an RCA that distinguishes between cause and effect, organizes
evidence, and presents a timeline of key events.
---
## 1. Introduction
Start by summarizing the reason for the investigation. Provide a brief overview
of the incident, the affected services or entities, and the initial alerts or
issues that triggered the investigation.
- **What prompted the investigation?**
- **Which entities were investigated?**
- **Was there a specific hypothesis proposed at the outset?**
### Example:
- **Overview:** This RCA report investigates the elevated error rates in
\`myservice\` and its downstream dependencies, first identified through an SLO
breach for the \`/api/submit\` endpoint. The investigation considered multiple
entities and possible causes, including resource exhaustion and upstream service
failures.
---
## 2. Investigation Summary
Summarize the key steps of the investigation, outlining:
- **What hypotheses were proposed and why.**
- **Which entities were investigated (e.g., \`myservice\`, \`myotherservice\`,
\`notification-service\`).**
- **Which hypotheses were discarded and why.**
For each hypothesis, present the supporting or contradicting evidence.
- **Strong Indicators:** Clear, repeated evidence pointing toward or against a
hypothesis.
- **Weak Indicators:** Inconsistent or ambiguous data that did not provide
conclusive answers.
#### Example Format:
- **Hypothesis 1:** Resource exhaustion in \`myservice\` caused elevated error
rates.
- **Evidence:**
- **Strong:** Memory usage exceeded 90% during the incident.
- **Weak:** CPU usage remained stable, making resource exhaustion a partial
explanation.
- **Hypothesis 2:** Upstream latency from \`myotherservice\` caused delays.
- **Evidence:**
- **Strong:** API logs showed frequent retries and timeouts from
\`myotherservice\`.
- **Weak:** No errors were observed in \`myotherservice\` logs, suggesting an
issue isolated to \`myservice\`.
---
## 3. Cause and Effect
Differentiate between the **cause** (what initiated the issue) and the
**effect** (the impact or symptoms seen across the system). The cause should
focus on the root, while the effect describes the wider system response or
failure.
- **Root Cause:** Identify the underlying problem, supported by strong evidence.
If no root cause is found, clearly state that the investigation did not lead to
a conclusive root cause.
- **Impact:** Describe the downstream effects on other services, performance
degradation, or SLO violations.
#### Example:
- **Cause:** The root cause of the elevated error rate was identified as a
memory leak in \`myservice\` that gradually led to resource exhaustion.
- **Effect:** This led to elevated latency and increased error rates at the
\`/api/submit\` endpoint, impacting downstream services like
\`notification-service\` that rely on responses from \`myservice\`.
---
## 4. Evidence for Root Cause
Present a structured section summarizing all the evidence that supports the
identified root cause. If no root cause is identified, outline the most
significant findings that guided or limited the investigation.
- **Log Patterns:** Describe any abnormal log patterns observed, including
notable change points.
- **Alerts and SLOs:** Mention any alerts or breached SLOs that were triggered,
including their relevance to the investigation.
- **Data Analysis:** Include any data trends or patterns that were analyzed
(e.g., resource usage spikes, network traffic).
#### Example:
- **Memory Usage:** Logs showed a steady increase in memory consumption starting
at 10:00 AM, peaking at 12:00 PM, where memory usage surpassed 90%, triggering
the alert.
- **Error Rate Logs:** Error rates for \`/api/submit\` began increasing around
11:30 AM, correlating with the memory pressure in \`myservice\`.
- **API Logs:** \`myotherservice\` API logs showed no internal errors, ruling out
an upstream dependency as the primary cause.
---
## 5. Proposed Impact
Even if the root cause is clear, it is important to mention the impact of the
issue on the system, users, and business operations. This includes:
- **Affected Services:** Identify the services impacted (e.g., downstream
dependencies).
- **Performance Degradation:** Describe any SLO breaches or performance
bottlenecks.
- **User Impact:** Explain how users or clients were affected (e.g., higher
latency, failed transactions).
#### Example:
- **Impact:** The memory leak in \`myservice\` caused service degradation over a
2-hour window. This affected \`/api/submit\`, causing delays and failed
requests, ultimately impacting user-facing services relying on that endpoint.
---
## 6. Timeline of Significant Events
${RCA_PROMPT_TIMELINE_GUIDE}
---
## 7. Conclusion and Next Steps
Summarize the conclusions of the investigation:
- If a root cause was identified, confirm it with the strongest supporting
evidence.
- If no root cause was found, state that clearly and suggest areas for further
investigation or monitoring.
Finally, outline the next steps:
- **Fixes or Mitigations:** Recommend any immediate actions (e.g., patch
deployment, configuration changes).
- **Monitoring Improvements:** Suggest new alerts or monitoring metrics based on
lessons learned.
- **Further Investigations:** If necessary, propose any follow-up investigations
to gather more evidence.
#### Example:
- **Conclusion:** The root cause of the incident was a memory leak in
\`myservice\`, leading to resource exhaustion and elevated error rates at
\`/api/submit\`. The leak has been patched, and monitoring has been improved to
detect memory spikes earlier.
- **Next Steps:** Monitor memory usage for the next 24 hours to ensure no
recurrence. Investigate adding a memory ceiling for \`myservice\` to prevent
future resource exhaustion.`;
export async function writeFinalReport({
rcaContext,
}: {
rcaContext: RootCauseAnalysisContext;
}): Promise<string> {
const { inferenceClient, connectorId } = rcaContext;
return await inferenceClient
.output({
id: 'write_final_report',
connectorId,
system: `${RCA_SYSTEM_PROMPT_BASE}
${SYSTEM_PROMPT_ADDENDUM}`,
input: `Write the RCA report, based on the observations.
${stringifySummaries(rcaContext)}`,
})
.then((event) => event.content);
}

View file

@ -0,0 +1,77 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import {
RCA_END_PROCESS_TOOL_NAME,
RCA_INVESTIGATE_ENTITY_TOOL_NAME,
RCA_OBSERVE_TOOL_NAME,
} from '@kbn/observability-ai-common/root_cause_analysis/tool_names';
export const RCA_TOOLS = {
[RCA_OBSERVE_TOOL_NAME]: {
description: `Request an observation from another agent on
the results of the returned investigations. The title should
cover key new observations from the initial context or
completed investigations, not anything about next steps.`,
schema: {
type: 'object',
properties: {
title: {
type: 'string',
description: `A short title with the key new observations that will be displayed on top of a collapsible panel.`,
},
},
required: ['title'],
},
},
[RCA_END_PROCESS_TOOL_NAME]: {
description: `End the RCA process by requesting a
written report from another agent`,
schema: {
type: 'object',
properties: {
endProcess: {
type: 'boolean',
},
},
required: ['endProcess'],
},
},
[RCA_INVESTIGATE_ENTITY_TOOL_NAME]: {
description: `Investigate an entity`,
schema: {
type: 'object',
properties: {
context: {
type: 'string',
description: `Context that will be used in the investigation of the entity. Mention the initial context
of the investigation, a very short summary of the last observation if applicable, and pieces
of data that can be relevant for the investigation into the entity, such as timestamps or
keywords`,
},
entity: {
type: 'object',
description: `The entity you want to investigate, such as a service. Use
the Elasticsearch field names and values. For example, for services, use
the following structure: ${JSON.stringify({
entity: { field: 'service.name', value: 'opbeans-java' },
})}`,
properties: {
field: {
type: 'string',
},
value: {
type: 'string',
},
},
required: ['field', 'value'],
},
},
required: ['context', 'entity'],
},
},
} as const;
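// Illustrative only (not part of the original file): a tool call that
// satisfies the investigate-entity schema above. The ids and values are
// hypothetical; only the shape of `function.name` and `function.arguments`
// is dictated by the schema.
// {
//   toolCallId: 'call-1',
//   function: {
//     name: RCA_INVESTIGATE_ENTITY_TOOL_NAME,
//     arguments: {
//       context: 'Elevated error rates on /api/submit since 11:30; inspect the upstream dependency',
//       entity: { field: 'service.name', value: 'opbeans-java' },
//     },
//   },
// }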

View file

@ -0,0 +1,101 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import {
ToolMessage,
UserMessage,
ToolCallsOf,
ToolChoice,
AssistantMessageOf,
} from '@kbn/inference-common';
import { InferenceClient } from '@kbn/inference-plugin/server';
import { Logger } from '@kbn/logging';
import { AlertsClient } from '@kbn/rule-registry-plugin/server';
import { RulesClient } from '@kbn/alerting-plugin/server';
import { ObservabilityAIAssistantClient } from '@kbn/observability-ai-assistant-plugin/server';
import { ObservabilityElasticsearchClient } from '@kbn/observability-utils-server/es/client/create_observability_es_client';
import {
RCA_END_PROCESS_TOOL_NAME,
RCA_INVESTIGATE_ENTITY_TOOL_NAME,
RCA_OBSERVE_TOOL_NAME,
} from '@kbn/observability-ai-common/root_cause_analysis';
import { ObservationStepSummary } from './tasks/observe_investigation_results';
import { EntityInvestigation } from './tasks/investigate_entity';
import { SignificantEventsTimeline } from './tasks/generate_timeline';
import { RCA_TOOLS } from './tools';
export type EndProcessToolMessage = ToolMessage<
typeof RCA_END_PROCESS_TOOL_NAME,
{
report: string;
timeline: SignificantEventsTimeline;
}
>;
export type ObservationToolMessage = ToolMessage<
typeof RCA_OBSERVE_TOOL_NAME,
{
content: string;
},
ObservationStepSummary
>;
export type InvestigateEntityToolMessage = ToolMessage<
typeof RCA_INVESTIGATE_ENTITY_TOOL_NAME,
Pick<EntityInvestigation, 'entity' | 'summary' | 'relatedEntities'>,
{ attachments: EntityInvestigation['attachments'] }
>;
export type ToolErrorMessage = ToolMessage<
'error',
{
error: {
message: string;
};
}
>;
export type RootCauseAnalysisEvent =
| RootCauseAnalysisToolMessage
| ToolErrorMessage
| UserMessage
| AssistantMessageOf<{
tools: typeof RCA_TOOLS;
toolChoice?: ToolChoice<keyof typeof RCA_TOOLS>;
}>;
export type RootCauseAnalysisToolRequest<
TToolName extends keyof typeof RCA_TOOLS = keyof typeof RCA_TOOLS
> = ToolCallsOf<{
tools: Pick<typeof RCA_TOOLS, TToolName>;
}>['toolCalls'][number];
export type RootCauseAnalysisToolMessage =
| EndProcessToolMessage
| InvestigateEntityToolMessage
| ObservationToolMessage;
export interface RootCauseAnalysisContext {
initialContext: string;
start: number;
end: number;
events: RootCauseAnalysisEvent[];
indices: {
logs: string[];
traces: string[];
sloSummaries: string[];
};
inferenceClient: InferenceClient;
tokenLimit: number;
connectorId: string;
esClient: ObservabilityElasticsearchClient;
alertsClient: AlertsClient;
rulesClient: RulesClient;
logger: Logger;
spaceId: string;
observabilityAIAssistantClient: ObservabilityAIAssistantClient;
}

View file

@ -0,0 +1,177 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import {
Message,
ToolDefinition,
ToolChoice,
ToolCallsOf,
withoutChunkEvents,
withoutTokenCountEvents,
ToolMessage,
MessageOf,
MessageRole,
} from '@kbn/inference-common';
import { InferenceClient } from '@kbn/inference-plugin/server';
import { Logger } from '@kbn/logging';
import {
defer,
last,
merge,
Observable,
of,
OperatorFunction,
share,
switchMap,
toArray,
} from 'rxjs';
interface CallToolOptions extends CallToolTools {
system: string;
messages: Message[];
inferenceClient: InferenceClient;
connectorId: string;
logger: Logger;
}
interface CallToolTools {
tools: Record<string, ToolDefinition>;
toolChoice?: ToolChoice;
}
type CallbackOf<
TCallToolTools extends CallToolTools,
TEmittedMessage extends Message
> = (parameters: {
messages: Message[];
toolCalls: ToolCallsOf<TCallToolTools>['toolCalls'];
}) => Observable<TEmittedMessage>;
type GetNextRequestCallback<TCallToolTools extends CallToolTools> = ({
messages,
system,
}: {
messages: Message[];
system: string;
}) => { system: string; messages: Message[] } & TCallToolTools;
export function callTools<TCallToolOptions extends CallToolOptions>(
{ system, messages, inferenceClient, connectorId, tools, toolChoice, logger }: TCallToolOptions,
callback: CallbackOf<TCallToolOptions, ToolMessage>
): Observable<MessageOf<TCallToolOptions>>;
export function callTools<
TCallToolOptions extends Omit<CallToolOptions, 'tools' | 'toolChoice'> = never,
TCallToolTools extends CallToolTools = never,
TEmittedMessage extends Message = never
>(
options: TCallToolOptions,
getNextRequest: GetNextRequestCallback<TCallToolTools>,
callback: CallbackOf<TCallToolTools, TEmittedMessage>
): Observable<TEmittedMessage>;
export function callTools(
{ system, messages, inferenceClient, connectorId, tools, toolChoice, logger }: CallToolOptions,
...callbacks:
| [GetNextRequestCallback<CallToolTools>, CallbackOf<CallToolOptions, ToolMessage>]
| [CallbackOf<CallToolTools, ToolMessage>]
): Observable<Message> {
const callback = callbacks.length === 2 ? callbacks[1] : callbacks[0];
const getNextRequest =
callbacks.length === 2
? callbacks[0]
: (next: { messages: Message[]; system: string }) => {
return {
...next,
tools,
toolChoice,
};
};
const nextRequest = getNextRequest({ system, messages });
const chatComplete$ = defer(() =>
inferenceClient.chatComplete({
connectorId,
stream: true,
...nextRequest,
})
);
const asCompletedMessages$ = chatComplete$.pipe(
withoutChunkEvents(),
withoutTokenCountEvents(),
switchMap((event) => {
return of({
role: MessageRole.Assistant as const,
content: event.content,
toolCalls: event.toolCalls,
});
})
);
const withToolResponses$ = asCompletedMessages$
.pipe(
switchMap((message) => {
if (message.toolCalls.length) {
return merge(
of(message),
callback({ toolCalls: message.toolCalls, messages: messages.concat(message) })
);
}
return of(message);
})
)
.pipe(handleNext());
return withToolResponses$;
function handleNext(): OperatorFunction<Message, Message> {
return (source$) => {
const shared$ = source$.pipe(share());
const next$ = merge(
shared$,
shared$.pipe(
toArray(),
last(),
switchMap((nextMessages) => {
logger.debug(() =>
JSON.stringify(
nextMessages.map((message) => {
return {
role: message.role,
toolCalls: 'toolCalls' in message ? message.toolCalls : undefined,
toolCallId: 'toolCallId' in message ? message.toolCallId : undefined,
};
})
)
);
if (nextMessages[nextMessages.length - 1].role !== MessageRole.Assistant) {
const options: CallToolOptions = {
system,
connectorId,
inferenceClient,
messages: messages.concat(nextMessages),
tools,
toolChoice,
logger,
};
const after$ = callTools(options, getNextRequest, callback);
return after$;
}
return of();
})
)
);
return next$;
};
}
}
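// A minimal usage sketch (illustrative, not part of the original file): the
// `lookup` tool and its handler are hypothetical. `callTools` resolves each
// round of tool calls via the callback and recurses until the assistant
// replies without tool calls.
function exampleCallToolsUsage({
inferenceClient,
connectorId,
logger,
}: Pick<CallToolOptions, 'inferenceClient' | 'connectorId' | 'logger'>) {
return callTools(
{
system: 'You are a helpful assistant.',
messages: [{ role: MessageRole.User, content: 'Check the status of opbeans-java' }],
inferenceClient,
connectorId,
logger,
tools: {
lookup: {
description: 'Look up the status of a service',
schema: {
type: 'object',
properties: { serviceName: { type: 'string' } },
required: ['serviceName'],
},
},
},
},
({ toolCalls }) =>
// Resolve every tool call with a (hypothetical) tool message.
of(
...toolCalls.map((toolCall) => ({
role: MessageRole.Tool as const,
name: toolCall.function.name,
toolCallId: toolCall.toolCallId,
response: { status: 'healthy' },
}))
)
);
}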

View file

@ -0,0 +1,97 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { encode } from 'gpt-tokenizer';
import { uniqueId } from 'lodash';
interface TextWithId {
id: string;
text: string;
}
interface Parameters {
system: string;
input: string;
tokenLimit: number;
}
interface ChunkedOutputRequest {
input: string;
system: string;
}
export function chunkOutputCalls({}: Parameters & { texts: string[] }): Array<
ChunkedOutputRequest & {
texts: string[];
}
>;
export function chunkOutputCalls({}: Parameters & { texts: TextWithId[] }): Array<
ChunkedOutputRequest & {
texts: TextWithId[];
}
>;
export function chunkOutputCalls({
system,
input,
texts,
tokenLimit,
}: Parameters & {
texts: string[] | TextWithId[];
}) {
const inputAndSystemPromptCount = encode(system).length + encode(input).length;
if (!texts.length) {
return [{ system, input, texts: [] }];
}
const textWithIds = texts.map((text) => {
if (typeof text === 'string') {
return {
id: uniqueId(),
text,
};
}
return text;
});
const textsWithCount = textWithIds.map(({ text, id }) => ({
tokenCount: encode(text).length,
text,
id,
}));
const chunks: Array<{ tokenCount: number; texts: TextWithId[] }> = [];
textsWithCount.forEach(({ text, id, tokenCount }) => {
let chunkWithRoomLeft = chunks.find((chunk) => {
return chunk.tokenCount + tokenCount <= tokenLimit;
});
if (!chunkWithRoomLeft) {
chunkWithRoomLeft = { texts: [], tokenCount: inputAndSystemPromptCount };
chunks.push(chunkWithRoomLeft);
}
chunkWithRoomLeft.texts.push({ text, id });
chunkWithRoomLeft.tokenCount += tokenCount;
});
const hasTextWithIds = texts.some((text) => typeof text !== 'string');
return chunks.map((chunk) => {
const textsForChunk = hasTextWithIds
? chunk.texts
: chunk.texts.map((text) => (typeof text === 'string' ? text : text.text));
return {
system,
input,
texts: textsForChunk,
};
});
}
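// Illustrative usage (not part of the original file): fan an output call out
// over texts that would otherwise exceed the model's context window. The
// texts and token limit below are hypothetical.
function exampleChunkUsage() {
return chunkOutputCalls({
system: 'You analyze log patterns.',
input: 'Summarize the following log patterns.',
texts: ['pattern A ...', 'pattern B ...', 'pattern C ...'],
tokenLimit: 32_000,
});
// Each returned request repeats the same system/input and carries a subset
// of the texts whose combined token count fits within the limit.
}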

View file

@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
export function formatEntity(entity: Record<string, string>) {
return Object.entries(entity)
.map(([field, value]) => `${field}:${value}`)
.join('/');
}
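// Example (illustrative):
// formatEntity({ 'service.name': 'opbeans-java', 'service.environment': 'production' })
// returns 'service.name:opbeans-java/service.environment:production'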

View file

@ -0,0 +1,22 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { MessageRole } from '@kbn/inference-common';
import { RCA_INVESTIGATE_ENTITY_TOOL_NAME } from '@kbn/observability-ai-common/root_cause_analysis';
import { InvestigateEntityToolMessage, RootCauseAnalysisContext } from '../types';
export function getPreviouslyInvestigatedEntities({
events,
}: Pick<RootCauseAnalysisContext, 'events'>) {
const investigationToolResponses = events.filter(
(event): event is InvestigateEntityToolMessage => {
return event.role === MessageRole.Tool && event.name === RCA_INVESTIGATE_ENTITY_TOOL_NAME;
}
);
return investigationToolResponses.map((event) => event.response.entity);
}

View file

@ -0,0 +1,34 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { ScoredKnowledgeBaseEntry } from '../tasks/get_knowledge_base_entries';
import { toBlockquote } from './to_blockquote';
export function serializeKnowledgeBaseEntries(entries: ScoredKnowledgeBaseEntry[]) {
if (!entries.length) {
return `## Knowledge base
No relevant knowledge base entries were found.
`;
}
const serializedEntries = entries
.filter((entry) => entry.score >= 3)
.map(
(entry) => `## Entry \`${entry.id}\` (score: ${entry.score}, ${
entry.truncated ? `truncated` : `not truncated`
})
${toBlockquote(entry.text)}`
);
return `## Knowledge base
The following relevant entries were found in the knowledge base:
${serializedEntries.join('\n\n')}`;
}

View file

@ -0,0 +1,47 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { compact } from 'lodash';
import { MessageRole } from '@kbn/inference-common';
import { RCA_OBSERVE_TOOL_NAME } from '@kbn/observability-ai-common/root_cause_analysis';
import { formatEntity } from './format_entity';
import { toBlockquote } from './to_blockquote';
import { ObservationToolMessage, RootCauseAnalysisContext } from '../types';
export function stringifySummaries({ events }: RootCauseAnalysisContext): string {
const summaries = events
.filter((event): event is ObservationToolMessage => {
return event.role === MessageRole.Tool && event.name === RCA_OBSERVE_TOOL_NAME;
})
.map((event) => event.data);
if (!summaries.length) {
return `# Previous observations
No previous observations`;
}
return `# Previous observations
${summaries
.map((summary, index) => {
const header = `## Observation #${index + 1}`;
const entitiesHeader = summary.investigations.length
? `### Investigated entities
${summary.investigations
.map((investigation) => `- ${formatEntity(investigation.entity)}`)
.join('\n')}`
: undefined;
const summaryBody = `### Summary
${toBlockquote(summary.content)}`;
return compact([header, entitiesHeader, summaryBody]).join('\n\n');
})
.join('\n\n')}`;
}

View file

@ -0,0 +1,13 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
export function toBlockquote(input: string): string {
return input
.split('\n')
.map((line) => `> ${line}`)
.join('\n');
}
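// Example (illustrative): toBlockquote('line one\nline two') returns
// '> line one\n> line two', i.e. a markdown blockquote.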

View file

@ -0,0 +1,124 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { MessageRole, ToolCallsOf } from '@kbn/inference-common';
import { entityQuery } from '@kbn/observability-utils-common/es/queries/entity_query';
import { RCA_INVESTIGATE_ENTITY_TOOL_NAME } from '@kbn/observability-ai-common/root_cause_analysis';
import { isEqual } from 'lodash';
import { getEntitiesByFuzzySearch } from '@kbn/observability-utils-server/entities/get_entities_by_fuzzy_search';
import { RCA_TOOLS } from '../tools';
import {
InvestigateEntityToolMessage,
RootCauseAnalysisContext,
RootCauseAnalysisToolRequest,
} from '../types';
import { formatEntity } from './format_entity';
interface EntityExistsResultExists {
exists: true;
entity: Record<string, string>;
}
interface EntityExistsResultDoesNotExist {
exists: false;
entity: Record<string, string>;
suggestions: string[];
}
type EntityExistsResult = EntityExistsResultExists | EntityExistsResultDoesNotExist;
export async function validateInvestigateEntityToolCalls({
rcaContext,
toolCalls,
}: {
rcaContext: Pick<RootCauseAnalysisContext, 'esClient' | 'indices' | 'start' | 'end' | 'events'>;
toolCalls: RootCauseAnalysisToolRequest[];
}) {
const { events, esClient, indices, start, end } = rcaContext;
const previouslyInvestigatedEntities = events
.filter(
(event): event is InvestigateEntityToolMessage =>
event.role === MessageRole.Tool && event.name === RCA_INVESTIGATE_ENTITY_TOOL_NAME
)
.map((toolResponse) => toolResponse.response.entity);
const investigateEntityToolCalls = toolCalls.filter(
(
toolCall
): toolCall is ToolCallsOf<{
tools: Pick<typeof RCA_TOOLS, typeof RCA_INVESTIGATE_ENTITY_TOOL_NAME>;
}>['toolCalls'][number] => toolCall.function.name === RCA_INVESTIGATE_ENTITY_TOOL_NAME
);
if (!investigateEntityToolCalls.length) {
return [];
}
const entitiesToInvestigate = investigateEntityToolCalls.map((toolCall) => {
const { entity: entityToInvestigate } = toolCall.function.arguments;
return {
[entityToInvestigate.field]: entityToInvestigate.value,
};
});
const entityExistsResponses: EntityExistsResult[] = await Promise.all(
entitiesToInvestigate.map(async (entity) => {
const response = await esClient.search('find_data_for_entity', {
track_total_hits: 1,
size: 0,
timeout: '1ms',
index: indices.logs.concat(indices.traces),
query: {
bool: {
filter: [...entityQuery(entity)],
},
},
});
const exists = response.hits.total.value > 0;
if (!exists) {
return getEntitiesByFuzzySearch({
start,
end,
esClient,
index: indices.logs.concat(indices.traces),
entity,
}).then((suggestions) => {
return {
entity,
exists,
suggestions,
};
});
}
return { entity, exists };
})
);
const alreadyInvestigatedEntities = entitiesToInvestigate.filter((entity) => {
return previouslyInvestigatedEntities.some((prevEntity) => isEqual(entity, prevEntity));
});
const errors = [
...entityExistsResponses
.filter(
(entityExistsResult): entityExistsResult is EntityExistsResultDoesNotExist =>
!entityExistsResult.exists
)
.map(({ suggestions, entity }) => {
return `Entity ${formatEntity(
entity
)} does not exist. Did you mean one of ${suggestions.join(', ')}?`;
}),
...alreadyInvestigatedEntities.map((entity) => {
return `Entity ${formatEntity(entity)} was already investigated before.`;
}),
];
return errors;
}
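// Example return value (illustrative), fed back to the LLM as tool errors:
// [
//   'Entity service.name:opbeans-jva does not exist. Did you mean one of opbeans-java, opbeans-node?',
//   'Entity service.name:opbeans-go was already investigated before.',
// ]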

View file

@ -0,0 +1,29 @@
{
"extends": "../../../../../tsconfig.base.json",
"compilerOptions": {
"outDir": "target/types",
"types": [
"jest",
"node",
"react"
]
},
"include": [
"**/*.ts"
],
"exclude": [
"target/**/*"
],
"kbn_references": [
"@kbn/observability-utils-common",
"@kbn/alerting-plugin",
"@kbn/rule-registry-plugin",
"@kbn/inference-plugin",
"@kbn/logging",
"@kbn/calculate-auto",
"@kbn/observability-ai-assistant-plugin",
"@kbn/inference-common",
"@kbn/observability-ai-common",
"@kbn/observability-utils-server",
]
}

View file

@ -0,0 +1,50 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { castArray, orderBy } from 'lodash';
import Fuse from 'fuse.js';
import { ObservabilityElasticsearchClient } from '../es/client/create_observability_es_client';
export async function getEntitiesByFuzzySearch({
esClient,
entity,
start,
end,
index,
}: {
esClient: ObservabilityElasticsearchClient;
entity: Record<string, string>;
start: number;
end: number;
index: string | string[];
}): Promise<string[]> {
if (Object.keys(entity).length > 1) {
return [];
}
const [field, value] = Object.entries(entity)[0];
const { terms } = await esClient.client.termsEnum({
index: castArray(index).join(','),
field,
index_filter: {
range: {
'@timestamp': {
gte: new Date(start).toISOString(),
lte: new Date(end).toISOString(),
},
},
},
size: 10_000,
});
const results = new Fuse(terms, { includeScore: true, threshold: 0.75 }).search(value);
return orderBy(results, (result) => result.score, 'asc')
.slice(0, 5)
.map((result) => result.item);
}
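// Illustrative usage (not part of the original file): suggest close matches
// when an entity cannot be found. The entity, time range, and indices below
// are hypothetical.
function exampleFuzzySearch(esClient: ObservabilityElasticsearchClient) {
return getEntitiesByFuzzySearch({
esClient,
entity: { 'service.name': 'opbeans-jva' }, // note the typo
start: Date.now() - 60 * 60 * 1000,
end: Date.now(),
index: ['logs-*', 'traces-*'],
});
// resolves to up to five close matches, e.g. ['opbeans-java', ...]
}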

View file

@ -0,0 +1,405 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import {
AggregationsCategorizeTextAggregation,
AggregationsDateHistogramAggregation,
AggregationsMaxAggregation,
AggregationsMinAggregation,
AggregationsTopHitsAggregation,
QueryDslQueryContainer,
} from '@elastic/elasticsearch/lib/api/types';
import { categorizationAnalyzer } from '@kbn/aiops-log-pattern-analysis/categorization_analyzer';
import { ChangePointType } from '@kbn/es-types/src';
import { pValueToLabel } from '@kbn/observability-utils-common/ml/p_value_to_label';
import { calculateAuto } from '@kbn/calculate-auto';
import { omit, orderBy, uniqBy } from 'lodash';
import moment from 'moment';
import { ObservabilityElasticsearchClient } from '../es/client/create_observability_es_client';
import { kqlQuery } from '../es/queries/kql_query';
import { rangeQuery } from '../es/queries/range_query';
interface FieldPatternResultBase {
field: string;
count: number;
pattern: string;
regex: string;
sample: string;
firstOccurrence: string;
lastOccurrence: string;
highlight: Record<string, string[]>;
metadata: Record<string, unknown[]>;
}
interface FieldPatternResultChanges {
timeseries: Array<{ x: number; y: number }>;
change: {
timestamp?: string;
significance: 'high' | 'medium' | 'low' | null;
type: ChangePointType;
change_point?: number;
p_value?: number;
};
}
export type FieldPatternResult<TChanges extends boolean | undefined = undefined> =
FieldPatternResultBase & (TChanges extends true ? FieldPatternResultChanges : {});
export type FieldPatternResultWithChanges = FieldPatternResult<true>;
interface CategorizeTextOptions {
query: QueryDslQueryContainer;
metadata: string[];
esClient: ObservabilityElasticsearchClient;
samplingProbability: number;
fields: string[];
index: string | string[];
useMlStandardTokenizer: boolean;
size: number;
start: number;
end: number;
}
// eslint-disable-next-line @typescript-eslint/consistent-type-definitions
type CategorizeTextSubAggregations = {
sample: { top_hits: AggregationsTopHitsAggregation };
minTimestamp: { min: AggregationsMinAggregation };
maxTimestamp: { max: AggregationsMaxAggregation };
};
interface CategorizeTextAggregationResult {
categorize_text: AggregationsCategorizeTextAggregation;
aggs: CategorizeTextSubAggregations &
(
| {}
| {
timeseries: { date_histogram: AggregationsDateHistogramAggregation };
changes: { change_point: { buckets_path: string } };
}
);
}
export async function runCategorizeTextAggregation<
TChanges extends boolean | undefined = undefined
>(
options: CategorizeTextOptions & { includeChanges?: TChanges }
): Promise<Array<FieldPatternResult<TChanges>>>;
export async function runCategorizeTextAggregation({
esClient,
fields,
metadata,
index,
query,
samplingProbability,
useMlStandardTokenizer,
includeChanges,
size,
start,
end,
}: CategorizeTextOptions & { includeChanges?: boolean }): Promise<
Array<FieldPatternResult<boolean>>
> {
const aggs = Object.fromEntries(
fields.map((field): [string, CategorizeTextAggregationResult] => [
field,
{
categorize_text: {
field,
min_doc_count: 1,
size,
categorization_analyzer: useMlStandardTokenizer
? {
tokenizer: 'ml_standard',
char_filter: [
{
type: 'pattern_replace',
pattern: '\\\\n',
replacement: '',
} as unknown as string,
],
}
: categorizationAnalyzer,
},
aggs: {
minTimestamp: {
min: {
field: '@timestamp',
},
},
maxTimestamp: {
max: {
field: '@timestamp',
},
},
...(includeChanges
? {
timeseries: {
date_histogram: {
field: '@timestamp',
min_doc_count: 0,
extended_bounds: {
min: start,
max: end,
},
fixed_interval: `${calculateAuto
.atLeast(30, moment.duration(end - start, 'ms'))!
.asMilliseconds()}ms`,
},
},
changes: {
change_point: {
buckets_path: 'timeseries>_count',
},
},
}
: {}),
sample: {
top_hits: {
size: 1,
_source: false,
fields: [field, ...metadata],
sort: {
_score: {
order: 'desc',
},
},
highlight: {
fields: {
'*': {},
},
},
},
},
},
},
])
);
const response = await esClient.search('get_log_patterns', {
index,
size: 0,
track_total_hits: false,
query: {
bool: {
filter: [query, ...rangeQuery(start, end)],
},
},
aggregations: {
sampler: {
random_sampler: {
probability: samplingProbability,
},
aggs,
},
},
});
if (!response.aggregations) {
return [];
}
const fieldAggregates = omit(response.aggregations.sampler, 'seed', 'doc_count', 'probability');
return Object.entries(fieldAggregates).flatMap(([fieldName, aggregate]) => {
const buckets = aggregate.buckets;
return buckets.map((bucket) => {
return {
field: fieldName,
count: bucket.doc_count,
pattern: bucket.key,
regex: bucket.regex,
sample: bucket.sample.hits.hits[0].fields![fieldName][0] as string,
highlight: bucket.sample.hits.hits[0].highlight ?? {},
metadata: bucket.sample.hits.hits[0].fields!,
firstOccurrence: new Date(bucket.minTimestamp.value!).toISOString(),
lastOccurrence: new Date(bucket.maxTimestamp.value!).toISOString(),
...('timeseries' in bucket
? {
timeseries: bucket.timeseries.buckets.map((dateBucket) => ({
x: dateBucket.key,
y: dateBucket.doc_count,
})),
change: Object.entries(bucket.changes.type).map(
([changePointType, change]): FieldPatternResultChanges['change'] => {
return {
type: changePointType as ChangePointType,
significance:
change.p_value !== undefined ? pValueToLabel(change.p_value) : null,
change_point: change.change_point,
p_value: change.p_value,
timestamp:
change.change_point !== undefined
? bucket.timeseries.buckets[change.change_point].key_as_string
: undefined,
};
}
)[0],
}
: {}),
};
});
});
}
interface LogPatternOptions {
esClient: ObservabilityElasticsearchClient;
start: number;
end: number;
index: string | string[];
kuery: string;
metadata?: string[];
fields: string[];
}
export async function getLogPatterns<TChanges extends boolean | undefined = undefined>(
options: LogPatternOptions & { includeChanges?: TChanges }
): Promise<Array<FieldPatternResult<TChanges>>>;
export async function getLogPatterns({
esClient,
start,
end,
index,
kuery,
includeChanges,
metadata = [],
fields,
}: LogPatternOptions & { includeChanges?: boolean }): Promise<Array<FieldPatternResult<boolean>>> {
const fieldCapsResponse = await esClient.fieldCaps('get_field_caps_for_log_pattern_analysis', {
fields,
index_filter: {
bool: {
filter: [...rangeQuery(start, end)],
},
},
index,
types: ['text', 'match_only_text'],
});
const fieldsInFieldCaps = Object.keys(fieldCapsResponse.fields);
if (!fieldsInFieldCaps.length) {
return [];
}
const totalDocsResponse = await esClient.search('get_total_docs_for_log_pattern_analysis', {
index,
size: 0,
track_total_hits: true,
query: {
bool: {
filter: [...kqlQuery(kuery), ...rangeQuery(start, end)],
},
},
});
const totalHits = totalDocsResponse.hits.total.value;
if (totalHits === 0) {
return [];
}
let samplingProbability = 100_000 / totalHits;
if (samplingProbability >= 0.5) {
samplingProbability = 1;
}
const fieldGroups = includeChanges
? fieldsInFieldCaps.map((field) => [field])
: [fieldsInFieldCaps];
const allPatterns = await Promise.all(
fieldGroups.map(async (fieldGroup) => {
const topMessagePatterns = await runCategorizeTextAggregation({
esClient,
index,
fields: fieldGroup,
query: {
bool: {
filter: kqlQuery(kuery),
},
},
samplingProbability,
useMlStandardTokenizer: false,
size: 100,
start,
end,
includeChanges,
metadata,
});
if (topMessagePatterns.length === 0) {
return [];
}
const patternsToExclude = topMessagePatterns.filter((pattern) => {
// Elasticsearch will reject the exclusion query if it is too complex. The
// number of capture groups in the pattern's regex serves as a proxy for complexity.
const complexity = pattern.regex.match(/(\.\+\?)|(\.\*\?)/g)?.length ?? 0;
return (
complexity <= 25 &&
// patterns with 50 or fewer messages are re-processed with the ml_standard tokenizer below
pattern.count > 50
);
});
const rareMessagePatterns = await runCategorizeTextAggregation({
esClient,
index,
fields: fieldGroup,
start,
end,
query: {
bool: {
filter: kqlQuery(kuery),
must_not: [
...patternsToExclude.map((pattern) => {
return {
bool: {
filter: [
{
regexp: {
[pattern.field]: {
value: pattern.regex,
},
},
},
{
match: {
[pattern.field]: {
query: pattern.pattern,
fuzziness: 0,
operator: 'and' as const,
auto_generate_synonyms_phrase_query: false,
},
},
},
],
},
};
}),
],
},
},
size: 1000,
includeChanges,
samplingProbability: 1,
useMlStandardTokenizer: true,
metadata,
});
return [...patternsToExclude, ...rareMessagePatterns];
})
);
return uniqBy(
orderBy(allPatterns.flat(), (pattern) => pattern.count, 'desc'),
(pattern) => pattern.sample
);
}
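// Illustrative usage (not part of the original file): surface message
// patterns, including change points, for one service's error logs. The index
// pattern, kuery, and fields are hypothetical.
function exampleGetLogPatterns(
esClient: ObservabilityElasticsearchClient,
start: number,
end: number
) {
return getLogPatterns({
esClient,
index: ['logs-*'],
start,
end,
kuery: 'service.name:"opbeans-java" AND log.level:"error"',
fields: ['message'],
metadata: ['service.name'],
includeChanges: true,
});
}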

View file

@ -24,6 +24,8 @@
"@kbn/alerting-plugin",
"@kbn/rule-registry-plugin",
"@kbn/rule-data-utils",
"@kbn/aiops-log-pattern-analysis",
"@kbn/calculate-auto",
"@kbn/utility-types",
"@kbn/task-manager-plugin",
]

View file

@ -13,6 +13,9 @@ export {
type AssistantMessage,
type ToolMessage,
type UserMessage,
type MessageOf,
type AssistantMessageOf,
type ToolMessageOf,
type ToolSchemaType,
type FromToolSchema,
type ToolSchema,

View file

@ -33,6 +33,9 @@ export {
type AssistantMessage,
type UserMessage,
type ToolMessage,
type AssistantMessageOf,
type MessageOf,
type ToolMessageOf,
} from './messages';
export { type ToolSchema, type ToolSchemaType, type FromToolSchema } from './tool_schema';
export {

View file

@ -5,7 +5,7 @@
* 2.0.
*/
import type { ToolCall } from './tools';
import type { ToolCall, ToolCallsOf, ToolNamesOf, ToolOptions, ToolResponsesOf } from './tools';
/**
* Enum for all possible {@link Message} roles.
@ -52,17 +52,32 @@ export type AssistantMessage = MessageBase<MessageRole.Assistant> & {
/**
* Represents a tool invocation result, following a request from the LLM to execute a tool.
*/
export type ToolMessage<TToolResponse extends Record<string, any> | unknown> =
MessageBase<MessageRole.Tool> & {
/**
* The call id matching the {@link ToolCall} this tool message is for.
*/
toolCallId: string;
/**
* The response from the tool invocation.
*/
response: TToolResponse;
};
export type ToolMessage<
TName extends string = string,
TToolResponse extends Record<string, any> | unknown = Record<string, any> | unknown,
TToolData extends Record<string, any> | undefined = Record<string, any> | undefined
> = MessageBase<MessageRole.Tool> & {
/*
* The name of the tool called. Used for refining the type of the response.
*/
name: TName;
/**
* The call id matching the {@link ToolCall} this tool message is for.
*/
toolCallId: string;
/**
* The response from the tool invocation.
*/
response: TToolResponse;
} & (TToolData extends undefined
? {}
: {
/**
* Additional data from the tool invocation, that is not sent to the LLM
* but can be used to attach baggage (such as timeseries or debug data)
*/
data: TToolData;
});
/**
* Mixin composed of all the possible types of messages in a chatComplete discussion.
@ -72,4 +87,30 @@ export type ToolMessage<TToolResponse extends Record<string, any> | unknown> =
* - {@link AssistantMessage}
* - {@link ToolMessage}
*/
export type Message = UserMessage | AssistantMessage | ToolMessage<unknown>;
export type Message = UserMessage | AssistantMessage | ToolMessage;
/**
* Utility type to get the Assistant message type of a {@link ToolOptions} type.
*/
export type AssistantMessageOf<TToolOptions extends ToolOptions> = Omit<
AssistantMessage,
'toolCalls'
> &
ToolCallsOf<TToolOptions>;
/**
* Utility type to get the Tool message type of a {@link ToolOptions} type.
*/
export type ToolMessageOf<TToolOptions extends ToolOptions> = ToolMessage<
ToolNamesOf<TToolOptions>,
ToolResponsesOf<TToolOptions['tools']>
>;
/**
* Utility type to get the mixin Message type of a {@link ToolOptions} type.
*/
export type MessageOf<TToolOptions extends ToolOptions> =
| UserMessage
| AssistantMessageOf<TToolOptions>
| ToolMessageOf<TToolOptions>;
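// Hypothetical illustration: given a single tool named 'lookup' whose schema
// declares a required string property 'serviceName',
// AssistantMessageOf<{ tools: { lookup: typeof lookupTool } }> narrows the
// assistant message's toolCalls to:
// [{ toolCallId: string, function: { name: 'lookup', arguments: { serviceName: string } } }]
// and MessageOf unions that with the matching user and tool message types.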

View file

@ -8,24 +8,24 @@
import type { ValuesType } from 'utility-types';
import { FromToolSchema, ToolSchema } from './tool_schema';
type Assert<TValue, TType> = TValue extends TType ? TValue & TType : never;
type ToolsOfChoice<TToolOptions extends ToolOptions> = TToolOptions['toolChoice'] extends {
function: infer TToolName;
}
? TToolName extends keyof TToolOptions['tools']
? Pick<TToolOptions['tools'], TToolName>
? TToolName extends string
? Pick<TToolOptions['tools'], TToolName>
: TToolOptions['tools']
: TToolOptions['tools']
: TToolOptions['tools'];
/**
* Utility type to infer the tool calls response shape.
*/
type ToolResponsesOf<TTools extends Record<string, ToolDefinition> | undefined> =
export type ToolResponsesOf<TTools extends Record<string, ToolDefinition> | undefined> =
TTools extends Record<string, ToolDefinition>
? Array<
ValuesType<{
[TName in keyof TTools]: ToolResponseOf<Assert<TName, string>, TTools[TName]>;
[TName in keyof TTools & string]: ToolCall<TName, ToolResponseOf<TTools[TName]>>;
}>
>
: never[];
@ -33,10 +33,11 @@ type ToolResponsesOf<TTools extends Record<string, ToolDefinition> | undefined>
/**
* Utility type to infer the tool call response shape.
*/
type ToolResponseOf<TName extends string, TToolDefinition extends ToolDefinition> = ToolCall<
TName,
TToolDefinition extends { schema: ToolSchema } ? FromToolSchema<TToolDefinition['schema']> : {}
>;
export type ToolResponseOf<TToolDefinition extends ToolDefinition> = TToolDefinition extends {
schema: ToolSchema;
}
? FromToolSchema<TToolDefinition['schema']>
: {};
/**
* Tool invocation choice type.
@ -129,6 +130,10 @@ export interface ToolCall<
name: TName;
} & (TArguments extends Record<string, any> ? { arguments: TArguments } : {});
}
/**
* Utility type to get the tool names of ToolOptions
*/
export type ToolNamesOf<TToolOptions extends ToolOptions> = keyof TToolOptions['tools'] & string;
/**
* Tool-related parameters of {@link ChatCompleteAPI}

View file

@ -96,6 +96,17 @@ export interface OutputOptions<
* Defaults to false.
*/
stream?: TStream;
/**
* Optional configuration for retrying the call if an error occurs.
*/
retry?: {
/**
* Whether to retry on validation errors. Can be a number of retries,
* or a boolean, where true means one retry.
*/
onValidationError?: boolean | number;
};
}
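// Illustrative, non-streaming usage of the retry option (the connector id and
// schema are hypothetical):
// const result = await inferenceClient.output({
//   id: 'extract_service_name',
//   connectorId: 'my-connector',
//   input: 'Extract the service name from: "opbeans-java reported 502s"',
//   schema: {
//     type: 'object',
//     properties: { serviceName: { type: 'string' } },
//     required: ['serviceName'],
//   },
//   retry: { onValidationError: 1 }, // retry once if tool-call arguments fail validation
// });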
/**

View file

@ -3,4 +3,4 @@
"private": true,
"version": "1.0.0",
"license": "Elastic License 2.0"
}
}

View file

@ -0,0 +1,72 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { AggregationsCustomCategorizeTextAnalyzer } from '@elastic/elasticsearch/lib/api/types';
// This is a copy of the default categorization analyzer but using the 'standard' tokenizer rather than the 'ml_standard' tokenizer.
// The 'ml_standard' tokenizer splits tokens in a way that was observed to give better categories in testing many years ago, however,
// the downside of these better categories is then a potential failure to match the original documents when creating a filter for Discover.
// A future enhancement would be to check which analyzer is specified in the mappings for the source field and to use
// that instead of unconditionally using 'standard'.
// However for an initial fix, using the standard analyzer will be more likely to match the results from the majority of searches.
export const categorizationAnalyzer: AggregationsCustomCategorizeTextAnalyzer = {
char_filter: ['first_line_with_letters'],
tokenizer: 'standard',
filter: [
// @ts-expect-error filter type in AggregationsCustomCategorizeTextAnalyzer is incorrect
{
type: 'stop',
stopwords: [
'Monday',
'Tuesday',
'Wednesday',
'Thursday',
'Friday',
'Saturday',
'Sunday',
'Mon',
'Tue',
'Wed',
'Thu',
'Fri',
'Sat',
'Sun',
'January',
'February',
'March',
'April',
'May',
'June',
'July',
'August',
'September',
'October',
'November',
'December',
'Jan',
'Feb',
'Mar',
'Apr',
'May',
'Jun',
'Jul',
'Aug',
'Sep',
'Oct',
'Nov',
'Dec',
'GMT',
'UTC',
],
},
// @ts-expect-error filter type in AggregationsCustomCategorizeTextAnalyzer is incorrect
{
type: 'limit',
max_token_count: '100',
},
],
};

View file

@ -5,16 +5,14 @@
* 2.0.
*/
import type {
QueryDslQueryContainer,
AggregationsCustomCategorizeTextAnalyzer,
} from '@elastic/elasticsearch/lib/api/types';
import type { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types';
import type { MappingRuntimeFields } from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { isPopulatedObject } from '@kbn/ml-is-populated-object/src/is_populated_object';
import type { createRandomSamplerWrapper } from '@kbn/ml-random-sampler-utils';
import { createDefaultQuery } from '@kbn/aiops-common/create_default_query';
import { categorizationAnalyzer } from './categorization_analyzer';
const CATEGORY_LIMIT = 1000;
const EXAMPLE_LIMIT = 4;
@ -121,67 +119,3 @@ export function createCategoryRequest(
},
};
}
// This is a copy of the default categorization analyzer but using the 'standard' tokenizer rather than the 'ml_standard' tokenizer.
// The 'ml_standard' tokenizer splits tokens in a way that was observed to give better categories in testing many years ago, however,
// the downside of these better categories is then a potential failure to match the original documents when creating a filter for Discover.
// A future enhancement would be to check which analyzer is specified in the mappings for the source field and to use
// that instead of unconditionally using 'standard'.
// However for an initial fix, using the standard analyzer will be more likely to match the results from the majority of searches.
const categorizationAnalyzer: AggregationsCustomCategorizeTextAnalyzer = {
char_filter: ['first_line_with_letters'],
tokenizer: 'standard',
filter: [
// @ts-expect-error filter type in AggregationsCustomCategorizeTextAnalyzer is incorrect
{
type: 'stop',
stopwords: [
'Monday',
'Tuesday',
'Wednesday',
'Thursday',
'Friday',
'Saturday',
'Sunday',
'Mon',
'Tue',
'Wed',
'Thu',
'Fri',
'Sat',
'Sun',
'January',
'February',
'March',
'April',
'May',
'June',
'July',
'August',
'September',
'October',
'November',
'December',
'Jan',
'Feb',
'Mar',
'Apr',
'May',
'Jun',
'Jul',
'Aug',
'Sep',
'Oct',
'Nov',
'Dec',
'GMT',
'UTC',
],
},
// @ts-expect-error filter type in AggregationsCustomCategorizeTextAnalyzer is incorrect
{
type: 'limit',
max_token_count: '100',
},
],
};

View file

@ -26099,7 +26099,6 @@
"xpack.investigateApp.addObservationUI.h2.addAnObservationChartLabel": "Ajouter un graphique d'observation",
"xpack.investigateApp.addObservationUI.p.selectADataSourceLabel": "Sélectionnez une source de données pour générer un graphique d'aperçu",
"xpack.investigateApp.appTitle": "Investigations",
"xpack.investigateApp.assistantHypothesis.observabilityAIAssistantContextualInsight.helpMeInvestigateThisLabel": "Aidez-moi à résoudre la cause de cet échec",
"xpack.investigateApp.defaultChart.error_equation.description": "Vérifiez l'équation.",
"xpack.investigateApp.defaultChart.error_equation.title": "Une erreur s'est produite lors de l'affichage du graphique",
"xpack.investigateApp.defaultChart.noData.title": "Aucune donnée graphique disponible",

View file

@ -25957,7 +25957,6 @@
"xpack.investigateApp.addObservationUI.h2.addAnObservationChartLabel": "観測グラフを追加",
"xpack.investigateApp.addObservationUI.p.selectADataSourceLabel": "データソースを選択して、プレビューグラフを生成",
"xpack.investigateApp.appTitle": "調査",
"xpack.investigateApp.assistantHypothesis.observabilityAIAssistantContextualInsight.helpMeInvestigateThisLabel": "このエラーの調査を支援",
"xpack.investigateApp.defaultChart.error_equation.description": "式を確認してください。",
"xpack.investigateApp.defaultChart.error_equation.title": "グラフの表示中にエラーが発生しました",
"xpack.investigateApp.defaultChart.noData.title": "グラフデータがありません",

View file

@ -26040,7 +26040,6 @@
"xpack.investigateApp.addObservationUI.h2.addAnObservationChartLabel": "添加观察图表",
"xpack.investigateApp.addObservationUI.p.selectADataSourceLabel": "选择数据源以生成预览图表",
"xpack.investigateApp.appTitle": "调查",
"xpack.investigateApp.assistantHypothesis.observabilityAIAssistantContextualInsight.helpMeInvestigateThisLabel": "帮助我调查此故障",
"xpack.investigateApp.defaultChart.error_equation.description": "检查方程。",
"xpack.investigateApp.defaultChart.error_equation.title": "渲染图表时出错",
"xpack.investigateApp.defaultChart.noData.title": "没有可用图表数据",

View file

@ -12,6 +12,7 @@ import {
ChatCompletionEventType,
} from '@kbn/inference-common';
import { createOutputApi } from './create_output_api';
import { createToolValidationError } from '../../server/chat_complete/errors';
describe('createOutputApi', () => {
let chatComplete: jest.Mock;
@ -119,4 +120,80 @@ describe('createOutputApi', () => {
},
]);
});
describe('when using retry', () => {
const unvalidatedFailedToolCall = {
function: {
name: 'myFunction',
arguments: JSON.stringify({ foo: 'bar' }),
},
toolCallId: 'foo',
};
const validationError = createToolValidationError('Validation failed', {
toolCalls: [unvalidatedFailedToolCall],
});
it('retries once when onValidationError is a boolean', async () => {
chatComplete.mockRejectedValueOnce(validationError);
chatComplete.mockResolvedValueOnce(
Promise.resolve({ content: 'retried content', toolCalls: [unvalidatedFailedToolCall] })
);
const output = createOutputApi(chatComplete);
const response = await output({
id: 'retry-id',
stream: false,
connectorId: '.retry-connector',
input: 'input message',
retry: {
onValidationError: true,
},
});
expect(chatComplete).toHaveBeenCalledTimes(2);
expect(response).toEqual({
id: 'retry-id',
content: 'retried content',
output: unvalidatedFailedToolCall.function.arguments,
});
});
it('retries the number of specified attempts', async () => {
chatComplete.mockRejectedValue(validationError);
const output = createOutputApi(chatComplete);
await expect(
output({
id: 'retry-id',
stream: false,
connectorId: '.retry-connector',
input: 'input message',
retry: {
onValidationError: 2,
},
})
).rejects.toThrow('Validation failed');
expect(chatComplete).toHaveBeenCalledTimes(3);
});
it('throws an error if retry is provided in streaming mode', () => {
const output = createOutputApi(chatComplete);
expect(() =>
output({
id: 'stream-retry-id',
stream: true,
connectorId: '.stream-retry-connector',
input: 'input message',
retry: {
onValidationError: 1,
},
})
).toThrowError('Retry options are not supported in streaming mode');
});
});
});

View file

@ -10,17 +10,22 @@ import {
ChatCompletionEventType,
MessageRole,
OutputAPI,
OutputCompositeResponse,
OutputEventType,
OutputOptions,
ToolSchema,
isToolValidationError,
withoutTokenCountEvents,
} from '@kbn/inference-common';
import { isObservable, map } from 'rxjs';
import { ensureMultiTurn } from '../utils/ensure_multi_turn';
type DefaultOutputOptions = OutputOptions<string, ToolSchema | undefined, boolean>;
export function createOutputApi(chatCompleteApi: ChatCompleteAPI): OutputAPI;
export function createOutputApi(chatCompleteApi: ChatCompleteAPI) {
return ({
return function callOutputApi({
id,
connectorId,
input,
@ -29,19 +34,26 @@ export function createOutputApi(chatCompleteApi: ChatCompleteAPI) {
previousMessages,
functionCalling,
stream,
}: OutputOptions<string, ToolSchema | undefined, boolean>) => {
retry,
}: DefaultOutputOptions): OutputCompositeResponse<string, ToolSchema | undefined, boolean> {
if (stream && retry !== undefined) {
throw new Error(`Retry options are not supported in streaming mode`);
}
const messages = ensureMultiTurn([
...(previousMessages || []),
{
role: MessageRole.User,
content: input,
},
]);
const response = chatCompleteApi({
connectorId,
stream,
functionCalling,
system,
messages: ensureMultiTurn([
...(previousMessages || []),
{
role: MessageRole.User,
content: input,
},
]),
messages,
...(schema
? {
tools: {
@ -79,16 +91,55 @@ export function createOutputApi(chatCompleteApi: ChatCompleteAPI) {
})
);
} else {
return response.then((chatResponse) => {
return {
id,
content: chatResponse.content,
output:
chatResponse.toolCalls.length && 'arguments' in chatResponse.toolCalls[0].function
? chatResponse.toolCalls[0].function.arguments
: undefined,
};
});
return response.then(
(chatResponse) => {
return {
id,
content: chatResponse.content,
output:
chatResponse.toolCalls.length && 'arguments' in chatResponse.toolCalls[0].function
? chatResponse.toolCalls[0].function.arguments
: undefined,
};
},
(error: Error) => {
if (isToolValidationError(error) && retry?.onValidationError) {
const retriesLeft =
typeof retry.onValidationError === 'number' ? retry.onValidationError : 1;
return callOutputApi({
id,
connectorId,
input,
schema,
system,
previousMessages: messages.concat(
{
role: MessageRole.Assistant as const,
content: '',
toolCalls: error.meta.toolCalls!,
},
...(error.meta.toolCalls?.map((toolCall) => {
return {
name: toolCall.function.name,
role: MessageRole.Tool as const,
toolCallId: toolCall.toolCallId,
response: {
error: error.meta,
},
};
}) ?? [])
),
functionCalling,
stream: false,
retry: {
onValidationError: retriesLeft - 1,
},
}) as OutputCompositeResponse<string, ToolSchema | undefined, false>;
}
throw error;
}
);
}
};
}

View file

@ -170,6 +170,7 @@ describe('bedrockClaudeAdapter', () => {
],
},
{
name: 'my_function',
role: MessageRole.Tool,
toolCallId: '0',
response: {

View file

@ -172,6 +172,7 @@ describe('geminiAdapter', () => {
],
},
{
name: 'my_function',
role: MessageRole.Tool,
toolCallId: '0',
response: {

View file

@ -142,6 +142,7 @@ describe('openAIAdapter', () => {
],
},
{
name: 'my_function',
role: MessageRole.Tool,
toolCallId: '0',
response: {

View file

@@ -58,7 +58,6 @@ export const openAIAdapter: InferenceConnectorAdapter = {
request = {
stream,
messages: messagesToOpenAI({ system: wrapped.system, messages: wrapped.messages }),
temperature: 0,
};
} else {
request = {
@@ -66,7 +65,6 @@
messages: messagesToOpenAI({ system, messages }),
tool_choice: toolChoiceToOpenAI(toolChoice),
tools: toolsToOpenAI(tools),
temperature: 0,
};
}

View file

@@ -5,7 +5,7 @@
* 2.0.
*/
import { last } from 'lodash';
import { last, omit } from 'lodash';
import { defer, switchMap, throwError } from 'rxjs';
import type { Logger } from '@kbn/logging';
import type { KibanaRequest } from '@kbn/core-http-server';
@@ -51,14 +51,26 @@ export function createChatCompleteApi({ request, actions, logger }: CreateChatCo
const connectorType = connector.type;
const inferenceAdapter = getInferenceAdapter(connectorType);
const messagesWithoutData = messages.map((message) => omit(message, 'data'));
if (!inferenceAdapter) {
return throwError(() =>
createInferenceRequestError(`Adapter for type ${connectorType} not implemented`, 400)
);
}
logger.debug(() => `Sending request: ${JSON.stringify(last(messages))}`);
logger.trace(() => JSON.stringify({ messages, toolChoice, tools, system }));
logger.debug(
() => `Sending request, last message is: ${JSON.stringify(last(messagesWithoutData))}`
);
logger.trace(() =>
JSON.stringify({
messages: messagesWithoutData,
toolChoice,
tools,
system,
})
);
return inferenceAdapter.chatComplete({
system,
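The logging change in this hunk is narrow but useful: tool responses can carry a large `data` payload, so it is stripped from every message before the messages are serialized into debug and trace logs. A reduced sketch of the idea, with a simplified `Message` shape standing in for the real type:

```ts
import { omit } from 'lodash';

interface Message {
  role: string;
  content?: string;
  // Potentially large attachment that should never end up in logs.
  data?: Record<string, unknown>;
}

// Strip `data` from each message before serializing for log output.
function toLoggableMessages(messages: Message[]) {
  return messages.map((message) => omit(message, 'data'));
}
```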

View file

@@ -44,7 +44,7 @@ export function createToolValidationError(
name?: string;
arguments?: string;
errorsText?: string;
toolCalls?: UnvalidatedToolCall[];
toolCalls: UnvalidatedToolCall[];
}
): ChatCompletionToolValidationError {
return new InferenceTaskError(ChatCompletionErrorCode.ToolValidationError, message, meta);

View file

@@ -79,7 +79,7 @@ export function wrapWithSimulatedFunctionCalling({
};
}
const convertToolResponseMessage = (message: ToolMessage<unknown>): UserMessage => {
const convertToolResponseMessage = (message: ToolMessage): UserMessage => {
return {
role: MessageRole.User,
content: JSON.stringify({

View file

@@ -183,7 +183,7 @@ describe('chunksIntoMessage', () => {
}
await expect(async () => getMessage()).rejects.toThrowErrorMatchingInlineSnapshot(
`"Tool call arguments for myFunction were invalid"`
`"Tool call arguments for myFunction (001) were invalid"`
);
});

View file

@@ -5,17 +5,17 @@
* 2.0.
*/
import { last, map, merge, OperatorFunction, scan, share } from 'rxjs';
import type { Logger } from '@kbn/logging';
import {
UnvalidatedToolCall,
ToolOptions,
ChatCompletionChunkEvent,
ChatCompletionEventType,
ChatCompletionMessageEvent,
ChatCompletionTokenCountEvent,
ToolOptions,
UnvalidatedToolCall,
withoutTokenCountEvents,
} from '@kbn/inference-common';
import type { Logger } from '@kbn/logging';
import { OperatorFunction, map, merge, share, toArray } from 'rxjs';
import { validateToolCalls } from '../../util/validate_tool_calls';
export function chunksIntoMessage<TToolOptions extends ToolOptions>({
@@ -37,38 +37,36 @@ export function chunksIntoMessage<TToolOptions extends ToolOptions>({
shared$,
shared$.pipe(
withoutTokenCountEvents(),
scan(
(prev, chunk) => {
prev.content += chunk.content ?? '';
toArray(),
map((chunks): ChatCompletionMessageEvent<TToolOptions> => {
const concatenatedChunk = chunks.reduce(
(prev, chunk) => {
prev.content += chunk.content ?? '';
chunk.tool_calls?.forEach((toolCall) => {
let prevToolCall = prev.tool_calls[toolCall.index];
if (!prevToolCall) {
prev.tool_calls[toolCall.index] = {
function: {
name: '',
arguments: '',
},
toolCallId: '',
};
chunk.tool_calls?.forEach((toolCall) => {
let prevToolCall = prev.tool_calls[toolCall.index];
if (!prevToolCall) {
prev.tool_calls[toolCall.index] = {
function: {
name: '',
arguments: '',
},
toolCallId: '',
};
prevToolCall = prev.tool_calls[toolCall.index];
}
prevToolCall = prev.tool_calls[toolCall.index];
}
prevToolCall.function.name += toolCall.function.name;
prevToolCall.function.arguments += toolCall.function.arguments;
prevToolCall.toolCallId += toolCall.toolCallId;
});
prevToolCall.function.name += toolCall.function.name;
prevToolCall.function.arguments += toolCall.function.arguments;
prevToolCall.toolCallId += toolCall.toolCallId;
});
return prev;
},
{ content: '', tool_calls: [] as UnvalidatedToolCall[] }
);
return prev;
},
{
content: '',
tool_calls: [] as UnvalidatedToolCall[],
}
),
last(),
map((concatenatedChunk): ChatCompletionMessageEvent<TToolOptions> => {
logger.debug(() => `Received completed message: ${JSON.stringify(concatenatedChunk)}`);
const validatedToolCalls = validateToolCalls<TToolOptions>({
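The refactor above swaps `scan` + `last` for `toArray` plus a plain reduce: all chunks are collected once the stream completes, then folded into a single message, with partial tool calls stitched together by their `index`. Here is the folding step in isolation, with simplified types (the real operator works over `ChatCompletionChunkEvent`s):

```ts
interface ChunkToolCall {
  index: number;
  toolCallId: string;
  function: { name: string; arguments: string };
}

interface Chunk {
  content?: string;
  tool_calls?: ChunkToolCall[];
}

interface ConcatenatedMessage {
  content: string;
  tool_calls: Array<{ toolCallId: string; function: { name: string; arguments: string } }>;
}

// Fold a completed stream of chunks into one message: text is appended,
// and partial tool calls are merged slot-by-slot via their `index`.
function concatenateChunks(chunks: Chunk[]): ConcatenatedMessage {
  return chunks.reduce<ConcatenatedMessage>(
    (prev, chunk) => {
      prev.content += chunk.content ?? '';
      chunk.tool_calls?.forEach((toolCall) => {
        let prevToolCall = prev.tool_calls[toolCall.index];
        if (!prevToolCall) {
          prevToolCall = prev.tool_calls[toolCall.index] = {
            function: { name: '', arguments: '' },
            toolCallId: '',
          };
        }
        prevToolCall.function.name += toolCall.function.name;
        prevToolCall.function.arguments += toolCall.function.arguments;
        prevToolCall.toolCallId += toolCall.toolCallId;
      });
      return prev;
    },
    { content: '', tool_calls: [] }
  );
}
```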

View file

@@ -76,9 +76,11 @@ const chatCompleteBodySchema: Type<ChatCompleteRequestBody> = schema.object({
name: schema.maybe(schema.string()),
}),
schema.object({
name: schema.string(),
role: schema.literal(MessageRole.Tool),
toolCallId: schema.string(),
response: schema.recordOf(schema.string(), schema.any()),
data: schema.maybe(schema.recordOf(schema.string(), schema.any())),
}),
])
),

View file

@@ -34,6 +34,7 @@ export const generateEsqlTask = <TToolOptions extends ToolOptions>({
docBase,
functionCalling,
logger,
system,
}: {
connectorId: string;
systemMessage: string;
@@ -43,6 +44,7 @@ export const generateEsqlTask = <TToolOptions extends ToolOptions>({
docBase: EsqlDocumentBase;
functionCalling?: FunctionCallingMode;
logger: Pick<Logger, 'debug'>;
system?: string;
}) => {
return function askLlmToRespond({
documentationRequest: { commands, functions },
@@ -97,7 +99,7 @@ export const generateEsqlTask = <TToolOptions extends ToolOptions>({
When converting queries from one language to ES|QL, make sure that the functions are available
and documented in ES|QL. E.g., for SPL's LEN, use LENGTH. For IF, use CASE.
`,
${system ? `## Additional instructions\n\n${system}` : ''}`,
messages: [
...messages,
{
@@ -106,6 +108,7 @@ export const generateEsqlTask = <TToolOptions extends ToolOptions>({
toolCalls: [fakeRequestDocsToolCall],
},
{
name: fakeRequestDocsToolCall.function.name,
role: MessageRole.Tool,
response: {
documentation: requestedDocumentation,

View file

@@ -21,6 +21,7 @@ export function naturalLanguageToEsql<TToolOptions extends ToolOptions>({
toolChoice,
logger,
functionCalling,
system,
...rest
}: NlToEsqlTaskParams<TToolOptions>): Observable<NlToEsqlTaskEvent<TToolOptions>> {
return from(loadDocBase()).pipe(
@@ -41,6 +42,7 @@ export function naturalLanguageToEsql<TToolOptions extends ToolOptions>({
tools,
toolChoice,
},
system,
});
return requestDocumentation({

View file

@@ -29,5 +29,6 @@ export type NlToEsqlTaskParams<TToolOptions extends ToolOptions> = {
connectorId: string;
logger: Pick<Logger, 'debug'>;
functionCalling?: FunctionCallingMode;
system?: string;
} & TToolOptions &
({ input: string } | { messages: Message[] });
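The new optional `system` parameter threaded through `naturalLanguageToEsql` and `generateEsqlTask` above is appended to the task's built-in prompt rather than replacing it, under its own heading. Conceptually, the merge is just:

```ts
// Sketch: caller-provided instructions extend (not override) the task's
// built-in ES|QL guidance, mirroring the template change in this diff.
function buildSystemPrompt(basePrompt: string, system?: string): string {
  return system ? `${basePrompt}\n\n## Additional instructions\n\n${system}` : basePrompt;
}
```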

View file

@@ -108,7 +108,7 @@ describe('validateToolCalls', () => {
});
}
expect(() => validate()).toThrowErrorMatchingInlineSnapshot(
`"Tool call arguments for my_function were invalid"`
`"Tool call arguments for my_function (1) were invalid"`
);
try {
@@ -119,6 +119,15 @@ describe('validateToolCalls', () => {
arguments: JSON.stringify({ foo: 'bar' }),
errorsText: `data must have required property 'bar'`,
name: 'my_function',
toolCalls: [
{
function: {
arguments: JSON.stringify({ foo: 'bar' }),
name: 'my_function',
},
toolCallId: '1',
},
],
});
} else {
fail('Expected toolValidationError');

View file

@@ -54,11 +54,12 @@ export function validateToolCalls<TToolOptions extends ToolOptions>({
if (!valid) {
throw createToolValidationError(
`Tool call arguments for ${toolCall.function.name} were invalid`,
`Tool call arguments for ${toolCall.function.name} (${toolCall.toolCallId}) were invalid`,
{
name: toolCall.function.name,
errorsText: validator.errorsText(),
arguments: toolCall.function.arguments,
toolCalls,
}
);
}

View file

@@ -51,6 +51,9 @@
"@kbn/rule-data-utils",
"@kbn/spaces-plugin",
"@kbn/cloud-plugin",
"@kbn/observability-utils-browser",
"@kbn/observability-utils-server",
"@kbn/observability-utils-common",
"@kbn/storybook",
"@kbn/dashboard-plugin",
"@kbn/deeplinks-analytics",

View file

@@ -17,6 +17,7 @@ import { SearchBar, IUnifiedSearchPluginServices } from '@kbn/unified-search-plu
import { KibanaContextProvider } from '@kbn/kibana-react-plugin/public';
import { merge } from 'lodash';
import { Storage } from '@kbn/kibana-utils-plugin/public';
import { of } from 'rxjs';
import type { EsqlQueryMeta } from '../public/services/esql';
import type { InvestigateAppServices } from '../public/services/types';
import { InvestigateAppKibanaContext } from '../public/hooks/use_kibana';
@@ -54,6 +55,10 @@ export function getMockInvestigateAppContext(): DeeplyMockedKeys<InvestigateAppK
}),
},
charts: {} as any,
investigateAppRepositoryClient: {
fetch: jest.fn().mockImplementation(() => Promise.resolve()),
stream: jest.fn().mockImplementation(() => of()) as any,
},
};
const core = coreMock.createStart();

View file

@@ -2,8 +2,8 @@
"type": "plugin",
"id": "@kbn/investigate-app-plugin",
"owner": "@elastic/obs-ux-management-team",
"group": "observability",
"visibility": "private",
"group": "observability",
"plugin": {
"id": "investigateApp",
"server": true,
@@ -24,14 +24,22 @@
"observability",
"licensing",
"ruleRegistry",
"inference",
"alerting",
"spaces",
"slo",
"apmDataAccess",
"usageCollection"
],
"optionalPlugins": [
"observabilityAIAssistant",
"observabilityAIAssistantApp"
],
"requiredBundles": [
"esql",
"kibanaReact",
"kibanaUtils"
],
"optionalPlugins": ["observabilityAIAssistant"],
"extraPublicDirs": []
}
}

View file

@@ -11,7 +11,7 @@ import type {
ReturnOf,
RouteRepositoryClient,
} from '@kbn/server-route-repository';
import { formatRequest } from '@kbn/server-route-repository-utils/src/format_request';
import { createRepositoryClient } from '@kbn/server-route-repository-client';
import type { InvestigateAppServerRouteRepository } from '../../server';
type FetchOptions = Omit<HttpFetchOptions, 'body'> & {
@@ -25,15 +25,15 @@ export type InvestigateAppAPIClientOptions = Omit<
signal: AbortSignal | null;
};
export type InvestigateAppAPIClient = RouteRepositoryClient<
export type InvestigateAppRepositoryClient = RouteRepositoryClient<
InvestigateAppServerRouteRepository,
InvestigateAppAPIClientOptions
>['fetch'];
>;
export type AutoAbortedInvestigateAppAPIClient = RouteRepositoryClient<
export type AutoAbortedInvestigateAppRepositoryClient = RouteRepositoryClient<
InvestigateAppServerRouteRepository,
Omit<InvestigateAppAPIClientOptions, 'signal'>
>['fetch'];
>;
export type InvestigateAppAPIEndpoint = keyof InvestigateAppServerRouteRepository;
@@ -45,19 +45,6 @@ export type APIReturnType<TEndpoint extends InvestigateAppAPIEndpoint> = ReturnO
export type InvestigateAppAPIClientRequestParamsOf<TEndpoint extends InvestigateAppAPIEndpoint> =
ClientRequestParamsOf<InvestigateAppServerRouteRepository, TEndpoint>;
export function createCallInvestigateAppAPI(core: CoreStart | CoreSetup) {
return ((endpoint, options) => {
const { params } = options as unknown as {
params?: Partial<Record<string, any>>;
};
const { method, pathname, version } = formatRequest(endpoint, params?.path);
return core.http[method](pathname, {
...options,
body: params && params.body ? JSON.stringify(params.body) : undefined,
query: params?.query,
version,
});
}) as InvestigateAppAPIClient;
export function createInvestigateAppRepositoryClient(core: CoreStart | CoreSetup) {
return createRepositoryClient(core) as InvestigateAppRepositoryClient;
}
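With the hand-rolled fetch wrapper gone, the typed client from `@kbn/server-route-repository-client` exposes both a promise-based `fetch` and an observable-returning `stream`, which is what the root cause analysis UI below relies on. A hedged usage sketch — the endpoint string matches the route added in this PR, but the body values are purely illustrative:

```ts
const client = createInvestigateAppRepositoryClient(coreStart);
const controller = new AbortController();

// `stream` returns an observable of server-sent events, typed against the
// server route repository, so endpoint string and params are checked.
client
  .stream('POST /internal/observability/investigation/root_cause_analysis', {
    params: {
      body: {
        investigationId: 'my-investigation', // illustrative values
        connectorId: 'my-connector',
        context: '...',
        rangeFrom: 'now-15m',
        rangeTo: 'now',
        serviceName: 'my-service',
      },
    },
    signal: controller.signal,
  })
  .subscribe((event) => {
    // Each emission wraps a RootCauseAnalysisEvent.
  });
```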

View file

@@ -4,19 +4,22 @@
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import dedent from 'dedent';
import {
ALERT_RULE_PARAMETERS,
ALERT_START,
ALERT_RULE_CATEGORY,
ALERT_REASON,
} from '@kbn/rule-data-utils';
import { i18n } from '@kbn/i18n';
import { EntityWithSource } from '@kbn/investigation-shared';
import React, { useCallback } from 'react';
import type { RootCauseAnalysisEvent } from '@kbn/observability-ai-server/root_cause_analysis';
import { EcsFieldsResponse } from '@kbn/rule-registry-plugin/common';
import React, { useState, useRef, useEffect } from 'react';
import { omit } from 'lodash';
import {
ALERT_FLAPPING_HISTORY,
ALERT_RULE_EXECUTION_TIMESTAMP,
ALERT_RULE_EXECUTION_UUID,
EVENT_ACTION,
EVENT_KIND,
} from '@kbn/rule-registry-plugin/common/technical_rule_data_field_names';
import { isRequestAbortedError } from '@kbn/server-route-repository-client';
import { useKibana } from '../../../../hooks/use_kibana';
import { useInvestigation } from '../../contexts/investigation_context';
import { useFetchEntities } from '../../../../hooks/use_fetch_entities';
import { useUpdateInvestigation } from '../../../../hooks/use_update_investigation';
export interface InvestigationContextualInsight {
key: string;
@@ -25,98 +28,177 @@ }
}
export function AssistantHypothesis({ investigationId }: { investigationId: string }) {
const { alert } = useInvestigation();
const {
alert,
globalParams: { timeRange },
investigation,
} = useInvestigation();
const {
core: { notifications },
services: { investigateAppRepositoryClient },
dependencies: {
start: {
observabilityAIAssistant: {
ObservabilityAIAssistantContextualInsight,
getContextualInsightMessages,
},
observabilityAIAssistant: { useGenAIConnectors },
observabilityAIAssistantApp: { RootCauseAnalysisContainer },
},
},
} = useKibana();
const { data: entitiesData } = useFetchEntities({
investigationId,
serviceName: alert?.['service.name'] ? `${alert?.['service.name']}` : undefined,
serviceEnvironment: alert?.['service.environment']
? `${alert?.['service.environment']}`
: undefined,
hostName: alert?.['host.name'] ? `${alert?.['host.name']}` : undefined,
containerId: alert?.['container.id'] ? `${alert?.['container.id']}` : undefined,
});
const getAlertContextMessages = useCallback(async () => {
if (!getContextualInsightMessages || !alert) {
return [];
const { mutateAsync: updateInvestigation } = useUpdateInvestigation();
const { loading: loadingConnector, selectedConnector } = useGenAIConnectors();
const serviceName = alert?.['service.name'] as string | undefined;
const [events, setEvents] = useState<RootCauseAnalysisEvent[]>([]);
const [loading, setLoading] = useState(false);
const [error, setError] = useState<Error | undefined>(undefined);
const controllerRef = useRef(new AbortController());
useEffect(() => {
if (investigation?.rootCauseAnalysis) {
setEvents(investigation.rootCauseAnalysis.events);
}
}, [investigation?.rootCauseAnalysis]);
const entities = entitiesData?.entities ?? [];
const [completeInBackground, setCompleteInBackground] = useState(true);
const entityContext = entities?.length
? `
Alerts can optionally be associated with entities. Entities can be services, hosts, containers, or other resources. Entities can have metrics associated with them.
The alert that triggered this investigation is associated with the following entities: ${entities
.map((entity, index) => {
return dedent(`
## Entity ${index + 1}:
${formatEntityMetrics(entity)};
`);
})
.join('/n/n')}`
: '';
const runRootCauseAnalysis = ({
alert: nonNullishAlert,
connectorId,
serviceName: nonNullishServiceName,
}: {
alert: EcsFieldsResponse;
connectorId: string;
serviceName: string;
}) => {
const rangeFrom = timeRange.from;
return getContextualInsightMessages({
message: `I am investigating a failure in my system. I was made aware of the failure by an alert and I am trying to understand the root cause of the issue.`,
instructions: dedent(
`I'm an SRE. I am investigating a failure in my system. I was made aware of the failure via an alert. Your current task is to help me identify the root cause of the failure in my system.
const rangeTo = timeRange.to;
The rule that triggered the alert is a ${
alert[ALERT_RULE_CATEGORY]
} rule. The alert started at ${alert[ALERT_START]}. The alert reason is ${
alert[ALERT_REASON]
}. The rule parameters are ${JSON.stringify(ALERT_RULE_PARAMETERS)}.
setLoading(true);
${entityContext}
setError(undefined);
Based on the alert details, suggest a root cause and next steps to mitigate the issue.
I do not have the alert details or entity details in front of me, so be sure to repeat the alert reason (${
alert[ALERT_REASON]
}), when the alert was triggered (${
alert[ALERT_START]
}), and the entity metrics in your response.
setEvents([]);
When displaying the entity metrics, please convert the metrics to a human-readable format. For example, convert "logRate" to "Log Rate" and "errorRate" to "Error Rate".
`
),
});
}, [alert, getContextualInsightMessages, entitiesData?.entities]);
investigateAppRepositoryClient
.stream('POST /internal/observability/investigation/root_cause_analysis', {
params: {
body: {
investigationId,
connectorId,
context: `The user is investigating an alert for the ${serviceName} service,
and wants to find the root cause. Here is the alert:
if (!ObservabilityAIAssistantContextualInsight) {
${JSON.stringify(sanitizeAlert(nonNullishAlert))}`,
rangeFrom,
rangeTo,
serviceName: nonNullishServiceName,
completeInBackground,
},
},
signal: controllerRef.current.signal,
})
.subscribe({
next: (event) => {
setEvents((prev) => {
return prev.concat(event.event);
});
},
error: (nextError) => {
if (!isRequestAbortedError(nextError)) {
notifications.toasts.addError(nextError, {
title: i18n.translate(
'xpack.investigateApp.assistantHypothesis.failedToLoadAnalysis',
{
defaultMessage: `Failed to load analysis`,
}
),
});
setError(nextError);
} else {
setError(
new Error(
i18n.translate('xpack.investigateApp.assistantHypothesis.analysisAborted', {
defaultMessage: `Analysis was aborted`,
})
)
);
}
setLoading(false);
},
complete: () => {
setLoading(false);
},
});
};
if (!serviceName) {
return null;
}
return alert && entitiesData ? (
<ObservabilityAIAssistantContextualInsight
title={i18n.translate(
'xpack.investigateApp.assistantHypothesis.observabilityAIAssistantContextualInsight.helpMeInvestigateThisLabel',
{ defaultMessage: 'Help me investigate this failure' }
)}
messages={getAlertContextMessages}
return (
<RootCauseAnalysisContainer
events={events}
loading={loading || loadingConnector}
completeInBackground={completeInBackground}
onCompleteInBackgroundClick={() => {
setCompleteInBackground(() => !completeInBackground);
}}
onStopAnalysisClick={() => {
controllerRef.current.abort();
controllerRef.current = new AbortController();
}}
onClearAnalysisClick={() => {
setEvents([]);
if (investigation?.rootCauseAnalysis) {
updateInvestigation({
investigationId,
payload: {
rootCauseAnalysis: {
events: [],
},
},
});
}
}}
onResetAnalysisClick={() => {
controllerRef.current.abort();
controllerRef.current = new AbortController();
if (alert && selectedConnector && serviceName) {
runRootCauseAnalysis({
alert,
connectorId: selectedConnector,
serviceName,
});
}
}}
error={error}
onStartAnalysisClick={() => {
if (alert && selectedConnector && serviceName) {
runRootCauseAnalysis({
alert,
connectorId: selectedConnector,
serviceName,
});
}
}}
/>
) : null;
);
}
function sanitizeAlert(alert: EcsFieldsResponse) {
return omit(
alert,
ALERT_RULE_EXECUTION_TIMESTAMP,
'_index',
ALERT_FLAPPING_HISTORY,
EVENT_ACTION,
EVENT_KIND,
ALERT_RULE_EXECUTION_UUID,
'@timestamp'
);
}
const formatEntityMetrics = (entity: EntityWithSource): string => {
const entityMetrics = Object.entries(entity.metrics)
.map(([key, value]) => `${key}: ${value}`)
.join(', ');
const entitySources = entity.sources.map((source) => source.dataStream).join(', ');
return dedent(`
Entity name: ${entity.display_name};
Entity type: ${entity.type};
Entity metrics: ${entityMetrics};
Entity data streams: ${entitySources}
`);
};
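One small pattern in the component above is worth calling out: stopping or resetting the analysis calls `abort()` and then immediately replaces the controller, so the next run does not reuse an already-aborted signal. Reduced to its essence, outside React:

```ts
// Sketch: a restartable abort handle. abort() poisons a controller forever,
// so a fresh one must be swapped in before the next request starts.
const controllerRef = { current: new AbortController() };

function stopAndReset(): void {
  controllerRef.current.abort();
  controllerRef.current = new AbortController();
}

function currentSignal(): AbortSignal {
  return controllerRef.current.signal;
}
```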

View file

@@ -27,6 +27,7 @@ import type {
InvestigateAppSetupDependencies,
InvestigateAppStartDependencies,
} from './types';
import { createInvestigateAppRepositoryClient, InvestigateAppRepositoryClient } from './api';
const getCreateEsqlService = once(() => import('./services/esql').then((m) => m.createEsqlService));
@@ -41,6 +42,7 @@ export class InvestigateAppPlugin
{
logger: Logger;
config: ConfigSchema;
repositoryClient!: InvestigateAppRepositoryClient;
constructor(context: PluginInitializerContext<ConfigSchema>) {
this.logger = context.logger.get();
@@ -51,6 +53,8 @@ export class InvestigateAppPlugin
coreSetup: CoreSetup<InvestigateAppStartDependencies, InvestigateAppPublicStart>,
pluginsSetup: InvestigateAppSetupDependencies
): InvestigateAppPublicSetup {
this.repositoryClient = createInvestigateAppRepositoryClient(coreSetup);
coreSetup.application.register({
id: INVESTIGATE_APP_ID,
title: i18n.translate('xpack.investigateApp.appTitle', {
@@ -93,6 +97,7 @@ export class InvestigateAppPlugin
lens: pluginsStart.lens,
}),
charts: pluginsStart.charts,
investigateAppRepositoryClient: this.repositoryClient,
};
ReactDOM.render(
@@ -127,6 +132,7 @@ export class InvestigateAppPlugin
start: pluginsStart,
},
services: {
investigateAppRepositoryClient: this.repositoryClient,
esql: createEsqlService({
data: pluginsStart.data,
dataViews: pluginsStart.dataViews,

View file

@@ -7,8 +7,10 @@
import { ChartsPluginStart } from '@kbn/charts-plugin/public';
import type { EsqlService } from './esql';
import type { InvestigateAppRepositoryClient } from '../api';
export interface InvestigateAppServices {
esql: EsqlService;
charts: ChartsPluginStart;
investigateAppRepositoryClient: InvestigateAppRepositoryClient;
}

View file

@@ -8,6 +8,10 @@ import type {
ObservabilityAIAssistantPublicSetup,
ObservabilityAIAssistantPublicStart,
} from '@kbn/observability-ai-assistant-plugin/public';
import type {
ObservabilityAIAssistantAppPublicSetup,
ObservabilityAIAssistantAppPublicStart,
} from '@kbn/observability-ai-assistant-app-plugin/public';
import { ChartsPluginStart } from '@kbn/charts-plugin/public';
import type { ContentManagementPublicStart } from '@kbn/content-management-plugin/public';
import type { DataPublicPluginSetup, DataPublicPluginStart } from '@kbn/data-plugin/public';
@@ -43,6 +47,7 @@ export interface InvestigateAppSetupDependencies {
investigate: InvestigatePublicSetup;
observabilityShared: ObservabilitySharedPluginSetup;
observabilityAIAssistant: ObservabilityAIAssistantPublicSetup;
observabilityAIAssistantApp: ObservabilityAIAssistantAppPublicSetup;
lens: LensPublicSetup;
dataViews: DataViewsPublicPluginSetup;
data: DataPublicPluginSetup;
@@ -58,6 +63,7 @@ export interface InvestigateAppStartDependencies {
investigate: InvestigatePublicStart;
observabilityShared: ObservabilitySharedPluginStart;
observabilityAIAssistant: ObservabilityAIAssistantPublicStart;
observabilityAIAssistantApp: ObservabilityAIAssistantAppPublicStart;
lens: LensPublicStart;
dataViews: DataViewsPublicPluginStart;
data: DataPublicPluginStart;

View file

@@ -15,18 +15,19 @@ import {
findInvestigationsParamsSchema,
getAllInvestigationStatsParamsSchema,
getAllInvestigationTagsParamsSchema,
getEntitiesParamsSchema,
GetEntitiesResponse,
getEventsParamsSchema,
GetEventsResponse,
getInvestigationItemsParamsSchema,
getInvestigationNotesParamsSchema,
getInvestigationParamsSchema,
updateInvestigationItemParamsSchema,
updateInvestigationNoteParamsSchema,
updateInvestigationParamsSchema,
getEventsParamsSchema,
GetEventsResponse,
getEntitiesParamsSchema,
GetEntitiesResponse,
} from '@kbn/investigation-shared';
import { ScopedAnnotationsClient } from '@kbn/observability-plugin/server';
import { createEntitiesESClient } from '../clients/create_entities_es_client';
import { createInvestigation } from '../services/create_investigation';
import { createInvestigationItem } from '../services/create_investigation_item';
import { createInvestigationNote } from '../services/create_investigation_note';
@@ -34,20 +35,20 @@ import { deleteInvestigation } from '../services/delete_investigation';
import { deleteInvestigationItem } from '../services/delete_investigation_item';
import { deleteInvestigationNote } from '../services/delete_investigation_note';
import { findInvestigations } from '../services/find_investigations';
import { AlertsClient, getAlertsClient } from '../services/get_alerts_client';
import { getAllInvestigationStats } from '../services/get_all_investigation_stats';
import { getAllInvestigationTags } from '../services/get_all_investigation_tags';
import { getEntitiesWithSource } from '../services/get_entities';
import { getAlertEvents, getAnnotationEvents } from '../services/get_events';
import { getInvestigation } from '../services/get_investigation';
import { getInvestigationItems } from '../services/get_investigation_items';
import { getInvestigationNotes } from '../services/get_investigation_notes';
import { investigationRepositoryFactory } from '../services/investigation_repository';
import { updateInvestigation } from '../services/update_investigation';
import { getAlertEvents, getAnnotationEvents } from '../services/get_events';
import { AlertsClient, getAlertsClient } from '../services/get_alerts_client';
import { updateInvestigationItem } from '../services/update_investigation_item';
import { updateInvestigationNote } from '../services/update_investigation_note';
import { createInvestigateAppServerRoute } from './create_investigate_app_server_route';
import { getAllInvestigationStats } from '../services/get_all_investigation_stats';
import { getEntitiesWithSource } from '../services/get_entities';
import { createEntitiesESClient } from '../clients/create_entities_es_client';
import { rootCauseAnalysisRoute } from './rca/route';
const createInvestigationRoute = createInvestigateAppServerRoute({
endpoint: 'POST /api/observability/investigations 2023-10-31',
@@ -400,6 +401,7 @@ export function getGlobalInvestigateAppServerRouteRepository() {
...getEntitiesRoute,
...getAllInvestigationStatsRoute,
...getAllInvestigationTagsRoute,
...rootCauseAnalysisRoute,
};
}

View file

@@ -0,0 +1,163 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { Observable, catchError, from, of, share, switchMap, toArray } from 'rxjs';
import { ServerSentEventBase } from '@kbn/sse-utils';
import {
RootCauseAnalysisEvent,
runRootCauseAnalysis,
} from '@kbn/observability-ai-server/root_cause_analysis';
import { z } from '@kbn/zod';
import datemath from '@elastic/datemath';
import { OBSERVABILITY_LOGS_DATA_ACCESS_LOG_SOURCES_ID } from '@kbn/management-settings-ids';
import { createObservabilityEsClient } from '@kbn/observability-utils-server/es/client/create_observability_es_client';
import { preconditionFailed } from '@hapi/boom';
import { createInvestigateAppServerRoute } from '../create_investigate_app_server_route';
import { investigationRepositoryFactory } from '../../services/investigation_repository';
export const rootCauseAnalysisRoute = createInvestigateAppServerRoute({
endpoint: 'POST /internal/observability/investigation/root_cause_analysis',
options: {
tags: [],
},
params: z.object({
body: z.object({
investigationId: z.string(),
rangeFrom: z.string(),
rangeTo: z.string(),
serviceName: z.string(),
context: z.string(),
connectorId: z.string(),
completeInBackground: z.boolean().optional(),
}),
}),
handler: async ({
params,
plugins,
request,
context: requestContext,
logger,
}): Promise<Observable<ServerSentEventBase<'event', { event: RootCauseAnalysisEvent }>>> => {
const {
body: {
investigationId,
context,
rangeFrom,
rangeTo,
serviceName,
connectorId,
completeInBackground,
},
} = params;
if (!plugins.observabilityAIAssistant) {
throw preconditionFailed('Observability AI Assistant plugin is not available');
}
const start = datemath.parse(rangeFrom)?.valueOf()!;
const end = datemath.parse(rangeTo)?.valueOf()!;
const coreContext = await requestContext.core;
const coreEsClient = coreContext.elasticsearch.client.asCurrentUser;
const soClient = coreContext.savedObjects.client;
const uiSettingsClient = coreContext.uiSettings.client;
const repository = investigationRepositoryFactory({ soClient, logger });
const esClient = createObservabilityEsClient({
client: coreEsClient,
logger,
plugin: 'investigateApp',
});
const [
investigation,
rulesClient,
alertsClient,
inferenceClient,
observabilityAIAssistantClient,
spaceId = 'default',
apmIndices,
logSources,
sloSummaryIndices,
] = await Promise.all([
repository.findById(investigationId),
(await plugins.alerting.start()).getRulesClientWithRequest(request),
(await plugins.ruleRegistry.start()).getRacClientWithRequest(request),
(await plugins.inference.start()).getClient({ request }),
plugins
.observabilityAIAssistant!.start()
.then((observabilityAIAssistantStart) =>
observabilityAIAssistantStart.service.getClient({ request, scopes: ['observability'] })
),
(await plugins.spaces?.start())?.spacesService.getSpaceId(request),
plugins.apmDataAccess.setup.getApmIndices(soClient),
uiSettingsClient.get(OBSERVABILITY_LOGS_DATA_ACCESS_LOG_SOURCES_ID) as Promise<string[]>,
(await plugins.slo.start()).getSloClientWithRequest(request).getSummaryIndices(),
]);
const next$ = runRootCauseAnalysis({
alertsClient,
connectorId,
start,
end,
esClient,
inferenceClient,
indices: {
logs: logSources,
traces: [apmIndices.span, apmIndices.error, apmIndices.transaction],
sloSummaries: sloSummaryIndices,
},
rulesClient,
observabilityAIAssistantClient,
serviceName,
spaceId,
context,
logger,
}).pipe(
switchMap((event) => {
return of({
type: 'event' as const,
event,
});
})
);
if (completeInBackground) {
const shared$ = next$.pipe(share());
shared$
.pipe(
toArray(),
catchError(() => {
return of();
}),
switchMap((events) => {
return from(
repository.save({
...investigation,
rootCauseAnalysis: {
events: events.map(({ event }) => event),
},
})
);
})
)
.subscribe({
error: (error) => {
logger.error(`Failed to update investigation: ${error.message}`);
logger.error(error);
},
});
return shared$;
}
return next$;
},
});
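The `completeInBackground` branch is the interesting part of this route: `share()` multiplexes a single analysis run to two consumers, the SSE response and a background subscriber that buffers every event with `toArray()` and persists the full list onto the investigation once the stream completes. The shape of the pattern, with a stand-in `persist` callback in place of the repository save:

```ts
import { Observable, catchError, from, of, share, switchMap, toArray } from 'rxjs';

// Fan one event stream out to (1) the live response and (2) a background
// collector that saves the complete event list when the run finishes.
function withBackgroundPersistence<T>(
  events$: Observable<T>,
  persist: (events: T[]) => Promise<unknown>,
  onError: (error: Error) => void
): Observable<T> {
  const shared$ = events$.pipe(share());

  shared$
    .pipe(
      toArray(),
      // On error, emit nothing: persistence is skipped, while the live
      // subscriber still sees the error through its own subscription.
      catchError(() => of()),
      switchMap((events) => from(persist(events)))
    )
    .subscribe({ error: onError });

  return shared$;
}
```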

View file

@@ -5,11 +5,23 @@
* 2.0.
*/
import { ObservabilityPluginSetup } from '@kbn/observability-plugin/server';
import {
import type { ObservabilityPluginSetup } from '@kbn/observability-plugin/server';
import type {
RuleRegistryPluginSetupContract,
RuleRegistryPluginStartContract,
} from '@kbn/rule-registry-plugin/server';
import type { AlertingServerSetup, AlertingServerStart } from '@kbn/alerting-plugin/server/plugin';
import type { SLOServerStart, SLOServerSetup } from '@kbn/slo-plugin/server';
import type { InferenceServerStart, InferenceServerSetup } from '@kbn/inference-plugin/server';
import type { SpacesPluginSetup, SpacesPluginStart } from '@kbn/spaces-plugin/server';
import type {
ApmDataAccessPluginStart,
ApmDataAccessPluginSetup,
} from '@kbn/apm-data-access-plugin/server';
import type {
ObservabilityAIAssistantServerStart,
ObservabilityAIAssistantServerSetup,
} from '@kbn/observability-ai-assistant-plugin/server';
import { UsageCollectionSetup } from '@kbn/usage-collection-plugin/server';
/* eslint-disable @typescript-eslint/no-empty-interface*/
@@ -19,11 +31,23 @@ export interface ConfigSchema {}
export interface InvestigateAppSetupDependencies {
observability: ObservabilityPluginSetup;
ruleRegistry: RuleRegistryPluginSetupContract;
slo: SLOServerSetup;
alerting: AlertingServerSetup;
inference: InferenceServerSetup;
spaces?: SpacesPluginSetup;
apmDataAccess: ApmDataAccessPluginSetup;
observabilityAIAssistant?: ObservabilityAIAssistantServerSetup;
usageCollection: UsageCollectionSetup;
}
export interface InvestigateAppStartDependencies {
ruleRegistry: RuleRegistryPluginStartContract;
slo: SLOServerStart;
alerting: AlertingServerStart;
inference: InferenceServerStart;
spaces?: SpacesPluginStart;
apmDataAccess: ApmDataAccessPluginStart;
observabilityAIAssistant?: ObservabilityAIAssistantServerStart;
}
export interface InvestigateAppServerSetup {}

View file

@@ -17,57 +17,67 @@
".storybook/**/*.js"
],
"kbn_references": [
"@kbn/esql",
"@kbn/core",
"@kbn/data-views-plugin",
"@kbn/expressions-plugin",
"@kbn/kibana-utils-plugin",
"@kbn/utility-types-jest",
"@kbn/es-types",
"@kbn/data-plugin",
"@kbn/embeddable-plugin",
"@kbn/unified-search-plugin",
"@kbn/kibana-react-plugin",
"@kbn/server-route-repository",
"@kbn/server-route-repository-client",
"@kbn/react-kibana-context-theme",
"@kbn/shared-ux-link-redirect-app",
"@kbn/kibana-react-plugin",
"@kbn/i18n",
"@kbn/embeddable-plugin",
"@kbn/observability-ai-assistant-plugin",
"@kbn/lens-plugin",
"@kbn/esql",
"@kbn/esql-utils",
"@kbn/data-plugin",
"@kbn/es-types",
"@kbn/field-types",
"@kbn/expressions-plugin",
"@kbn/deeplinks-observability",
"@kbn/logging",
"@kbn/data-views-plugin",
"@kbn/observability-shared-plugin",
"@kbn/config-schema",
"@kbn/investigate-plugin",
"@kbn/dataset-quality-plugin",
"@kbn/utility-types-jest",
"@kbn/content-management-plugin",
"@kbn/kibana-utils-plugin",
"@kbn/visualization-utils",
"@kbn/unified-search-plugin",
"@kbn/es-query",
"@kbn/server-route-repository",
"@kbn/security-plugin",
"@kbn/ui-actions-plugin",
"@kbn/server-route-repository-utils",
"@kbn/core-saved-objects-server",
"@kbn/rule-registry-plugin",
"@kbn/shared-ux-router",
"@kbn/i18n",
"@kbn/investigation-shared",
"@kbn/core-security-common",
"@kbn/saved-objects-finder-plugin",
"@kbn/presentation-containers",
"@kbn/lens-plugin",
"@kbn/rule-registry-plugin",
"@kbn/security-plugin",
"@kbn/rule-data-utils",
"@kbn/investigate-plugin",
"@kbn/observability-utils-browser",
"@kbn/lens-embeddable-utils",
"@kbn/i18n-react",
"@kbn/zod",
"@kbn/observability-plugin",
"@kbn/licensing-plugin",
"@kbn/rule-data-utils",
"@kbn/es-query",
"@kbn/saved-objects-finder-plugin",
"@kbn/presentation-containers",
"@kbn/observability-ai-server",
"@kbn/charts-plugin",
"@kbn/observability-shared-plugin",
"@kbn/core-security-common",
"@kbn/deeplinks-observability",
"@kbn/logging",
"@kbn/esql-utils",
"@kbn/observability-ai-assistant-plugin",
"@kbn/observability-ai-assistant-app-plugin",
"@kbn/content-management-plugin",
"@kbn/dataset-quality-plugin",
"@kbn/ui-actions-plugin",
"@kbn/field-types",
"@kbn/entities-schema",
"@kbn/core-elasticsearch-server",
"@kbn/observability-plugin",
"@kbn/config-schema",
"@kbn/visualization-utils",
"@kbn/usage-collection-plugin",
"@kbn/calculate-auto",
"@kbn/ml-random-sampler-utils",
"@kbn/charts-plugin",
"@kbn/observability-utils-browser",
"@kbn/usage-collection-plugin",
"@kbn/zod",
"@kbn/inference-common",
"@kbn/core-elasticsearch-server",
"@kbn/sse-utils",
"@kbn/management-settings-ids",
"@kbn/observability-utils-server",
"@kbn/licensing-plugin",
"@kbn/core-saved-objects-server",
"@kbn/alerting-plugin",
"@kbn/slo-plugin",
"@kbn/inference-plugin",
"@kbn/spaces-plugin",
"@kbn/apm-data-access-plugin",
],
}

View file

@@ -52,6 +52,7 @@ export function convertMessagesForInference(messages: Message[]): InferenceMessa
}
inferenceMessages.push({
name: message.message.name!,
role: InferenceMessageRole.Tool,
response: JSON.parse(message.message.content ?? '{}'),
toolCallId: toolCallRequest.toolCalls![0].toolCallId,

Some files were not shown because too many files have changed in this diff.