[Obs AI Assistant] Fix contextual insights scoring (#214259)
Closes https://github.com/elastic/kibana/issues/209572
### Summary
Scoring in contextual insights is broken because the
`get_contextual_insight_instructions` tool call is not followed by its
tool response: when scoring, we replace the last user message (in this
case, the tool response) with the user message related to scoring.
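
For illustration, the failing sequence looks roughly like this (trimmed from the `contextualInsightsMessages` fixture added in the tests below): the assistant issues the tool call, and the next user message is the tool response that scoring overwrites.

```ts
// Assistant issues the tool call.
{
  message: {
    role: MessageRole.Assistant,
    function_call: {
      name: 'get_contextual_insight_instructions',
      trigger: MessageRole.Assistant,
      arguments: '{}',
    },
  },
},
// The tool response: a user message carrying the same `name`. Scoring used to
// replace this message with a plain user message, dropping the `name` and
// leaving the tool call above without a response.
{
  message: {
    role: MessageRole.User,
    name: 'get_contextual_insight_instructions',
    content: '{"instructions":"..."}',
  },
},
```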
### Solution
We should include the tool call name when replacing this message, so
that it is converted to inference messages correctly here:
07012811b2/x-pack/platform/plugins/shared/observability_ai_assistant/common/convert_messages_for_inference.ts (L60-L81)
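
The converter pairs a tool response with its originating tool call by `name`. A minimal sketch of why the replacement message must keep the name (simplified from the excerpt in the first hunk below; `reversedMessages` is an assumed name for the list being searched):

```ts
// If the scoring message keeps `message.message.name`, this lookup finds the
// matching assistant tool call; without the name, it fails and scoring breaks.
const toolCallRequest = reversedMessages.find(
  (msg) =>
    msg.role === InferenceMessageRole.Assistant &&
    msg.toolCalls?.[0]?.function.name === message.message.name
) as AssistantMessage | undefined;

if (!toolCallRequest) {
  throw new Error(`Could not find tool call request for ${message.message.name}`);
}
```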
### Checklist
- [x] The PR description includes the appropriate Release Notes section,
and the correct `release_note:*` label is applied per the
[guidelines](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)
Parent: ca1f38a693
Commit: 70e3a6096e
28 changed files with 948 additions and 214 deletions
@@ -66,6 +66,7 @@ export function convertMessagesForInference(
       msg.role === InferenceMessageRole.Assistant &&
+      msg.toolCalls?.[0]?.function.name === message.message.name
   ) as AssistantMessage | undefined;

   if (!toolCallRequest) {
     throw new Error(`Could not find tool call request for ${message.message.name}`);
   }
@@ -63,12 +63,14 @@ export function registerContextFunction({
   );

   const userPrompt = userMessage?.message.content!;
+  const userMessageFunctionName = userMessage?.message.name;

   const { scores, relevantDocuments, suggestions } = await recallAndScore({
     recall: client.recall,
     chat,
     logger: resources.logger,
     userPrompt,
+    userMessageFunctionName,
     context: screenDescription,
     messages,
     signal,
@@ -0,0 +1,265 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { RecalledSuggestion, recallAndScore } from './recall_and_score';
import { scoreSuggestions } from './score_suggestions';
import { MessageRole, type Message } from '../../../common';
import type { FunctionCallChatFunction } from '../../service/types';
import { AnalyticsServiceStart } from '@kbn/core/server';
import { Logger } from '@kbn/logging';
import { recallRankingEventType } from '../../analytics/recall_ranking';

jest.mock('./score_suggestions', () => ({
  scoreSuggestions: jest.fn(),
}));

export const sampleMessages: Message[] = [
  {
    '@timestamp': '2025-03-13T14:53:11.240Z',
    message: { role: MessageRole.User, content: 'test' },
  },
];

export const normalConversationMessages: Message[] = [
  {
    '@timestamp': '2025-03-12T21:00:13.980Z',
    message: { role: MessageRole.User, content: 'What is my favourite color?' },
  },
  {
    '@timestamp': '2025-03-12T21:00:14.920Z',
    message: {
      function_call: { name: 'context', trigger: MessageRole.Assistant },
      role: MessageRole.Assistant,
      content: '',
    },
  },
];

export const contextualInsightsMessages: Message[] = [
  {
    '@timestamp': '2025-03-12T21:01:21.111Z',
    message: {
      role: MessageRole.User,
      content: "I'm looking at an alert and trying to understand why it was triggered",
    },
  },
  {
    '@timestamp': '2025-03-12T21:01:21.111Z',
    message: {
      role: MessageRole.Assistant,
      function_call: {
        name: 'get_contextual_insight_instructions',
        trigger: MessageRole.Assistant,
        arguments: '{}',
      },
    },
  },
  {
    '@timestamp': '2025-03-12T21:01:21.111Z',
    message: {
      role: MessageRole.User,
      content:
        '{"instructions":"I\'m an SRE. I am looking at an alert that was triggered. I want to understand why it was triggered......}',
      name: 'get_contextual_insight_instructions',
    },
  },
  {
    '@timestamp': '2025-03-12T21:01:21.984Z',
    message: {
      function_call: { name: 'context', trigger: MessageRole.Assistant },
      role: MessageRole.Assistant,
      content: '',
    },
  },
];

describe('recallAndScore', () => {
  const mockRecall = jest.fn();
  const mockChat = jest.fn() as unknown as FunctionCallChatFunction;
  const mockLogger = { error: jest.fn(), debug: jest.fn() } as unknown as Logger;
  const mockAnalytics = { reportEvent: jest.fn() } as unknown as AnalyticsServiceStart;
  const signal = new AbortController().signal;

  beforeEach(() => {
    jest.clearAllMocks();
  });

  describe('when no documents are recalled', () => {
    let result: {
      relevantDocuments?: RecalledSuggestion[];
      scores?: Array<{ id: string; score: number }>;
      suggestions: RecalledSuggestion[];
    };

    beforeEach(async () => {
      mockRecall.mockResolvedValue([]);

      result = await recallAndScore({
        recall: mockRecall,
        chat: mockChat,
        analytics: mockAnalytics,
        userPrompt: 'What is my favorite color?',
        context: 'Some context',
        messages: sampleMessages,
        logger: mockLogger,
        signal,
      });
    });

    it('returns empty suggestions', async () => {
      expect(result).toEqual({ relevantDocuments: [], scores: [], suggestions: [] });
    });

    it('invokes recall with user prompt and screen context', async () => {
      expect(mockRecall).toHaveBeenCalledWith({
        queries: [
          { text: 'What is my favorite color?', boost: 3 },
          { text: 'Some context', boost: 1 },
        ],
      });
    });

    it('does not score the suggestions', async () => {
      expect(scoreSuggestions).not.toHaveBeenCalled();
    });
  });

  it('handles errors when scoring fails', async () => {
    mockRecall.mockResolvedValue([{ id: 'doc1', text: 'Hello world', score: 0.5 }]);
    (scoreSuggestions as jest.Mock).mockRejectedValue(new Error('Scoring failed'));

    const result = await recallAndScore({
      recall: mockRecall,
      chat: mockChat,
      analytics: mockAnalytics,
      userPrompt: 'test',
      context: 'context',
      messages: sampleMessages,
      logger: mockLogger,
      signal,
    });

    expect(mockLogger.error).toHaveBeenCalledWith(
      expect.stringContaining('Error scoring documents: Scoring failed'),
      expect.any(Object)
    );
    expect(result.suggestions.length).toBe(1);
    expect(result.suggestions[0].id).toBe('doc1');
  });

  it('calls scoreSuggestions with correct arguments', async () => {
    const recalledDocs = [{ id: 'doc1', text: 'Hello world', score: 0.8 }];
    mockRecall.mockResolvedValue(recalledDocs);
    (scoreSuggestions as jest.Mock).mockResolvedValue({
      scores: [{ id: 'doc1', score: 7 }],
      relevantDocuments: recalledDocs,
    });

    await recallAndScore({
      recall: mockRecall,
      chat: mockChat,
      analytics: mockAnalytics,
      userPrompt: 'test',
      context: 'context',
      messages: sampleMessages,
      logger: mockLogger,
      signal,
    });

    expect(scoreSuggestions).toHaveBeenCalledWith({
      suggestions: recalledDocs,
      logger: mockLogger,
      messages: sampleMessages,
      userPrompt: 'test',
      userMessageFunctionName: undefined,
      context: 'context',
      signal,
      chat: mockChat,
    });
  });

  it('handles the normal conversation flow correctly', async () => {
    mockRecall.mockResolvedValue([
      { id: 'fav_color', text: 'My favourite color is blue.', score: 0.9 },
    ]);
    (scoreSuggestions as jest.Mock).mockResolvedValue({
      scores: [{ id: 'fav_color', score: 7 }],
      relevantDocuments: [{ id: 'fav_color', text: 'My favourite color is blue.' }],
    });

    const result = await recallAndScore({
      recall: mockRecall,
      chat: mockChat,
      analytics: mockAnalytics,
      userPrompt: "What's my favourite color?",
      context: '',
      messages: normalConversationMessages,
      logger: mockLogger,
      signal,
    });

    expect(result.relevantDocuments).toEqual([
      { id: 'fav_color', text: 'My favourite color is blue.' },
    ]);
    expect(mockRecall).toHaveBeenCalled();
    expect(scoreSuggestions).toHaveBeenCalled();
  });

  it('handles contextual insights conversation flow correctly', async () => {
    mockRecall.mockResolvedValue([
      { id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.', score: 0.85 },
    ]);
    (scoreSuggestions as jest.Mock).mockResolvedValue({
      scores: [{ id: 'alert_cause', score: 6 }],
      relevantDocuments: [
        { id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.' },
      ],
    });

    const result = await recallAndScore({
      recall: mockRecall,
      chat: mockChat,
      analytics: mockAnalytics,
      userPrompt: "I'm looking at an alert and trying to understand why it was triggered",
      context: 'User is analyzing an alert',
      messages: contextualInsightsMessages,
      logger: mockLogger,
      signal,
    });

    expect(result.relevantDocuments).toEqual([
      { id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.' },
    ]);
    expect(mockRecall).toHaveBeenCalled();
    expect(scoreSuggestions).toHaveBeenCalled();
  });

  it('reports analytics with the correct structure', async () => {
    const recalledDocs = [{ id: 'doc1', text: 'Hello world', score: 0.8 }];
    mockRecall.mockResolvedValue(recalledDocs);
    (scoreSuggestions as jest.Mock).mockResolvedValue({
      scores: [{ id: 'doc1', score: 7 }],
      relevantDocuments: recalledDocs,
    });

    await recallAndScore({
      recall: mockRecall,
      chat: mockChat,
      analytics: mockAnalytics,
      userPrompt: 'test',
      context: 'context',
      messages: sampleMessages,
      logger: mockLogger,
      signal,
    });

    expect(mockAnalytics.reportEvent).toHaveBeenCalledWith(
      recallRankingEventType,
      expect.objectContaining({ scoredDocuments: [{ elserScore: 0.8, llmScore: 7 }] })
    );
  });
});
@@ -21,6 +21,7 @@ export async function recallAndScore({
   chat,
   analytics,
   userPrompt,
+  userMessageFunctionName,
   context,
   messages,
   logger,
@@ -30,6 +31,7 @@ export async function recallAndScore({
   chat: FunctionCallChatFunction;
   analytics: AnalyticsServiceStart;
   userPrompt: string;
+  userMessageFunctionName?: string;
   context: string;
   messages: Message[];
   logger: Logger;
@@ -62,6 +64,7 @@ export async function recallAndScore({
   logger,
   messages,
   userPrompt,
+  userMessageFunctionName,
   context,
   signal,
   chat,
@@ -0,0 +1,168 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { scoreSuggestions } from './score_suggestions';
import { Logger } from '@kbn/logging';
import { of } from 'rxjs';
import { MessageRole, StreamingChatResponseEventType } from '../../../common';
import { RecalledSuggestion } from './recall_and_score';
import { FunctionCallChatFunction } from '../../service/types';
import { ChatEvent } from '../../../common/conversation_complete';
import { contextualInsightsMessages, normalConversationMessages } from './recall_and_score.test';

const suggestions: RecalledSuggestion[] = [
  { id: 'doc1', text: 'Relevant document 1', score: 0.9 },
  { id: 'doc2', text: 'Relevant document 2', score: 0.8 },
  { id: 'doc3', text: 'Less relevant document 3', score: 0.3 },
];

const userPrompt = 'What is my favourite color?';
const context = 'Some context';

describe('scoreSuggestions', () => {
  const mockLogger = { error: jest.fn(), debug: jest.fn() } as unknown as Logger;
  let mockChat: jest.MockedFunction<FunctionCallChatFunction>;

  beforeEach(() => {
    mockChat = jest.fn((_name, _params) =>
      of({
        type: StreamingChatResponseEventType.ChatCompletionChunk,
        message: {
          function_call: {
            name: 'score',
            arguments: JSON.stringify({ scores: 'doc1,7\ndoc2,5\ndoc3,3' }),
          },
        },
      } as ChatEvent)
    );
  });

  it('should correctly score and return relevant documents', async () => {
    const result = await scoreSuggestions({
      suggestions,
      messages: normalConversationMessages,
      userPrompt,
      context,
      chat: mockChat,
      signal: new AbortController().signal,
      logger: mockLogger,
    });

    expect(result.scores).toEqual([
      { id: 'doc1', score: 7 },
      { id: 'doc2', score: 5 },
      { id: 'doc3', score: 3 },
    ]);

    expect(result.relevantDocuments).toEqual([
      { id: 'doc1', text: 'Relevant document 1', score: 0.9 },
      { id: 'doc2', text: 'Relevant document 2', score: 0.8 },
    ]);
  });

  it('should return no relevant documents if all scores are low', async () => {
    mockChat.mockReturnValueOnce(
      of({
        id: 'mock-id',
        type: StreamingChatResponseEventType.ChatCompletionChunk,
        message: {
          function_call: {
            name: 'score',
            arguments: JSON.stringify({ scores: 'doc1,2\ndoc2,3\ndoc3,1' }),
          },
        },
      })
    );

    const result = await scoreSuggestions({
      suggestions,
      messages: normalConversationMessages,
      userPrompt,
      userMessageFunctionName: 'score',
      context,
      chat: mockChat,
      signal: new AbortController().signal,
      logger: mockLogger,
    });

    expect(result.relevantDocuments).toEqual([]);
  });

  it('should ignore hallucinated document IDs', async () => {
    mockChat.mockReturnValueOnce(
      of({
        id: 'mock-id',
        type: StreamingChatResponseEventType.ChatCompletionChunk,
        message: {
          function_call: {
            name: 'score',
            arguments: JSON.stringify({ scores: 'doc1,6\nfake_doc,5' }),
          },
        },
      })
    );

    const result = await scoreSuggestions({
      suggestions,
      messages: normalConversationMessages,
      userPrompt,
      context,
      chat: mockChat,
      signal: new AbortController().signal,
      logger: mockLogger,
    });

    expect(result.relevantDocuments).toEqual([
      { id: 'doc1', text: 'Relevant document 1', score: 0.9 },
    ]);
  });

  it('throws an exception when function args are invalid', async () => {
    mockChat.mockReturnValueOnce(
      of({
        id: 'mock-id',
        type: StreamingChatResponseEventType.ChatCompletionChunk,
        message: { function_call: { name: 'score', arguments: 'invalid_json' } },
      })
    );

    await expect(
      scoreSuggestions({
        suggestions,
        messages: normalConversationMessages,
        userPrompt,
        context,
        chat: mockChat,
        signal: new AbortController().signal,
        logger: mockLogger,
      })
    ).rejects.toThrow();
  });

  it('should handle scenarios where the last user message is a tool response', async () => {
    const lastUserMessage = contextualInsightsMessages
      .filter((message) => message.message.role === MessageRole.User)
      .pop();

    const result = await scoreSuggestions({
      suggestions,
      messages: contextualInsightsMessages,
      userPrompt: lastUserMessage?.message.content!,
      userMessageFunctionName: lastUserMessage?.message.name,
      context,
      chat: mockChat,
      signal: new AbortController().signal,
      logger: mockLogger,
    });

    expect(result.scores).toEqual([
      { id: 'doc1', score: 7 },
      { id: 'doc2', score: 5 },
      { id: 'doc3', score: 3 },
    ]);
  });
});
@@ -33,6 +33,7 @@ export async function scoreSuggestions({
   suggestions,
   messages,
   userPrompt,
+  userMessageFunctionName,
   context,
   chat,
   signal,
@@ -41,6 +42,7 @@ export async function scoreSuggestions({
   suggestions: RecalledSuggestion[];
   messages: Message[];
   userPrompt: string;
+  userMessageFunctionName?: string;
   context: string;
   chat: FunctionCallChatFunction;
   signal: AbortSignal;
@@ -81,7 +83,10 @@ export async function scoreSuggestions({
     '@timestamp': new Date().toISOString(),
     message: {
       role: MessageRole.User,
-      content: newUserMessageContent,
+      content: userMessageFunctionName
+        ? JSON.stringify(newUserMessageContent)
+        : newUserMessageContent,
+      ...(userMessageFunctionName ? { name: userMessageFunctionName } : {}),
     },
   };
@@ -122,8 +127,8 @@ export async function scoreSuggestions({
   );

   const scores = parseSuggestionScores(scoresAsString)
-    // Restore original IDs
-    .map(({ id, score }) => ({ id: shortIdTable.lookup(id)!, score }));
+    // Restore original IDs (added fallback to id for testing purposes)
+    .map(({ id, score }) => ({ id: shortIdTable.lookup(id) || id, score }));

   if (scores.length === 0) {
     // seemingly invalid or no scores, return all
@@ -22,11 +22,14 @@ import {
   LlmProxy,
   ToolMessage,
 } from '../../../../../../observability_ai_assistant_api_integration/common/create_llm_proxy';
-import { decodeEvents, getConversationCreatedEvent } from '../helpers';
 import type { DeploymentAgnosticFtrProviderContext } from '../../../../ftr_provider_context';
 import { SupertestWithRoleScope } from '../../../../services/role_scoped_supertest';
-import { clearConversations } from '../knowledge_base/helpers';
-import { systemMessageSorted } from './functions/helpers';
+import {
+  systemMessageSorted,
+  clearConversations,
+  decodeEvents,
+  getConversationCreatedEvent,
+} from '../utils/conversation';

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
   const log = getService('log');
@@ -11,14 +11,17 @@ import {
   LlmProxy,
   createLlmProxy,
 } from '../../../../../../../observability_ai_assistant_api_integration/common/create_llm_proxy';
-import { getMessageAddedEvents, invokeChatCompleteWithFunctionRequest } from './helpers';
+import {
+  getMessageAddedEvents,
+  invokeChatCompleteWithFunctionRequest,
+} from '../../utils/conversation';
 import type { DeploymentAgnosticFtrProviderContext } from '../../../../../ftr_provider_context';

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
   const log = getService('log');
   const observabilityAIAssistantAPIClient = getService('observabilityAIAssistantApi');

-  describe('when calling the alerts function', function () {
+  describe('alerts', function () {
     // Fails on MKI: https://github.com/elastic/kibana/issues/205581
     this.tags(['failsOnMKI']);
     let proxy: LlmProxy;
@@ -0,0 +1,258 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import expect from '@kbn/expect';
import { first, last } from 'lodash';
import { ChatCompletionStreamParams } from 'openai/lib/ChatCompletionStream';
import {
  KnowledgeBaseEntry,
  MessageAddEvent,
  MessageRole,
} from '@kbn/observability-ai-assistant-plugin/common';
import { CONTEXT_FUNCTION_NAME } from '@kbn/observability-ai-assistant-plugin/server/functions/context';
import { RecalledSuggestion } from '@kbn/observability-ai-assistant-plugin/server/utils/recall/recall_and_score';
import { Instruction } from '@kbn/observability-ai-assistant-plugin/common/types';
import {
  KnowledgeBaseDocument,
  LlmProxy,
  createLlmProxy,
} from '../../../../../../../observability_ai_assistant_api_integration/common/create_llm_proxy';
import type { DeploymentAgnosticFtrProviderContext } from '../../../../../ftr_provider_context';
import {
  addSampleDocsToInternalKb,
  clearKnowledgeBase,
  deleteInferenceEndpoint,
  deleteKnowledgeBaseModel,
} from '../../utils/knowledge_base';
import { chatComplete } from '../../utils/conversation';

const screenContexts = [
  {
    screenDescription: 'User is viewing an active alert.',
    data: [
      {
        name: 'alert_fields',
        description: 'The fields and values for the alert',
        value: {
          'kibana.alert.rule.name': 'Error count threshold rule',
          'kibana.alert.status': 'active',
          'service.name': 'opbeans-go',
        },
      },
    ],
  },
];

const sampleDocsForInternalKb = [
  {
    id: 'favourite_color',
    title: 'Favorite Color',
    text: 'My favourite color is blue.',
  },
  {
    id: 'alert_instructions',
    title: 'Alert Handling Guide',
    text: 'All alerts should be considered high priority. Every alert is monitored every day. Threshold alerts should be resolved first. Consider this when analyzing alerts.',
  },
  {
    id: 'miscellaneous',
    title: 'Miscellaneous Note',
    text: 'hello again',
  },
];

const userPrompt = `What's my favourite color?`;

export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
  const observabilityAIAssistantAPIClient = getService('observabilityAIAssistantApi');
  const es = getService('es');
  const ml = getService('ml');
  const log = getService('log');

  describe('context', function () {
    this.tags(['failsOnMKI']);
    let llmProxy: LlmProxy;
    let connectorId: string;
    let messageAddedEvents: MessageAddEvent[];
    let getDocuments: () => Promise<KnowledgeBaseDocument[]>;

    before(async () => {
      llmProxy = await createLlmProxy(log);
      connectorId = await observabilityAIAssistantAPIClient.createProxyActionConnector({
        port: llmProxy.getPort(),
      });

      await addSampleDocsToInternalKb(getService, sampleDocsForInternalKb);

      ({ getDocuments } = llmProxy.interceptScoreToolChoice(log));

      void llmProxy.interceptConversation('Your favourite color is blue.');

      ({ messageAddedEvents } = await chatComplete({
        userPrompt,
        screenContexts,
        connectorId,
        observabilityAIAssistantAPIClient,
      }));

      await llmProxy.waitForAllInterceptorsToHaveBeenCalled();
    });

    after(async () => {
      llmProxy.close();
      await observabilityAIAssistantAPIClient.deleteActionConnector({
        actionId: connectorId,
      });

      await deleteKnowledgeBaseModel(ml);
      await deleteInferenceEndpoint({ es });
      await clearKnowledgeBase(es);
    });

    describe('calling the context function via /chat/complete', () => {
      let firstRequestBody: ChatCompletionStreamParams;
      let secondRequestBody: ChatCompletionStreamParams;

      before(async () => {
        firstRequestBody = llmProxy.interceptedRequests[0].requestBody;
        secondRequestBody = llmProxy.interceptedRequests[1].requestBody;
      });

      it('makes 2 requests to the LLM', () => {
        expect(llmProxy.interceptedRequests.length).to.be(2);
      });

      it('emits 3 messageAdded events', () => {
        expect(messageAddedEvents.length).to.be(3);
      });

      describe('The first request - Scoring documents', () => {
        it('contains the correct number of messages', () => {
          expect(firstRequestBody.messages.length).to.be(2);
        });

        it('contains the system message as the first message in the request', () => {
          expect(first(firstRequestBody.messages)?.role === MessageRole.System);
        });

        it('contains a message with the prompt for scoring', () => {
          expect(last(firstRequestBody.messages)?.content).to.contain(
            'score the documents that are relevant to the prompt on a scale from 0 to 7'
          );
        });

        it('instructs the LLM with the correct tool_choice and tools for scoring', () => {
          // @ts-expect-error
          expect(firstRequestBody.tool_choice?.function?.name).to.be('score');
          expect(firstRequestBody.tools?.length).to.be(1);
          expect(first(firstRequestBody.tools)?.function.name).to.be('score');
        });

        it('sends the correct documents to the LLM', async () => {
          const extractedDocs = await getDocuments();
          const expectedTexts = sampleDocsForInternalKb.map((doc) => doc.text).sort();
          const actualTexts = extractedDocs.map((doc) => doc.text).sort();

          expect(actualTexts).to.eql(expectedTexts);
        });
      });

      describe('The second request - Sending the user prompt', () => {
        it('contains the correct number of messages', () => {
          expect(secondRequestBody.messages.length).to.be(4);
        });

        it('contains the system message as the first message in the request', () => {
          expect(first(secondRequestBody.messages)?.role === MessageRole.System);
        });

        it('contains the user prompt', () => {
          expect(secondRequestBody.messages[1].role).to.be(MessageRole.User);
          expect(secondRequestBody.messages[1].content).to.be(userPrompt);
        });

        it('leaves the LLM to choose the correct tool by leaving tool_choice as auto and passes tools', () => {
          expect(secondRequestBody.tool_choice).to.be('auto');
          expect(secondRequestBody.tools?.length).to.not.be(0);
        });

        it('contains the tool call for context and the corresponding response', () => {
          expect(secondRequestBody.messages[2].role).to.be(MessageRole.Assistant);
          // @ts-expect-error
          expect(secondRequestBody.messages[2].tool_calls[0].function.name).to.be(
            CONTEXT_FUNCTION_NAME
          );

          expect(last(secondRequestBody.messages)?.role).to.be('tool');
          // @ts-expect-error
          expect(last(secondRequestBody.messages)?.tool_call_id).to.equal(
            // @ts-expect-error
            secondRequestBody.messages[2].tool_calls[0].id
          );
        });

        it('sends the knowledge base entries to the LLM', () => {
          const content = last(secondRequestBody.messages)?.content as string;
          const parsedContent = JSON.parse(content);
          const learnings = parsedContent.learnings;

          const expectedTexts = sampleDocsForInternalKb.map((doc) => doc.text).sort();
          const actualTexts = learnings.map((learning: KnowledgeBaseEntry) => learning.text).sort();

          expect(actualTexts).to.eql(expectedTexts);
        });
      });

      describe('information retrieval', () => {
        let contextFunctionResponse: MessageAddEvent | undefined;

        before(() => {
          contextFunctionResponse = messageAddedEvents.find(
            ({ message }) => message.message.name === CONTEXT_FUNCTION_NAME
          );
        });

        it('retrieves the screen context correctly', async () => {
          expect(contextFunctionResponse).to.not.be(null);

          const parsedContextResponseContent = JSON.parse(
            contextFunctionResponse!.message.message.content!
          );
          expect(parsedContextResponseContent).to.have.property('screen_description');
          expect(parsedContextResponseContent.screen_description).to.contain(
            screenContexts[0].screenDescription
          );
        });

        it('retrieves entries from the KB correctly with a score', async () => {
          const parsedContextResponseData = JSON.parse(
            contextFunctionResponse!.message.message.data!
          );
          expect(parsedContextResponseData).to.have.property('suggestions');
          expect(parsedContextResponseData.suggestions).to.be.an('array');
          expect(parsedContextResponseData.suggestions.length).to.be(3);

          parsedContextResponseData.suggestions.forEach((suggestion: RecalledSuggestion) => {
            expect(suggestion).to.have.property('id');
            expect(suggestion).to.have.property('text');
            expect(suggestion).to.have.property('score');
          });

          const suggestionTexts = parsedContextResponseData.suggestions
            .map((s: KnowledgeBaseEntry) => s.text)
            .sort();

          const sampleDocTexts = sampleDocsForInternalKb
            .map((doc: Instruction & { title: string }) => doc.text)
            .sort();

          expect(suggestionTexts).to.eql(sampleDocTexts);
        });
      });
    });
  });
}
@@ -14,7 +14,10 @@ import {
   LlmProxy,
   createLlmProxy,
 } from '../../../../../../../observability_ai_assistant_api_integration/common/create_llm_proxy';
-import { getMessageAddedEvents, invokeChatCompleteWithFunctionRequest } from './helpers';
+import {
+  getMessageAddedEvents,
+  invokeChatCompleteWithFunctionRequest,
+} from '../../utils/conversation';
 import type { DeploymentAgnosticFtrProviderContext } from '../../../../../ftr_provider_context';

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
@@ -22,7 +25,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
   const synthtrace = getService('synthtrace');
   const observabilityAIAssistantAPIClient = getService('observabilityAIAssistantApi');

-  describe('when calling elasticsearch', function () {
+  describe('elasticsearch', function () {
     // Fails on MKI: https://github.com/elastic/kibana/issues/205581
     this.tags(['failsOnMKI']);
     let proxy: LlmProxy;
@@ -15,7 +15,7 @@ import {
   LlmProxy,
   createLlmProxy,
 } from '../../../../../../../observability_ai_assistant_api_integration/common/create_llm_proxy';
-import { chatComplete } from './helpers';
+import { chatComplete } from '../../utils/conversation';
 import type { DeploymentAgnosticFtrProviderContext } from '../../../../../ftr_provider_context';
 import { createSimpleSyntheticLogs } from '../../synthtrace_scenarios/simple_logs';
@@ -20,7 +20,7 @@ import {
   RelevantField,
   createLlmProxy,
 } from '../../../../../../../observability_ai_assistant_api_integration/common/create_llm_proxy';
-import { chatComplete, getSystemMessage, systemMessageSorted } from './helpers';
+import { chatComplete, getSystemMessage, systemMessageSorted } from '../../utils/conversation';
 import type { DeploymentAgnosticFtrProviderContext } from '../../../../../ftr_provider_context';
 import { APM_ALERTS_INDEX } from '../../../apm/alerts/helpers/alerting_helper';
@@ -16,7 +16,7 @@ import {
   RelevantField,
   createLlmProxy,
 } from '../../../../../../../observability_ai_assistant_api_integration/common/create_llm_proxy';
-import { chatComplete, getSystemMessage, systemMessageSorted } from './helpers';
+import { chatComplete, getSystemMessage, systemMessageSorted } from '../../utils/conversation';
 import type { DeploymentAgnosticFtrProviderContext } from '../../../../../ftr_provider_context';
 import { createSimpleSyntheticLogs } from '../../synthtrace_scenarios/simple_logs';
@@ -6,21 +6,64 @@
  */

 import expect from '@kbn/expect';
 import { AI_ASSISTANT_KB_INFERENCE_ID } from '@kbn/observability-ai-assistant-plugin/server/service/inference_endpoint';
 import { first, uniq } from 'lodash';
 import type { DeploymentAgnosticFtrProviderContext } from '../../../../../ftr_provider_context';
 import {
   clearKnowledgeBase,
   deleteInferenceEndpoint,
   deleteKnowledgeBaseModel,
   importTinyElserModel,
   setupKnowledgeBase,
   waitForKnowledgeBaseReady,
-} from '../../knowledge_base/helpers';
-import { setAdvancedSettings } from '../../utils/advanced_settings';
+  addSampleDocsToInternalKb,
+  addSampleDocsToCustomIndex,
+} from '../../utils/knowledge_base';

 const customSearchConnectorIndex = 'animals_kb';

+const sampleDocsForInternalKb = [
+  {
+    id: 'technical_db_outage_slow_queries',
+    title: 'Database Outage: Slow Query Execution',
+    text: 'At 03:15 AM UTC, the production database experienced a significant outage, leading to slow query execution and increased response times across multiple services. A surge in database load was detected, with 90% of queries exceeding 2 seconds. A detailed log analysis pointed to locking issues within the transaction queue and inefficient index usage.',
+  },
+  {
+    id: 'technical_api_gateway_timeouts',
+    title: 'Service Timeout: API Gateway Bottleneck',
+    text: 'At 10:45 AM UTC, the API Gateway encountered a timeout issue, causing a 500 error for all incoming requests. Detailed traces indicated a significant bottleneck at the gateway level, where requests stalled while waiting for upstream service responses. The upstream service was overwhelmed due to a sudden spike in inbound traffic and failed to release resources promptly.',
+  },
+  {
+    id: 'technical_cache_misses_thirdparty_api',
+    title: 'Cache Misses and Increased Latency: Third-Party API Failure',
+    text: 'At 04:30 PM UTC, a dramatic increase in cache misses and latency was observed. The failure of a third-party API prevented critical data from being cached, leading to unnecessary re-fetching of resources from external sources. This caused significant delays in response times, with up to 10-second delays in some key services.',
+  },
+];
+
+const sampleDocsForCustomIndex = [
+  {
+    id: 'animal_elephants_social_structure',
+    title: 'Elephants and Their Social Structure',
+    text: 'Elephants are highly social animals that live in matriarchal herds led by the oldest female. These animals communicate through low-frequency sounds, called infrasound, that travel long distances. They are known for their intelligence, strong memory, and deep emotional bonds with each other.',
+  },
+  {
+    id: 'animal_cheetah_life_speed',
+    title: 'The Life of a Cheetah',
+    text: 'Cheetahs are the fastest land animals, capable of reaching speeds up to 60 miles per hour in short bursts. They rely on their speed to catch prey, such as gazelles. Unlike other big cats, cheetahs cannot roar, but they make distinctive chirping sounds, especially when communicating with their cubs.',
+  },
+  {
+    id: 'animal_whale_migration_patterns',
+    title: 'Whales and Their Migration Patterns',
+    text: 'Whales are known for their long migration patterns, traveling thousands of miles between feeding and breeding grounds.',
+  },
+  {
+    id: 'animal_giraffe_habitat_feeding',
+    title: 'Giraffes: Habitat and Feeding Habits',
+    text: 'Giraffes are the tallest land animals, with long necks that help them reach leaves high up in trees. They live in savannas and grasslands, where they feed on leaves, twigs, and fruits from acacia trees.',
+  },
+  {
+    id: 'animal_penguin_antarctic_adaptations',
+    title: 'Penguins and Their Antarctic Adaptations',
+    text: 'Penguins are flightless birds that have adapted to life in the cold Antarctic environment. They have a thick layer of blubber to keep warm, and their wings have evolved into flippers for swimming in the icy waters.',
+  },
+];

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
   const observabilityAIAssistantAPIClient = getService('observabilityAIAssistantApi');
   const es = getService('es');
@@ -28,8 +71,12 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon

   describe('recall', function () {
     before(async () => {
-      await addSampleDocsToInternalKb(getService);
-      await addSampleDocsToCustomIndex(getService);
+      await addSampleDocsToInternalKb(getService, sampleDocsForInternalKb);
+      await addSampleDocsToCustomIndex(
+        getService,
+        sampleDocsForCustomIndex,
+        customSearchConnectorIndex
+      );
     });

     after(async () => {
@@ -109,113 +156,6 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
     }
   }

-async function addSampleDocsToInternalKb(
-  getService: DeploymentAgnosticFtrProviderContext['getService']
-) {
-  const log = getService('log');
-  const ml = getService('ml');
-  const retry = getService('retry');
-  const observabilityAIAssistantAPIClient = getService('observabilityAIAssistantApi');
-
-  const sampleDocs = [
-    {
-      id: 'technical_db_outage_slow_queries',
-      title: 'Database Outage: Slow Query Execution',
-      text: 'At 03:15 AM UTC, the production database experienced a significant outage, leading to slow query execution and increased response times across multiple services. A surge in database load was detected, with 90% of queries exceeding 2 seconds. A detailed log analysis pointed to locking issues within the transaction queue and inefficient index usage.',
-    },
-    {
-      id: 'technical_api_gateway_timeouts',
-      title: 'Service Timeout: API Gateway Bottleneck',
-      text: 'At 10:45 AM UTC, the API Gateway encountered a timeout issue, causing a 500 error for all incoming requests. Detailed traces indicated a significant bottleneck at the gateway level, where requests stalled while waiting for upstream service responses. The upstream service was overwhelmed due to a sudden spike in inbound traffic and failed to release resources promptly.',
-    },
-    {
-      id: 'technical_cache_misses_thirdparty_api',
-      title: 'Cache Misses and Increased Latency: Third-Party API Failure',
-      text: 'At 04:30 PM UTC, a dramatic increase in cache misses and latency was observed. The failure of a third-party API prevented critical data from being cached, leading to unnecessary re-fetching of resources from external sources. This caused significant delays in response times, with up to 10-second delays in some key services.',
-    },
-  ];
-
-  await importTinyElserModel(ml);
-  await setupKnowledgeBase(observabilityAIAssistantAPIClient);
-  await waitForKnowledgeBaseReady({ observabilityAIAssistantAPIClient, log, retry });
-
-  await observabilityAIAssistantAPIClient.editor({
-    endpoint: 'POST /internal/observability_ai_assistant/kb/entries/import',
-    params: {
-      body: {
-        entries: sampleDocs,
-      },
-    },
-  });
-}
-
-async function addSampleDocsToCustomIndex(
-  getService: DeploymentAgnosticFtrProviderContext['getService']
-) {
-  const es = getService('es');
-  const supertest = getService('supertest');
-  const log = getService('log');
-
-  const sampleDocs = [
-    {
-      id: 'animal_elephants_social_structure',
-      title: 'Elephants and Their Social Structure',
-      text: 'Elephants are highly social animals that live in matriarchal herds led by the oldest female. These animals communicate through low-frequency sounds, called infrasound, that travel long distances. They are known for their intelligence, strong memory, and deep emotional bonds with each other.',
-    },
-    {
-      id: 'animal_cheetah_life_speed',
-      title: 'The Life of a Cheetah',
-      text: 'Cheetahs are the fastest land animals, capable of reaching speeds up to 60 miles per hour in short bursts. They rely on their speed to catch prey, such as gazelles. Unlike other big cats, cheetahs cannot roar, but they make distinctive chirping sounds, especially when communicating with their cubs.',
-    },
-    {
-      id: 'animal_whale_migration_patterns',
-      title: 'Whales and Their Migration Patterns',
-      text: 'Whales are known for their long migration patterns, traveling thousands of miles between feeding and breeding grounds.',
-    },
-    {
-      id: 'animal_giraffe_habitat_feeding',
-      title: 'Giraffes: Habitat and Feeding Habits',
-      text: 'Giraffes are the tallest land animals, with long necks that help them reach leaves high up in trees. They live in savannas and grasslands, where they feed on leaves, twigs, and fruits from acacia trees.',
-    },
-    {
-      id: 'animal_penguin_antarctic_adaptations',
-      title: 'Penguins and Their Antarctic Adaptations',
-      text: 'Penguins are flightless birds that have adapted to life in the cold Antarctic environment. They have a thick layer of blubber to keep warm, and their wings have evolved into flippers for swimming in the icy waters.',
-    },
-  ];
-
-  // create index with semantic_text mapping for `text` field
-  log.info('Creating custom index with sample animal docs...');
-  await es.indices.create({
-    index: customSearchConnectorIndex,
-    mappings: {
-      properties: {
-        title: { type: 'text' },
-        text: { type: 'semantic_text', inference_id: AI_ASSISTANT_KB_INFERENCE_ID },
-      },
-    },
-  });
-
-  log.info('Indexing sample animal docs...');
-  // ingest sampleDocs
-  await Promise.all(
-    sampleDocs.map(async (doc) => {
-      const { id, ...restDoc } = doc;
-      return es.index({
-        refresh: 'wait_for',
-        index: customSearchConnectorIndex,
-        id,
-        body: restDoc,
-      });
-    })
-  );
-
-  // update the advanced settings (`observability:aiAssistantSearchConnectorIndexPattern`) to include the custom index
-  await setAdvancedSettings(supertest, {
-    'observability:aiAssistantSearchConnectorIndexPattern': customSearchConnectorIndex,
-  });
-}

 function formatScore(score: number) {
   if (score > 0.5) {
     return 'high';
@@ -12,7 +12,7 @@ import {
   createLlmProxy,
 } from '../../../../../../../observability_ai_assistant_api_integration/common/create_llm_proxy';
 import type { DeploymentAgnosticFtrProviderContext } from '../../../../../ftr_provider_context';
-import { invokeChatCompleteWithFunctionRequest } from './helpers';
+import { invokeChatCompleteWithFunctionRequest } from '../../utils/conversation';
 import {
   clearKnowledgeBase,
   importTinyElserModel,
@@ -20,7 +20,7 @@ import {
   deleteKnowledgeBaseModel,
   setupKnowledgeBase,
   waitForKnowledgeBaseReady,
-} from '../../knowledge_base/helpers';
+} from '../../utils/knowledge_base';

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
   const log = getService('log');
@@ -29,7 +29,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
   const retry = getService('retry');
   const observabilityAIAssistantAPIClient = getService('observabilityAIAssistantApi');

-  describe('when calling summarize function', function () {
+  describe('summarize', function () {
     // Fails on MKI: https://github.com/elastic/kibana/issues/205581
     this.tags(['failsOnMKI']);
     let proxy: LlmProxy;
@@ -10,6 +10,7 @@ import type { DeploymentAgnosticFtrProviderContext } from '../../../../ftr_provi

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
   const observabilityAIAssistantAPIClient = getService('observabilityAIAssistantApi');

   describe('List connectors', function () {
     before(async () => {
+      await observabilityAIAssistantAPIClient.deleteAllActionConnectors();
@@ -1,52 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import { Readable } from 'stream';
-import {
-  ConversationCreateEvent,
-  ConversationUpdateEvent,
-  StreamingChatResponseEvent,
-  StreamingChatResponseEventType,
-} from '@kbn/observability-ai-assistant-plugin/common/conversation_complete';
-
-export function decodeEvents(body: Readable | string) {
-  return String(body)
-    .split('\n')
-    .map((line) => line.trim())
-    .filter(Boolean)
-    .map((line) => JSON.parse(line) as StreamingChatResponseEvent);
-}
-
-export function getConversationCreatedEvent(body: Readable | string) {
-  const decodedEvents = decodeEvents(body);
-  const conversationCreatedEvent = decodedEvents.find(
-    (event) => event.type === StreamingChatResponseEventType.ConversationCreate
-  ) as ConversationCreateEvent;
-
-  if (!conversationCreatedEvent) {
-    throw new Error(
-      `No conversation created event found: ${JSON.stringify(decodedEvents, null, 2)}`
-    );
-  }
-
-  return conversationCreatedEvent;
-}
-
-export function getConversationUpdatedEvent(body: Readable | string) {
-  const decodedEvents = decodeEvents(body);
-  const conversationUpdatedEvent = decodedEvents.find(
-    (event) => event.type === StreamingChatResponseEventType.ConversationUpdate
-  ) as ConversationUpdateEvent;
-
-  if (!conversationUpdatedEvent) {
-    throw new Error(
-      `No conversation created event found: ${JSON.stringify(decodedEvents, null, 2)}`
-    );
-  }
-
-  return conversationUpdatedEvent;
-}
@@ -23,6 +23,7 @@ export default function aiAssistantApiIntegrationTests({
     loadTestFile(require.resolve('./complete/functions/retrieve_elastic_doc.spec.ts'));
     loadTestFile(require.resolve('./complete/functions/summarize.spec.ts'));
     loadTestFile(require.resolve('./complete/functions/recall.spec.ts'));
+    loadTestFile(require.resolve('./complete/functions/context.spec.ts'));
     loadTestFile(require.resolve('./public_complete/public_complete.spec.ts'));
     loadTestFile(require.resolve('./knowledge_base/knowledge_base_setup.spec.ts'));
     loadTestFile(
@@ -15,7 +15,7 @@ import {
   deleteKnowledgeBaseModel,
   setupKnowledgeBase,
   waitForKnowledgeBaseReady,
-} from './helpers';
+} from '../utils/knowledge_base';

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
   const ml = getService('ml');
@@ -18,7 +18,7 @@ import {
   deleteInferenceEndpoint,
   setupKnowledgeBase,
   waitForKnowledgeBaseReady,
-} from './helpers';
+} from '../utils/knowledge_base';

 interface InferenceChunk {
   text: string;
@@ -17,7 +17,7 @@ import {
   deleteInferenceEndpoint,
   setupKnowledgeBase,
   waitForKnowledgeBaseReady,
-} from './helpers';
+} from '../utils/knowledge_base';

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
   const observabilityAIAssistantAPIClient = getService('observabilityAIAssistantApi');
@@ -13,7 +13,7 @@ import {
   TINY_ELSER,
   deleteInferenceEndpoint,
   setupKnowledgeBase,
-} from './helpers';
+} from '../utils/knowledge_base';

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
   const ml = getService('ml');
@@ -15,7 +15,7 @@ import {
   deleteInferenceEndpoint,
   setupKnowledgeBase,
   waitForKnowledgeBaseReady,
-} from './helpers';
+} from '../utils/knowledge_base';

 export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderContext) {
   const ml = getService('ml');
@@ -13,19 +13,18 @@ import { Instruction } from '@kbn/observability-ai-assistant-plugin/common/types
 import pRetry from 'p-retry';
 import type { DeploymentAgnosticFtrProviderContext } from '../../../../ftr_provider_context';
 import {
-  clearConversations,
   clearKnowledgeBase,
   importTinyElserModel,
   deleteInferenceEndpoint,
   deleteKnowledgeBaseModel,
   setupKnowledgeBase,
   waitForKnowledgeBaseReady,
-} from './helpers';
-import { getConversationCreatedEvent } from '../helpers';
+} from '../utils/knowledge_base';
 import {
   LlmProxy,
   createLlmProxy,
 } from '../../../../../../observability_ai_assistant_api_integration/common/create_llm_proxy';
+import { clearConversations, getConversationCreatedEvent } from '../utils/conversation';

 const sortById = (data: Array<Instruction & { public?: boolean }>) => sortBy(data, 'id');
@@ -5,18 +5,23 @@
  * 2.0.
  */
 import expect from '@kbn/expect';
+import { Client } from '@elastic/elasticsearch';
 import {
+  ConversationCreateEvent,
+  ConversationUpdateEvent,
   Message,
   MessageAddEvent,
   MessageRole,
   StreamingChatResponseEvent,
   StreamingChatResponseEventType,
 } from '@kbn/observability-ai-assistant-plugin/common';
 import { Readable } from 'stream';
 import type { AssistantScope } from '@kbn/ai-assistant-common';
-import { DeploymentAgnosticFtrProviderContext } from '../../../../../ftr_provider_context';
-import type { ObservabilityAIAssistantApiClient } from '../../../../../services/observability_ai_assistant_api';
+import { ObservabilityAIAssistantScreenContextRequest } from '@kbn/observability-ai-assistant-plugin/common/types';
+import { DeploymentAgnosticFtrProviderContext } from '../../../../ftr_provider_context';
+import type { ObservabilityAIAssistantApiClient } from '../../../../services/observability_ai_assistant_api';

-function decodeEvents(body: Readable | string) {
+export function decodeEvents(body: Readable | string) {
   return String(body)
     .split('\n')
     .map((line) => line.trim())
@@ -77,10 +82,12 @@ export async function invokeChatCompleteWithFunctionRequest({

 export async function chatComplete({
   userPrompt,
+  screenContexts = [],
   connectorId,
   observabilityAIAssistantAPIClient,
 }: {
   userPrompt: string;
+  screenContexts?: ObservabilityAIAssistantScreenContextRequest[];
   connectorId: string;
   observabilityAIAssistantAPIClient: ObservabilityAIAssistantApiClient;
 }) {
@@ -99,7 +106,7 @@ export async function chatComplete({
       ],
       connectorId,
       persist: false,
-      screenContexts: [],
+      screenContexts,
       scopes: ['observability' as const],
     },
   },
@@ -135,3 +142,44 @@ export async function getSystemMessage(

   return body.systemMessage;
 }
+
+export async function clearConversations(es: Client) {
+  const KB_INDEX = '.kibana-observability-ai-assistant-conversations-*';
+
+  return es.deleteByQuery({
+    index: KB_INDEX,
+    conflicts: 'proceed',
+    query: { match_all: {} },
+    refresh: true,
+  });
+}
+
+export function getConversationCreatedEvent(body: Readable | string) {
+  const decodedEvents = decodeEvents(body);
+  const conversationCreatedEvent = decodedEvents.find(
+    (event) => event.type === StreamingChatResponseEventType.ConversationCreate
+  ) as ConversationCreateEvent;
+
+  if (!conversationCreatedEvent) {
+    throw new Error(
+      `No conversation created event found: ${JSON.stringify(decodedEvents, null, 2)}`
+    );
+  }
+
+  return conversationCreatedEvent;
+}
+
+export function getConversationUpdatedEvent(body: Readable | string) {
+  const decodedEvents = decodeEvents(body);
+  const conversationUpdatedEvent = decodedEvents.find(
+    (event) => event.type === StreamingChatResponseEventType.ConversationUpdate
+  ) as ConversationUpdateEvent;
+
+  if (!conversationUpdatedEvent) {
+    throw new Error(
+      `No conversation updated event found: ${JSON.stringify(decodedEvents, null, 2)}`
+    );
+  }
+
+  return conversationUpdatedEvent;
+}
@@ -10,9 +10,12 @@ import { Client } from '@elastic/elasticsearch';
 import { AI_ASSISTANT_KB_INFERENCE_ID } from '@kbn/observability-ai-assistant-plugin/server/service/inference_endpoint';
 import { ToolingLog } from '@kbn/tooling-log';
 import { RetryService } from '@kbn/ftr-common-functional-services';
+import { Instruction } from '@kbn/observability-ai-assistant-plugin/common/types';
+import { DeploymentAgnosticFtrProviderContext } from '../../../../ftr_provider_context';
 import type { ObservabilityAIAssistantApiClient } from '../../../../services/observability_ai_assistant_api';
 import { MachineLearningProvider } from '../../../../../services/ml';
 import { SUPPORTED_TRAINED_MODELS } from '../../../../../../functional/services/ml/api';
+import { setAdvancedSettings } from './advanced_settings';

 export const TINY_ELSER = {
   ...SUPPORTED_TRAINED_MODELS.TINY_ELSER,
@@ -82,17 +85,6 @@ export async function clearKnowledgeBase(es: Client) {
   });
 }

-export async function clearConversations(es: Client) {
-  const KB_INDEX = '.kibana-observability-ai-assistant-conversations-*';
-
-  return es.deleteByQuery({
-    index: KB_INDEX,
-    conflicts: 'proceed',
-    query: { match_all: {} },
-    refresh: true,
-  });
-}
-
 export async function deleteInferenceEndpoint({
   es,
   name = AI_ASSISTANT_KB_INFERENCE_ID,
@@ -102,3 +94,67 @@ export async function deleteInferenceEndpoint({
 }) {
   return es.inference.delete({ inference_id: name, force: true });
 }
+
+export async function addSampleDocsToInternalKb(
+  getService: DeploymentAgnosticFtrProviderContext['getService'],
+  sampleDocs: Array<Instruction & { title: string }>
+) {
+  const log = getService('log');
+  const ml = getService('ml');
+  const retry = getService('retry');
+  const observabilityAIAssistantAPIClient = getService('observabilityAIAssistantApi');
+
+  await importTinyElserModel(ml);
+  await setupKnowledgeBase(observabilityAIAssistantAPIClient);
+  await waitForKnowledgeBaseReady({ observabilityAIAssistantAPIClient, log, retry });
+
+  await observabilityAIAssistantAPIClient.editor({
+    endpoint: 'POST /internal/observability_ai_assistant/kb/entries/import',
+    params: {
+      body: {
+        entries: sampleDocs,
+      },
+    },
+  });
+}
+
+export async function addSampleDocsToCustomIndex(
+  getService: DeploymentAgnosticFtrProviderContext['getService'],
+  sampleDocs: Array<Instruction & { title: string }>,
+  customSearchConnectorIndex: string
+) {
+  const es = getService('es');
+  const supertest = getService('supertest');
+  const log = getService('log');
+
+  // create index with semantic_text mapping for `text` field
+  log.info('Creating custom index with sample animal docs...');
+  await es.indices.create({
+    index: customSearchConnectorIndex,
+    mappings: {
+      properties: {
+        title: { type: 'text' },
+        text: { type: 'semantic_text', inference_id: AI_ASSISTANT_KB_INFERENCE_ID },
+      },
+    },
+  });
+
+  log.info('Indexing sample animal docs...');
+  // ingest sampleDocs
+  await Promise.all(
+    sampleDocs.map(async (doc) => {
+      const { id, ...restDoc } = doc;
+      return es.index({
+        refresh: 'wait_for',
+        index: customSearchConnectorIndex,
+        id,
+        body: restDoc,
+      });
+    })
+  );
+
+  // update the advanced settings (`observability:aiAssistantSearchConnectorIndexPattern`) to include the custom index
+  await setAdvancedSettings(supertest, {
+    'observability:aiAssistantSearchConnectorIndexPattern': customSearchConnectorIndex,
+  });
+}
@@ -42,6 +42,11 @@ export interface RelevantField {
   name: string;
 }

+export interface KnowledgeBaseDocument {
+  id: string;
+  text: string;
+}
+
 export interface LlmResponseSimulator {
   requestBody: ChatCompletionStreamParams;
   status: (code: number) => void;
@@ -197,14 +202,10 @@ export class LlmProxy {
       when: (requestBody) => requestBody.tool_choice?.function?.name === 'select_relevant_fields',
       arguments: (requestBody) => {
         const messageWithFieldIds = last(requestBody.messages);
-        relevantFields = (messageWithFieldIds?.content as string)
-          .split('\n\n')
-          .slice(1)
-          .join('')
-          .trim()
-          .split('\n')
+        const matches = (messageWithFieldIds?.content as string).match(/\{[\s\S]*?\}/g)!;
+        relevantFields = matches
           .slice(from, to)
-          .map((line) => JSON.parse(line) as RelevantField);
+          .map((jsonStr) => JSON.parse(jsonStr) as RelevantField);

         return JSON.stringify({ fieldIds: relevantFields.map(({ id }) => id) });
       },
@@ -219,6 +220,30 @@ export class LlmProxy {
     };
   }

+  interceptScoreToolChoice(log: ToolingLog) {
+    let documents: KnowledgeBaseDocument[] = [];
+
+    const simulator = this.interceptWithFunctionRequest({
+      name: 'score',
+      // @ts-expect-error
+      when: (requestBody) => requestBody.tool_choice?.function?.name === 'score',
+      arguments: (requestBody) => {
+        documents = extractDocumentsFromMessage(last(requestBody.messages)?.content as string, log);
+        const scores = documents.map((doc: KnowledgeBaseDocument) => `${doc.id},7`).join(';');
+
+        return JSON.stringify({ scores });
+      },
+    });
+
+    return {
+      simulator,
+      getDocuments: async () => {
+        await simulator;
+        return documents;
+      },
+    };
+  }
+
   interceptTitle(title: string) {
     return this.interceptWithFunctionRequest({
       name: TITLE_CONVERSATION_FUNCTION_NAME,
@@ -355,3 +380,8 @@ async function getRequestBody(request: http.IncomingMessage): Promise<ChatComple
 function sseEvent(chunk: unknown) {
   return `data: ${JSON.stringify(chunk)}\n\n`;
 }
+
+function extractDocumentsFromMessage(content: string, log: ToolingLog): KnowledgeBaseDocument[] {
+  const matches = content.match(/\{[\s\S]*?\}/g)!;
+  return matches.map((jsonStr) => JSON.parse(jsonStr));
+}
@@ -11,7 +11,7 @@ import { ChatFeedback } from '@kbn/observability-ai-assistant-plugin/public/anal
 import { pick } from 'lodash';
 import { parse as parseCookie } from 'tough-cookie';
 import { kbnTestConfig } from '@kbn/test';
-import { systemMessageSorted } from '../../../api_integration/deployment_agnostic/apis/observability/ai_assistant/complete/functions/helpers';
+import { systemMessageSorted } from '../../../api_integration/deployment_agnostic/apis/observability/ai_assistant/utils/conversation';
 import {
   createLlmProxy,
   LlmProxy,