Mirror of [elastic/kibana](https://github.com/elastic/kibana.git), synced 2025-04-24 09:48:58 -04:00
# Backport

This will backport the following commits from `main` to `8.12`:

- [[Obs AI Assistant] Evaluation framework (#173010)](https://github.com/elastic/kibana/pull/173010)

### Questions?

Please refer to the [Backport tool documentation](https://github.com/sqren/backport) (Backport version: 8.9.7).

The source commit (`c36410f78ee0103cb9c6dd056b1921c1eeec1e58`, merged into `main` on 2023-12-13) adds an evaluation framework for the Observability AI Assistant. For more details, [see the README](https://github.com/dgieselaar/kibana/blob/obs-ai-assistant-evaluation-framework/x-pack/plugins/observability_ai_assistant/scripts/evaluation/README.md).

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Co-authored-by: Dario Gieselaar <dario.gieselaar@elastic.co>
This commit is contained in:

- parent `68a2aa2321`
- commit `97bd709078`

16 changed files with 1106 additions and 11 deletions
**`package.json`**

```diff
@@ -930,6 +930,7 @@
     "exponential-backoff": "^3.1.1",
     "extract-zip": "^2.0.1",
     "fast-deep-equal": "^3.1.1",
+    "fast-glob": "^3.3.2",
     "fflate": "^0.6.9",
     "file-saver": "^1.3.8",
     "fnv-plus": "^1.3.1",
@@ -1630,6 +1631,7 @@
     "supertest": "^6.3.3",
     "supports-color": "^7.0.0",
     "svgo": "^2.8.0",
+    "table": "^6.8.1",
     "tape": "^5.0.1",
     "tempy": "^0.3.0",
     "terser": "^5.16.5",
```
**`x-pack/plugins/observability_ai_assistant/common/utils/concatenate_openai_chunks.ts`**

```diff
@@ -7,10 +7,23 @@
 import { cloneDeep } from 'lodash';
 import { type Observable, scan } from 'rxjs';
-import { CreateChatCompletionResponseChunk, MessageRole } from '../types';
+import { type CreateChatCompletionResponseChunk, MessageRole } from '../types';

 export const concatenateOpenAiChunks =
-  () => (source: Observable<CreateChatCompletionResponseChunk>) =>
+  () =>
+  (
+    source: Observable<CreateChatCompletionResponseChunk>
+  ): Observable<{
+    message: {
+      content: string;
+      role: MessageRole;
+      function_call: {
+        name: string;
+        arguments: string;
+        trigger: MessageRole.Assistant | MessageRole.User;
+      };
+    };
+  }> =>
     source.pipe(
       scan(
         (acc, { choices }) => {
```
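For orientation, this operator folds a stream of OpenAI completion chunks into a single message object. A minimal usage sketch (illustrative, not part of the commit; `chunks$` is a hypothetical chunk stream, and the import paths assume the file location above):

```ts
import { lastValueFrom, type Observable } from 'rxjs';
import { type CreateChatCompletionResponseChunk } from '../types';
import { concatenateOpenAiChunks } from './concatenate_openai_chunks';

// Illustrative sketch: reduce an OpenAI chunk stream into one complete message.
async function collectMessage(chunks$: Observable<CreateChatCompletionResponseChunk>) {
  const { message } = await lastValueFrom(chunks$.pipe(concatenateOpenAiChunks()));
  return message; // { content, role, function_call: { name, arguments, trigger } }
}
```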
**`x-pack/plugins/observability_ai_assistant/scripts/evaluation/README.md`** (new file, +37)

# Observability AI Assistant Evaluation Framework

## Overview

This tool is developed for our team working on the Elastic Observability platform, specifically focusing on evaluating the Observability AI Assistant. It simplifies scripting and evaluating various scenarios with the Large Language Model (LLM) integration.

## Setup requirements

- An Elasticsearch instance
- A Kibana instance
- At least one `.gen-ai` connector set up

## Running evaluations

Run the tool using:

`$ node x-pack/plugins/observability_ai_assistant/scripts/evaluation/index.js`

This will evaluate all existing scenarios, and write the evaluation results to the terminal.
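Scenarios are modules whose exports are evaluation functions. A minimal sketch of a hypothetical scenario file (the file name and criteria here are illustrative, modeled on the `health` scenario added in this commit):

```ts
// scenarios/example/index.ts (hypothetical)
import { EvaluationFunction } from '../../types';

export const example: EvaluationFunction = async ({ chatClient }) => {
  // Run a conversation through the assistant...
  const conversation = await chatClient.complete(
    'What is the state of my Elasticsearch cluster?'
  );

  // ...then have the LLM score it against explicit criteria.
  return chatClient.evaluate(conversation, [
    'Calls the Elasticsearch function to retrieve cluster health',
  ]);
};
```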
### Configuration

#### Kibana and Elasticsearch

By default, the tool will look for a Kibana instance running locally (at `http://localhost:5601`, which is the default address for running Kibana in development mode). It will also attempt to read the Kibana config file for the Elasticsearch address and credentials. If you want to override these settings, use `--kibana` and `--es`. Only basic auth is supported, e.g. `--kibana http://username:password@localhost:5601`. If you want to use a specific space, use `--spaceId`.
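For example, to run against a non-local deployment in a specific space (hosts and credentials illustrative):

```
$ node x-pack/plugins/observability_ai_assistant/scripts/evaluation/index.js \
    --kibana http://elastic:changeme@my-kibana:5601 \
    --es http://elastic:changeme@my-es:9200 \
    --spaceId my-space
```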
#### Connector

Use `--connectorId` to specify a `.gen-ai` connector to use. If none is given, the tool will prompt you to select one of the available connectors. If only a single `.gen-ai` connector is found, it will be used without prompting.
#### Persisting conversations

By default, completed conversations are not persisted. If you do want to persist them, for instance for reviewing purposes, set the `--persist` flag. This will also generate a clickable link in the output of the evaluation that takes you to the conversation.

If you want to clear conversations on startup, use the `--clear` flag. This only works when `--persist` is enabled. If `--spaceId` is set, only conversations for the current space will be cleared.

When storing conversations, the name of the scenario is used as the title. Set the `--autoTitle` flag to have the LLM generate a title for you.
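A combined example (illustrative); note that the CLI rejects `--clear` and `--autoTitle` unless `--persist` is also set:

```
$ node x-pack/plugins/observability_ai_assistant/scripts/evaluation/index.js \
    --persist --clear --autoTitle
```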
**`x-pack/plugins/observability_ai_assistant/scripts/evaluation/cli.ts`** (new file, +78)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */
import { format, parse } from 'url';
import { Argv } from 'yargs';
import { readKibanaConfig } from './read_kibana_config';

export function options(y: Argv) {
  const config = readKibanaConfig();

  return y
    .option('files', {
      string: true as const,
      array: true,
      describe: 'A file or list of files containing the scenarios to evaluate. Defaults to all',
    })
    .option('grep', {
      string: true,
      array: false,
      describe: 'A string or regex to filter scenarios by',
    })
    .option('kibana', {
      describe: 'Where Kibana is running',
      string: true,
      default: process.env.KIBANA_HOST || 'http://localhost:5601',
    })
    .option('spaceId', {
      describe:
        'The space to use. If space is set, conversations will only be cleared for that spaceId',
      string: true,
      array: false,
    })
    .option('elasticsearch', {
      alias: 'es',
      describe: 'Where Elasticsearch is running',
      string: true,
      default: format({
        ...parse(config['elasticsearch.hosts']),
        auth: `${config['elasticsearch.username']}:${config['elasticsearch.password']}`,
      }),
    })
    .option('connectorId', {
      describe: 'The ID of the connector',
      string: true,
    })
    .option('persist', {
      describe:
        'Whether the conversations should be stored. Adding this will generate a link at which the conversation can be opened.',
      boolean: true,
      default: false,
    })
    .option('clear', {
      describe: 'Clear conversations on startup',
      boolean: true,
      default: false,
    })
    .option('autoTitle', {
      describe: 'Whether to generate titles for new conversations',
      boolean: true,
      default: false,
    })
    .option('logLevel', {
      describe: 'Log level',
      default: 'info',
    })
    .check((argv) => {
      if (!argv.persist && argv.clear) {
        throw new Error('clear cannot be true if persist is false');
      }
      if (!argv.persist && argv.autoTitle) {
        throw new Error('autoTitle cannot be true if persist is false');
      }
      return true;
    });
}
```
**`x-pack/plugins/observability_ai_assistant/scripts/evaluation/evaluation.ts`** (new file, +202)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import yargs from 'yargs';
import { run } from '@kbn/dev-cli-runner';
import { Client } from '@elastic/elasticsearch';
import inquirer from 'inquirer';
import * as fastGlob from 'fast-glob';
import Path from 'path';
import chalk from 'chalk';
import * as table from 'table';
import { castArray, omit, sortBy } from 'lodash';
import { TableUserConfig } from 'table';
import { format, parse } from 'url';
import { options } from './cli';
import { getServiceUrls } from './get_service_urls';
import { KibanaClient } from './kibana_client';
import { EvaluationFunction } from './types';
import { MessageRole } from '../../common';

function runEvaluations() {
  yargs(process.argv.slice(2))
    .command('*', 'Run AI Assistant evaluations', options, (argv) => {
      run(
        async ({ log }) => {
          const serviceUrls = await getServiceUrls({
            log,
            elasticsearch: argv.elasticsearch,
            kibana: argv.kibana,
          });

          const kibanaClient = new KibanaClient(serviceUrls.kibanaUrl, argv.spaceId);
          const esClient = new Client({
            node: serviceUrls.esUrl,
          });

          const connectors = await kibanaClient.getConnectors();

          if (!connectors.length) {
            throw new Error('No connectors found');
          }

          let connector = connectors.find((item) => item.id === argv.connectorId);

          if (!connector && argv.connectorId) {
            log.warning(`Could not find connector ${argv.connectorId}`);
          }

          if (!connector && connectors.length === 1) {
            connector = connectors[0];
            log.debug('Using the only connector found');
          } else {
            const connectorChoice = await inquirer.prompt({
              type: 'list',
              name: 'connector',
              message: 'Select a connector',
              choices: connectors.map((item) => item.name),
            });

            connector = connectors.find((item) => item.name === connectorChoice.connector)!;
          }

          log.info(`Using connector ${connector.id}`);

          const scenarios =
            (argv.files !== undefined &&
              castArray(argv.files).map((file) => Path.join(process.cwd(), file))) ||
            fastGlob.sync(Path.join(__dirname, './scenarios/**/*.ts'));

          if (!scenarios.length) {
            throw new Error('No scenarios to run');
          }

          if (argv.clear) {
            log.info('Clearing conversations');
            await esClient.deleteByQuery({
              index: '.kibana-observability-ai-assistant-conversations',
              query: {
                ...(argv.spaceId ? { term: { namespace: argv.spaceId } } : { match_all: {} }),
              },
              refresh: true,
            });
          }

          let evaluationFunctions: Array<{
            name: string;
            fileName: string;
            fn: EvaluationFunction;
          }> = [];

          for (const fileName of scenarios) {
            log.info(`Running scenario ${fileName}`);
            const mod = await import(fileName);
            Object.keys(mod).forEach((key) => {
              evaluationFunctions.push({ name: key, fileName, fn: mod[key] });
            });
          }

          if (argv.grep) {
            const lc = argv.grep.toLowerCase();
            evaluationFunctions = evaluationFunctions.filter((fn) =>
              fn.name.toLowerCase().includes(lc)
            );
          }

          const header: string[][] = [
            [chalk.bold('Criterion'), chalk.bold('Result'), chalk.bold('Reasoning')],
          ];

          const tableConfig: TableUserConfig = {
            singleLine: false,
            border: {
              topBody: `─`,
              topJoin: `┬`,
              topLeft: `┌`,
              topRight: `┐`,

              bottomBody: `─`,
              bottomJoin: `┴`,
              bottomLeft: `└`,
              bottomRight: `┘`,

              bodyLeft: `│`,
              bodyRight: `│`,
              bodyJoin: `│`,

              joinBody: `─`,
              joinLeft: `├`,
              joinRight: `┤`,
              joinJoin: `┼`,
            },
            spanningCells: [
              { row: 0, col: 0, colSpan: 3 },
              { row: 1, col: 0, colSpan: 3 },
            ],
            columns: [
              { wrapWord: true, width: 60 },
              { wrapWord: true },
              { wrapWord: true, width: 60 },
            ],
          };

          const sortedEvaluationFunctions = sortBy(evaluationFunctions, 'fileName', 'name');

          for (const { name, fn } of sortedEvaluationFunctions) {
            log.debug(`Executing ${name}`);
            const result = await fn({
              esClient,
              kibanaClient,
              chatClient: kibanaClient.createChatClient({
                connectorId: connector.id!,
                persist: argv.persist,
                title: argv.autoTitle ? undefined : name,
              }),
            });
            log.debug(`Result:`, JSON.stringify(result));
            const output: string[][] = [
              [
                result.messages.find((message) => message.role === MessageRole.User)!.content!,
                '',
                '',
              ],
              result.conversationId
                ? [
                    `${format(omit(parse(serviceUrls.kibanaUrl), 'auth'))}/${
                      argv.spaceId ? `s/${argv.spaceId}/` : ''
                    }app/observabilityAIAssistant/conversations/${result.conversationId}`,
                    '',
                    '',
                  ]
                : ['', '', ''],
              ...header,
            ];

            result.scores.forEach((score) => {
              output.push([
                score.criterion,
                score.score === 0 ? chalk.redBright('failed') : chalk.greenBright('passed'),
                score.reasoning,
              ]);
            });
            log.write(table.table(output, tableConfig));
          }
        },
        {
          log: {
            defaultLevel: argv.logLevel as any,
          },
          flags: {
            allowUnexpected: true,
          },
        }
      );
    })
    .parse();
}

runEvaluations();
```
**`x-pack/plugins/observability_ai_assistant/scripts/evaluation/get_service_urls.ts`** (new file, +152)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { ToolingLog } from '@kbn/tooling-log';
import { omit } from 'lodash';
import fetch from 'node-fetch';
import { format, parse, Url } from 'url';

async function discoverAuth(parsedTarget: Url, log: ToolingLog) {
  const possibleCredentials = [`admin:changeme`, `elastic:changeme`];
  for (const auth of possibleCredentials) {
    const url = format({
      ...parsedTarget,
      auth,
    });
    let status: number;
    try {
      log.debug(`Fetching ${url}`);
      const response = await fetch(url);
      status = response.status;
    } catch (err) {
      log.debug(`${url} resulted in ${err.message}`);
      status = 0;
    }

    if (status === 200) {
      return auth;
    }
  }

  throw new Error(`Failed to authenticate user for ${format(parsedTarget)}`);
}

async function getKibanaUrl({ kibana, log }: { kibana: string; log: ToolingLog }) {
  try {
    const isCI = process.env.CI?.toLowerCase() === 'true';

    const parsedKibanaUrl = parse(kibana);

    const kibanaUrlWithoutAuth = format(omit(parsedKibanaUrl, 'auth'));

    log.debug(`Checking Kibana URL ${kibanaUrlWithoutAuth} for a redirect`);

    const unredirectedResponse = await fetch(kibanaUrlWithoutAuth, {
      headers: {
        ...(parsedKibanaUrl.auth
          ? { Authorization: `Basic ${Buffer.from(parsedKibanaUrl.auth).toString('base64')}` }
          : {}),
      },
      method: 'HEAD',
      follow: 1,
      redirect: 'manual',
    });

    log.debug('Unredirected response', unredirectedResponse.headers.get('location'));

    const discoveredKibanaUrl =
      unredirectedResponse.headers
        .get('location')
        ?.replace('/spaces/enter', '')
        ?.replace('spaces/space_selector', '') || kibanaUrlWithoutAuth;

    log.debug(`Discovered Kibana URL at ${discoveredKibanaUrl}`);

    const parsedTarget = parse(kibana);

    const parsedDiscoveredUrl = parse(discoveredKibanaUrl);

    const discoveredKibanaUrlWithAuth = format({
      ...parsedDiscoveredUrl,
      auth: parsedTarget.auth,
    });

    const redirectedResponse = await fetch(discoveredKibanaUrlWithAuth, {
      method: 'HEAD',
    });

    if (redirectedResponse.status !== 200) {
      throw new Error(
        `Expected HTTP 200 from ${discoveredKibanaUrlWithAuth}, got ${redirectedResponse.status}`
      );
    }

    const discoveredKibanaUrlWithoutAuth = format({
      ...parsedDiscoveredUrl,
      auth: undefined,
    });

    log.info(
      `Discovered kibana running at: ${
        isCI ? discoveredKibanaUrlWithoutAuth : discoveredKibanaUrlWithAuth
      }`
    );

    return discoveredKibanaUrlWithAuth.replace(/\/$/, '');
  } catch (error) {
    throw new Error(`Could not connect to Kibana: ` + error.message);
  }
}

export async function getServiceUrls({
  log,
  elasticsearch,
  kibana,
}: {
  elasticsearch: string;
  kibana: string;
  log: ToolingLog;
}) {
  if (!elasticsearch) {
    // assume things are running locally
    kibana = kibana || 'http://127.0.0.1:5601';
    elasticsearch = 'http://127.0.0.1:9200';
  }

  if (!elasticsearch) {
    throw new Error('Could not determine an Elasticsearch target');
  }

  const parsedTarget = parse(elasticsearch);

  let auth = parsedTarget.auth;

  if (!parsedTarget.auth) {
    auth = await discoverAuth(parsedTarget, log);
  }

  const formattedEsUrl = format({
    ...parsedTarget,
    auth,
  });

  const suspectedKibanaUrl = kibana || elasticsearch.replace('.es', '.kb');

  const parsedKibanaUrl = parse(suspectedKibanaUrl);

  const kibanaUrlWithAuth = format({
    ...parsedKibanaUrl,
    auth,
  });

  const validatedKibanaUrl = await getKibanaUrl({ kibana: kibanaUrlWithAuth, log });

  return {
    kibanaUrl: validatedKibanaUrl,
    esUrl: formattedEsUrl,
  };
}
```
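A sketch of how the evaluation script consumes this helper (arguments illustrative; `log` is a `ToolingLog`):

```ts
// Illustrative: resolve validated, auth-enriched service URLs before creating clients.
const { kibanaUrl, esUrl } = await getServiceUrls({
  log,
  kibana: 'http://localhost:5601',
  elasticsearch: 'http://localhost:9200',
});
// Both URLs now carry basic-auth credentials, discovered via discoverAuth if needed.
```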
**`x-pack/plugins/observability_ai_assistant/scripts/evaluation/index.js`** (new file, +10)

```js
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

require('@kbn/babel-register').install();

require('./evaluation');
```
**`x-pack/plugins/observability_ai_assistant/scripts/evaluation/kibana_client.ts`** (new file, +327)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import axios, { AxiosInstance, AxiosResponse } from 'axios';
import { pick } from 'lodash';
import { filter, lastValueFrom, map, tap, toArray } from 'rxjs';
import { format, parse, UrlObject } from 'url';
import { Message, MessageRole } from '../../common';
import {
  ChatCompletionErrorCode,
  ConversationCompletionError,
  ConversationCreateEvent,
  MessageAddEvent,
  StreamingChatResponseEvent,
  StreamingChatResponseEventType,
} from '../../common/conversation_complete';
import { FunctionDefinition } from '../../common/types';
import { concatenateOpenAiChunks } from '../../common/utils/concatenate_openai_chunks';
import { processOpenAiStream } from '../../common/utils/process_openai_stream';
import { APIReturnType, ObservabilityAIAssistantAPIClientRequestParamsOf } from '../../public';
import { getAssistantSetupMessage } from '../../public/service/get_assistant_setup_message';
import { streamIntoObservable } from '../../server/service/util/stream_into_observable';
import { EvaluationResult } from './types';

type InnerMessage = Message['message'];
type StringOrMessageList = string | InnerMessage[];

interface ChatClient {
  chat: (message: StringOrMessageList) => Promise<InnerMessage>;
  complete: (
    ...args: [StringOrMessageList] | [string, InnerMessage[]]
  ) => Promise<{ conversationId?: string; messages: InnerMessage[] }>;

  evaluate: (
    {}: { conversationId?: string; messages: InnerMessage[] },
    criteria: string[]
  ) => Promise<EvaluationResult>;
}

export class KibanaClient {
  axios: AxiosInstance;
  constructor(private readonly url: string, private readonly spaceId?: string) {
    this.axios = axios.create({
      headers: {
        'kbn-xsrf': 'foo',
      },
    });
  }

  private getUrl(props: { query?: UrlObject['query']; pathname: string }) {
    const parsed = parse(this.url);

    const baseUrl = parsed.pathname?.replaceAll('/', '') ?? '';

    const url = format({
      ...parsed,
      pathname: `/${[
        baseUrl,
        ...(this.spaceId ? ['s', this.spaceId] : []),
        props.pathname.startsWith('/') ? props.pathname.substring(1) : props.pathname,
      ].join('/')}`,
      query: props.query,
    });

    return url;
  }

  createChatClient({
    connectorId,
    persist,
    title,
  }: {
    connectorId: string;
    persist: boolean;
    title?: string;
  }): ChatClient {
    function getMessages(message: string | Array<Message['message']>): Array<Message['message']> {
      if (typeof message === 'string') {
        return [
          {
            content: message,
            role: MessageRole.User,
          },
        ];
      }
      return message;
    }

    const that = this;

    async function getFunctions() {
      const {
        data: { functionDefinitions, contextDefinitions },
      }: AxiosResponse<APIReturnType<'GET /internal/observability_ai_assistant/functions'>> =
        await that.axios.get(
          that.getUrl({ pathname: '/internal/observability_ai_assistant/functions' })
        );

      return { functionDefinitions, contextDefinitions };
    }

    async function chat({
      messages,
      functions,
      functionCall,
    }: {
      messages: Message[];
      functions: FunctionDefinition[];
      functionCall?: string;
    }) {
      const params: ObservabilityAIAssistantAPIClientRequestParamsOf<'POST /internal/observability_ai_assistant/chat'>['params']['body'] =
        {
          messages,
          connectorId,
          functions: functions.map((fn) => pick(fn, 'name', 'description', 'parameters')),
          functionCall,
        };
      const stream$ = streamIntoObservable(
        (
          await that.axios.post(
            that.getUrl({
              pathname: '/internal/observability_ai_assistant/chat',
              query: { stream: true },
            }),
            params,
            { responseType: 'stream' }
          )
        ).data
      ).pipe(processOpenAiStream(), concatenateOpenAiChunks());

      const receivedMessage = await lastValueFrom(stream$);

      return receivedMessage.message;
    }

    return {
      chat: async (message) => {
        const { functionDefinitions, contextDefinitions } = await getFunctions();
        const messages = [
          getAssistantSetupMessage({ contexts: contextDefinitions }),
          ...getMessages(message).map((msg) => ({
            message: msg,
            '@timestamp': new Date().toISOString(),
          })),
        ];
        return chat({ messages, functions: functionDefinitions });
      },
      complete: async (...args) => {
        const messagesArg = args.length === 1 ? args[0] : args[1];
        const conversationId = args.length === 1 ? undefined : args[0];
        const { contextDefinitions } = await getFunctions();
        const messages = [
          getAssistantSetupMessage({ contexts: contextDefinitions }),
          ...getMessages(messagesArg).map((msg) => ({
            message: msg,
            '@timestamp': new Date().toISOString(),
          })),
        ];

        const stream$ = streamIntoObservable(
          (
            await that.axios.post(
              that.getUrl({
                pathname: '/internal/observability_ai_assistant/chat/complete',
              }),
              {
                conversationId,
                messages,
                connectorId,
                persist,
                title,
              },
              { responseType: 'stream' }
            )
          ).data
        ).pipe(
          map((line) => JSON.parse(line) as StreamingChatResponseEvent),
          tap((event) => {
            if (event.type === StreamingChatResponseEventType.ConversationCompletionError) {
              throw new ConversationCompletionError(
                event.error.code ?? ChatCompletionErrorCode.InternalError,
                event.error.message
              );
            }
          }),
          filter(
            (event): event is MessageAddEvent | ConversationCreateEvent =>
              event.type === StreamingChatResponseEventType.MessageAdd ||
              event.type === StreamingChatResponseEventType.ConversationCreate
          ),
          toArray()
        );

        const events = await lastValueFrom(stream$);

        return {
          messages: messages
            .map((msg) => msg.message)
            .concat(
              events
                .filter(
                  (event): event is MessageAddEvent =>
                    event.type === StreamingChatResponseEventType.MessageAdd
                )
                .map((event) => event.message.message)
            ),
          conversationId:
            conversationId ||
            events.find(
              (event): event is ConversationCreateEvent =>
                event.type === StreamingChatResponseEventType.ConversationCreate
            )?.conversation.id,
        };
      },
      evaluate: async ({ messages, conversationId }, criteria) => {
        const message = await chat({
          messages: [
            {
              '@timestamp': new Date().toISOString(),
              message: {
                role: MessageRole.System,
                content: `You are a critical assistant for evaluating conversations with the Elastic Observability AI Assistant,
which helps our users make sense of their Observability data.

Your goal is to verify whether a conversation between the user and the assistant matches the given criteria.

For each criterion, calculate a score. Explain your score, by describing what the assistant did right, and what the
assistant did wrong, where it could improve, and what the root cause was in case of a failure.`,
              },
            },
            {
              '@timestamp': new Date().toString(),
              message: {
                role: MessageRole.User,
                content: `Evaluate the conversation according to the following criteria:

${criteria.map((criterion, index) => {
  return `${index}: ${criterion}`;
})}

This is the conversation:

${JSON.stringify(messages)}`,
              },
            },
          ],
          functions: [
            {
              name: 'scores',
              parameters: {
                type: 'object',
                properties: {
                  criteria: {
                    type: 'array',
                    items: {
                      type: 'object',
                      properties: {
                        index: {
                          type: 'number',
                          description: 'The number of the criterion',
                        },
                        score: {
                          type: 'number',
                          description:
                            'A score of either 0 (criterion failed) or 1 (criterion succeeded)',
                        },
                        reasoning: {
                          type: 'string',
                          description:
                            'Your reasoning for the score. Explain your score by mentioning what you expected to happen and what did happen.',
                        },
                      },
                      required: ['index', 'score', 'reasoning'],
                    },
                  },
                },
                required: ['criteria'],
              },
              contexts: [],
              description: 'Call this function to return scores for the criteria',
            },
          ],
          functionCall: 'scores',
        });

        return {
          conversationId,
          messages,
          scores: (
            JSON.parse(message.function_call.arguments) as {
              criteria: Array<{ index: number; score: number; reasoning: string }>;
            }
          ).criteria.map(({ index, score, reasoning }) => {
            return {
              criterion: criteria[index],
              score,
              reasoning,
            };
          }),
        };
      },
    };
  }

  async getConnectors() {
    const connectors: AxiosResponse<
      Array<{
        id: string;
        connector_type_id: string;
        name: string;
        is_preconfigured: boolean;
        is_deprecated: boolean;
        referenced_by_count: number;
      }>
    > = await axios.get(
      this.getUrl({
        pathname: '/api/actions/connectors',
      })
    );

    return connectors.data.filter((connector) => connector.connector_type_id === '.gen-ai');
  }
}
```
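For reference, `evaluate` forces the model to respond via the `scores` function and parses `message.function_call.arguments`; a payload matching the schema above would look like this (values illustrative):

```ts
// Illustrative `scores` function-call arguments, as parsed by `evaluate`.
const exampleArguments = {
  criteria: [
    { index: 0, score: 1, reasoning: 'The assistant called the Elasticsearch function as expected.' },
    { index: 1, score: 0, reasoning: 'The final answer did not describe the cluster status.' },
  ],
};
```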
**`x-pack/plugins/observability_ai_assistant/scripts/evaluation/read_kibana_config.ts`** (new file, +44)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import path from 'path';
import fs from 'fs';
import yaml from 'js-yaml';
import { identity, pickBy } from 'lodash';

export type KibanaConfig = ReturnType<typeof readKibanaConfig>;

export const readKibanaConfig = () => {
  const kibanaConfigDir = path.join(__filename, '../../../../../../config');
  const kibanaDevConfig = path.join(kibanaConfigDir, 'kibana.dev.yml');
  const kibanaConfig = path.join(kibanaConfigDir, 'kibana.yml');

  const loadedKibanaConfig = (yaml.safeLoad(
    fs.readFileSync(fs.existsSync(kibanaDevConfig) ? kibanaDevConfig : kibanaConfig, 'utf8')
  ) || {}) as {};

  const cliEsCredentials = pickBy(
    {
      'elasticsearch.username': process.env.ELASTICSEARCH_USERNAME,
      'elasticsearch.password': process.env.ELASTICSEARCH_PASSWORD,
      'elasticsearch.hosts': process.env.ELASTICSEARCH_HOST,
    },
    identity
  ) as {
    'elasticsearch.username'?: string;
    'elasticsearch.password'?: string;
    'elasticsearch.hosts'?: string;
  };

  return {
    'elasticsearch.hosts': 'http://localhost:9200',
    'elasticsearch.username': 'elastic',
    'elasticsearch.password': 'changeme',
    ...loadedKibanaConfig,
    ...cliEsCredentials,
  };
};
```
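Because `cliEsCredentials` is spread last, environment variables take precedence over both the hardcoded defaults and the YAML config; for example (values illustrative):

```
$ ELASTICSEARCH_HOST=http://localhost:9200 \
  ELASTICSEARCH_USERNAME=elastic \
  ELASTICSEARCH_PASSWORD=changeme \
  node x-pack/plugins/observability_ai_assistant/scripts/evaluation/index.js
```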
**New scenario: `health`** (under `scripts/evaluation/scenarios/`, +21)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { EvaluationFunction } from '../../types';

export const health: EvaluationFunction = async ({ chatClient }) => {
  const conversation = await chatClient.complete(
    'Can you tell me what the state of my Elasticsearch cluster is?'
  );

  const evaluation = await chatClient.evaluate(conversation, [
    'Calls the Elasticsearch function with method: GET and path: _cluster/health',
    'Describes the cluster status based on the response from the Elasticsearch function',
  ]);

  return evaluation;
};
```
**New scenarios: ES|QL query generation** (under `scripts/evaluation/scenarios/`, +173)

````ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { last } from 'lodash';
import { MessageRole } from '../../../../common';
import { EvaluationFunction } from '../../types';

function extractEsqlQuery(response: string) {
  return response.match(/```esql([\s\S]*?)```/)?.[1];
}

function createEsqlQueryEvaluation({
  question,
  expected,
  criteria = [],
  execute = true,
}: {
  question: string;
  expected?: string;
  criteria?: string[];
  execute?: boolean;
}): EvaluationFunction {
  return async ({ chatClient }) => {
    let conversation = await chatClient.complete(question);

    const esqlQuery = extractEsqlQuery(last(conversation.messages)?.content || '');

    if (esqlQuery && execute) {
      conversation = await chatClient.complete(
        conversation.conversationId!,
        conversation.messages.concat({
          content: '',
          role: MessageRole.Assistant,
          function_call: {
            name: 'execute_query',
            arguments: JSON.stringify({
              query: esqlQuery,
            }),
            trigger: MessageRole.User,
          },
        })
      );
    }

    const evaluation = await chatClient.evaluate(conversation, [
      ...(expected
        ? [
            `Returns a ES|QL query that is functionally equivalent to:
      ${expected}`,
          ]
        : []),
      ...(execute && expected ? [`The query successfully executed without an error`] : []),
      ...criteria,
    ]);

    return evaluation;
  };
}

export const metricsApmQuery = createEsqlQueryEvaluation({
  question:
    'I want to see a query for metrics-apm*, filtering on metricset.name:transaction and metricset.interval:1m, showing the average duration (via transaction.duration.histogram), in 50 buckets.',
  expected: `FROM metrics-apm*
  | WHERE metricset.name == "transaction" AND metricset.interval == "1m"
  | EVAL bucket = AUTO_BUCKET(@timestamp, 50, <start-date>, <end-date>)
  | STATS avg_duration = AVG(transaction.duration.histogram) BY bucket`,
});

export const packetbeatUniqueDomainsQuery = createEsqlQueryEvaluation({
  question:
    'For standard Elastic ECS compliant packetbeat data view, create an ES|QL query that shows the top 10 unique domains by doc count',
  expected: `FROM packetbeat-*
  | STATS doc_count = COUNT(destination.domain) BY destination.domain
  | SORT doc_count DESC
  | LIMIT 10`,
});

export const fiveEarliestEmployeesQuery = createEsqlQueryEvaluation({
  question:
    'From employees, I want to see the 5 earliest employees (hire_date), I want to display only the month and the year that they were hired in and their employee number (emp_no). Format the date as e.g. "September 2019".',
  expected: `FROM employees
  | EVAL hire_date_formatted = DATE_FORMAT(hire_date, "MMMM yyyy")
  | SORT hire_date
  | KEEP emp_no, hire_date_formatted
  | LIMIT 5`,
  execute: false,
});

export const employeesWithPaginationQuery = createEsqlQueryEvaluation({
  question:
    'From employees, I want to sort the documents by salary, and then return 10 results per page, and then see the second page',
  criteria: ['The assistant should mention that pagination is currently not supported in ES|QL'],
});

export const logsAvgCpuQuery = createEsqlQueryEvaluation({
  question:
    'My logs data (ECS) is in `logs-*`. Show me a query that gets the average CPU per host, limit it to the top 10 results, in 1m buckets, and only include the last 15m. ',
  expected: `FROM logs-*
  | WHERE @timestamp >= NOW() - 15 minutes
  | EVAL bucket = DATE_TRUNC(1 minute, @timestamp)
  | STATS avg_cpu = AVG(system.cpu.total.norm.pct) BY bucket, host.name
  | LIMIT 10`,
});

export const apmServiceInventoryQuery = createEsqlQueryEvaluation({
  question:
    'I want to show a list of services with APM data. My data is in `traces-apm*`. I want to show the average transaction duration, the success rate (by dividing event.outcome:failure by event.outcome:failure+success), and total amount of requests. As a time range, select the last 24 hours. Use ES|QL.',
  expected: `FROM traces-apm*
  | WHERE @timestamp >= NOW() - 24 hours
  | EVAL successful = CASE(event.outcome == "success", 1, 0),
    failed = CASE(event.outcome == "failure", 1, 0)
  | STATS success_rate = AVG(successful),
    avg_duration = AVG(transaction.duration),
    total_requests = COUNT(transaction.id) BY service.name`,
});

export const metricbeatCpuQuery = createEsqlQueryEvaluation({
  question: `from \`metricbeat*\`, using ES|QL, I want to see the percentage of CPU time normalized by the number of CPU cores, broken down by hostname. the fields are system.cpu.user.pct, system.cpu.system.pct, and system.cpu.cores`,
  expected: `FROM metricbeat*
  | EVAL cpu_pct_normalized = (system.cpu.user.pct + system.cpu.system.pct) / system.cpu.cores
  | STATS AVG(cpu_pct_normalized) BY host.name`,
});

export const postgresDurationQuery = createEsqlQueryEvaluation({
  question:
    'extract the query duration from postgres log messages in postgres-logs*, using ECS fields, and calculate the avg',
  expected: `FROM postgres-logs
  | DISSECT message "%{} duration: %{query_duration} ms"
  | EVAL query_duration_num = TO_DOUBLE(query_duration)
  | STATS avg_duration = AVG(query_duration_num)`,
});

export const apmExitSpanQuery = createEsqlQueryEvaluation({
  question: `I've got APM data in \`metrics-apm\`. Filter on \`metricset.name:service_destination\` and the last 24 hours. Break down by span.destination.service.resource. Each document contains the count of total events (span.destination.service.response_time.count) for that document's interval and the total amount of latency (span.destination.service.response_time.sum.us). A document either contains an aggregate of failed events (event.outcome:success) or failed events (event.outcome:failure). A single document might represent multiple failures or successes, depending on the value of span.destination.service.response_time.count. For each value of span.destination.service.resource, give me the average throughput, latency per request, and failure rate, as a value between 0 and 1. Just show me the query.`,
  expected: `FROM metrics-apm
  | WHERE metricset.name == "service_destination" AND @timestamp >= NOW() - 24 hours
  | EVAL total_response_time = span.destination.service.response_time.sum.us / span.destination.service.response_time.count, total_failures = CASE(event.outcome == "failure", 1, 0) * span.destination.service.response_time.count
  | STATS
    avg_throughput = AVG(span.destination.service.response_time.count),
    avg_latency = AVG(total_response_time),
    failure_rate = AVG(total_failures)
    BY span.destination.service.resource`,
});

export const highCardinalityLogsErrorQuery = createEsqlQueryEvaluation({
  question: `i have logs in high-cardinality-data-fake_stack.admin-console-* . errors are found when log.level contais the value ERROR. generate a query to obtain the error rate as a percetage of the total logs per day for the last 7 days`,
  expected: `FROM high-cardinality-data-fake_stack.admin-console-*
  | WHERE @timestamp >= NOW() - 7 days
  | EVAL error = CASE(log.level == "ERROR", 1, 0), total = 1
  | EVAL bucket = DATE_TRUNC(1 day, @timestamp)
  | STATS total_errors = SUM(error), total_logs = SUM(total) BY bucket
  | EVAL error_rate = (total_errors / total_logs) * 100`,
});

export const nycTaxisDropoffTimeQuery = createEsqlQueryEvaluation({
  question:
    'From `nyc_taxis`, give me a query that shows the top 10 results where the drop off time was between 6am and 10am.',
  expected: `FROM nyc_taxis
  | WHERE DATE_EXTRACT(drop_off_time, "hour") >= 6 AND DATE_EXTRACT(drop_off_time, "hour") < 10
  | LIMIT 10`,
});

export const apmTraceDurationQuery = createEsqlQueryEvaluation({
  question:
    'My APM data is in `traces-apm*`. What’s the average for `transaction.duration.us` per service over the last hour?',
  expected: `FROM traces-apm*
  | WHERE @timestamp > NOW() - 1 hour
  | STATS AVG(transaction.duration.us) BY service.name`,
});
````
**`x-pack/plugins/observability_ai_assistant/scripts/evaluation/types.ts`** (new file, +28)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import type { Client } from '@elastic/elasticsearch';
import { Message } from '../../common';
import { KibanaClient } from './kibana_client';

export interface ScenarioOptions {
  esClient: Client;
  kibanaClient: KibanaClient;
  chatClient: ReturnType<KibanaClient['createChatClient']>;
}

export interface EvaluationResult {
  conversationId?: string;
  messages: Array<Message['message']>;
  scores: Array<{
    criterion: string;
    reasoning: string;
    score: number;
  }>;
}

export type EvaluationFunction = (options: ScenarioOptions) => Promise<EvaluationResult>;
```
**Observability AI Assistant service tests**

```diff
@@ -90,6 +90,7 @@ describe('Observability AI Assistant service', () => {
   const loggerMock: DeeplyMockedKeys<Logger> = {
     log: jest.fn(),
     error: jest.fn(),
+    debug: jest.fn(),
   } as any;

   const functionClientMock: DeeplyMockedKeys<ChatFunctionClient> = {
```
**`ObservabilityAIAssistantClient`**

```diff
@@ -406,6 +406,9 @@ export class ObservabilityAIAssistantClient {
       function_call: functionCall ? { name: functionCall } : undefined,
     };

+    this.dependencies.logger.debug(`Sending conversation to connector`);
+    this.dependencies.logger.debug(JSON.stringify(request, null, 2));
+
     const executeResult = await this.dependencies.actionsClient.execute({
       actionId: connectorId,
       params: {
```
**`x-pack/plugins/observability_ai_assistant/tsconfig.json`**

```diff
@@ -7,6 +7,7 @@
     "../../../typings/**/*",
     "common/**/*",
     "public/**/*",
+    "scripts/**/*",
     "typings/**/*",
     "public/**/*.json",
     "server/**/*"
@@ -48,7 +49,10 @@
     "@kbn/licensing-plugin",
     "@kbn/share-plugin",
     "@kbn/utility-types-jest",
-    "@kbn/analytics-client"
+    "@kbn/analytics-client",
+    "@kbn/tooling-log",
+    "@kbn/babel-register",
+    "@kbn/dev-cli-runner"
   ],
   "exclude": ["target/**/*"]
 }
```
**`yarn.lock`** (16 changed lines)

```diff
@@ -16615,10 +16615,10 @@ fast-glob@^2.2.6:
     merge2 "^1.2.3"
     micromatch "^3.1.10"

-fast-glob@^3.0.3, fast-glob@^3.1.1, fast-glob@^3.2.11, fast-glob@^3.2.2, fast-glob@^3.2.7, fast-glob@^3.2.9:
-  version "3.2.12"
-  resolved "https://registry.yarnpkg.com/fast-glob/-/fast-glob-3.2.12.tgz#7f39ec99c2e6ab030337142da9e0c18f37afae80"
-  integrity sha512-DVj4CQIYYow0BlaelwK1pHl5n5cRSJfM60UA0zK891sVInoPri2Ekj7+e1CT3/3qxXenpI+nBBmQAcJPJgaj4w==
+fast-glob@^3.0.3, fast-glob@^3.1.1, fast-glob@^3.2.11, fast-glob@^3.2.2, fast-glob@^3.2.7, fast-glob@^3.2.9, fast-glob@^3.3.2:
+  version "3.3.2"
+  resolved "https://registry.yarnpkg.com/fast-glob/-/fast-glob-3.3.2.tgz#a904501e57cfdd2ffcded45e99a54fef55e46129"
+  integrity sha512-oX2ruAFQwf/Orj8m737Y5adxDQO0LAB7/S5MnxCdTNDd4p6BsyIVsv9JQsATbTSq8KHRpLwIHbVlUNatxd+1Ow==
   dependencies:
     "@nodelib/fs.stat" "^2.0.2"
     "@nodelib/fs.walk" "^1.2.3"
@@ -28612,10 +28612,10 @@ tabbable@^5.3.3:
   resolved "https://registry.yarnpkg.com/tabbable/-/tabbable-5.3.3.tgz#aac0ff88c73b22d6c3c5a50b1586310006b47fbf"
   integrity sha512-QD9qKY3StfbZqWOPLp0++pOrAVb/HbUi5xCc8cUo4XjP19808oaMiDzn0leBY5mCespIBM0CIZePzZjgzR83kA==

-table@^6.8.0:
-  version "6.8.0"
-  resolved "https://registry.yarnpkg.com/table/-/table-6.8.0.tgz#87e28f14fa4321c3377ba286f07b79b281a3b3ca"
-  integrity sha512-s/fitrbVeEyHKFa7mFdkuQMWlH1Wgw/yEXMt5xACT4ZpzWFluehAxRtUUQKPuWhaLAWhFcVx6w3oC8VKaUfPGA==
+table@^6.8.0, table@^6.8.1:
+  version "6.8.1"
+  resolved "https://registry.yarnpkg.com/table/-/table-6.8.1.tgz#ea2b71359fe03b017a5fbc296204471158080bdf"
+  integrity sha512-Y4X9zqrCftUhMeH2EptSSERdVKt/nEdijTOacGD/97EKjhQ/Qs8RTlEGABSJNNN8lac9kheH+af7yAkEWlgneA==
   dependencies:
     ajv "^8.0.1"
     lodash.truncate "^4.4.2"
```