[Obs AI Assistant] Improve error handling in the evaluation framework (#212991)

Closes https://github.com/elastic/obs-ai-assistant-team/issues/196 ## Summary This PR implements the following: - Slightly increase the backoff delay for `429` errors - Improve `convertMessagesForInference` to surface errors related to function calls - Improve the KB retrieval scenario criteria ### Checklist - [x] The PR description includes the appropriate Release Notes section, and the correct `release_note:*` label is applied per the [guidelines](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)
2025-04-23 17:28:26 -04:00 · 2025-03-06 19:59:44 -05:00 · 2025-03-06 19:59:44 -05:00 · 6bb27b0400
commit 6bb27b0400
parent 0210468548
5 changed files with 31 additions and 11 deletions
--- a/x-pack/platform/plugins/shared/observability_ai_assistant/common/convert_messages_for_inference.ts
+++ b/x-pack/platform/plugins/shared/observability_ai_assistant/common/convert_messages_for_inference.ts
@ -11,9 +11,28 @@ import {
  MessageRole as InferenceMessageRole,
 } from '@kbn/inference-common';
 import { generateFakeToolCallId } from '@kbn/inference-plugin/common';
+import type { Logger } from '@kbn/logging';
 import { Message, MessageRole } from '.';

-export function convertMessagesForInference(messages: Message[]): InferenceMessage[] {
+function safeJsonParse(jsonString: string | undefined, logger: Pick<Logger, 'error'>) {
+  try {
+    return JSON.parse(jsonString ?? '{}');
+  } catch (error) {
+    logger.error(
+      `Failed to parse function call arguments when converting messages for inference: ${error}`
+    );
+    // if the LLM returns invalid JSON, it is likely because it is hallucinating
+    // the function. We don't want to propagate the error about invalid JSON here.
+    // Any errors related to the function call will be caught when the function and
+    // its arguments are validated
+    return {};
+  }
+}
+
+export function convertMessagesForInference(
+  messages: Message[],
+  logger: Pick<Logger, 'error'>
+): InferenceMessage[] {
  const inferenceMessages: InferenceMessage[] = [];

  messages.forEach((message) => {
@ -27,7 +46,7 @@ export function convertMessagesForInference(messages: Message[]): InferenceMessa
                {
                  function: {
                    name: message.message.function_call.name,
-                    arguments: JSON.parse(message.message.function_call.arguments || '{}'),
+                    arguments: safeJsonParse(message.message.function_call.arguments, logger),
                  },
                  toolCallId: generateFakeToolCallId(),
                },
--- a/x-pack/platform/plugins/shared/observability_ai_assistant/server/service/client/index.ts
+++ b/x-pack/platform/plugins/shared/observability_ai_assistant/server/service/client/index.ts
@ -486,15 +486,14 @@ export class ObservabilityAIAssistantClient {
    const options = {
      connectorId,
      system: systemMessage,
-      messages: convertMessagesForInference(messages),
+      messages: convertMessagesForInference(messages, this.dependencies.logger),
      toolChoice,
      tools,
      functionCalling: (simulateFunctionCalling ? 'simulated' : 'auto') as FunctionCallingMode,
    };

    this.dependencies.logger.debug(
-      () =>
-        `Calling inference client with for name: "${name}" with options: ${JSON.stringify(options)}`
+      () => `Calling inference client for name: "${name}" with options: ${JSON.stringify(options)}`
    );

    if (stream) {
--- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/kibana_client.ts
+++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/kibana_client.ts
@ -328,10 +328,10 @@ export class KibanaClient {
              }

              if (error.message.includes('Status code: 429')) {
-                that.log.info(`429, backing off 20s`);
-
-                return timer(20000);
+                that.log.info(`429, backing off 30s`);
+                return timer(30000);
              }
+
              that.log.info(`Retrying in 5s`);
              return timer(5000);
            },
--- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/kb/index.spec.ts
+++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/kb/index.spec.ts
@ -96,7 +96,7 @@ describe('Knowledge base', () => {
      const conversation = await chatClient.complete({ messages: prompt });

      const result = await chatClient.evaluate(conversation, [
-        'Uses KB retrieval function to find information about the Quantum Revectorization Engine',
+        'Uses context function response to find information about the Quantum Revectorization Engine',
        'Correctly identifies Dr. Eliana Stone at Acme Labs in 2023 as the inventor',
        'Accurately describes that it reorders the subatomic structure of materials and can transform silicon wafers into superconductive materials',
        'Does not invent unrelated or hallucinated details not present in the KB',
@ -111,7 +111,7 @@ describe('Knowledge base', () => {
      const conversation = await chatClient.complete({ messages: prompt });

      const result = await chatClient.evaluate(conversation, [
-        'Uses KB retrieval function to find the correct document about QRE constraints',
+        'Uses context function response to find the correct document about QRE constraints',
        'Mentions the 2 nanometer limit on the revectorization radius',
        'Mentions that specialized fusion reactors are needed',
        'Does not mention information unrelated to constraints or energy (i.e., does not mention the inventor or silicon wafer transformation from doc-invention-1)',
--- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/server/functions/query/index.ts
+++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/server/functions/query/index.ts
@ -103,6 +103,7 @@ export function registerQueryFunction({
      };
    }
  );
+
  functions.registerFunction(
    {
      name: QUERY_FUNCTION_NAME,
@ -129,7 +130,8 @@ export function registerQueryFunction({
        connectorId,
        messages: convertMessagesForInference(
          // remove system message and query function request
-          messages.filter((message) => message.message.role !== MessageRole.System).slice(0, -1)
+          messages.filter((message) => message.message.role !== MessageRole.System).slice(0, -1),
+          resources.logger
        ),
        logger: resources.logger,
        tools: Object.fromEntries(