[9.0] [Obs AI Assistant] Improve system prompt and instructions to work better with Claude models (#221965) (#223009)

# Backport This will backport the following commits from `main` to `9.0`: - [[Obs AI Assistant] Improve system prompt and instructions to work better with Claude models (#221965)](https://github.com/elastic/kibana/pull/221965)  ### Questions ? Please refer to the [Backport tool documentation](https://github.com/sorenlouv/backport)
2025-06-27 18:51:07 -04:00 · 2025-06-09 09:44:30 -04:00 · 2025-06-09 09:44:30 -04:00 · 360283e6c7
commit 360283e6c7
parent 86bfdd964a
5 changed files with 44 additions and 28 deletions
--- a/x-pack/platform/plugins/shared/observability_ai_assistant/server/functions/index.ts
+++ b/x-pack/platform/plugins/shared/observability_ai_assistant/server/functions/index.ts
@ -106,13 +106,19 @@ ${
    if (isKnowledgeBaseReady) {
      if (availableFunctionNames.includes(SUMMARIZE_FUNCTION_NAME)) {
        instructions.push(`You can use the "${SUMMARIZE_FUNCTION_NAME}" function to store new information you have learned in a knowledge database.
-          Only use this function when the user asks to remember or store some information.
+          If the user asks to remember or store some information, always use this function.
          All summaries MUST be created in English, even if the conversation was carried out in a different language.`);
      }

      if (availableFunctionNames.includes(CONTEXT_FUNCTION_NAME)) {
        instructions.push(
-          `Additionally, you can use the "${CONTEXT_FUNCTION_NAME}" function to retrieve relevant information from the knowledge database.`
+          `You can use the "${CONTEXT_FUNCTION_NAME}" function to retrieve relevant information from the knowledge database. The response will include a "learnings" field containing information
+          from the knowledge base that is most relevant to the user's current query. You should incorporate these learnings into your responses when answering the user's questions.
+          The information in the "learnings" field contains up-to-date information that you should consider when formulating your responses. DO NOT add disclaimers about the currency or certainty of this information.
+          Present this information directly without qualifiers like "I don't have specific, up-to-date information" or "I can't be completely certain".
+          
+          Stick strictly to the information provided in the "learnings" field. DO NOT assume, infer, or add any details that are not explicitly stated in the response.
+          If the user asks for information that is not covered in the "learnings" field, acknowledge the gap and ask for clarification rather than making assumptions or offering suggestions that aren't based on the provided knowledge.`
        );
      }
    } else {
--- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/alerts/index.spec.ts
+++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/alerts/index.spec.ts
@ -12,13 +12,13 @@ import { RuleResponse } from '@kbn/alerting-plugin/common/routes/rule/response/t
 import moment from 'moment';
 import { apm, timerange } from '@kbn/apm-synthtrace-client';
 import { MessageRole } from '@kbn/observability-ai-assistant-plugin/common';
-import { chatClient, kibanaClient, synthtraceEsClients, logger } from '../../services';
+import { chatClient, kibanaClient, synthtraceEsClients, logger, esClient } from '../../services';
 import {
  apmTransactionRateAIAssistant,
  customThresholdAIAssistantLogCount,
 } from '../../alert_templates/templates';

-describe('Alert function', () => {
+describe('Alerts', () => {
  const ruleIds: any[] = [];

  before(async () => {
@ -114,7 +114,7 @@ describe('Alert function', () => {

  it('filtered alerts', async () => {
    let conversation = await chatClient.complete({
-      messages: 'Do I have any active threshold alerts related to the AI Assistant?',
+      messages: 'Do I have any active threshold alerts?',
    });

    conversation = await chatClient.complete({
@ -128,7 +128,7 @@ describe('Alert function', () => {
    const result = await chatClient.evaluate(conversation, [
      'Uses the get_alerts_dataset_info function',
      'Correctly uses the alerts function without a filter',
-      'Returns two alerts related to "Threshold surpassed in AI Assistant eval"',
+      'Returns two alerts related to threshold',
      'After the second question, uses alerts function to filtering on service.name my-service to retrieve active alerts for that service. The filter should be `service.name:"my-service"` or `service.name:my-service`.',
      'Summarizes the active alerts for the `my-service` service',
    ]);
@ -138,11 +138,16 @@ describe('Alert function', () => {

  after(async () => {
    await synthtraceEsClients.apmSynthtraceEsClient.clean();
-
+    await esClient.deleteByQuery({
+      index: '.alerts-observability-*',
+      query: {
+        match_all: {},
+      },
+      refresh: true,
+    });
    for (const ruleId of ruleIds) {
      await kibanaClient.callKibana('delete', { pathname: `/api/alerting/rule/${ruleId}` });
    }
-
    await kibanaClient.callKibana(
      'post',
      { pathname: `/api/content_management/rpc/delete` },
--- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/elasticsearch/index.spec.ts
+++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/elasticsearch/index.spec.ts
@ -11,7 +11,7 @@ import expect from '@kbn/expect';
 import { MessageRole } from '@kbn/observability-ai-assistant-plugin/common';
 import { chatClient, esClient } from '../../services';

-describe('Elasticsearch functions', () => {
+describe('Elasticsearch function', () => {
  describe('health', () => {
    it('returns the cluster health state', async () => {
      const conversation = await chatClient.complete({
--- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/kb/index.spec.ts
+++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/kb/index.spec.ts
@ -57,14 +57,19 @@ describe('Knowledge base', () => {
  describe('kb retrieval', () => {
    const testDocs = [
      {
-        id: 'doc_invention_1',
-        title: 'Quantum Revectorization Engine',
-        text: 'The Quantum Revectorization Engine (QRE), invented by Dr. Eliana Stone at Acme Labs in 2023, uses advanced quantum fields to reorder the subatomic structure of materials, effectively reconfiguring matter at a fundamental level. Its main achievement was to transform ordinary silicon wafers into superconductive materials without traditional cooling methods.',
+        id: 'acme_teams',
+        title: 'ACME DevOps Team Structure',
+        text: 'ACME maintains three primary DevOps teams: Platform Infrastructure (responsible for cloud infrastructure and Kubernetes clusters), Application Operations (responsible for application deployments and monitoring), and Security Operations (responsible for security monitoring and compliance). Each team maintains a separate on-call rotation accessible via PagerDuty. The current on-call schedule is available in the #oncall Slack channel or through the PagerDuty integration in Kibana.',
      },
      {
-        id: 'doc_invention_2',
-        title: 'Constraints of QRE',
-        text: 'Current constraints on the Quantum Revectorization Engine technology limit its revectorization radius to approximately 2 nanometers. Additionally, the energy required to maintain the quantum fields is extraordinarily high, necessitating specialized fusion reactors to sustain the process.',
+        id: 'acme_monitoring',
+        title: 'Alert Thresholds',
+        text: 'Standard alert thresholds for ACME services are: API response time > 500ms (warning) or > 1s (critical), error rate > 1% (warning) or > 5% (critical), CPU usage > 80% (warning) or > 90% (critical), memory usage > 85% (warning) or > 95% (critical). Custom thresholds for specific services are documented in the service runbooks stored in Confluence under the "Service Specifications" space.',
+      },
+      {
+        id: 'acme_infra',
+        title: 'Database Infrastructure',
+        text: 'Primary transactional data is stored in PostgreSQL clusters with read replicas in each region. Customer metadata is stored in MongoDB with M40 clusters in each region. Caching layer uses Redis Enterprise Cloud with 15GB instances. All database metrics are collected via Metricbeat with custom dashboards available under "ACME Databases" in Kibana. Database performance alerts are configured to notify the DBA team via the #db-alerts Slack channel.',
      },
    ];

@ -91,30 +96,30 @@ describe('Knowledge base', () => {
      );
    });

-    it('retrieves inventor and purpose of the QRE', async () => {
-      const prompt = 'Who invented the Quantum Revectorization Engine and what does it do?';
+    it('retrieves DevOps team structure and on-call information', async () => {
+      const prompt = 'What DevOps teams do we have and how is the on-call rotation managed?';
      const conversation = await chatClient.complete({ messages: prompt });

      const result = await chatClient.evaluate(conversation, [
-        'Uses KB retrieval function to find information about the Quantum Revectorization Engine',
-        'Correctly identifies Dr. Eliana Stone at Acme Labs in 2023 as the inventor',
-        'Accurately describes that it reorders the subatomic structure of materials and can transform silicon wafers into superconductive materials',
+        'Uses context function response to find information about ACME DevOps team structure',
+        "Correctly identifies all three teams: Platform Infrastructure, Application Operations, and Security Operations and describes each team's responsibilities",
+        'Mentions that on-call rotations are managed through PagerDuty and includes information about accessing the on-call schedule via Slack or Kibana',
        'Does not invent unrelated or hallucinated details not present in the KB',
      ]);

      expect(result.passed).to.be(true);
    });

-    it('retrieves constraints and energy requirements of the QRE', async () => {
+    it('retrieves monitoring thresholds and database infrastructure details', async () => {
      const prompt =
-        'What is the approximate revectorization radius of the QRE and what kind of reactor is required to power it?';
+        'What are our standard alert thresholds for services and what database technologies do we use?';
      const conversation = await chatClient.complete({ messages: prompt });

      const result = await chatClient.evaluate(conversation, [
-        'Uses KB retrieval function to find the correct document about QRE constraints',
-        'Mentions the 2 nanometer limit on the revectorization radius',
-        'Mentions that specialized fusion reactors are needed',
-        'Does not mention information unrelated to constraints or energy (i.e., does not mention the inventor or silicon wafer transformation from doc-invention-1)',
+        'Uses context function response to find the correct documents about alert thresholds and database infrastructure',
+        'Mentions the specific alert thresholds for API response time, error rate, CPU usage, and memory usage',
+        'Identifies the primary database technologies: PostgreSQL, MongoDB, and Redis and mentions that database metrics are collected via Metricbeat',
+        'Does not combine information incorrectly or hallucinate details not present in the KB',
      ]);

      expect(result.passed).to.be(true);
@ -126,7 +131,7 @@ describe('Knowledge base', () => {
        ignore_unavailable: true,
        query: {
          match: {
-            text: 'Quantum Revectorization Engine',
+            text: 'ACME',
          },
        },
        refresh: true,
--- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/server/functions/alerts.ts
+++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/server/functions/alerts.ts
@ -136,7 +136,7 @@ export function registerAlertsFunction({
    functions.registerFunction(
      {
        name: 'alerts',
-        description: `Get alerts for Observability.  Make sure ${GET_ALERTS_DATASET_INFO_NAME} was called before.
+        description: `Get alerts for Observability. Make sure ${GET_ALERTS_DATASET_INFO_NAME} was called before.
        Use this to get open (and optionally recovered) alerts for Observability assets, like services,
        hosts or containers.
        Display the response in tabular format if appropriate.