[Obs AI Assistant] Rename score properties to esScore and llmScore for more clarity (#215699)

Closes https://github.com/elastic/kibana/issues/214801

## Summary

In the recall-and-score function and the score-suggestions function, we refer
to both the Elasticsearch score and the LLM score as `score`. This is
confusing and makes the two scores difficult to differentiate.

This PR renames the `score` property based on the context of the score
to either `esScore` or `llmScore`.


### Checklist

- [x] The PR description includes the appropriate Release Notes section,
and the correct `release_note:*` label is applied per the
[guidelines](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
Viduni Wickramarachchi 2025-03-27 08:01:01 +05:30 committed by GitHub
parent 3bc1465365
commit 34cf355080
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 79 additions and 75 deletions

View file

@ -8,7 +8,7 @@
import { RootSchema, EventTypeOpts } from '@kbn/core/server';
interface ScoredDocument {
elserScore: number;
esScore: number;
llmScore: number;
}
@ -21,10 +21,10 @@ const schema: RootSchema<RecallRanking> = {
type: 'array',
items: {
properties: {
elserScore: {
esScore: {
type: 'float',
_meta: {
description: 'The score produced by ELSER text_expansion',
description: 'The score produced by Elasticsearch',
},
},
llmScore: {

View file

@ -65,7 +65,7 @@ export function registerContextFunction({
const userPrompt = userMessage?.message.content!;
const userMessageFunctionName = userMessage?.message.name;
const { scores, relevantDocuments, suggestions } = await recallAndScore({
const { llmScores, relevantDocuments, suggestions } = await recallAndScore({
recall: client.recall,
chat,
logger: resources.logger,
@ -80,7 +80,7 @@ export function registerContextFunction({
return {
content: { ...content, learnings: relevantDocuments as unknown as Serializable },
data: {
scores,
llmScores,
suggestions,
},
};

View file

@ -217,12 +217,12 @@ const chatRecallRoute = createObservabilityAIAssistantServerRoute({
signal,
})
).pipe(
map(({ scores, suggestions, relevantDocuments }) => {
map(({ llmScores, suggestions, relevantDocuments }) => {
return createFunctionResponseMessage({
name: 'context',
data: {
suggestions,
scores,
llmScores,
},
content: {
relevantDocuments,

View file

@ -48,7 +48,7 @@ export interface RecalledEntry {
id: string;
title?: string;
text: string;
score: number | null;
esScore: number | null;
is_correction?: boolean;
labels?: Record<string, string>;
}
@ -129,7 +129,7 @@ export class KnowledgeBaseService {
is_correction: hit._source?.is_correction,
labels: hit._source?.labels,
title: hit._source?.title ?? hit._source?.doc_id, // use `doc_id` as fallback title for backwards compatibility
score: hit._score!,
esScore: hit._score!,
id: hit._id!,
}));
}
@ -193,7 +193,7 @@ export class KnowledgeBaseService {
const sortedEntries = orderBy(
documentsFromKb.concat(documentsFromConnectors),
'score',
'esScore',
'desc'
).slice(0, limit.size ?? 20);

View file

@ -51,7 +51,11 @@ export async function recallFromSearchConnectors({
}),
]);
return orderBy([...semanticTextConnectors, ...legacyConnectors], (entry) => entry.score, 'desc');
return orderBy(
[...semanticTextConnectors, ...legacyConnectors],
(entry) => entry.esScore,
'desc'
);
}
async function recallFromSemanticTextConnectors({
@ -108,7 +112,7 @@ async function recallFromSemanticTextConnectors({
const results = response.hits.hits.map((hit) => ({
text: JSON.stringify(hit._source),
score: hit._score!,
esScore: hit._score!,
is_correction: false,
id: hit._id!,
}));
@ -194,7 +198,7 @@ async function recallFromLegacyConnectors({
const results = response.hits.hits.map((hit) => ({
text: JSON.stringify(hit._source),
score: hit._score!,
esScore: hit._score!,
is_correction: false,
id: hit._id!,
}));

View file

@ -20,15 +20,15 @@ describe('parseSuggestionScores', () => {
).toEqual([
{
id: 'my-id',
score: 1,
llmScore: 1,
},
{
id: 'my-other-id',
score: 7,
llmScore: 7,
},
{
id: 'my-another-id',
score: 10,
llmScore: 10,
},
]);
});
@ -37,15 +37,15 @@ describe('parseSuggestionScores', () => {
expect(parseSuggestionScores(`idone,1;idtwo,7;idthree,10`)).toEqual([
{
id: 'idone',
score: 1,
llmScore: 1,
},
{
id: 'idtwo',
score: 7,
llmScore: 7,
},
{
id: 'idthree',
score: 10,
llmScore: 10,
},
]);
});
@ -54,15 +54,15 @@ describe('parseSuggestionScores', () => {
expect(parseSuggestionScores(`a,1 b,7 c,10`)).toEqual([
{
id: 'a',
score: 1,
llmScore: 1,
},
{
id: 'b',
score: 7,
llmScore: 7,
},
{
id: 'c',
score: 10,
llmScore: 10,
},
]);
});

View file

@ -7,7 +7,7 @@
export function parseSuggestionScores(scoresAsString: string) {
// make sure that spaces, semi-colons etc work as separators as well
const scores = scoresAsString
const llmScores = scoresAsString
.replace(/[^0-9a-zA-Z\-_,]/g, ' ')
.trim()
.split(/\s+/)
@ -16,9 +16,9 @@ export function parseSuggestionScores(scoresAsString: string) {
return {
id,
score: parseInt(score, 10),
llmScore: parseInt(score, 10),
};
});
return scores;
return llmScores;
}

View file

@ -91,7 +91,7 @@ describe('recallAndScore', () => {
describe('when no documents are recalled', () => {
let result: {
relevantDocuments?: RecalledSuggestion[];
scores?: Array<{ id: string; score: number }>;
llmScores?: Array<{ id: string; llmScore: number }>;
suggestions: RecalledSuggestion[];
};
@ -111,7 +111,7 @@ describe('recallAndScore', () => {
});
it('returns empty suggestions', async () => {
expect(result).toEqual({ relevantDocuments: [], scores: [], suggestions: [] });
expect(result).toEqual({ relevantDocuments: [], llmScores: [], suggestions: [] });
});
it('invokes recall with user prompt and screen context', async () => {
@ -129,7 +129,7 @@ describe('recallAndScore', () => {
});
it('handles errors when scoring fails', async () => {
mockRecall.mockResolvedValue([{ id: 'doc1', text: 'Hello world', score: 0.5 }]);
mockRecall.mockResolvedValue([{ id: 'doc1', text: 'Hello world', esScore: 0.5 }]);
(scoreSuggestions as jest.Mock).mockRejectedValue(new Error('Scoring failed'));
const result = await recallAndScore({
@ -152,10 +152,10 @@ describe('recallAndScore', () => {
});
it('calls scoreSuggestions with correct arguments', async () => {
const recalledDocs = [{ id: 'doc1', text: 'Hello world', score: 0.8 }];
const recalledDocs = [{ id: 'doc1', text: 'Hello world', esScore: 0.8 }];
mockRecall.mockResolvedValue(recalledDocs);
(scoreSuggestions as jest.Mock).mockResolvedValue({
scores: [{ id: 'doc1', score: 7 }],
llmScores: [{ id: 'doc1', llmScore: 7 }],
relevantDocuments: recalledDocs,
});
@ -184,10 +184,10 @@ describe('recallAndScore', () => {
it('handles the normal conversation flow correctly', async () => {
mockRecall.mockResolvedValue([
{ id: 'fav_color', text: 'My favourite color is blue.', score: 0.9 },
{ id: 'fav_color', text: 'My favourite color is blue.', esScore: 0.9 },
]);
(scoreSuggestions as jest.Mock).mockResolvedValue({
scores: [{ id: 'fav_color', score: 7 }],
llmScores: [{ id: 'fav_color', llmScore: 7 }],
relevantDocuments: [{ id: 'fav_color', text: 'My favourite color is blue.' }],
});
@ -211,10 +211,10 @@ describe('recallAndScore', () => {
it('handles contextual insights conversation flow correctly', async () => {
mockRecall.mockResolvedValue([
{ id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.', score: 0.85 },
{ id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.', esScore: 0.85 },
]);
(scoreSuggestions as jest.Mock).mockResolvedValue({
scores: [{ id: 'alert_cause', score: 6 }],
llmScores: [{ id: 'alert_cause', llmScore: 6 }],
relevantDocuments: [
{ id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.' },
],
@ -239,10 +239,10 @@ describe('recallAndScore', () => {
});
it('reports analytics with the correct structure', async () => {
const recalledDocs = [{ id: 'doc1', text: 'Hello world', score: 0.8 }];
const recalledDocs = [{ id: 'doc1', text: 'Hello world', esScore: 0.8 }];
mockRecall.mockResolvedValue(recalledDocs);
(scoreSuggestions as jest.Mock).mockResolvedValue({
scores: [{ id: 'doc1', score: 7 }],
llmScores: [{ id: 'doc1', llmScore: 7 }],
relevantDocuments: recalledDocs,
});
@ -259,7 +259,7 @@ describe('recallAndScore', () => {
expect(mockAnalytics.reportEvent).toHaveBeenCalledWith(
recallRankingEventType,
expect.objectContaining({ scoredDocuments: [{ elserScore: 0.8, llmScore: 7 }] })
expect.objectContaining({ scoredDocuments: [{ esScore: 0.8, llmScore: 7 }] })
);
});
});

View file

@ -14,7 +14,7 @@ import type { FunctionCallChatFunction } from '../../service/types';
import { RecallRanking, recallRankingEventType } from '../../analytics/recall_ranking';
import { RecalledEntry } from '../../service/knowledge_base_service';
export type RecalledSuggestion = Pick<RecalledEntry, 'id' | 'text' | 'score'>;
export type RecalledSuggestion = Pick<RecalledEntry, 'id' | 'text' | 'esScore'>;
export async function recallAndScore({
recall,
@ -38,7 +38,7 @@ export async function recallAndScore({
signal: AbortSignal;
}): Promise<{
relevantDocuments?: RecalledSuggestion[];
scores?: Array<{ id: string; score: number }>;
llmScores?: Array<{ id: string; llmScore: number }>;
suggestions: RecalledSuggestion[];
}> {
const queries = [
@ -47,19 +47,19 @@ export async function recallAndScore({
].filter((query) => query.text.trim());
const suggestions: RecalledSuggestion[] = (await recall({ queries })).map(
({ id, text, score }) => ({ id, text, score })
({ id, text, esScore }) => ({ id, text, esScore })
);
if (!suggestions.length) {
return {
relevantDocuments: [],
scores: [],
llmScores: [],
suggestions: [],
};
}
try {
const { scores, relevantDocuments } = await scoreSuggestions({
const { llmScores, relevantDocuments } = await scoreSuggestions({
suggestions,
logger,
messages,
@ -72,15 +72,15 @@ export async function recallAndScore({
analytics.reportEvent<RecallRanking>(recallRankingEventType, {
scoredDocuments: suggestions.map((suggestion) => {
const llmScore = scores.find((score) => score.id === suggestion.id);
const llmScore = llmScores.find((score) => score.id === suggestion.id);
return {
elserScore: suggestion.score ?? -1,
llmScore: llmScore ? llmScore.score : -1,
esScore: suggestion.esScore ?? -1,
llmScore: llmScore ? llmScore.llmScore : -1,
};
}),
});
return { scores, relevantDocuments, suggestions };
return { llmScores, relevantDocuments, suggestions };
} catch (error) {
logger.error(`Error scoring documents: ${error.message}`, { error });
return {

View file

@ -15,9 +15,9 @@ import { ChatEvent } from '../../../common/conversation_complete';
import { contextualInsightsMessages, normalConversationMessages } from './recall_and_score.test';
const suggestions: RecalledSuggestion[] = [
{ id: 'doc1', text: 'Relevant document 1', score: 0.9 },
{ id: 'doc2', text: 'Relevant document 2', score: 0.8 },
{ id: 'doc3', text: 'Less relevant document 3', score: 0.3 },
{ id: 'doc1', text: 'Relevant document 1', esScore: 0.9 },
{ id: 'doc2', text: 'Relevant document 2', esScore: 0.8 },
{ id: 'doc3', text: 'Less relevant document 3', esScore: 0.3 },
];
const userPrompt = 'What is my favourite color?';
@ -52,15 +52,15 @@ describe('scoreSuggestions', () => {
logger: mockLogger,
});
expect(result.scores).toEqual([
{ id: 'doc1', score: 7 },
{ id: 'doc2', score: 5 },
{ id: 'doc3', score: 3 },
expect(result.llmScores).toEqual([
{ id: 'doc1', llmScore: 7 },
{ id: 'doc2', llmScore: 5 },
{ id: 'doc3', llmScore: 3 },
]);
expect(result.relevantDocuments).toEqual([
{ id: 'doc1', text: 'Relevant document 1', score: 0.9 },
{ id: 'doc2', text: 'Relevant document 2', score: 0.8 },
{ id: 'doc1', text: 'Relevant document 1', esScore: 0.9 },
{ id: 'doc2', text: 'Relevant document 2', esScore: 0.8 },
]);
});
@ -117,7 +117,7 @@ describe('scoreSuggestions', () => {
});
expect(result.relevantDocuments).toEqual([
{ id: 'doc1', text: 'Relevant document 1', score: 0.9 },
{ id: 'doc1', text: 'Relevant document 1', esScore: 0.9 },
]);
});
@ -159,10 +159,10 @@ describe('scoreSuggestions', () => {
logger: mockLogger,
});
expect(result.scores).toEqual([
{ id: 'doc1', score: 7 },
{ id: 'doc2', score: 5 },
{ id: 'doc3', score: 3 },
expect(result.llmScores).toEqual([
{ id: 'doc1', llmScore: 7 },
{ id: 'doc2', llmScore: 5 },
{ id: 'doc3', llmScore: 3 },
]);
});
});

View file

@ -49,7 +49,7 @@ export async function scoreSuggestions({
logger: Logger;
}): Promise<{
relevantDocuments: RecalledSuggestion[];
scores: Array<{ id: string; score: number }>;
llmScores: Array<{ id: string; llmScore: number }>;
}> {
const shortIdTable = new ShortIdTable();
@ -72,7 +72,7 @@ export async function scoreSuggestions({
Documents:
${JSON.stringify(
suggestions.map((suggestion) => ({
...omit(suggestion, 'score'), // Omit score to not bias the LLM
...omit(suggestion, 'esScore'), // Omit ES score to not bias the LLM
id: shortIdTable.take(suggestion.id), // Shorten id to save tokens
})),
null,
@ -126,21 +126,21 @@ export async function scoreSuggestions({
scoreFunctionRequest.message.function_call.arguments
);
const scores = parseSuggestionScores(scoresAsString)
const llmScores = parseSuggestionScores(scoresAsString)
// Restore original IDs (added fallback to id for testing purposes)
.map(({ id, score }) => ({ id: shortIdTable.lookup(id) || id, score }));
.map(({ id, llmScore }) => ({ id: shortIdTable.lookup(id) || id, llmScore }));
if (scores.length === 0) {
if (llmScores.length === 0) {
// seemingly invalid or no scores, return all
return { relevantDocuments: suggestions, scores: [] };
return { relevantDocuments: suggestions, llmScores: [] };
}
const suggestionIds = suggestions.map((document) => document.id);
// get top 5 documents ids with scores > 4
const relevantDocumentIds = scores
.filter(({ score }) => score > 4)
.sort((a, b) => b.score - a.score)
const relevantDocumentIds = llmScores
.filter(({ llmScore }) => llmScore > 4)
.sort((a, b) => b.llmScore - a.llmScore)
.slice(0, 5)
.filter(({ id }) => suggestionIds.includes(id ?? '')) // Remove hallucinated documents
.map(({ id }) => id);
@ -153,6 +153,6 @@ export async function scoreSuggestions({
return {
relevantDocuments,
scores: scores.map((score) => ({ id: score.id, score: score.score })),
llmScores: llmScores.map((score) => ({ id: score.id, llmScore: score.llmScore })),
};
}

View file

@ -243,7 +243,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
parsedContextResponseData.suggestions.forEach((suggestion: RecalledSuggestion) => {
expect(suggestion).to.have.property('id');
expect(suggestion).to.have.property('text');
expect(suggestion).to.have.property('score');
expect(suggestion).to.have.property('esScore');
});
const suggestionTexts = parsedContextResponseData.suggestions

View file

@ -90,7 +90,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
describe('GET /internal/observability_ai_assistant/functions/recall', () => {
it('produces unique scores for each doc', async () => {
const entries = await recall('What happened during the database outage?');
const uniqueScores = uniq(entries.map(({ score }) => score));
const uniqueScores = uniq(entries.map(({ esScore }) => esScore));
expect(uniqueScores.length).to.be.greaterThan(1);
expect(uniqueScores.length).to.be(8);
});
@ -104,7 +104,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
it('returns entries in a consistent order', async () => {
const entries = await recall('whales');
expect(entries.map(({ id, score }) => `${formatScore(score!)} - ${id}`)).to.eql([
expect(entries.map(({ id, esScore }) => `${formatScore(esScore!)} - ${id}`)).to.eql([
'high - animal_whale_migration_patterns',
'low - animal_elephants_social_structure',
'low - technical_api_gateway_timeouts',
@ -118,12 +118,12 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
it('returns the "Cheetah" entry from search connectors as the top result', async () => {
const entries = await recall('Cheetah');
const { text, score } = first(entries)!;
const { text, esScore } = first(entries)!;
// search connector entries have their entire doc stringified in `text` field
const parsedDoc = JSON.parse(text) as { title: string; text: string };
expect(parsedDoc.title).to.eql('The Life of a Cheetah');
expect(score).to.greaterThan(0.1);
expect(esScore).to.greaterThan(0.1);
});
it('returns different result order for different queries', async () => {