[Obs AI Assistant] Rename score properties to esScore and llmScore for more clarity (#215699)

Closes https://github.com/elastic/kibana/issues/214801

## Summary

In the recall-and-score function and the score-suggestions function, we refer
to both the Elasticsearch score and the LLM score as `score`. This is
confusing and makes the two scores difficult to differentiate.

This PR renames the `score` property based on the context of the score
to either `esScore` or `llmScore`.


### Checklist

- [x] The PR description includes the appropriate Release Notes section,
and the correct `release_note:*` label is applied per the
[guidelines](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
Viduni Wickramarachchi 2025-03-27 08:01:01 +05:30 committed by GitHub
parent 3bc1465365
commit 34cf355080
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 79 additions and 75 deletions

View file

@ -8,7 +8,7 @@
import { RootSchema, EventTypeOpts } from '@kbn/core/server';
interface ScoredDocument {
elserScore: number;
esScore: number;
llmScore: number;
}
@ -21,10 +21,10 @@ const schema: RootSchema<RecallRanking> = {
type: 'array',
items: {
properties: {
elserScore: {
esScore: {
type: 'float',
_meta: {
description: 'The score produced by ELSER text_expansion',
description: 'The score produced by Elasticsearch',
},
},
llmScore: {

View file

@ -65,7 +65,7 @@ export function registerContextFunction({
const userPrompt = userMessage?.message.content!;
const userMessageFunctionName = userMessage?.message.name;
const { scores, relevantDocuments, suggestions } = await recallAndScore({
const { llmScores, relevantDocuments, suggestions } = await recallAndScore({
recall: client.recall,
chat,
logger: resources.logger,
@ -80,7 +80,7 @@ export function registerContextFunction({
return {
content: { ...content, learnings: relevantDocuments as unknown as Serializable },
data: {
scores,
llmScores,
suggestions,
},
};

View file

@ -217,12 +217,12 @@ const chatRecallRoute = createObservabilityAIAssistantServerRoute({
signal,
})
).pipe(
map(({ scores, suggestions, relevantDocuments }) => {
map(({ llmScores, suggestions, relevantDocuments }) => {
return createFunctionResponseMessage({
name: 'context',
data: {
suggestions,
scores,
llmScores,
},
content: {
relevantDocuments,

View file

@ -48,7 +48,7 @@ export interface RecalledEntry {
id: string;
title?: string;
text: string;
score: number | null;
esScore: number | null;
is_correction?: boolean;
labels?: Record<string, string>;
}
@ -129,7 +129,7 @@ export class KnowledgeBaseService {
is_correction: hit._source?.is_correction,
labels: hit._source?.labels,
title: hit._source?.title ?? hit._source?.doc_id, // use `doc_id` as fallback title for backwards compatibility
score: hit._score!,
esScore: hit._score!,
id: hit._id!,
}));
}
@ -193,7 +193,7 @@ export class KnowledgeBaseService {
const sortedEntries = orderBy(
documentsFromKb.concat(documentsFromConnectors),
'score',
'esScore',
'desc'
).slice(0, limit.size ?? 20);

View file

@ -51,7 +51,11 @@ export async function recallFromSearchConnectors({
}),
]);
return orderBy([...semanticTextConnectors, ...legacyConnectors], (entry) => entry.score, 'desc');
return orderBy(
[...semanticTextConnectors, ...legacyConnectors],
(entry) => entry.esScore,
'desc'
);
}
async function recallFromSemanticTextConnectors({
@ -108,7 +112,7 @@ async function recallFromSemanticTextConnectors({
const results = response.hits.hits.map((hit) => ({
text: JSON.stringify(hit._source),
score: hit._score!,
esScore: hit._score!,
is_correction: false,
id: hit._id!,
}));
@ -194,7 +198,7 @@ async function recallFromLegacyConnectors({
const results = response.hits.hits.map((hit) => ({
text: JSON.stringify(hit._source),
score: hit._score!,
esScore: hit._score!,
is_correction: false,
id: hit._id!,
}));

View file

@ -20,15 +20,15 @@ describe('parseSuggestionScores', () => {
).toEqual([
{
id: 'my-id',
score: 1,
llmScore: 1,
},
{
id: 'my-other-id',
score: 7,
llmScore: 7,
},
{
id: 'my-another-id',
score: 10,
llmScore: 10,
},
]);
});
@ -37,15 +37,15 @@ describe('parseSuggestionScores', () => {
expect(parseSuggestionScores(`idone,1;idtwo,7;idthree,10`)).toEqual([
{
id: 'idone',
score: 1,
llmScore: 1,
},
{
id: 'idtwo',
score: 7,
llmScore: 7,
},
{
id: 'idthree',
score: 10,
llmScore: 10,
},
]);
});
@ -54,15 +54,15 @@ describe('parseSuggestionScores', () => {
expect(parseSuggestionScores(`a,1 b,7 c,10`)).toEqual([
{
id: 'a',
score: 1,
llmScore: 1,
},
{
id: 'b',
score: 7,
llmScore: 7,
},
{
id: 'c',
score: 10,
llmScore: 10,
},
]);
});

View file

@ -7,7 +7,7 @@
export function parseSuggestionScores(scoresAsString: string) {
// make sure that spaces, semi-colons etc work as separators as well
const scores = scoresAsString
const llmScores = scoresAsString
.replace(/[^0-9a-zA-Z\-_,]/g, ' ')
.trim()
.split(/\s+/)
@ -16,9 +16,9 @@ export function parseSuggestionScores(scoresAsString: string) {
return {
id,
score: parseInt(score, 10),
llmScore: parseInt(score, 10),
};
});
return scores;
return llmScores;
}

View file

@ -91,7 +91,7 @@ describe('recallAndScore', () => {
describe('when no documents are recalled', () => {
let result: {
relevantDocuments?: RecalledSuggestion[];
scores?: Array<{ id: string; score: number }>;
llmScores?: Array<{ id: string; llmScore: number }>;
suggestions: RecalledSuggestion[];
};
@ -111,7 +111,7 @@ describe('recallAndScore', () => {
});
it('returns empty suggestions', async () => {
expect(result).toEqual({ relevantDocuments: [], scores: [], suggestions: [] });
expect(result).toEqual({ relevantDocuments: [], llmScores: [], suggestions: [] });
});
it('invokes recall with user prompt and screen context', async () => {
@ -129,7 +129,7 @@ describe('recallAndScore', () => {
});
it('handles errors when scoring fails', async () => {
mockRecall.mockResolvedValue([{ id: 'doc1', text: 'Hello world', score: 0.5 }]);
mockRecall.mockResolvedValue([{ id: 'doc1', text: 'Hello world', esScore: 0.5 }]);
(scoreSuggestions as jest.Mock).mockRejectedValue(new Error('Scoring failed'));
const result = await recallAndScore({
@ -152,10 +152,10 @@ describe('recallAndScore', () => {
});
it('calls scoreSuggestions with correct arguments', async () => {
const recalledDocs = [{ id: 'doc1', text: 'Hello world', score: 0.8 }];
const recalledDocs = [{ id: 'doc1', text: 'Hello world', esScore: 0.8 }];
mockRecall.mockResolvedValue(recalledDocs);
(scoreSuggestions as jest.Mock).mockResolvedValue({
scores: [{ id: 'doc1', score: 7 }],
llmScores: [{ id: 'doc1', llmScore: 7 }],
relevantDocuments: recalledDocs,
});
@ -184,10 +184,10 @@ describe('recallAndScore', () => {
it('handles the normal conversation flow correctly', async () => {
mockRecall.mockResolvedValue([
{ id: 'fav_color', text: 'My favourite color is blue.', score: 0.9 },
{ id: 'fav_color', text: 'My favourite color is blue.', esScore: 0.9 },
]);
(scoreSuggestions as jest.Mock).mockResolvedValue({
scores: [{ id: 'fav_color', score: 7 }],
llmScores: [{ id: 'fav_color', llmScore: 7 }],
relevantDocuments: [{ id: 'fav_color', text: 'My favourite color is blue.' }],
});
@ -211,10 +211,10 @@ describe('recallAndScore', () => {
it('handles contextual insights conversation flow correctly', async () => {
mockRecall.mockResolvedValue([
{ id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.', score: 0.85 },
{ id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.', esScore: 0.85 },
]);
(scoreSuggestions as jest.Mock).mockResolvedValue({
scores: [{ id: 'alert_cause', score: 6 }],
llmScores: [{ id: 'alert_cause', llmScore: 6 }],
relevantDocuments: [
{ id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.' },
],
@ -239,10 +239,10 @@ describe('recallAndScore', () => {
});
it('reports analytics with the correct structure', async () => {
const recalledDocs = [{ id: 'doc1', text: 'Hello world', score: 0.8 }];
const recalledDocs = [{ id: 'doc1', text: 'Hello world', esScore: 0.8 }];
mockRecall.mockResolvedValue(recalledDocs);
(scoreSuggestions as jest.Mock).mockResolvedValue({
scores: [{ id: 'doc1', score: 7 }],
llmScores: [{ id: 'doc1', llmScore: 7 }],
relevantDocuments: recalledDocs,
});
@ -259,7 +259,7 @@ describe('recallAndScore', () => {
expect(mockAnalytics.reportEvent).toHaveBeenCalledWith(
recallRankingEventType,
expect.objectContaining({ scoredDocuments: [{ elserScore: 0.8, llmScore: 7 }] })
expect.objectContaining({ scoredDocuments: [{ esScore: 0.8, llmScore: 7 }] })
);
});
});

View file

@ -14,7 +14,7 @@ import type { FunctionCallChatFunction } from '../../service/types';
import { RecallRanking, recallRankingEventType } from '../../analytics/recall_ranking';
import { RecalledEntry } from '../../service/knowledge_base_service';
export type RecalledSuggestion = Pick<RecalledEntry, 'id' | 'text' | 'score'>;
export type RecalledSuggestion = Pick<RecalledEntry, 'id' | 'text' | 'esScore'>;
export async function recallAndScore({
recall,
@ -38,7 +38,7 @@ export async function recallAndScore({
signal: AbortSignal;
}): Promise<{
relevantDocuments?: RecalledSuggestion[];
scores?: Array<{ id: string; score: number }>;
llmScores?: Array<{ id: string; llmScore: number }>;
suggestions: RecalledSuggestion[];
}> {
const queries = [
@ -47,19 +47,19 @@ export async function recallAndScore({
].filter((query) => query.text.trim());
const suggestions: RecalledSuggestion[] = (await recall({ queries })).map(
({ id, text, score }) => ({ id, text, score })
({ id, text, esScore }) => ({ id, text, esScore })
);
if (!suggestions.length) {
return {
relevantDocuments: [],
scores: [],
llmScores: [],
suggestions: [],
};
}
try {
const { scores, relevantDocuments } = await scoreSuggestions({
const { llmScores, relevantDocuments } = await scoreSuggestions({
suggestions,
logger,
messages,
@ -72,15 +72,15 @@ export async function recallAndScore({
analytics.reportEvent<RecallRanking>(recallRankingEventType, {
scoredDocuments: suggestions.map((suggestion) => {
const llmScore = scores.find((score) => score.id === suggestion.id);
const llmScore = llmScores.find((score) => score.id === suggestion.id);
return {
elserScore: suggestion.score ?? -1,
llmScore: llmScore ? llmScore.score : -1,
esScore: suggestion.esScore ?? -1,
llmScore: llmScore ? llmScore.llmScore : -1,
};
}),
});
return { scores, relevantDocuments, suggestions };
return { llmScores, relevantDocuments, suggestions };
} catch (error) {
logger.error(`Error scoring documents: ${error.message}`, { error });
return {

View file

@ -15,9 +15,9 @@ import { ChatEvent } from '../../../common/conversation_complete';
import { contextualInsightsMessages, normalConversationMessages } from './recall_and_score.test';
const suggestions: RecalledSuggestion[] = [
{ id: 'doc1', text: 'Relevant document 1', score: 0.9 },
{ id: 'doc2', text: 'Relevant document 2', score: 0.8 },
{ id: 'doc3', text: 'Less relevant document 3', score: 0.3 },
{ id: 'doc1', text: 'Relevant document 1', esScore: 0.9 },
{ id: 'doc2', text: 'Relevant document 2', esScore: 0.8 },
{ id: 'doc3', text: 'Less relevant document 3', esScore: 0.3 },
];
const userPrompt = 'What is my favourite color?';
@ -52,15 +52,15 @@ describe('scoreSuggestions', () => {
logger: mockLogger,
});
expect(result.scores).toEqual([
{ id: 'doc1', score: 7 },
{ id: 'doc2', score: 5 },
{ id: 'doc3', score: 3 },
expect(result.llmScores).toEqual([
{ id: 'doc1', llmScore: 7 },
{ id: 'doc2', llmScore: 5 },
{ id: 'doc3', llmScore: 3 },
]);
expect(result.relevantDocuments).toEqual([
{ id: 'doc1', text: 'Relevant document 1', score: 0.9 },
{ id: 'doc2', text: 'Relevant document 2', score: 0.8 },
{ id: 'doc1', text: 'Relevant document 1', esScore: 0.9 },
{ id: 'doc2', text: 'Relevant document 2', esScore: 0.8 },
]);
});
@ -117,7 +117,7 @@ describe('scoreSuggestions', () => {
});
expect(result.relevantDocuments).toEqual([
{ id: 'doc1', text: 'Relevant document 1', score: 0.9 },
{ id: 'doc1', text: 'Relevant document 1', esScore: 0.9 },
]);
});
@ -159,10 +159,10 @@ describe('scoreSuggestions', () => {
logger: mockLogger,
});
expect(result.scores).toEqual([
{ id: 'doc1', score: 7 },
{ id: 'doc2', score: 5 },
{ id: 'doc3', score: 3 },
expect(result.llmScores).toEqual([
{ id: 'doc1', llmScore: 7 },
{ id: 'doc2', llmScore: 5 },
{ id: 'doc3', llmScore: 3 },
]);
});
});

View file

@ -49,7 +49,7 @@ export async function scoreSuggestions({
logger: Logger;
}): Promise<{
relevantDocuments: RecalledSuggestion[];
scores: Array<{ id: string; score: number }>;
llmScores: Array<{ id: string; llmScore: number }>;
}> {
const shortIdTable = new ShortIdTable();
@ -72,7 +72,7 @@ export async function scoreSuggestions({
Documents:
${JSON.stringify(
suggestions.map((suggestion) => ({
...omit(suggestion, 'score'), // Omit score to not bias the LLM
...omit(suggestion, 'esScore'), // Omit ES score to not bias the LLM
id: shortIdTable.take(suggestion.id), // Shorten id to save tokens
})),
null,
@ -126,21 +126,21 @@ export async function scoreSuggestions({
scoreFunctionRequest.message.function_call.arguments
);
const scores = parseSuggestionScores(scoresAsString)
const llmScores = parseSuggestionScores(scoresAsString)
// Restore original IDs (added fallback to id for testing purposes)
.map(({ id, score }) => ({ id: shortIdTable.lookup(id) || id, score }));
.map(({ id, llmScore }) => ({ id: shortIdTable.lookup(id) || id, llmScore }));
if (scores.length === 0) {
if (llmScores.length === 0) {
// seemingly invalid or no scores, return all
return { relevantDocuments: suggestions, scores: [] };
return { relevantDocuments: suggestions, llmScores: [] };
}
const suggestionIds = suggestions.map((document) => document.id);
// get top 5 documents ids with scores > 4
const relevantDocumentIds = scores
.filter(({ score }) => score > 4)
.sort((a, b) => b.score - a.score)
const relevantDocumentIds = llmScores
.filter(({ llmScore }) => llmScore > 4)
.sort((a, b) => b.llmScore - a.llmScore)
.slice(0, 5)
.filter(({ id }) => suggestionIds.includes(id ?? '')) // Remove hallucinated documents
.map(({ id }) => id);
@ -153,6 +153,6 @@ export async function scoreSuggestions({
return {
relevantDocuments,
scores: scores.map((score) => ({ id: score.id, score: score.score })),
llmScores: llmScores.map((score) => ({ id: score.id, llmScore: score.llmScore })),
};
}

View file

@ -243,7 +243,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
parsedContextResponseData.suggestions.forEach((suggestion: RecalledSuggestion) => {
expect(suggestion).to.have.property('id');
expect(suggestion).to.have.property('text');
expect(suggestion).to.have.property('score');
expect(suggestion).to.have.property('esScore');
});
const suggestionTexts = parsedContextResponseData.suggestions

View file

@ -90,7 +90,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
describe('GET /internal/observability_ai_assistant/functions/recall', () => {
it('produces unique scores for each doc', async () => {
const entries = await recall('What happened during the database outage?');
const uniqueScores = uniq(entries.map(({ score }) => score));
const uniqueScores = uniq(entries.map(({ esScore }) => esScore));
expect(uniqueScores.length).to.be.greaterThan(1);
expect(uniqueScores.length).to.be(8);
});
@ -104,7 +104,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
it('returns entries in a consistent order', async () => {
const entries = await recall('whales');
expect(entries.map(({ id, score }) => `${formatScore(score!)} - ${id}`)).to.eql([
expect(entries.map(({ id, esScore }) => `${formatScore(esScore!)} - ${id}`)).to.eql([
'high - animal_whale_migration_patterns',
'low - animal_elephants_social_structure',
'low - technical_api_gateway_timeouts',
@ -118,12 +118,12 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
it('returns the "Cheetah" entry from search connectors as the top result', async () => {
const entries = await recall('Cheetah');
const { text, score } = first(entries)!;
const { text, esScore } = first(entries)!;
// search connector entries have their entire doc stringified in `text` field
const parsedDoc = JSON.parse(text) as { title: string; text: string };
expect(parsedDoc.title).to.eql('The Life of a Cheetah');
expect(score).to.greaterThan(0.1);
expect(esScore).to.greaterThan(0.1);
});
it('returns different result order for different queries', async () => {