Mirror of https://github.com/elastic/kibana.git (synced 2025-04-24 17:59:23 -04:00)
[Obs AI Assistant] Rename score properties to esScore and llmScore for more clarity (#215699)
Closes https://github.com/elastic/kibana/issues/214801

## Summary

In the recall-and-score function and the score-suggestions function, both the Elasticsearch score and the LLM score are referred to as `score`, which is confusing and hard to tell apart. This PR renames the `score` property, based on which score it holds, to either `esScore` or `llmScore`.

### Checklist

- [x] The PR description includes the appropriate Release Notes section, and the correct `release_note:*` label is applied per the [guidelines](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Parent: 3bc1465365
Commit: 34cf355080
13 changed files with 79 additions and 75 deletions
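For orientation before the per-file hunks: a minimal TypeScript sketch of the two renamed shapes, assembled from the types changed below. The `LlmScore` alias is purely illustrative (the code uses inline `Array<{ id: string; llmScore: number }>` types), so read this as a summary, not the actual definitions.

```ts
// Elasticsearch-side relevance, attached to each recalled knowledge-base entry
// (previously the `score` property on RecalledEntry).
interface RecalledEntry {
  id: string;
  title?: string;
  text: string;
  esScore: number | null;
  is_correction?: boolean;
  labels?: Record<string, string>;
}

// LLM-side relevance, as returned by scoreSuggestions / parseSuggestionScores
// (previously `{ id: string; score: number }`). The alias name is illustrative only.
type LlmScore = { id: string; llmScore: number };
```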
@@ -8,7 +8,7 @@
 import { RootSchema, EventTypeOpts } from '@kbn/core/server';

 interface ScoredDocument {
-  elserScore: number;
+  esScore: number;
   llmScore: number;
 }

@@ -21,10 +21,10 @@ const schema: RootSchema<RecallRanking> = {
     type: 'array',
     items: {
       properties: {
-        elserScore: {
+        esScore: {
           type: 'float',
           _meta: {
-            description: 'The score produced by ELSER text_expansion',
+            description: 'The score produced by Elasticsearch',
           },
         },
         llmScore: {
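With the schema above, a recall-ranking telemetry payload after the rename would look roughly like this; the values are illustrative and mirror the expectation in the analytics test further down.

```ts
// Illustrative event payload: one entry per recalled document, pairing the
// Elasticsearch relevance with the LLM-assigned relevance.
const exampleRecallRankingEvent = {
  scoredDocuments: [{ esScore: 0.8, llmScore: 7 }],
};
```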
@@ -65,7 +65,7 @@ export function registerContextFunction({
       const userPrompt = userMessage?.message.content!;
       const userMessageFunctionName = userMessage?.message.name;

-      const { scores, relevantDocuments, suggestions } = await recallAndScore({
+      const { llmScores, relevantDocuments, suggestions } = await recallAndScore({
         recall: client.recall,
         chat,
         logger: resources.logger,
@@ -80,7 +80,7 @@
       return {
         content: { ...content, learnings: relevantDocuments as unknown as Serializable },
         data: {
-          scores,
+          llmScores,
           suggestions,
         },
       };
@@ -217,12 +217,12 @@ const chatRecallRoute = createObservabilityAIAssistantServerRoute({
         signal,
       })
     ).pipe(
-      map(({ scores, suggestions, relevantDocuments }) => {
+      map(({ llmScores, suggestions, relevantDocuments }) => {
         return createFunctionResponseMessage({
           name: 'context',
           data: {
             suggestions,
-            scores,
+            llmScores,
           },
           content: {
             relevantDocuments,
@@ -48,7 +48,7 @@ export interface RecalledEntry {
   id: string;
   title?: string;
   text: string;
-  score: number | null;
+  esScore: number | null;
   is_correction?: boolean;
   labels?: Record<string, string>;
 }
@@ -129,7 +129,7 @@ export class KnowledgeBaseService {
       is_correction: hit._source?.is_correction,
       labels: hit._source?.labels,
       title: hit._source?.title ?? hit._source?.doc_id, // use `doc_id` as fallback title for backwards compatibility
-      score: hit._score!,
+      esScore: hit._score!,
       id: hit._id!,
     }));
   }
@@ -193,7 +193,7 @@ export class KnowledgeBaseService {

     const sortedEntries = orderBy(
       documentsFromKb.concat(documentsFromConnectors),
-      'score',
+      'esScore',
       'desc'
     ).slice(0, limit.size ?? 20);

@@ -51,7 +51,11 @@ export async function recallFromSearchConnectors({
     }),
   ]);

-  return orderBy([...semanticTextConnectors, ...legacyConnectors], (entry) => entry.score, 'desc');
+  return orderBy(
+    [...semanticTextConnectors, ...legacyConnectors],
+    (entry) => entry.esScore,
+    'desc'
+  );
 }

 async function recallFromSemanticTextConnectors({
@@ -108,7 +112,7 @@ async function recallFromSemanticTextConnectors({

   const results = response.hits.hits.map((hit) => ({
     text: JSON.stringify(hit._source),
-    score: hit._score!,
+    esScore: hit._score!,
     is_correction: false,
     id: hit._id!,
   }));
@@ -194,7 +198,7 @@ async function recallFromLegacyConnectors({

   const results = response.hits.hits.map((hit) => ({
     text: JSON.stringify(hit._source),
-    score: hit._score!,
+    esScore: hit._score!,
     is_correction: false,
     id: hit._id!,
   }));
@@ -20,15 +20,15 @@ describe('parseSuggestionScores', () => {
     ).toEqual([
       {
         id: 'my-id',
-        score: 1,
+        llmScore: 1,
       },
       {
         id: 'my-other-id',
-        score: 7,
+        llmScore: 7,
       },
       {
         id: 'my-another-id',
-        score: 10,
+        llmScore: 10,
       },
     ]);
   });
@@ -37,15 +37,15 @@ describe('parseSuggestionScores', () => {
     expect(parseSuggestionScores(`idone,1;idtwo,7;idthree,10`)).toEqual([
       {
         id: 'idone',
-        score: 1,
+        llmScore: 1,
       },
       {
         id: 'idtwo',
-        score: 7,
+        llmScore: 7,
       },
       {
         id: 'idthree',
-        score: 10,
+        llmScore: 10,
       },
     ]);
   });
@@ -54,15 +54,15 @@ describe('parseSuggestionScores', () => {
     expect(parseSuggestionScores(`a,1 b,7 c,10`)).toEqual([
       {
         id: 'a',
-        score: 1,
+        llmScore: 1,
       },
       {
         id: 'b',
-        score: 7,
+        llmScore: 7,
       },
       {
         id: 'c',
-        score: 10,
+        llmScore: 10,
       },
     ]);
   });
@@ -7,7 +7,7 @@

 export function parseSuggestionScores(scoresAsString: string) {
   // make sure that spaces, semi-colons etc work as separators as well
-  const scores = scoresAsString
+  const llmScores = scoresAsString
     .replace(/[^0-9a-zA-Z\-_,]/g, ' ')
     .trim()
     .split(/\s+/)
@@ -16,9 +16,9 @@ export function parseSuggestionScores(scoresAsString: string) {

       return {
         id,
-        score: parseInt(score, 10),
+        llmScore: parseInt(score, 10),
       };
     });

-  return scores;
+  return llmScores;
 }
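For reference, a usage sketch of `parseSuggestionScores` consistent with the tests above; the relative import path is assumed for illustration.

```ts
// Assumed path; adjust to wherever parseSuggestionScores is exported from.
import { parseSuggestionScores } from './parse_suggestion_scores';

// A comma separates an id from its score; spaces, semicolons, or newlines may
// separate the pairs themselves (other characters are normalized to spaces).
const llmScores = parseSuggestionScores('idone,1;idtwo,7;idthree,10');
// => [
//   { id: 'idone', llmScore: 1 },
//   { id: 'idtwo', llmScore: 7 },
//   { id: 'idthree', llmScore: 10 },
// ]
```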
@@ -91,7 +91,7 @@ describe('recallAndScore', () => {
   describe('when no documents are recalled', () => {
     let result: {
       relevantDocuments?: RecalledSuggestion[];
-      scores?: Array<{ id: string; score: number }>;
+      llmScores?: Array<{ id: string; llmScore: number }>;
       suggestions: RecalledSuggestion[];
     };

@@ -111,7 +111,7 @@ describe('recallAndScore', () => {
     });

     it('returns empty suggestions', async () => {
-      expect(result).toEqual({ relevantDocuments: [], scores: [], suggestions: [] });
+      expect(result).toEqual({ relevantDocuments: [], llmScores: [], suggestions: [] });
     });

     it('invokes recall with user prompt and screen context', async () => {
@@ -129,7 +129,7 @@ describe('recallAndScore', () => {
   });

   it('handles errors when scoring fails', async () => {
-    mockRecall.mockResolvedValue([{ id: 'doc1', text: 'Hello world', score: 0.5 }]);
+    mockRecall.mockResolvedValue([{ id: 'doc1', text: 'Hello world', esScore: 0.5 }]);
     (scoreSuggestions as jest.Mock).mockRejectedValue(new Error('Scoring failed'));

     const result = await recallAndScore({
@@ -152,10 +152,10 @@ describe('recallAndScore', () => {
   });

   it('calls scoreSuggestions with correct arguments', async () => {
-    const recalledDocs = [{ id: 'doc1', text: 'Hello world', score: 0.8 }];
+    const recalledDocs = [{ id: 'doc1', text: 'Hello world', esScore: 0.8 }];
     mockRecall.mockResolvedValue(recalledDocs);
     (scoreSuggestions as jest.Mock).mockResolvedValue({
-      scores: [{ id: 'doc1', score: 7 }],
+      llmScores: [{ id: 'doc1', llmScore: 7 }],
       relevantDocuments: recalledDocs,
     });

@@ -184,10 +184,10 @@ describe('recallAndScore', () => {

   it('handles the normal conversation flow correctly', async () => {
     mockRecall.mockResolvedValue([
-      { id: 'fav_color', text: 'My favourite color is blue.', score: 0.9 },
+      { id: 'fav_color', text: 'My favourite color is blue.', esScore: 0.9 },
     ]);
     (scoreSuggestions as jest.Mock).mockResolvedValue({
-      scores: [{ id: 'fav_color', score: 7 }],
+      llmScores: [{ id: 'fav_color', llmScore: 7 }],
       relevantDocuments: [{ id: 'fav_color', text: 'My favourite color is blue.' }],
     });

@@ -211,10 +211,10 @@ describe('recallAndScore', () => {

   it('handles contextual insights conversation flow correctly', async () => {
     mockRecall.mockResolvedValue([
-      { id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.', score: 0.85 },
+      { id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.', esScore: 0.85 },
     ]);
     (scoreSuggestions as jest.Mock).mockResolvedValue({
-      scores: [{ id: 'alert_cause', score: 6 }],
+      llmScores: [{ id: 'alert_cause', llmScore: 6 }],
       relevantDocuments: [
         { id: 'alert_cause', text: 'The alert was triggered due to high CPU usage.' },
       ],
@@ -239,10 +239,10 @@ describe('recallAndScore', () => {
   });

   it('reports analytics with the correct structure', async () => {
-    const recalledDocs = [{ id: 'doc1', text: 'Hello world', score: 0.8 }];
+    const recalledDocs = [{ id: 'doc1', text: 'Hello world', esScore: 0.8 }];
     mockRecall.mockResolvedValue(recalledDocs);
     (scoreSuggestions as jest.Mock).mockResolvedValue({
-      scores: [{ id: 'doc1', score: 7 }],
+      llmScores: [{ id: 'doc1', llmScore: 7 }],
       relevantDocuments: recalledDocs,
     });

@@ -259,7 +259,7 @@ describe('recallAndScore', () => {

     expect(mockAnalytics.reportEvent).toHaveBeenCalledWith(
       recallRankingEventType,
-      expect.objectContaining({ scoredDocuments: [{ elserScore: 0.8, llmScore: 7 }] })
+      expect.objectContaining({ scoredDocuments: [{ esScore: 0.8, llmScore: 7 }] })
     );
   });
 });
@@ -14,7 +14,7 @@ import type { FunctionCallChatFunction } from '../../service/types';
 import { RecallRanking, recallRankingEventType } from '../../analytics/recall_ranking';
 import { RecalledEntry } from '../../service/knowledge_base_service';

-export type RecalledSuggestion = Pick<RecalledEntry, 'id' | 'text' | 'score'>;
+export type RecalledSuggestion = Pick<RecalledEntry, 'id' | 'text' | 'esScore'>;

 export async function recallAndScore({
   recall,
@@ -38,7 +38,7 @@ export async function recallAndScore({
   signal: AbortSignal;
 }): Promise<{
   relevantDocuments?: RecalledSuggestion[];
-  scores?: Array<{ id: string; score: number }>;
+  llmScores?: Array<{ id: string; llmScore: number }>;
   suggestions: RecalledSuggestion[];
 }> {
   const queries = [
@@ -47,19 +47,19 @@ export async function recallAndScore({
   ].filter((query) => query.text.trim());

   const suggestions: RecalledSuggestion[] = (await recall({ queries })).map(
-    ({ id, text, score }) => ({ id, text, score })
+    ({ id, text, esScore }) => ({ id, text, esScore })
   );

   if (!suggestions.length) {
     return {
       relevantDocuments: [],
-      scores: [],
+      llmScores: [],
       suggestions: [],
     };
   }

   try {
-    const { scores, relevantDocuments } = await scoreSuggestions({
+    const { llmScores, relevantDocuments } = await scoreSuggestions({
       suggestions,
       logger,
       messages,
@@ -72,15 +72,15 @@

     analytics.reportEvent<RecallRanking>(recallRankingEventType, {
       scoredDocuments: suggestions.map((suggestion) => {
-        const llmScore = scores.find((score) => score.id === suggestion.id);
+        const llmScore = llmScores.find((score) => score.id === suggestion.id);
         return {
-          elserScore: suggestion.score ?? -1,
-          llmScore: llmScore ? llmScore.score : -1,
+          esScore: suggestion.esScore ?? -1,
+          llmScore: llmScore ? llmScore.llmScore : -1,
         };
       }),
     });

-    return { scores, relevantDocuments, suggestions };
+    return { llmScores, relevantDocuments, suggestions };
   } catch (error) {
     logger.error(`Error scoring documents: ${error.message}`, { error });
     return {
@@ -15,9 +15,9 @@ import { ChatEvent } from '../../../common/conversation_complete';
 import { contextualInsightsMessages, normalConversationMessages } from './recall_and_score.test';

 const suggestions: RecalledSuggestion[] = [
-  { id: 'doc1', text: 'Relevant document 1', score: 0.9 },
-  { id: 'doc2', text: 'Relevant document 2', score: 0.8 },
-  { id: 'doc3', text: 'Less relevant document 3', score: 0.3 },
+  { id: 'doc1', text: 'Relevant document 1', esScore: 0.9 },
+  { id: 'doc2', text: 'Relevant document 2', esScore: 0.8 },
+  { id: 'doc3', text: 'Less relevant document 3', esScore: 0.3 },
 ];

 const userPrompt = 'What is my favourite color?';
@@ -52,15 +52,15 @@ describe('scoreSuggestions', () => {
       logger: mockLogger,
     });

-    expect(result.scores).toEqual([
-      { id: 'doc1', score: 7 },
-      { id: 'doc2', score: 5 },
-      { id: 'doc3', score: 3 },
+    expect(result.llmScores).toEqual([
+      { id: 'doc1', llmScore: 7 },
+      { id: 'doc2', llmScore: 5 },
+      { id: 'doc3', llmScore: 3 },
     ]);

     expect(result.relevantDocuments).toEqual([
-      { id: 'doc1', text: 'Relevant document 1', score: 0.9 },
-      { id: 'doc2', text: 'Relevant document 2', score: 0.8 },
+      { id: 'doc1', text: 'Relevant document 1', esScore: 0.9 },
+      { id: 'doc2', text: 'Relevant document 2', esScore: 0.8 },
     ]);
   });

@@ -117,7 +117,7 @@
     });

     expect(result.relevantDocuments).toEqual([
-      { id: 'doc1', text: 'Relevant document 1', score: 0.9 },
+      { id: 'doc1', text: 'Relevant document 1', esScore: 0.9 },
     ]);
   });

@@ -159,10 +159,10 @@
       logger: mockLogger,
     });

-    expect(result.scores).toEqual([
-      { id: 'doc1', score: 7 },
-      { id: 'doc2', score: 5 },
-      { id: 'doc3', score: 3 },
+    expect(result.llmScores).toEqual([
+      { id: 'doc1', llmScore: 7 },
+      { id: 'doc2', llmScore: 5 },
+      { id: 'doc3', llmScore: 3 },
     ]);
   });
 });
@@ -49,7 +49,7 @@ export async function scoreSuggestions({
   logger: Logger;
 }): Promise<{
   relevantDocuments: RecalledSuggestion[];
-  scores: Array<{ id: string; score: number }>;
+  llmScores: Array<{ id: string; llmScore: number }>;
 }> {
   const shortIdTable = new ShortIdTable();

@@ -72,7 +72,7 @@
   Documents:
   ${JSON.stringify(
     suggestions.map((suggestion) => ({
-      ...omit(suggestion, 'score'), // Omit score to not bias the LLM
+      ...omit(suggestion, 'esScore'), // Omit ES score to not bias the LLM
       id: shortIdTable.take(suggestion.id), // Shorten id to save tokens
     })),
     null,
@@ -126,21 +126,21 @@
     scoreFunctionRequest.message.function_call.arguments
   );

-  const scores = parseSuggestionScores(scoresAsString)
+  const llmScores = parseSuggestionScores(scoresAsString)
     // Restore original IDs (added fallback to id for testing purposes)
-    .map(({ id, score }) => ({ id: shortIdTable.lookup(id) || id, score }));
+    .map(({ id, llmScore }) => ({ id: shortIdTable.lookup(id) || id, llmScore }));

-  if (scores.length === 0) {
+  if (llmScores.length === 0) {
     // seemingly invalid or no scores, return all
-    return { relevantDocuments: suggestions, scores: [] };
+    return { relevantDocuments: suggestions, llmScores: [] };
   }

   const suggestionIds = suggestions.map((document) => document.id);

   // get top 5 documents ids with scores > 4
-  const relevantDocumentIds = scores
-    .filter(({ score }) => score > 4)
-    .sort((a, b) => b.score - a.score)
+  const relevantDocumentIds = llmScores
+    .filter(({ llmScore }) => llmScore > 4)
+    .sort((a, b) => b.llmScore - a.llmScore)
     .slice(0, 5)
     .filter(({ id }) => suggestionIds.includes(id ?? '')) // Remove hallucinated documents
     .map(({ id }) => id);
@@ -153,6 +153,6 @@

   return {
     relevantDocuments,
-    scores: scores.map((score) => ({ id: score.id, score: score.score })),
+    llmScores: llmScores.map((score) => ({ id: score.id, llmScore: score.llmScore })),
   };
 }
@@ -243,7 +243,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
       parsedContextResponseData.suggestions.forEach((suggestion: RecalledSuggestion) => {
         expect(suggestion).to.have.property('id');
         expect(suggestion).to.have.property('text');
-        expect(suggestion).to.have.property('score');
+        expect(suggestion).to.have.property('esScore');
       });

       const suggestionTexts = parsedContextResponseData.suggestions
@@ -90,7 +90,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
   describe('GET /internal/observability_ai_assistant/functions/recall', () => {
     it('produces unique scores for each doc', async () => {
       const entries = await recall('What happened during the database outage?');
-      const uniqueScores = uniq(entries.map(({ score }) => score));
+      const uniqueScores = uniq(entries.map(({ esScore }) => esScore));
       expect(uniqueScores.length).to.be.greaterThan(1);
       expect(uniqueScores.length).to.be(8);
     });
@@ -104,7 +104,7 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon
     it('returns entries in a consistent order', async () => {
       const entries = await recall('whales');

-      expect(entries.map(({ id, score }) => `${formatScore(score!)} - ${id}`)).to.eql([
+      expect(entries.map(({ id, esScore }) => `${formatScore(esScore!)} - ${id}`)).to.eql([
         'high - animal_whale_migration_patterns',
         'low - animal_elephants_social_structure',
         'low - technical_api_gateway_timeouts',
@@ -118,12 +118,12 @@ export default function ApiTest({ getService }: DeploymentAgnosticFtrProviderCon

     it('returns the "Cheetah" entry from search connectors as the top result', async () => {
       const entries = await recall('Cheetah');
-      const { text, score } = first(entries)!;
+      const { text, esScore } = first(entries)!;

       // search connector entries have their entire doc stringified in `text` field
       const parsedDoc = JSON.parse(text) as { title: string; text: string };
       expect(parsedDoc.title).to.eql('The Life of a Cheetah');
-      expect(score).to.greaterThan(0.1);
+      expect(esScore).to.greaterThan(0.1);
     });

     it('returns different result order for different queries', async () => {