[product doc] adapt for new format of semantic_text field (#206051)

## Summary

Fixes https://github.com/elastic/kibana/issues/205908

Adapts the product documentation's usage of `semantic_text` to the
breaking changes that will be introduced in 8.18 and 9.0.

This PR introduces a new format version (`2.0.0`) for the product
documentation artifacts, covering the changes required for the incoming
`semantic_text` breaking change:
- include the `_inference_fields` meta field when bundling the doc
artifacts
- set the `index.mapping.semantic_text.use_legacy_format` index setting
to `false` to force the new format
- change the way we're internally overriding the `inference_id` when
ingesting the data
- adapt the `search` logic to retrieve the data from the right location
for each format (see the sketch below)
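
For reference, here is roughly how the document shape changes between the two formats, based on the test fixtures in this PR (the `semantic` field name comes from those fixtures and is illustrative):

```ts
// Legacy format (manifest 1.0.0): the inference metadata is embedded in the field value.
const legacyDoc = {
  semantic: { text: 'foo', inference: { inference_id: '.some-inference' } },
};

// New format (manifest 2.0.0): the field is a plain string in the source, and the
// inference metadata moves to the `_inference_fields` meta field.
const newDoc = {
  semantic: 'foo',
  _inference_fields: {
    semantic: { inference: { inference_id: '.some-inference' } },
  },
};
```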


Shipping this as a new format version also makes the transition
invisible: the system simply adapts its behavior to the format version
declared in each artifact's manifest.
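
As a minimal sketch of that version gate (mirroring the `isLegacySemanticTextVersion` helper added in this PR), installation branches on the manifest's `formatVersion`:

```ts
import Semver from 'semver';

// Manifest versions <= 1.0.0 were produced with the legacy semantic_text format.
const isLegacySemanticTextVersion = (manifestVersion: string): boolean =>
  Semver.lte(manifestVersion, '1.0.0');

// The index is then created with the matching setting, so 8.17 (1.0.0)
// and 8.18 (2.0.0) artifacts both keep working.
const settings = {
  'index.mapping.semantic_text.use_legacy_format': isLegacySemanticTextVersion('1.0.0'), // true
};
```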


### How to test

**1. Test that the behavior is not broken for current artifacts**

Run the branch, install the product doc from the prod repository, make
sure the 8.17 artifacts are installed, then check that the feature
still works using the o11y assistant.

**2. Test that the behavior works with the new artifacts**

**Keeping your ES instance up**, configure your local Kibana to use the
dev repository (where the 8.18 artifacts with the new format are
present):

```yaml
xpack.productDocBase.artifactRepositoryUrl: "https://storage.googleapis.com/kibana-ai-assistant-kb-artifacts-dev"
```

Then restart Kibana, confirm that the artifacts get updated to 8.18
automatically, and test that the feature still works as expected
using the o11y assistant.
Pierre Gayvallet, 2025-01-13 10:05:16 +01:00, committed by GitHub
parent 683a768941
commit f77dc3d043
20 changed files with 240 additions and 17 deletions


```diff
@@ -10,12 +10,14 @@ import type { ArtifactManifest, ProductName } from '@kbn/product-doc-common';
 export const getArtifactManifest = ({
   productName,
   stackVersion,
+  formatVersion,
 }: {
   productName: ProductName;
   stackVersion: string;
+  formatVersion: string;
 }): ArtifactManifest => {
   return {
-    formatVersion: '1.0.0',
+    formatVersion,
     productName,
     productVersion: stackVersion,
   };
```


```diff
@@ -8,9 +8,14 @@
 import Path from 'path';
 import AdmZip from 'adm-zip';
 import type { ToolingLog } from '@kbn/tooling-log';
-import { getArtifactName, type ProductName } from '@kbn/product-doc-common';
+import {
+  LATEST_MANIFEST_FORMAT_VERSION,
+  getArtifactName,
+  type ProductName,
+} from '@kbn/product-doc-common';
 import { getArtifactMappings } from '../artifact/mappings';
 import { getArtifactManifest } from '../artifact/manifest';
+import { DEFAULT_ELSER } from './create_index';
 
 export const createArtifact = async ({
   productName,
@@ -31,11 +36,15 @@ export const createArtifact = async ({
   const zip = new AdmZip();
 
-  const mappings = getArtifactMappings('.default-elser');
+  const mappings = getArtifactMappings(DEFAULT_ELSER);
   const mappingFileContent = JSON.stringify(mappings, undefined, 2);
   zip.addFile('mappings.json', Buffer.from(mappingFileContent, 'utf-8'));
 
-  const manifest = getArtifactManifest({ productName, stackVersion });
+  const manifest = getArtifactManifest({
+    productName,
+    stackVersion,
+    formatVersion: LATEST_MANIFEST_FORMAT_VERSION,
+  });
   const manifestFileContent = JSON.stringify(manifest, undefined, 2);
   zip.addFile('manifest.json', Buffer.from(manifestFileContent, 'utf-8'));
```


```diff
@@ -30,6 +30,8 @@ export const createChunkFiles = async ({
   const searchRes = await client.search({
     index,
     size: 10000,
+    // includes inference field meta info in source
+    fields: ['_inference_fields'],
     query: {
       bool: {
         must: [{ term: { product_name: productName } }],
```


```diff
@@ -8,7 +8,7 @@
 import type { Client } from '@elastic/elasticsearch';
 import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';
 
-const DEFAULT_ELSER = '.elser-2-elasticsearch';
+export const DEFAULT_ELSER = '.elser-2-elasticsearch';
 
 const mappings: MappingTypeMapping = {
   dynamic: 'strict',
@@ -46,5 +46,8 @@ export const createTargetIndex = async ({
   await client.indices.create({
     index: indexName,
     mappings,
+    settings: {
+      'index.mapping.semantic_text.use_legacy_format': false,
+    },
   });
 };
```


```diff
@@ -56,12 +56,23 @@ const processDocument = (document: ExtractedDocument) => {
     })
     // remove edit links
     .replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '')
-    // remove empty links
+    // // remove empty links
     .replaceAll('[]()', '')
     // remove image links
     .replaceAll(/\[\]\(\s*[^)]+\s*\)/g, '')
     // limit to 2 consecutive carriage return
     .replaceAll(/\n\n+/g, '\n\n');
 
   document.content_title = document.content_title.split('|')[0].trim();
 
+  // specific to security: remove rule query section as it's usually large without much value for the LLM
+  if (document.product_name === 'security') {
+    const ruleQueryTitle = '### Rule query';
+    const ruleQueryPos = document.content_body.indexOf(ruleQueryTitle);
+    if (ruleQueryPos > -1) {
+      document.content_body = document.content_body.substring(0, ruleQueryPos);
+    }
+  }
+
   return document;
 };
```


```diff
@@ -6,7 +6,7 @@
  */
 
 export { getArtifactName, parseArtifactName } from './src/artifact';
-export { type ArtifactManifest } from './src/manifest';
+export { LATEST_MANIFEST_FORMAT_VERSION, type ArtifactManifest } from './src/manifest';
 export { DocumentationProduct, type ProductName } from './src/product';
 export { isArtifactContentFilePath } from './src/artifact_content';
 export {
```


```diff
@@ -18,14 +18,15 @@ interface SemanticTextArrayField {
 export interface ProductDocumentationAttributes {
   content_title: string;
-  content_body: SemanticTextField;
+  // backward compatibility for the legacy semantic_text mode
+  content_body: string | SemanticTextField;
   product_name: ProductName;
   root_type: string;
   slug: string;
   url: string;
   version: string;
   ai_subtitle: string;
-  ai_summary: SemanticTextField;
-  ai_questions_answered: SemanticTextArrayField;
+  ai_summary: string | SemanticTextField;
+  ai_questions_answered: string[] | SemanticTextArrayField;
   ai_tags: string[];
 }
```


```diff
@@ -7,6 +7,8 @@
 
 import type { ProductName } from './product';
 
+export const LATEST_MANIFEST_FORMAT_VERSION = '2.0.0';
+
 export interface ArtifactManifest {
   formatVersion: string;
   productName: ProductName;
```


```diff
@@ -24,6 +24,7 @@ jest.doMock('./steps', () => {
 export const downloadToDiskMock = jest.fn();
 export const openZipArchiveMock = jest.fn();
 export const loadMappingFileMock = jest.fn();
+export const loadManifestFileMock = jest.fn();
 export const ensureDefaultElserDeployedMock = jest.fn();
 
 jest.doMock('./utils', () => {
@@ -33,6 +34,7 @@ jest.doMock('./utils', () => {
     downloadToDisk: downloadToDiskMock,
     openZipArchive: openZipArchiveMock,
     loadMappingFile: loadMappingFileMock,
+    loadManifestFile: loadManifestFileMock,
     ensureDefaultElserDeployed: ensureDefaultElserDeployedMock,
   };
 });
```


```diff
@@ -10,6 +10,7 @@ import {
   createIndexMock,
   populateIndexMock,
   loadMappingFileMock,
+  loadManifestFileMock,
   openZipArchiveMock,
   validateArtifactArchiveMock,
   fetchArtifactVersionsMock,
@@ -36,6 +37,8 @@ const callOrder = (fn: { mock: { invocationCallOrder: number[] } }): number => {
   return fn.mock.invocationCallOrder[0];
 };
 
+const TEST_FORMAT_VERSION = '2.0.0';
+
 describe('PackageInstaller', () => {
   let logger: MockedLogger;
   let esClient: ReturnType<typeof elasticsearchServiceMock.createElasticsearchClient>;
@@ -55,6 +58,12 @@ describe('PackageInstaller', () => {
       artifactRepositoryUrl,
      kibanaVersion,
     });
+
+    loadManifestFileMock.mockResolvedValue({
+      formatVersion: TEST_FORMAT_VERSION,
+      productName: 'kibana',
+      productVersion: '8.17',
+    });
   });
 
   afterEach(() => {
@@ -62,6 +71,7 @@
     createIndexMock.mockReset();
     populateIndexMock.mockReset();
     loadMappingFileMock.mockReset();
+    loadManifestFileMock.mockReset();
     openZipArchiveMock.mockReset();
     validateArtifactArchiveMock.mockReset();
     fetchArtifactVersionsMock.mockReset();
@@ -99,10 +109,14 @@ describe('PackageInstaller', () => {
     expect(loadMappingFileMock).toHaveBeenCalledTimes(1);
     expect(loadMappingFileMock).toHaveBeenCalledWith(zipArchive);
 
+    expect(loadManifestFileMock).toHaveBeenCalledTimes(1);
+    expect(loadManifestFileMock).toHaveBeenCalledWith(zipArchive);
+
     expect(createIndexMock).toHaveBeenCalledTimes(1);
     expect(createIndexMock).toHaveBeenCalledWith({
       indexName,
       mappings,
+      manifestVersion: TEST_FORMAT_VERSION,
       esClient,
       log: logger,
     });
@@ -111,6 +125,7 @@
     expect(populateIndexMock).toHaveBeenCalledWith({
       indexName,
       archive: zipArchive,
+      manifestVersion: TEST_FORMAT_VERSION,
       esClient,
       log: logger,
     });
@@ -130,6 +145,7 @@
     expect(callOrder(downloadToDiskMock)).toBeLessThan(callOrder(openZipArchiveMock));
     expect(callOrder(openZipArchiveMock)).toBeLessThan(callOrder(loadMappingFileMock));
     expect(callOrder(loadMappingFileMock)).toBeLessThan(callOrder(createIndexMock));
+    expect(callOrder(loadManifestFileMock)).toBeLessThan(callOrder(createIndexMock));
     expect(callOrder(createIndexMock)).toBeLessThan(callOrder(populateIndexMock));
     expect(callOrder(populateIndexMock)).toBeLessThan(
       callOrder(productDocClient.setInstallationSuccessful)
```

```diff
@@ -18,6 +18,7 @@ import {
   downloadToDisk,
   openZipArchive,
   loadMappingFile,
+  loadManifestFile,
   ensureDefaultElserDeployed,
   type ZipArchive,
 } from './utils';
@@ -158,19 +159,25 @@ export class PackageInstaller {
     validateArtifactArchive(zipArchive);
 
-    const mappings = await loadMappingFile(zipArchive);
+    const [manifest, mappings] = await Promise.all([
+      loadManifestFile(zipArchive),
+      loadMappingFile(zipArchive),
+    ]);
+    const manifestVersion = manifest.formatVersion;
 
     const indexName = getProductDocIndexName(productName);
     await createIndex({
       indexName,
       mappings,
+      manifestVersion,
       esClient: this.esClient,
       log: this.log,
     });
 
     await populateIndex({
       indexName,
+      manifestVersion,
       archive: zipArchive,
       esClient: this.esClient,
       log: this.log,
```

```diff
@@ -9,9 +9,12 @@ import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';
 import { loggerMock, type MockedLogger } from '@kbn/logging-mocks';
 import type { ElasticsearchClient } from '@kbn/core/server';
 import { elasticsearchServiceMock } from '@kbn/core/server/mocks';
+import { LATEST_MANIFEST_FORMAT_VERSION } from '@kbn/product-doc-common';
 import { createIndex } from './create_index';
 import { internalElserInferenceId } from '../../../../common/consts';
 
+const LEGACY_SEMANTIC_TEXT_VERSION = '1.0.0';
+
 describe('createIndex', () => {
   let log: MockedLogger;
   let esClient: ElasticsearchClient;
@@ -21,7 +24,7 @@ describe('createIndex', () => {
     esClient = elasticsearchServiceMock.createElasticsearchClient();
   });
 
-  it('calls esClient.indices.create with the right parameters', async () => {
+  it('calls esClient.indices.create with the right parameters for the current manifest version', async () => {
     const mappings: MappingTypeMapping = {
       properties: {},
     };
@@ -30,6 +33,7 @@
     await createIndex({
       indexName,
       mappings,
+      manifestVersion: LATEST_MANIFEST_FORMAT_VERSION,
       log,
       esClient,
     });
@@ -41,6 +45,33 @@
       settings: {
         number_of_shards: 1,
         auto_expand_replicas: '0-1',
+        'index.mapping.semantic_text.use_legacy_format': false,
       },
     });
   });
 
+  it('calls esClient.indices.create with the right parameters for the manifest version 1.0.0', async () => {
+    const mappings: MappingTypeMapping = {
+      properties: {},
+    };
+    const indexName = '.some-index';
+
+    await createIndex({
+      indexName,
+      mappings,
+      manifestVersion: LEGACY_SEMANTIC_TEXT_VERSION,
+      log,
+      esClient,
+    });
+
+    expect(esClient.indices.create).toHaveBeenCalledTimes(1);
+    expect(esClient.indices.create).toHaveBeenCalledWith({
+      index: indexName,
+      mappings,
+      settings: {
+        number_of_shards: 1,
+        auto_expand_replicas: '0-1',
+        'index.mapping.semantic_text.use_legacy_format': true,
+      },
+    });
+  });
@@ -61,6 +92,7 @@
     await createIndex({
       indexName: '.some-index',
       mappings,
+      manifestVersion: LEGACY_SEMANTIC_TEXT_VERSION,
       log,
       esClient,
     });
```


```diff
@@ -9,20 +9,25 @@ import type { Logger } from '@kbn/logging';
 import type { ElasticsearchClient } from '@kbn/core/server';
 import type { MappingTypeMapping, MappingProperty } from '@elastic/elasticsearch/lib/api/types';
 import { internalElserInferenceId } from '../../../../common/consts';
+import { isLegacySemanticTextVersion } from '../utils';
 
 export const createIndex = async ({
   esClient,
   indexName,
+  manifestVersion,
   mappings,
   log,
 }: {
   esClient: ElasticsearchClient;
   indexName: string;
+  manifestVersion: string;
   mappings: MappingTypeMapping;
   log: Logger;
 }) => {
   log.debug(`Creating index ${indexName}`);
+
+  const legacySemanticText = isLegacySemanticTextVersion(manifestVersion);
 
   overrideInferenceId(mappings, internalElserInferenceId);
 
   await esClient.indices.create({
@@ -31,6 +36,7 @@ export const createIndex = async ({
     settings: {
       number_of_shards: 1,
       auto_expand_replicas: '0-1',
+      'index.mapping.semantic_text.use_legacy_format': legacySemanticText,
     },
   });
 };
```


```diff
@@ -8,10 +8,13 @@
 import { times } from 'lodash';
 import { loggerMock, type MockedLogger } from '@kbn/logging-mocks';
 import { elasticsearchServiceMock } from '@kbn/core/server/mocks';
+import { LATEST_MANIFEST_FORMAT_VERSION } from '@kbn/product-doc-common';
 import { internalElserInferenceId } from '../../../../common/consts';
 import type { ZipArchive } from '../utils/zip_archive';
 import { populateIndex } from './populate_index';
 
+const LEGACY_SEMANTIC_TEXT_VERSION = '1.0.0';
+
 const createMockArchive = (entries: Record<string, string>): ZipArchive => {
   return {
     hasEntry: (entryPath) => Object.keys(entries).includes(entryPath),
@@ -44,6 +47,7 @@
     await populateIndex({
       indexName: '.foo',
+      manifestVersion: LATEST_MANIFEST_FORMAT_VERSION,
       archive,
       log,
       esClient,
@@ -59,6 +63,7 @@
     await populateIndex({
       indexName: '.foo',
+      manifestVersion: LATEST_MANIFEST_FORMAT_VERSION,
       archive,
       log,
       esClient,
@@ -77,6 +82,47 @@
   });
 
   it('rewrites the inference_id of semantic fields', async () => {
+    const archive = createMockArchive({
+      'content/content-0.ndjson': JSON.stringify({
+        semantic: 'foo',
+        _inference_fields: {
+          semantic: {
+            inference: {
+              inference_id: '.some-inference',
+            },
+          },
+        },
+      }),
+    });
+
+    await populateIndex({
+      indexName: '.foo',
+      manifestVersion: LATEST_MANIFEST_FORMAT_VERSION,
+      archive,
+      log,
+      esClient,
+    });
+
+    expect(esClient.bulk).toHaveBeenCalledTimes(1);
+    expect(esClient.bulk).toHaveBeenCalledWith({
+      refresh: false,
+      operations: [
+        { index: { _index: '.foo' } },
+        {
+          semantic: 'foo',
+          _inference_fields: {
+            semantic: {
+              inference: {
+                inference_id: internalElserInferenceId,
+              },
+            },
+          },
+        },
+      ],
+    });
+  });
+
+  it('rewrites the inference_id of semantic fields for legacy semantic_field', async () => {
     const archive = createMockArchive({
       'content/content-0.ndjson': JSON.stringify({
         semantic: { text: 'foo', inference: { inference_id: '.some-inference' } },
@@ -85,6 +131,7 @@
     await populateIndex({
       indexName: '.foo',
+      manifestVersion: LEGACY_SEMANTIC_TEXT_VERSION,
       archive,
       log,
       esClient,
```


```diff
@@ -11,27 +11,32 @@ import type { ElasticsearchClient } from '@kbn/core/server';
 import { isArtifactContentFilePath } from '@kbn/product-doc-common';
 import { internalElserInferenceId } from '../../../../common/consts';
 import type { ZipArchive } from '../utils/zip_archive';
+import { isLegacySemanticTextVersion } from '../utils';
 
 export const populateIndex = async ({
   esClient,
   indexName,
+  manifestVersion,
   archive,
   log,
 }: {
   esClient: ElasticsearchClient;
   indexName: string;
+  manifestVersion: string;
   archive: ZipArchive;
   log: Logger;
 }) => {
   log.debug(`Starting populating index ${indexName}`);
 
+  const legacySemanticText = isLegacySemanticTextVersion(manifestVersion);
+
   const contentEntries = archive.getEntryPaths().filter(isArtifactContentFilePath);
   for (let i = 0; i < contentEntries.length; i++) {
     const entryPath = contentEntries[i];
     log.debug(`Indexing content for entry ${entryPath}`);
     const contentBuffer = await archive.getEntryContent(entryPath);
-    await indexContentFile({ indexName, esClient, contentBuffer });
+    await indexContentFile({ indexName, esClient, contentBuffer, legacySemanticText });
   }
 
   log.debug(`Done populating index ${indexName}`);
@@ -41,10 +46,12 @@ const indexContentFile = async ({
   indexName,
   contentBuffer,
   esClient,
+  legacySemanticText,
 }: {
   indexName: string;
   contentBuffer: Buffer;
   esClient: ElasticsearchClient;
+  legacySemanticText: boolean;
 }) => {
   const fileContent = contentBuffer.toString('utf-8');
   const lines = fileContent.split('\n');
@@ -55,7 +62,13 @@
     .map((line) => {
       return JSON.parse(line);
     })
-    .map((doc) => rewriteInferenceId(doc, internalElserInferenceId));
+    .map((doc) =>
+      rewriteInferenceId({
+        document: doc,
+        inferenceId: internalElserInferenceId,
+        legacySemanticText,
+      })
+    );
 
   const operations = documents.reduce((ops, document) => {
     ops!.push(...[{ index: { _index: indexName } }, document]);
@@ -73,9 +86,18 @@
   }
 };
 
-const rewriteInferenceId = (document: Record<string, any>, inferenceId: string) => {
+const rewriteInferenceId = ({
+  document,
+  inferenceId,
+  legacySemanticText,
+}: {
+  document: Record<string, any>;
+  inferenceId: string;
+  legacySemanticText: boolean;
+}) => {
+  const semanticFieldsRoot = legacySemanticText ? document : document._inference_fields;
   // we don't need to handle nested fields, we don't have any and won't.
-  Object.values(document).forEach((field) => {
+  Object.values(semanticFieldsRoot ?? {}).forEach((field: any) => {
     if (field.inference) {
       field.inference.inference_id = inferenceId;
     }
```

```diff
@@ -9,3 +9,4 @@ export { downloadToDisk } from './download';
 export { openZipArchive, type ZipArchive } from './zip_archive';
 export { loadManifestFile, loadMappingFile } from './archive_accessors';
 export { ensureDefaultElserDeployed } from './ensure_default_elser_deployed';
+export { isLegacySemanticTextVersion } from './manifest_versions';
```


```diff
@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { isLegacySemanticTextVersion } from './manifest_versions';
+
+describe('isLegacySemanticTextVersion', () => {
+  it('returns true for version 1.0.0', () => {
+    expect(isLegacySemanticTextVersion('1.0.0')).toBe(true);
+  });
+
+  it('returns false for version 2.0.0 and higher', () => {
+    expect(isLegacySemanticTextVersion('2.0.0')).toBe(false);
+    expect(isLegacySemanticTextVersion('4.92.3')).toBe(false);
+  });
+});
```


```diff
@@ -0,0 +1,15 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import Semver from 'semver';
+
+/**
+ * checks if the provided manifest version was a version where legacy semantic_text behavior was being used
+ */
+export const isLegacySemanticTextVersion = (manifestVersion: string): boolean => {
+  return Semver.lte(manifestVersion, '1.0.0');
+};
```


```diff
@@ -20,6 +20,31 @@ const createHit = (
 describe('mapResult', () => {
   it('returns the expected shape', () => {
     const input = createHit({
+      content_title: 'content_title',
+      content_body: 'content_body',
+      product_name: 'kibana',
+      root_type: 'documentation',
+      slug: 'foo.html',
+      url: 'http://lost.com/foo.html',
+      version: '8.16',
+      ai_subtitle: 'ai_subtitle',
+      ai_summary: 'ai_summary',
+      ai_questions_answered: ['question A'],
+      ai_tags: ['foo', 'bar', 'test'],
+    });
+
+    const output = mapResult(input);
+
+    expect(output).toEqual({
+      content: 'content_body',
+      productName: 'kibana',
+      title: 'content_title',
+      url: 'http://lost.com/foo.html',
+    });
+  });
+
+  it('returns the expected shape for legacy semantic_text fields', () => {
+    const input = createHit({
       content_title: 'content_title',
       content_body: { text: 'content_body' },
```

```diff
@@ -10,9 +10,10 @@ import type { ProductDocumentationAttributes } from '@kbn/product-doc-common';
 import type { DocSearchResult } from '../types';
 
 export const mapResult = (docHit: SearchHit<ProductDocumentationAttributes>): DocSearchResult => {
+  const content = docHit._source!.content_body;
   return {
     title: docHit._source!.content_title,
-    content: docHit._source!.content_body.text,
+    content: typeof content === 'string' ? content : content.text,
     url: docHit._source!.url,
     productName: docHit._source!.product_name,
   };
```