[KB] create @kbn/product-doc-artifact-builder package (#193847)

## Summary

Related https://github.com/elastic/kibana/issues/193473

Add the initial implementation of the knowledge base artifact builder. This
PR only introduces the builder script; it does not include any automation
around running it.

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
This commit is contained in:
Pierre Gayvallet 2024-10-07 14:21:09 +02:00 committed by GitHub
parent 67f2b7cad2
commit 1ab1add68e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
30 changed files with 1007 additions and 1 deletions

1
.github/CODEOWNERS vendored
View file

@ -683,6 +683,7 @@ packages/presentation/presentation_containers @elastic/kibana-presentation
src/plugins/presentation_panel @elastic/kibana-presentation
packages/presentation/presentation_publishing @elastic/kibana-presentation
src/plugins/presentation_util @elastic/kibana-presentation
x-pack/packages/ai-infra/product-doc-artifact-builder @elastic/appex-ai-infra
x-pack/plugins/observability_solution/profiling_data_access @elastic/obs-ux-infra_services-team
x-pack/plugins/observability_solution/profiling @elastic/obs-ux-infra_services-team
packages/kbn-profiling-utils @elastic/obs-ux-infra_services-team

View file

@ -1456,6 +1456,7 @@
"@kbn/picomatcher": "link:packages/kbn-picomatcher",
"@kbn/plugin-generator": "link:packages/kbn-plugin-generator",
"@kbn/plugin-helpers": "link:packages/kbn-plugin-helpers",
"@kbn/product-doc-artifact-builder": "link:x-pack/packages/ai-infra/product-doc-artifact-builder",
"@kbn/repo-file-maps": "link:packages/kbn-repo-file-maps",
"@kbn/repo-linter": "link:packages/kbn-repo-linter",
"@kbn/repo-path": "link:packages/kbn-repo-path",

View file

@ -0,0 +1,11 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/
// Load the repo's Node.js environment setup before requiring any package code,
// then delegate to the product-doc artifact builder CLI.
require('../src/setup_node_env');
require('@kbn/product-doc-artifact-builder').runScript();

View file

@ -116,6 +116,7 @@ export const IGNORE_DIRECTORY_GLOBS = [
'src/babel-*',
'packages/*',
'packages/core/*/*',
'x-pack/packages/ai-infra/*',
'packages/kbn-pm/src/utils/__fixtures__/*',
'packages/kbn-check-prod-native-modules-cli/integration_tests/__fixtures__/*/node_modules/*',
'x-pack/dev-tools',

View file

@ -1360,6 +1360,8 @@
"@kbn/presentation-publishing/*": ["packages/presentation/presentation_publishing/*"],
"@kbn/presentation-util-plugin": ["src/plugins/presentation_util"],
"@kbn/presentation-util-plugin/*": ["src/plugins/presentation_util/*"],
"@kbn/product-doc-artifact-builder": ["x-pack/packages/ai-infra/product-doc-artifact-builder"],
"@kbn/product-doc-artifact-builder/*": ["x-pack/packages/ai-infra/product-doc-artifact-builder/*"],
"@kbn/profiling-data-access-plugin": ["x-pack/plugins/observability_solution/profiling_data_access"],
"@kbn/profiling-data-access-plugin/*": ["x-pack/plugins/observability_solution/profiling_data_access/*"],
"@kbn/profiling-plugin": ["x-pack/plugins/observability_solution/profiling"],
@ -2078,4 +2080,4 @@
"@kbn/ambient-storybook-types"
]
}
}
}

View file

@ -0,0 +1,3 @@
# @kbn/product-doc-artifact-builder
Script to build the knowledge base artifacts

View file

@ -0,0 +1,8 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
export { runScript } from './src/command';

View file

@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
module.exports = {
  preset: '@kbn/test/jest_node',
  // paths are relative to the Kibana repo root (four levels up from this package)
  rootDir: '../../../..',
  roots: ['<rootDir>/x-pack/packages/ai-infra/product-doc-artifact-builder'],
};

View file

@ -0,0 +1,6 @@
{
"type": "shared-common",
"id": "@kbn/product-doc-artifact-builder",
"owner": "@elastic/appex-ai-infra",
"devOnly": true
}

View file

@ -0,0 +1,6 @@
{
"name": "@kbn/product-doc-artifact-builder",
"private": true,
"version": "1.0.0",
"license": "Elastic License 2.0"
}

View file

@ -0,0 +1,16 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
/**
 * Build the file name for a product documentation artifact.
 *
 * Format: `kibana-kb-{productName}-{productVersion}.zip`, fully lowercased.
 */
export const getArtifactName = ({
  productName,
  productVersion,
}: {
  productName: string;
  productVersion: string;
}): string => {
  const rawName = ['kibana-kb-', productName, '-', productVersion, '.zip'].join('');
  return rawName.toLowerCase();
};

View file

@ -0,0 +1,26 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
/** Manifest file embedded in each artifact, describing its content. */
export interface ArtifactManifest {
  formatVersion: string;
  productName: string;
  productVersion: string;
}

/**
 * Create the manifest for an artifact built for the given product and
 * stack version. The manifest format version is currently fixed at 1.0.0.
 */
export const getArtifactManifest = ({
  productName,
  stackVersion,
}: {
  productName: string;
  stackVersion: string;
}): ArtifactManifest => {
  const manifest: ArtifactManifest = {
    formatVersion: '1.0.0',
    productName,
    productVersion: stackVersion,
  };
  return manifest;
};

View file

@ -0,0 +1,39 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';
/**
 * Mappings for the index bundled in a documentation artifact.
 * All `semantic_text` fields are bound to the provided inference endpoint.
 */
export const getArtifactMappings = (inferenceEndpoint: string): MappingTypeMapping => {
  // every semantic_text property shares the same definition
  const semanticTextField = {
    type: 'semantic_text' as const,
    inference_id: inferenceEndpoint,
  };
  return {
    dynamic: 'strict',
    properties: {
      content_title: { type: 'text' },
      content_body: { ...semanticTextField },
      product_name: { type: 'keyword' },
      root_type: { type: 'keyword' },
      slug: { type: 'keyword' },
      url: { type: 'keyword' },
      version: { type: 'version' },
      ai_subtitle: { ...semanticTextField },
      ai_summary: { ...semanticTextField },
      ai_questions_answered: { ...semanticTextField },
      ai_tags: { type: 'keyword' },
    },
  };
};

View file

@ -0,0 +1,11 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
/**
 * The allowed product names, as found in the source cluster's documents.
 * Also used as the list of valid choices for the builder's `productName` CLI option.
 */
export const sourceProductNames = ['Kibana', 'Elasticsearch', 'Security', 'Observability'];

View file

@ -0,0 +1,161 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import Path from 'path';
import { Client } from '@elastic/elasticsearch';
import { ToolingLog } from '@kbn/tooling-log';
import {
// checkConnectivity,
createTargetIndex,
extractDocumentation,
indexDocuments,
installElser,
createChunkFiles,
createArtifact,
cleanupFolders,
deleteIndex,
} from './tasks';
import type { TaskConfig } from './types';
/**
 * Instantiate the ES client used to read the documentation
 * from the source cluster.
 */
const getSourceClient = (config: TaskConfig) => {
  const { sourceClusterUrl, sourceClusterUsername, sourceClusterPassword } = config;
  return new Client({
    compression: true,
    nodes: [sourceClusterUrl],
    sniffOnStart: false,
    auth: {
      username: sourceClusterUsername,
      password: sourceClusterPassword,
    },
  });
};
/**
 * Instantiate the ES client for the embedding cluster,
 * where documents are indexed to generate their embeddings.
 */
const getEmbeddingClient = (config: TaskConfig) => {
  const { embeddingClusterUrl, embeddingClusterUsername, embeddingClusterPassword } = config;
  return new Client({
    compression: true,
    nodes: [embeddingClusterUrl],
    auth: {
      username: embeddingClusterUsername,
      password: embeddingClusterPassword,
    },
    // generating embeddings takes time - allow up to 10 minutes per request
    requestTimeout: 10 * 60 * 1000,
  });
};
/**
 * Entry point of the build: prepares the clusters and folders,
 * then builds one artifact per requested product.
 */
export const buildArtifacts = async (config: TaskConfig) => {
  const logger = new ToolingLog({
    level: 'info',
    writeTo: process.stdout,
  });

  const productList = config.productNames.join(',');
  logger.info(
    `Starting building artifacts for version=[${config.stackVersion}] and products=[${productList}]`
  );

  const sourceClient = getSourceClient(config);
  const embeddingClient = getEmbeddingClient(config);

  // log.info('Checking connectivity against clusters');
  // await checkConnectivity({ sourceClient, embeddingClient });

  // start from a clean build folder
  await cleanupFolders({ folders: [config.buildFolder] });

  logger.info('Ensuring ELSER is installed on the embedding cluster');
  await installElser({ client: embeddingClient });

  for (const productName of config.productNames) {
    await buildArtifact({
      productName,
      stackVersion: config.stackVersion,
      buildFolder: config.buildFolder,
      targetFolder: config.targetFolder,
      sourceClient,
      embeddingClient,
      log: logger,
    });
  }

  // remove temporary build files once all artifacts are generated
  await cleanupFolders({ folders: [config.buildFolder] });
};
/**
 * Build the artifact for a single product: extract the documentation from the
 * source cluster, index it into a temporary index on the embedding cluster
 * (generating the embeddings), dump that index into chunk files, and package
 * everything into a zip artifact.
 *
 * The temporary index is deleted even if an intermediate step fails, so
 * aborted runs don't leave stale indices on the embedding cluster.
 */
const buildArtifact = async ({
  productName,
  stackVersion,
  buildFolder,
  targetFolder,
  embeddingClient,
  sourceClient,
  log,
}: {
  productName: string;
  stackVersion: string;
  buildFolder: string;
  targetFolder: string;
  sourceClient: Client;
  embeddingClient: Client;
  log: ToolingLog;
}) => {
  log.info(`Starting building artifact for product [${productName}] and version [${stackVersion}]`);

  const targetIndex = getTargetIndexName({ productName, stackVersion });

  const documents = await extractDocumentation({
    client: sourceClient,
    index: 'search-docs-1',
    log,
    productName,
    stackVersion,
  });

  await createTargetIndex({
    client: embeddingClient,
    indexName: targetIndex,
  });

  try {
    await indexDocuments({
      client: embeddingClient,
      index: targetIndex,
      documents,
      log,
    });

    await createChunkFiles({
      index: targetIndex,
      client: embeddingClient,
      productName,
      destFolder: Path.join(buildFolder, productName),
      log,
    });

    await createArtifact({
      buildFolder: Path.join(buildFolder, productName),
      targetFolder,
      productName,
      stackVersion,
      log,
    });
  } finally {
    // always delete the temporary index, even when a step above throws
    await deleteIndex({
      indexName: targetIndex,
      client: embeddingClient,
      log,
    });
  }

  log.info(`Finished building artifact for product [${productName}] and version [${stackVersion}]`);
};
/**
 * Name of the temporary index used on the embedding cluster while
 * building the artifact for the given product/version.
 */
const getTargetIndexName = ({
  productName,
  stackVersion,
}: {
  productName: string;
  stackVersion: string;
}) => {
  const indexName = ['kb-artifact-builder', productName, stackVersion].join('-');
  return indexName.toLowerCase();
};

View file

@ -0,0 +1,97 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import Path from 'path';
import { REPO_ROOT } from '@kbn/repo-info';
import yargs from 'yargs';
import type { TaskConfig } from './types';
import { buildArtifacts } from './build_artifacts';
import { sourceProductNames } from './artifact/product_name';
/**
 * Define the CLI options for the artifact builder command.
 *
 * Cluster connection settings are required; they can be provided on the
 * command line or through the corresponding `KIBANA_*` environment
 * variables (wired in as yargs defaults).
 */
function options(y: yargs.Argv) {
  return y
    .option('productName', {
      describe: 'name of products to generate documentation for',
      array: true,
      choices: sourceProductNames,
      default: ['Kibana'],
    })
    .option('stackVersion', {
      describe: 'The stack version to generate documentation for',
      string: true,
      default: '8.16', // TODO: master is on 9.0 now, not sure we can default to version in package.json?
    })
    .option('targetFolder', {
      describe: 'The folder to generate the artifacts in',
      string: true,
      default: Path.join(REPO_ROOT, 'build', 'kb-artifacts'),
    })
    .option('buildFolder', {
      describe: 'The folder to use for temporary files',
      string: true,
      default: Path.join(REPO_ROOT, 'build', 'temp-kb-artifacts'),
    })
    // NOTE(review): demandOption combined with an env-var default - verify yargs
    // still rejects when the env var is unset (default would be `undefined`).
    .option('sourceClusterUrl', {
      describe: 'The source cluster url',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_SOURCE_CLUSTER_URL,
    })
    .option('sourceClusterUsername', {
      describe: 'The source cluster username',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_SOURCE_CLUSTER_USERNAME,
    })
    .option('sourceClusterPassword', {
      describe: 'The source cluster password',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_SOURCE_CLUSTER_PASSWORD,
    })
    .option('embeddingClusterUrl', {
      describe: 'The embedding cluster url',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_EMBEDDING_CLUSTER_URL,
    })
    .option('embeddingClusterUsername', {
      describe: 'The embedding cluster username',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_EMBEDDING_CLUSTER_USERNAME,
    })
    .option('embeddingClusterPassword', {
      describe: 'The embedding cluster password',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_EMBEDDING_CLUSTER_PASSWORD,
    })
    .locale('en');
}
/**
 * CLI entry point: parse the command-line arguments and run the artifact build.
 */
export function runScript() {
  yargs(process.argv.slice(2))
    .command('*', 'Build knowledge base artifacts', options, async (argv) => {
      // argv contains additional yargs-injected entries - only forward the declared options
      const {
        productName,
        stackVersion,
        buildFolder,
        targetFolder,
        sourceClusterUrl,
        sourceClusterUsername,
        sourceClusterPassword,
        embeddingClusterUrl,
        embeddingClusterUsername,
        embeddingClusterPassword,
      } = argv;
      const taskConfig: TaskConfig = {
        productNames: productName,
        stackVersion,
        buildFolder,
        targetFolder,
        sourceClusterUrl: sourceClusterUrl!,
        sourceClusterUsername: sourceClusterUsername!,
        sourceClusterPassword: sourceClusterPassword!,
        embeddingClusterUrl: embeddingClusterUrl!,
        embeddingClusterUsername: embeddingClusterUsername!,
        embeddingClusterPassword: embeddingClusterPassword!,
      };
      return buildArtifacts(taskConfig);
    })
    .parse();
}

View file

@ -0,0 +1,18 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { Client } from '@elastic/elasticsearch';
/**
 * Ensure both clusters are reachable by pinging them in parallel.
 * Rejects if either ping fails.
 */
export const checkConnectivity = async ({
  sourceClient,
  embeddingClient,
}: {
  sourceClient: Client;
  embeddingClient: Client;
}) => {
  const pings = [sourceClient.ping(), embeddingClient.ping()];
  await Promise.all(pings);
};

View file

@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { rm } from 'fs/promises';
/**
 * Delete the given folders recursively, in parallel.
 * Missing folders are ignored (`force: true`).
 */
export const cleanupFolders = async ({ folders }: { folders: string[] }) => {
  const deletions = folders.map((folder) => rm(folder, { recursive: true, force: true }));
  await Promise.all(deletions);
};

View file

@ -0,0 +1,51 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import Path from 'path';
import { mkdir } from 'fs/promises';
import AdmZip from 'adm-zip';
import type { ToolingLog } from '@kbn/tooling-log';
import { getArtifactMappings } from '../artifact/mappings';
import { getArtifactManifest } from '../artifact/manifest';
import { getArtifactName } from '../artifact/artifact_name';
/**
 * Package the content chunk files from `buildFolder`, plus generated
 * `mappings.json` and `manifest.json` files, into a zip artifact
 * written to `targetFolder`.
 */
export const createArtifact = async ({
  productName,
  stackVersion,
  buildFolder,
  targetFolder,
  log,
}: {
  buildFolder: string;
  targetFolder: string;
  productName: string;
  stackVersion: string;
  log: ToolingLog;
}) => {
  log.info(
    `Starting to create artifact from build folder [${buildFolder}] into target [${targetFolder}]`
  );

  const zip = new AdmZip();

  // mappings for the index the artifact content will be installed into
  const mappings = getArtifactMappings('.default-elser');
  const mappingFileContent = JSON.stringify(mappings, undefined, 2);
  zip.addFile('mappings.json', Buffer.from(mappingFileContent, 'utf-8'));

  // manifest describing the product/version the artifact was built for
  const manifest = getArtifactManifest({ productName, stackVersion });
  const manifestFileContent = JSON.stringify(manifest, undefined, 2);
  zip.addFile('manifest.json', Buffer.from(manifestFileContent, 'utf-8'));

  // the ndjson chunk files produced earlier in the build
  zip.addLocalFolder(buildFolder, 'content');

  const artifactName = getArtifactName({
    productName,
    productVersion: stackVersion,
  });

  // ensure the target folder exists before writing the archive
  await mkdir(targetFolder, { recursive: true });
  zip.writeZip(Path.join(targetFolder, artifactName));

  log.info(`Finished creating artifact [${artifactName}]`);
};

View file

@ -0,0 +1,68 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import Path from 'path';
import Fs from 'fs/promises';
import type { Client } from '@elastic/elasticsearch';
import type { ToolingLog } from '@kbn/tooling-log';
// approximate maximum size, in bytes, of a single chunk file
const fileSizeLimit = 250_000;

/**
 * Dump the documents of the given index into ndjson chunk files of roughly
 * `fileSizeLimit` bytes each, written to `destFolder`.
 */
export const createChunkFiles = async ({
  index,
  productName,
  destFolder,
  client,
  log,
}: {
  index: string;
  productName: string;
  destFolder: string;
  client: Client;
  log: ToolingLog;
}) => {
  log.info(`Starting to create chunk files in directory [${destFolder}]`);

  const searchRes = await client.search({
    index,
    size: 10000,
    query: {
      bool: {
        must: [{ term: { product_name: productName } }],
      },
    },
  });

  await Fs.mkdir(destFolder, { recursive: true });

  const hits = searchRes.hits.hits;
  let chunkNumber = 1;
  let chunkDocCount = 0;
  let chunkContent = '';

  // write the accumulated content to disk and reset the accumulator
  const flushChunk = async () => {
    const chunkFileName = `content-${chunkNumber}.ndjson`;
    log.info(`Writing chunk file ${chunkFileName} containing ${chunkDocCount} docs`);
    await Fs.writeFile(Path.join(destFolder, chunkFileName), chunkContent);
    chunkContent = '';
    chunkDocCount = 0;
    chunkNumber++;
  };

  for (const [docIndex, hit] of hits.entries()) {
    chunkContent += JSON.stringify(hit._source) + '\n';
    chunkDocCount++;
    const overSizeLimit = Buffer.byteLength(chunkContent, 'utf8') > fileSizeLimit;
    const isLastDoc = docIndex === hits.length - 1;
    // flush once the chunk grows past the size limit, and after the final doc
    if (overSizeLimit || isLastDoc) {
      await flushChunk();
    }
  }

  log.info(`Finished creating chunk files`);
};

View file

@ -0,0 +1,51 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { Client } from '@elastic/elasticsearch';
import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';
// every semantic_text property points at the inference endpoint
// created by the installElser task
const semanticTextProperty = {
  type: 'semantic_text' as const,
  inference_id: 'kibana-elser2',
};

// NOTE(review): near-duplicate of getArtifactMappings in ../artifact/mappings,
// with a different inference endpoint - consider consolidating.
const mappings: MappingTypeMapping = {
  dynamic: 'strict',
  properties: {
    content_title: { type: 'text' },
    content_body: { ...semanticTextProperty },
    product_name: { type: 'keyword' },
    root_type: { type: 'keyword' },
    slug: { type: 'keyword' },
    url: { type: 'keyword' },
    version: { type: 'version' },
    ai_subtitle: { ...semanticTextProperty },
    ai_summary: { ...semanticTextProperty },
    ai_questions_answered: { ...semanticTextProperty },
    ai_tags: { type: 'keyword' },
  },
};

/**
 * Create the temporary index documents are indexed into
 * on the embedding cluster.
 */
export const createTargetIndex = async ({
  indexName,
  client,
}: {
  indexName: string;
  client: Client;
}) => {
  await client.indices.create({
    index: indexName,
    mappings,
  });
};

View file

@ -0,0 +1,27 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { Client } from '@elastic/elasticsearch';
import type { ToolingLog } from '@kbn/tooling-log';
/**
 * Delete the given index. 404 responses are ignored, so deleting an
 * already-absent index is a no-op.
 */
export const deleteIndex = async ({
  indexName,
  client,
  log,
}: {
  indexName: string;
  client: Client;
  log: ToolingLog;
}) => {
  log.info(`Deleting index ${indexName}`);
  const request = { index: indexName };
  await client.indices.delete(request, { ignore: [404] });
};

View file

@ -0,0 +1,102 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { Client } from '@elastic/elasticsearch';
import type { SearchHit } from '@elastic/elasticsearch/lib/api/types';
import type { ToolingLog } from '@kbn/tooling-log';
/** the list of fields to import from the source cluster */
const fields = [
  'content_title',
  'content_body',
  'product_name', // "Kibana", "Elasticsearch"
  'category', // "documentation"
  'slug',
  'url',
  'version',
  'ai_fields.ai_subtitle',
  'ai_fields.ai_summary',
  'ai_fields.ai_questions_answered',
  'ai_fields.ai_tags',
];

/**
 * Shape of a documentation document after extraction: the source's
 * `ai_fields.*` values are flattened to top-level properties, and
 * `root_type` is set to a fixed value (see `convertHit`).
 */
export interface ExtractedDocument {
  content_title: string;
  content_body: string;
  product_name: string;
  root_type: string;
  slug: string;
  url: string;
  version: string;
  ai_subtitle: string;
  ai_summary: string;
  ai_questions_answered: string[];
  ai_tags: string[];
}
/**
 * Map a raw search hit from the source cluster to an ExtractedDocument,
 * flattening `ai_fields.*` and tagging the doc with the fixed
 * `documentation` root type.
 */
const convertHit = (hit: SearchHit<any>): ExtractedDocument => {
  const source = hit._source;
  const aiFields = source.ai_fields;
  return {
    content_title: source.content_title,
    content_body: source.content_body,
    product_name: source.product_name,
    root_type: 'documentation',
    slug: source.slug,
    url: source.url,
    version: source.version,
    ai_subtitle: aiFields.ai_subtitle,
    ai_summary: aiFields.ai_summary,
    ai_questions_answered: aiFields.ai_questions_answered,
    ai_tags: aiFields.ai_tags,
  };
};
/**
 * Fetch the documentation documents for the given product/version from the
 * source cluster. Only documents with AI-enriched fields
 * (`ai_fields.ai_summary` present) are extracted.
 *
 * Throws when more than 10k documents match, since a single (scroll-less)
 * search can only return 10k hits and we would silently truncate otherwise.
 */
export const extractDocumentation = async ({
  client,
  index,
  stackVersion,
  productName,
  log,
}: {
  client: Client;
  index: string;
  stackVersion: string;
  productName: string;
  log: ToolingLog;
}) => {
  log.info(`Starting to extract documents from source cluster`);

  const response = await client.search({
    index,
    size: 10000,
    // by default the reported total is capped at 10k (relation "gte"), which
    // would defeat the >10k safety check below - request the accurate count
    track_total_hits: true,
    query: {
      bool: {
        must: [
          { term: { product_name: productName } },
          { term: { version: stackVersion } },
          { exists: { field: 'ai_fields.ai_summary' } },
        ],
      },
    },
    fields,
  });

  const totalHits =
    typeof response.hits.total === 'number'
      ? response.hits.total // This format is to be removed in 8.0
      : response.hits.total?.value ?? response.hits.hits.length;

  if (totalHits > 10_000) {
    throw new Error('Found more than 10k documents to extract - aborting');
  }

  log.info(
    `Finished extracting documents from source. ${response.hits.hits.length} documents were extracted`
  );

  return response.hits.hits.map(convertHit);
};

View file

@ -0,0 +1,17 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
// Barrel module re-exporting the individual build tasks.
export { extractDocumentation } from './extract_documentation';
export { indexDocuments } from './index_documents';
export { createTargetIndex } from './create_index';
export { installElser } from './install_elser';
export { createChunkFiles } from './create_chunk_files';
export { performSemanticSearch } from './perform_semantic_search';
export { checkConnectivity } from './check_connectivity';
export { createArtifact } from './create_artifact';
export { cleanupFolders } from './cleanup_folders';
export { deleteIndex } from './delete_index';

View file

@ -0,0 +1,50 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { chunk as toChunks } from 'lodash';
import type { Client } from '@elastic/elasticsearch';
import type { BulkRequest } from '@elastic/elasticsearch/lib/api/types';
import type { ToolingLog } from '@kbn/tooling-log';
import type { ExtractedDocument } from './extract_documentation';
// number of documents sent per bulk request
const indexingChunkSize = 10;

/**
 * Index the extracted documents into the target index in bulk chunks,
 * waiting for each bulk to be refreshed.
 *
 * The bulk API returns a 200 even when individual operations fail, so the
 * per-response `errors` flag is checked and the first item error is thrown,
 * instead of silently dropping documents.
 */
export const indexDocuments = async ({
  index,
  client,
  documents,
  log,
}: {
  index: string;
  documents: ExtractedDocument[];
  client: Client;
  log: ToolingLog;
}) => {
  const chunks = toChunks(documents, indexingChunkSize);
  log.info(`Starting indexing process`);
  for (let i = 0; i < chunks.length; i++) {
    const chunk = chunks[i];
    const before = Date.now();
    // interleave action metadata and document payloads, as the bulk API expects
    const operations: BulkRequest['operations'] = chunk.flatMap((document) => [
      { index: { _index: index } },
      document,
    ]);
    const response = await client.bulk(
      {
        refresh: 'wait_for',
        operations,
      },
      // generating embeddings at index time is slow - use a long timeout
      { requestTimeout: 10 * 60 * 1000 }
    );
    if (response.errors) {
      const firstError = response.items.find((item) => item.index?.error)?.index?.error;
      throw new Error(`Error indexing documents: ${JSON.stringify(firstError)}`);
    }
    const duration = Date.now() - before;
    log.info(`Indexed ${i + 1} of ${chunks.length} chunks (took ${duration}ms)`);
  }
  log.info(`Finished indexing process`);
};

View file

@ -0,0 +1,74 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { Client } from '@elastic/elasticsearch';
// ID of the inference endpoint to create/use on the embedding cluster
const inferenceEndpointId = 'kibana-elser2';
// ID of the ELSER model backing the inference endpoint
const elserModelId = '.elser_model_2';

/**
 * Ensure an ELSER sparse-embedding inference endpoint exists on the
 * embedding cluster, creating it if missing, then wait until the
 * underlying model is allocated to at least one node.
 */
export const installElser = async ({ client }: { client: Client }) => {
  const getInferenceRes = await client.inference.get(
    {
      task_type: 'sparse_embedding',
      // use the shared constant rather than duplicating the literal
      inference_id: inferenceEndpointId,
    },
    { ignore: [404] }
  );

  const installed = (getInferenceRes.endpoints ?? []).some(
    (endpoint) => endpoint.inference_id === inferenceEndpointId
  );

  if (!installed) {
    await client.inference.put({
      task_type: 'sparse_embedding',
      inference_id: inferenceEndpointId,
      inference_config: {
        service: 'elser',
        service_settings: {
          num_allocations: 1,
          num_threads: 1,
          model_id: elserModelId,
        },
        task_settings: {},
      },
    });
  }

  await waitUntilDeployed({
    modelId: elserModelId,
    client,
  });
};
/**
 * Poll the trained-model stats API until the given model is allocated to at
 * least one node, checking up to `maxRetries` times with `delay` ms between
 * attempts. Throws when the model is still not deployed after all retries.
 */
const waitUntilDeployed = async ({
  modelId,
  client,
  maxRetries = 20,
  delay = 2000,
}: {
  modelId: string;
  client: Client;
  maxRetries?: number;
  delay?: number;
}) => {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const statsRes = await client.ml.getTrainedModelsStats({
      model_id: modelId,
    });
    const deploymentStats = statsRes.trained_model_stats[0]?.deployment_stats;
    // @ts-expect-error deploymentStats.nodes not defined as array even if it is.
    if (deploymentStats && deploymentStats.nodes.length > 0) {
      return;
    }
    await sleep(delay);
  }
  throw new Error(`Timeout waiting for ML model ${modelId} to be deployed`);
};

/** Promise-based setTimeout. */
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

View file

@ -0,0 +1,92 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { Client } from '@elastic/elasticsearch';
// https://search-labs.elastic.co/search-labs/blog/elser-rag-search-for-relevance
/**
 * Run a hybrid (lexical + semantic) search against an artifact index:
 * combines a cross-field multi_match, a boosted phrase match on the stemmed
 * sub-fields, and one semantic query per semantic_text field.
 *
 * @param version stack version used in the filter; defaults to '8.15',
 *        which was previously hard-coded, so existing callers are unaffected.
 */
export const performSemanticSearch = async ({
  searchQuery,
  index,
  client,
  version = '8.15',
}: {
  searchQuery: string;
  index: string;
  client: Client;
  version?: string;
}) => {
  const results = await client.search({
    index,
    size: 3,
    query: {
      bool: {
        filter: {
          bool: {
            must: [{ term: { version } }],
          },
        },
        should: [
          {
            multi_match: {
              query: searchQuery,
              minimum_should_match: '1<-1 3<49%',
              type: 'cross_fields',
              fields: [
                'content_title',
                'content_body.text',
                'ai_subtitle.text',
                'ai_summary.text',
                'ai_questions_answered.text',
                'ai_tags',
              ],
            },
          },
          {
            multi_match: {
              query: searchQuery,
              type: 'phrase',
              boost: 3,
              slop: 0,
              fields: [
                'content_title.stem',
                'content_body.stem',
                'ai_subtitle.stem',
                'ai_summary.stem',
                'ai_questions_answered.stem',
              ],
            },
          },
          {
            semantic: {
              field: 'content_body',
              query: searchQuery,
            },
          },
          {
            semantic: {
              field: 'ai_subtitle',
              query: searchQuery,
            },
          },
          {
            semantic: {
              field: 'ai_summary',
              query: searchQuery,
            },
          },
          {
            semantic: {
              field: 'ai_questions_answered',
              query: searchQuery,
            },
          },
        ],
      },
    },
  });
  return results.hits.hits;
};

View file

@ -0,0 +1,19 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
/**
 * Configuration for the artifact build task, assembled from the CLI options.
 */
export interface TaskConfig {
  /** products to build artifacts for (e.g. 'Kibana') */
  productNames: string[];
  /** stack version the documentation is extracted for */
  stackVersion: string;
  /** folder used for temporary build files */
  buildFolder: string;
  /** folder the generated artifacts are written to */
  targetFolder: string;
  /** url of the cluster the documentation is extracted from */
  sourceClusterUrl: string;
  sourceClusterUsername: string;
  sourceClusterPassword: string;
  /** url of the cluster used to generate the embeddings */
  embeddingClusterUrl: string;
  embeddingClusterUsername: string;
  embeddingClusterPassword: string;
}

View file

@ -0,0 +1,20 @@
{
"extends": "../../../../tsconfig.base.json",
"compilerOptions": {
"outDir": "target/types",
"types": [
"jest",
"node"
]
},
"include": [
"**/*.ts",
],
"exclude": [
"target/**/*"
],
"kbn_references": [
"@kbn/tooling-log",
"@kbn/repo-info",
]
}

View file

@ -6003,6 +6003,10 @@
version "0.0.0"
uid ""
"@kbn/product-doc-artifact-builder@link:x-pack/packages/ai-infra/product-doc-artifact-builder":
version "0.0.0"
uid ""
"@kbn/profiling-data-access-plugin@link:x-pack/plugins/observability_solution/profiling_data_access":
version "0.0.0"
uid ""