Mirror of https://github.com/elastic/kibana.git (synced 2025-06-27 10:40:07 -04:00)
[KB] create @kbn/product-doc-artifact-builder package (#193847)
## Summary

Related: https://github.com/elastic/kibana/issues/193473

Adds the initial implementation of the knowledge base artifact builder. This PR only introduces the builder script; it does not add any automation around it.

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
parent 67f2b7cad2
commit 1ab1add68e

30 changed files with 1007 additions and 1 deletion
.github/CODEOWNERS (1 addition)

```diff
@@ -683,6 +683,7 @@ packages/presentation/presentation_containers @elastic/kibana-presentation
 src/plugins/presentation_panel @elastic/kibana-presentation
 packages/presentation/presentation_publishing @elastic/kibana-presentation
 src/plugins/presentation_util @elastic/kibana-presentation
+x-pack/packages/ai-infra/product-doc-artifact-builder @elastic/appex-ai-infra
 x-pack/plugins/observability_solution/profiling_data_access @elastic/obs-ux-infra_services-team
 x-pack/plugins/observability_solution/profiling @elastic/obs-ux-infra_services-team
 packages/kbn-profiling-utils @elastic/obs-ux-infra_services-team
```
package.json (1 addition)

```diff
@@ -1456,6 +1456,7 @@
     "@kbn/picomatcher": "link:packages/kbn-picomatcher",
     "@kbn/plugin-generator": "link:packages/kbn-plugin-generator",
     "@kbn/plugin-helpers": "link:packages/kbn-plugin-helpers",
+    "@kbn/product-doc-artifact-builder": "link:x-pack/packages/ai-infra/product-doc-artifact-builder",
     "@kbn/repo-file-maps": "link:packages/kbn-repo-file-maps",
     "@kbn/repo-linter": "link:packages/kbn-repo-linter",
     "@kbn/repo-path": "link:packages/kbn-repo-path",
```
scripts/build_product_doc_artifacts.js (new file)

```js
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */

require('../src/setup_node_env');
require('@kbn/product-doc-artifact-builder').runScript();
```
src/dev/precommit_hook/casing_check_config.js (1 addition)

```diff
@@ -116,6 +116,7 @@ export const IGNORE_DIRECTORY_GLOBS = [
   'src/babel-*',
   'packages/*',
   'packages/core/*/*',
+  'x-pack/packages/ai-infra/*',
   'packages/kbn-pm/src/utils/__fixtures__/*',
   'packages/kbn-check-prod-native-modules-cli/integration_tests/__fixtures__/*/node_modules/*',
   'x-pack/dev-tools',
```
tsconfig.base.json (2 additions)

```diff
@@ -1360,6 +1360,8 @@
     "@kbn/presentation-publishing/*": ["packages/presentation/presentation_publishing/*"],
     "@kbn/presentation-util-plugin": ["src/plugins/presentation_util"],
     "@kbn/presentation-util-plugin/*": ["src/plugins/presentation_util/*"],
+    "@kbn/product-doc-artifact-builder": ["x-pack/packages/ai-infra/product-doc-artifact-builder"],
+    "@kbn/product-doc-artifact-builder/*": ["x-pack/packages/ai-infra/product-doc-artifact-builder/*"],
     "@kbn/profiling-data-access-plugin": ["x-pack/plugins/observability_solution/profiling_data_access"],
     "@kbn/profiling-data-access-plugin/*": ["x-pack/plugins/observability_solution/profiling_data_access/*"],
     "@kbn/profiling-plugin": ["x-pack/plugins/observability_solution/profiling"],
@@ -2078,4 +2080,4 @@
     "@kbn/ambient-storybook-types"
   ]
 }
-}
+}
```
x-pack/packages/ai-infra/product-doc-artifact-builder/README.md (new file; the remaining new files all live under this package directory, so their paths are shown relative to it)

```markdown
# @kbn/product-doc-artifact-builder

Script to build the knowledge base artifacts
```
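For reference, a hypothetical invocation through the script registered above in scripts/build_product_doc_artifacts.js, using the flags defined in src/command.ts. The URLs and credentials are placeholders, and each connection flag can instead come from its KIBANA_* environment variable:

```sh
node scripts/build_product_doc_artifacts \
  --productName Kibana --productName Elasticsearch \
  --stackVersion 8.16 \
  --sourceClusterUrl "https://source-cluster.example:9200" \
  --sourceClusterUsername user --sourceClusterPassword pass \
  --embeddingClusterUrl "https://embedding-cluster.example:9200" \
  --embeddingClusterUsername user --embeddingClusterPassword pass
```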
index.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

export { runScript } from './src/command';
```
jest.config.js (new file)

```js
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

module.exports = {
  preset: '@kbn/test/jest_node',
  rootDir: '../../../..',
  roots: ['<rootDir>/x-pack/packages/ai-infra/product-doc-artifact-builder'],
};
```
kibana.jsonc (new file)

```jsonc
{
  "type": "shared-common",
  "id": "@kbn/product-doc-artifact-builder",
  "owner": "@elastic/appex-ai-infra",
  "devOnly": true
}
```
package.json (new file)

```json
{
  "name": "@kbn/product-doc-artifact-builder",
  "private": true,
  "version": "1.0.0",
  "license": "Elastic License 2.0"
}
```
src/artifact/artifact_name.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

export const getArtifactName = ({
  productName,
  productVersion,
}: {
  productName: string;
  productVersion: string;
}): string => {
  return `kibana-kb-${productName}-${productVersion}.zip`.toLowerCase();
};
```
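A quick sketch of the output, with placeholder arguments:

```ts
getArtifactName({ productName: 'Kibana', productVersion: '8.16' });
// => 'kibana-kb-kibana-8.16.zip'
```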
src/artifact/manifest.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

export interface ArtifactManifest {
  formatVersion: string;
  productName: string;
  productVersion: string;
}

export const getArtifactManifest = ({
  productName,
  stackVersion,
}: {
  productName: string;
  stackVersion: string;
}): ArtifactManifest => {
  return {
    formatVersion: '1.0.0',
    productName,
    productVersion: stackVersion,
  };
};
```
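And the corresponding manifest for the same placeholder inputs:

```ts
getArtifactManifest({ productName: 'Kibana', stackVersion: '8.16' });
// => { formatVersion: '1.0.0', productName: 'Kibana', productVersion: '8.16' }
```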
src/artifact/mappings.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';

export const getArtifactMappings = (inferenceEndpoint: string): MappingTypeMapping => {
  return {
    dynamic: 'strict',
    properties: {
      content_title: { type: 'text' },
      content_body: {
        type: 'semantic_text',
        inference_id: inferenceEndpoint,
      },
      product_name: { type: 'keyword' },
      root_type: { type: 'keyword' },
      slug: { type: 'keyword' },
      url: { type: 'keyword' },
      version: { type: 'version' },
      ai_subtitle: {
        type: 'semantic_text',
        inference_id: inferenceEndpoint,
      },
      ai_summary: {
        type: 'semantic_text',
        inference_id: inferenceEndpoint,
      },
      ai_questions_answered: {
        type: 'semantic_text',
        inference_id: inferenceEndpoint,
      },
      ai_tags: { type: 'keyword' },
    },
  };
};
```
src/artifact/product_name.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

/**
 * The allowed product names, as found in the source cluster
 */
export const sourceProductNames = ['Kibana', 'Elasticsearch', 'Security', 'Observability'];
```
src/build_artifacts.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import Path from 'path';
import { Client } from '@elastic/elasticsearch';
import { ToolingLog } from '@kbn/tooling-log';
import {
  // checkConnectivity,
  createTargetIndex,
  extractDocumentation,
  indexDocuments,
  installElser,
  createChunkFiles,
  createArtifact,
  cleanupFolders,
  deleteIndex,
} from './tasks';
import type { TaskConfig } from './types';

const getSourceClient = (config: TaskConfig) => {
  return new Client({
    compression: true,
    nodes: [config.sourceClusterUrl],
    sniffOnStart: false,
    auth: {
      username: config.sourceClusterUsername,
      password: config.sourceClusterPassword,
    },
  });
};

const getEmbeddingClient = (config: TaskConfig) => {
  return new Client({
    compression: true,
    nodes: [config.embeddingClusterUrl],
    auth: {
      username: config.embeddingClusterUsername,
      password: config.embeddingClusterPassword,
    },
    // generating embeddings takes time
    requestTimeout: 10 * 60 * 1000,
  });
};

export const buildArtifacts = async (config: TaskConfig) => {
  const log = new ToolingLog({
    level: 'info',
    writeTo: process.stdout,
  });

  log.info(
    `Starting building artifacts for version=[${
      config.stackVersion
    }] and products=[${config.productNames.join(',')}]`
  );

  const sourceClient = getSourceClient(config);
  const embeddingClient = getEmbeddingClient(config);

  // log.info('Checking connectivity against clusters');
  // await checkConnectivity({ sourceClient, embeddingClient });

  await cleanupFolders({ folders: [config.buildFolder] });

  log.info('Ensuring ELSER is installed on the embedding cluster');
  await installElser({ client: embeddingClient });

  for (const productName of config.productNames) {
    await buildArtifact({
      productName,
      stackVersion: config.stackVersion,
      buildFolder: config.buildFolder,
      targetFolder: config.targetFolder,
      sourceClient,
      embeddingClient,
      log,
    });
  }

  await cleanupFolders({ folders: [config.buildFolder] });
};

const buildArtifact = async ({
  productName,
  stackVersion,
  buildFolder,
  targetFolder,
  embeddingClient,
  sourceClient,
  log,
}: {
  productName: string;
  stackVersion: string;
  buildFolder: string;
  targetFolder: string;
  sourceClient: Client;
  embeddingClient: Client;
  log: ToolingLog;
}) => {
  log.info(`Starting building artifact for product [${productName}] and version [${stackVersion}]`);

  const targetIndex = getTargetIndexName({ productName, stackVersion });

  const documents = await extractDocumentation({
    client: sourceClient,
    index: 'search-docs-1',
    log,
    productName,
    stackVersion,
  });

  await createTargetIndex({
    client: embeddingClient,
    indexName: targetIndex,
  });

  await indexDocuments({
    client: embeddingClient,
    index: targetIndex,
    documents,
    log,
  });

  await createChunkFiles({
    index: targetIndex,
    client: embeddingClient,
    productName,
    destFolder: Path.join(buildFolder, productName),
    log,
  });

  await createArtifact({
    buildFolder: Path.join(buildFolder, productName),
    targetFolder,
    productName,
    stackVersion,
    log,
  });

  await deleteIndex({
    indexName: targetIndex,
    client: embeddingClient,
    log,
  });

  log.info(`Finished building artifact for product [${productName}] and version [${stackVersion}]`);
};

const getTargetIndexName = ({
  productName,
  stackVersion,
}: {
  productName: string;
  stackVersion: string;
}) => {
  return `kb-artifact-builder-${productName}-${stackVersion}`.toLowerCase();
};
```
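A minimal sketch of driving the pipeline programmatically rather than through the CLI, assuming buildArtifacts is in scope; the fields match the TaskConfig interface in src/types.ts and every value is a placeholder:

```ts
await buildArtifacts({
  productNames: ['Kibana'],
  stackVersion: '8.16',
  buildFolder: '/tmp/temp-kb-artifacts',
  targetFolder: '/tmp/kb-artifacts',
  sourceClusterUrl: 'https://source-cluster.example:9200',
  sourceClusterUsername: 'user',
  sourceClusterPassword: 'pass',
  embeddingClusterUrl: 'https://embedding-cluster.example:9200',
  embeddingClusterUsername: 'user',
  embeddingClusterPassword: 'pass',
});
```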
src/command.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import Path from 'path';
import { REPO_ROOT } from '@kbn/repo-info';
import yargs from 'yargs';
import type { TaskConfig } from './types';
import { buildArtifacts } from './build_artifacts';
import { sourceProductNames } from './artifact/product_name';

function options(y: yargs.Argv) {
  return y
    .option('productName', {
      describe: 'name of products to generate documentation for',
      array: true,
      choices: sourceProductNames,
      default: ['Kibana'],
    })
    .option('stackVersion', {
      describe: 'The stack version to generate documentation for',
      string: true,
      default: '8.16', // TODO: master is on 9.0 now, not sure we can default to version in package.json?
    })
    .option('targetFolder', {
      describe: 'The folder to generate the artifacts in',
      string: true,
      default: Path.join(REPO_ROOT, 'build', 'kb-artifacts'),
    })
    .option('buildFolder', {
      describe: 'The folder to use for temporary files',
      string: true,
      default: Path.join(REPO_ROOT, 'build', 'temp-kb-artifacts'),
    })
    .option('sourceClusterUrl', {
      describe: 'The source cluster url',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_SOURCE_CLUSTER_URL,
    })
    .option('sourceClusterUsername', {
      describe: 'The source cluster username',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_SOURCE_CLUSTER_USERNAME,
    })
    .option('sourceClusterPassword', {
      describe: 'The source cluster password',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_SOURCE_CLUSTER_PASSWORD,
    })
    .option('embeddingClusterUrl', {
      describe: 'The embedding cluster url',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_EMBEDDING_CLUSTER_URL,
    })
    .option('embeddingClusterUsername', {
      describe: 'The embedding cluster username',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_EMBEDDING_CLUSTER_USERNAME,
    })
    .option('embeddingClusterPassword', {
      describe: 'The embedding cluster password',
      string: true,
      demandOption: true,
      default: process.env.KIBANA_EMBEDDING_CLUSTER_PASSWORD,
    })
    .locale('en');
}

export function runScript() {
  yargs(process.argv.slice(2))
    .command('*', 'Build knowledge base artifacts', options, async (argv) => {
      // argv contains additional entries - let's keep our input clear
      const taskConfig: TaskConfig = {
        productNames: argv.productName,
        stackVersion: argv.stackVersion,
        buildFolder: argv.buildFolder,
        targetFolder: argv.targetFolder,
        sourceClusterUrl: argv.sourceClusterUrl!,
        sourceClusterUsername: argv.sourceClusterUsername!,
        sourceClusterPassword: argv.sourceClusterPassword!,
        embeddingClusterUrl: argv.embeddingClusterUrl!,
        embeddingClusterUsername: argv.embeddingClusterUsername!,
        embeddingClusterPassword: argv.embeddingClusterPassword!,
      };

      return buildArtifacts(taskConfig);
    })
    .parse();
}
```
src/tasks/check_connectivity.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import type { Client } from '@elastic/elasticsearch';

export const checkConnectivity = async ({
  sourceClient,
  embeddingClient,
}: {
  sourceClient: Client;
  embeddingClient: Client;
}) => {
  await Promise.all([sourceClient.ping(), embeddingClient.ping()]);
};
```
src/tasks/cleanup_folders.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { rm } from 'fs/promises';

export const cleanupFolders = async ({ folders }: { folders: string[] }) => {
  await Promise.all(folders.map((folder) => rm(folder, { recursive: true, force: true })));
};
```
src/tasks/create_artifact.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import Path from 'path';
import AdmZip from 'adm-zip';
import type { ToolingLog } from '@kbn/tooling-log';
import { getArtifactMappings } from '../artifact/mappings';
import { getArtifactManifest } from '../artifact/manifest';
import { getArtifactName } from '../artifact/artifact_name';

export const createArtifact = async ({
  productName,
  stackVersion,
  buildFolder,
  targetFolder,
  log,
}: {
  buildFolder: string;
  targetFolder: string;
  productName: string;
  stackVersion: string;
  log: ToolingLog;
}) => {
  log.info(
    `Starting to create artifact from build folder [${buildFolder}] into target [${targetFolder}]`
  );

  const zip = new AdmZip();

  const mappings = getArtifactMappings('.default-elser');
  const mappingFileContent = JSON.stringify(mappings, undefined, 2);
  zip.addFile('mappings.json', Buffer.from(mappingFileContent, 'utf-8'));

  const manifest = getArtifactManifest({ productName, stackVersion });
  const manifestFileContent = JSON.stringify(manifest, undefined, 2);
  zip.addFile('manifest.json', Buffer.from(manifestFileContent, 'utf-8'));

  zip.addLocalFolder(buildFolder, 'content');

  const artifactName = getArtifactName({
    productName,
    productVersion: stackVersion,
  });
  zip.writeZip(Path.join(targetFolder, artifactName));

  log.info(`Finished creating artifact [${artifactName}]`);
};
```
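Putting the pieces together, the archive written by this task would be laid out roughly as follows (names follow getArtifactName and createChunkFiles; the chunk count depends on corpus size):

```
kibana-kb-kibana-8.16.zip
├── manifest.json
├── mappings.json
└── content/
    ├── content-1.ndjson
    └── content-2.ndjson  (and so on)
```

Note that the embedded mappings reference the '.default-elser' inference endpoint, not the 'kibana-elser2' endpoint used during the build itself.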
src/tasks/create_chunk_files.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import Path from 'path';
import Fs from 'fs/promises';
import type { Client } from '@elastic/elasticsearch';
import type { ToolingLog } from '@kbn/tooling-log';

const fileSizeLimit = 250_000;

export const createChunkFiles = async ({
  index,
  productName,
  destFolder,
  client,
  log,
}: {
  index: string;
  productName: string;
  destFolder: string;
  client: Client;
  log: ToolingLog;
}) => {
  log.info(`Starting to create chunk files in directory [${destFolder}]`);

  const searchRes = await client.search({
    index,
    size: 10000,
    query: {
      bool: {
        must: [{ term: { product_name: productName } }],
      },
    },
  });

  await Fs.mkdir(destFolder, { recursive: true });

  let chunkNumber = 1;
  let chunkDocCount = 0;
  let chunkContent: string = '';

  const writeCurrentChunk = async () => {
    const chunkFileName = `content-${chunkNumber}.ndjson`;
    log.info(`Writing chunk file ${chunkFileName} containing ${chunkDocCount} docs`);
    await Fs.writeFile(Path.join(destFolder, chunkFileName), chunkContent);
    chunkContent = '';
    chunkDocCount = 0;
    chunkNumber++;
  };

  for (let i = 0; i < searchRes.hits.hits.length; i++) {
    const hit = searchRes.hits.hits[i];
    chunkContent += JSON.stringify(hit._source) + '\n';
    chunkDocCount++;
    if (
      Buffer.byteLength(chunkContent, 'utf8') > fileSizeLimit ||
      i === searchRes.hits.hits.length - 1
    ) {
      await writeCurrentChunk();
    }
  }

  log.info(`Finished creating chunk files`);
};
```
src/tasks/create_index.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import type { Client } from '@elastic/elasticsearch';
import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';

const mappings: MappingTypeMapping = {
  dynamic: 'strict',
  properties: {
    content_title: { type: 'text' },
    content_body: {
      type: 'semantic_text',
      inference_id: 'kibana-elser2',
    },
    product_name: { type: 'keyword' },
    root_type: { type: 'keyword' },
    slug: { type: 'keyword' },
    url: { type: 'keyword' },
    version: { type: 'version' },
    ai_subtitle: {
      type: 'semantic_text',
      inference_id: 'kibana-elser2',
    },
    ai_summary: {
      type: 'semantic_text',
      inference_id: 'kibana-elser2',
    },
    ai_questions_answered: {
      type: 'semantic_text',
      inference_id: 'kibana-elser2',
    },
    ai_tags: { type: 'keyword' },
  },
};

export const createTargetIndex = async ({
  indexName,
  client,
}: {
  indexName: string;
  client: Client;
}) => {
  await client.indices.create({
    index: indexName,
    mappings,
  });
};
```
src/tasks/delete_index.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import type { Client } from '@elastic/elasticsearch';
import type { ToolingLog } from '@kbn/tooling-log';

export const deleteIndex = async ({
  indexName,
  client,
  log,
}: {
  indexName: string;
  client: Client;
  log: ToolingLog;
}) => {
  log.info(`Deleting index ${indexName}`);
  await client.indices.delete(
    {
      index: indexName,
    },
    { ignore: [404] }
  );
};
```
src/tasks/extract_documentation.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import type { Client } from '@elastic/elasticsearch';
import type { SearchHit } from '@elastic/elasticsearch/lib/api/types';
import type { ToolingLog } from '@kbn/tooling-log';

/** the list of fields to import from the source cluster */
const fields = [
  'content_title',
  'content_body',
  'product_name', // "Kibana", "Elasticsearch"
  'category', // "documentation"
  'slug',
  'url',
  'version',
  'ai_fields.ai_subtitle',
  'ai_fields.ai_summary',
  'ai_fields.ai_questions_answered',
  'ai_fields.ai_tags',
];

export interface ExtractedDocument {
  content_title: string;
  content_body: string;
  product_name: string;
  root_type: string;
  slug: string;
  url: string;
  version: string;
  ai_subtitle: string;
  ai_summary: string;
  ai_questions_answered: string[];
  ai_tags: string[];
}

const convertHit = (hit: SearchHit<any>): ExtractedDocument => {
  const source = hit._source;
  return {
    content_title: source.content_title,
    content_body: source.content_body,
    product_name: source.product_name,
    root_type: 'documentation',
    slug: source.slug,
    url: source.url,
    version: source.version,
    ai_subtitle: source.ai_fields.ai_subtitle,
    ai_summary: source.ai_fields.ai_summary,
    ai_questions_answered: source.ai_fields.ai_questions_answered,
    ai_tags: source.ai_fields.ai_tags,
  };
};

export const extractDocumentation = async ({
  client,
  index,
  stackVersion,
  productName,
  log,
}: {
  client: Client;
  index: string;
  stackVersion: string;
  productName: string;
  log: ToolingLog;
}) => {
  log.info(`Starting to extract documents from source cluster`);

  const response = await client.search({
    index,
    size: 10000,
    query: {
      bool: {
        must: [
          { term: { product_name: productName } },
          { term: { version: stackVersion } },
          { exists: { field: 'ai_fields.ai_summary' } },
        ],
      },
    },
    fields,
  });

  const totalHits =
    typeof response.hits.total === 'number'
      ? response.hits.total // This format is to be removed in 8.0
      : response.hits.total?.value ?? response.hits.hits.length;

  if (totalHits > 10_000) {
    throw new Error('Found more than 10k documents to extract - aborting');
  }

  log.info(
    `Finished extracting documents from source. ${response.hits.hits.length} documents were extracted`
  );

  return response.hits.hits.map(convertHit);
};
```
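A sketch of what convertHit does to a single hit; all field values here are invented placeholders:

```ts
convertHit({
  _source: {
    content_title: 'Create a dashboard',
    content_body: '...',
    product_name: 'Kibana',
    slug: 'create-a-dashboard',
    url: 'https://www.elastic.co/guide/...',
    version: '8.16',
    ai_fields: {
      ai_subtitle: '...',
      ai_summary: '...',
      ai_questions_answered: ['...'],
      ai_tags: ['dashboards'],
    },
  },
} as any);
// => the nested ai_fields.* values are flattened to top level,
//    and root_type is hard-coded to 'documentation'
```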
src/tasks/index.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

export { extractDocumentation } from './extract_documentation';
export { indexDocuments } from './index_documents';
export { createTargetIndex } from './create_index';
export { installElser } from './install_elser';
export { createChunkFiles } from './create_chunk_files';
export { performSemanticSearch } from './perform_semantic_search';
export { checkConnectivity } from './check_connectivity';
export { createArtifact } from './create_artifact';
export { cleanupFolders } from './cleanup_folders';
export { deleteIndex } from './delete_index';
```
src/tasks/index_documents.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { chunk as toChunks } from 'lodash';
import type { Client } from '@elastic/elasticsearch';
import type { BulkRequest } from '@elastic/elasticsearch/lib/api/types';
import type { ToolingLog } from '@kbn/tooling-log';
import type { ExtractedDocument } from './extract_documentation';

const indexingChunkSize = 10;

export const indexDocuments = async ({
  index,
  client,
  documents,
  log,
}: {
  index: string;
  documents: ExtractedDocument[];
  client: Client;
  log: ToolingLog;
}) => {
  const chunks = toChunks(documents, indexingChunkSize);

  log.info(`Starting indexing process`);

  for (let i = 0; i < chunks.length; i++) {
    const chunk = chunks[i];
    const before = Date.now();
    await client.bulk(
      {
        refresh: 'wait_for',
        operations: chunk.reduce((operations, document) => {
          operations!.push(...[{ index: { _index: index } }, document]);
          return operations;
        }, [] as BulkRequest['operations']),
      },
      { requestTimeout: 10 * 60 * 1000 }
    );

    const duration = Date.now() - before;
    log.info(`Indexed ${i + 1} of ${chunks.length} chunks (took ${duration}ms)`);
  }

  log.info(`Finished indexing process`);
};
```
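The reducer above flattens each chunk into the alternating action/document pairs that the bulk API expects; for instance, a two-document chunk would produce something like this (index name illustrative):

```ts
// Shape of the bulk `operations` body built by the reducer:
const operations = [
  { index: { _index: 'kb-artifact-builder-kibana-8.16' } },
  { content_title: 'Create a dashboard' /* ...remaining ExtractedDocument fields */ },
  { index: { _index: 'kb-artifact-builder-kibana-8.16' } },
  { content_title: 'Install Kibana' /* ... */ },
];
```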
src/tasks/install_elser.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import type { Client } from '@elastic/elasticsearch';

const inferenceEndpointId = 'kibana-elser2';

export const installElser = async ({ client }: { client: Client }) => {
  const getInferenceRes = await client.inference.get(
    {
      task_type: 'sparse_embedding',
      inference_id: 'kibana-elser2',
    },
    { ignore: [404] }
  );

  const installed = (getInferenceRes.endpoints ?? []).some(
    (endpoint) => endpoint.inference_id === inferenceEndpointId
  );

  if (!installed) {
    await client.inference.put({
      task_type: 'sparse_embedding',
      inference_id: inferenceEndpointId,
      inference_config: {
        service: 'elser',
        service_settings: {
          num_allocations: 1,
          num_threads: 1,
          model_id: '.elser_model_2',
        },
        task_settings: {},
      },
    });
  }

  await waitUntilDeployed({
    modelId: '.elser_model_2',
    client,
  });
};

const waitUntilDeployed = async ({
  modelId,
  client,
  maxRetries = 20,
  delay = 2000,
}: {
  modelId: string;
  client: Client;
  maxRetries?: number;
  delay?: number;
}) => {
  for (let i = 0; i < maxRetries; i++) {
    const statsRes = await client.ml.getTrainedModelsStats({
      model_id: modelId,
    });
    const deploymentStats = statsRes.trained_model_stats[0]?.deployment_stats;
    // @ts-expect-error deploymentStats.nodes not defined as array even if it is.
    if (!deploymentStats || deploymentStats.nodes.length === 0) {
      await sleep(delay);
      continue;
    }
    return;
  }

  throw new Error(`Timeout waiting for ML model ${modelId} to be deployed`);
};

const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
```
src/tasks/perform_semantic_search.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import type { Client } from '@elastic/elasticsearch';

// https://search-labs.elastic.co/search-labs/blog/elser-rag-search-for-relevance

export const performSemanticSearch = async ({
  searchQuery,
  index,
  client,
}: {
  searchQuery: string;
  index: string;
  client: Client;
}) => {
  const results = await client.search({
    index,
    size: 3,
    query: {
      bool: {
        filter: {
          bool: {
            must: [{ term: { version: '8.15' } }],
          },
        },
        should: [
          {
            multi_match: {
              query: searchQuery,
              minimum_should_match: '1<-1 3<49%',
              type: 'cross_fields',
              fields: [
                'content_title',
                'content_body.text',
                'ai_subtitle.text',
                'ai_summary.text',
                'ai_questions_answered.text',
                'ai_tags',
              ],
            },
          },
          {
            multi_match: {
              query: searchQuery,
              type: 'phrase',
              boost: 3,
              slop: 0,
              fields: [
                'content_title.stem',
                'content_body.stem',
                'ai_subtitle.stem',
                'ai_summary.stem',
                'ai_questions_answered.stem',
              ],
            },
          },
          {
            semantic: {
              field: 'content_body',
              query: searchQuery,
            },
          },
          {
            semantic: {
              field: 'ai_subtitle',
              query: searchQuery,
            },
          },
          {
            semantic: {
              field: 'ai_summary',
              query: searchQuery,
            },
          },
          {
            semantic: {
              field: 'ai_questions_answered',
              query: searchQuery,
            },
          },
        ],
      },
    },
  });

  return results.hits.hits;
};
```
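A hypothetical call, assuming an index produced by this builder (the index name follows getTargetIndexName in build_artifacts.ts, and the query string is a placeholder). Note that the query above currently pins the document version to '8.15' in its filter clause:

```ts
const hits = await performSemanticSearch({
  searchQuery: 'how do I create a dashboard?',
  index: 'kb-artifact-builder-kibana-8.15',
  client: embeddingClient,
});
```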
src/types.ts (new file)

```ts
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

export interface TaskConfig {
  productNames: string[];
  stackVersion: string;
  buildFolder: string;
  targetFolder: string;
  sourceClusterUrl: string;
  sourceClusterUsername: string;
  sourceClusterPassword: string;
  embeddingClusterUrl: string;
  embeddingClusterUsername: string;
  embeddingClusterPassword: string;
}
```
tsconfig.json (new file)

```jsonc
{
  "extends": "../../../../tsconfig.base.json",
  "compilerOptions": {
    "outDir": "target/types",
    "types": [
      "jest",
      "node"
    ]
  },
  "include": [
    "**/*.ts",
  ],
  "exclude": [
    "target/**/*"
  ],
  "kbn_references": [
    "@kbn/tooling-log",
    "@kbn/repo-info",
  ]
}
```
yarn.lock (4 additions)

```diff
@@ -6003,6 +6003,10 @@
   version "0.0.0"
   uid ""
 
+"@kbn/product-doc-artifact-builder@link:x-pack/packages/ai-infra/product-doc-artifact-builder":
+  version "0.0.0"
+  uid ""
+
 "@kbn/profiling-data-access-plugin@link:x-pack/plugins/observability_solution/profiling_data_access":
   version "0.0.0"
   uid ""
```