Mirror of https://github.com/elastic/kibana.git (synced 2025-04-23 09:19:04 -04:00)
Tweak product doc generation for 8.17 (#205189)
## Summary

- use the default ELSER endpoint
- adapt cleaning for the new markdown format
parent 9078287e8b
commit 81a5aa97f1
5 changed files with 14 additions and 82 deletions
@@ -14,7 +14,6 @@ import {
   createTargetIndex,
   extractDocumentation,
   indexDocuments,
-  installElser,
   createChunkFiles,
   createArtifact,
   cleanupFolders,
@@ -68,9 +67,6 @@ export const buildArtifacts = async (config: TaskConfig) => {

   await cleanupFolders({ folders: [config.buildFolder] });

-  log.info('Ensuring ELSER is installed on the embedding cluster');
-  await installElser({ client: embeddingClient });
-
   for (const productName of config.productNames) {
     await buildArtifact({
       productName,
@@ -8,13 +8,15 @@
 import type { Client } from '@elastic/elasticsearch';
 import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';

+const DEFAULT_ELSER = '.elser-2-elasticsearch';
+
 const mappings: MappingTypeMapping = {
   dynamic: 'strict',
   properties: {
     content_title: { type: 'text' },
     content_body: {
       type: 'semantic_text',
-      inference_id: 'kibana-elser2',
+      inference_id: DEFAULT_ELSER,
     },
     product_name: { type: 'keyword' },
     root_type: { type: 'keyword' },
@@ -24,11 +26,11 @@ const mappings: MappingTypeMapping = {
     ai_subtitle: { type: 'text' },
     ai_summary: {
       type: 'semantic_text',
-      inference_id: 'kibana-elser2',
+      inference_id: DEFAULT_ELSER,
     },
     ai_questions_answered: {
       type: 'semantic_text',
-      inference_id: 'kibana-elser2',
+      inference_id: DEFAULT_ELSER,
     },
     ai_tags: { type: 'keyword' },
   },
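For reference, a minimal sketch of how mappings like these are typically used with the Elasticsearch JS client: create the index, then index a document and let the preconfigured `.elser-2-elasticsearch` endpoint embed the `semantic_text` fields at ingest time. The index name, node URL, and document values are made up for illustration; only the mapping shape comes from the diff above.

```ts
import { Client } from '@elastic/elasticsearch';
import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';

const DEFAULT_ELSER = '.elser-2-elasticsearch';

const mappings: MappingTypeMapping = {
  dynamic: 'strict',
  properties: {
    content_title: { type: 'text' },
    // semantic_text delegates chunking and embedding to the inference endpoint
    content_body: { type: 'semantic_text', inference_id: DEFAULT_ELSER },
    product_name: { type: 'keyword' },
  },
};

const run = async () => {
  const client = new Client({ node: 'http://localhost:9200' }); // placeholder node

  // Create the target index, then index a document; Elasticsearch runs ELSER
  // inference for content_body at ingest time, no ingest pipeline required.
  await client.indices.create({ index: 'product-docs-example', mappings });
  await client.index({
    index: 'product-docs-example',
    document: {
      content_title: 'Example page',
      content_body: 'Some documentation text to embed.',
      product_name: 'kibana',
    },
  });
};

run().catch(console.error);
```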
@@ -8,7 +8,6 @@
 export { extractDocumentation } from './extract_documentation';
 export { indexDocuments } from './index_documents';
 export { createTargetIndex } from './create_index';
-export { installElser } from './install_elser';
 export { createChunkFiles } from './create_chunk_files';
 export { checkConnectivity } from './check_connectivity';
 export { createArtifact } from './create_artifact';
@@ -1,73 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import type { Client } from '@elastic/elasticsearch';
-
-const inferenceEndpointId = 'kibana-elser2';
-
-export const installElser = async ({ client }: { client: Client }) => {
-  const getInferenceRes = await client.inference.get(
-    {
-      task_type: 'sparse_embedding',
-      inference_id: 'kibana-elser2',
-    },
-    { ignore: [404] }
-  );
-
-  const installed = (getInferenceRes.endpoints ?? []).some(
-    (endpoint) => endpoint.inference_id === inferenceEndpointId
-  );
-
-  if (!installed) {
-    await client.inference.put({
-      task_type: 'sparse_embedding',
-      inference_id: inferenceEndpointId,
-      inference_config: {
-        service: 'elser',
-        service_settings: {
-          num_allocations: 1,
-          num_threads: 1,
-          model_id: '.elser_model_2',
-        },
-        task_settings: {},
-      },
-    });
-  }
-
-  await waitUntilDeployed({
-    modelId: '.elser_model_2',
-    client,
-  });
-};
-
-const waitUntilDeployed = async ({
-  modelId,
-  client,
-  maxRetries = 20,
-  delay = 2000,
-}: {
-  modelId: string;
-  client: Client;
-  maxRetries?: number;
-  delay?: number;
-}) => {
-  for (let i = 0; i < maxRetries; i++) {
-    const statsRes = await client.ml.getTrainedModelsStats({
-      model_id: modelId,
-    });
-    const deploymentStats = statsRes.trained_model_stats[0]?.deployment_stats;
-    if (!deploymentStats || deploymentStats.nodes.length === 0) {
-      await sleep(delay);
-      continue;
-    }
-    return;
-  }
-
-  throw new Error(`Timeout waiting for ML model ${modelId} to be deployed`);
-};
-
-const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
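With the switch to the preconfigured `.elser-2-elasticsearch` endpoint that Elasticsearch ships by default, the custom installer above becomes unnecessary. If a build still wanted to fail fast when the endpoint is missing, a minimal sketch could look like the following; the helper name is hypothetical, and the `client.inference.get` call is the same one the deleted file used.

```ts
import type { Client } from '@elastic/elasticsearch';

const DEFAULT_ELSER = '.elser-2-elasticsearch';

// Hypothetical guard: confirm the preconfigured ELSER endpoint is visible on the
// embedding cluster, instead of creating and deploying a custom endpoint.
export const assertDefaultElserAvailable = async ({ client }: { client: Client }) => {
  const res = await client.inference.get(
    { task_type: 'sparse_embedding', inference_id: DEFAULT_ELSER },
    { ignore: [404] }
  );
  const found = (res.endpoints ?? []).some(
    (endpoint) => endpoint.inference_id === DEFAULT_ELSER
  );
  if (!found) {
    throw new Error(`Inference endpoint ${DEFAULT_ELSER} not found on the cluster`);
  }
};
```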
@@ -33,13 +33,15 @@ const removeDuplicates = (documents: ExtractedDocument[]): ExtractedDocument[] =
   return uniqBy(documents, (doc) => doc.slug);
 };

+const EMPTY_DOC_TOKEN_LIMIT = 120;
+
 /**
  * Filter "this content has moved" or "deleted pages" type of documents, just based on token count.
  */
 const filterEmptyDocs = (documents: ExtractedDocument[]): ExtractedDocument[] => {
   return documents.filter((doc) => {
     const tokenCount = encode(doc.content_body).length;
-    if (tokenCount < 100) {
+    if (tokenCount < EMPTY_DOC_TOKEN_LIMIT) {
       return false;
     }
     return true;
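The empty-document filter works purely on token count, with the threshold now named and raised from 100 to 120. A small standalone sketch of the same idea, assuming `encode` comes from a GPT-style tokenizer package such as `gpt-tokenizer` (the diff does not show the actual import) and using a simplified stand-in for `ExtractedDocument`:

```ts
// Assumption: `encode` from the gpt-tokenizer package; the real import is not shown in the diff.
import { encode } from 'gpt-tokenizer';

// Simplified stand-in for ExtractedDocument.
interface Doc {
  content_body: string;
}

const EMPTY_DOC_TOKEN_LIMIT = 120;

// Drop pages that are effectively empty ("this content has moved" stubs, deleted
// pages): keep only documents whose body encodes to at least the token limit.
const filterEmptyDocs = (documents: Doc[]): Doc[] =>
  documents.filter((doc) => encode(doc.content_body).length >= EMPTY_DOC_TOKEN_LIMIT);

// Example: a one-line stub is filtered out, a longer page is kept.
const kept = filterEmptyDocs([
  { content_body: 'This content has moved.' },
  { content_body: 'A full documentation page with enough prose to pass the limit. '.repeat(20) },
]);
console.log(kept.length); // -> 1
```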
@@ -52,8 +54,14 @@ const processDocument = (document: ExtractedDocument) => {
     .replaceAll(/([a-zA-Z])edit\n/g, (match) => {
       return `${match[0]}\n`;
     })
+    // remove edit links
+    .replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '')
+    // remove empty links
+    .replaceAll('[]()', '')
     // limit to 2 consecutive carriage return
     .replaceAll(/\n\n+/g, '\n\n');

+  document.content_title = document.content_title.split('|')[0].trim();
+
   return document;
 };
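To illustrate what the new markdown cleanup does, here is a standalone version of just the added steps applied to a made-up sample body and title; the real code operates on `ExtractedDocument` fields as shown above.

```ts
// Illustrative only: the markdown-specific cleanup steps from the diff, extracted
// into a plain function over strings.
const cleanMarkdownBody = (body: string): string =>
  body
    // remove "[edit](...)" links left over from the docs build
    .replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '')
    // remove empty links
    .replaceAll('[]()', '')
    // limit to 2 consecutive carriage returns
    .replaceAll(/\n\n+/g, '\n\n');

const sampleBody = '# Getting started [ edit ](https://example.com/edit)\n\n\n\nSome body text []()';
console.log(cleanMarkdownBody(sampleBody));
// -> '# Getting started \n\nSome body text '

// Title cleanup: keep only the part before the "|" separator.
const sampleTitle = 'Getting started | Elastic Documentation';
console.log(sampleTitle.split('|')[0].trim());
// -> 'Getting started'
```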