Tweak product doc generation for 8.17 (#205189)

## Summary

- use the default ELSER inference endpoint (`.elser-2-elasticsearch`) instead of installing a dedicated one (see the sketch below)
- adapt the content cleaning to the new markdown format of the extracted documentation
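
Elasticsearch now ships a preconfigured `.elser-2-elasticsearch` sparse-embedding inference endpoint, which is why the custom `installElser` task below can be dropped: a `semantic_text` field can reference the default endpoint directly. A minimal sketch of the resulting setup, assuming a hypothetical index name and client configuration (only the `semantic_text`/`inference_id` wiring mirrors the actual diff):

```ts
import { Client } from '@elastic/elasticsearch';

const client = new Client({ node: 'http://localhost:9200' }); // hypothetical cluster address

// Hypothetical index name; the field definitions mirror the mapping change in this commit.
await client.indices.create({
  index: 'product-doc-example',
  mappings: {
    dynamic: 'strict',
    properties: {
      content_title: { type: 'text' },
      content_body: {
        type: 'semantic_text',
        // preconfigured endpoint managed by Elasticsearch; no manual install/deploy step needed
        inference_id: '.elser-2-elasticsearch',
      },
    },
  },
});
```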
Pierre Gayvallet, 2025-01-08 16:19:45 +01:00 (committed by GitHub)
commit 81a5aa97f1, parent 9078287e8b
5 changed files with 14 additions and 82 deletions


@@ -14,7 +14,6 @@ import {
   createTargetIndex,
   extractDocumentation,
   indexDocuments,
-  installElser,
   createChunkFiles,
   createArtifact,
   cleanupFolders,
@@ -68,9 +67,6 @@ export const buildArtifacts = async (config: TaskConfig) => {
   await cleanupFolders({ folders: [config.buildFolder] });
 
-  log.info('Ensuring ELSER is installed on the embedding cluster');
-  await installElser({ client: embeddingClient });
-
   for (const productName of config.productNames) {
     await buildArtifact({
       productName,


@@ -8,13 +8,15 @@
 import type { Client } from '@elastic/elasticsearch';
 import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';
 
+const DEFAULT_ELSER = '.elser-2-elasticsearch';
+
 const mappings: MappingTypeMapping = {
   dynamic: 'strict',
   properties: {
     content_title: { type: 'text' },
     content_body: {
       type: 'semantic_text',
-      inference_id: 'kibana-elser2',
+      inference_id: DEFAULT_ELSER,
     },
     product_name: { type: 'keyword' },
     root_type: { type: 'keyword' },
@@ -24,11 +26,11 @@ const mappings: MappingTypeMapping = {
     ai_subtitle: { type: 'text' },
     ai_summary: {
       type: 'semantic_text',
-      inference_id: 'kibana-elser2',
+      inference_id: DEFAULT_ELSER,
     },
     ai_questions_answered: {
       type: 'semantic_text',
-      inference_id: 'kibana-elser2',
+      inference_id: DEFAULT_ELSER,
     },
     ai_tags: { type: 'keyword' },
   },


@@ -8,7 +8,6 @@
 export { extractDocumentation } from './extract_documentation';
 export { indexDocuments } from './index_documents';
 export { createTargetIndex } from './create_index';
-export { installElser } from './install_elser';
 export { createChunkFiles } from './create_chunk_files';
 export { checkConnectivity } from './check_connectivity';
 export { createArtifact } from './create_artifact';


@@ -1,73 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import type { Client } from '@elastic/elasticsearch';
-
-const inferenceEndpointId = 'kibana-elser2';
-
-export const installElser = async ({ client }: { client: Client }) => {
-  const getInferenceRes = await client.inference.get(
-    {
-      task_type: 'sparse_embedding',
-      inference_id: 'kibana-elser2',
-    },
-    { ignore: [404] }
-  );
-
-  const installed = (getInferenceRes.endpoints ?? []).some(
-    (endpoint) => endpoint.inference_id === inferenceEndpointId
-  );
-
-  if (!installed) {
-    await client.inference.put({
-      task_type: 'sparse_embedding',
-      inference_id: inferenceEndpointId,
-      inference_config: {
-        service: 'elser',
-        service_settings: {
-          num_allocations: 1,
-          num_threads: 1,
-          model_id: '.elser_model_2',
-        },
-        task_settings: {},
-      },
-    });
-  }
-
-  await waitUntilDeployed({
-    modelId: '.elser_model_2',
-    client,
-  });
-};
-
-const waitUntilDeployed = async ({
-  modelId,
-  client,
-  maxRetries = 20,
-  delay = 2000,
-}: {
-  modelId: string;
-  client: Client;
-  maxRetries?: number;
-  delay?: number;
-}) => {
-  for (let i = 0; i < maxRetries; i++) {
-    const statsRes = await client.ml.getTrainedModelsStats({
-      model_id: modelId,
-    });
-    const deploymentStats = statsRes.trained_model_stats[0]?.deployment_stats;
-    if (!deploymentStats || deploymentStats.nodes.length === 0) {
-      await sleep(delay);
-      continue;
-    }
-    return;
-  }
-  throw new Error(`Timeout waiting for ML model ${modelId} to be deployed`);
-};
-
-const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
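
With the installer removed, nothing creates or waits for the model anymore: the default `.elser-2-elasticsearch` endpoint is managed by Elasticsearch itself and the model is deployed on demand when first used. If a sanity check were still wanted, it could reuse the same `inference.get` call the deleted task relied on — a sketch only, not part of this commit:

```ts
import type { Client } from '@elastic/elasticsearch';

// Hypothetical helper (not in this commit): verify the default ELSER endpoint
// is visible on the embedding cluster before indexing starts.
export const assertDefaultElser = async (client: Client) => {
  const res = await client.inference.get(
    { task_type: 'sparse_embedding', inference_id: '.elser-2-elasticsearch' },
    { ignore: [404] }
  );
  const found = (res.endpoints ?? []).some(
    (endpoint) => endpoint.inference_id === '.elser-2-elasticsearch'
  );
  if (!found) {
    throw new Error('Default ELSER inference endpoint not found on the embedding cluster');
  }
};
```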


@@ -33,13 +33,15 @@ const removeDuplicates = (documents: ExtractedDocument[]): ExtractedDocument[] =
   return uniqBy(documents, (doc) => doc.slug);
 };
 
+const EMPTY_DOC_TOKEN_LIMIT = 120;
+
 /**
  * Filter "this content has moved" or "deleted pages" type of documents, just based on token count.
  */
 const filterEmptyDocs = (documents: ExtractedDocument[]): ExtractedDocument[] => {
   return documents.filter((doc) => {
     const tokenCount = encode(doc.content_body).length;
-    if (tokenCount < 100) {
+    if (tokenCount < EMPTY_DOC_TOKEN_LIMIT) {
       return false;
     }
     return true;
@@ -52,8 +54,14 @@ const processDocument = (document: ExtractedDocument) => {
     .replaceAll(/([a-zA-Z])edit\n/g, (match) => {
       return `${match[0]}\n`;
     })
+    // remove edit links
+    .replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '')
+    // remove empty links
+    .replaceAll('[]()', '')
     // limit to 2 consecutive carriage return
     .replaceAll(/\n\n+/g, '\n\n');
 
+  document.content_title = document.content_title.split('|')[0].trim();
+
   return document;
 };
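
For illustration, a small sketch of how the new cleaning rules behave on a made-up snippet in the new markdown format (the sample strings are hypothetical; the replacements are copied verbatim from the hunk above):

```ts
// Made-up input imitating the new markdown export format.
const raw =
  'Install Kibana [ edit ](https://example.com/edit) now.\n\n\n\n' +
  'See also []() the next section.\n';

const cleaned = raw
  // remove edit links
  .replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '')
  // remove empty links
  .replaceAll('[]()', '')
  // limit to 2 consecutive carriage return
  .replaceAll(/\n\n+/g, '\n\n');

console.log(JSON.stringify(cleaned));
// "Install Kibana  now.\n\nSee also  the next section.\n"

// Hypothetical title in the "Page | Guide" shape handled by the new split:
const title = 'Install Kibana | Elastic Documentation'.split('|')[0].trim();
// -> 'Install Kibana'
```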