Tweak product doc generation for 8.17 (#205189)

## Summary

- use the default ELSER inference endpoint (`.elser-2-elasticsearch`) instead of installing a dedicated one (see the sketch below)
- adapt the content cleaning to the new markdown format of the extracted documentation
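
Elasticsearch now ships a preconfigured `.elser-2-elasticsearch` sparse-embedding inference endpoint, which is why the custom `installElser` task below can be dropped: a `semantic_text` field can reference the default endpoint directly. A minimal sketch of the resulting setup, assuming a hypothetical index name and client configuration (only the `semantic_text`/`inference_id` wiring mirrors the actual diff):

```ts
import { Client } from '@elastic/elasticsearch';

const client = new Client({ node: 'http://localhost:9200' }); // hypothetical cluster address

// Hypothetical index name; the field definitions mirror the mapping change in this commit.
await client.indices.create({
  index: 'product-doc-example',
  mappings: {
    dynamic: 'strict',
    properties: {
      content_title: { type: 'text' },
      content_body: {
        type: 'semantic_text',
        // preconfigured endpoint managed by Elasticsearch; no manual install/deploy step needed
        inference_id: '.elser-2-elasticsearch',
      },
    },
  },
});
```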
Pierre Gayvallet, 2025-01-08 16:19:45 +01:00 (committed by GitHub)
commit 81a5aa97f1, parent 9078287e8b
5 changed files with 14 additions and 82 deletions


@@ -14,7 +14,6 @@ import {
   createTargetIndex,
   extractDocumentation,
   indexDocuments,
-  installElser,
   createChunkFiles,
   createArtifact,
   cleanupFolders,
@@ -68,9 +67,6 @@ export const buildArtifacts = async (config: TaskConfig) => {
   await cleanupFolders({ folders: [config.buildFolder] });
 
-  log.info('Ensuring ELSER is installed on the embedding cluster');
-  await installElser({ client: embeddingClient });
-
   for (const productName of config.productNames) {
     await buildArtifact({
       productName,


@@ -8,13 +8,15 @@
 import type { Client } from '@elastic/elasticsearch';
 import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';
 
+const DEFAULT_ELSER = '.elser-2-elasticsearch';
+
 const mappings: MappingTypeMapping = {
   dynamic: 'strict',
   properties: {
     content_title: { type: 'text' },
     content_body: {
       type: 'semantic_text',
-      inference_id: 'kibana-elser2',
+      inference_id: DEFAULT_ELSER,
     },
     product_name: { type: 'keyword' },
     root_type: { type: 'keyword' },
@@ -24,11 +26,11 @@ const mappings: MappingTypeMapping = {
     ai_subtitle: { type: 'text' },
     ai_summary: {
       type: 'semantic_text',
-      inference_id: 'kibana-elser2',
+      inference_id: DEFAULT_ELSER,
     },
     ai_questions_answered: {
       type: 'semantic_text',
-      inference_id: 'kibana-elser2',
+      inference_id: DEFAULT_ELSER,
     },
     ai_tags: { type: 'keyword' },
   },


@@ -8,7 +8,6 @@
 export { extractDocumentation } from './extract_documentation';
 export { indexDocuments } from './index_documents';
 export { createTargetIndex } from './create_index';
-export { installElser } from './install_elser';
 export { createChunkFiles } from './create_chunk_files';
 export { checkConnectivity } from './check_connectivity';
 export { createArtifact } from './create_artifact';


@@ -1,73 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-import type { Client } from '@elastic/elasticsearch';
-
-const inferenceEndpointId = 'kibana-elser2';
-
-export const installElser = async ({ client }: { client: Client }) => {
-  const getInferenceRes = await client.inference.get(
-    {
-      task_type: 'sparse_embedding',
-      inference_id: 'kibana-elser2',
-    },
-    { ignore: [404] }
-  );
-
-  const installed = (getInferenceRes.endpoints ?? []).some(
-    (endpoint) => endpoint.inference_id === inferenceEndpointId
-  );
-
-  if (!installed) {
-    await client.inference.put({
-      task_type: 'sparse_embedding',
-      inference_id: inferenceEndpointId,
-      inference_config: {
-        service: 'elser',
-        service_settings: {
-          num_allocations: 1,
-          num_threads: 1,
-          model_id: '.elser_model_2',
-        },
-        task_settings: {},
-      },
-    });
-  }
-
-  await waitUntilDeployed({
-    modelId: '.elser_model_2',
-    client,
-  });
-};
-
-const waitUntilDeployed = async ({
-  modelId,
-  client,
-  maxRetries = 20,
-  delay = 2000,
-}: {
-  modelId: string;
-  client: Client;
-  maxRetries?: number;
-  delay?: number;
-}) => {
-  for (let i = 0; i < maxRetries; i++) {
-    const statsRes = await client.ml.getTrainedModelsStats({
-      model_id: modelId,
-    });
-    const deploymentStats = statsRes.trained_model_stats[0]?.deployment_stats;
-    if (!deploymentStats || deploymentStats.nodes.length === 0) {
-      await sleep(delay);
-      continue;
-    }
-    return;
-  }
-  throw new Error(`Timeout waiting for ML model ${modelId} to be deployed`);
-};
-
-const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
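
With the installer removed, nothing creates or waits for the model anymore: the default `.elser-2-elasticsearch` endpoint is managed by Elasticsearch itself and the model is deployed on demand when first used. If a sanity check were still wanted, it could reuse the same `inference.get` call the deleted task relied on — a sketch only, not part of this commit:

```ts
import type { Client } from '@elastic/elasticsearch';

// Hypothetical helper (not in this commit): verify the default ELSER endpoint
// is visible on the embedding cluster before indexing starts.
export const assertDefaultElser = async (client: Client) => {
  const res = await client.inference.get(
    { task_type: 'sparse_embedding', inference_id: '.elser-2-elasticsearch' },
    { ignore: [404] }
  );
  const found = (res.endpoints ?? []).some(
    (endpoint) => endpoint.inference_id === '.elser-2-elasticsearch'
  );
  if (!found) {
    throw new Error('Default ELSER inference endpoint not found on the embedding cluster');
  }
};
```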


@@ -33,13 +33,15 @@ const removeDuplicates = (documents: ExtractedDocument[]): ExtractedDocument[] =
   return uniqBy(documents, (doc) => doc.slug);
 };
 
+const EMPTY_DOC_TOKEN_LIMIT = 120;
+
 /**
  * Filter "this content has moved" or "deleted pages" type of documents, just based on token count.
  */
 const filterEmptyDocs = (documents: ExtractedDocument[]): ExtractedDocument[] => {
   return documents.filter((doc) => {
     const tokenCount = encode(doc.content_body).length;
-    if (tokenCount < 100) {
+    if (tokenCount < EMPTY_DOC_TOKEN_LIMIT) {
       return false;
     }
     return true;
@@ -52,8 +54,14 @@ const processDocument = (document: ExtractedDocument) => {
     .replaceAll(/([a-zA-Z])edit\n/g, (match) => {
       return `${match[0]}\n`;
     })
+    // remove edit links
+    .replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '')
+    // remove empty links
+    .replaceAll('[]()', '')
     // limit to 2 consecutive carriage return
     .replaceAll(/\n\n+/g, '\n\n');
 
+  document.content_title = document.content_title.split('|')[0].trim();
+
   return document;
 };
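
For illustration, a small sketch of how the new cleaning rules behave on a made-up snippet in the new markdown format (the sample strings are hypothetical; the replacements are copied verbatim from the hunk above):

```ts
// Made-up input imitating the new markdown export format.
const raw =
  'Install Kibana [ edit ](https://example.com/edit) now.\n\n\n\n' +
  'See also []() the next section.\n';

const cleaned = raw
  // remove edit links
  .replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '')
  // remove empty links
  .replaceAll('[]()', '')
  // limit to 2 consecutive carriage return
  .replaceAll(/\n\n+/g, '\n\n');

console.log(JSON.stringify(cleaned));
// "Install Kibana  now.\n\nSee also  the next section.\n"

// Hypothetical title in the "Page | Guide" shape handled by the new split:
const title = 'Install Kibana | Elastic Documentation'.split('|')[0].trim();
// -> 'Install Kibana'
```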