From 7d203012892f543f24027f268ca19c8d51990b27 Mon Sep 17 00:00:00 2001 From: Dario Gieselaar Date: Thu, 26 Jun 2025 17:24:45 +0200 Subject: [PATCH] Load huggingface content datasets (#224543) Implements a huggingface dataset loader for RAG evals - see [x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/README.md](https://github.com/dgieselaar/kibana/blob/hf-dataset-loader/x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/README.md). Additionally, a `@kbn/cache-cli` tool was added that allows tooling authors to cache to disk (possibly remote storage later). Used o3 for finding datasets on HuggingFace and doing an initial pass on a line-by-line dataset processor ([see conversation](https://chatgpt.com/share/6853e49a-e870-8000-9c65-f7a5a3a72af0)) Libraries added: - `cache-manager`, `cache-manager-fs-hash`, `keyv`, `@types/cache-manager-fs-hash`: caching libraries and plugins. could not find any existing caching libraries in the repo. - `@huggingface/hub`: api client for HF. 
--------- Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com> Co-authored-by: Elastic Machine --- .github/CODEOWNERS | 2 + .github/codeql/codeql-config.yml | 3 + package.json | 7 ++ renovate.json | 7 +- .../packages/shared/kbn-cache-cli/README.md | 49 ++++++++ .../packages/shared/kbn-cache-cli/index.ts | 13 +++ .../shared/kbn-cache-cli/jest.config.js | 14 +++ .../shared/kbn-cache-cli/kibana.jsonc | 8 ++ .../shared/kbn-cache-cli/package.json | 6 + .../kbn-cache-cli/src/from_cache.test.ts | 87 ++++++++++++++ .../shared/kbn-cache-cli/src/from_cache.ts | 29 +++++ .../stores/create_local_disk_cache_store.ts | 30 +++++ .../stores/create_tmp_dir_disk_cache_store.ts | 30 +++++ .../shared/kbn-cache-cli/tsconfig.json | 19 ++++ tsconfig.base.json | 4 + .../shared/kbn-ai-tools-cli/README.md | 3 + .../packages/shared/kbn-ai-tools-cli/index.ts | 10 ++ .../shared/kbn-ai-tools-cli/jest.config.js | 12 ++ .../shared/kbn-ai-tools-cli/kibana.jsonc | 8 ++ .../shared/kbn-ai-tools-cli/package.json | 6 + .../scripts/hf_dataset_loader.ts | 91 +++++++++++++++ .../src/hf_dataset_loader/README.md | 40 +++++++ .../src/hf_dataset_loader/config.ts | 87 ++++++++++++++ .../ensure_dataset_index_exists.ts | 46 ++++++++ .../fetch_rows_from_dataset.ts | 106 ++++++++++++++++++ .../src/hf_dataset_loader/get_embeddings.ts | 54 +++++++++ .../src/hf_dataset_loader/index_documents.ts | 49 ++++++++ .../load_hugging_face_datasets.ts | 103 +++++++++++++++++ .../src/hf_dataset_loader/types.ts | 29 +++++ .../shared/kbn-ai-tools-cli/tsconfig.json | 22 ++++ .../kbn-kibana-api-cli/src/proxy_transport.ts | 9 +- yarn.lock | 67 +++++++++++ 32 files changed, 1047 insertions(+), 3 deletions(-) create mode 100644 src/platform/packages/shared/kbn-cache-cli/README.md create mode 100644 src/platform/packages/shared/kbn-cache-cli/index.ts create mode 100644 src/platform/packages/shared/kbn-cache-cli/jest.config.js create mode 100644 src/platform/packages/shared/kbn-cache-cli/kibana.jsonc create 
mode 100644 src/platform/packages/shared/kbn-cache-cli/package.json create mode 100644 src/platform/packages/shared/kbn-cache-cli/src/from_cache.test.ts create mode 100644 src/platform/packages/shared/kbn-cache-cli/src/from_cache.ts create mode 100644 src/platform/packages/shared/kbn-cache-cli/src/stores/create_local_disk_cache_store.ts create mode 100644 src/platform/packages/shared/kbn-cache-cli/src/stores/create_tmp_dir_disk_cache_store.ts create mode 100644 src/platform/packages/shared/kbn-cache-cli/tsconfig.json create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/README.md create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/index.ts create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/jest.config.js create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/kibana.jsonc create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/package.json create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/scripts/hf_dataset_loader.ts create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/README.md create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/config.ts create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/ensure_dataset_index_exists.ts create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/fetch_rows_from_dataset.ts create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/get_embeddings.ts create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/index_documents.ts create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/load_hugging_face_datasets.ts create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/types.ts create mode 100644 x-pack/platform/packages/shared/kbn-ai-tools-cli/tsconfig.json diff --git 
a/.github/CODEOWNERS b/.github/CODEOWNERS index 0a06db9144d3..6d8a245285cf 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -426,6 +426,7 @@ src/platform/packages/shared/kbn-apm-utils @elastic/obs-ux-infra_services-team src/platform/packages/shared/kbn-avc-banner @elastic/security-defend-workflows src/platform/packages/shared/kbn-axe-config @elastic/appex-qa src/platform/packages/shared/kbn-babel-register @elastic/kibana-operations +src/platform/packages/shared/kbn-cache-cli @elastic/kibana-operations src/platform/packages/shared/kbn-calculate-auto @elastic/obs-ux-management-team src/platform/packages/shared/kbn-calculate-width-from-char-count @elastic/kibana-visualizations src/platform/packages/shared/kbn-cases-components @elastic/response-ops @@ -839,6 +840,7 @@ x-pack/platform/packages/shared/file-upload-common @elastic/ml-ui x-pack/platform/packages/shared/index-lifecycle-management/index_lifecycle_management_common_shared @elastic/kibana-management x-pack/platform/packages/shared/index-management/index_management_shared_types @elastic/kibana-management x-pack/platform/packages/shared/kbn-ai-assistant @elastic/search-kibana @elastic/obs-ai-assistant +x-pack/platform/packages/shared/kbn-ai-tools-cli @elastic/appex-ai-infra x-pack/platform/packages/shared/kbn-alerting-comparators @elastic/response-ops x-pack/platform/packages/shared/kbn-apm-types @elastic/obs-ux-infra_services-team x-pack/platform/packages/shared/kbn-cloud-security-posture/common @elastic/kibana-cloud-security-posture diff --git a/.github/codeql/codeql-config.yml b/.github/codeql/codeql-config.yml index aa920b0bfede..ea56a41d2dfb 100644 --- a/.github/codeql/codeql-config.yml +++ b/.github/codeql/codeql-config.yml @@ -102,6 +102,7 @@ paths-ignore: - src/platform/packages/private/kbn-telemetry-tools - src/platform/packages/shared/kbn-apm-synthtrace - src/platform/packages/shared/kbn-axe-config + - src/platform/packages/shared/kbn-cache-cli - 
src/platform/packages/shared/kbn-dev-cli-errors - src/platform/packages/shared/kbn-dev-cli-runner - src/platform/packages/shared/kbn-dev-proc-runner @@ -119,7 +120,9 @@ paths-ignore: - x-pack/examples - x-pack/packages/ai-infra/product-doc-artifact-builder - x-pack/packages/kbn-synthetics-private-location + - x-pack/platform/packages/shared/kbn-ai-tools-cli - x-pack/platform/packages/shared/kbn-inference-cli + - x-pack/platform/packages/shared/kbn-kibana-api-cli - x-pack/platform/packages/shared/kbn-sample-parser - x-pack/platform/plugins/private/cloud_integrations/cloud_full_story/public/assets/** - x-pack/platform/test diff --git a/package.json b/package.json index 93e57e2b5bb5..08a5b7cb7185 100644 --- a/package.json +++ b/package.json @@ -1420,11 +1420,13 @@ "@emotion/jest": "^11.11.0", "@fast-check/jest": "^2.1.0", "@frsource/cypress-plugin-visual-regression-diff": "^3.3.10", + "@huggingface/hub": "^2.2.0", "@jest/console": "^29.7.0", "@jest/reporters": "^29.7.0", "@jest/transform": "^29.6.1", "@jest/types": "^29.6.3", "@kayahr/text-encoding": "^1.3.0", + "@kbn/ai-tools-cli": "link:x-pack/platform/packages/shared/kbn-ai-tools-cli", "@kbn/alerting-api-integration-helpers": "link:x-pack/platform/test/alerting_api_integration/packages/helpers", "@kbn/ambient-common-types": "link:src/platform/packages/private/kbn-ambient-common-types", "@kbn/ambient-ftr-types": "link:src/platform/packages/private/kbn-ambient-ftr-types", @@ -1438,6 +1440,7 @@ "@kbn/babel-register": "link:src/platform/packages/shared/kbn-babel-register", "@kbn/babel-transform": "link:src/platform/packages/private/kbn-babel-transform", "@kbn/bazel-runner": "link:packages/kbn-bazel-runner", + "@kbn/cache-cli": "link:src/platform/packages/shared/kbn-cache-cli", "@kbn/capture-oas-snapshot-cli": "link:packages/kbn-capture-oas-snapshot-cli", "@kbn/check-mappings-update-cli": "link:packages/kbn-check-mappings-update-cli", "@kbn/check-prod-native-modules-cli": 
"link:packages/kbn-check-prod-native-modules-cli", @@ -1657,6 +1660,7 @@ "@types/aws4": "^1.5.0", "@types/base64-js": "^1.5.0", "@types/byte-size": "^8.1.2", + "@types/cache-manager-fs-hash": "^0.0.5", "@types/chance": "^1.0.0", "@types/chroma-js": "^2.1.0", "@types/chrome-remote-interface": "^0.31.14", @@ -1793,6 +1797,8 @@ "backport": "^10.0.1", "blob-polyfill": "^9.0.20240710", "buildkite-test-collector": "^1.8.1", + "cache-manager": "^7.0.0", + "cache-manager-fs-hash": "^2.0.0", "callsites": "^3.1.0", "chance": "1.0.18", "chromedriver": "^137.0.0", @@ -1875,6 +1881,7 @@ "json-schema-typed": "^8.0.1", "json5": "^2.2.3", "jsondiffpatch": "0.4.1", + "keyv": "^5.3.4", "license-checker": "^25.0.1", "lighthouse": "^12.6.1", "listr2": "^8.2.5", diff --git a/renovate.json b/renovate.json index c9dac09af9ef..860108ba6555 100644 --- a/renovate.json +++ b/renovate.json @@ -191,7 +191,12 @@ "langsmith", "openai", "@types/json-schema", - "table" + "table", + "@huggingface/hub", + "cache-manager", + "cache-manager-fs-hash", + "keyv", + "@types/cache-manager-fs-hash" ], "reviewers": [ "team:appex-ai-infra" diff --git a/src/platform/packages/shared/kbn-cache-cli/README.md b/src/platform/packages/shared/kbn-cache-cli/README.md new file mode 100644 index 000000000000..b21413d08a57 --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/README.md @@ -0,0 +1,49 @@ +# @kbn/cache-cli + +Centralised caching helpers for scripts and CLIs in the Kibana repo. + +The goal is to make it easy for engineers to cache computationally or I/O expensive operations on disk, or in the future, possible remote. 
+ +--- + +## Quick start + +```ts +import { fromCache, createLocalDirDiskCacheStore } from '@kbn/cache-cli'; +import { createCache } from 'cache-manager'; + +const DOC_CACHE = createCache({ + stores: [createLocalDirDiskCacheStore({ dir: 'my_docs', ttl: 60 * 60 /* 1h */ })], +}); + +const docs = await fromCache('docs', DOC_CACHE, async () => fetchDocs()); +``` + +`fromCache(key, cache, cb, validator?)` semantics: + +1. Tries `cache.get(key)` (skipped when `process.env.DISABLE_KBN_CLI_CACHE` is truthy). +2. Runs the optional `validator(cached)` – return `false` to force a refresh. +3. Calls `cb()` on a cache miss or when the cached value is invalid. +4. Persists the resulting value via `cache.set(key, value)` (this also happens on a cache hit) and returns it. + +--- + +## Available cache stores + +`@kbn/cache-cli` wraps [`cache-manager`](https://github.com/node-cache-manager/node-cache-manager) so any **Keyv compatible** store works. The helpers below ship out-of-the-box: + +| Helper | Backing store | Typical use-case | +| --------------------------------------------- | --------------------------------------------------- | ---------------------------------------------------------- | +| `createLocalDirDiskCacheStore({ dir, ttl? })` | `cache-manager-fs-hash` on `<REPO_ROOT>/data/{dir}` | Persist in `./data` with an unknown ttl | +| `createTmpDirDiskCacheStore({ dir, ttl? })` | `cache-manager-fs-hash` on `<os.tmpdir()>/{dir}` | Persist in os tmp dir which might be cleared over restarts | + +--- + +## Cache invalidation strategies + +1. **Manual bypass** – set `DISABLE_KBN_CLI_CACHE=true` to force fresh data (useful in CI workflows). +2. **Time-to-live (TTL)** – pass `ttl` when creating a store to let the backend expire entries automatically. +3. **Programmatic validation** – supply the `cacheValidator` callback to `fromCache()`; it receives the cached value and should return `true` when it is still valid. +4. **Clear on disk** – delete the relevant directory under `data/` if you need a hard reset. + +Choose whichever fits your script. They can be combined (e.g. 
a TTL plus a validator). diff --git a/src/platform/packages/shared/kbn-cache-cli/index.ts b/src/platform/packages/shared/kbn-cache-cli/index.ts new file mode 100644 index 000000000000..a5a9fbfdeed5 --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/index.ts @@ -0,0 +1,13 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +export { createLocalDirDiskCacheStore } from './src/stores/create_local_disk_cache_store'; +export { createTmpDirDiskCacheStore } from './src/stores/create_tmp_dir_disk_cache_store'; + +export { fromCache } from './src/from_cache'; diff --git a/src/platform/packages/shared/kbn-cache-cli/jest.config.js b/src/platform/packages/shared/kbn-cache-cli/jest.config.js new file mode 100644 index 000000000000..f971bfe661cb --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/jest.config.js @@ -0,0 +1,14 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +module.exports = { + preset: '@kbn/test/jest_node', + rootDir: '../../../../..', + roots: ['/src/platform/packages/shared/kbn-cache-cli'], +}; diff --git a/src/platform/packages/shared/kbn-cache-cli/kibana.jsonc b/src/platform/packages/shared/kbn-cache-cli/kibana.jsonc new file mode 100644 index 000000000000..ebbbda0c8046 --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/kibana.jsonc @@ -0,0 +1,8 @@ +{ + "type": "shared-common", + "id": "@kbn/cache-cli", + "owner": "@elastic/kibana-operations", + "group": "platform", + "visibility": "shared", + "devOnly": true +} diff --git a/src/platform/packages/shared/kbn-cache-cli/package.json b/src/platform/packages/shared/kbn-cache-cli/package.json new file mode 100644 index 000000000000..031b147e5e54 --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/package.json @@ -0,0 +1,6 @@ +{ + "name": "@kbn/cache-cli", + "private": true, + "version": "1.0.0", + "license": "Elastic License 2.0 OR AGPL-3.0-only OR SSPL-1.0" +} \ No newline at end of file diff --git a/src/platform/packages/shared/kbn-cache-cli/src/from_cache.test.ts b/src/platform/packages/shared/kbn-cache-cli/src/from_cache.test.ts new file mode 100644 index 000000000000..d7f1fcd77948 --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/src/from_cache.test.ts @@ -0,0 +1,87 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +import { fromCache } from './from_cache'; +import type { Cache } from 'cache-manager'; + +function createMockCache(): { store: Map<string, unknown>; cache: Cache } { + const backing = new Map<string, unknown>(); + const cache = { + get: jest.fn(async (key: string) => backing.get(key)), + set: jest.fn(async (key: string, value: unknown) => { + backing.set(key, value); + }), + } as Partial<Cache>; + return { store: backing, cache: cache as Cache }; +} + +describe('fromCache', () => { + const KEY = 'test-key'; + const NEW_VAL = 'fresh-value'; + + afterEach(() => { + jest.clearAllMocks(); + delete process.env.DISABLE_KBN_CLI_CACHE; + }); + + it('returns the cached value when present', async () => { + const { cache, store } = createMockCache(); + store.set(KEY, 'cached-value'); + + const cb = jest.fn().mockResolvedValue(NEW_VAL); + const result = await fromCache(KEY, cache, cb); + + expect(result).toBe('cached-value'); + expect(cb).not.toHaveBeenCalled(); + expect(cache.get).toHaveBeenCalledWith(KEY); + // the cached value is written back as-is, not replaced by the result of cb() + expect(cache.set).toHaveBeenCalledWith(KEY, 'cached-value'); + }); + + it('bypasses cache when DISABLE_KBN_CLI_CACHE env var is set', async () => { + process.env.DISABLE_KBN_CLI_CACHE = 'true'; + const { cache } = createMockCache(); + const cb = jest.fn().mockResolvedValue(NEW_VAL); + + const result = await fromCache(KEY, cache, cb); + + expect(cb).toHaveBeenCalledTimes(1); + expect(result).toBe(NEW_VAL); + + // still updates the cache with the new value + expect(cache.set).toHaveBeenCalledWith(KEY, NEW_VAL); + }); + + it('validates cached value with cacheValidator and recomputes when invalid', async () => { + const { cache, store } = createMockCache(); + store.set(KEY, 'stale'); + + const cb = jest.fn().mockResolvedValue(NEW_VAL); + + const validator = jest.fn((val: string) => val === 'fresh-value'); + + const result = await fromCache(KEY, cache, cb, validator); + + expect(validator).toHaveBeenCalledWith('stale'); + expect(cb).toHaveBeenCalledTimes(1); 
+ expect(result).toBe(NEW_VAL); + expect(cache.set).toHaveBeenCalledWith(KEY, NEW_VAL); + }); + + it('stores newly computed value in cache when no cached value exists', async () => { + const { cache } = createMockCache(); + const cb = jest.fn().mockResolvedValue(NEW_VAL); + + const result = await fromCache(KEY, cache, cb); + + expect(result).toBe(NEW_VAL); + expect(cb).toHaveBeenCalledTimes(1); + expect(cache.set).toHaveBeenCalledWith(KEY, NEW_VAL); + }); +}); diff --git a/src/platform/packages/shared/kbn-cache-cli/src/from_cache.ts b/src/platform/packages/shared/kbn-cache-cli/src/from_cache.ts new file mode 100644 index 000000000000..354c7398b57e --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/src/from_cache.ts @@ -0,0 +1,43 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ +import { Cache } from 'cache-manager'; + +/** + * Resolves a value through the cache. Returns the cached entry for `key` when one + * exists and passes `cacheValidator`; otherwise computes a fresh value with `cb`. + * The resolved value is always written back to `store`, including on a cache hit. + * + * The cache lookup is skipped when `process.env.DISABLE_KBN_CLI_CACHE` is non-empty. + * + * @param key cache key to read and write + * @param store cache to read from and write to + * @param cb produces the value on a cache miss or when validation fails + * @param cacheValidator optional check on a cached value; returning `false` discards it + * @returns the cached or freshly computed value + */ +export async function fromCache<T>( + key: string, + store: Cache, + cb: () => Promise<T>, + cacheValidator?: (val: T) => boolean +): Promise<T> { + let val = process.env.DISABLE_KBN_CLI_CACHE ? undefined : await store.get<T>(key); + + if (val !== undefined && cacheValidator) { + val = cacheValidator(val) ? 
val : undefined; + } + + if (val === undefined) { + val = await cb(); + } + + // await the write so it completes, and any failure surfaces to the caller, before returning + await store.set(key, val); + return val; +} diff --git a/src/platform/packages/shared/kbn-cache-cli/src/stores/create_local_disk_cache_store.ts b/src/platform/packages/shared/kbn-cache-cli/src/stores/create_local_disk_cache_store.ts new file mode 100644 index 000000000000..c3fe6b6f6376 --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/src/stores/create_local_disk_cache_store.ts @@ -0,0 +1,30 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +import DiskStore from 'cache-manager-fs-hash'; +import { KeyvAdapter } from 'cache-manager'; +import Path from 'path'; +import { REPO_ROOT } from '@kbn/repo-info'; +import { Keyv } from 'keyv'; + +export interface LocalDiskCacheOptions { + dir: string; + ttl?: number; +} + +export function createLocalDirDiskCacheStore(opts: LocalDiskCacheOptions): Keyv { + const adapter = new KeyvAdapter( + DiskStore.create({ + store: DiskStore, + options: { path: Path.join(REPO_ROOT, 'data', opts.dir), ttl: opts.ttl }, + }) + ); + + return new Keyv({ store: adapter }); +} diff --git a/src/platform/packages/shared/kbn-cache-cli/src/stores/create_tmp_dir_disk_cache_store.ts b/src/platform/packages/shared/kbn-cache-cli/src/stores/create_tmp_dir_disk_cache_store.ts new file mode 100644 index 000000000000..413823bdce6c --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/src/stores/create_tmp_dir_disk_cache_store.ts @@ -0,0 +1,30 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +import DiskStore from 'cache-manager-fs-hash'; +import { KeyvAdapter } from 'cache-manager'; +import Os from 'os'; +import Path from 'path'; +import { Keyv } from 'keyv'; + +export interface TmpDirDiskCacheOptions { + dir: string; + ttl?: number; +} + +export function createTmpDirDiskCacheStore(opts: TmpDirDiskCacheOptions): Keyv { + const adapter = new KeyvAdapter( + DiskStore.create({ + store: DiskStore, + options: { path: Path.join(Os.tmpdir(), opts.dir), ttl: opts.ttl }, + }) + ); + + return new Keyv(adapter); +} diff --git a/src/platform/packages/shared/kbn-cache-cli/tsconfig.json b/src/platform/packages/shared/kbn-cache-cli/tsconfig.json new file mode 100644 index 000000000000..cda2f9440f4f --- /dev/null +++ b/src/platform/packages/shared/kbn-cache-cli/tsconfig.json @@ -0,0 +1,19 @@ +{ + "extends": "../../../../../tsconfig.base.json", + "compilerOptions": { + "outDir": "target/types", + "types": [ + "jest", + "node" + ] + }, + "include": [ + "**/*.ts", + ], + "exclude": [ + "target/**/*" + ], + "kbn_references": [ + "@kbn/repo-info", + ] +} diff --git a/tsconfig.base.json b/tsconfig.base.json index 379300c206a1..9c9a85b19900 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -26,6 +26,8 @@ "@kbn/ai-assistant-management-plugin/*": ["src/platform/plugins/shared/ai_assistant_management/selection/*"], "@kbn/ai-security-labs-content": ["x-pack/solutions/security/packages/ai-security-labs-content"], "@kbn/ai-security-labs-content/*": ["x-pack/solutions/security/packages/ai-security-labs-content/*"], + "@kbn/ai-tools-cli": 
["x-pack/platform/packages/shared/kbn-ai-tools-cli"], + "@kbn/ai-tools-cli/*": ["x-pack/platform/packages/shared/kbn-ai-tools-cli/*"], "@kbn/aiops-change-point-detection": ["x-pack/platform/packages/private/ml/aiops_change_point_detection"], "@kbn/aiops-change-point-detection/*": ["x-pack/platform/packages/private/ml/aiops_change_point_detection/*"], "@kbn/aiops-common": ["x-pack/platform/packages/shared/ml/aiops_common"], @@ -130,6 +132,8 @@ "@kbn/banners-plugin/*": ["x-pack/platform/plugins/private/banners/*"], "@kbn/bazel-runner": ["packages/kbn-bazel-runner"], "@kbn/bazel-runner/*": ["packages/kbn-bazel-runner/*"], + "@kbn/cache-cli": ["src/platform/packages/shared/kbn-cache-cli"], + "@kbn/cache-cli/*": ["src/platform/packages/shared/kbn-cache-cli/*"], "@kbn/calculate-auto": ["src/platform/packages/shared/kbn-calculate-auto"], "@kbn/calculate-auto/*": ["src/platform/packages/shared/kbn-calculate-auto/*"], "@kbn/calculate-width-from-char-count": ["src/platform/packages/shared/kbn-calculate-width-from-char-count"], diff --git a/x-pack/platform/packages/shared/kbn-ai-tools-cli/README.md b/x-pack/platform/packages/shared/kbn-ai-tools-cli/README.md new file mode 100644 index 000000000000..08656078c942 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-ai-tools-cli/README.md @@ -0,0 +1,3 @@ +# @kbn/ai-tools-cli + +Empty package generated by @kbn/generate diff --git a/x-pack/platform/packages/shared/kbn-ai-tools-cli/index.ts b/x-pack/platform/packages/shared/kbn-ai-tools-cli/index.ts new file mode 100644 index 000000000000..16e4499a0996 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-ai-tools-cli/index.ts @@ -0,0 +1,10 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +export { loadHuggingFaceDatasets } from './src/hf_dataset_loader/load_hugging_face_datasets'; +export type { HuggingFaceDatasetSpec } from './src/hf_dataset_loader/types'; +export { ALL_HUGGING_FACE_DATASETS } from './src/hf_dataset_loader/config'; diff --git a/x-pack/platform/packages/shared/kbn-ai-tools-cli/jest.config.js b/x-pack/platform/packages/shared/kbn-ai-tools-cli/jest.config.js new file mode 100644 index 000000000000..ea0fa023b086 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-ai-tools-cli/jest.config.js @@ -0,0 +1,12 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +module.exports = { + preset: '@kbn/test/jest_node', + rootDir: '../../../../..', + roots: ['/x-pack/platform/packages/shared/kbn-ai-tools-cli'], +}; diff --git a/x-pack/platform/packages/shared/kbn-ai-tools-cli/kibana.jsonc b/x-pack/platform/packages/shared/kbn-ai-tools-cli/kibana.jsonc new file mode 100644 index 000000000000..ce2d2f8853ce --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-ai-tools-cli/kibana.jsonc @@ -0,0 +1,8 @@ +{ + "type": "shared-server", + "id": "@kbn/ai-tools-cli", + "owner": "@elastic/appex-ai-infra", + "group": "platform", + "visibility": "shared", + "devOnly": true +} diff --git a/x-pack/platform/packages/shared/kbn-ai-tools-cli/package.json b/x-pack/platform/packages/shared/kbn-ai-tools-cli/package.json new file mode 100644 index 000000000000..30cc78abf576 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-ai-tools-cli/package.json @@ -0,0 +1,6 @@ +{ + "name": "@kbn/ai-tools-cli", + "private": true, + "version": "1.0.0", + "license": "Elastic License 2.0" +} diff --git a/x-pack/platform/packages/shared/kbn-ai-tools-cli/scripts/hf_dataset_loader.ts b/x-pack/platform/packages/shared/kbn-ai-tools-cli/scripts/hf_dataset_loader.ts 
new file mode 100644 index 000000000000..6033cedf1136 --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-ai-tools-cli/scripts/hf_dataset_loader.ts @@ -0,0 +1,98 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { run } from '@kbn/dev-cli-runner'; +import { createKibanaClient, toolingLogToLogger } from '@kbn/kibana-api-cli'; +import { castArray, keyBy } from 'lodash'; +import { loadHuggingFaceDatasets } from '../src/hf_dataset_loader/load_hugging_face_datasets'; +import { ALL_HUGGING_FACE_DATASETS } from '../src/hf_dataset_loader/config'; + +interface Flags { + // the number of rows per dataset to load into ES + limit?: string; + // the names of the datasets to load + datasets?: string | string[]; + // whether all specified dataset's indices should be cleared before loading + clear?: boolean; +} + +run( + async ({ log, flags }) => { + const signal = new AbortController().signal; + + const accessToken = process.env.HUGGING_FACE_ACCESS_TOKEN; + + if (!accessToken) { + throw new Error( + `process.env.HUGGING_FACE_ACCESS_TOKEN not set - this is required for API access` + ); + } + + const kibanaClient = await createKibanaClient({ + log, + signal, + }); + + // destructure and normalize CLI flags + const { limit, datasets, clear } = flags as Flags; + + // fail fast on a non-numeric or negative --limit instead of passing NaN to the loader + const parsedLimit = limit ? Number(limit) : undefined; + + if (parsedLimit !== undefined && (!Number.isInteger(parsedLimit) || parsedLimit < 0)) { + throw new Error(`--limit must be a non-negative integer, received "${limit}"`); + } + + const datasetNames = !!datasets + ? castArray(datasets) + .flatMap((set) => set.split(',')) + .map((set) => set.trim()) + .filter(Boolean) + : undefined; + + const specsByName = keyBy(ALL_HUGGING_FACE_DATASETS, (val) => val.name); + + const specs = + datasetNames?.map((name) => { + if (!specsByName[name]) { + throw new Error(`Dataset spec for ${name} not found`); + } + return specsByName[name]; + }) ?? 
ALL_HUGGING_FACE_DATASETS; + + if (!specs.length) { + throw new Error(`No datasets to load`); + } + + await loadHuggingFaceDatasets({ + esClient: kibanaClient.es, + logger: toolingLogToLogger({ flags, log }), + clear: Boolean(clear), + limit: parsedLimit, + datasets: specs, + accessToken, + }); + }, + { + description: `Loads HuggingFace datasets into an Elasticsearch cluster`, + flags: { + string: ['limit', 'datasets'], + boolean: ['clear'], + help: ` + Usage: node --require ./src/setup_node_env/index.js x-pack/platform/packages/shared/kbn-ai-tools-cli/scripts/hf_dataset_loader.ts [options] + + --datasets Comma-separated list of HuggingFace dataset names to load + --limit Number of rows per dataset to load into Elasticsearch + --clear Clear the existing indices for the specified datasets before loading + `, + default: { + clear: false, + }, + allowUnexpected: false, + }, + } +); diff --git a/x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/README.md b/x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/README.md new file mode 100644 index 000000000000..debd0995a44d --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/README.md @@ -0,0 +1,40 @@ +# HuggingFace Dataset Loader + +`loadHuggingFaceDatasets()` loads publicly available HuggingFace datasets into Elasticsearch. + +A small Kibana Dev CLI script that ingests one or more public HuggingFace datasets into the **Elasticsearch instance discovered from your local Kibana**. It uses the default ELSER v2 endpoint to generate embeddings and index them into your cluster. You can then use these indices for evaluating RAG-based workflows and features. + +## Prerequisites + +- A running **Kibana** + **Elasticsearch** (the script will auto-discover the base URL using `@kbn/kibana-api-cli`) +- Internet connection – the datasets are downloaded straight from the HF Hub and cached on disk (`./data`) unless `DISABLE_KBN_CLI_CACHE=1`. 
+- [A HuggingFace access token](https://huggingface.co/docs/hub/en/security-tokens) - this can be acquired by signing up to HF (free). + +## Usage + +```bash +HUGGING_FACE_ACCESS_TOKEN=<your-token> \ +node --require ./src/setup_node_env/index.js \ + x-pack/platform/packages/shared/kbn-ai-tools-cli/scripts/hf_dataset_loader.ts \ + --datasets beir-trec-covid,beir-msmarco \ + --limit 1000 \ + --clear +``` + +### CLI flags + +| Flag | Type | Description | +| ------------ | --------- | ----------------------------------------------------------------------------------------------------- | +| `--datasets` | `string` | Comma-separated list of dataset **names** to load. Omit the flag to load **all** predefined datasets. | +| `--limit` | `number` | Max docs per dataset (handy while testing). Defaults to 1k. | +| `--clear` | `boolean` | Delete the target index **before** indexing. Defaults to `false`. | + +## Built-in dataset specs + +The script ships with ready-made specifications located in `config.ts`. + +Feel free to extend or tweak these specs in `src/hf_dataset_loader/config.ts`. + +## Disabling local cache + +Set the environment variable `DISABLE_KBN_CLI_CACHE=1` to force fresh downloads instead of using the on-disk cache. diff --git a/x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/config.ts b/x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/config.ts new file mode 100644 index 000000000000..5f473d409ffd --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-ai-tools-cli/src/hf_dataset_loader/config.ts @@ -0,0 +1,87 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
// Corpus names published under the `BeIR/` organisation on the HuggingFace Hub.
const BEIR_NAMES = [
  'trec-covid',
  'msmarco',
  'nq',
  'hotpotqa',
  'fiqa',
  'dbpedia-entity',
  'robust04',
  'touche-2020',
  'arguana',
  'climate-fever',
  'scifact',
  'scidocs',
  'quora',
];

// Inference endpoint referenced by every `semantic_text` field below.
const INFERENCE_ENDPOINT = `.elser-2-elasticsearch`;

// Field mapping shared by all free-text fields.
const SEMANTIC_TEXT = {
  type: 'semantic_text' as const,
  inference_id: INFERENCE_ENDPOINT,
};

/** Builds the dataset spec for a single BeIR corpus (`BeIR/<name>`, file `corpus.jsonl.gz`). */
function toBeirDatasetSpec(name: string): HuggingFaceDatasetSpec {
  const prefixedName = `beir-${name}`;

  return {
    name: prefixedName,
    repo: `BeIR/${name}`,
    file: 'corpus.jsonl.gz',
    revision: 'main',
    index: prefixedName,
    // BeIR rows carry `_id`, `title` and `text`; `text` is stored as `content`.
    mapDocument: (row) => {
      return {
        _id: row._id,
        title: row.title,
        content: row.text,
      };
    },
    mapping: {
      properties: {
        title: SEMANTIC_TEXT,
        content: SEMANTIC_TEXT,
      },
    },
  };
}

const BEIR_DATASETS: HuggingFaceDatasetSpec[] = BEIR_NAMES.map((name) => toBeirDatasetSpec(name));

// Datasets that do not follow the BeIR layout and therefore need a hand-written spec.
const EXTRA_DATASETS: HuggingFaceDatasetSpec[] = [
  {
    name: 'huffpost',
    repo: 'khalidalt/HuffPost',
    file: 'News_Category_Dataset_v2.json',
    index: 'huffpost',
    // The article link doubles as the document id.
    mapDocument: (row) => {
      return {
        _id: row.link,
        title: row.headline,
        content: row.short_description,
        date: row.date,
        author: row.authors,
        category: row.category,
      };
    },
    mapping: {
      properties: {
        title: SEMANTIC_TEXT,
        content: SEMANTIC_TEXT,
        author: { type: 'keyword' },
        category: { type: 'keyword' },
        date: { type: 'date' },
      },
    },
  },
];

/** Every predefined dataset spec: the BeIR corpora first, followed by the extra datasets. */
export const ALL_HUGGING_FACE_DATASETS: HuggingFaceDatasetSpec[] = BEIR_DATASETS.concat(
  EXTRA_DATASETS
);
/**
 * Makes sure the dataset's target index is present and carries the dataset's mapping.
 *
 * - When the index is missing it is created with `dataset.mapping`.
 * - When it is present and `clear` is truthy it is deleted and created again.
 * - When it is present and `clear` is falsy, `dataset.mapping` is applied via `putMapping`.
 *
 * @param esClient Elasticsearch client used for all index operations.
 * @param dataset  Spec providing the `index` name and its `mapping`.
 * @param clear    Drop an already existing index before (re)creating it.
 */
export async function ensureDatasetIndexExists({
  esClient,
  dataset,
  clear,
}: {
  esClient: ElasticsearchClient;
  dataset: HuggingFaceDatasetSpec;
  clear?: boolean;
}) {
  const { index, mapping } = dataset;

  let indexPresent: boolean;
  try {
    indexPresent = await esClient.indices.exists({ index, allow_no_indices: true });
  } catch (error) {
    // A 404 response is treated as "index is not there"; anything else is a real failure.
    const isNotFound = error instanceof errors.ResponseError && error.statusCode === 404;
    if (!isNotFound) {
      throw error;
    }
    indexPresent = false;
  }

  if (indexPresent && clear) {
    await esClient.indices.delete({ index, allow_no_indices: true });
    indexPresent = false;
  }

  if (!indexPresent) {
    await esClient.indices.create({ index, mappings: mapping });
    return;
  }

  await esClient.indices.putMapping({ ...mapping, index });
}
/** Formats a byte count as mebibytes with one decimal place, e.g. `1.5mb`. */
function toMb(bytes: number): string {
  return `${(bytes / 1024 / 1024).toFixed(1)}mb`;
}

/**
 * Downloads one file of a HuggingFace dataset and converts its rows into documents.
 *
 * The download location is resolved with `fileDownloadInfo`, the response is
 * streamed (and gunzipped when the download URL announces `application/gzip`),
 * read line by line, parsed as JSON and passed through `dataset.mapDocument`.
 * Properties that are `undefined`, `null` or `''` are stripped from each document.
 *
 * @param dataset     Spec describing repo, file, optional revision and the row mapper.
 * @param logger      Receives download progress (info, at most every 10s) and debug output.
 * @param limit       Maximum number of documents to return; reading stops once it is reached.
 * @param accessToken HuggingFace access token, handed to `fileDownloadInfo`.
 * @returns The mapped documents, at most `limit` of them.
 * @throws When the file cannot be resolved, the HTTP request fails, a line is not
 *         valid JSON, or the stream breaks before `limit` documents were read.
 */
export async function fetchRowsFromDataset({
  dataset,
  logger,
  limit = 1000,
  accessToken,
}: {
  dataset: HuggingFaceDatasetSpec;
  logger: Logger;
  limit?: number;
  accessToken: string;
}): Promise<Array<Record<string, unknown>>> {
  const options = {
    repo: dataset.repo,
    path: dataset.file,
    revision: dataset.revision ?? 'main',
    hubUrl: `https://huggingface.co/datasets`,
    accessToken,
  };

  const fileInfo = await fileDownloadInfo(options);

  if (!fileInfo) {
    throw new Error(
      `Cannot fetch files for dataset (${dataset.repo}/${dataset.file}@${options.revision})`
    );
  }

  const { url, size } = fileInfo;

  const res = await fetch(url);
  if (!res.ok || !res.body) {
    throw new Error(`HTTP ${res.status} while fetching ${url}`);
  }

  const inputStream = Readable.fromWeb(res.body as unknown as streamWeb.ReadableStream);

  const isGzip = new URL(url).searchParams.get('response-content-type') === 'application/gzip';

  const totalMb = toMb(size);

  let downloadedBytes = 0;

  let lastDownloadLog = Date.now();

  inputStream.on('data', (chunk: Buffer) => {
    downloadedBytes += chunk.length;
    const now = Date.now();
    // Throttle progress output to one message every 10 seconds.
    if (now - lastDownloadLog >= 10_000) {
      lastDownloadLog = now;
      logger.info(`Downloading ${dataset.name}: ${toMb(downloadedBytes)} out of ${totalMb} so far`);
    }
  });

  inputStream.on('end', () => {
    logger.debug('Completed download');
  });

  const decompressed: Readable = isGzip ? inputStream.pipe(createGunzip()) : inputStream;

  const rl = readline.createInterface({ input: decompressed, crlfDelay: Infinity });

  // `pipe()` does not forward errors from the source to the gunzip stream, so a broken
  // download would otherwise leave the line reader waiting forever. Remember the first
  // error and close the reader so the loop below terminates.
  let streamError: unknown;
  const onStreamError = (err: unknown) => {
    logger.debug(`Ended download prematurely: ${format(err)}`);
    if (streamError === undefined) {
      streamError = err;
    }
    rl.close();
  };

  inputStream.on('error', onStreamError);
  if (decompressed !== inputStream) {
    decompressed.on('error', onStreamError);
  }

  const docs: Array<Record<string, unknown>> = [];

  try {
    for await (const line of rl) {
      if (!line) continue;
      const raw = JSON.parse(line);
      const doc = dataset.mapDocument(raw);
      docs.push(pickBy(doc, (val) => val !== undefined && val !== null && val !== ''));

      // `>=` rather than `===` so a fractional limit still stops the download.
      if (docs.length >= limit) {
        break;
      }
    }
  } finally {
    // Release the reader and the network stream on every exit path, including
    // JSON/mapping failures.
    rl.close();
    inputStream.destroy();
  }

  // Do not hand back (and let callers cache) a silently truncated result.
  if (streamError !== undefined && docs.length < limit) {
    throw streamError;
  }

  logger.debug(`Fetched ${docs.length} rows for ${dataset.name}`);

  return docs;
}
/**
 * Generates embeddings for `documents` by round-tripping them through a
 * temporary index (`<dataset.index>_tmp`) that uses the dataset's mapping.
 *
 * The documents are indexed into the temporary index, read back with their
 * `_inference_fields`, and the temporary index is removed again, also when
 * indexing or searching fails.
 *
 * @param esClient  Elasticsearch client.
 * @param documents Documents to embed; `_id` is used as the document id while indexing.
 * @param dataset   Spec whose mapping is applied to the temporary index.
 * @param logger    Logger passed on to `indexDocuments`.
 * @returns The stored sources including `_inference_fields`, each with its `_id` restored.
 */
export async function getEmbeddings({
  esClient,
  documents,
  dataset,
  logger,
}: {
  esClient: ElasticsearchClient;
  documents: Array<Record<string, unknown>>;
  dataset: HuggingFaceDatasetSpec;
  logger: Logger;
}): Promise<Array<Record<string, unknown>>> {
  const indexName = dataset.index + '_tmp';

  // A previous run that crashed may have left the temporary index behind; its stale
  // documents would otherwise be mixed into this run's search results.
  await esClient.indices.delete({ index: indexName, ignore_unavailable: true });

  try {
    await indexDocuments({
      documents,
      dataset: {
        ...dataset,
        index: indexName,
      },
      esClient,
      logger,
    });

    // NOTE(review): `size` is presumably bounded by the index's max result window;
    // confirm before using very large limits.
    const response = await esClient.search<Record<string, any>>({
      index: indexName,
      size: documents.length,
      fields: ['_inference_fields'],
    });

    return response.hits.hits.map((hit) => {
      const source = hit._source!;

      for (const config of Object.values(source._inference_fields ?? {})) {
        // Guarded so an inference field without model settings does not abort the run.
        delete (config as Record<string, any>)?.inference?.model_settings?.service;
      }

      // `_id` is not part of `_source`. Without restoring it, re-indexing these
      // documents would write every one of them under the id "undefined".
      return { ...source, _id: hit._id };
    });
  } finally {
    // Clean up on success and failure alike; a failed cleanup must not mask the
    // error (or result) of the work above.
    await esClient.indices
      .delete({ index: indexName, ignore_unavailable: true })
      .catch((error) => {
        logger.warn(`Failed to delete temporary index ${indexName}: ${error}`);
      });
  }
}
/**
 * Bulk-indexes `documents` into `dataset.index`, making sure the index exists
 * (with the dataset's mapping) first.
 *
 * Each document's `_id` property is removed from the body and used as the
 * Elasticsearch document id. Documents without an `_id` are indexed without an
 * explicit id. Dropped documents are logged as warnings.
 *
 * @param esClient  Elasticsearch client.
 * @param documents Documents to index.
 * @param dataset   Spec providing the target index and mapping.
 * @param logger    Logger for debug output and dropped-document warnings.
 */
export async function indexDocuments({
  esClient,
  documents,
  dataset,
  logger,
}: {
  esClient: ElasticsearchClient;
  documents: Array<Record<string, unknown>>;
  dataset: HuggingFaceDatasetSpec;
  logger: Logger;
}): Promise<void> {
  const indexName = dataset.index;

  await ensureDatasetIndexExists({
    dataset,
    esClient,
  });

  logger.debug(`Indexing ${documents.length} into ${indexName}`);

  await esClient.helpers.bulk<Record<string, unknown>>({
    datasource: Readable.from(documents),
    index: indexName,
    retries: 2,
    concurrency: 1,
    flushBytes: 1024 * 128,
    onDocument: (document) => {
      const { _id, ...doc } = document;
      // `String(undefined)` would give every id-less document the same id ("undefined")
      // and make them overwrite each other, so only set `_id` when one is present.
      const hasId = _id !== undefined && _id !== null;
      return [{ index: hasId ? { _id: String(_id) } : {} }, doc];
    },
    onDrop: (doc) => {
      logger.warn(`Dropped document: ${doc.status} (${inspect(doc.error, { depth: 5 })})`);
    },
    refresh: 'wait_for',
  });
}
// Disk cache (directory `hugging_face_dataset_rows`) for rows downloaded from the Hub.
const DATASET_ROWS_CACHE = createCache({
  stores: [
    createLocalDirDiskCacheStore({
      dir: `hugging_face_dataset_rows`,
    }),
  ],
});

// Disk cache (directory `hugging_face_dataset_embeddings`) for documents with embeddings.
const DATASET_EMBEDDINGS_CACHE = createCache({
  stores: [
    createLocalDirDiskCacheStore({
      dir: `hugging_face_dataset_embeddings`,
    }),
  ],
});

/**
 * Loads HuggingFace datasets into Elasticsearch.
 *
 * For every dataset, in order: ensure the target index exists, fetch the rows
 * (cached), generate embeddings (cached) and index the embedded documents.
 *
 * @param esClient    Elasticsearch client.
 * @param logger      Logger for progress output.
 * @param accessToken HuggingFace access token used when fetching rows.
 * @param datasets    Dataset specs to load; defaults to `ALL_HUGGING_FACE_DATASETS`.
 * @param limit       Maximum number of rows per dataset; defaults to 1000.
 * @param clear       Delete the datasets' target indices before loading; defaults to `false`.
 */
export async function loadHuggingFaceDatasets({
  esClient,
  logger,
  accessToken,
  datasets = ALL_HUGGING_FACE_DATASETS,
  limit = 1000,
  clear = false,
}: {
  esClient: ElasticsearchClient;
  logger: Logger;
  accessToken: string;
  datasets?: HuggingFaceDatasetSpec[];
  limit?: number;
  clear?: boolean;
}) {
  // Nothing to delete for an empty list; avoids a delete request without index names.
  if (clear && datasets.length > 0) {
    await esClient.indices
      .delete({
        index: datasets.map((dataset) => dataset.index),
        allow_no_indices: true,
        // Without this a single index that does not exist yet fails the whole request
        // with a 404 (swallowed below), leaving the existing indices untouched.
        ignore_unavailable: true,
      })
      .catch((error) => {
        if (error instanceof errors.ResponseError && error.statusCode === 404) {
          return;
        }
        throw error;
      });
  }

  for (const dataset of datasets) {
    logger.info(`Indexing dataset ${dataset.name}`);

    // The cached rows and embeddings depend on the limit, so it has to be part of the
    // key; otherwise a run with a different `limit` gets the previous run's result.
    const cacheKey = `${dataset.name}:${limit}`;

    await ensureDatasetIndexExists({
      esClient,
      dataset,
    });

    const documents = await fromCache(cacheKey, DATASET_ROWS_CACHE, () =>
      fetchRowsFromDataset({
        dataset,
        logger,
        limit,
        accessToken,
      })
    );

    logger.debug('Generating embeddings');

    const docsWithEmbeddings = await fromCache(cacheKey, DATASET_EMBEDDINGS_CACHE, () =>
      getEmbeddings({
        esClient,
        documents,
        dataset,
        logger,
      })
    );

    logger.debug(`Indexing documents with embeddings`);

    await indexDocuments({
      esClient,
      documents: docsWithEmbeddings,
      dataset,
      logger,
    });

    logger.debug(`Indexed dataset`);
  }
}
+ */ + mapDocument: (raw: any) => Record; +} diff --git a/x-pack/platform/packages/shared/kbn-ai-tools-cli/tsconfig.json b/x-pack/platform/packages/shared/kbn-ai-tools-cli/tsconfig.json new file mode 100644 index 000000000000..800fac2a27df --- /dev/null +++ b/x-pack/platform/packages/shared/kbn-ai-tools-cli/tsconfig.json @@ -0,0 +1,22 @@ +{ + "extends": "../../../../../tsconfig.base.json", + "compilerOptions": { + "outDir": "target/types", + "types": [ + "jest", + "node" + ] + }, + "include": [ + "**/*.ts", + ], + "exclude": [ + "target/**/*" + ], + "kbn_references": [ + "@kbn/core", + "@kbn/cache-cli", + "@kbn/dev-cli-runner", + "@kbn/kibana-api-cli" + ] +} diff --git a/x-pack/platform/packages/shared/kbn-kibana-api-cli/src/proxy_transport.ts b/x-pack/platform/packages/shared/kbn-kibana-api-cli/src/proxy_transport.ts index 4f4e24e89c26..7e97c1d1356f 100644 --- a/x-pack/platform/packages/shared/kbn-kibana-api-cli/src/proxy_transport.ts +++ b/x-pack/platform/packages/shared/kbn-kibana-api-cli/src/proxy_transport.ts @@ -26,7 +26,12 @@ export function createProxyTransport({ }): typeof Transport { return class ProxyTransport extends Transport { constructor(options: TransportOptions) { - super(options); + super({ + ...options, + // the elastic-x-product headers cause issues w/ the proxy transport, + // as the returned headers are from the proxy endpoint and not ES + productCheck: undefined, + }); } request( @@ -84,7 +89,7 @@ export function createProxyTransport({ if (statusCode >= 400) { throw new errors.ResponseError({ - statusCode: response.statusCode, + statusCode, body: response.body, meta: response.meta, warnings: response.warnings, diff --git a/yarn.lock b/yarn.lock index 69f6bd5785ad..0016573fa3fd 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3386,6 +3386,18 @@ react-redux "^9.2.0" redux "^5.0.1" +"@huggingface/hub@^2.2.0": + version "2.2.0" + resolved "https://registry.yarnpkg.com/@huggingface/hub/-/hub-2.2.0.tgz#0fbe96d09341e68e927315b860b866aee2b1c85d" + 
integrity sha512-G+VS1eMp80KovIHBlsiEigS6I6qmI4j+VQ1UZ8CaXT+pw2A7tj6e/crfxFdKNE2uOK5oQkRFiCBJykMwrWQ8OA== + dependencies: + "@huggingface/tasks" "^0.19.11" + +"@huggingface/tasks@^0.19.11": + version "0.19.16" + resolved "https://registry.yarnpkg.com/@huggingface/tasks/-/tasks-0.19.16.tgz#f3a83a04b0c06cdd022718504dae9eceb1c87d3b" + integrity sha512-8PfeMpvHqax9biZaw9/u1Ut9xWIJ1qIUAd8jAKkv6bKvIAAyTRCZgnwHJmzPffDE6B0VkLlPl5it6GaSg/kxlw== + "@humanwhocodes/config-array@^0.11.14": version "0.11.14" resolved "https://registry.yarnpkg.com/@humanwhocodes/config-array/-/config-array-0.11.14.tgz#d78e481a039f7566ecc9660b4ea7fe6b1fec442b" @@ -3821,6 +3833,10 @@ version "0.0.0" uid "" +"@kbn/ai-tools-cli@link:x-pack/platform/packages/shared/kbn-ai-tools-cli": + version "0.0.0" + uid "" + "@kbn/aiops-change-point-detection@link:x-pack/platform/packages/private/ml/aiops_change_point_detection": version "0.0.0" uid "" @@ -4029,6 +4045,10 @@ version "0.0.0" uid "" +"@kbn/cache-cli@link:src/platform/packages/shared/kbn-cache-cli": + version "0.0.0" + uid "" + "@kbn/calculate-auto@link:src/platform/packages/shared/kbn-calculate-auto": version "0.0.0" uid "" @@ -8229,6 +8249,13 @@ version "0.0.0" uid "" +"@keyv/serialize@^1.0.3": + version "1.0.3" + resolved "https://registry.yarnpkg.com/@keyv/serialize/-/serialize-1.0.3.tgz#e0fe3710e2a379cb0490cd41e5a5ffa2bab58bf6" + integrity sha512-qnEovoOp5Np2JDGonIDL6Ayihw0RhnRh6vxPuHo4RDn1UOzwEo4AeIfpL6UGIrsceWrCMiVPgwRjbHu4vYFc3g== + dependencies: + buffer "^6.0.3" + "@kwsites/file-exists@^1.1.1": version "1.1.1" resolved "https://registry.yarnpkg.com/@kwsites/file-exists/-/file-exists-1.1.1.tgz#ad1efcac13e1987d8dbaf235ef3be5b0d96faa99" @@ -11388,6 +11415,18 @@ resolved "https://registry.yarnpkg.com/@types/byte-size/-/byte-size-8.1.2.tgz#abb3d70ab62c400d8753bed1ff2aa315ef9ff7f5" integrity sha512-jGyVzYu6avI8yuqQCNTZd65tzI8HZrLjKX9sdMqZrGWVlNChu0rf6p368oVEDCYJe5BMx2Ov04tD1wqtgTwGSA== +"@types/cache-manager-fs-hash@^0.0.5": + version "0.0.5" + 
resolved "https://registry.yarnpkg.com/@types/cache-manager-fs-hash/-/cache-manager-fs-hash-0.0.5.tgz#39c2c93b0a6a873dfefbbd4a926b34216cda66c8" + integrity sha512-mSqk9YisfK/NkB/R5SzGeuSIVtwHhM5m6MLB0VrrFteTphKiQ2Fyz88IRtiX+SYEX6Nw2H3kB9qtpfnVSE/mSQ== + dependencies: + "@types/cache-manager" "<4" + +"@types/cache-manager@<4": + version "3.4.3" + resolved "https://registry.yarnpkg.com/@types/cache-manager/-/cache-manager-3.4.3.tgz#eba99bf795b997ad0c309658101398c34d7faecb" + integrity sha512-71aBXoFYXZW4TnDHHH8gExw2lS28BZaWeKefgsiJI7QYZeJfUEbMKw6CQtzGjlYQcGIWwB76hcCrkVA3YHSvsw== + "@types/cacheable-request@^6.0.1": version "6.0.1" resolved "https://registry.yarnpkg.com/@types/cacheable-request/-/cacheable-request-6.0.1.tgz#5d22f3dded1fd3a84c0bbeb5039a7419c2c91976" @@ -14843,6 +14882,20 @@ cache-base@^1.0.1: union-value "^1.0.0" unset-value "^1.0.0" +cache-manager-fs-hash@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/cache-manager-fs-hash/-/cache-manager-fs-hash-2.0.0.tgz#daa422ffe689ef16d15c8e3fd47932c8afd14171" + integrity sha512-w03tp8mvfglRUFtItCdC114rFyzk0umu5LnnRM5spnu2+Mj8/2PrDHCnaoPltto/2fK94fC/Kw2rHqBXqIEgTQ== + dependencies: + lockfile "^1.0.4" + +cache-manager@^7.0.0: + version "7.0.0" + resolved "https://registry.yarnpkg.com/cache-manager/-/cache-manager-7.0.0.tgz#3a591187372bcfa32e9cb479764a411a0a0d6a74" + integrity sha512-5HLGorfU4g2GyLTXd+bbq8RhZPwLRlVm7hfS1EssJx4Ujq1FjyQAjHND93sI6ByQTlUlCQ0jrHZqLI0qtBFyHA== + dependencies: + keyv "^5.3.3" + cacheable-lookup@6: version "6.1.0" resolved "https://registry.yarnpkg.com/cacheable-lookup/-/cacheable-lookup-6.1.0.tgz#0330a543471c61faa4e9035db583aad753b36385" @@ -22823,6 +22876,13 @@ keyv@^4.0.0: dependencies: json-buffer "3.0.1" +keyv@^5.3.3, keyv@^5.3.4: + version "5.3.4" + resolved "https://registry.yarnpkg.com/keyv/-/keyv-5.3.4.tgz#e0548d9449c51fc332abdd637c2b3bb2d24c9bc9" + integrity sha512-ypEvQvInNpUe+u+w8BIcPkQvEqXquyyibWE/1NB5T2BTzIpS5cGEV1LZskDzPSTvNAaT4+5FutvzlvnkxOSKlw== + 
dependencies: + "@keyv/serialize" "^1.0.3" + kind-of@^3.0.2, kind-of@^3.0.3, kind-of@^3.2.0: version "3.2.2" resolved "https://registry.yarnpkg.com/kind-of/-/kind-of-3.2.2.tgz#31ea21a734bab9bbb0f32466d893aea51e4a3c64" @@ -23211,6 +23271,13 @@ locate-path@^7.1.0: dependencies: p-locate "^6.0.0" +lockfile@^1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/lockfile/-/lockfile-1.0.4.tgz#07f819d25ae48f87e538e6578b6964a4981a5609" + integrity sha512-cvbTwETRfsFh4nHsL1eGWapU1XFi5Ot9E85sWAwia7Y7EgB7vfqcZhTKZ+l7hCGxSPoushMv5GKhT5PdLv03WA== + dependencies: + signal-exit "^3.0.2" + lodash-es@^4.17.21: version "4.17.21" resolved "https://registry.yarnpkg.com/lodash-es/-/lodash-es-4.17.21.tgz#43e626c46e6591b7750beb2b50117390c609e3ee"