[ML] File data viz limiting uploaded doc chunk size (#44768) (#44904)

* [ML] File data viz limiting upload chunk size

* adding comments

* refactor

* fixing incorrect overwrite of array
James Gowdy 2019-09-05 18:29:13 +01:00 committed by GitHub
parent b72debb66c
commit 5bc76f8438

@@ -10,7 +10,8 @@ import { chunk } from 'lodash';
 import moment from 'moment';
 import { i18n } from '@kbn/i18n';
-const CHUNK_SIZE = 10000;
+const CHUNK_SIZE = 5000;
+const MAX_CHUNK_CHAR_COUNT = 1000000;
 const IMPORT_RETRIES = 5;
 export class Importer {
@@ -21,6 +22,7 @@ export class Importer {
     this.data = [];
     this.docArray = [];
+    this.docSizeArray = [];
   }

   async initializeImport(index) {
@@ -58,7 +60,7 @@
       };
     }
-    const chunks = chunk(this.docArray, CHUNK_SIZE);
+    const chunks = createDocumentChunks(this.docArray);
     const ingestPipeline = {
       id: pipelineId,
@@ -86,13 +88,18 @@
     };
     while (resp.success === false && retries > 0) {
-      resp = await ml.fileDatavisualizer.import(aggs);
+      try {
+        resp = await ml.fileDatavisualizer.import(aggs);
-      if (retries < IMPORT_RETRIES) {
-        console.log(`Retrying import ${IMPORT_RETRIES - retries}`);
+        if (retries < IMPORT_RETRIES) {
+          console.log(`Retrying import ${IMPORT_RETRIES - retries}`);
+        }
+        retries--;
+      } catch (err) {
+        resp = { success: false, error: err };
+        retries = 0;
       }
-      retries--;
     }

     if (resp.success) {
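
The reworked retry loop wraps the import call in a try/catch so that a thrown error (for example a network failure) is recorded on the response and stops further retries, rather than escaping the loop. A minimal standalone sketch of that pattern, with a hypothetical importChunk callback standing in for ml.fileDatavisualizer.import:

// Sketch only, not part of this commit: retry a failed import up to
// maxRetries times; a thrown error aborts the loop immediately.
async function importWithRetries(importChunk, aggs, maxRetries = 5) {
  let resp = { success: false };
  let retries = maxRetries;
  while (resp.success === false && retries > 0) {
    try {
      resp = await importChunk(aggs);
      if (retries < maxRetries) {
        console.log(`Retrying import ${maxRetries - retries}`);
      }
      retries--;
    } catch (err) {
      // record the error and stop retrying
      resp = { success: false, error: err };
      retries = 0;
    }
  }
  return resp;
}
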
@@ -152,3 +159,32 @@ function updatePipelineTimezone(ingestPipeline) {
     }
   }
 }
+
+function createDocumentChunks(docArray) {
+  const chunks = [];
+  // chop docArray into 5000 doc chunks
+  const tempChunks = chunk(docArray, CHUNK_SIZE);
+  // loop over tempChunks and check that the total character length
+  // for each chunk is within MAX_CHUNK_CHAR_COUNT.
+  // if the length is too long, split the chunk into smaller chunks
+  // based on how much larger it is than MAX_CHUNK_CHAR_COUNT.
+  // note, each document is a different size, so dividing by charCountOfDocs
+  // only produces an average chunk size that should be smaller than the max length
+  for (let i = 0; i < tempChunks.length; i++) {
+    const docs = tempChunks[i];
+    const numberOfDocs = docs.length;
+    const charCountOfDocs = JSON.stringify(docs).length;
+    if (charCountOfDocs > MAX_CHUNK_CHAR_COUNT) {
+      // calculate a new chunk size which should produce chunks
+      // whose length is on average around MAX_CHUNK_CHAR_COUNT
+      const adjustedChunkSize = Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs);
+      const smallerChunks = chunk(docs, adjustedChunkSize);
+      chunks.push(...smallerChunks);
+    } else {
+      chunks.push(docs);
+    }
+  }
+  return chunks;
+}
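
As a worked example of the re-splitting arithmetic in createDocumentChunks (the numbers are illustrative, not taken from the commit): a 5000-document chunk that serialises to 2,500,000 characters exceeds MAX_CHUNK_CHAR_COUNT, so the chunk is re-split as sketched below.

// Illustrative numbers only.
const MAX_CHUNK_CHAR_COUNT = 1000000;
const numberOfDocs = 5000;        // docs in the oversized chunk
const charCountOfDocs = 2500000;  // JSON.stringify(docs).length
const adjustedChunkSize =
  Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs); // 2000
// chunk(docs, 2000) then yields sub-chunks of 2000, 2000 and 1000 docs,
// each averaging roughly MAX_CHUNK_CHAR_COUNT characters or fewer.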