[ML] Categorization jobs improvements (#54579) (#54774)

* chunking token examples * disabling bucket span estimator * passing sample size to client * better handling of token errors * changes based on review
2025-04-24 01:38:56 -04:00 · 2020-01-14 19:10:31 +00:00 · 2020-01-14 19:10:31 +00:00 · b1d2c00c3c
commit b1d2c00c3c
parent 449cf8fa5c
9 changed files with 106 additions and 46 deletions
--- a/x-pack/legacy/plugins/ml/common/constants/new_job.ts
+++ b/x-pack/legacy/plugins/ml/common/constants/new_job.ts
@ -27,6 +27,6 @@ export const DEFAULT_QUERY_DELAY = '60s';
 export const SHARED_RESULTS_INDEX_NAME = 'shared';

 export const NUMBER_OF_CATEGORY_EXAMPLES = 5;
-export const CATEGORY_EXAMPLES_MULTIPLIER = 20;
+export const CATEGORY_EXAMPLES_SAMPLE_SIZE = 1000;
 export const CATEGORY_EXAMPLES_WARNING_LIMIT = 0.75;
-export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.2;
+export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.02;
--- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_creator/categorization_job_creator.ts
+++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_creator/categorization_job_creator.ts
@ -102,10 +102,10 @@ export class CategorizationJobCreator extends JobCreator {
  }

  public async loadCategorizationFieldExamples() {
-    const { valid, examples } = await this._examplesLoader.loadExamples();
+    // fetch the examples, the valid value and the sample size from the examples loader
+    const { valid, examples, sampleSize } = await this._examplesLoader.loadExamples();
+    // only the examples and the valid value are stored on the job creator;
+    // sampleSize is just passed back to the caller
    this._categoryFieldExamples = examples;
    this._categoryFieldValid = valid;
-    return { valid, examples };
+    return { valid, examples, sampleSize };
  }

  public get categoryFieldExamples() {
--- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/categorization_examples_loader.ts
+++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/categorization_examples_loader.ts
@ -36,7 +36,7 @@ export class CategorizationExamplesLoader {
    const analyzer = this._jobCreator.categorizationAnalyzer;
    const categorizationFieldName = this._jobCreator.categorizationFieldName;
    if (categorizationFieldName === null) {
-      return { valid: 0, examples: [] };
+      return { valid: 0, examples: [], sampleSize: 0 };
    }

    const start = Math.floor(
--- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/bucket_span_estimator/bucket_span_estimator.tsx
+++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/bucket_span_estimator/bucket_span_estimator.tsx
@ -7,7 +7,9 @@
 import React, { FC, useState, useEffect, useContext } from 'react';
 import { FormattedMessage } from '@kbn/i18n/react';
 import { EuiButton } from '@elastic/eui';
+import { isAdvancedJobCreator } from '../../../../../common/job_creator';
 import { JobCreatorContext } from '../../../job_creator_context';
+import { MLCATEGORY } from '../../../../../../../../../common/constants/field_types';

 import { useEstimateBucketSpan, ESTIMATE_STATUS } from './estimate_bucket_span';

@ -19,6 +21,7 @@ export const BucketSpanEstimator: FC<Props> = ({ setEstimating }) => {
  const { jobCreator, jobCreatorUpdate } = useContext(JobCreatorContext);
  const { status, estimateBucketSpan } = useEstimateBucketSpan();
  const [noDetectors, setNoDetectors] = useState(jobCreator.detectors.length === 0);
+  const [isUsingMlCategory, setIsUsingMlCategory] = useState(checkIsUsingMlCategory());

  useEffect(() => {
    setEstimating(status === ESTIMATE_STATUS.RUNNING);
@ -26,11 +29,29 @@ export const BucketSpanEstimator: FC<Props> = ({ setEstimating }) => {

  useEffect(() => {
    setNoDetectors(jobCreator.detectors.length === 0);
+    setIsUsingMlCategory(checkIsUsingMlCategory());
  }, [jobCreatorUpdate]);

+  // Returns true when the job creator is an advanced job creator and at least
+  // one of its detectors uses MLCATEGORY as its partition, over or by field.
+  function checkIsUsingMlCategory() {
+    return (
+      isAdvancedJobCreator(jobCreator) &&
+      // return the comparison itself so the callback yields a boolean on every
+      // path rather than falling through to an implicit undefined
+      jobCreator.detectors.some(
+        d =>
+          d.partition_field_name === MLCATEGORY ||
+          d.over_field_name === MLCATEGORY ||
+          d.by_field_name === MLCATEGORY
+      )
+    );
+  }
+
  return (
    <EuiButton
-      disabled={status === ESTIMATE_STATUS.RUNNING || noDetectors === true}
+      disabled={
+        status === ESTIMATE_STATUS.RUNNING || noDetectors === true || isUsingMlCategory === true
+      }
      onClick={estimateBucketSpan}
    >
      <FormattedMessage
--- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_field/description.tsx
+++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_field/description.tsx
@ -30,7 +30,7 @@ export const Description: FC<Props> = memo(({ children, isOptional }) => {
          ) : (
            <FormattedMessage
              id="xpack.ml.newJob.wizard.pickFieldsStep.categorizationField.description"
-              defaultMessage="Specifies which field will be categorized. Using text data types is recommended."
+              defaultMessage="Specifies which field will be categorized. Using text data types is recommended. Categorization works best on machine written log messages, typically logging written by a developer for the purpose of system troubleshooting."
            />
          )}
        </>
--- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/examples_valid_callout.tsx
+++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/examples_valid_callout.tsx
@ -12,8 +12,6 @@ import { FormattedMessage } from '@kbn/i18n/react';
 import { CategorizationAnalyzer } from '../../../../../../../services/ml_server_info';
 import { EditCategorizationAnalyzerFlyout } from '../../../common/edit_categorization_analyzer_flyout';
 import {
-  NUMBER_OF_CATEGORY_EXAMPLES,
-  CATEGORY_EXAMPLES_MULTIPLIER,
  CATEGORY_EXAMPLES_ERROR_LIMIT,
  CATEGORY_EXAMPLES_WARNING_LIMIT,
 } from '../../../../../../../../../common/constants/new_job';
@ -22,11 +20,16 @@ type CategorizationAnalyzerType = CategorizationAnalyzer | null;

 interface Props {
  examplesValid: number;
+  sampleSize: number;
  categorizationAnalyzer: CategorizationAnalyzerType;
 }

-export const ExamplesValidCallout: FC<Props> = ({ examplesValid, categorizationAnalyzer }) => {
-  const percentageText = <PercentageText examplesValid={examplesValid} />;
+export const ExamplesValidCallout: FC<Props> = ({
+  examplesValid,
+  categorizationAnalyzer,
+  sampleSize,
+}) => {
+  const percentageText = <PercentageText examplesValid={examplesValid} sampleSize={sampleSize} />;
  const analyzerUsed = <AnalyzerUsed categorizationAnalyzer={categorizationAnalyzer} />;

  let color: EuiCallOutProps['color'] = 'success';
@ -64,13 +67,16 @@ export const ExamplesValidCallout: FC<Props> = ({ examplesValid, categorizationA
  );
 };

-const PercentageText: FC<{ examplesValid: number }> = ({ examplesValid }) => (
+const PercentageText: FC<{ examplesValid: number; sampleSize: number }> = ({
+  examplesValid,
+  sampleSize,
+}) => (
  <div>
    <FormattedMessage
      id="xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldPercentage"
-      defaultMessage="{number} field values analyzed, {percentage}% contain valid tokens."
+      defaultMessage="{number} field {number, plural, zero {value} one {value} other {values}} analyzed, {percentage}% contain valid tokens."
      values={{
-        number: NUMBER_OF_CATEGORY_EXAMPLES * CATEGORY_EXAMPLES_MULTIPLIER,
+        number: sampleSize,
        percentage: Math.floor(examplesValid * 100),
      }}
    />
--- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/metric_selection.tsx
+++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/metric_selection.tsx
@ -6,6 +6,7 @@

 import React, { FC, useContext, useEffect, useState } from 'react';
 import { EuiHorizontalRule } from '@elastic/eui';
+import { mlMessageBarService } from '../../../../../../../components/messagebar';

 import { JobCreatorContext } from '../../../job_creator_context';
 import { CategorizationJobCreator } from '../../../../../common/job_creator';
@ -32,6 +33,7 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
  );
  const [fieldExamples, setFieldExamples] = useState<CategoryExample[] | null>(null);
  const [examplesValid, setExamplesValid] = useState(0);
+  const [sampleSize, setSampleSize] = useState(0);

  const [categorizationFieldName, setCategorizationFieldName] = useState(
    jobCreator.categorizationFieldName
@ -69,10 +71,20 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
  async function loadFieldExamples() {
    if (categorizationFieldName !== null) {
      setLoadingData(true);
-      const { valid, examples } = await jobCreator.loadCategorizationFieldExamples();
-      setFieldExamples(examples);
-      setExamplesValid(valid);
-      setLoadingData(false);
+      try {
+        const {
+          valid,
+          examples,
+          sampleSize: tempSampleSize,
+        } = await jobCreator.loadCategorizationFieldExamples();
+        setFieldExamples(examples);
+        setExamplesValid(valid);
+        setLoadingData(false);
+        setSampleSize(tempSampleSize);
+      } catch (error) {
+        setLoadingData(false);
+        mlMessageBarService.notify.error(error);
+      }
    } else {
      setFieldExamples(null);
      setExamplesValid(0);
@ -97,6 +109,7 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
      {fieldExamples !== null && loadingData === false && (
        <>
          <ExamplesValidCallout
+            sampleSize={sampleSize}
            examplesValid={examplesValid}
            categorizationAnalyzer={jobCreator.categorizationAnalyzer}
          />
--- a/x-pack/legacy/plugins/ml/public/application/services/ml_api_service/index.d.ts
+++ b/x-pack/legacy/plugins/ml/public/application/services/ml_api_service/index.d.ts
@ -185,7 +185,7 @@ declare interface Ml {
      start: number,
      end: number,
      analyzer: any
-    ): Promise<{ valid: number; examples: any[] }>;
+    ): Promise<{ valid: number; examples: any[]; sampleSize: number }>;
    topCategories(
      jobId: string,
      count: number
--- a/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization.ts
+++ b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization.ts
@ -4,11 +4,15 @@
 * you may not use this file except in compliance with the Elastic License.
 */

+import { chunk } from 'lodash';
 import { ML_RESULTS_INDEX_PATTERN } from '../../../../common/constants/index_patterns';
-import { CATEGORY_EXAMPLES_MULTIPLIER } from '../../../../common/constants/new_job';
+import { CATEGORY_EXAMPLES_SAMPLE_SIZE } from '../../../../common/constants/new_job';
 import { CategoryId, Category, Token } from '../../../../common/types/categories';
 import { callWithRequestType } from '../../../../common/types/kibana';

+const VALID_TOKEN_COUNT = 3;
+const CHUNK_SIZE = 100;
+
 export function categorizationExamplesProvider(callWithRequest: callWithRequestType) {
  async function categorizationExamples(
    indexPatternTitle: string,
@ -54,21 +58,31 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
    });
    const examples: string[] = results.hits?.hits
      ?.map((doc: any) => doc._source[categorizationFieldName])
-      .filter((example: string | undefined) => example !== undefined);
+      .filter((example: string | null | undefined) => example !== undefined && example !== null);

-    let tokens: Token[] = [];
-    try {
-      const { tokens: tempTokens } = await callWithRequest('indices.analyze', {
-        body: {
-          ...getAnalyzer(analyzer),
-          text: examples,
-        },
-      });
-      tokens = tempTokens;
-    } catch (error) {
-      // fail silently, the tokens could not be loaded
-      // an empty list of tokens will be returned for each example
+    // Splits the examples into chunks of `chunkSize`, requests the tokens for
+    // every chunk (one getTokens call per chunk, run in parallel) and pairs each
+    // example with the tokens found for it.
+    async function loadTokens(chunkSize: number) {
+      const exampleChunks = chunk(examples, chunkSize);
+      const tokensPerChunks = await Promise.all(exampleChunks.map(c => getTokens(c, analyzer)));
+      const tokensPerExample = tokensPerChunks.flat();
+      return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] }));
    }
+    try {
+      // the promise must be awaited inside the try block; returning it un-awaited
+      // lets a rejection bypass the catch below, so the retry would never run
+      return await loadTokens(CHUNK_SIZE);
+    } catch (error) {
+      // if an error is thrown when loading the tokens, lower the chunk size by half and try again
+      // the error may have been caused by too many tokens being found.
+      // the _analyze endpoint has a maximum of 10000 tokens.
+      return loadTokens(CHUNK_SIZE / 2);
+    }
+  }
+
+  async function getTokens(examples: string[], analyzer?: any) {
+    const { tokens }: { tokens: Token[] } = await callWithRequest('indices.analyze', {
+      body: {
+        ...getAnalyzer(analyzer),
+        text: examples,
+      },
+    });

    const lengths = examples.map(e => e.length);
    const sumLengths = lengths.map((s => (a: number) => (s += a))(0));
@ -88,8 +102,7 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
        }
      }
    });
-
-    return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] }));
+    return tokensPerExample;
  }

  function getAnalyzer(analyzer: any) {
@ -110,10 +123,10 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
    end: number,
    analyzer?: any
  ) {
-    const examples = await categorizationExamples(
+    const resp = await categorizationExamples(
      indexPatternTitle,
      query,
-      size * CATEGORY_EXAMPLES_MULTIPLIER,
+      CATEGORY_EXAMPLES_SAMPLE_SIZE,
      categorizationFieldName,
      timeField,
      start,
@ -121,20 +134,27 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
      analyzer
    );

-    const sortedExamples = examples
+    const sortedExamples = resp
      .map((e, i) => ({ ...e, origIndex: i }))
      .sort((a, b) => b.tokens.length - a.tokens.length);
-    const validExamples = sortedExamples.filter(e => e.tokens.length > 1);
+    const validExamples = sortedExamples.filter(e => e.tokens.length >= VALID_TOKEN_COUNT);
+    const sampleSize = sortedExamples.length;
+
+    const multiple = Math.floor(sampleSize / size) || sampleSize;
+    const filteredExamples = [];
+    let i = 0;
+    while (filteredExamples.length < size && i < sortedExamples.length) {
+      filteredExamples.push(sortedExamples[i]);
+      i += multiple;
+    }
+    const examples = filteredExamples
+      .sort((a, b) => a.origIndex - b.origIndex)
+      .map(e => ({ text: e.text, tokens: e.tokens }));

    return {
+      sampleSize,
      valid: sortedExamples.length === 0 ? 0 : validExamples.length / sortedExamples.length,
-      examples: sortedExamples
-        .filter(
-          (e, i) =>
-            i / CATEGORY_EXAMPLES_MULTIPLIER - Math.floor(i / CATEGORY_EXAMPLES_MULTIPLIER) === 0
-        )
-        .sort((a, b) => a.origIndex - b.origIndex)
-        .map(e => ({ text: e.text, tokens: e.tokens })),
+      examples,
    };
  }