mirror of
https://github.com/elastic/kibana.git
synced 2025-04-24 01:38:56 -04:00
* chunking token examples * disabling bucket span estimator * passing sample size to client * better handling of token errors * changes based on review
This commit is contained in:
parent
449cf8fa5c
commit
b1d2c00c3c
9 changed files with 106 additions and 46 deletions
|
@ -27,6 +27,6 @@ export const DEFAULT_QUERY_DELAY = '60s';
|
|||
export const SHARED_RESULTS_INDEX_NAME = 'shared';
|
||||
|
||||
export const NUMBER_OF_CATEGORY_EXAMPLES = 5;
|
||||
export const CATEGORY_EXAMPLES_MULTIPLIER = 20;
|
||||
export const CATEGORY_EXAMPLES_SAMPLE_SIZE = 1000;
|
||||
export const CATEGORY_EXAMPLES_WARNING_LIMIT = 0.75;
|
||||
export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.2;
|
||||
export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.02;
|
||||
|
|
|
@ -102,10 +102,10 @@ export class CategorizationJobCreator extends JobCreator {
|
|||
}
|
||||
|
||||
public async loadCategorizationFieldExamples() {
|
||||
const { valid, examples } = await this._examplesLoader.loadExamples();
|
||||
const { valid, examples, sampleSize } = await this._examplesLoader.loadExamples();
|
||||
this._categoryFieldExamples = examples;
|
||||
this._categoryFieldValid = valid;
|
||||
return { valid, examples };
|
||||
return { valid, examples, sampleSize };
|
||||
}
|
||||
|
||||
public get categoryFieldExamples() {
|
||||
|
|
|
@ -36,7 +36,7 @@ export class CategorizationExamplesLoader {
|
|||
const analyzer = this._jobCreator.categorizationAnalyzer;
|
||||
const categorizationFieldName = this._jobCreator.categorizationFieldName;
|
||||
if (categorizationFieldName === null) {
|
||||
return { valid: 0, examples: [] };
|
||||
return { valid: 0, examples: [], sampleSize: 0 };
|
||||
}
|
||||
|
||||
const start = Math.floor(
|
||||
|
|
|
@ -7,7 +7,9 @@
|
|||
import React, { FC, useState, useEffect, useContext } from 'react';
|
||||
import { FormattedMessage } from '@kbn/i18n/react';
|
||||
import { EuiButton } from '@elastic/eui';
|
||||
import { isAdvancedJobCreator } from '../../../../../common/job_creator';
|
||||
import { JobCreatorContext } from '../../../job_creator_context';
|
||||
import { MLCATEGORY } from '../../../../../../../../../common/constants/field_types';
|
||||
|
||||
import { useEstimateBucketSpan, ESTIMATE_STATUS } from './estimate_bucket_span';
|
||||
|
||||
|
@ -19,6 +21,7 @@ export const BucketSpanEstimator: FC<Props> = ({ setEstimating }) => {
|
|||
const { jobCreator, jobCreatorUpdate } = useContext(JobCreatorContext);
|
||||
const { status, estimateBucketSpan } = useEstimateBucketSpan();
|
||||
const [noDetectors, setNoDetectors] = useState(jobCreator.detectors.length === 0);
|
||||
const [isUsingMlCategory, setIsUsingMlCategory] = useState(checkIsUsingMlCategory());
|
||||
|
||||
useEffect(() => {
|
||||
setEstimating(status === ESTIMATE_STATUS.RUNNING);
|
||||
|
@ -26,11 +29,29 @@ export const BucketSpanEstimator: FC<Props> = ({ setEstimating }) => {
|
|||
|
||||
useEffect(() => {
|
||||
setNoDetectors(jobCreator.detectors.length === 0);
|
||||
setIsUsingMlCategory(checkIsUsingMlCategory());
|
||||
}, [jobCreatorUpdate]);
|
||||
|
||||
/**
 * Reports whether the current job is an advanced job with at least one
 * detector that uses MLCATEGORY as its partition, over or by field.
 *
 * The result feeds the `disabled` state of the bucket span estimate button.
 */
function checkIsUsingMlCategory() {
  return (
    isAdvancedJobCreator(jobCreator) &&
    // Return the comparison directly so the predicate yields a boolean on every path
    // instead of falling through to an implicit `undefined`.
    jobCreator.detectors.some(
      d =>
        d.partition_field_name === MLCATEGORY ||
        d.over_field_name === MLCATEGORY ||
        d.by_field_name === MLCATEGORY
    )
  );
}
|
||||
|
||||
return (
|
||||
<EuiButton
|
||||
disabled={status === ESTIMATE_STATUS.RUNNING || noDetectors === true}
|
||||
disabled={
|
||||
status === ESTIMATE_STATUS.RUNNING || noDetectors === true || isUsingMlCategory === true
|
||||
}
|
||||
onClick={estimateBucketSpan}
|
||||
>
|
||||
<FormattedMessage
|
||||
|
|
|
@ -30,7 +30,7 @@ export const Description: FC<Props> = memo(({ children, isOptional }) => {
|
|||
) : (
|
||||
<FormattedMessage
|
||||
id="xpack.ml.newJob.wizard.pickFieldsStep.categorizationField.description"
|
||||
defaultMessage="Specifies which field will be categorized. Using text data types is recommended."
|
||||
defaultMessage="Specifies which field will be categorized. Using text data types is recommended. Categorization works best on machine written log messages, typically logging written by a developer for the purpose of system troubleshooting."
|
||||
/>
|
||||
)}
|
||||
</>
|
||||
|
|
|
@ -12,8 +12,6 @@ import { FormattedMessage } from '@kbn/i18n/react';
|
|||
import { CategorizationAnalyzer } from '../../../../../../../services/ml_server_info';
|
||||
import { EditCategorizationAnalyzerFlyout } from '../../../common/edit_categorization_analyzer_flyout';
|
||||
import {
|
||||
NUMBER_OF_CATEGORY_EXAMPLES,
|
||||
CATEGORY_EXAMPLES_MULTIPLIER,
|
||||
CATEGORY_EXAMPLES_ERROR_LIMIT,
|
||||
CATEGORY_EXAMPLES_WARNING_LIMIT,
|
||||
} from '../../../../../../../../../common/constants/new_job';
|
||||
|
@ -22,11 +20,16 @@ type CategorizationAnalyzerType = CategorizationAnalyzer | null;
|
|||
|
||||
/** Props accepted by the ExamplesValidCallout component. */
interface Props {
|
||||
// Proportion of analyzed values that contain valid tokens; displayed as
// Math.floor(examplesValid * 100) percent. Presumably in the range 0-1 — confirm against the server response.
examplesValid: number;
|
||||
// Number of field values that were analyzed; shown as {number} in the callout message.
sampleSize: number;
|
||||
// Analyzer passed through to AnalyzerUsed; may be null per CategorizationAnalyzerType.
categorizationAnalyzer: CategorizationAnalyzerType;
|
||||
}
|
||||
|
||||
export const ExamplesValidCallout: FC<Props> = ({ examplesValid, categorizationAnalyzer }) => {
|
||||
const percentageText = <PercentageText examplesValid={examplesValid} />;
|
||||
export const ExamplesValidCallout: FC<Props> = ({
|
||||
examplesValid,
|
||||
categorizationAnalyzer,
|
||||
sampleSize,
|
||||
}) => {
|
||||
const percentageText = <PercentageText examplesValid={examplesValid} sampleSize={sampleSize} />;
|
||||
const analyzerUsed = <AnalyzerUsed categorizationAnalyzer={categorizationAnalyzer} />;
|
||||
|
||||
let color: EuiCallOutProps['color'] = 'success';
|
||||
|
@ -64,13 +67,16 @@ export const ExamplesValidCallout: FC<Props> = ({ examplesValid, categorizationA
|
|||
);
|
||||
};
|
||||
|
||||
const PercentageText: FC<{ examplesValid: number }> = ({ examplesValid }) => (
|
||||
const PercentageText: FC<{ examplesValid: number; sampleSize: number }> = ({
|
||||
examplesValid,
|
||||
sampleSize,
|
||||
}) => (
|
||||
<div>
|
||||
<FormattedMessage
|
||||
id="xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldPercentage"
|
||||
defaultMessage="{number} field values analyzed, {percentage}% contain valid tokens."
|
||||
defaultMessage="{number} field {number, plural, zero {value} one {value} other {values}} analyzed, {percentage}% contain valid tokens."
|
||||
values={{
|
||||
number: NUMBER_OF_CATEGORY_EXAMPLES * CATEGORY_EXAMPLES_MULTIPLIER,
|
||||
number: sampleSize,
|
||||
percentage: Math.floor(examplesValid * 100),
|
||||
}}
|
||||
/>
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
import React, { FC, useContext, useEffect, useState } from 'react';
|
||||
import { EuiHorizontalRule } from '@elastic/eui';
|
||||
import { mlMessageBarService } from '../../../../../../../components/messagebar';
|
||||
|
||||
import { JobCreatorContext } from '../../../job_creator_context';
|
||||
import { CategorizationJobCreator } from '../../../../../common/job_creator';
|
||||
|
@ -32,6 +33,7 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
|
|||
);
|
||||
const [fieldExamples, setFieldExamples] = useState<CategoryExample[] | null>(null);
|
||||
const [examplesValid, setExamplesValid] = useState(0);
|
||||
const [sampleSize, setSampleSize] = useState(0);
|
||||
|
||||
const [categorizationFieldName, setCategorizationFieldName] = useState(
|
||||
jobCreator.categorizationFieldName
|
||||
|
@ -69,10 +71,20 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
|
|||
async function loadFieldExamples() {
|
||||
if (categorizationFieldName !== null) {
|
||||
setLoadingData(true);
|
||||
const { valid, examples } = await jobCreator.loadCategorizationFieldExamples();
|
||||
setFieldExamples(examples);
|
||||
setExamplesValid(valid);
|
||||
setLoadingData(false);
|
||||
try {
|
||||
const {
|
||||
valid,
|
||||
examples,
|
||||
sampleSize: tempSampleSize,
|
||||
} = await jobCreator.loadCategorizationFieldExamples();
|
||||
setFieldExamples(examples);
|
||||
setExamplesValid(valid);
|
||||
setLoadingData(false);
|
||||
setSampleSize(tempSampleSize);
|
||||
} catch (error) {
|
||||
setLoadingData(false);
|
||||
mlMessageBarService.notify.error(error);
|
||||
}
|
||||
} else {
|
||||
setFieldExamples(null);
|
||||
setExamplesValid(0);
|
||||
|
@ -97,6 +109,7 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
|
|||
{fieldExamples !== null && loadingData === false && (
|
||||
<>
|
||||
<ExamplesValidCallout
|
||||
sampleSize={sampleSize}
|
||||
examplesValid={examplesValid}
|
||||
categorizationAnalyzer={jobCreator.categorizationAnalyzer}
|
||||
/>
|
||||
|
|
|
@ -185,7 +185,7 @@ declare interface Ml {
|
|||
start: number,
|
||||
end: number,
|
||||
analyzer: any
|
||||
): Promise<{ valid: number; examples: any[] }>;
|
||||
): Promise<{ valid: number; examples: any[]; sampleSize: number }>;
|
||||
topCategories(
|
||||
jobId: string,
|
||||
count: number
|
||||
|
|
|
@ -4,11 +4,15 @@
|
|||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
|
||||
import { chunk } from 'lodash';
|
||||
import { ML_RESULTS_INDEX_PATTERN } from '../../../../common/constants/index_patterns';
|
||||
import { CATEGORY_EXAMPLES_MULTIPLIER } from '../../../../common/constants/new_job';
|
||||
import { CATEGORY_EXAMPLES_SAMPLE_SIZE } from '../../../../common/constants/new_job';
|
||||
import { CategoryId, Category, Token } from '../../../../common/types/categories';
|
||||
import { callWithRequestType } from '../../../../common/types/kibana';
|
||||
|
||||
const VALID_TOKEN_COUNT = 3;
|
||||
const CHUNK_SIZE = 100;
|
||||
|
||||
export function categorizationExamplesProvider(callWithRequest: callWithRequestType) {
|
||||
async function categorizationExamples(
|
||||
indexPatternTitle: string,
|
||||
|
@ -54,21 +58,31 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
|
|||
});
|
||||
const examples: string[] = results.hits?.hits
|
||||
?.map((doc: any) => doc._source[categorizationFieldName])
|
||||
.filter((example: string | undefined) => example !== undefined);
|
||||
.filter((example: string | null | undefined) => example !== undefined && example !== null);
|
||||
|
||||
let tokens: Token[] = [];
|
||||
try {
|
||||
const { tokens: tempTokens } = await callWithRequest('indices.analyze', {
|
||||
body: {
|
||||
...getAnalyzer(analyzer),
|
||||
text: examples,
|
||||
},
|
||||
});
|
||||
tokens = tempTokens;
|
||||
} catch (error) {
|
||||
// fail silently, the tokens could not be loaded
|
||||
// an empty list of tokens will be returned for each example
|
||||
/**
 * Tokenizes all examples by splitting them into chunks of `chunkSize`,
 * issuing one getTokens call per chunk concurrently, and flattening the
 * per-chunk results back into a single list aligned with `examples`.
 *
 * @param chunkSize maximum number of examples sent in a single getTokens call
 * @returns one `{ text, tokens }` entry per example, in the original order
 */
async function loadTokens(chunkSize: number) {
  const exampleChunks = chunk(examples, chunkSize);
  const tokensPerChunks = await Promise.all(exampleChunks.map(c => getTokens(c, analyzer)));
  const tokensPerExample = tokensPerChunks.flat();
  return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] }));
}
try {
  // `await` is required here: returning the bare promise would leave the try block
  // before the promise settles, so a rejection would never reach the catch below
  // and the retry with a smaller chunk size would never run.
  return await loadTokens(CHUNK_SIZE);
} catch (error) {
  // if an error is thrown when loading the tokens, lower the chunk size by half and try again
  // the error may have been caused by too many tokens being found.
  // the _analyze endpoint has a maximum of 10000 tokens.
  return loadTokens(CHUNK_SIZE / 2);
}
|
||||
}
|
||||
|
||||
async function getTokens(examples: string[], analyzer?: any) {
|
||||
const { tokens }: { tokens: Token[] } = await callWithRequest('indices.analyze', {
|
||||
body: {
|
||||
...getAnalyzer(analyzer),
|
||||
text: examples,
|
||||
},
|
||||
});
|
||||
|
||||
const lengths = examples.map(e => e.length);
|
||||
const sumLengths = lengths.map((s => (a: number) => (s += a))(0));
|
||||
|
@ -88,8 +102,7 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
|
|||
}
|
||||
}
|
||||
});
|
||||
|
||||
return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] }));
|
||||
return tokensPerExample;
|
||||
}
|
||||
|
||||
function getAnalyzer(analyzer: any) {
|
||||
|
@ -110,10 +123,10 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
|
|||
end: number,
|
||||
analyzer?: any
|
||||
) {
|
||||
const examples = await categorizationExamples(
|
||||
const resp = await categorizationExamples(
|
||||
indexPatternTitle,
|
||||
query,
|
||||
size * CATEGORY_EXAMPLES_MULTIPLIER,
|
||||
CATEGORY_EXAMPLES_SAMPLE_SIZE,
|
||||
categorizationFieldName,
|
||||
timeField,
|
||||
start,
|
||||
|
@ -121,20 +134,27 @@ export function categorizationExamplesProvider(callWithRequest: callWithRequestT
|
|||
analyzer
|
||||
);
|
||||
|
||||
const sortedExamples = examples
|
||||
const sortedExamples = resp
|
||||
.map((e, i) => ({ ...e, origIndex: i }))
|
||||
.sort((a, b) => b.tokens.length - a.tokens.length);
|
||||
const validExamples = sortedExamples.filter(e => e.tokens.length > 1);
|
||||
const validExamples = sortedExamples.filter(e => e.tokens.length >= VALID_TOKEN_COUNT);
|
||||
const sampleSize = sortedExamples.length;
|
||||
|
||||
const multiple = Math.floor(sampleSize / size) || sampleSize;
|
||||
const filteredExamples = [];
|
||||
let i = 0;
|
||||
while (filteredExamples.length < size && i < sortedExamples.length) {
|
||||
filteredExamples.push(sortedExamples[i]);
|
||||
i += multiple;
|
||||
}
|
||||
const examples = filteredExamples
|
||||
.sort((a, b) => a.origIndex - b.origIndex)
|
||||
.map(e => ({ text: e.text, tokens: e.tokens }));
|
||||
|
||||
return {
|
||||
sampleSize,
|
||||
valid: sortedExamples.length === 0 ? 0 : validExamples.length / sortedExamples.length,
|
||||
examples: sortedExamples
|
||||
.filter(
|
||||
(e, i) =>
|
||||
i / CATEGORY_EXAMPLES_MULTIPLIER - Math.floor(i / CATEGORY_EXAMPLES_MULTIPLIER) === 0
|
||||
)
|
||||
.sort((a, b) => a.origIndex - b.origIndex)
|
||||
.map(e => ({ text: e.text, tokens: e.tokens })),
|
||||
examples,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue