[ML] Fixing categorization wizard example results (#54924) (#55438)

* [ML] Fixing categorization wizard example results

* moving validation results to class

* cleaning up category analyzer types

* small tweaks

* removing commented out code

* fixing string ids

* small refactor

* improving validation messages

* fixing types

* updating message text

* fixing typo

* adding privileges error

* updating privilege message

* changes based on review

* removing old warning message

* fixing translations

* renaming enum
This commit is contained in:
James Gowdy 2020-01-21 19:13:16 +00:00 committed by GitHub
parent c96ca202ac
commit 15e584fa5b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
20 changed files with 763 additions and 420 deletions

View file

@ -26,7 +26,14 @@ export const DEFAULT_QUERY_DELAY = '60s';
export const SHARED_RESULTS_INDEX_NAME = 'shared';
// Categorization
export const NUMBER_OF_CATEGORY_EXAMPLES = 5;
export const CATEGORY_EXAMPLES_SAMPLE_SIZE = 1000;
export const CATEGORY_EXAMPLES_WARNING_LIMIT = 0.75;
export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.02;
/**
 * Overall outcome of validating the sampled values of a categorization field.
 * The members are string-valued, so the status can be passed around as plain text.
 */
export enum CATEGORY_EXAMPLES_VALIDATION_STATUS {
VALID = 'valid',
// NOTE(review): presumably "some checks raised warnings but none failed" — confirm against the validation code
PARTIALLY_VALID = 'partially_valid',
INVALID = 'invalid',
}

View file

@ -4,6 +4,8 @@
* you may not use this file except in compliance with the Elastic License.
*/
import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../constants/new_job';
export type CategoryId = number;
export interface Category {
@ -23,3 +25,30 @@ export interface Token {
type: string;
position: number;
}
/**
 * Analyzer settings used when tokenizing categorization field values.
 * Every property is optional, so an empty object is a valid value.
 */
export interface CategorizationAnalyzer {
// NOTE(review): the element shape of char_filter and filter is not typed here
char_filter?: any[];
tokenizer?: string;
filter?: any[];
// NOTE(review): presumably the name of a predefined analyzer, used instead of a custom tokenizer — confirm
analyzer?: string;
}
/**
 * One sampled value of the categorization field together with the tokens
 * found in it.
 */
export interface CategoryFieldExample {
// the raw field value
text: string;
// tokens extracted from `text`
tokens: Token[];
}
/**
 * Identifies an individual check carried out on the categorization field
 * examples; used as the `id` of a `FieldExampleCheck`.
 *
 * No initializers are given, so the members take the numeric values 0 to 5 in
 * declaration order. Reordering or inserting members changes those values.
 */
export enum VALIDATION_RESULT {
TOKEN_COUNT,
MEDIAN_LINE_LENGTH,
NULL_VALUES,
TOO_MANY_TOKENS,
FAILED_TO_TOKENIZE,
INSUFFICIENT_PRIVILEGES,
}
/**
 * Result of a single validation check on the categorization field examples.
 */
export interface FieldExampleCheck {
// which check this result belongs to
id: VALIDATION_RESULT;
// outcome of this individual check
valid: CATEGORY_EXAMPLES_VALIDATION_STATUS;
// text describing the outcome
message: string;
}

View file

@ -4,7 +4,17 @@
* you may not use this file except in compliance with the Elastic License.
*/
import { renderTemplate } from './string_utils';
import { renderTemplate, getMedianStringLength } from './string_utils';
const strings: string[] = [
'foo',
'foofoofoofoofoo',
'foofoofoo',
'f',
'f',
'foofoofoofoofoofoofoo',
];
const noStrings: string[] = [];
describe('ML - string utils', () => {
describe('renderTemplate', () => {
@ -24,4 +34,16 @@ describe('ML - string utils', () => {
expect(result).toBe('string with 1 replacement, and a 2nd one.');
});
});
// getMedianStringLength returns the length found at index floor(count / 2)
// of the ascending-sorted string lengths, or 0 when there are no strings.
describe('getMedianStringLength', () => {
test('test median for string array', () => {
// the lengths of `strings` sorted ascending are 1, 1, 3, 9, 15, 21;
// index floor(6 / 2) = 3 holds 9
const result = getMedianStringLength(strings);
expect(result).toBe(9);
});
test('test median for no strings', () => {
// an empty input has no middle element, so the fallback of 0 is expected
const result = getMedianStringLength(noStrings);
expect(result).toBe(0);
});
});
});

View file

@ -17,3 +17,8 @@ export function renderTemplate(str: string, data?: Record<string, string>): stri
return str;
}
/**
 * Returns the median length of the supplied strings.
 *
 * The lengths are sorted in ascending order and the one at index
 * `floor(count / 2)` is returned, so for an even number of strings the upper
 * of the two middle lengths is used rather than their average.
 * The input array is not modified. An empty array yields 0.
 */
export function getMedianStringLength(strings: string[]) {
  const lengths: number[] = [];
  for (const str of strings) {
    lengths.push(str.length);
  }
  lengths.sort((first, second) => first - second);

  const middleIndex = Math.floor(lengths.length / 2);
  const median = lengths[middleIndex];
  // no middle element (empty input) falls back to 0
  return median || 0;
}

View file

@ -16,25 +16,31 @@ import {
CREATED_BY_LABEL,
DEFAULT_BUCKET_SPAN,
DEFAULT_RARE_BUCKET_SPAN,
CATEGORY_EXAMPLES_VALIDATION_STATUS,
} from '../../../../../../common/constants/new_job';
import { ML_JOB_AGGREGATION } from '../../../../../../common/constants/aggregation_types';
import {
CategorizationAnalyzer,
CategoryFieldExample,
FieldExampleCheck,
} from '../../../../../../common/types/categories';
import { getRichDetectors } from './util/general';
import { CategorizationExamplesLoader, CategoryExample } from '../results_loader';
import { CategorizationAnalyzer, getNewJobDefaults } from '../../../../services/ml_server_info';
type CategorizationAnalyzerType = CategorizationAnalyzer | null;
import { CategorizationExamplesLoader } from '../results_loader';
import { getNewJobDefaults } from '../../../../services/ml_server_info';
export class CategorizationJobCreator extends JobCreator {
protected _type: JOB_TYPE = JOB_TYPE.CATEGORIZATION;
private _createCountDetector: () => void = () => {};
private _createRareDetector: () => void = () => {};
private _examplesLoader: CategorizationExamplesLoader;
private _categoryFieldExamples: CategoryExample[] = [];
private _categoryFieldValid: number = 0;
private _categoryFieldExamples: CategoryFieldExample[] = [];
private _validationChecks: FieldExampleCheck[] = [];
private _overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS =
CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID;
private _detectorType: ML_JOB_AGGREGATION.COUNT | ML_JOB_AGGREGATION.RARE =
ML_JOB_AGGREGATION.COUNT;
private _categorizationAnalyzer: CategorizationAnalyzerType = null;
private _defaultCategorizationAnalyzer: CategorizationAnalyzerType;
private _categorizationAnalyzer: CategorizationAnalyzer = {};
private _defaultCategorizationAnalyzer: CategorizationAnalyzer;
constructor(
indexPattern: IndexPattern,
@ -46,7 +52,7 @@ export class CategorizationJobCreator extends JobCreator {
this._examplesLoader = new CategorizationExamplesLoader(this, indexPattern, query);
const { anomaly_detectors: anomalyDetectors } = getNewJobDefaults();
this._defaultCategorizationAnalyzer = anomalyDetectors.categorization_analyzer || null;
this._defaultCategorizationAnalyzer = anomalyDetectors.categorization_analyzer || {};
}
public setDefaultDetectorProperties(
@ -93,7 +99,7 @@ export class CategorizationJobCreator extends JobCreator {
} else {
delete this._job_config.analysis_config.categorization_field_name;
this._categoryFieldExamples = [];
this._categoryFieldValid = 0;
this._validationChecks = [];
}
}
@ -102,31 +108,38 @@ export class CategorizationJobCreator extends JobCreator {
}
public async loadCategorizationFieldExamples() {
const { valid, examples, sampleSize } = await this._examplesLoader.loadExamples();
const {
examples,
sampleSize,
overallValidStatus,
validationChecks,
} = await this._examplesLoader.loadExamples();
this._categoryFieldExamples = examples;
this._categoryFieldValid = valid;
return { valid, examples, sampleSize };
this._validationChecks = validationChecks;
this._overallValidStatus = overallValidStatus;
return { examples, sampleSize, overallValidStatus, validationChecks };
}
public get categoryFieldExamples() {
return this._categoryFieldExamples;
}
public get categoryFieldValid() {
return this._categoryFieldValid;
public get validationChecks() {
return this._validationChecks;
}
public get overallValidStatus() {
return this._overallValidStatus;
}
public get selectedDetectorType() {
return this._detectorType;
}
public set categorizationAnalyzer(analyzer: CategorizationAnalyzerType) {
public set categorizationAnalyzer(analyzer: CategorizationAnalyzer) {
this._categorizationAnalyzer = analyzer;
if (
analyzer === null ||
isEqual(this._categorizationAnalyzer, this._defaultCategorizationAnalyzer)
) {
if (isEqual(this._categorizationAnalyzer, this._defaultCategorizationAnalyzer)) {
delete this._job_config.analysis_config.categorization_analyzer;
} else {
this._job_config.analysis_config.categorization_analyzer = analyzer;

View file

@ -16,7 +16,7 @@ import { JobCreator, JobCreatorType, isCategorizationJobCreator } from '../job_c
import { populateValidationMessages, checkForExistingJobAndGroupIds } from './util';
import { ExistingJobsAndGroups } from '../../../../services/job_service';
import { cardinalityValidator, CardinalityValidatorResult } from './validators';
import { CATEGORY_EXAMPLES_ERROR_LIMIT } from '../../../../../../common/constants/new_job';
import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../../../common/constants/new_job';
// delay start of validation to allow the user to make changes
// e.g. if they are typing in a new value, try not to validate
@ -207,7 +207,7 @@ export class JobValidator {
private _runAdvancedValidation() {
if (isCategorizationJobCreator(this._jobCreator)) {
this._advancedValidations.categorizationFieldValid.valid =
this._jobCreator.categoryFieldValid > CATEGORY_EXAMPLES_ERROR_LIMIT;
this._jobCreator.overallValidStatus !== CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID;
}
}

View file

@ -6,15 +6,12 @@
import { IndexPattern } from '../../../../../../../../../../src/plugins/data/public';
import { IndexPatternTitle } from '../../../../../../common/types/kibana';
import { Token } from '../../../../../../common/types/categories';
import { CategorizationJobCreator } from '../job_creator';
import { ml } from '../../../../services/ml_api_service';
import { NUMBER_OF_CATEGORY_EXAMPLES } from '../../../../../../common/constants/new_job';
export interface CategoryExample {
text: string;
tokens: Token[];
}
import {
NUMBER_OF_CATEGORY_EXAMPLES,
CATEGORY_EXAMPLES_VALIDATION_STATUS,
} from '../../../../../../common/constants/new_job';
export class CategorizationExamplesLoader {
private _jobCreator: CategorizationJobCreator;
@ -36,20 +33,22 @@ export class CategorizationExamplesLoader {
const analyzer = this._jobCreator.categorizationAnalyzer;
const categorizationFieldName = this._jobCreator.categorizationFieldName;
if (categorizationFieldName === null) {
return { valid: 0, examples: [], sampleSize: 0 };
return {
examples: [],
sampleSize: 0,
overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID,
validationChecks: [],
};
}
const start = Math.floor(
this._jobCreator.start + (this._jobCreator.end - this._jobCreator.start) / 2
);
const resp = await ml.jobs.categorizationFieldExamples(
this._indexPatternTitle,
this._query,
NUMBER_OF_CATEGORY_EXAMPLES,
categorizationFieldName,
this._timeFieldName,
start,
0,
this._jobCreator.start,
this._jobCreator.end,
analyzer
);
return resp;

View file

@ -5,4 +5,4 @@
*/
export { ResultsLoader, Results, ModelItem, Anomaly } from './results_loader';
export { CategorizationExamplesLoader, CategoryExample } from './categorization_examples_loader';
export { CategorizationExamplesLoader } from './categorization_examples_loader';

View file

@ -9,27 +9,24 @@ import { EuiCallOut, EuiSpacer, EuiCallOutProps } from '@elastic/eui';
import { i18n } from '@kbn/i18n';
import { FormattedMessage } from '@kbn/i18n/react';
import { CategorizationAnalyzer } from '../../../../../../../services/ml_server_info';
import { EditCategorizationAnalyzerFlyout } from '../../../common/edit_categorization_analyzer_flyout';
import {
CATEGORY_EXAMPLES_ERROR_LIMIT,
CATEGORY_EXAMPLES_WARNING_LIMIT,
} from '../../../../../../../../../common/constants/new_job';
type CategorizationAnalyzerType = CategorizationAnalyzer | null;
CategorizationAnalyzer,
FieldExampleCheck,
} from '../../../../../../../../../common/types/categories';
import { EditCategorizationAnalyzerFlyout } from '../../../common/edit_categorization_analyzer_flyout';
import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../../../../../../common/constants/new_job';
interface Props {
examplesValid: number;
sampleSize: number;
categorizationAnalyzer: CategorizationAnalyzerType;
validationChecks: FieldExampleCheck[];
overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS;
categorizationAnalyzer: CategorizationAnalyzer;
}
export const ExamplesValidCallout: FC<Props> = ({
examplesValid,
overallValidStatus,
validationChecks,
categorizationAnalyzer,
sampleSize,
}) => {
const percentageText = <PercentageText examplesValid={examplesValid} sampleSize={sampleSize} />;
const analyzerUsed = <AnalyzerUsed categorizationAnalyzer={categorizationAnalyzer} />;
let color: EuiCallOutProps['color'] = 'success';
@ -40,7 +37,7 @@ export const ExamplesValidCallout: FC<Props> = ({
}
);
if (examplesValid < CATEGORY_EXAMPLES_ERROR_LIMIT) {
if (overallValidStatus === CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID) {
color = 'danger';
title = i18n.translate(
'xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldCalloutTitle.invalid',
@ -48,7 +45,7 @@ export const ExamplesValidCallout: FC<Props> = ({
defaultMessage: 'Selected category field is invalid',
}
);
} else if (examplesValid < CATEGORY_EXAMPLES_WARNING_LIMIT) {
} else if (overallValidStatus === CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID) {
color = 'warning';
title = i18n.translate(
'xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldCalloutTitle.possiblyInvalid',
@ -60,45 +57,24 @@ export const ExamplesValidCallout: FC<Props> = ({
return (
<EuiCallOut color={color} title={title}>
{percentageText}
{validationChecks.map((v, i) => (
<div key={i}>{v.message}</div>
))}
<EuiSpacer size="s" />
{analyzerUsed}
</EuiCallOut>
);
};
const PercentageText: FC<{ examplesValid: number; sampleSize: number }> = ({
examplesValid,
sampleSize,
}) => (
<div>
<FormattedMessage
id="xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldPercentage"
defaultMessage="{number} field {number, plural, zero {value} one {value} other {values}} analyzed, {percentage}% contain valid tokens."
values={{
number: sampleSize,
percentage: Math.floor(examplesValid * 100),
}}
/>
</div>
);
const AnalyzerUsed: FC<{ categorizationAnalyzer: CategorizationAnalyzerType }> = ({
const AnalyzerUsed: FC<{ categorizationAnalyzer: CategorizationAnalyzer }> = ({
categorizationAnalyzer,
}) => {
let analyzer = '';
if (typeof categorizationAnalyzer === null) {
return null;
}
if (typeof categorizationAnalyzer === 'string') {
analyzer = categorizationAnalyzer;
} else {
if (categorizationAnalyzer?.tokenizer !== undefined) {
analyzer = categorizationAnalyzer?.tokenizer!;
} else if (categorizationAnalyzer?.analyzer !== undefined) {
analyzer = categorizationAnalyzer?.analyzer!;
}
if (categorizationAnalyzer?.tokenizer !== undefined) {
analyzer = categorizationAnalyzer.tokenizer;
} else if (categorizationAnalyzer?.analyzer !== undefined) {
analyzer = categorizationAnalyzer.analyzer;
}
return (

View file

@ -7,10 +7,10 @@
import React, { FC } from 'react';
import { i18n } from '@kbn/i18n';
import { EuiBasicTable, EuiText } from '@elastic/eui';
import { CategoryExample } from '../../../../../common/results_loader';
import { CategoryFieldExample } from '../../../../../../../../../common/types/categories';
interface Props {
fieldExamples: CategoryExample[] | null;
fieldExamples: CategoryFieldExample[] | null;
}
const TOKEN_HIGHLIGHT_COLOR = '#b0ccf7';

View file

@ -14,7 +14,11 @@ import { CategorizationField } from '../categorization_field';
import { CategorizationDetector } from '../categorization_detector';
import { FieldExamples } from './field_examples';
import { ExamplesValidCallout } from './examples_valid_callout';
import { CategoryExample } from '../../../../../common/results_loader';
import {
CategoryFieldExample,
FieldExampleCheck,
} from '../../../../../../../../../common/types/categories';
import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../../../../../../common/constants/new_job';
import { LoadingWrapper } from '../../../charts/loading_wrapper';
interface Props {
@ -31,9 +35,11 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
const [categorizationAnalyzerString, setCategorizationAnalyzerString] = useState(
JSON.stringify(jobCreator.categorizationAnalyzer)
);
const [fieldExamples, setFieldExamples] = useState<CategoryExample[] | null>(null);
const [examplesValid, setExamplesValid] = useState(0);
const [sampleSize, setSampleSize] = useState(0);
const [fieldExamples, setFieldExamples] = useState<CategoryFieldExample[] | null>(null);
const [overallValidStatus, setOverallValidStatus] = useState(
CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID
);
const [validationChecks, setValidationChecks] = useState<FieldExampleCheck[]>([]);
const [categorizationFieldName, setCategorizationFieldName] = useState(
jobCreator.categorizationFieldName
@ -73,28 +79,32 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
setLoadingData(true);
try {
const {
valid,
examples,
sampleSize: tempSampleSize,
overallValidStatus: tempOverallValidStatus,
validationChecks: tempValidationChecks,
} = await jobCreator.loadCategorizationFieldExamples();
setFieldExamples(examples);
setExamplesValid(valid);
setOverallValidStatus(tempOverallValidStatus);
setValidationChecks(tempValidationChecks);
setLoadingData(false);
setSampleSize(tempSampleSize);
} catch (error) {
setLoadingData(false);
setFieldExamples(null);
setValidationChecks([]);
setOverallValidStatus(CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID);
mlMessageBarService.notify.error(error);
}
} else {
setFieldExamples(null);
setExamplesValid(0);
setValidationChecks([]);
setOverallValidStatus(CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID);
}
setIsValid(categorizationFieldName !== null);
}
useEffect(() => {
jobCreatorUpdate();
}, [examplesValid]);
}, [overallValidStatus]);
return (
<>
@ -109,8 +119,8 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
{fieldExamples !== null && loadingData === false && (
<>
<ExamplesValidCallout
sampleSize={sampleSize}
examplesValid={examplesValid}
overallValidStatus={overallValidStatus}
validationChecks={validationChecks}
categorizationAnalyzer={jobCreator.categorizationAnalyzer}
/>
<FieldExamples fieldExamples={fieldExamples} />

View file

@ -22,6 +22,12 @@ import { PartitionFieldsDefinition } from '../results_service/result_service_rx'
import { annotations } from './annotations';
import { Calendar, CalendarId, UpdateCalendar } from '../../../../common/types/calendars';
import { CombinedJob, JobId } from '../../jobs/new_job/common/job_creator/configs';
import {
CategorizationAnalyzer,
CategoryFieldExample,
FieldExampleCheck,
} from '../../../../common/types/categories';
import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../common/constants/new_job';
// TODO This is not a complete representation of all methods of `ml.*`.
// It just satisfies needs for other parts of the code area which use
@ -184,8 +190,13 @@ declare interface Ml {
timeField: string | undefined,
start: number,
end: number,
analyzer: any
): Promise<{ valid: number; examples: any[]; sampleSize: number }>;
analyzer: CategorizationAnalyzer
): Promise<{
examples: CategoryFieldExample[];
sampleSize: number;
overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS;
validationChecks: FieldExampleCheck[];
}>;
topCategories(
jobId: string,
count: number

View file

@ -5,6 +5,7 @@
*/
import { ml } from './ml_api_service';
import { CategorizationAnalyzer } from '../../../common/types/categories';
export interface MlServerDefaults {
anomaly_detectors: {
@ -16,13 +17,6 @@ export interface MlServerDefaults {
datafeeds: { scroll_size?: number };
}
export interface CategorizationAnalyzer {
char_filter?: any[];
tokenizer?: string;
filter?: any[];
analyzer?: string;
}
export interface MlServerLimits {
max_model_memory_limit?: string;
}

View file

@ -8,7 +8,11 @@ import { datafeedsProvider } from './datafeeds';
import { jobsProvider } from './jobs';
import { groupsProvider } from './groups';
import { newJobCapsProvider } from './new_job_caps';
import { newJobChartsProvider, categorizationExamplesProvider } from './new_job';
import {
newJobChartsProvider,
categorizationExamplesProvider,
topCategoriesProvider,
} from './new_job';
export function jobServiceProvider(callWithRequest, request) {
return {
@ -18,5 +22,6 @@ export function jobServiceProvider(callWithRequest, request) {
...newJobCapsProvider(callWithRequest, request),
...newJobChartsProvider(callWithRequest, request),
...categorizationExamplesProvider(callWithRequest, request),
...topCategoriesProvider(callWithRequest, request),
};
}

View file

@ -1,314 +0,0 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
import { chunk } from 'lodash';
import { ML_RESULTS_INDEX_PATTERN } from '../../../../common/constants/index_patterns';
import { CATEGORY_EXAMPLES_SAMPLE_SIZE } from '../../../../common/constants/new_job';
import { CategoryId, Category, Token } from '../../../../common/types/categories';
import { callWithRequestType } from '../../../../common/types/kibana';
const VALID_TOKEN_COUNT = 3;
const CHUNK_SIZE = 100;
export function categorizationExamplesProvider(callWithRequest: callWithRequestType) {
/**
 * Loads up to `size` values of the categorization field and tokenizes them.
 *
 * When `timeField` is defined, a range filter (`gte: start`) is added to
 * `query`. Note that the passed `query` object is modified in place and that
 * `end` is accepted but not used by this function.
 *
 * @returns one `{ text, tokens }` entry per non-null field value found
 */
async function categorizationExamples(
  indexPatternTitle: string,
  query: any,
  size: number,
  categorizationFieldName: string,
  timeField: string | undefined,
  start: number,
  end: number,
  analyzer?: any
) {
  if (timeField !== undefined) {
    const range = {
      range: {
        [timeField]: {
          gte: start,
          format: 'epoch_millis',
        },
      },
    };

    if (query.bool === undefined) {
      query.bool = {};
    }
    if (query.bool.filter === undefined) {
      query.bool.filter = range;
    } else if (Array.isArray(query.bool.filter)) {
      query.bool.filter.push(range);
    } else {
      // a single filter clause is already present: keep it and add the range
      // as a second clause rather than writing a nested `range.range` into it
      query.bool.filter = [query.bool.filter, range];
    }
  }

  const results = await callWithRequest('search', {
    index: indexPatternTitle,
    size,
    body: {
      _source: categorizationFieldName,
      query,
    },
  });

  // fall back to an empty list when the response contains no hits section
  const fieldValues: Array<string | null | undefined> =
    results.hits?.hits?.map((doc: any) => doc._source[categorizationFieldName]) ?? [];
  const examples = fieldValues.filter(
    (example): example is string => example !== undefined && example !== null
  );

  async function loadTokens(chunkSize: number) {
    const exampleChunks = chunk(examples, chunkSize);
    const tokensPerChunks = await Promise.all(exampleChunks.map(c => getTokens(c, analyzer)));
    const tokensPerExample = tokensPerChunks.flat();
    return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] }));
  }

  try {
    // the promise must be awaited here, otherwise a rejection bypasses the catch below
    return await loadTokens(CHUNK_SIZE);
  } catch (error) {
    // if an error is thrown when loading the tokens, lower the chunk size by half and try again
    // the error may have been caused by too many tokens being found.
    // the _analyze endpoint has a maximum of 10000 tokens.
    return loadTokens(CHUNK_SIZE / 2);
  }
}
/**
 * Tokenizes a batch of examples with a single `indices.analyze` call and
 * splits the flat token list of the response into one token array per example.
 *
 * Token offsets in the response run across the whole batch; they are rebased
 * here so that they are relative to the start of the example they belong to.
 */
async function getTokens(examples: string[], analyzer?: any) {
  const response: { tokens: Token[] } = await callWithRequest('indices.analyze', {
    body: {
      ...getAnalyzer(analyzer),
      text: examples,
    },
  });

  // cumulativeLengths[n] is the combined length of examples 0..n
  const cumulativeLengths: number[] = [];
  let runningTotal = 0;
  for (const example of examples) {
    runningTotal += example.length;
    cumulativeLengths.push(runningTotal);
  }

  const tokensPerExample: Token[][] = examples.map(() => []);
  for (const token of response.tokens) {
    // the owning example is the first one whose end boundary (cumulative
    // length plus one position per preceding example) is not before the token
    const exampleIndex = cumulativeLengths.findIndex(
      (boundary, index) => token.start_offset <= boundary + index
    );
    if (exampleIndex === -1) {
      continue;
    }

    const offset = exampleIndex > 0 ? cumulativeLengths[exampleIndex - 1] + exampleIndex : 0;
    tokensPerExample[exampleIndex].push({
      ...token,
      start_offset: token.start_offset - offset,
      end_offset: token.end_offset - offset,
    });
  }
  return tokensPerExample;
}
/**
 * Picks the analyzer settings to send with the analyze request.
 *
 * An analyzer object that defines a `tokenizer` is returned unchanged. Any
 * other value (null, undefined, a non-object, or an object without a
 * tokenizer) results in the `standard` analyzer being used.
 */
function getAnalyzer(analyzer: any) {
  // `typeof null` is also 'object', so null has to be excluded explicitly
  // before the tokenizer property is read
  if (typeof analyzer === 'object' && analyzer !== null && analyzer.tokenizer !== undefined) {
    return analyzer;
  }
  return { analyzer: 'standard' };
}
/**
 * Samples the categorization field, works out what fraction of the sampled
 * values contain at least VALID_TOKEN_COUNT tokens and picks up to `size`
 * representative examples.
 *
 * @returns `sampleSize` (number of values sampled), `valid` (fraction of
 * values with enough tokens, 0 when nothing was sampled) and `examples`
 */
async function validateCategoryExamples(
  indexPatternTitle: string,
  query: any,
  size: number,
  categorizationFieldName: string,
  timeField: string | undefined,
  start: number,
  end: number,
  analyzer?: any
) {
  const sampled = await categorizationExamples(
    indexPatternTitle,
    query,
    CATEGORY_EXAMPLES_SAMPLE_SIZE,
    categorizationFieldName,
    timeField,
    start,
    end,
    analyzer
  );

  // order by token count (highest first), remembering each example's
  // position in the search results
  const byTokenCountDesc = sampled
    .map((example, origIndex) => ({ ...example, origIndex }))
    .sort((left, right) => right.tokens.length - left.tokens.length);

  const sampleSize = byTokenCountDesc.length;
  const validCount = byTokenCountDesc.filter(
    example => example.tokens.length >= VALID_TOKEN_COUNT
  ).length;

  // take examples at evenly spaced positions of the ordered list
  const step = Math.floor(sampleSize / size) || sampleSize;
  const picked: typeof byTokenCountDesc = [];
  for (let idx = 0; picked.length < size && idx < sampleSize; idx += step) {
    picked.push(byTokenCountDesc[idx]);
  }

  // restore the original order and drop the bookkeeping property
  picked.sort((left, right) => left.origIndex - right.origIndex);
  const examples = picked.map(({ text, tokens }) => ({ text, tokens }));

  return {
    sampleSize,
    valid: sampleSize === 0 ? 0 : validCount / sampleSize,
    examples,
  };
}
/**
 * Counts the results documents of a job that have a `category_id` field.
 *
 * @returns the total reported by the search, or 0 when the response does not
 * contain one. The value is a plain number, not an object.
 */
async function getTotalCategories(jobId: string): Promise<number> {
  const totalResp = await callWithRequest('search', {
    index: ML_RESULTS_INDEX_PATTERN,
    // only the hit count is needed, not the documents themselves
    size: 0,
    body: {
      query: {
        bool: {
          filter: [
            {
              term: {
                job_id: jobId,
              },
            },
            {
              exists: {
                field: 'category_id',
              },
            },
          ],
        },
      },
    },
  });
  return totalResp?.hits?.total?.value ?? 0;
}
/**
 * Finds the most frequent `by_field_value` values among the job's model plot
 * results that are split by `mlcategory`.
 *
 * @returns up to `numberOfCategories` `{ id, count }` pairs taken from the
 * terms aggregation buckets, or an empty array when there is no aggregation
 * in the response
 */
async function getTopCategoryCounts(jobId: string, numberOfCategories: number) {
  const filter = [
    { term: { job_id: jobId } },
    { term: { result_type: 'model_plot' } },
    { term: { by_field_name: 'mlcategory' } },
  ];
  const aggs = {
    cat_count: {
      terms: {
        field: 'by_field_value',
        size: numberOfCategories,
      },
    },
  };

  const top = await callWithRequest('search', {
    index: ML_RESULTS_INDEX_PATTERN,
    size: 0,
    body: {
      query: { bool: { filter } },
      aggs,
    },
  });

  const catCounts: Array<{
    id: CategoryId;
    count: number;
  }> = top.aggregations?.cat_count?.buckets.map((bucket: any) => ({
    id: bucket.key,
    count: bucket.doc_count,
  }));
  return catCounts || [];
}
/**
 * Loads category documents of a job.
 *
 * When `catIds` is not empty only those categories are requested; otherwise
 * any document with a `category_id` field matches.
 *
 * @returns the `_source` of up to `size` matching documents
 */
async function getCategories(
  jobId: string,
  catIds: CategoryId[],
  size: number
): Promise<Category[]> {
  let categoryFilter: object;
  if (catIds.length) {
    categoryFilter = { terms: { category_id: catIds } };
  } else {
    categoryFilter = { exists: { field: 'category_id' } };
  }

  const jobFilter = { term: { job_id: jobId } };
  const result = await callWithRequest('search', {
    index: ML_RESULTS_INDEX_PATTERN,
    size,
    body: {
      query: {
        bool: {
          filter: [jobFilter, categoryFilter],
        },
      },
    },
  });

  const hits: Array<{ _source: Category }> | undefined = result.hits.hits;
  return hits?.map(hit => hit._source) || [];
}
/**
 * Returns the top categories of a job along with the total category count.
 *
 * When category counts are available each entry holds the count and the
 * matching category (null if it could not be loaded). Without counts, the
 * loaded categories are returned on their own.
 */
async function topCategories(jobId: string, numberOfCategories: number) {
  const catCounts = await getTopCategoryCounts(jobId, numberOfCategories);
  const wantedIds = catCounts.map(c => c.id);
  const categories = await getCategories(
    jobId,
    wantedIds,
    catCounts.length || numberOfCategories
  );

  // index the loaded categories by id for the lookup below
  const catsById: { [id: number]: Category } = {};
  for (const category of categories) {
    catsById[category.category_id] = category;
  }

  const total = await getTotalCategories(jobId);

  if (!catCounts.length) {
    return {
      total,
      categories: categories.map(category => ({ category })),
    };
  }

  return {
    total,
    categories: catCounts.map(({ id, count }) => ({
      count,
      category: catsById[id] ?? null,
    })),
  };
}
return {
categorizationExamples,
validateCategoryExamples,
topCategories,
};
}

View file

@ -0,0 +1,206 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
import { chunk } from 'lodash';
import { SearchResponse } from 'elasticsearch';
import { CATEGORY_EXAMPLES_SAMPLE_SIZE } from '../../../../../common/constants/new_job';
import {
Token,
CategorizationAnalyzer,
CategoryFieldExample,
} from '../../../../../common/types/categories';
import { callWithRequestType } from '../../../../../common/types/kibana';
import { ValidationResults } from './validation_results';
const CHUNK_SIZE = 100;
export function categorizationExamplesProvider(callWithRequest: callWithRequestType) {
const validationResults = new ValidationResults();
/**
 * Loads up to `size` values of the categorization field between `start` and
 * `end` and tokenizes them.
 *
 * When `timeField` is defined, a range filter is added to `query`; note that
 * the passed `query` object is modified in place. Null value and median
 * length checks are recorded on `validationResults` along the way.
 *
 * If tokenizing fails, one retry is made with the first half of the examples
 * and half the chunk size. If that fails too, a "too many tokens" result is
 * recorded and the examples are returned without tokens.
 */
async function categorizationExamples(
  indexPatternTitle: string,
  query: any,
  size: number,
  categorizationFieldName: string,
  timeField: string | undefined,
  start: number,
  end: number,
  analyzer: CategorizationAnalyzer
): Promise<{ examples: CategoryFieldExample[]; error?: any }> {
  if (timeField !== undefined) {
    const range = {
      range: {
        [timeField]: {
          gte: start,
          lt: end,
          format: 'epoch_millis',
        },
      },
    };

    if (query.bool === undefined) {
      query.bool = {};
    }
    if (query.bool.filter === undefined) {
      query.bool.filter = range;
    } else if (Array.isArray(query.bool.filter)) {
      query.bool.filter.push(range);
    } else {
      // a single filter clause is already present: keep it and add the range
      // as a second clause rather than writing a nested `range.range` into it
      query.bool.filter = [query.bool.filter, range];
    }
  }

  const results: SearchResponse<{ [id: string]: string }> = await callWithRequest('search', {
    index: indexPatternTitle,
    size,
    body: {
      _source: categorizationFieldName,
      query,
      sort: ['_doc'],
    },
  });

  const tempExamples = results.hits.hits.map(({ _source }) => _source[categorizationFieldName]);

  // the null check needs the unfiltered values
  validationResults.createNullValueResult(tempExamples);

  const allExamples = tempExamples.filter(
    (example: string | null | undefined) => example !== undefined && example !== null
  );

  validationResults.createMedianMessageLengthResult(allExamples);

  try {
    const examplesWithTokens = await getTokens(CHUNK_SIZE, allExamples, analyzer);
    return { examples: examplesWithTokens };
  } catch (err) {
    // if an error is thrown when loading the tokens, lower the chunk size by half and try again
    // the error may have been caused by too many tokens being found.
    // the _analyze endpoint has a maximum of 10000 tokens.
    const halfExamples = allExamples.slice(0, Math.ceil(allExamples.length / 2));
    const halfChunkSize = CHUNK_SIZE / 2;
    try {
      const examplesWithTokens = await getTokens(halfChunkSize, halfExamples, analyzer);
      return { examples: examplesWithTokens };
    } catch (error) {
      validationResults.createTooManyTokensResult(error, halfChunkSize);
      return { examples: halfExamples.map(e => ({ text: e, tokens: [] })) };
    }
  }
}
/**
 * Tokenizes the examples in batches of `chunkSize`, one request after the
 * other, and pairs each example text with its tokens.
 */
async function getTokens(
  chunkSize: number,
  examples: string[],
  analyzer: CategorizationAnalyzer
): Promise<CategoryFieldExample[]> {
  const tokensPerExample: Token[][] = [];

  // batches are processed sequentially, each one waiting for the previous
  for (const batch of chunk(examples, chunkSize)) {
    const batchTokens = await loadTokens(batch, analyzer);
    tokensPerExample.push(...batchTokens);
  }

  return examples.map((text, index) => ({ text, tokens: tokensPerExample[index] }));
}
/**
 * Tokenizes a batch of examples with a single `indices.analyze` call and
 * splits the flat token list of the response into one token array per example.
 *
 * Token offsets in the response run across the whole batch; they are rebased
 * here so that they are relative to the start of the example they belong to.
 */
async function loadTokens(examples: string[], analyzer: CategorizationAnalyzer) {
  const response: { tokens: Token[] } = await callWithRequest('indices.analyze', {
    body: {
      ...getAnalyzer(analyzer),
      text: examples,
    },
  });

  // cumulativeLengths[n] is the combined length of examples 0..n
  const cumulativeLengths: number[] = [];
  let runningTotal = 0;
  for (const example of examples) {
    runningTotal += example.length;
    cumulativeLengths.push(runningTotal);
  }

  const tokensPerExample: Token[][] = examples.map(() => []);
  for (const token of response.tokens) {
    // the owning example is the first one whose end boundary (cumulative
    // length plus one position per preceding example) is not before the token
    const exampleIndex = cumulativeLengths.findIndex(
      (boundary, index) => token.start_offset <= boundary + index
    );
    if (exampleIndex === -1) {
      continue;
    }

    const offset = exampleIndex > 0 ? cumulativeLengths[exampleIndex - 1] + exampleIndex : 0;
    tokensPerExample[exampleIndex].push({
      ...token,
      start_offset: token.start_offset - offset,
      end_offset: token.end_offset - offset,
    });
  }
  return tokensPerExample;
}
/**
 * Picks the analyzer settings to send with the analyze request.
 *
 * An analyzer object that defines a `tokenizer` is returned unchanged. Any
 * other value (including null, which can still arrive at runtime, or an
 * object without a tokenizer) results in the `standard` analyzer being used.
 */
function getAnalyzer(analyzer: CategorizationAnalyzer) {
  // `typeof null` is also 'object', so null has to be excluded explicitly
  // before the tokenizer property is read
  if (typeof analyzer === 'object' && analyzer !== null && analyzer.tokenizer !== undefined) {
    return analyzer;
  }
  return { analyzer: 'standard' };
}
/**
 * Samples the categorization field, records the token count check and picks
 * up to `size` representative examples.
 *
 * @returns the overall status and individual checks gathered on
 * `validationResults`, the number of values sampled and the chosen examples
 */
async function validateCategoryExamples(
  indexPatternTitle: string,
  query: any,
  size: number,
  categorizationFieldName: string,
  timeField: string | undefined,
  start: number,
  end: number,
  analyzer: CategorizationAnalyzer
) {
  const { examples } = await categorizationExamples(
    indexPatternTitle,
    query,
    CATEGORY_EXAMPLES_SAMPLE_SIZE,
    categorizationFieldName,
    timeField,
    start,
    end,
    analyzer
  );

  const sampleSize = examples.length;
  validationResults.createTokenCountResult(examples, sampleSize);

  // order by token count (highest first), remembering each example's
  // position in the search results
  const byTokenCountDesc = examples
    .map((example, origIndex) => ({ ...example, origIndex }))
    .sort((left, right) => right.tokens.length - left.tokens.length);

  // only 'size' (e.g. 5) examples are wanted, so take them at evenly
  // spaced positions of the ordered list
  const step = Math.floor(sampleSize / size) || sampleSize;
  const picked: typeof byTokenCountDesc = [];
  for (let idx = 0; picked.length < size && idx < sampleSize; idx += step) {
    picked.push(byTokenCountDesc[idx]);
  }

  // restore the original order and drop the bookkeeping property
  picked.sort((left, right) => left.origIndex - right.origIndex);
  const processedExamples = picked.map(({ text, tokens }) => ({ text, tokens }));

  const overallValidStatus = validationResults.overallResult;
  const validationChecks = validationResults.results;
  return {
    overallValidStatus,
    validationChecks,
    sampleSize,
    examples: processedExamples,
  };
}
return {
validateCategoryExamples,
};
}

View file

@ -0,0 +1,8 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
export { categorizationExamplesProvider } from './examples';
export { topCategoriesProvider } from './top_categories';

View file

@ -0,0 +1,164 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
import { SearchResponse } from 'elasticsearch';
import { ML_RESULTS_INDEX_PATTERN } from '../../../../../common/constants/index_patterns';
import { CategoryId, Category } from '../../../../../common/types/categories';
import { callWithRequestType } from '../../../../../common/types/kibana';
export function topCategoriesProvider(callWithRequest: callWithRequestType) {
/**
 * Counts the documents in the ML results indices which belong to the given
 * job and contain a `category_id` field.
 *
 * The search is run with `size: 0`; only the hit total of the response is
 * read.
 *
 * @param jobId - id of the job to count category documents for
 * @returns `hits.total.value` of the response, or 0 when it is missing
 */
async function getTotalCategories(jobId: string): Promise<number> {
  const totalResp = await callWithRequest('search', {
    index: ML_RESULTS_INDEX_PATTERN,
    size: 0,
    body: {
      query: {
        bool: {
          filter: [
            {
              term: {
                job_id: jobId,
              },
            },
            {
              exists: {
                field: 'category_id',
              },
            },
          ],
        },
      },
    },
  });
  // A bare number is returned here (not an object wrapping it), which is
  // what the declared return type now states.
  return totalResp?.hits?.total?.value ?? 0;
}
/**
 * Runs a terms aggregation on `by_field_value` over the `model_plot`
 * results of the given job whose `by_field_name` is `mlcategory`.
 *
 * @param jobId - id of the job to query
 * @param numberOfCategories - `size` passed to the terms aggregation
 * @returns one `{ id, count }` entry per bucket (bucket key and doc count),
 *          or an empty array when the response holds no aggregation
 */
async function getTopCategoryCounts(jobId: string, numberOfCategories: number) {
  const filter = [
    { term: { job_id: jobId } },
    { term: { result_type: 'model_plot' } },
    { term: { by_field_name: 'mlcategory' } },
  ];
  const aggs = {
    cat_count: {
      terms: { field: 'by_field_value', size: numberOfCategories },
    },
  };

  const top: SearchResponse<any> = await callWithRequest('search', {
    index: ML_RESULTS_INDEX_PATTERN,
    size: 0,
    body: {
      query: { bool: { filter } },
      aggs,
    },
  });

  const counts: Array<{
    id: CategoryId;
    count: number;
  }> = top.aggregations?.cat_count?.buckets.map((bucket: any) => {
    return { id: bucket.key, count: bucket.doc_count };
  });

  return counts || [];
}
/**
 * Loads category documents of a job from the ML results indices.
 *
 * When `catIds` is not empty only documents with one of those category ids
 * are matched; otherwise every document that has a `category_id` field is.
 *
 * @param jobId - id of the job to query
 * @param catIds - category ids to restrict the search to (may be empty)
 * @param size - maximum number of hits to request
 * @returns the `_source` of every hit, or an empty array when there are none
 */
async function getCategories(
  jobId: string,
  catIds: CategoryId[],
  size: number
): Promise<Category[]> {
  const jobFilter = { term: { job_id: jobId } };
  const categoryFilter =
    catIds.length === 0
      ? { exists: { field: 'category_id' } }
      : { terms: { category_id: catIds } };

  const response: SearchResponse<any> = await callWithRequest('search', {
    index: ML_RESULTS_INDEX_PATTERN,
    size,
    body: {
      query: {
        bool: {
          filter: [jobFilter, categoryFilter],
        },
      },
    },
  });

  const sources = response.hits.hits?.map((hit: { _source: Category }) => hit._source);
  return sources || [];
}
/**
 * Combines the category counts, the category documents and the total
 * number of categories of a job into a single result.
 *
 * When counts are available each entry holds the count plus the matching
 * category (or null if that category was not loaded); when there are no
 * counts the loaded categories are returned without a count.
 *
 * @param jobId - id of the job to query
 * @param numberOfCategories - how many categories to ask for
 */
async function topCategories(jobId: string, numberOfCategories: number) {
  const catCounts = await getTopCategoryCounts(jobId, numberOfCategories);
  const countedIds = catCounts.map(({ id }) => id);
  const categories = await getCategories(
    jobId,
    countedIds,
    catCounts.length || numberOfCategories
  );

  // Plain object lookup: property keys are coerced to strings, so an id
  // matches regardless of whether it arrives as a number or numeric string.
  const catsById: { [id: number]: Category } = {};
  categories.forEach(category => {
    catsById[category.category_id] = category;
  });

  const total = await getTotalCategories(jobId);

  if (!catCounts.length) {
    return {
      total,
      categories: categories.map(category => ({ category })),
    };
  }

  return {
    total,
    categories: catCounts.map(({ id, count }) => ({
      count,
      category: catsById[id] ?? null,
    })),
  };
}
return {
topCategories,
};
}

View file

@ -0,0 +1,208 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
import { i18n } from '@kbn/i18n';
import {
CATEGORY_EXAMPLES_VALIDATION_STATUS,
CATEGORY_EXAMPLES_ERROR_LIMIT,
CATEGORY_EXAMPLES_WARNING_LIMIT,
} from '../../../../../common/constants/new_job';
import {
FieldExampleCheck,
CategoryFieldExample,
VALIDATION_RESULT,
} from '../../../../../common/types/categories';
import { getMedianStringLength } from '../../../../../common/util/string_utils';
// Minimum number of tokens an example must have to be counted as valid
// by createTokenCountResult.
const VALID_TOKEN_COUNT = 3;
// Median example length (in characters) above which
// createMedianMessageLengthResult records a PARTIALLY_VALID result.
const MEDIAN_LINE_LENGTH_LIMIT = 400;
// Fraction of null examples at or above which createNullValueResult
// records a PARTIALLY_VALID result.
const NULL_COUNT_PERCENT_LIMIT = 0.75;
/**
 * Collects the results of the checks which are run against example values
 * of a categorization field and derives an overall status from them.
 */
export class ValidationResults {
  private _results: FieldExampleCheck[] = [];

  /** Every check result recorded so far. */
  public get results() {
    return this._results;
  }

  /**
   * The most severe status among the recorded results: INVALID if any result
   * is invalid, otherwise PARTIALLY_VALID if any result is partially valid,
   * otherwise VALID.
   */
  public get overallResult() {
    if (this._results.some(c => c.valid === CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID)) {
      return CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID;
    }
    if (this._results.some(c => c.valid === CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID)) {
      return CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID;
    }
    return CATEGORY_EXAMPLES_VALIDATION_STATUS.VALID;
  }

  /** Whether a result with the given id has already been recorded. */
  private _resultExists(id: VALIDATION_RESULT) {
    return this._results.some(r => r.id === id);
  }

  /**
   * Records how many of the examples contain at least VALID_TOKEN_COUNT
   * tokens. The result is placed at the front of the list.
   *
   * Nothing is added when a privileges error has been recorded, and the
   * token count itself is omitted when tokenization has already been
   * reported as failed. An empty example list records a "no examples"
   * result instead.
   *
   * @param examples - tokenized examples
   * @param sampleSize - number of values the percentage is based on
   */
  public createTokenCountResult(examples: CategoryFieldExample[], sampleSize: number) {
    if (examples.length === 0) {
      this.createNoExamplesResult();
      return;
    }

    if (this._resultExists(VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES) === true) {
      // if tokenizing has failed due to insufficient privileges, don't show
      // the message about token count
      return;
    }

    const validExamplesSize = examples.filter(e => e.tokens.length >= VALID_TOKEN_COUNT).length;
    const percentValid = sampleSize === 0 ? 0 : validExamplesSize / sampleSize;

    let valid = CATEGORY_EXAMPLES_VALIDATION_STATUS.VALID;
    if (percentValid < CATEGORY_EXAMPLES_ERROR_LIMIT) {
      valid = CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID;
    } else if (percentValid < CATEGORY_EXAMPLES_WARNING_LIMIT) {
      valid = CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID;
    }

    const message = i18n.translate(
      'xpack.ml.models.jobService.categorization.messages.tokenLengthValidation',
      {
        defaultMessage:
          '{number} field {number, plural, zero {value} one {value} other {values}} analyzed, {percentage}% contain {validTokenCount} or more tokens.',
        values: {
          number: sampleSize,
          percentage: Math.floor(percentValid * 100),
          validTokenCount: VALID_TOKEN_COUNT,
        },
      }
    );

    // a token count is meaningless once tokenization has been reported as failed
    if (
      this._resultExists(VALIDATION_RESULT.TOO_MANY_TOKENS) === false &&
      this._resultExists(VALIDATION_RESULT.FAILED_TO_TOKENIZE) === false
    ) {
      this._results.unshift({
        id: VALIDATION_RESULT.TOKEN_COUNT,
        valid,
        message,
      });
    }
  }

  /**
   * Records a PARTIALLY_VALID result when the median length of the examples
   * is greater than MEDIAN_LINE_LENGTH_LIMIT.
   */
  public createMedianMessageLengthResult(examples: string[]) {
    const median = getMedianStringLength(examples);

    if (median > MEDIAN_LINE_LENGTH_LIMIT) {
      this._results.push({
        id: VALIDATION_RESULT.MEDIAN_LINE_LENGTH,
        valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
        message: i18n.translate(
          'xpack.ml.models.jobService.categorization.messages.medianLineLength',
          {
            defaultMessage:
              'The median length for the field values analyzed is over {medianLimit} characters.',
            values: { medianLimit: MEDIAN_LINE_LENGTH_LIMIT },
          }
        ),
      });
    }
  }

  /** Records a PARTIALLY_VALID result stating that no examples were found. */
  public createNoExamplesResult() {
    this._results.push({
      id: VALIDATION_RESULT.NULL_VALUES,
      valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
      message: i18n.translate('xpack.ml.models.jobService.categorization.messages.noDataFound', {
        defaultMessage:
          'No examples for this field could be found. Please ensure the selected date range contains data.',
      }),
    });
  }

  /**
   * Records a PARTIALLY_VALID result when the share of `null` entries in
   * the examples reaches NULL_COUNT_PERCENT_LIMIT.
   */
  public createNullValueResult(examples: Array<string | null | undefined>) {
    const nullCount = examples.filter(e => e === null).length;

    if (nullCount / examples.length >= NULL_COUNT_PERCENT_LIMIT) {
      this._results.push({
        id: VALIDATION_RESULT.NULL_VALUES,
        valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
        message: i18n.translate('xpack.ml.models.jobService.categorization.messages.nullValues', {
          defaultMessage: 'More than {percent}% of field values are null.',
          values: { percent: NULL_COUNT_PERCENT_LIMIT * 100 },
        }),
      });
    }
  }

  /**
   * Turns an error raised while tokenizing the examples into a result.
   *
   * - status code 403 is reported as a privileges problem
   * - a message matching the "exceeded the allowed maximum" pattern is
   *   reported as TOO_MANY_TOKENS, including the limit found in the message
   * - anything else is reported as a general failure to tokenize
   *
   * @param error - the error which was caught; `statusCode` and `message` are read
   * @param sampleSize - number of values which were being tokenized
   */
  public createTooManyTokensResult(error: any, sampleSize: number) {
    // expecting error message:
    // The number of tokens produced by calling _analyze has exceeded the allowed maximum of [10000].
    // This limit can be set by changing the [index.analyze.max_token_count] index level setting.

    if (error.statusCode === 403) {
      this.createPrivilegesErrorResult(error);
      return;
    }

    const message: string = error.message;
    if (message) {
      const rxp = /exceeded the allowed maximum of \[(\d+?)\]/;
      const match = rxp.exec(message);
      if (match?.length === 2) {
        const tokenLimit = match[1];
        this._results.push({
          id: VALIDATION_RESULT.TOO_MANY_TOKENS,
          valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID,
          message: i18n.translate(
            'xpack.ml.models.jobService.categorization.messages.tooManyTokens',
            {
              defaultMessage:
                'Tokenization of field value examples has failed due to more than {tokenLimit} tokens being found in a sample of {sampleSize} values.',
              values: { sampleSize, tokenLimit },
            }
          ),
        });
        return;
      }
    }

    // Any error which is not the "too many tokens" case ends up here, so a
    // failure with an unrecognised message is still reported (together with
    // that message) rather than being dropped without a trace.
    this.createFailureToTokenize(message);
  }

  /**
   * Records two PARTIALLY_VALID results for a privileges error: a general
   * explanation followed by the message taken from the error itself.
   * Nothing is recorded when the error has no message.
   */
  public createPrivilegesErrorResult(error: any) {
    const message: string = error.message;
    if (message) {
      this._results.push({
        id: VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES,
        valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
        message: i18n.translate(
          'xpack.ml.models.jobService.categorization.messages.insufficientPrivileges',
          {
            defaultMessage:
              'Tokenization of field value examples could not be performed due to insufficient privileges. Field values cannot therefore be checked to see if they are appropriate for use in a categorization job.',
          }
        ),
      });
      this._results.push({
        id: VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES,
        valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
        message,
      });
      return;
    }
  }

  /**
   * Records an INVALID result stating that the examples could not be
   * tokenized.
   *
   * @param message - detail appended to the result text; empty when missing
   */
  public createFailureToTokenize(message: string | undefined) {
    this._results.push({
      id: VALIDATION_RESULT.FAILED_TO_TOKENIZE,
      valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID,
      message: i18n.translate(
        'xpack.ml.models.jobService.categorization.messages.failureToGetTokens',
        {
          defaultMessage:
            'It was not possible to tokenize a sample of example field values. {message}',
          values: { message: message || '' },
        }
      ),
    });
  }
}

View file

@ -5,4 +5,4 @@
*/
export { newJobChartsProvider } from './charts';
export { categorizationExamplesProvider } from './categorization';
export { categorizationExamplesProvider, topCategoriesProvider } from './categorization';