Mirror of https://github.com/elastic/kibana.git, synced 2025-04-24 01:38:56 -04:00
[ML] Fixing categorization wizard example results

* moving validation results to class
* cleaning up category analyzer types
* small tweaks
* removing commented out code
* fixing string ids
* small refactor
* improving validation messages
* fixing types
* updating message text
* fixing typo
* adding privileges error
* updating privilege message
* changes based on review
* removing old warning message
* fixing translations
* renaming enum
This commit is contained in:
parent c96ca202ac
commit 15e584fa5b

20 changed files with 763 additions and 420 deletions
@@ -26,7 +26,14 @@ export const DEFAULT_QUERY_DELAY = '60s';
export const SHARED_RESULTS_INDEX_NAME = 'shared';

// Categorization
export const NUMBER_OF_CATEGORY_EXAMPLES = 5;
export const CATEGORY_EXAMPLES_SAMPLE_SIZE = 1000;
export const CATEGORY_EXAMPLES_WARNING_LIMIT = 0.75;
export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.02;

export enum CATEGORY_EXAMPLES_VALIDATION_STATUS {
  VALID = 'valid',
  PARTIALLY_VALID = 'partially_valid',
  INVALID = 'invalid',
}
@@ -4,6 +4,8 @@
 * you may not use this file except in compliance with the Elastic License.
 */

import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../constants/new_job';

export type CategoryId = number;

export interface Category {

@@ -23,3 +25,30 @@ export interface Token {
  type: string;
  position: number;
}

export interface CategorizationAnalyzer {
  char_filter?: any[];
  tokenizer?: string;
  filter?: any[];
  analyzer?: string;
}

export interface CategoryFieldExample {
  text: string;
  tokens: Token[];
}

export enum VALIDATION_RESULT {
  TOKEN_COUNT,
  MEDIAN_LINE_LENGTH,
  NULL_VALUES,
  TOO_MANY_TOKENS,
  FAILED_TO_TOKENIZE,
  INSUFFICIENT_PRIVILEGES,
}

export interface FieldExampleCheck {
  id: VALIDATION_RESULT;
  valid: CATEGORY_EXAMPLES_VALIDATION_STATUS;
  message: string;
}
@@ -4,7 +4,17 @@
 * you may not use this file except in compliance with the Elastic License.
 */

import { renderTemplate } from './string_utils';
import { renderTemplate, getMedianStringLength } from './string_utils';

const strings: string[] = [
  'foo',
  'foofoofoofoofoo',
  'foofoofoo',
  'f',
  'f',
  'foofoofoofoofoofoofoo',
];
const noStrings: string[] = [];

describe('ML - string utils', () => {
  describe('renderTemplate', () => {

@@ -24,4 +34,16 @@ describe('ML - string utils', () => {
    expect(result).toBe('string with 1 replacement, and a 2nd one.');
  });
});

  describe('getMedianStringLength', () => {
    test('test median for string array', () => {
      const result = getMedianStringLength(strings);
      expect(result).toBe(9);
    });

    test('test median for no strings', () => {
      const result = getMedianStringLength(noStrings);
      expect(result).toBe(0);
    });
  });
});
@@ -17,3 +17,8 @@ export function renderTemplate(str: string, data?: Record<string, string>): stri

  return str;
}

export function getMedianStringLength(strings: string[]) {
  const sortedStringLengths = strings.map(s => s.length).sort((a, b) => a - b);
  return sortedStringLengths[Math.floor(sortedStringLengths.length / 2)] || 0;
}
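The new getMedianStringLength helper sorts the string lengths and picks the value at the middle index, so for an even number of strings it returns the upper of the two middle lengths — which is why the test above expects 9 for six strings whose sorted lengths are 1, 1, 3, 9, 15 and 21. A hypothetical usage sketch (not part of the commit):

    import { getMedianStringLength } from './string_utils';

    // lengths sorted: [1, 2, 4] -> middle index 1 -> 2
    console.log(getMedianStringLength(['a', 'bb', 'cccc'])); // 2
    // an empty array falls through the `|| 0` guard
    console.log(getMedianStringLength([])); // 0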
@@ -16,25 +16,31 @@ import {
  CREATED_BY_LABEL,
  DEFAULT_BUCKET_SPAN,
  DEFAULT_RARE_BUCKET_SPAN,
  CATEGORY_EXAMPLES_VALIDATION_STATUS,
} from '../../../../../../common/constants/new_job';
import { ML_JOB_AGGREGATION } from '../../../../../../common/constants/aggregation_types';
import {
  CategorizationAnalyzer,
  CategoryFieldExample,
  FieldExampleCheck,
} from '../../../../../../common/types/categories';
import { getRichDetectors } from './util/general';
import { CategorizationExamplesLoader, CategoryExample } from '../results_loader';
import { CategorizationAnalyzer, getNewJobDefaults } from '../../../../services/ml_server_info';

type CategorizationAnalyzerType = CategorizationAnalyzer | null;
import { CategorizationExamplesLoader } from '../results_loader';
import { getNewJobDefaults } from '../../../../services/ml_server_info';

export class CategorizationJobCreator extends JobCreator {
  protected _type: JOB_TYPE = JOB_TYPE.CATEGORIZATION;
  private _createCountDetector: () => void = () => {};
  private _createRareDetector: () => void = () => {};
  private _examplesLoader: CategorizationExamplesLoader;
  private _categoryFieldExamples: CategoryExample[] = [];
  private _categoryFieldValid: number = 0;
  private _categoryFieldExamples: CategoryFieldExample[] = [];
  private _validationChecks: FieldExampleCheck[] = [];
  private _overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS =
    CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID;
  private _detectorType: ML_JOB_AGGREGATION.COUNT | ML_JOB_AGGREGATION.RARE =
    ML_JOB_AGGREGATION.COUNT;
  private _categorizationAnalyzer: CategorizationAnalyzerType = null;
  private _defaultCategorizationAnalyzer: CategorizationAnalyzerType;
  private _categorizationAnalyzer: CategorizationAnalyzer = {};
  private _defaultCategorizationAnalyzer: CategorizationAnalyzer;

  constructor(
    indexPattern: IndexPattern,

@@ -46,7 +52,7 @@ export class CategorizationJobCreator extends JobCreator {
    this._examplesLoader = new CategorizationExamplesLoader(this, indexPattern, query);

    const { anomaly_detectors: anomalyDetectors } = getNewJobDefaults();
    this._defaultCategorizationAnalyzer = anomalyDetectors.categorization_analyzer || null;
    this._defaultCategorizationAnalyzer = anomalyDetectors.categorization_analyzer || {};
  }

  public setDefaultDetectorProperties(

@@ -93,7 +99,7 @@ export class CategorizationJobCreator extends JobCreator {
    } else {
      delete this._job_config.analysis_config.categorization_field_name;
      this._categoryFieldExamples = [];
      this._categoryFieldValid = 0;
      this._validationChecks = [];
    }
  }

@@ -102,31 +108,38 @@ export class CategorizationJobCreator extends JobCreator {
  }

  public async loadCategorizationFieldExamples() {
    const { valid, examples, sampleSize } = await this._examplesLoader.loadExamples();
    const {
      examples,
      sampleSize,
      overallValidStatus,
      validationChecks,
    } = await this._examplesLoader.loadExamples();
    this._categoryFieldExamples = examples;
    this._categoryFieldValid = valid;
    return { valid, examples, sampleSize };
    this._validationChecks = validationChecks;
    this._overallValidStatus = overallValidStatus;
    return { examples, sampleSize, overallValidStatus, validationChecks };
  }

  public get categoryFieldExamples() {
    return this._categoryFieldExamples;
  }

  public get categoryFieldValid() {
    return this._categoryFieldValid;
  public get validationChecks() {
    return this._validationChecks;
  }

  public get overallValidStatus() {
    return this._overallValidStatus;
  }

  public get selectedDetectorType() {
    return this._detectorType;
  }

  public set categorizationAnalyzer(analyzer: CategorizationAnalyzerType) {
  public set categorizationAnalyzer(analyzer: CategorizationAnalyzer) {
    this._categorizationAnalyzer = analyzer;

    if (
      analyzer === null ||
      isEqual(this._categorizationAnalyzer, this._defaultCategorizationAnalyzer)
    ) {
    if (isEqual(this._categorizationAnalyzer, this._defaultCategorizationAnalyzer)) {
      delete this._job_config.analysis_config.categorization_analyzer;
    } else {
      this._job_config.analysis_config.categorization_analyzer = analyzer;
@@ -16,7 +16,7 @@ import { JobCreator, JobCreatorType, isCategorizationJobCreator } from '../job_c
import { populateValidationMessages, checkForExistingJobAndGroupIds } from './util';
import { ExistingJobsAndGroups } from '../../../../services/job_service';
import { cardinalityValidator, CardinalityValidatorResult } from './validators';
import { CATEGORY_EXAMPLES_ERROR_LIMIT } from '../../../../../../common/constants/new_job';
import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../../../common/constants/new_job';

// delay start of validation to allow the user to make changes
// e.g. if they are typing in a new value, try not to validate

@@ -207,7 +207,7 @@ export class JobValidator {
  private _runAdvancedValidation() {
    if (isCategorizationJobCreator(this._jobCreator)) {
      this._advancedValidations.categorizationFieldValid.valid =
        this._jobCreator.categoryFieldValid > CATEGORY_EXAMPLES_ERROR_LIMIT;
        this._jobCreator.overallValidStatus !== CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID;
    }
  }
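The comment above refers to deferring validation while the user is still editing a value. A minimal sketch of that debouncing idea, assuming a plain setTimeout-based delay; the constant name and helper below are hypothetical and not the JobValidator's actual mechanism:

    const VALIDATION_DELAY_MS = 500; // hypothetical delay before validation runs

    let validationTimeout: ReturnType<typeof setTimeout> | null = null;

    function requestValidation(runValidation: () => void) {
      // restart the countdown on every change so validation only fires
      // once the user has stopped typing for VALIDATION_DELAY_MS
      if (validationTimeout !== null) {
        clearTimeout(validationTimeout);
      }
      validationTimeout = setTimeout(runValidation, VALIDATION_DELAY_MS);
    }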
@@ -6,15 +6,12 @@

import { IndexPattern } from '../../../../../../../../../../src/plugins/data/public';
import { IndexPatternTitle } from '../../../../../../common/types/kibana';
import { Token } from '../../../../../../common/types/categories';
import { CategorizationJobCreator } from '../job_creator';
import { ml } from '../../../../services/ml_api_service';
import { NUMBER_OF_CATEGORY_EXAMPLES } from '../../../../../../common/constants/new_job';

export interface CategoryExample {
  text: string;
  tokens: Token[];
}
import {
  NUMBER_OF_CATEGORY_EXAMPLES,
  CATEGORY_EXAMPLES_VALIDATION_STATUS,
} from '../../../../../../common/constants/new_job';

export class CategorizationExamplesLoader {
  private _jobCreator: CategorizationJobCreator;

@@ -36,20 +33,22 @@ export class CategorizationExamplesLoader {
    const analyzer = this._jobCreator.categorizationAnalyzer;
    const categorizationFieldName = this._jobCreator.categorizationFieldName;
    if (categorizationFieldName === null) {
      return { valid: 0, examples: [], sampleSize: 0 };
      return {
        examples: [],
        sampleSize: 0,
        overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID,
        validationChecks: [],
      };
    }

    const start = Math.floor(
      this._jobCreator.start + (this._jobCreator.end - this._jobCreator.start) / 2
    );
    const resp = await ml.jobs.categorizationFieldExamples(
      this._indexPatternTitle,
      this._query,
      NUMBER_OF_CATEGORY_EXAMPLES,
      categorizationFieldName,
      this._timeFieldName,
      start,
      0,
      this._jobCreator.start,
      this._jobCreator.end,
      analyzer
    );
    return resp;
@@ -5,4 +5,4 @@
 */

export { ResultsLoader, Results, ModelItem, Anomaly } from './results_loader';
export { CategorizationExamplesLoader, CategoryExample } from './categorization_examples_loader';
export { CategorizationExamplesLoader } from './categorization_examples_loader';
@@ -9,27 +9,24 @@ import { EuiCallOut, EuiSpacer, EuiCallOutProps } from '@elastic/eui';
import { i18n } from '@kbn/i18n';
import { FormattedMessage } from '@kbn/i18n/react';

import { CategorizationAnalyzer } from '../../../../../../../services/ml_server_info';
import { EditCategorizationAnalyzerFlyout } from '../../../common/edit_categorization_analyzer_flyout';
import {
  CATEGORY_EXAMPLES_ERROR_LIMIT,
  CATEGORY_EXAMPLES_WARNING_LIMIT,
} from '../../../../../../../../../common/constants/new_job';

type CategorizationAnalyzerType = CategorizationAnalyzer | null;
  CategorizationAnalyzer,
  FieldExampleCheck,
} from '../../../../../../../../../common/types/categories';
import { EditCategorizationAnalyzerFlyout } from '../../../common/edit_categorization_analyzer_flyout';
import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../../../../../../common/constants/new_job';

interface Props {
  examplesValid: number;
  sampleSize: number;
  categorizationAnalyzer: CategorizationAnalyzerType;
  validationChecks: FieldExampleCheck[];
  overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS;
  categorizationAnalyzer: CategorizationAnalyzer;
}

export const ExamplesValidCallout: FC<Props> = ({
  examplesValid,
  overallValidStatus,
  validationChecks,
  categorizationAnalyzer,
  sampleSize,
}) => {
  const percentageText = <PercentageText examplesValid={examplesValid} sampleSize={sampleSize} />;
  const analyzerUsed = <AnalyzerUsed categorizationAnalyzer={categorizationAnalyzer} />;

  let color: EuiCallOutProps['color'] = 'success';

@@ -40,7 +37,7 @@ export const ExamplesValidCallout: FC<Props> = ({
    }
  );

  if (examplesValid < CATEGORY_EXAMPLES_ERROR_LIMIT) {
  if (overallValidStatus === CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID) {
    color = 'danger';
    title = i18n.translate(
      'xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldCalloutTitle.invalid',

@@ -48,7 +45,7 @@ export const ExamplesValidCallout: FC<Props> = ({
        defaultMessage: 'Selected category field is invalid',
      }
    );
  } else if (examplesValid < CATEGORY_EXAMPLES_WARNING_LIMIT) {
  } else if (overallValidStatus === CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID) {
    color = 'warning';
    title = i18n.translate(
      'xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldCalloutTitle.possiblyInvalid',

@@ -60,45 +57,24 @@ export const ExamplesValidCallout: FC<Props> = ({

  return (
    <EuiCallOut color={color} title={title}>
      {percentageText}
      {validationChecks.map((v, i) => (
        <div key={i}>{v.message}</div>
      ))}
      <EuiSpacer size="s" />
      {analyzerUsed}
    </EuiCallOut>
  );
};

const PercentageText: FC<{ examplesValid: number; sampleSize: number }> = ({
  examplesValid,
  sampleSize,
}) => (
  <div>
    <FormattedMessage
      id="xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldPercentage"
      defaultMessage="{number} field {number, plural, zero {value} one {value} other {values}} analyzed, {percentage}% contain valid tokens."
      values={{
        number: sampleSize,
        percentage: Math.floor(examplesValid * 100),
      }}
    />
  </div>
);

const AnalyzerUsed: FC<{ categorizationAnalyzer: CategorizationAnalyzerType }> = ({
const AnalyzerUsed: FC<{ categorizationAnalyzer: CategorizationAnalyzer }> = ({
  categorizationAnalyzer,
}) => {
  let analyzer = '';
  if (typeof categorizationAnalyzer === null) {
    return null;
  }

  if (typeof categorizationAnalyzer === 'string') {
    analyzer = categorizationAnalyzer;
  } else {
    if (categorizationAnalyzer?.tokenizer !== undefined) {
      analyzer = categorizationAnalyzer?.tokenizer!;
    } else if (categorizationAnalyzer?.analyzer !== undefined) {
      analyzer = categorizationAnalyzer?.analyzer!;
    }
    if (categorizationAnalyzer?.tokenizer !== undefined) {
      analyzer = categorizationAnalyzer.tokenizer;
    } else if (categorizationAnalyzer?.analyzer !== undefined) {
      analyzer = categorizationAnalyzer.analyzer;
    }

  return (
@@ -7,10 +7,10 @@
import React, { FC } from 'react';
import { i18n } from '@kbn/i18n';
import { EuiBasicTable, EuiText } from '@elastic/eui';
import { CategoryExample } from '../../../../../common/results_loader';
import { CategoryFieldExample } from '../../../../../../../../../common/types/categories';

interface Props {
  fieldExamples: CategoryExample[] | null;
  fieldExamples: CategoryFieldExample[] | null;
}

const TOKEN_HIGHLIGHT_COLOR = '#b0ccf7';
@@ -14,7 +14,11 @@ import { CategorizationField } from '../categorization_field';
import { CategorizationDetector } from '../categorization_detector';
import { FieldExamples } from './field_examples';
import { ExamplesValidCallout } from './examples_valid_callout';
import { CategoryExample } from '../../../../../common/results_loader';
import {
  CategoryFieldExample,
  FieldExampleCheck,
} from '../../../../../../../../../common/types/categories';
import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../../../../../../common/constants/new_job';
import { LoadingWrapper } from '../../../charts/loading_wrapper';

interface Props {

@@ -31,9 +35,11 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
  const [categorizationAnalyzerString, setCategorizationAnalyzerString] = useState(
    JSON.stringify(jobCreator.categorizationAnalyzer)
  );
  const [fieldExamples, setFieldExamples] = useState<CategoryExample[] | null>(null);
  const [examplesValid, setExamplesValid] = useState(0);
  const [sampleSize, setSampleSize] = useState(0);
  const [fieldExamples, setFieldExamples] = useState<CategoryFieldExample[] | null>(null);
  const [overallValidStatus, setOverallValidStatus] = useState(
    CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID
  );
  const [validationChecks, setValidationChecks] = useState<FieldExampleCheck[]>([]);

  const [categorizationFieldName, setCategorizationFieldName] = useState(
    jobCreator.categorizationFieldName

@@ -73,28 +79,32 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
      setLoadingData(true);
      try {
        const {
          valid,
          examples,
          sampleSize: tempSampleSize,
          overallValidStatus: tempOverallValidStatus,
          validationChecks: tempValidationChecks,
        } = await jobCreator.loadCategorizationFieldExamples();
        setFieldExamples(examples);
        setExamplesValid(valid);
        setOverallValidStatus(tempOverallValidStatus);
        setValidationChecks(tempValidationChecks);
        setLoadingData(false);
        setSampleSize(tempSampleSize);
      } catch (error) {
        setLoadingData(false);
        setFieldExamples(null);
        setValidationChecks([]);
        setOverallValidStatus(CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID);
        mlMessageBarService.notify.error(error);
      }
    } else {
      setFieldExamples(null);
      setExamplesValid(0);
      setValidationChecks([]);
      setOverallValidStatus(CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID);
    }
    setIsValid(categorizationFieldName !== null);
  }

  useEffect(() => {
    jobCreatorUpdate();
  }, [examplesValid]);
  }, [overallValidStatus]);

  return (
    <>

@@ -109,8 +119,8 @@ export const CategorizationDetectors: FC<Props> = ({ setIsValid }) => {
        {fieldExamples !== null && loadingData === false && (
          <>
            <ExamplesValidCallout
              sampleSize={sampleSize}
              examplesValid={examplesValid}
              overallValidStatus={overallValidStatus}
              validationChecks={validationChecks}
              categorizationAnalyzer={jobCreator.categorizationAnalyzer}
            />
            <FieldExamples fieldExamples={fieldExamples} />
@@ -22,6 +22,12 @@ import { PartitionFieldsDefinition } from '../results_service/result_service_rx'
import { annotations } from './annotations';
import { Calendar, CalendarId, UpdateCalendar } from '../../../../common/types/calendars';
import { CombinedJob, JobId } from '../../jobs/new_job/common/job_creator/configs';
import {
  CategorizationAnalyzer,
  CategoryFieldExample,
  FieldExampleCheck,
} from '../../../../common/types/categories';
import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../common/constants/new_job';

// TODO This is not a complete representation of all methods of `ml.*`.
// It just satisfies needs for other parts of the code area which use

@@ -184,8 +190,13 @@
    timeField: string | undefined,
    start: number,
    end: number,
    analyzer: any
  ): Promise<{ valid: number; examples: any[]; sampleSize: number }>;
    analyzer: CategorizationAnalyzer
  ): Promise<{
    examples: CategoryFieldExample[];
    sampleSize: number;
    overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS;
    validationChecks: FieldExampleCheck[];
  }>;
  topCategories(
    jobId: string,
    count: number
@@ -5,6 +5,7 @@
 */

import { ml } from './ml_api_service';
import { CategorizationAnalyzer } from '../../../common/types/categories';

export interface MlServerDefaults {
  anomaly_detectors: {

@@ -16,13 +17,6 @@ export interface MlServerDefaults {
  datafeeds: { scroll_size?: number };
}

export interface CategorizationAnalyzer {
  char_filter?: any[];
  tokenizer?: string;
  filter?: any[];
  analyzer?: string;
}

export interface MlServerLimits {
  max_model_memory_limit?: string;
}
@@ -8,7 +8,11 @@ import { datafeedsProvider } from './datafeeds';
import { jobsProvider } from './jobs';
import { groupsProvider } from './groups';
import { newJobCapsProvider } from './new_job_caps';
import { newJobChartsProvider, categorizationExamplesProvider } from './new_job';
import {
  newJobChartsProvider,
  categorizationExamplesProvider,
  topCategoriesProvider,
} from './new_job';

export function jobServiceProvider(callWithRequest, request) {
  return {

@@ -18,5 +22,6 @@ export function jobServiceProvider(callWithRequest, request) {
    ...newJobCapsProvider(callWithRequest, request),
    ...newJobChartsProvider(callWithRequest, request),
    ...categorizationExamplesProvider(callWithRequest, request),
    ...topCategoriesProvider(callWithRequest, request),
  };
}
@@ -1,314 +0,0 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */

import { chunk } from 'lodash';
import { ML_RESULTS_INDEX_PATTERN } from '../../../../common/constants/index_patterns';
import { CATEGORY_EXAMPLES_SAMPLE_SIZE } from '../../../../common/constants/new_job';
import { CategoryId, Category, Token } from '../../../../common/types/categories';
import { callWithRequestType } from '../../../../common/types/kibana';

const VALID_TOKEN_COUNT = 3;
const CHUNK_SIZE = 100;

export function categorizationExamplesProvider(callWithRequest: callWithRequestType) {
  async function categorizationExamples(
    indexPatternTitle: string,
    query: any,
    size: number,
    categorizationFieldName: string,
    timeField: string | undefined,
    start: number,
    end: number,
    analyzer?: any
  ) {
    if (timeField !== undefined) {
      const range = {
        range: {
          [timeField]: {
            gte: start,
            format: 'epoch_millis',
          },
        },
      };

      if (query.bool === undefined) {
        query.bool = {};
      }
      if (query.bool.filter === undefined) {
        query.bool.filter = range;
      } else {
        if (Array.isArray(query.bool.filter)) {
          query.bool.filter.push(range);
        } else {
          query.bool.filter.range = range;
        }
      }
    }

    const results = await callWithRequest('search', {
      index: indexPatternTitle,
      size,
      body: {
        _source: categorizationFieldName,
        query,
      },
    });
    const examples: string[] = results.hits?.hits
      ?.map((doc: any) => doc._source[categorizationFieldName])
      .filter((example: string | null | undefined) => example !== undefined && example !== null);

    async function loadTokens(chunkSize: number) {
      const exampleChunks = chunk(examples, chunkSize);
      const tokensPerChunks = await Promise.all(exampleChunks.map(c => getTokens(c, analyzer)));
      const tokensPerExample = tokensPerChunks.flat();
      return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] }));
    }
    try {
      return loadTokens(CHUNK_SIZE);
    } catch (error) {
      // if an error is thrown when loading the tokens, lower the chunk size by half and try again
      // the error may have been caused by too many tokens being found.
      // the _analyze endpoint has a maximum of 10000 tokens.
      return loadTokens(CHUNK_SIZE / 2);
    }
  }

  async function getTokens(examples: string[], analyzer?: any) {
    const { tokens }: { tokens: Token[] } = await callWithRequest('indices.analyze', {
      body: {
        ...getAnalyzer(analyzer),
        text: examples,
      },
    });

    const lengths = examples.map(e => e.length);
    const sumLengths = lengths.map((s => (a: number) => (s += a))(0));

    const tokensPerExample: Token[][] = examples.map(e => []);

    tokens.forEach((t, i) => {
      for (let g = 0; g < sumLengths.length; g++) {
        if (t.start_offset <= sumLengths[g] + g) {
          const offset = g > 0 ? sumLengths[g - 1] + g : 0;
          tokensPerExample[g].push({
            ...t,
            start_offset: t.start_offset - offset,
            end_offset: t.end_offset - offset,
          });
          break;
        }
      }
    });
    return tokensPerExample;
  }

  function getAnalyzer(analyzer: any) {
    if (typeof analyzer === 'object' && analyzer.tokenizer !== undefined) {
      return analyzer;
    } else {
      return { analyzer: 'standard' };
    }
  }

  async function validateCategoryExamples(
    indexPatternTitle: string,
    query: any,
    size: number,
    categorizationFieldName: string,
    timeField: string | undefined,
    start: number,
    end: number,
    analyzer?: any
  ) {
    const resp = await categorizationExamples(
      indexPatternTitle,
      query,
      CATEGORY_EXAMPLES_SAMPLE_SIZE,
      categorizationFieldName,
      timeField,
      start,
      end,
      analyzer
    );

    const sortedExamples = resp
      .map((e, i) => ({ ...e, origIndex: i }))
      .sort((a, b) => b.tokens.length - a.tokens.length);
    const validExamples = sortedExamples.filter(e => e.tokens.length >= VALID_TOKEN_COUNT);
    const sampleSize = sortedExamples.length;

    const multiple = Math.floor(sampleSize / size) || sampleSize;
    const filteredExamples = [];
    let i = 0;
    while (filteredExamples.length < size && i < sortedExamples.length) {
      filteredExamples.push(sortedExamples[i]);
      i += multiple;
    }
    const examples = filteredExamples
      .sort((a, b) => a.origIndex - b.origIndex)
      .map(e => ({ text: e.text, tokens: e.tokens }));

    return {
      sampleSize,
      valid: sortedExamples.length === 0 ? 0 : validExamples.length / sortedExamples.length,
      examples,
    };
  }

  async function getTotalCategories(jobId: string): Promise<{ total: number }> {
    const totalResp = await callWithRequest('search', {
      index: ML_RESULTS_INDEX_PATTERN,
      size: 0,
      body: {
        query: {
          bool: {
            filter: [
              {
                term: {
                  job_id: jobId,
                },
              },
              {
                exists: {
                  field: 'category_id',
                },
              },
            ],
          },
        },
      },
    });
    return totalResp?.hits?.total?.value ?? 0;
  }

  async function getTopCategoryCounts(jobId: string, numberOfCategories: number) {
    const top = await callWithRequest('search', {
      index: ML_RESULTS_INDEX_PATTERN,
      size: 0,
      body: {
        query: {
          bool: {
            filter: [
              {
                term: {
                  job_id: jobId,
                },
              },
              {
                term: {
                  result_type: 'model_plot',
                },
              },
              {
                term: {
                  by_field_name: 'mlcategory',
                },
              },
            ],
          },
        },
        aggs: {
          cat_count: {
            terms: {
              field: 'by_field_value',
              size: numberOfCategories,
            },
          },
        },
      },
    });

    const catCounts: Array<{
      id: CategoryId;
      count: number;
    }> = top.aggregations?.cat_count?.buckets.map((c: any) => ({
      id: c.key,
      count: c.doc_count,
    }));
    return catCounts || [];
  }

  async function getCategories(
    jobId: string,
    catIds: CategoryId[],
    size: number
  ): Promise<Category[]> {
    const categoryFilter = catIds.length
      ? {
          terms: {
            category_id: catIds,
          },
        }
      : {
          exists: {
            field: 'category_id',
          },
        };
    const result = await callWithRequest('search', {
      index: ML_RESULTS_INDEX_PATTERN,
      size,
      body: {
        query: {
          bool: {
            filter: [
              {
                term: {
                  job_id: jobId,
                },
              },
              categoryFilter,
            ],
          },
        },
      },
    });

    return result.hits.hits?.map((c: { _source: Category }) => c._source) || [];
  }

  async function topCategories(jobId: string, numberOfCategories: number) {
    const catCounts = await getTopCategoryCounts(jobId, numberOfCategories);
    const categories = await getCategories(
      jobId,
      catCounts.map(c => c.id),
      catCounts.length || numberOfCategories
    );

    const catsById = categories.reduce((p, c) => {
      p[c.category_id] = c;
      return p;
    }, {} as { [id: number]: Category });

    const total = await getTotalCategories(jobId);

    if (catCounts.length) {
      return {
        total,
        categories: catCounts.map(({ id, count }) => {
          return {
            count,
            category: catsById[id] ?? null,
          };
        }),
      };
    } else {
      return {
        total,
        categories: categories.map(category => {
          return {
            category,
          };
        }),
      };
    }
  }

  return {
    categorizationExamples,
    validateCategoryExamples,
    topCategories,
  };
}
@@ -0,0 +1,206 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */

import { chunk } from 'lodash';
import { SearchResponse } from 'elasticsearch';
import { CATEGORY_EXAMPLES_SAMPLE_SIZE } from '../../../../../common/constants/new_job';
import {
  Token,
  CategorizationAnalyzer,
  CategoryFieldExample,
} from '../../../../../common/types/categories';
import { callWithRequestType } from '../../../../../common/types/kibana';
import { ValidationResults } from './validation_results';

const CHUNK_SIZE = 100;

export function categorizationExamplesProvider(callWithRequest: callWithRequestType) {
  const validationResults = new ValidationResults();

  async function categorizationExamples(
    indexPatternTitle: string,
    query: any,
    size: number,
    categorizationFieldName: string,
    timeField: string | undefined,
    start: number,
    end: number,
    analyzer: CategorizationAnalyzer
  ): Promise<{ examples: CategoryFieldExample[]; error?: any }> {
    if (timeField !== undefined) {
      const range = {
        range: {
          [timeField]: {
            gte: start,
            lt: end,
            format: 'epoch_millis',
          },
        },
      };
      if (query.bool === undefined) {
        query.bool = {};
      }
      if (query.bool.filter === undefined) {
        query.bool.filter = range;
      } else {
        if (Array.isArray(query.bool.filter)) {
          query.bool.filter.push(range);
        } else {
          query.bool.filter.range = range;
        }
      }
    }

    const results: SearchResponse<{ [id: string]: string }> = await callWithRequest('search', {
      index: indexPatternTitle,
      size,
      body: {
        _source: categorizationFieldName,
        query,
        sort: ['_doc'],
      },
    });

    const tempExamples = results.hits.hits.map(({ _source }) => _source[categorizationFieldName]);

    validationResults.createNullValueResult(tempExamples);

    const allExamples = tempExamples.filter(
      (example: string | null | undefined) => example !== undefined && example !== null
    );

    validationResults.createMedianMessageLengthResult(allExamples);

    try {
      const examplesWithTokens = await getTokens(CHUNK_SIZE, allExamples, analyzer);
      return { examples: examplesWithTokens };
    } catch (err) {
      // console.log('dropping to 50 chunk size');
      // if an error is thrown when loading the tokens, lower the chunk size by half and try again
      // the error may have been caused by too many tokens being found.
      // the _analyze endpoint has a maximum of 10000 tokens.
      const halfExamples = allExamples.splice(0, Math.ceil(allExamples.length / 2));
      const halfChunkSize = CHUNK_SIZE / 2;
      try {
        const examplesWithTokens = await getTokens(halfChunkSize, halfExamples, analyzer);
        return { examples: examplesWithTokens };
      } catch (error) {
        validationResults.createTooManyTokensResult(error, halfChunkSize);
        return { examples: halfExamples.map(e => ({ text: e, tokens: [] })) };
      }
    }
  }

  async function getTokens(
    chunkSize: number,
    examples: string[],
    analyzer: CategorizationAnalyzer
  ): Promise<CategoryFieldExample[]> {
    const exampleChunks = chunk(examples, chunkSize);
    const tokensPerExampleChunks: Token[][][] = [];
    for (const c of exampleChunks) {
      tokensPerExampleChunks.push(await loadTokens(c, analyzer));
    }
    const tokensPerExample = tokensPerExampleChunks.flat();
    return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] }));
  }

  async function loadTokens(examples: string[], analyzer: CategorizationAnalyzer) {
    const { tokens }: { tokens: Token[] } = await callWithRequest('indices.analyze', {
      body: {
        ...getAnalyzer(analyzer),
        text: examples,
      },
    });

    const lengths = examples.map(e => e.length);
    const sumLengths = lengths.map((s => (a: number) => (s += a))(0));

    const tokensPerExample: Token[][] = examples.map(e => []);

    tokens.forEach((t, i) => {
      for (let g = 0; g < sumLengths.length; g++) {
        if (t.start_offset <= sumLengths[g] + g) {
          const offset = g > 0 ? sumLengths[g - 1] + g : 0;
          tokensPerExample[g].push({
            ...t,
            start_offset: t.start_offset - offset,
            end_offset: t.end_offset - offset,
          });
          break;
        }
      }
    });
    return tokensPerExample;
  }

  function getAnalyzer(analyzer: CategorizationAnalyzer) {
    if (typeof analyzer === 'object' && analyzer.tokenizer !== undefined) {
      return analyzer;
    } else {
      return { analyzer: 'standard' };
    }
  }

  async function validateCategoryExamples(
    indexPatternTitle: string,
    query: any,
    size: number,
    categorizationFieldName: string,
    timeField: string | undefined,
    start: number,
    end: number,
    analyzer: CategorizationAnalyzer
  ) {
    const resp = await categorizationExamples(
      indexPatternTitle,
      query,
      CATEGORY_EXAMPLES_SAMPLE_SIZE,
      categorizationFieldName,
      timeField,
      start,
      end,
      analyzer
    );

    const { examples } = resp;
    const sampleSize = examples.length;
    validationResults.createTokenCountResult(examples, sampleSize);

    // sort examples by number of tokens, keeping track of their original order
    // with an origIndex property
    const sortedExamples = examples
      .map((e, i) => ({ ...e, origIndex: i }))
      .sort((a, b) => b.tokens.length - a.tokens.length);

    // we only want 'size' (e.g. 5) number of examples,
    // so loop through the sorted examples, taking 5 at evenly
    // spread intervals
    const multiple = Math.floor(sampleSize / size) || sampleSize;
    const filteredExamples = [];
    let i = 0;
    while (filteredExamples.length < size && i < sampleSize) {
      filteredExamples.push(sortedExamples[i]);
      i += multiple;
    }

    // sort back into original order and remove origIndex property
    const processedExamples = filteredExamples
      .sort((a, b) => a.origIndex - b.origIndex)
      .map(e => ({ text: e.text, tokens: e.tokens }));

    return {
      overallValidStatus: validationResults.overallResult,
      validationChecks: validationResults.results,
      sampleSize,
      examples: processedExamples,
    };
  }

  return {
    validateCategoryExamples,
  };
}
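The offset bookkeeping in loadTokens above can be hard to follow: _analyze is called with an array of example strings, the returned token offsets are global, and the code assumes each example boundary contributes one extra character, which is why a token belongs to example g when start_offset <= sumLengths[g] + g. A small worked illustration of that arithmetic (hypothetical values, not part of the commit):

    // two examples of lengths 7 and 3; running totals are [7, 10]
    const examples = ['foo bar', 'baz'];
    const lengths = examples.map(e => e.length);
    const sumLengths = lengths.map((s => (a: number) => (s += a))(0)); // [7, 10]

    // a token reported at global start_offset 8 fails 8 <= 7 + 0 but passes
    // 8 <= 10 + 1, so it belongs to the second example; subtracting the
    // offset sumLengths[0] + 1 = 8 gives a local start_offset of 0
    const globalStart = 8;
    const g = sumLengths.findIndex((len, idx) => globalStart <= len + idx); // 1
    const localStart = g > 0 ? globalStart - (sumLengths[g - 1] + g) : globalStart; // 0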
@@ -0,0 +1,8 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */

export { categorizationExamplesProvider } from './examples';
export { topCategoriesProvider } from './top_categories';
@@ -0,0 +1,164 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */

import { SearchResponse } from 'elasticsearch';
import { ML_RESULTS_INDEX_PATTERN } from '../../../../../common/constants/index_patterns';
import { CategoryId, Category } from '../../../../../common/types/categories';
import { callWithRequestType } from '../../../../../common/types/kibana';

export function topCategoriesProvider(callWithRequest: callWithRequestType) {
  async function getTotalCategories(jobId: string): Promise<{ total: number }> {
    const totalResp = await callWithRequest('search', {
      index: ML_RESULTS_INDEX_PATTERN,
      size: 0,
      body: {
        query: {
          bool: {
            filter: [
              {
                term: {
                  job_id: jobId,
                },
              },
              {
                exists: {
                  field: 'category_id',
                },
              },
            ],
          },
        },
      },
    });
    return totalResp?.hits?.total?.value ?? 0;
  }

  async function getTopCategoryCounts(jobId: string, numberOfCategories: number) {
    const top: SearchResponse<any> = await callWithRequest('search', {
      index: ML_RESULTS_INDEX_PATTERN,
      size: 0,
      body: {
        query: {
          bool: {
            filter: [
              {
                term: {
                  job_id: jobId,
                },
              },
              {
                term: {
                  result_type: 'model_plot',
                },
              },
              {
                term: {
                  by_field_name: 'mlcategory',
                },
              },
            ],
          },
        },
        aggs: {
          cat_count: {
            terms: {
              field: 'by_field_value',
              size: numberOfCategories,
            },
          },
        },
      },
    });

    const catCounts: Array<{
      id: CategoryId;
      count: number;
    }> = top.aggregations?.cat_count?.buckets.map((c: any) => ({
      id: c.key,
      count: c.doc_count,
    }));
    return catCounts || [];
  }

  async function getCategories(
    jobId: string,
    catIds: CategoryId[],
    size: number
  ): Promise<Category[]> {
    const categoryFilter = catIds.length
      ? {
          terms: {
            category_id: catIds,
          },
        }
      : {
          exists: {
            field: 'category_id',
          },
        };
    const result: SearchResponse<any> = await callWithRequest('search', {
      index: ML_RESULTS_INDEX_PATTERN,
      size,
      body: {
        query: {
          bool: {
            filter: [
              {
                term: {
                  job_id: jobId,
                },
              },
              categoryFilter,
            ],
          },
        },
      },
    });

    return result.hits.hits?.map((c: { _source: Category }) => c._source) || [];
  }

  async function topCategories(jobId: string, numberOfCategories: number) {
    const catCounts = await getTopCategoryCounts(jobId, numberOfCategories);
    const categories = await getCategories(
      jobId,
      catCounts.map(c => c.id),
      catCounts.length || numberOfCategories
    );

    const catsById = categories.reduce((p, c) => {
      p[c.category_id] = c;
      return p;
    }, {} as { [id: number]: Category });

    const total = await getTotalCategories(jobId);

    if (catCounts.length) {
      return {
        total,
        categories: catCounts.map(({ id, count }) => {
          return {
            count,
            category: catsById[id] ?? null,
          };
        }),
      };
    } else {
      return {
        total,
        categories: categories.map(category => {
          return {
            category,
          };
        }),
      };
    }
  }

  return {
    topCategories,
  };
}
@@ -0,0 +1,208 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */

import { i18n } from '@kbn/i18n';
import {
  CATEGORY_EXAMPLES_VALIDATION_STATUS,
  CATEGORY_EXAMPLES_ERROR_LIMIT,
  CATEGORY_EXAMPLES_WARNING_LIMIT,
} from '../../../../../common/constants/new_job';
import {
  FieldExampleCheck,
  CategoryFieldExample,
  VALIDATION_RESULT,
} from '../../../../../common/types/categories';
import { getMedianStringLength } from '../../../../../common/util/string_utils';

const VALID_TOKEN_COUNT = 3;
const MEDIAN_LINE_LENGTH_LIMIT = 400;
const NULL_COUNT_PERCENT_LIMIT = 0.75;

export class ValidationResults {
  private _results: FieldExampleCheck[] = [];

  public get results() {
    return this._results;
  }

  public get overallResult() {
    if (this._results.some(c => c.valid === CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID)) {
      return CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID;
    }
    if (this._results.some(c => c.valid === CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID)) {
      return CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID;
    }
    return CATEGORY_EXAMPLES_VALIDATION_STATUS.VALID;
  }

  private _resultExists(id: VALIDATION_RESULT) {
    return this._results.some(r => r.id === id);
  }

  public createTokenCountResult(examples: CategoryFieldExample[], sampleSize: number) {
    if (examples.length === 0) {
      this.createNoExamplesResult();
      return;
    }

    if (this._resultExists(VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES) === true) {
      // if tokenizing has failed due to insufficient privileges, don't show
      // the message about token count
      return;
    }

    const validExamplesSize = examples.filter(e => e.tokens.length >= VALID_TOKEN_COUNT).length;
    const percentValid = sampleSize === 0 ? 0 : validExamplesSize / sampleSize;

    let valid = CATEGORY_EXAMPLES_VALIDATION_STATUS.VALID;
    if (percentValid < CATEGORY_EXAMPLES_ERROR_LIMIT) {
      valid = CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID;
    } else if (percentValid < CATEGORY_EXAMPLES_WARNING_LIMIT) {
      valid = CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID;
    }

    const message = i18n.translate(
      'xpack.ml.models.jobService.categorization.messages.tokenLengthValidation',
      {
        defaultMessage:
          '{number} field {number, plural, zero {value} one {value} other {values}} analyzed, {percentage}% contain {validTokenCount} or more tokens.',
        values: {
          number: sampleSize,
          percentage: Math.floor(percentValid * 100),
          validTokenCount: VALID_TOKEN_COUNT,
        },
      }
    );

    if (
      this._resultExists(VALIDATION_RESULT.TOO_MANY_TOKENS) === false &&
      this._resultExists(VALIDATION_RESULT.FAILED_TO_TOKENIZE) === false
    ) {
      this._results.unshift({
        id: VALIDATION_RESULT.TOKEN_COUNT,
        valid,
        message,
      });
    }
  }

  public createMedianMessageLengthResult(examples: string[]) {
    const median = getMedianStringLength(examples);

    if (median > MEDIAN_LINE_LENGTH_LIMIT) {
      this._results.push({
        id: VALIDATION_RESULT.MEDIAN_LINE_LENGTH,
        valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
        message: i18n.translate(
          'xpack.ml.models.jobService.categorization.messages.medianLineLength',
          {
            defaultMessage:
              'The median length for the field values analyzed is over {medianLimit} characters.',
            values: { medianLimit: MEDIAN_LINE_LENGTH_LIMIT },
          }
        ),
      });
    }
  }

  public createNoExamplesResult() {
    this._results.push({
      id: VALIDATION_RESULT.NULL_VALUES,
      valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
      message: i18n.translate('xpack.ml.models.jobService.categorization.messages.noDataFound', {
        defaultMessage:
          'No examples for this field could be found. Please ensure the selected date range contains data.',
      }),
    });
  }

  public createNullValueResult(examples: Array<string | null | undefined>) {
    const nullCount = examples.filter(e => e === null).length;

    if (nullCount / examples.length >= NULL_COUNT_PERCENT_LIMIT) {
      this._results.push({
        id: VALIDATION_RESULT.NULL_VALUES,
        valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
        message: i18n.translate('xpack.ml.models.jobService.categorization.messages.nullValues', {
          defaultMessage: 'More than {percent}% of field values are null.',
          values: { percent: NULL_COUNT_PERCENT_LIMIT * 100 },
        }),
      });
    }
  }

  public createTooManyTokensResult(error: any, sampleSize: number) {
    // expecting error message:
    // The number of tokens produced by calling _analyze has exceeded the allowed maximum of [10000].
    // This limit can be set by changing the [index.analyze.max_token_count] index level setting.

    if (error.statusCode === 403) {
      this.createPrivilegesErrorResult(error);
      return;
    }
    const message: string = error.message;
    if (message) {
      const rxp = /exceeded the allowed maximum of \[(\d+?)\]/;
      const match = rxp.exec(message);
      if (match?.length === 2) {
        const tokenLimit = match[1];
        this._results.push({
          id: VALIDATION_RESULT.TOO_MANY_TOKENS,
          valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID,
          message: i18n.translate(
            'xpack.ml.models.jobService.categorization.messages.tooManyTokens',
            {
              defaultMessage:
                'Tokenization of field value examples has failed due to more than {tokenLimit} tokens being found in a sample of {sampleSize} values.',
              values: { sampleSize, tokenLimit },
            }
          ),
        });
        return;
      }
      return;
    }
    this.createFailureToTokenize(message);
  }

  public createPrivilegesErrorResult(error: any) {
    const message: string = error.message;
    if (message) {
      this._results.push({
        id: VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES,
        valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
        message: i18n.translate(
          'xpack.ml.models.jobService.categorization.messages.insufficientPrivileges',
          {
            defaultMessage:
              'Tokenization of field value examples could not be performed due to insufficient privileges. Field values cannot therefore be checked to see if they are appropriate for use in a categorization job.',
          }
        ),
      });
      this._results.push({
        id: VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES,
        valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID,
        message,
      });
      return;
    }
  }

  public createFailureToTokenize(message: string | undefined) {
    this._results.push({
      id: VALIDATION_RESULT.FAILED_TO_TOKENIZE,
      valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID,
      message: i18n.translate(
        'xpack.ml.models.jobService.categorization.messages.failureToGetTokens',
        {
          defaultMessage:
            'It was not possible to tokenize a sample of example field values. {message}',
          values: { message: message || '' },
        }
      ),
    });
  }
}
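A brief usage sketch of the ValidationResults class added above, driven the way examples.ts drives it; the sample values are made up for illustration:

    import { ValidationResults } from './validation_results';

    const validationResults = new ValidationResults();

    // three of the four sampled field values are null, which meets the
    // NULL_COUNT_PERCENT_LIMIT of 0.75, so a PARTIALLY_VALID check is recorded
    validationResults.createNullValueResult(['server starting', null, null, null]);

    console.log(validationResults.results.length); // 1
    console.log(validationResults.overallResult);  // 'partially_valid'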
@@ -5,4 +5,4 @@
 */

export { newJobChartsProvider } from './charts';
export { categorizationExamplesProvider } from './categorization';
export { categorizationExamplesProvider, topCategoriesProvider } from './categorization';