[ML] Editing semi-structured text fields in grok pattern (#122274)

* [ML] Editing semi-structured text fields in grok pattern

* changes based on review

* fixing apache type patterns

* adding tests

* renaming test description

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
James Gowdy 2022-01-10 15:41:07 +00:00 committed by GitHub
parent eb095f5b1d
commit 052ab06c61
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 325 additions and 103 deletions

View file

@ -5,9 +5,9 @@
* 2.0.
*/
import { parseInterval } from '../../../../common/utils/parse_interval';
import { parseInterval } from './parse_interval';
describe('ML parse interval util', () => {
describe('parse interval util', () => {
test('should correctly parse an interval containing a valid unit and value', () => {
expect(parseInterval('1d')!.as('d')).toBe(1);
expect(parseInterval('2y')!.as('y')).toBe(2);

View file

@ -6,7 +6,12 @@
*/
import { isEqual } from 'lodash';
import { AnalysisResult, InputOverrides, MB } from '../../../../../../file_upload/common';
import {
AnalysisResult,
InputOverrides,
MB,
FILE_FORMATS,
} from '../../../../../../file_upload/common';
export const DEFAULT_LINES_TO_SAMPLE = 1000;
const UPLOAD_SIZE_MB = 5;
@ -72,7 +77,7 @@ export function createUrlOverrides(overrides: InputOverrides, originalSettings:
}
}
if (formattedOverrides.format === '' && originalSettings.format === 'delimited') {
if (formattedOverrides.format === '' && originalSettings.format === FILE_FORMATS.DELIMITED) {
if (
formattedOverrides.should_trim_fields !== '' ||
formattedOverrides.has_header_row !== '' ||
@ -88,13 +93,19 @@ export function createUrlOverrides(overrides: InputOverrides, originalSettings:
}
}
if (formattedOverrides.format === '' && originalSettings.format === 'semi_structured_text') {
if (
formattedOverrides.format === '' &&
originalSettings.format === FILE_FORMATS.SEMI_STRUCTURED_TEXT
) {
if (formattedOverrides.grok_pattern !== '') {
formattedOverrides.format = originalSettings.format;
}
}
if (formattedOverrides.format === 'ndjson' || originalSettings.format === 'ndjson') {
if (
formattedOverrides.format === FILE_FORMATS.NDJSON ||
originalSettings.format === FILE_FORMATS.NDJSON
) {
formattedOverrides.should_trim_fields = '';
formattedOverrides.has_header_row = '';
formattedOverrides.delimiter = '';

View file

@ -0,0 +1,65 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { getFieldsFromGrokPattern, replaceFieldInGrokPattern } from './grok_pattern';
// A pattern containing nine named fields separated by free-form regex text.
const GROK_PATTERN =
  '<%{INT:field}>%{INT:field2}: .*?: %{SYSLOGTIMESTAMP:timestamp}.*?: %.*?: .*? .*? .*? .*?%{IP:ipaddress}/%{INT:field3}, .*? .*?%{IP:ipaddress2}, .*?/%{BASE16NUM:field5} .*?/%{INT:field4}/%{NUMBER:field6}.*';

// A pattern that is a single alias and therefore exposes no editable fields.
const APACHE_LOG_PATTERN = '%{COMBINEDAPACHELOG}';

// GROK_PATTERN as it should look once its second field (index 1) carries `name`.
const patternWithSecondFieldNamed = (name: string) =>
  `<%{INT:field}>%{INT:${name}}: .*?: %{SYSLOGTIMESTAMP:timestamp}.*?: %.*?: .*? .*? .*? .*?%{IP:ipaddress}/%{INT:field3}, .*? .*?%{IP:ipaddress2}, .*?/%{BASE16NUM:field5} .*?/%{INT:field4}/%{NUMBER:field6}.*`;

const RENAMED_FIELD = 'field2_renamed';

describe('grok pattern', () => {
  it('should return the correct fields for normal grok pattern', () => {
    // [name, type] pairs in the order they occur in GROK_PATTERN
    const expectedPairs: Array<[string, string]> = [
      ['field', 'INT'],
      ['field2', 'INT'],
      ['timestamp', 'SYSLOGTIMESTAMP'],
      ['ipaddress', 'IP'],
      ['field3', 'INT'],
      ['ipaddress2', 'IP'],
      ['field5', 'BASE16NUM'],
      ['field4', 'INT'],
      ['field6', 'NUMBER'],
    ];
    const expectedFields = expectedPairs.map(([name, type]) => ({ name, type }));

    expect(getFieldsFromGrokPattern(GROK_PATTERN)).toEqual(expectedFields);
  });

  it('should return no fields for apache grok pattern', () => {
    expect(getFieldsFromGrokPattern(APACHE_LOG_PATTERN)).toEqual([]);
  });

  it('should rename the correct field', () => {
    const renamed = replaceFieldInGrokPattern(GROK_PATTERN, RENAMED_FIELD, 1);

    expect(renamed).toEqual(patternWithSecondFieldNamed(RENAMED_FIELD));
  });

  it('should not rename the field if incorrect index is supplied', () => {
    // index 2 addresses the third field, so the second field must be left alone
    const renamed = replaceFieldInGrokPattern(GROK_PATTERN, RENAMED_FIELD, 2);

    expect(renamed).not.toEqual(patternWithSecondFieldNamed(RENAMED_FIELD));
  });

  it('should not rename apache grok fields', () => {
    const renamed = replaceFieldInGrokPattern(APACHE_LOG_PATTERN, RENAMED_FIELD, 1);

    expect(renamed).toEqual(APACHE_LOG_PATTERN);
  });
});

View file

@ -0,0 +1,73 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
// Matches a pattern that consists of nothing but a single alias,
// e.g. %{COMBINEDAPACHELOG}
const ALIAS_PATTERN = /^%{\w*?}$/;

// Matches one %{TYPE:name} token. The capture group makes String.split keep
// the matched tokens in its result, interleaved with the text between them.
const MATCH_FIELDS = /(%{\w*?:\w*?})/;

// Captures TYPE and name from a string that is exactly one %{TYPE:name} token.
// Anchored at both ends: without the anchors a candidate such as
// "%{INT:foo} bar}" would be reported as valid because it merely *contains*
// a token, letting an invalid field name corrupt the surrounding pattern.
const MATCH_AND_CAPTURE_FIELDS = /^%{(\w*?):(\w*?)}$/;

/**
 * Reports whether the whole pattern is a single alias such as
 * %{COMBINEDAPACHELOG}, i.e. it has no individually named fields.
 */
function isAliasPattern(pattern: string) {
  return ALIAS_PATTERN.test(pattern);
}

/**
 * Splits a grok pattern into %{TYPE:name} tokens and the text between them.
 *
 * @param grokPattern the pattern to split
 * @param filter when true, only the pieces that start with "%{" are returned
 * @returns the pieces in their original order; joining the unfiltered list
 *          with '' reproduces the input
 */
function splitGrok(grokPattern: string, filter: boolean = false) {
  const pieces = grokPattern.split(MATCH_FIELDS);
  if (filter !== true) {
    return pieces;
  }
  return pieces.filter((piece) => piece.startsWith('%{'));
}

/**
 * Parses a single %{TYPE:name} token.
 *
 * @param field the candidate token
 * @returns `valid: true` with the captured type and name when `field` is
 *          exactly one token; otherwise `valid: false` with empty strings
 */
function getGrokField(field: string) {
  // Cheap rejection: a token has to start with both '%' and '{'.
  // (Either character being wrong is enough to rule the string out.)
  if (field[0] !== '%' || field[1] !== '{') {
    return { valid: false, type: '', name: '' };
  }

  const match = field.match(MATCH_AND_CAPTURE_FIELDS);
  if (match === null) {
    return { valid: false, type: '', name: '' };
  }

  const [, type, name] = match;
  return { valid: true, type, name };
}
/**
 * Lists the named fields of a grok pattern in the order they occur.
 *
 * A pattern that is a single alias (e.g. %{COMBINEDAPACHELOG}) yields an
 * empty list. A token that cannot be parsed is reported with the raw token
 * text as its name and the type returned by the parser for invalid input.
 */
export function getFieldsFromGrokPattern(grokPattern: string) {
  if (isAliasPattern(grokPattern)) {
    return [];
  }

  const fields: Array<{ name: string; type: string }> = [];
  for (const token of splitGrok(grokPattern, true)) {
    const parsed = getGrokField(token);
    fields.push({
      name: parsed.valid === false ? token : parsed.name,
      type: parsed.type,
    });
  }
  return fields;
}
/**
 * Renames one field of a grok pattern.
 *
 * @param grokPattern the pattern to edit
 * @param fieldName the new name for the field
 * @param index zero-based position of the field, counting valid
 *              %{TYPE:name} tokens only
 * @returns the edited pattern. The input is returned unchanged when it is a
 *          single alias pattern, when no field has the given index, or when
 *          the renamed token would not parse as a valid field.
 */
export function replaceFieldInGrokPattern(grokPattern: string, fieldName: string, index: number) {
  if (isAliasPattern(grokPattern)) {
    return grokPattern;
  }

  const pieces = splitGrok(grokPattern);
  let fieldPosition = -1;

  for (const [position, piece] of pieces.entries()) {
    const parsed = getGrokField(piece);
    if (!parsed.valid) {
      continue;
    }

    fieldPosition += 1;
    if (fieldPosition !== index) {
      continue;
    }

    // keep the original type, swap only the name
    const candidate = `%{${parsed.type}:${fieldName}}`;
    // don't replace if new field is not valid
    if (getGrokField(candidate).valid) {
      pieces[position] = candidate;
    }
    // at most one field can sit at the requested index
    break;
  }

  return pieces.join('');
}

View file

@ -9,7 +9,7 @@ import { FormattedMessage } from '@kbn/i18n-react';
import React, { FC } from 'react';
import { EuiTitle, EuiSpacer, EuiDescriptionList } from '@elastic/eui';
import { FindFileStructureResponse } from '../../../../../../file_upload/common';
import { FindFileStructureResponse, FILE_FORMATS } from '../../../../../../file_upload/common';
export const AnalysisSummary: FC<{ results: FindFileStructureResponse }> = ({ results }) => {
const items = createDisplayItems(results);
@ -60,7 +60,7 @@ function createDisplayItems(results: FindFileStructureResponse) {
description: results.format,
});
if (results.format === 'delimited') {
if (results.format === FILE_FORMATS.DELIMITED) {
items.push({
title: (
<FormattedMessage

View file

@ -5,13 +5,6 @@
* 2.0.
*/
export const FORMAT_OPTIONS = [
'delimited',
'ndjson',
'semi_structured_text',
// 'xml',
];
export const CUSTOM_DROPDOWN_OPTION = 'custom';
export const TIMESTAMP_OPTIONS = [

View file

@ -5,20 +5,21 @@
* 2.0.
*/
import { FILE_FORMATS } from '../../../../../../../file_upload/common';
import {
FORMAT_OPTIONS,
TIMESTAMP_OPTIONS,
DELIMITER_OPTIONS,
QUOTE_OPTIONS,
CHARSET_OPTIONS,
} from './option_lists';
function getOptions(list) {
function getOptions(list: string[]) {
return list.map((o) => ({ label: o }));
}
export function getFormatOptions() {
return getOptions(FORMAT_OPTIONS);
return getOptions(Object.values(FILE_FORMATS));
}
export function getTimestampFormatOptions() {

View file

@ -23,6 +23,8 @@ import {
EuiTextArea,
} from '@elastic/eui';
import { FILE_FORMATS } from '../../../../../../file_upload/common';
import {
getFormatOptions,
getTimestampFormatOptions,
@ -32,6 +34,16 @@ import {
} from './options';
import { isTimestampFormatValid } from './overrides_validation';
import { withKibana } from '../../../../../../../../src/plugins/kibana_react/public';
import { replaceFieldInGrokPattern } from '../../../common/util/grok_pattern';
import {
convertDelimiter,
convertDelimiterBack,
getColumnNames,
getGrokFieldNames,
isLinesToSampleValid,
LINES_TO_SAMPLE_VALUE_MIN,
LINES_TO_SAMPLE_VALUE_MAX,
} from './overrides_utils';
import { TIMESTAMP_OPTIONS, CUSTOM_DROPDOWN_OPTION } from './options/option_lists';
@ -39,10 +51,6 @@ const formatOptions = getFormatOptions();
const timestampFormatOptions = getTimestampFormatOptions();
const delimiterOptions = getDelimiterOptions();
const quoteOptions = getQuoteOptions();
// const charsetOptions = getCharsetOptions();
const LINES_TO_SAMPLE_VALUE_MIN = 3;
const LINES_TO_SAMPLE_VALUE_MAX = 1000000;
class OverridesUI extends Component {
constructor(props) {
@ -93,11 +101,14 @@ class OverridesUI extends Component {
const { newColumnNames, originalColumnNames } = getColumnNames(columnNames, originalSettings);
const newGrokFieldNames = getGrokFieldNames(grokPattern, originalSettings.grokPattern);
const overrides = {
charset: charset === undefined ? originalSettings.charset : charset,
format: format === undefined ? originalSettings.format : format,
hasHeaderRow: hasHeaderRow === undefined ? originalSettings.hasHeaderRow : hasHeaderRow,
columnNames: newColumnNames,
grokFieldNames: newGrokFieldNames,
delimiter: d,
quote: quote === undefined ? originalSettings.quote : quote,
shouldTrimFields:
@ -112,6 +123,7 @@ class OverridesUI extends Component {
return {
originalColumnNames,
originalGrokFieldNames: newGrokFieldNames,
customDelimiter: customD === undefined ? '' : customD,
customTimestampFormat: '',
linesToSampleValid: true,
@ -224,8 +236,17 @@ class OverridesUI extends Component {
this.setOverride({ columnNames });
};
// Handles an edit to the i-th grok field name: rewrites that field inside the
// current grok pattern override and stores the updated pattern together with
// the field names derived from it.
onGrokPatternFieldChange = (e, i) => {
  const { value: editedName } = e.target;
  const { grokPattern: currentPattern } = this.state.overrides;

  const grokPattern = replaceFieldInGrokPattern(currentPattern, editedName, i);
  const grokFieldNames = getGrokFieldNames(grokPattern, currentPattern);

  this.setOverride({ grokPattern, grokFieldNames });
};
grokPatternChange = (e) => {
this.setOverride({ grokPattern: e.target.value });
const newGrokPattern = e.target.value;
const newGrokFieldNames = getGrokFieldNames(newGrokPattern, this.state.overrides.grokPattern);
this.setOverride({ grokPattern: newGrokPattern, grokFieldNames: newGrokFieldNames });
};
onLinesToSampleChange = (e) => {
@ -247,6 +268,7 @@ class OverridesUI extends Component {
customDelimiter,
customTimestampFormat,
originalColumnNames,
originalGrokFieldNames,
linesToSampleValid,
timestampFormatError,
timestampFormatValid,
@ -263,6 +285,7 @@ class OverridesUI extends Component {
shouldTrimFields,
// charset,
columnNames,
grokFieldNames,
grokPattern,
linesToSample,
} = overrides;
@ -319,7 +342,7 @@ class OverridesUI extends Component {
isClearable={false}
/>
</EuiFormRow>
{format === 'delimited' && (
{format === FILE_FORMATS.DELIMITED && (
<React.Fragment>
<EuiFormRow
label={
@ -396,7 +419,7 @@ class OverridesUI extends Component {
</EuiFormRow>
</React.Fragment>
)}
{format === 'semi_structured_text' && (
{format === FILE_FORMATS.SEMI_STRUCTURED_TEXT && (
<React.Fragment>
<EuiFormRow
label={
@ -477,7 +500,7 @@ class OverridesUI extends Component {
isClearable={false}
/>
</EuiFormRow> */}
{format === 'delimited' && originalColumnNames.length > 0 && (
{format === FILE_FORMATS.DELIMITED && originalColumnNames.length > 0 && (
<React.Fragment>
<EuiSpacer />
<EuiTitle size="s">
@ -499,6 +522,29 @@ class OverridesUI extends Component {
))}
</React.Fragment>
)}
{format === FILE_FORMATS.SEMI_STRUCTURED_TEXT && originalGrokFieldNames.length > 0 && (
<React.Fragment>
<EuiSpacer />
<EuiTitle size="s">
<h3>
<FormattedMessage
id="xpack.dataVisualizer.file.editFlyout.overrides.editFieldNamesTitle"
defaultMessage="Edit field names"
/>
</h3>
</EuiTitle>
{originalGrokFieldNames.map((f, i) => (
<EuiFormRow label={f} key={f}>
<EuiFieldText
value={grokFieldNames[i]}
onChange={(e) => this.onGrokPatternFieldChange(e, i, grokPattern)}
/>
</EuiFormRow>
))}
</React.Fragment>
)}
</EuiForm>
);
}
@ -517,75 +563,3 @@ function getSortedFields(fields) {
.map((f) => ({ label: f }))
.sort((a, b) => a.label.localeCompare(b.label, undefined, { numeric: true }));
}
// Some delimiter characters cannot be used as items in select list.
// so show a textual description of the character instead.
function convertDelimiter(d) {
switch (d) {
case ',':
return {
delimiter: 'comma',
};
case '\t':
return {
delimiter: 'tab',
};
case ';':
return {
delimiter: 'semicolon',
};
case '|':
return {
delimiter: 'pipe',
};
case ' ':
return {
delimiter: 'space',
};
default:
return {
delimiter: CUSTOM_DROPDOWN_OPTION,
customDelimiter: d,
};
}
}
// Convert the delimiter textual descriptions back to their real characters.
function convertDelimiterBack(delimiter, customDelimiter) {
switch (delimiter) {
case 'comma':
return ',';
case 'tab':
return '\t';
case 'semicolon':
return ';';
case 'pipe':
return '|';
case 'space':
return ' ';
case CUSTOM_DROPDOWN_OPTION:
return customDelimiter;
default:
return undefined;
}
}
function getColumnNames(columnNames, originalSettings) {
const newColumnNames =
columnNames === undefined && originalSettings.columnNames !== undefined
? [...originalSettings.columnNames]
: columnNames;
const originalColumnNames = newColumnNames !== undefined ? [...newColumnNames] : [];
return {
newColumnNames,
originalColumnNames,
};
}
function isLinesToSampleValid(linesToSample) {
return linesToSample > LINES_TO_SAMPLE_VALUE_MIN && linesToSample <= LINES_TO_SAMPLE_VALUE_MAX;
}

View file

@ -7,6 +7,7 @@
import { mountWithIntl, shallowWithIntl } from '@kbn/test/jest';
import React from 'react';
import { FILE_FORMATS } from '../../../../../../file_upload/common/constants';
import { Overrides } from './overrides';
@ -48,8 +49,8 @@ describe('Overrides', () => {
});
test('render overrides and trigger a state change', () => {
const FORMAT_1 = 'delimited';
const FORMAT_2 = 'ndjson';
const FORMAT_1 = FILE_FORMATS.DELIMITED;
const FORMAT_2 = FILE_FORMATS.NDJSON;
const props = getProps();
props.overrides.format = FORMAT_1;

View file

@ -0,0 +1,96 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { getFieldsFromGrokPattern } from '../../../common/util/grok_pattern';
import { CUSTOM_DROPDOWN_OPTION } from './options/option_lists';
// Bounds for the "lines to sample" value. isLinesToSampleValid accepts a value
// only when it is strictly greater than the minimum and no greater than the
// maximum.
export const LINES_TO_SAMPLE_VALUE_MIN = 3;
export const LINES_TO_SAMPLE_VALUE_MAX = 1000000;
// Some delimiter characters cannot be used as items in a select list,
// so each known character is swapped for a textual description. Any other
// value is reported as the custom option, with the value itself carried
// along in `customDelimiter`.
export function convertDelimiter(d: string) {
  const descriptions = new Map<string, string>([
    [',', 'comma'],
    ['\t', 'tab'],
    [';', 'semicolon'],
    ['|', 'pipe'],
    [' ', 'space'],
  ]);

  const description = descriptions.get(d);
  if (description !== undefined) {
    return { delimiter: description };
  }

  return {
    delimiter: CUSTOM_DROPDOWN_OPTION,
    customDelimiter: d,
  };
}
// Convert a delimiter's textual description back to the real character.
// The custom option resolves to the supplied custom delimiter; any
// unrecognised description resolves to undefined.
export function convertDelimiterBack(delimiter: string, customDelimiter: string) {
  const characters = new Map<string, string>([
    ['comma', ','],
    ['tab', '\t'],
    ['semicolon', ';'],
    ['pipe', '|'],
    ['space', ' '],
  ]);

  const character = characters.get(delimiter);
  if (character !== undefined) {
    return character;
  }

  return delimiter === CUSTOM_DROPDOWN_OPTION ? customDelimiter : undefined;
}
/**
 * Works out the column names to use for the overrides.
 *
 * When no `columnNames` override is supplied, a copy of
 * `originalSettings.columnNames` (if present) is used instead.
 *
 * NOTE(review): `columnNames` is declared as `string | undefined` but is
 * spread like a list below — presumably callers pass an array of names;
 * confirm and tighten the type.
 *
 * @returns `newColumnNames` (the chosen names, possibly undefined) and
 *          `originalColumnNames` (a separate copy of them, or an empty array)
 */
export function getColumnNames(columnNames: string | undefined, originalSettings: any) {
  const fallBackToOriginal =
    columnNames === undefined && originalSettings.columnNames !== undefined;

  const newColumnNames = fallBackToOriginal ? [...originalSettings.columnNames] : columnNames;
  const originalColumnNames = newColumnNames === undefined ? [] : [...newColumnNames];

  return { newColumnNames, originalColumnNames };
}
/**
 * Returns the field names found in a grok pattern.
 *
 * Without an original grok pattern there is nothing to list and the result is
 * empty. Otherwise the names are read from `grokPattern`, falling back to
 * `originalGrokPattern` when `grokPattern` is undefined.
 */
export function getGrokFieldNames(grokPattern: string, originalGrokPattern: string) {
  if (originalGrokPattern === undefined) {
    return [];
  }

  const patternToRead = grokPattern === undefined ? originalGrokPattern : grokPattern;
  return getFieldsFromGrokPattern(patternToRead).map((field) => field.name);
}
/**
 * Checks that the number of lines to sample lies strictly above
 * LINES_TO_SAMPLE_VALUE_MIN and at or below LINES_TO_SAMPLE_VALUE_MAX.
 */
export function isLinesToSampleValid(linesToSample: number) {
  const aboveMinimum = LINES_TO_SAMPLE_VALUE_MIN < linesToSample;
  const withinMaximum = LINES_TO_SAMPLE_VALUE_MAX >= linesToSample;
  return aboveMinimum && withinMaximum;
}

View file

@ -17,3 +17,10 @@ export const FILE_SIZE_DISPLAY_FORMAT = '0,0.[0] b';
// Value to use in the Elasticsearch index mapping meta data to identify the
// index as having been created by the ML File Data Visualizer.
export const INDEX_META_DATA_CREATED_BY = 'file-data-visualizer';
// Identifiers of the supported file formats. Use these constants rather than
// repeating the raw strings when comparing against or assigning a `format`
// value.
export const FILE_FORMATS = {
DELIMITED: 'delimited',
NDJSON: 'ndjson',
SEMI_STRUCTURED_TEXT: 'semi_structured_text',
// XML is not currently offered as a format
// XML: 'xml',
};

View file

@ -8,6 +8,7 @@
import { MessageImporter } from './message_importer';
import { NdjsonImporter } from './ndjson_importer';
import { ImportFactoryOptions } from './types';
import { FILE_FORMATS } from '../../common/constants';
export function importerFactory(format: string, options: ImportFactoryOptions) {
switch (format) {
@ -15,10 +16,10 @@ export function importerFactory(format: string, options: ImportFactoryOptions) {
// file into messages, then sending these to ES for further processing
// in an ingest pipeline in documents containing a single "message"
// field (like Filebeat does)
case 'delimited':
case 'semi_structured_text':
case FILE_FORMATS.DELIMITED:
case FILE_FORMATS.SEMI_STRUCTURED_TEXT:
return new MessageImporter(options);
case 'ndjson':
case FILE_FORMATS.NDJSON:
return new NdjsonImporter();
default:
return;