mirror of
https://github.com/elastic/kibana.git
synced 2025-04-24 17:59:23 -04:00
# Backport This will backport the following commits from `main` to `8.x`: - [[Auto Import] Improve the ECS mapping extraction logic (#195167)](https://github.com/elastic/kibana/pull/195167) <!--- Backport version: 9.4.3 --> ### Questions ? Please refer to the [Backport tool documentation](https://github.com/sqren/backport) <!--BACKPORT [{"author":{"name":"Ilya Nikokoshev","email":"ilya.nikokoshev@elastic.co"},"sourceCommit":{"committedDate":"2024-10-09T12:24:00Z","message":"[Auto Import] Improve the ECS mapping extraction logic (#195167)\n\n## Release Notes\r\n\r\nAutomatic Import is more forgiving if an LLM returns an ECS mapping in a\r\nslightly unexpected format.\r\n\r\n## Summary\r\n\r\nWhen implementing https://github.com/elastic/kibana/pull/194386 an issue\r\nhas been encountered where Claude returns the field name `date_format`\r\ninstead of expected `date_formats` and the ECS chain breaks down.\r\n\r\nWe add this case as a test to\r\n`x-pack/plugins/integration_assistant/server/graphs/ecs/validate.test`.\r\n\r\nWithout the changes in this PR the list returned by\r\n`findInvalidEcsFields` is\r\n\r\n```\r\n [\r\n 'Reserved ECS field mapping identified for event.created : ai_postgres_202410050058.logs.column1.target',\r\n 'Invalid ECS field mapping identified for 0.9 : ai_postgres_202410050058.logs.column1.confidence, ai_postgres_202410050058.logs.column5.confidence',\r\n 'Invalid ECS field mapping identified for date : ai_postgres_202410050058.logs.column1.type, ai_postgres_202410050058.logs.column9.type',\r\n 'Invalid ECS field mapping identified for 0.95 : ai_postgres_202410050058.logs.column12.confidence',\r\n 'Invalid ECS field mapping identified for string : ai_postgres_202410050058.logs.column12.type, ai_postgres_202410050058.logs.column14.type, ai_postgres_202410050058.logs.column24.type, ai_postgres_202410050058.logs.column5.type, ai_postgres_202410050058.logs.column3.type, ai_postgres_202410050058.logs.column2.type',\r\n 'Invalid ECS field mapping 
identified for 0.8 : ai_postgres_202410050058.logs.column9.confidence, ai_postgres_202410050058.logs.column3.confidence',\r\n 'Invalid ECS field mapping identified for 0.7 : ai_postgres_202410050058.logs.column14.confidence, ai_postgres_202410050058.logs.column2.confidence',\r\n 'Invalid ECS field mapping identified for 0.85 : ai_postgres_202410050058.logs.column24.confidence'\r\n ]\r\n```\r\n\r\nwhile with these changes the result does not contain any `Invalid ECS field` messages.\r\n\r\nThe key changes are in the `processMapping` function:\r\n\r\n1. We made function more forgiving in regards to the input, accepting\r\n`date_format` in lieu of `date_formats`.\r\n2. We have removed the collection of \"other paths\", that is, the reverse\r\nindex for simple values like `0.8`.\r\n\r\nThe latter change generally limits the impact of any other format issues\r\nin the ECS mapping in the future.\r\n\r\nAdditionally, the function has been renamed to `extractECSMapping`, its\r\noutput type validated, and documentation has been added.\r\n\r\n---------\r\n\r\nCo-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>","sha":"637d796071f067f8cab37165dd8f80111251ae81","branchLabelMapping":{"^v9.0.0$":"main","^v8.16.0$":"8.x","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:fix","v9.0.0","backport:prev-major","Team:Security-Scalability","Feature:AutomaticImport"],"title":"[Auto Import] Improve the ECS mapping extraction logic","number":195167,"url":"https://github.com/elastic/kibana/pull/195167","mergeCommit":{"message":"[Auto Import] Improve the ECS mapping extraction logic (#195167)\n\n## Release Notes\r\n\r\nAutomatic Import is more forgiving if an LLM returns an ECS mapping in a\r\nslightly unexpected format.\r\n\r\n## Summary\r\n\r\nWhen implementing https://github.com/elastic/kibana/pull/194386 an issue\r\nhas been encountered where Claude returns the field name `date_format`\r\ninstead of expected `date_formats` and the ECS 
chain breaks down.\r\n\r\nWe add this case as a test to\r\n`x-pack/plugins/integration_assistant/server/graphs/ecs/validate.test`.\r\n\r\nWithout the changes in this PR the list returned by\r\n`findInvalidEcsFields` is\r\n\r\n```\r\n [\r\n 'Reserved ECS field mapping identified for event.created : ai_postgres_202410050058.logs.column1.target',\r\n 'Invalid ECS field mapping identified for 0.9 : ai_postgres_202410050058.logs.column1.confidence, ai_postgres_202410050058.logs.column5.confidence',\r\n 'Invalid ECS field mapping identified for date : ai_postgres_202410050058.logs.column1.type, ai_postgres_202410050058.logs.column9.type',\r\n 'Invalid ECS field mapping identified for 0.95 : ai_postgres_202410050058.logs.column12.confidence',\r\n 'Invalid ECS field mapping identified for string : ai_postgres_202410050058.logs.column12.type, ai_postgres_202410050058.logs.column14.type, ai_postgres_202410050058.logs.column24.type, ai_postgres_202410050058.logs.column5.type, ai_postgres_202410050058.logs.column3.type, ai_postgres_202410050058.logs.column2.type',\r\n 'Invalid ECS field mapping identified for 0.8 : ai_postgres_202410050058.logs.column9.confidence, ai_postgres_202410050058.logs.column3.confidence',\r\n 'Invalid ECS field mapping identified for 0.7 : ai_postgres_202410050058.logs.column14.confidence, ai_postgres_202410050058.logs.column2.confidence',\r\n 'Invalid ECS field mapping identified for 0.85 : ai_postgres_202410050058.logs.column24.confidence'\r\n ]\r\n```\r\n\r\nwhile with these changes the result does not contain any `Invalid ECS field` messages.\r\n\r\nThe key changes are in the `processMapping` function:\r\n\r\n1. We made function more forgiving in regards to the input, accepting\r\n`date_format` in lieu of `date_formats`.\r\n2. 
We have removed the collection of \"other paths\", that is, the reverse\r\nindex for simple values like `0.8`.\r\n\r\nThe latter change generally limits the impact of any other format issues\r\nin the ECS mapping in the future.\r\n\r\nAdditionally, the function has been renamed to `extractECSMapping`, its\r\noutput type validated, and documentation has been added.\r\n\r\n---------\r\n\r\nCo-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>","sha":"637d796071f067f8cab37165dd8f80111251ae81"}},"sourceBranch":"main","suggestedTargetBranches":[],"targetPullRequestStates":[{"branch":"main","label":"v9.0.0","branchLabelMappingKey":"^v9.0.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/195167","number":195167,"mergeCommit":{"message":"[Auto Import] Improve the ECS mapping extraction logic (#195167)\n\n## Release Notes\r\n\r\nAutomatic Import is more forgiving if an LLM returns an ECS mapping in a\r\nslightly unexpected format.\r\n\r\n## Summary\r\n\r\nWhen implementing https://github.com/elastic/kibana/pull/194386 an issue\r\nhas been encountered where Claude returns the field name `date_format`\r\ninstead of expected `date_formats` and the ECS chain breaks down.\r\n\r\nWe add this case as a test to\r\n`x-pack/plugins/integration_assistant/server/graphs/ecs/validate.test`.\r\n\r\nWithout the changes in this PR the list returned by\r\n`findInvalidEcsFields` is\r\n\r\n```\r\n [\r\n 'Reserved ECS field mapping identified for event.created : ai_postgres_202410050058.logs.column1.target',\r\n 'Invalid ECS field mapping identified for 0.9 : ai_postgres_202410050058.logs.column1.confidence, ai_postgres_202410050058.logs.column5.confidence',\r\n 'Invalid ECS field mapping identified for date : ai_postgres_202410050058.logs.column1.type, ai_postgres_202410050058.logs.column9.type',\r\n 'Invalid ECS field mapping identified for 0.95 : ai_postgres_202410050058.logs.column12.confidence',\r\n 'Invalid ECS field mapping 
identified for string : ai_postgres_202410050058.logs.column12.type, ai_postgres_202410050058.logs.column14.type, ai_postgres_202410050058.logs.column24.type, ai_postgres_202410050058.logs.column5.type, ai_postgres_202410050058.logs.column3.type, ai_postgres_202410050058.logs.column2.type',\r\n 'Invalid ECS field mapping identified for 0.8 : ai_postgres_202410050058.logs.column9.confidence, ai_postgres_202410050058.logs.column3.confidence',\r\n 'Invalid ECS field mapping identified for 0.7 : ai_postgres_202410050058.logs.column14.confidence, ai_postgres_202410050058.logs.column2.confidence',\r\n 'Invalid ECS field mapping identified for 0.85 : ai_postgres_202410050058.logs.column24.confidence'\r\n ]\r\n```\r\n\r\nwhile with these changes the result does not contain any `Invalid ECS field` messages.\r\n\r\nThe key changes are in the `processMapping` function:\r\n\r\n1. We made function more forgiving in regards to the input, accepting\r\n`date_format` in lieu of `date_formats`.\r\n2. We have removed the collection of \"other paths\", that is, the reverse\r\nindex for simple values like `0.8`.\r\n\r\nThe latter change generally limits the impact of any other format issues\r\nin the ECS mapping in the future.\r\n\r\nAdditionally, the function has been renamed to `extractECSMapping`, its\r\noutput type validated, and documentation has been added.\r\n\r\n---------\r\n\r\nCo-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>","sha":"637d796071f067f8cab37165dd8f80111251ae81"}}]}] BACKPORT--> Co-authored-by: Ilya Nikokoshev <ilya.nikokoshev@elastic.co>
This commit is contained in:
parent
deeb60428d
commit
9edfe2efab
2 changed files with 193 additions and 36 deletions
|
@ -8,14 +8,14 @@
|
|||
import { ECS_RESERVED } from './constants';
|
||||
|
||||
import {
|
||||
extractECSMapping,
|
||||
findDuplicateFields,
|
||||
findInvalidEcsFields,
|
||||
processMapping,
|
||||
removeReservedFields,
|
||||
} from './validate';
|
||||
|
||||
describe('Testing ecs handler', () => {
|
||||
it('processMapping()', async () => {
|
||||
it('extractECSMapping()', async () => {
|
||||
const path: string[] = [];
|
||||
const value = {
|
||||
checkpoint: {
|
||||
|
@ -50,7 +50,7 @@ describe('Testing ecs handler', () => {
|
|||
},
|
||||
};
|
||||
const output: Record<string, string[][]> = {};
|
||||
await processMapping(path, value, output);
|
||||
await extractECSMapping(path, value, output);
|
||||
expect(output).toEqual({
|
||||
'source.address': [['checkpoint', 'firewall', 'origin']],
|
||||
'user.name': [['checkpoint', 'firewall', 'administrator']],
|
||||
|
@ -96,6 +96,110 @@ describe('findInvalidEcsFields', () => {
|
|||
const invalid = findInvalidEcsFields(ecsMappingReserved);
|
||||
expect(invalid.length).toBe(1);
|
||||
});
|
||||
|
||||
it('invalid: date_format fields (natural example)', async () => {
  // Builds one mapped column. Note the singular `date_format` key: this is the
  // misspelling of `date_formats` that this test is about.
  const mapped = (target: string, confidence: number, type: string, dateFormat: string[]) => ({
    target,
    confidence,
    type,
    date_format: dateFormat,
  });

  // Columns without a mapping are explicitly null.
  const logs = {
    column1: mapped('event.created', 0.9, 'date', ['yyyy-MM-dd HH:mm:ss.SSS z']),
    column12: mapped('log.level', 0.95, 'string', []),
    column11: null,
    column4: null,
    column9: mapped('event.start', 0.8, 'date', ['yyyy-MM-dd HH:mm:ss z']),
    column7: null,
    column6: null,
    column14: mapped('event.reason', 0.7, 'string', []),
    column13: null,
    column24: mapped('process.name', 0.85, 'string', []),
    column23: null,
    column10: null,
    column5: mapped('source.address', 0.9, 'string', []),
    column3: mapped('user.name', 0.8, 'string', []),
    column2: mapped('destination.user.name', 0.7, 'string', []),
    column8: null,
  };
  const misspelledMapping = { ai_postgres_202410050058: { logs } };

  // Exactly one problem is expected for the whole mapping
  // (presumably the reserved `event.created` target - confirm against ECS_RESERVED).
  const { length: invalidCount } = findInvalidEcsFields(misspelledMapping);
  expect(invalidCount).toBe(1);
});
|
||||
|
||||
it('invalid: date_format fields (handcrafted example)', async () => {
  // Deliberately mixes the shapes a mapping entry can take: `date_format` as an
  // array, `date_format` as a bare string, the canonical `date_formats`, and an
  // entry that has no `confidence` at all.
  const logs = {
    column1: {
      target: 'event.created',
      confidence: 0.9,
      type: 'date',
      date_format: ['yyyy-MM-dd HH:mm:ss.SSS z'],
    },
    column12: {
      target: 'log.level',
      confidence: 0.95,
      type: 'string',
      date_formats: [],
    },
    column11: null,
    column4: null,
    column9: {
      target: 'event.start',
      confidence: 0.8,
      type: 'date',
      date_format: 'yyyy-MM-dd HH:mm:ss z',
    },
    column2: {
      target: 'destination.user.name',
      type: 'string',
      date_format: [],
    },
  };
  const mixedShapes = { some_title: { logs } };

  // Exactly one problem is expected for the whole mapping.
  const { length: invalidCount } = findInvalidEcsFields(mixedShapes);
  expect(invalidCount).toBe(1);
});
|
||||
});
|
||||
|
||||
describe('findDuplicateFields', () => {
|
||||
|
|
|
@ -10,7 +10,6 @@ import { mergeSamples } from '../../util/samples';
|
|||
import { ECS_RESERVED } from './constants';
|
||||
import type { EcsBaseNodeParams } from './types';
|
||||
|
||||
const valueFieldKeys = new Set(['target', 'confidence', 'date_formats', 'type']);
|
||||
type AnyObject = Record<string, any>;
|
||||
|
||||
function extractKeys(data: AnyObject, prefix: string = ''): Set<string> {
|
||||
|
@ -46,43 +45,97 @@ function findMissingFields(combinedSamples: string, ecsMapping: AnyObject): stri
|
|||
return missingKeys;
|
||||
}
|
||||
|
||||
export function processMapping(
|
||||
// Describes an LLM-generated ECS mapping candidate.
interface ECSFieldTarget {
  /** ECS field the source field is mapped to, e.g. `event.created`. */
  target: string;
  /** Field type reported alongside the target, e.g. `date` or `string`. */
  type: string;
  /** Confidence reported for the mapping; filled with a default when absent. */
  confidence: number;
  /** Date format patterns; empty when none were supplied. */
  date_formats: string[];
}

// Confidence assigned when the candidate does not carry a usable numeric one.
const DEFAULT_ECS_CONFIDENCE = 0.5;

/**
 * Reads the date formats of a mapping candidate, tolerating the misspelled
 * `date_format` key and a bare string in place of an array.
 *
 * Precedence: `date_formats` array, `date_format` array, `date_format` string,
 * `date_formats` string. Non-string array entries are dropped so the result
 * always honours the declared `string[]` type.
 */
function readDateFormats(candidate: Record<string, unknown>): string[] {
  const { date_formats: plural, date_format: singular } = candidate;
  const isString = (item: unknown): item is string => typeof item === 'string';

  if (Array.isArray(plural)) {
    return plural.filter(isString);
  }
  if (Array.isArray(singular)) {
    return singular.filter(isString);
  }
  if (isString(singular) && singular !== '') {
    return [singular];
  }
  if (isString(plural) && plural !== '') {
    return [plural];
  }
  return [];
}

/**
 * Parses a given object as an ECSFieldTarget object if it meets the required structure.
 *
 * @param value - The value to be converted to an ECSFieldTarget object. It must be a plain
 *                (non-array) object with non-empty string properties `target` and `type`.
 *                `confidence` and `date_formats` / `date_format` are optional and are
 *                filled in with defaults (0.5 and `[]`) when missing or malformed.
 * @returns An ECSFieldTarget object if the conversion succeeded, otherwise null.
 */
function asECSFieldTarget(value: unknown): ECSFieldTarget | null {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return null;
  }

  const candidate = value as Record<string, unknown>;
  const { target, type, confidence } = candidate;

  if (typeof target !== 'string' || target === '') {
    return null;
  }
  if (typeof type !== 'string' || type === '') {
    return null;
  }

  // Test the type rather than truthiness so that an explicit confidence of 0
  // is preserved instead of being replaced by the default.
  const hasConfidence = typeof confidence === 'number' && !Number.isNaN(confidence);

  return {
    target,
    type,
    confidence: hasConfidence ? confidence : DEFAULT_ECS_CONFIDENCE,
    date_formats: readDateFormats(candidate),
  };
}
|
||||
|
||||
/**
 * Extracts ECS (Elastic Common Schema) field mapping dictionary from the LLM output.
 *
 * @param path - The current path in the object being traversed (an array of strings).
 * @param value - The value to be processed, which can be an array, object, or other types.
 * @param output - A record where the extracted ECS mappings will be stored. The keys are ECS targets,
 *                 and the values are arrays of paths. It is mutated in place.
 *
 * This function recursively traverses the provided value:
 * - If the value is an array, each object item is processed with the same path.
 * - If the value can be interpreted as an ECS mapping, the path is added to the output record
 *   under the corresponding ECS target and traversal stops for that branch.
 * - If the value is any other object, its properties are traversed with the key appended to the path.
 * - Primitive values and null are ignored; no reverse index is built for them.
 */
export function extractECSMapping(
  path: string[],
  value: any,
  output: Record<string, string[][]>
): void {
  if (Array.isArray(value)) {
    // If the value is an array, iterate through items and process them.
    for (const item of value) {
      if (typeof item === 'object' && item !== null) {
        extractECSMapping(path, item, output);
      }
    }
    return;
  }

  const ecsFieldTarget = asECSFieldTarget(value);
  if (ecsFieldTarget) {
    // If we can interpret the value as an ECSFieldTarget.
    const { target } = ecsFieldTarget;
    // Check for an own property: a target such as `constructor` or `toString`
    // would otherwise resolve to an inherited Object.prototype member, skip
    // initialisation and then fail on `.push`. defineProperty also keeps a
    // `__proto__` target from replacing the prototype of `output`.
    if (!Object.prototype.hasOwnProperty.call(output, target)) {
      Object.defineProperty(output, target, {
        value: [],
        writable: true,
        enumerable: true,
        configurable: true,
      });
    }
    output[target].push(path);
    return;
  }

  if (typeof value === 'object' && value !== null) {
    // Regular dictionary, continue traversing.
    for (const [k, v] of Object.entries(value)) {
      extractECSMapping([...path, k], v, output);
    }
  }
}
|
||||
|
||||
|
@ -96,7 +149,7 @@ export function findDuplicateFields(prefixedSamples: string[], ecsMapping: AnyOb
|
|||
const output: Record<string, string[][]> = {};
|
||||
|
||||
// Get all keys for each target ECS mapping field
|
||||
processMapping([], ecsMapping, output);
|
||||
extractECSMapping([], ecsMapping, output);
|
||||
|
||||
// Filter out any ECS field that does not have multiple source fields mapped to it
|
||||
const filteredOutput = Object.fromEntries(
|
||||
|
@ -138,7 +191,7 @@ export function findInvalidEcsFields(currentMapping: AnyObject): string[] {
|
|||
const ecsDict = ECS_FULL;
|
||||
const ecsReserved = ECS_RESERVED;
|
||||
|
||||
processMapping([], currentMapping, output);
|
||||
extractECSMapping([], currentMapping, output);
|
||||
const filteredOutput = Object.fromEntries(
|
||||
Object.entries(output).filter(([key, _]) => key !== null)
|
||||
);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue