mirror of
https://github.com/elastic/kibana.git
synced 2025-04-23 09:19:04 -04:00
[ML] Move chi2test
to package (#167237)
The `chi2test` utils so far were only used within the data comparison view. We plan to use them with other plugins, so this PR moves them to a separate package. `SIGNIFICANCE_LEVELS` was updated to include some more digits.
This commit is contained in:
parent
93fc80704a
commit
1b9993eb07
21 changed files with 1247 additions and 1092 deletions
1
.github/CODEOWNERS
vendored
1
.github/CODEOWNERS
vendored
|
@ -502,6 +502,7 @@ x-pack/plugins/metrics_data_access @elastic/infra-monitoring-ui
|
|||
x-pack/packages/ml/agg_utils @elastic/ml-ui
|
||||
x-pack/packages/ml/anomaly_utils @elastic/ml-ui
|
||||
x-pack/packages/ml/category_validator @elastic/ml-ui
|
||||
x-pack/packages/ml/chi2test @elastic/ml-ui
|
||||
x-pack/packages/ml/data_frame_analytics_utils @elastic/ml-ui
|
||||
x-pack/packages/ml/data_grid @elastic/ml-ui
|
||||
x-pack/packages/ml/date_picker @elastic/ml-ui
|
||||
|
|
|
@ -523,6 +523,7 @@
|
|||
"@kbn/ml-agg-utils": "link:x-pack/packages/ml/agg_utils",
|
||||
"@kbn/ml-anomaly-utils": "link:x-pack/packages/ml/anomaly_utils",
|
||||
"@kbn/ml-category-validator": "link:x-pack/packages/ml/category_validator",
|
||||
"@kbn/ml-chi2test": "link:x-pack/packages/ml/chi2test",
|
||||
"@kbn/ml-data-frame-analytics-utils": "link:x-pack/packages/ml/data_frame_analytics_utils",
|
||||
"@kbn/ml-data-grid": "link:x-pack/packages/ml/data_grid",
|
||||
"@kbn/ml-date-picker": "link:x-pack/packages/ml/date_picker",
|
||||
|
|
|
@ -998,6 +998,8 @@
|
|||
"@kbn/ml-anomaly-utils/*": ["x-pack/packages/ml/anomaly_utils/*"],
|
||||
"@kbn/ml-category-validator": ["x-pack/packages/ml/category_validator"],
|
||||
"@kbn/ml-category-validator/*": ["x-pack/packages/ml/category_validator/*"],
|
||||
"@kbn/ml-chi2test": ["x-pack/packages/ml/chi2test"],
|
||||
"@kbn/ml-chi2test/*": ["x-pack/packages/ml/chi2test/*"],
|
||||
"@kbn/ml-data-frame-analytics-utils": ["x-pack/packages/ml/data_frame_analytics_utils"],
|
||||
"@kbn/ml-data-frame-analytics-utils/*": ["x-pack/packages/ml/data_frame_analytics_utils/*"],
|
||||
"@kbn/ml-data-grid": ["x-pack/packages/ml/data_grid"],
|
||||
|
|
4
x-pack/packages/ml/chi2test/README.md
Normal file
4
x-pack/packages/ml/chi2test/README.md
Normal file
|
@ -0,0 +1,4 @@
|
|||
# @kbn/ml-chi2test
|
||||
|
||||
`computeChi2PValue` computes the p-value for how similar the datasets are.
|
||||
Returned value ranges from 0 to 1, with 1 meaning the datasets are identical.
|
|
@ -5,8 +5,8 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
import { computeChi2PValue } from './data_drift_utils';
|
||||
import { Histogram } from './types';
|
||||
import { computeChi2PValue } from './compute_chi_2_pvalue';
|
||||
import type { Histogram } from './types';
|
||||
|
||||
describe('computeChi2PValue()', () => {
|
||||
test('should return close to 1 if datasets are both empty or nearly identical', () => {
|
||||
|
@ -83,6 +83,6 @@ describe('computeChi2PValue()', () => {
|
|||
percentage: 1,
|
||||
},
|
||||
];
|
||||
expect(computeChi2PValue(referenceTerms, comparisonTerms)).toStrictEqual(0);
|
||||
expect(computeChi2PValue(referenceTerms, comparisonTerms)).toStrictEqual(0.000001);
|
||||
});
|
||||
});
|
48
x-pack/packages/ml/chi2test/compute_chi_2_pvalue.ts
Normal file
48
x-pack/packages/ml/chi2test/compute_chi_2_pvalue.ts
Normal file
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import { criticalTableLookup } from './critical_table_lookup';
|
||||
import type { Histogram } from './types';
|
||||
|
||||
/**
 * Compute the p-value for how similar the datasets are.
 * Returned value ranges from 0 to 1, with 1 meaning the datasets are identical.
 *
 * Terms of both arrays are matched by the string form of their `key`, so the
 * numeric key `1` and the string key `'1'` refer to the same term. Only the
 * first 100 unique keys (baseline keys first, then drifted keys) are evaluated.
 * A term without a `percentage` contributes 0.
 *
 * @param {Histogram[]} normalizedBaselineTerms - An array of normalized baseline terms (Histogram objects).
 * @param {Histogram[]} normalizedDriftedTerms - An array of normalized drifted terms (Histogram objects).
 * @returns {number} The p-value indicating the similarity of the datasets.
 */
export const computeChi2PValue = (
  normalizedBaselineTerms: Histogram[],
  normalizedDriftedTerms: Histogram[]
): number => {
  // Index percentages by stringified key. Keys may be numbers, so comparing the
  // raw key against a stringified key with `===` would never match.
  const indexByKey = (terms: Histogram[]): Map<string, number> => {
    const percentageByKey = new Map<string, number>();
    for (const term of terms) {
      const key = term.key.toString();
      // Keep the first occurrence of a key, as a linear `find` would.
      if (!percentageByKey.has(key)) {
        percentageByKey.set(key, term.percentage ?? 0);
      }
    }
    return percentageByKey;
  };

  const baselineByKey = indexByKey(normalizedBaselineTerms);
  const driftedByKey = indexByKey(normalizedDriftedTerms);

  // Get all unique keys from both arrays, capped at 100 entries
  const allKeys = Array.from(new Set([...baselineByKey.keys(), ...driftedByKey.keys()])).slice(
    0,
    100
  );

  // With fewer than two distinct keys there is nothing to compare
  const degreesOfFreedom = allKeys.length - 1;
  if (degreesOfFreedom < 1) return 1;

  // Calculate the chi-squared statistic
  let chiSquared = 0;
  for (const key of allKeys) {
    const observed = driftedByKey.get(key) ?? 0;
    const expected = baselineByKey.get(key) ?? 0;
    chiSquared += (observed - expected) ** 2 / (expected > 0 ? expected : 1e-6); // Prevent divide by zero
  }

  // Use the criticalTableLookup function to determine the p-value
  return criticalTableLookup(chiSquared, degreesOfFreedom);
};
|
1038
x-pack/packages/ml/chi2test/constants.ts
Normal file
1038
x-pack/packages/ml/chi2test/constants.ts
Normal file
File diff suppressed because it is too large
Load diff
40
x-pack/packages/ml/chi2test/critical_table_lookup.ts
Normal file
40
x-pack/packages/ml/chi2test/critical_table_lookup.ts
Normal file
|
@ -0,0 +1,40 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
import { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';
|
||||
|
||||
/**
 * Performs a lookup in a critical values table to determine the significance level
 * associated with a given chi-squared statistic and degrees of freedom.
 *
 * The table row is selected by `df`; within that row, the column whose critical
 * value is closest to `chi2Statistic` selects the returned significance level.
 *
 * @param {number} chi2Statistic - The chi-squared statistic for which the significance level is to be determined.
 * @param {number} df - The degrees of freedom (an integer) for the chi-squared test.
 * @returns {number} The significance level corresponding to the chi-squared statistic and degrees of freedom,
 * or `1` if df is less than 1.
 * @throws {Error} If df is 1 or greater but not an integer.
 * @throws {RangeError} If the critical values table has no row for df.
 */
export const criticalTableLookup = (chi2Statistic: number, df: number) => {
  if (df < 1) return 1;
  if (!Number.isInteger(df)) throw Error('Degrees of freedom must be a valid integer');

  // Rows are stored zero-based, i.e. the row for df = 1 is at index 0
  const criticalValues = CRITICAL_VALUES_TABLE[df - 1];
  if (criticalValues === undefined) {
    // Fail with a descriptive error instead of a TypeError on `undefined[0]`
    throw new RangeError(`No critical values available for ${df} degrees of freedom`);
  }

  // Find the column whose critical value is closest to the statistic;
  // on ties the lowest column index wins.
  let columnIndex = 0;
  let minDiff = Math.abs(criticalValues[0] - chi2Statistic);
  for (let j = 1; j < criticalValues.length; j++) {
    const diff = Math.abs(criticalValues[j] - chi2Statistic);
    if (diff < minDiff) {
      minDiff = diff;
      columnIndex = j;
    }
  }

  // Determine the significance level from the column index
  return SIGNIFICANCE_LEVELS[columnIndex];
};
|
11
x-pack/packages/ml/chi2test/index.ts
Normal file
11
x-pack/packages/ml/chi2test/index.ts
Normal file
|
@ -0,0 +1,11 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
export { computeChi2PValue } from './compute_chi_2_pvalue';
|
||||
export { criticalTableLookup } from './critical_table_lookup';
|
||||
export { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';
|
||||
export type { Histogram } from './types';
|
12
x-pack/packages/ml/chi2test/jest.config.js
Normal file
12
x-pack/packages/ml/chi2test/jest.config.js
Normal file
|
@ -0,0 +1,12 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
module.exports = {
|
||||
preset: '@kbn/test',
|
||||
rootDir: '../../../..',
|
||||
roots: ['<rootDir>/x-pack/packages/ml/chi2test'],
|
||||
};
|
5
x-pack/packages/ml/chi2test/kibana.jsonc
Normal file
5
x-pack/packages/ml/chi2test/kibana.jsonc
Normal file
|
@ -0,0 +1,5 @@
|
|||
{
|
||||
"type": "shared-common",
|
||||
"id": "@kbn/ml-chi2test",
|
||||
"owner": "@elastic/ml-ui"
|
||||
}
|
6
x-pack/packages/ml/chi2test/package.json
Normal file
6
x-pack/packages/ml/chi2test/package.json
Normal file
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"name": "@kbn/ml-chi2test",
|
||||
"private": true,
|
||||
"version": "1.0.0",
|
||||
"license": "Elastic License 2.0"
|
||||
}
|
19
x-pack/packages/ml/chi2test/tsconfig.json
Normal file
19
x-pack/packages/ml/chi2test/tsconfig.json
Normal file
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"extends": "../../../../tsconfig.base.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "target/types",
|
||||
"types": [
|
||||
"jest",
|
||||
"node",
|
||||
"react"
|
||||
]
|
||||
},
|
||||
"include": [
|
||||
"**/*.ts",
|
||||
"**/*.tsx",
|
||||
],
|
||||
"exclude": [
|
||||
"target/**/*"
|
||||
],
|
||||
"kbn_references": []
|
||||
}
|
24
x-pack/packages/ml/chi2test/types.ts
Normal file
24
x-pack/packages/ml/chi2test/types.ts
Normal file
|
@ -0,0 +1,24 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Interface for the Histogram type used by computeChi2PValue.
|
||||
*/
|
||||
export interface Histogram {
|
||||
/**
|
||||
* The doc count of this histogram entry.
|
||||
*/
|
||||
doc_count: number;
|
||||
/**
|
||||
* The key identifying this entry; computeChi2PValue converts it to a string via toString().
|
||||
*/
|
||||
key: string | number;
|
||||
/**
|
||||
* Optional percentage; computeChi2PValue falls back to 0 when it is not set.
|
||||
*/
|
||||
percentage?: number;
|
||||
}
|
|
@ -5,15 +5,20 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
import React from 'react';
|
||||
|
||||
import { SeriesColorAccessor } from '@elastic/charts/dist/chart_types/xy_chart/utils/specs';
|
||||
import { Axis, BarSeries, Chart, Position, ScaleType, Settings, Tooltip } from '@elastic/charts';
|
||||
import React from 'react';
|
||||
|
||||
import { FIELD_FORMAT_IDS } from '@kbn/field-formats-plugin/common';
|
||||
import { getFieldFormatType, useFieldFormatter } from './default_value_formatter';
|
||||
import type { Histogram } from '@kbn/ml-chi2test';
|
||||
|
||||
import { DataComparisonChartTooltipBody } from '../data_drift_chart_tooltip_body';
|
||||
import { NoChartsData } from './no_charts_data';
|
||||
import { DATA_COMPARISON_TYPE } from '../constants';
|
||||
import { DataDriftField, Feature, Histogram } from '../types';
|
||||
import type { DataDriftField, Feature } from '../types';
|
||||
|
||||
import { getFieldFormatType, useFieldFormatter } from './default_value_formatter';
|
||||
import { NoChartsData } from './no_charts_data';
|
||||
|
||||
export const SingleDistributionChart = ({
|
||||
data,
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -5,67 +5,6 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
import { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';
|
||||
import { Histogram } from './types';
|
||||
|
||||
const criticalTableLookup = (chi2Statistic: number, df: number) => {
|
||||
if (df < 1) return 1;
|
||||
if (!Number.isInteger(df)) throw Error('Degrees of freedom must be a valid integer');
|
||||
|
||||
// Get the row index
|
||||
const rowIndex: number = df - 1;
|
||||
|
||||
// Get the column index
|
||||
let minDiff: number = Math.abs(CRITICAL_VALUES_TABLE[rowIndex][0] - chi2Statistic);
|
||||
let columnIndex: number = 0;
|
||||
for (let j = 1; j < CRITICAL_VALUES_TABLE[rowIndex].length; j++) {
|
||||
const diff: number = Math.abs(CRITICAL_VALUES_TABLE[rowIndex][j] - chi2Statistic);
|
||||
if (diff < minDiff) {
|
||||
minDiff = diff;
|
||||
columnIndex = j;
|
||||
}
|
||||
}
|
||||
|
||||
const significanceLevel: number = SIGNIFICANCE_LEVELS[columnIndex];
|
||||
return significanceLevel;
|
||||
};
|
||||
|
||||
/**
|
||||
* Compute the p-value for how similar the datasets are.
|
||||
* Returned value ranges from 0 to 1, with 1 meaning the datasets are identical.
|
||||
* @param normalizedBaselineTerms
|
||||
* @param normalizedDriftedTerms
|
||||
*/
|
||||
export const computeChi2PValue = (
|
||||
normalizedBaselineTerms: Histogram[],
|
||||
normalizedDriftedTerms: Histogram[]
|
||||
) => {
|
||||
// Get all unique keys from both arrays
|
||||
const allKeys: string[] = Array.from(
|
||||
new Set([
|
||||
...normalizedBaselineTerms.map((term) => term.key.toString()),
|
||||
...normalizedDriftedTerms.map((term) => term.key.toString()),
|
||||
])
|
||||
).slice(0, 100);
|
||||
|
||||
// Calculate the chi-squared statistic and degrees of freedom
|
||||
let chiSquared: number = 0;
|
||||
const degreesOfFreedom: number = allKeys.length - 1;
|
||||
|
||||
if (degreesOfFreedom === 0) return 1;
|
||||
|
||||
allKeys.forEach((key) => {
|
||||
const baselineTerm = normalizedBaselineTerms.find((term) => term.key === key);
|
||||
const driftedTerm = normalizedDriftedTerms.find((term) => term.key === key);
|
||||
|
||||
const observed: number = driftedTerm?.percentage ?? 0;
|
||||
const expected: number = baselineTerm?.percentage ?? 0;
|
||||
chiSquared += Math.pow(observed - expected, 2) / (expected > 0 ? expected : 1e-6); // Prevent divide by zero
|
||||
});
|
||||
|
||||
return criticalTableLookup(chiSquared, degreesOfFreedom);
|
||||
};
|
||||
|
||||
/**
|
||||
* formatSignificanceLevel
|
||||
* @param significanceLevel
|
||||
|
|
|
@ -5,10 +5,13 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
import * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
|
||||
import { isPopulatedObject } from '@kbn/ml-is-populated-object';
|
||||
import type { Filter, Query } from '@kbn/es-query';
|
||||
import * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import { SEARCH_QUERY_LANGUAGE, SearchQueryLanguage } from '@kbn/ml-query-utils';
|
||||
import type { Histogram } from '@kbn/ml-chi2test';
|
||||
|
||||
import { DATA_COMPARISON_TYPE } from './constants';
|
||||
|
||||
export interface DataComparisonQueryState {
|
||||
|
@ -52,12 +55,6 @@ export const getDefaultDataComparisonState = (
|
|||
...overrides,
|
||||
});
|
||||
|
||||
export interface Histogram {
|
||||
doc_count: number;
|
||||
key: string | number;
|
||||
percentage?: number;
|
||||
}
|
||||
|
||||
export interface ComparisonHistogram extends Histogram {
|
||||
g: string;
|
||||
}
|
||||
|
|
|
@ -5,27 +5,35 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
import { chunk, cloneDeep, flatten } from 'lodash';
|
||||
import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
|
||||
import type { IKibanaSearchRequest } from '@kbn/data-plugin/common';
|
||||
import { lastValueFrom } from 'rxjs';
|
||||
|
||||
import * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import type {
|
||||
MappingRuntimeFields,
|
||||
QueryDslBoolQuery,
|
||||
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import { AggregationsAggregate } from '@elastic/elasticsearch/lib/api/types';
|
||||
|
||||
import type { IKibanaSearchRequest } from '@kbn/data-plugin/common';
|
||||
import type { DataView } from '@kbn/data-views-plugin/public';
|
||||
import { isPopulatedObject } from '@kbn/ml-is-populated-object';
|
||||
import type { Query } from '@kbn/data-plugin/common';
|
||||
import { chunk, cloneDeep, flatten } from 'lodash';
|
||||
import type { MappingRuntimeFields } from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import type { SearchQueryLanguage } from '@kbn/ml-query-utils';
|
||||
import { getDefaultDSLQuery } from '@kbn/ml-query-utils';
|
||||
import { i18n } from '@kbn/i18n';
|
||||
import { RandomSamplerWrapper } from '@kbn/ml-random-sampler-utils';
|
||||
import { extractErrorMessage } from '@kbn/ml-error-utils';
|
||||
import { AggregationsAggregate } from '@elastic/elasticsearch/lib/api/types';
|
||||
import { QueryDslBoolQuery } from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
|
||||
import { isDefined } from '@kbn/ml-is-defined';
|
||||
import { computeChi2PValue, type Histogram } from '@kbn/ml-chi2test';
|
||||
import { mapAndFlattenFilters } from '@kbn/data-plugin/public';
|
||||
|
||||
import { createMergedEsQuery } from '../index_data_visualizer/utils/saved_search_utils';
|
||||
import { useDataDriftStateManagerContext } from './use_state_manager';
|
||||
import { useDataVisualizerKibana } from '../kibana_context';
|
||||
|
||||
import { useDataDriftStateManagerContext } from './use_state_manager';
|
||||
|
||||
import {
|
||||
REFERENCE_LABEL,
|
||||
COMPARISON_LABEL,
|
||||
|
@ -34,7 +42,6 @@ import {
|
|||
} from './constants';
|
||||
|
||||
import {
|
||||
Histogram,
|
||||
NumericDriftData,
|
||||
CategoricalDriftData,
|
||||
Range,
|
||||
|
@ -46,7 +53,6 @@ import {
|
|||
TimeRange,
|
||||
ComparisonHistogram,
|
||||
} from './types';
|
||||
import { computeChi2PValue } from './data_drift_utils';
|
||||
|
||||
export const getDataComparisonType = (kibanaType: string): DataDriftField['type'] => {
|
||||
switch (kibanaType) {
|
||||
|
|
|
@ -69,7 +69,8 @@
|
|||
"@kbn/ml-random-sampler-utils",
|
||||
"@kbn/data-service",
|
||||
"@kbn/core-notifications-browser",
|
||||
"@kbn/ebt-tools"
|
||||
"@kbn/ebt-tools",
|
||||
"@kbn/ml-chi2test"
|
||||
],
|
||||
"exclude": [
|
||||
"target/**/*",
|
||||
|
|
|
@ -4939,6 +4939,10 @@
|
|||
version "0.0.0"
|
||||
uid ""
|
||||
|
||||
"@kbn/ml-chi2test@link:x-pack/packages/ml/chi2test":
|
||||
version "0.0.0"
|
||||
uid ""
|
||||
|
||||
"@kbn/ml-data-frame-analytics-utils@link:x-pack/packages/ml/data_frame_analytics_utils":
|
||||
version "0.0.0"
|
||||
uid ""
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue