[ML] Move chi2test to package (#167237)

The `chi2test` utils so far were only used within the data comparison view.
We plan to use them with other plugins, so this PR moves them to a separate
package. `SIGNIFICANCE_LEVELS` was updated to include some more
digits.
This commit is contained in:
Walter Rafelsberger 2023-09-27 18:43:27 +02:00 committed by GitHub
parent 93fc80704a
commit 1b9993eb07
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
21 changed files with 1247 additions and 1092 deletions

1
.github/CODEOWNERS vendored
View file

@ -502,6 +502,7 @@ x-pack/plugins/metrics_data_access @elastic/infra-monitoring-ui
x-pack/packages/ml/agg_utils @elastic/ml-ui
x-pack/packages/ml/anomaly_utils @elastic/ml-ui
x-pack/packages/ml/category_validator @elastic/ml-ui
x-pack/packages/ml/chi2test @elastic/ml-ui
x-pack/packages/ml/data_frame_analytics_utils @elastic/ml-ui
x-pack/packages/ml/data_grid @elastic/ml-ui
x-pack/packages/ml/date_picker @elastic/ml-ui

View file

@ -523,6 +523,7 @@
"@kbn/ml-agg-utils": "link:x-pack/packages/ml/agg_utils",
"@kbn/ml-anomaly-utils": "link:x-pack/packages/ml/anomaly_utils",
"@kbn/ml-category-validator": "link:x-pack/packages/ml/category_validator",
"@kbn/ml-chi2test": "link:x-pack/packages/ml/chi2test",
"@kbn/ml-data-frame-analytics-utils": "link:x-pack/packages/ml/data_frame_analytics_utils",
"@kbn/ml-data-grid": "link:x-pack/packages/ml/data_grid",
"@kbn/ml-date-picker": "link:x-pack/packages/ml/date_picker",

View file

@ -998,6 +998,8 @@
"@kbn/ml-anomaly-utils/*": ["x-pack/packages/ml/anomaly_utils/*"],
"@kbn/ml-category-validator": ["x-pack/packages/ml/category_validator"],
"@kbn/ml-category-validator/*": ["x-pack/packages/ml/category_validator/*"],
"@kbn/ml-chi2test": ["x-pack/packages/ml/chi2test"],
"@kbn/ml-chi2test/*": ["x-pack/packages/ml/chi2test/*"],
"@kbn/ml-data-frame-analytics-utils": ["x-pack/packages/ml/data_frame_analytics_utils"],
"@kbn/ml-data-frame-analytics-utils/*": ["x-pack/packages/ml/data_frame_analytics_utils/*"],
"@kbn/ml-data-grid": ["x-pack/packages/ml/data_grid"],

View file

@ -0,0 +1,4 @@
# @kbn/ml-chi2test
`computeChi2PValue` computes the p-value for how similar the datasets are.
Returned value ranges from 0 to 1, with 1 meaning the datasets are identical.

View file

@ -5,8 +5,8 @@
* 2.0.
*/
import { computeChi2PValue } from './data_drift_utils';
import { Histogram } from './types';
import { computeChi2PValue } from './compute_chi_2_pvalue';
import type { Histogram } from './types';
describe('computeChi2PValue()', () => {
test('should return close to 1 if datasets are both empty or nearly identical', () => {
@ -83,6 +83,6 @@ describe('computeChi2PValue()', () => {
percentage: 1,
},
];
expect(computeChi2PValue(referenceTerms, comparisonTerms)).toStrictEqual(0);
expect(computeChi2PValue(referenceTerms, comparisonTerms)).toStrictEqual(0.000001);
});
});

View file

@ -0,0 +1,48 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { criticalTableLookup } from './critical_table_lookup';
import type { Histogram } from './types';
/**
 * Compute the p-value for how similar the datasets are.
 * Returned value ranges from 0 to 1, with 1 meaning the datasets are identical.
 *
 * Terms are matched between the two datasets by the string form of their key,
 * so histograms with numeric keys are compared correctly. Only the first 100
 * unique keys (baseline keys first, then drifted keys) are taken into account.
 * If a key occurs more than once within a dataset, its first occurrence is used.
 * A term without a `percentage`, or a key missing from a dataset, counts as 0.
 *
 * @param {Histogram[]} normalizedBaselineTerms - An array of normalized baseline terms (Histogram objects).
 * @param {Histogram[]} normalizedDriftedTerms - An array of normalized drifted terms (Histogram objects).
 * @returns {number} The p-value indicating the similarity of the datasets.
 */
export const computeChi2PValue = (
  normalizedBaselineTerms: Histogram[],
  normalizedDriftedTerms: Histogram[]
): number => {
  // Index each dataset by stringified key. Keys can be numbers or strings, while the
  // combined key list below is string based, so both sides must use the same form.
  const indexByKey = (terms: Histogram[]): Map<string, number> => {
    const percentagesByKey = new Map<string, number>();
    for (const term of terms) {
      const key = term.key.toString();
      // Keep the first occurrence of a key, mirroring a first-match lookup.
      if (!percentagesByKey.has(key)) {
        percentagesByKey.set(key, term.percentage ?? 0);
      }
    }
    return percentagesByKey;
  };

  const baselineByKey = indexByKey(normalizedBaselineTerms);
  const driftedByKey = indexByKey(normalizedDriftedTerms);

  // Get all unique keys from both datasets, capped at the first 100
  const allKeys: string[] = Array.from(
    new Set([...baselineByKey.keys(), ...driftedByKey.keys()])
  ).slice(0, 100);

  // Calculate the chi-squared statistic and degrees of freedom
  const degreesOfFreedom = allKeys.length - 1;

  // A single shared category cannot differ in distribution.
  if (degreesOfFreedom === 0) return 1;

  let chiSquared = 0;
  for (const key of allKeys) {
    const observed = driftedByKey.get(key) ?? 0;
    const expected = baselineByKey.get(key) ?? 0;
    // Prevent divide by zero when the baseline has no share for this key
    chiSquared += Math.pow(observed - expected, 2) / (expected > 0 ? expected : 1e-6);
  }

  // Use the criticalTableLookup function to determine the p-value
  return criticalTableLookup(chiSquared, degreesOfFreedom);
};

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,40 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';
/**
 * Looks up the significance level for a chi-squared statistic by finding, in the
 * critical values row for the given degrees of freedom, the column whose critical
 * value is closest to the statistic. On ties the leftmost column wins.
 *
 * @param {number} chi2Statistic - The chi-squared statistic for which the significance level is to be determined.
 * @param {number} df - The degrees of freedom for the chi-squared test; row `df - 1` of the table is used.
 * @returns {number} `1` when df is less than 1, otherwise the entry of SIGNIFICANCE_LEVELS at the closest column.
 * @throws {Error} If df is 1 or greater but not an integer.
 */
export const criticalTableLookup = (chi2Statistic: number, df: number) => {
  // Fewer than one degree of freedom: nothing to compare.
  if (df < 1) return 1;
  if (!Number.isInteger(df)) throw Error('Degrees of freedom must be a valid integer');

  // Row of critical values for this number of degrees of freedom
  const criticalValues = CRITICAL_VALUES_TABLE[df - 1];

  // Scan the row for the critical value nearest to the statistic
  let closestColumn = 0;
  let smallestGap = Math.abs(criticalValues[0] - chi2Statistic);
  criticalValues.forEach((criticalValue, column) => {
    const gap = Math.abs(criticalValue - chi2Statistic);
    if (gap < smallestGap) {
      smallestGap = gap;
      closestColumn = column;
    }
  });

  // The column position maps directly onto the significance levels
  return SIGNIFICANCE_LEVELS[closestColumn];
};

View file

@ -0,0 +1,11 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
export { computeChi2PValue } from './compute_chi_2_pvalue';
export { criticalTableLookup } from './critical_table_lookup';
export { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';
export type { Histogram } from './types';

View file

@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
// Jest configuration for the chi2test package: uses the '@kbn/test' preset,
// sets rootDir four directory levels above this file, and restricts the
// roots Jest searches to this package's directory.
module.exports = {
preset: '@kbn/test',
rootDir: '../../../..',
roots: ['<rootDir>/x-pack/packages/ml/chi2test'],
};

View file

@ -0,0 +1,5 @@
{
"type": "shared-common",
"id": "@kbn/ml-chi2test",
"owner": "@elastic/ml-ui"
}

View file

@ -0,0 +1,6 @@
{
"name": "@kbn/ml-chi2test",
"private": true,
"version": "1.0.0",
"license": "Elastic License 2.0"
}

View file

@ -0,0 +1,19 @@
{
"extends": "../../../../tsconfig.base.json",
"compilerOptions": {
"outDir": "target/types",
"types": [
"jest",
"node",
"react"
]
},
"include": [
"**/*.ts",
"**/*.tsx",
],
"exclude": [
"target/**/*"
],
"kbn_references": []
}

View file

@ -0,0 +1,24 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
/**
 * Shape of a single histogram entry as consumed by computeChi2PValue.
 */
export interface Histogram {
  /**
   * The key of the entry; either a string or a number.
   */
  key: string | number;

  /**
   * The doc count of the entry.
   */
  doc_count: number;

  /**
   * Optional percentage of the entry. computeChi2PValue treats a missing
   * percentage as 0.
   */
  percentage?: number;
}

View file

@ -5,15 +5,20 @@
* 2.0.
*/
import React from 'react';
import { SeriesColorAccessor } from '@elastic/charts/dist/chart_types/xy_chart/utils/specs';
import { Axis, BarSeries, Chart, Position, ScaleType, Settings, Tooltip } from '@elastic/charts';
import React from 'react';
import { FIELD_FORMAT_IDS } from '@kbn/field-formats-plugin/common';
import { getFieldFormatType, useFieldFormatter } from './default_value_formatter';
import type { Histogram } from '@kbn/ml-chi2test';
import { DataComparisonChartTooltipBody } from '../data_drift_chart_tooltip_body';
import { NoChartsData } from './no_charts_data';
import { DATA_COMPARISON_TYPE } from '../constants';
import { DataDriftField, Feature, Histogram } from '../types';
import type { DataDriftField, Feature } from '../types';
import { getFieldFormatType, useFieldFormatter } from './default_value_formatter';
import { NoChartsData } from './no_charts_data';
export const SingleDistributionChart = ({
data,

View file

@ -5,67 +5,6 @@
* 2.0.
*/
import { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';
import { Histogram } from './types';
/**
 * Returns the significance level whose critical value, in the table row for the
 * given degrees of freedom, lies closest to the chi-squared statistic.
 * Returns 1 when df is below 1 and throws when df is 1 or greater but not an integer.
 */
const criticalTableLookup = (chi2Statistic: number, df: number) => {
  if (df < 1) return 1;
  if (!Number.isInteger(df)) throw Error('Degrees of freedom must be a valid integer');

  // Row for df sits at index df - 1
  const row = CRITICAL_VALUES_TABLE[df - 1];
  const distanceAt = (column: number): number => Math.abs(row[column] - chi2Statistic);

  // Walk the row and remember the column with the smallest distance (leftmost on ties)
  let bestColumn = 0;
  let bestDistance = distanceAt(0);
  for (let column = 1; column < row.length; column += 1) {
    const distance = distanceAt(column);
    if (distance < bestDistance) {
      bestDistance = distance;
      bestColumn = column;
    }
  }

  return SIGNIFICANCE_LEVELS[bestColumn];
};
/**
 * Compute the p-value for how similar the datasets are.
 * Returned value ranges from 0 to 1, with 1 meaning the datasets are identical.
 *
 * Terms are matched by the string form of their key so that numeric keys are
 * found as well. Only the first 100 unique keys are considered. A missing term
 * or a term without `percentage` counts as 0.
 * @param normalizedBaselineTerms
 * @param normalizedDriftedTerms
 */
export const computeChi2PValue = (
  normalizedBaselineTerms: Histogram[],
  normalizedDriftedTerms: Histogram[]
) => {
  // Get all unique keys (as strings) from both arrays
  const allKeys: string[] = Array.from(
    new Set([
      ...normalizedBaselineTerms.map((term) => term.key.toString()),
      ...normalizedDriftedTerms.map((term) => term.key.toString()),
    ])
  ).slice(0, 100);

  // Calculate the chi-squared statistic and degrees of freedom
  const degreesOfFreedom = allKeys.length - 1;
  if (degreesOfFreedom === 0) return 1;

  // Compare keys in their string form: allKeys holds strings while term keys may be numbers.
  const percentageFor = (terms: Histogram[], key: string): number =>
    terms.find((term) => term.key.toString() === key)?.percentage ?? 0;

  let chiSquared = 0;
  for (const key of allKeys) {
    const observed = percentageFor(normalizedDriftedTerms, key);
    const expected = percentageFor(normalizedBaselineTerms, key);
    // Prevent divide by zero
    chiSquared += Math.pow(observed - expected, 2) / (expected > 0 ? expected : 1e-6);
  }

  return criticalTableLookup(chiSquared, degreesOfFreedom);
};
/**
* formatSignificanceLevel
* @param significanceLevel

View file

@ -5,10 +5,13 @@
* 2.0.
*/
import * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { isPopulatedObject } from '@kbn/ml-is-populated-object';
import type { Filter, Query } from '@kbn/es-query';
import * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { SEARCH_QUERY_LANGUAGE, SearchQueryLanguage } from '@kbn/ml-query-utils';
import type { Histogram } from '@kbn/ml-chi2test';
import { DATA_COMPARISON_TYPE } from './constants';
export interface DataComparisonQueryState {
@ -52,12 +55,6 @@ export const getDefaultDataComparisonState = (
...overrides,
});
/**
 * A single histogram entry: a key, its doc count and an optional percentage.
 */
export interface Histogram {
  /** The key of the entry; either a string or a number. */
  key: string | number;
  /** The doc count of the entry. */
  doc_count: number;
  /** Optional percentage of the entry. */
  percentage?: number;
}
/**
 * A Histogram entry extended with a string field `g`.
 * NOTE(review): `g` presumably names the group/series the entry belongs to — confirm against callers.
 */
export interface ComparisonHistogram extends Histogram {
g: string;
}

View file

@ -5,27 +5,35 @@
* 2.0.
*/
import { chunk, cloneDeep, flatten } from 'lodash';
import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
import type { IKibanaSearchRequest } from '@kbn/data-plugin/common';
import { lastValueFrom } from 'rxjs';
import * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import type {
MappingRuntimeFields,
QueryDslBoolQuery,
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { AggregationsAggregate } from '@elastic/elasticsearch/lib/api/types';
import type { IKibanaSearchRequest } from '@kbn/data-plugin/common';
import type { DataView } from '@kbn/data-views-plugin/public';
import { isPopulatedObject } from '@kbn/ml-is-populated-object';
import type { Query } from '@kbn/data-plugin/common';
import { chunk, cloneDeep, flatten } from 'lodash';
import type { MappingRuntimeFields } from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import type { SearchQueryLanguage } from '@kbn/ml-query-utils';
import { getDefaultDSLQuery } from '@kbn/ml-query-utils';
import { i18n } from '@kbn/i18n';
import { RandomSamplerWrapper } from '@kbn/ml-random-sampler-utils';
import { extractErrorMessage } from '@kbn/ml-error-utils';
import { AggregationsAggregate } from '@elastic/elasticsearch/lib/api/types';
import { QueryDslBoolQuery } from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { isDefined } from '@kbn/ml-is-defined';
import { computeChi2PValue, type Histogram } from '@kbn/ml-chi2test';
import { mapAndFlattenFilters } from '@kbn/data-plugin/public';
import { createMergedEsQuery } from '../index_data_visualizer/utils/saved_search_utils';
import { useDataDriftStateManagerContext } from './use_state_manager';
import { useDataVisualizerKibana } from '../kibana_context';
import { useDataDriftStateManagerContext } from './use_state_manager';
import {
REFERENCE_LABEL,
COMPARISON_LABEL,
@ -34,7 +42,6 @@ import {
} from './constants';
import {
Histogram,
NumericDriftData,
CategoricalDriftData,
Range,
@ -46,7 +53,6 @@ import {
TimeRange,
ComparisonHistogram,
} from './types';
import { computeChi2PValue } from './data_drift_utils';
export const getDataComparisonType = (kibanaType: string): DataDriftField['type'] => {
switch (kibanaType) {

View file

@ -69,7 +69,8 @@
"@kbn/ml-random-sampler-utils",
"@kbn/data-service",
"@kbn/core-notifications-browser",
"@kbn/ebt-tools"
"@kbn/ebt-tools",
"@kbn/ml-chi2test"
],
"exclude": [
"target/**/*",

View file

@ -4939,6 +4939,10 @@
version "0.0.0"
uid ""
"@kbn/ml-chi2test@link:x-pack/packages/ml/chi2test":
version "0.0.0"
uid ""
"@kbn/ml-data-frame-analytics-utils@link:x-pack/packages/ml/data_frame_analytics_utils":
version "0.0.0"
uid ""