[ML] Update alerts-as-data payload for Anomaly detection health and Transform health rules (#176307)

This commit is contained in:
Dima Arnautov 2024-02-13 20:55:30 +01:00 committed by GitHub
parent 2598b81704
commit ec8e464251
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 672 additions and 94 deletions

View file

@ -0,0 +1,111 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
// ---------------------------------- WARNING ----------------------------------
// this file was generated, and should not be edited by hand
// ---------------------------------- WARNING ----------------------------------
import * as rt from 'io-ts';
import { Either } from 'fp-ts/lib/Either';
import { AlertSchema } from './alert_schema';
const ISO_DATE_PATTERN = /^d{4}-d{2}-d{2}Td{2}:d{2}:d{2}.d{3}Z$/;
export const IsoDateString = new rt.Type<string, string, unknown>(
'IsoDateString',
rt.string.is,
(input, context): Either<rt.Errors, string> => {
if (typeof input === 'string' && ISO_DATE_PATTERN.test(input)) {
return rt.success(input);
} else {
return rt.failure(input, context);
}
},
rt.identity
);
export type IsoDateStringC = typeof IsoDateString;
export const schemaUnknown = rt.unknown;
export const schemaUnknownArray = rt.array(rt.unknown);
export const schemaString = rt.string;
export const schemaStringArray = rt.array(schemaString);
export const schemaNumber = rt.number;
export const schemaNumberArray = rt.array(schemaNumber);
export const schemaDate = rt.union([IsoDateString, schemaNumber]);
export const schemaDateArray = rt.array(schemaDate);
export const schemaDateRange = rt.partial({
gte: schemaDate,
lte: schemaDate,
});
export const schemaDateRangeArray = rt.array(schemaDateRange);
export const schemaStringOrNumber = rt.union([schemaString, schemaNumber]);
export const schemaStringOrNumberArray = rt.array(schemaStringOrNumber);
export const schemaBoolean = rt.boolean;
export const schemaBooleanArray = rt.array(schemaBoolean);
const schemaGeoPointCoords = rt.type({
type: schemaString,
coordinates: schemaNumberArray,
});
const schemaGeoPointString = schemaString;
const schemaGeoPointLatLon = rt.type({
lat: schemaNumber,
lon: schemaNumber,
});
const schemaGeoPointLocation = rt.type({
location: schemaNumberArray,
});
const schemaGeoPointLocationString = rt.type({
location: schemaString,
});
export const schemaGeoPoint = rt.union([
schemaGeoPointCoords,
schemaGeoPointString,
schemaGeoPointLatLon,
schemaGeoPointLocation,
schemaGeoPointLocationString,
]);
export const schemaGeoPointArray = rt.array(schemaGeoPoint);
// prettier-ignore
const MlAnomalyDetectionHealthAlertRequired = rt.type({
});
// prettier-ignore
const MlAnomalyDetectionHealthAlertOptional = rt.partial({
'kibana.alert.datafeed_results': rt.array(
rt.partial({
datafeed_id: schemaString,
datafeed_state: schemaString,
job_id: schemaString,
job_state: schemaString,
})
),
'kibana.alert.delayed_data_results': rt.array(
rt.partial({
annotation: schemaString,
end_timestamp: schemaDate,
job_id: schemaString,
missed_docs_count: schemaStringOrNumber,
})
),
'kibana.alert.job_errors_results': rt.array(
rt.partial({
errors: schemaUnknown,
job_id: schemaString,
})
),
'kibana.alert.mml_results': rt.array(
rt.partial({
job_id: schemaString,
log_time: schemaDate,
memory_status: schemaString,
model_bytes: schemaStringOrNumber,
model_bytes_exceeded: schemaStringOrNumber,
model_bytes_memory_limit: schemaStringOrNumber,
peak_model_bytes: schemaStringOrNumber,
})
),
});
// prettier-ignore
export const MlAnomalyDetectionHealthAlertSchema = rt.intersection([MlAnomalyDetectionHealthAlertRequired, MlAnomalyDetectionHealthAlertOptional, AlertSchema]);
// prettier-ignore
export type MlAnomalyDetectionHealthAlert = rt.TypeOf<typeof MlAnomalyDetectionHealthAlertSchema>;

View file

@ -0,0 +1,88 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
// ---------------------------------- WARNING ----------------------------------
// this file was generated, and should not be edited by hand
// ---------------------------------- WARNING ----------------------------------
import * as rt from 'io-ts';
import { Either } from 'fp-ts/lib/Either';
import { AlertSchema } from './alert_schema';
const ISO_DATE_PATTERN = /^d{4}-d{2}-d{2}Td{2}:d{2}:d{2}.d{3}Z$/;
export const IsoDateString = new rt.Type<string, string, unknown>(
'IsoDateString',
rt.string.is,
(input, context): Either<rt.Errors, string> => {
if (typeof input === 'string' && ISO_DATE_PATTERN.test(input)) {
return rt.success(input);
} else {
return rt.failure(input, context);
}
},
rt.identity
);
export type IsoDateStringC = typeof IsoDateString;
export const schemaUnknown = rt.unknown;
export const schemaUnknownArray = rt.array(rt.unknown);
export const schemaString = rt.string;
export const schemaStringArray = rt.array(schemaString);
export const schemaNumber = rt.number;
export const schemaNumberArray = rt.array(schemaNumber);
export const schemaDate = rt.union([IsoDateString, schemaNumber]);
export const schemaDateArray = rt.array(schemaDate);
export const schemaDateRange = rt.partial({
gte: schemaDate,
lte: schemaDate,
});
export const schemaDateRangeArray = rt.array(schemaDateRange);
export const schemaStringOrNumber = rt.union([schemaString, schemaNumber]);
export const schemaStringOrNumberArray = rt.array(schemaStringOrNumber);
export const schemaBoolean = rt.boolean;
export const schemaBooleanArray = rt.array(schemaBoolean);
const schemaGeoPointCoords = rt.type({
type: schemaString,
coordinates: schemaNumberArray,
});
const schemaGeoPointString = schemaString;
const schemaGeoPointLatLon = rt.type({
lat: schemaNumber,
lon: schemaNumber,
});
const schemaGeoPointLocation = rt.type({
location: schemaNumberArray,
});
const schemaGeoPointLocationString = rt.type({
location: schemaString,
});
export const schemaGeoPoint = rt.union([
schemaGeoPointCoords,
schemaGeoPointString,
schemaGeoPointLatLon,
schemaGeoPointLocation,
schemaGeoPointLocationString,
]);
export const schemaGeoPointArray = rt.array(schemaGeoPoint);
// prettier-ignore
const TransformHealthAlertRequired = rt.type({
});
// prettier-ignore
const TransformHealthAlertOptional = rt.partial({
'kibana.alert.results': rt.array(
rt.partial({
description: schemaString,
health_status: schemaString,
issues: schemaUnknown,
node_name: schemaString,
transform_id: schemaString,
transform_state: schemaString,
})
),
});
// prettier-ignore
export const TransformHealthAlertSchema = rt.intersection([TransformHealthAlertRequired, TransformHealthAlertOptional, AlertSchema]);
// prettier-ignore
export type TransformHealthAlert = rt.TypeOf<typeof TransformHealthAlertSchema>;

View file

@ -15,6 +15,8 @@ import type { ObservabilityUptimeAlert } from './generated/observability_uptime_
import type { SecurityAlert } from './generated/security_schema';
import type { MlAnomalyDetectionAlert } from './generated/ml_anomaly_detection_schema';
import type { DefaultAlert } from './generated/default_schema';
import type { MlAnomalyDetectionHealthAlert } from './generated/ml_anomaly_detection_health_schema';
import type { TransformHealthAlert } from './generated/transform_health_schema';
export * from './create_schema_from_field_map';
@ -27,7 +29,9 @@ export type { ObservabilityUptimeAlert } from './generated/observability_uptime_
export type { SecurityAlert } from './generated/security_schema';
export type { StackAlert } from './generated/stack_schema';
export type { MlAnomalyDetectionAlert } from './generated/ml_anomaly_detection_schema';
export type { MlAnomalyDetectionHealthAlert } from './generated/ml_anomaly_detection_health_schema';
export type { DefaultAlert } from './generated/default_schema';
export type { TransformHealthAlert } from './generated/transform_health_schema';
export type AADAlert =
| Alert
@ -38,4 +42,6 @@ export type AADAlert =
| ObservabilityUptimeAlert
| SecurityAlert
| MlAnomalyDetectionAlert
| MlAnomalyDetectionHealthAlert
| TransformHealthAlert
| DefaultAlert;

View file

@ -9115,7 +9115,34 @@ Object {
exports[`Alert as data fields checks detect AAD fields changes for: transform_health 1`] = `
Object {
"fieldMap": Object {},
"fieldMap": Object {
"kibana.alert.results": Object {
"array": true,
"dynamic": false,
"properties": Object {
"description": Object {
"type": "text",
},
"health_status": Object {
"type": "keyword",
},
"issues": Object {
"type": "object",
},
"node_name": Object {
"type": "keyword",
},
"transform_id": Object {
"type": "keyword",
},
"transform_state": Object {
"type": "keyword",
},
},
"required": false,
"type": "object",
},
},
}
`;
@ -9233,7 +9260,91 @@ Object {
exports[`Alert as data fields checks detect AAD fields changes for: xpack.ml.anomaly_detection_jobs_health 1`] = `
Object {
"fieldMap": Object {},
"fieldMap": Object {
"kibana.alert.datafeed_results": Object {
"array": true,
"dynamic": false,
"properties": Object {
"datafeed_id": Object {
"type": "keyword",
},
"datafeed_state": Object {
"type": "keyword",
},
"job_id": Object {
"type": "keyword",
},
"job_state": Object {
"type": "keyword",
},
},
"required": false,
"type": "object",
},
"kibana.alert.delayed_data_results": Object {
"array": true,
"dynamic": false,
"properties": Object {
"annotation": Object {
"type": "text",
},
"end_timestamp": Object {
"type": "date",
},
"job_id": Object {
"type": "keyword",
},
"missed_docs_count": Object {
"type": "long",
},
},
"required": false,
"type": "object",
},
"kibana.alert.job_errors_results": Object {
"array": true,
"dynamic": false,
"properties": Object {
"errors": Object {
"type": "object",
},
"job_id": Object {
"type": "keyword",
},
},
"required": false,
"type": "object",
},
"kibana.alert.mml_results": Object {
"array": true,
"dynamic": false,
"properties": Object {
"job_id": Object {
"type": "keyword",
},
"log_time": Object {
"type": "date",
},
"memory_status": Object {
"type": "keyword",
},
"model_bytes": Object {
"type": "long",
},
"model_bytes_exceeded": Object {
"type": "long",
},
"model_bytes_memory_limit": Object {
"type": "long",
},
"peak_model_bytes": Object {
"type": "long",
},
},
"required": false,
"type": "object",
},
},
}
`;

View file

@ -85,6 +85,8 @@ export const HEALTH_CHECK_NAMES: Record<JobsHealthTests, { name: string; descrip
};
const ML_ALERT_NAMESPACE = ALERT_NAMESPACE;
// Anomaly detection rule type fields
export const ALERT_ANOMALY_TIMESTAMP = `${ML_ALERT_NAMESPACE}.anomaly_timestamp` as const;
export const ALERT_ANOMALY_DETECTION_JOB_ID = `${ML_ALERT_NAMESPACE}.job_id` as const;
export const ALERT_ANOMALY_SCORE = `${ML_ALERT_NAMESPACE}.anomaly_score` as const;
@ -92,6 +94,12 @@ export const ALERT_ANOMALY_IS_INTERIM = `${ML_ALERT_NAMESPACE}.is_interim` as co
export const ALERT_TOP_RECORDS = `${ML_ALERT_NAMESPACE}.top_records` as const;
export const ALERT_TOP_INFLUENCERS = `${ML_ALERT_NAMESPACE}.top_influencers` as const;
// Anomaly detection health rule type fields
export const ALERT_MML_RESULTS = `${ML_ALERT_NAMESPACE}.mml_results` as const;
export const ALERT_DATAFEED_RESULTS = `${ML_ALERT_NAMESPACE}.datafeed_results` as const;
export const ALERT_DELAYED_DATA_RESULTS = `${ML_ALERT_NAMESPACE}.delayed_data_results` as const;
export const ALERT_JOB_ERRORS_RESULTS = `${ML_ALERT_NAMESPACE}.job_errors_results` as const;
export const alertFieldNameMap = Object.freeze<Record<string, string>>({
[ALERT_RULE_NAME]: i18n.translate('xpack.ml.alertsTable.columns.ruleName', {
defaultMessage: 'Rule name',

View file

@ -254,6 +254,10 @@ describe('JobsHealthService', () => {
message: 'No errors in the jobs messages.',
results: [],
},
payload: {
'kibana.alert.job_errors_results': [],
'kibana.alert.reason': 'No errors in the jobs messages.',
},
isHealthy: true,
name: 'Errors in job messages',
},
@ -310,6 +314,18 @@ describe('JobsHealthService', () => {
],
message: 'Job test_job_01 is suffering from delayed data.',
},
payload: {
'kibana.alert.delayed_data_results': [
{
annotation:
'Datafeed has missed 11 documents due to ingest latency, latest bucket with missing data is [2021-07-30T13:50:00.000Z]. Consider increasing query_delay',
end_timestamp: 1627653300000,
job_id: 'test_job_01',
missed_docs_count: 11,
},
],
'kibana.alert.reason': 'Job test_job_01 is suffering from delayed data.',
},
},
]);
});
@ -365,6 +381,17 @@ describe('JobsHealthService', () => {
],
message: 'Datafeed is not started for job test_job_02',
},
payload: {
'kibana.alert.datafeed_results': [
{
datafeed_id: 'test_datafeed_02',
datafeed_state: 'stopped',
job_id: 'test_job_02',
job_state: 'opened',
},
],
'kibana.alert.reason': 'Datafeed is not started for job test_job_02',
},
},
{
isHealthy: false,
@ -384,6 +411,21 @@ describe('JobsHealthService', () => {
message:
'Job test_job_01 reached the hard model memory limit. Assign more memory to the job and restore it from a snapshot taken prior to reaching the hard limit.',
},
payload: {
'kibana.alert.mml_results': [
{
job_id: 'test_job_01',
log_time: 1626935914540,
memory_status: 'hard_limit',
model_bytes: 1000000,
model_bytes_exceeded: 200000,
model_bytes_memory_limit: 800000,
peak_model_bytes: 1000000,
},
],
'kibana.alert.reason':
'Job test_job_01 reached the hard model memory limit. Assign more memory to the job and restore it from a snapshot taken prior to reaching the hard limit.',
},
},
{
isHealthy: false,
@ -407,6 +449,25 @@ describe('JobsHealthService', () => {
],
message: 'Jobs test_job_01, test_job_02 are suffering from delayed data.',
},
payload: {
'kibana.alert.delayed_data_results': [
{
annotation:
'Datafeed has missed 11 documents due to ingest latency, latest bucket with missing data is [2021-07-30T13:50:00.000Z]. Consider increasing query_delay',
end_timestamp: 1627653300000,
job_id: 'test_job_01',
missed_docs_count: 11,
},
{
annotation:
'Datafeed has missed 8 documents due to ingest latency, latest bucket with missing data is [2021-07-30T13:50:00.000Z]. Consider increasing query_delay',
end_timestamp: 1627653300000,
job_id: 'test_job_02',
missed_docs_count: 8,
},
],
'kibana.alert.reason': 'Jobs test_job_01, test_job_02 are suffering from delayed data.',
},
},
{
isHealthy: true,
@ -415,6 +476,10 @@ describe('JobsHealthService', () => {
message: 'No errors in the jobs messages.',
results: [],
},
payload: {
'kibana.alert.job_errors_results': [],
'kibana.alert.reason': 'No errors in the jobs messages.',
},
},
]);
});

View file

@ -10,18 +10,27 @@ import type { KibanaRequest, Logger, SavedObjectsClientContract } from '@kbn/cor
import { i18n } from '@kbn/i18n';
import type { MlJob } from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { isDefined } from '@kbn/ml-is-defined';
import { ALERT_REASON } from '@kbn/rule-data-utils';
import type { MlClient } from '../ml_client';
import type { JobSelection } from '../../routes/schemas/alerting_schema';
import { datafeedsProvider, type DatafeedsService } from '../../models/job_service/datafeeds';
import { ALL_JOBS_SELECTION, HEALTH_CHECK_NAMES } from '../../../common/constants/alerts';
import {
ALERT_DATAFEED_RESULTS,
ALERT_DELAYED_DATA_RESULTS,
ALERT_JOB_ERRORS_RESULTS,
ALERT_MML_RESULTS,
ALL_JOBS_SELECTION,
HEALTH_CHECK_NAMES,
} from '../../../common/constants/alerts';
import type { DatafeedStats } from '../../../common/types/anomaly_detection_jobs';
import type { GetGuards } from '../../shared_services/shared_services';
import type {
AnomalyDetectionJobHealthAlertPayload,
AnomalyDetectionJobsHealthAlertContext,
DelayedDataResponse,
DelayedDataPayloadResponse,
JobsErrorsResponse,
JobsHealthExecutorOptions,
MmlTestResponse,
MmlTestPayloadResponse,
NotStartedDatafeedResponse,
} from './register_jobs_monitoring_rule_type';
import {
@ -40,8 +49,9 @@ import type { FieldFormatsRegistryProvider } from '../../../common/types/kibana'
export interface TestResult {
name: string;
context: AnomalyDetectionJobsHealthAlertContext;
payload: AnomalyDetectionJobHealthAlertPayload;
/**
* Indicates if the health check is successful.
* Indicates if the health check is successful.
*/
isHealthy: boolean;
}
@ -61,11 +71,34 @@ export function jobsHealthServiceProvider(
*/
const getFormatters = memoize(async () => {
const fieldFormatsRegistry = await getFieldsFormatRegistry();
const dateFormatter = fieldFormatsRegistry.deserialize({ id: 'date' });
const bytesFormatter = fieldFormatsRegistry.deserialize({ id: 'bytes' });
const dateFormat = fieldFormatsRegistry.deserialize({ id: 'date' });
const bytesFormat = fieldFormatsRegistry.deserialize({ id: 'bytes' });
const dateFormatter = dateFormat.convert.bind(dateFormat);
const bytesFormatter = bytesFormat.convert.bind(bytesFormat);
return {
dateFormatter: dateFormatter.convert.bind(dateFormatter),
bytesFormatter: bytesFormatter.convert.bind(bytesFormatter),
dateFormatter,
bytesFormatter,
mmlResultsFormatter: (payload: MmlTestPayloadResponse) => {
return {
job_id: payload.job_id,
memory_status: payload.memory_status,
log_time: dateFormatter(payload.log_time),
model_bytes: bytesFormatter(payload.model_bytes),
model_bytes_memory_limit: bytesFormatter(payload.model_bytes_memory_limit),
peak_model_bytes: bytesFormatter(payload.peak_model_bytes),
model_bytes_exceeded: bytesFormatter(payload.model_bytes_exceeded),
};
},
delayedDataFormatter: (payload: DelayedDataPayloadResponse) => {
return {
job_id: payload.job_id,
annotation: payload.annotation,
missed_docs_count: payload.missed_docs_count,
end_timestamp: dateFormatter(payload.end_timestamp),
};
},
};
});
@ -189,22 +222,20 @@ export function jobsHealthServiceProvider(
* Gets the model memory report for opened jobs.
* @param jobIds
*/
async getMmlReport(jobIds: string[]): Promise<MmlTestResponse[]> {
async getMmlReport(jobIds: string[]): Promise<MmlTestPayloadResponse[]> {
const jobsStats = await getJobStats(jobIds);
const { dateFormatter, bytesFormatter } = await getFormatters();
return jobsStats
.filter((j) => j.state === 'opened')
.map(({ job_id: jobId, model_size_stats: modelSizeStats }) => {
return {
job_id: jobId,
memory_status: modelSizeStats.memory_status,
log_time: dateFormatter(modelSizeStats.log_time),
model_bytes: bytesFormatter(modelSizeStats.model_bytes),
model_bytes_memory_limit: bytesFormatter(modelSizeStats.model_bytes_memory_limit),
peak_model_bytes: bytesFormatter(modelSizeStats.peak_model_bytes),
model_bytes_exceeded: bytesFormatter(modelSizeStats.model_bytes_exceeded),
log_time: modelSizeStats.log_time,
model_bytes: modelSizeStats.model_bytes,
model_bytes_memory_limit: modelSizeStats.model_bytes_memory_limit!,
peak_model_bytes: modelSizeStats.peak_model_bytes!,
model_bytes_exceeded: modelSizeStats.model_bytes_exceeded!,
};
});
},
@ -221,7 +252,7 @@ export function jobsHealthServiceProvider(
jobs: MlJob[],
timeInterval: string | null,
docsCount: number | null
): Promise<[DelayedDataResponse[], DelayedDataResponse[]]> {
): Promise<[DelayedDataPayloadResponse[], DelayedDataPayloadResponse[]]> {
const jobIds = getJobIds(jobs);
const datafeeds = await getDatafeeds(jobIds);
@ -235,8 +266,6 @@ export function jobsHealthServiceProvider(
const defaultLookbackInterval = resolveLookbackInterval(resultJobs, datafeeds!);
const earliestMs = getDelayedDataLookbackTimestamp(timeInterval, defaultLookbackInterval);
const { dateFormatter } = await getFormatters();
const annotationsData = (
await annotationService.getDelayedDataAnnotations({
jobIds: resultJobIds,
@ -268,12 +297,6 @@ export function jobsHealthServiceProvider(
v.end_timestamp > getDelayedDataLookbackTimestamp(timeInterval, jobLookbackInterval);
return isEndTimestampWithinRange;
})
.map((v) => {
return {
...v,
end_timestamp: dateFormatter(v.end_timestamp),
};
});
return partition(annotationsData, (v) => {
@ -343,28 +366,28 @@ export function jobsHealthServiceProvider(
const datafeedResults = isHealthy ? startedDatafeeds : notStartedDatafeeds;
const { count, jobsString } = getJobsAlertingMessageValues(datafeedResults);
const message = isHealthy
? i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.datafeedRecoveryMessage', {
defaultMessage:
'Datafeed is started for {count, plural, one {job} other {jobs}} {jobsString}',
values: { count, jobsString },
})
: i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.datafeedStateMessage', {
defaultMessage:
'Datafeed is not started for {count, plural, one {job} other {jobs}} {jobsString}',
values: { count, jobsString },
});
results.push({
isHealthy,
name: HEALTH_CHECK_NAMES.datafeed.name,
payload: {
[ALERT_REASON]: message,
[ALERT_DATAFEED_RESULTS]: datafeedResults,
},
context: {
results: datafeedResults,
message: isHealthy
? i18n.translate(
'xpack.ml.alertTypes.jobsHealthAlertingRule.datafeedRecoveryMessage',
{
defaultMessage:
'Datafeed is started for {count, plural, one {job} other {jobs}} {jobsString}',
values: { count, jobsString },
}
)
: i18n.translate(
'xpack.ml.alertTypes.jobsHealthAlertingRule.datafeedStateMessage',
{
defaultMessage:
'Datafeed is not started for {count, plural, one {job} other {jobs}} {jobsString}',
values: { count, jobsString },
}
),
message,
},
});
}
@ -424,13 +447,23 @@ export function jobsHealthServiceProvider(
}
}
const mmlResults = isHealthy
? okJobs
: [...(hardLimitJobs ?? []), ...(softLimitJobs ?? [])];
const { mmlResultsFormatter } = await getFormatters();
results.push({
isHealthy,
name: HEALTH_CHECK_NAMES.mml.name,
context: {
results: isHealthy ? okJobs : [...(hardLimitJobs ?? []), ...(softLimitJobs ?? [])],
results: mmlResults.map(mmlResultsFormatter),
message,
},
payload: {
[ALERT_REASON]: message,
[ALERT_MML_RESULTS]: mmlResults,
},
});
}
}
@ -446,23 +479,33 @@ export function jobsHealthServiceProvider(
const isHealthy = exceededThresholdAnnotations.length === 0;
const { count, jobsString } = getJobsAlertingMessageValues(exceededThresholdAnnotations);
const message = isHealthy
? i18n.translate(
'xpack.ml.alertTypes.jobsHealthAlertingRule.delayedDataRecoveryMessage',
{
defaultMessage: 'No data delay has occurred.',
}
)
: i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.delayedDataMessage', {
defaultMessage:
'{count, plural, one {Job} other {Jobs}} {jobsString} {count, plural, one {is} other {are}} suffering from delayed data.',
values: { count, jobsString },
});
const delayedDataResults = isHealthy
? withinThresholdAnnotations
: exceededThresholdAnnotations;
results.push({
isHealthy,
name: HEALTH_CHECK_NAMES.delayedData.name,
context: {
results: isHealthy ? withinThresholdAnnotations : exceededThresholdAnnotations,
message: isHealthy
? i18n.translate(
'xpack.ml.alertTypes.jobsHealthAlertingRule.delayedDataRecoveryMessage',
{
defaultMessage: 'No data delay has occurred.',
}
)
: i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.delayedDataMessage', {
defaultMessage:
'{count, plural, one {Job} other {Jobs}} {jobsString} {count, plural, one {is} other {are}} suffering from delayed data.',
values: { count, jobsString },
}),
results: delayedDataResults.map((await getFormatters()).delayedDataFormatter),
message,
},
payload: {
[ALERT_REASON]: message,
[ALERT_DELAYED_DATA_RESULTS]: delayedDataResults,
},
});
}
@ -472,25 +515,31 @@ export function jobsHealthServiceProvider(
const { count, jobsString } = getJobsAlertingMessageValues(response);
const isHealthy = response.length === 0;
const message = isHealthy
? i18n.translate(
'xpack.ml.alertTypes.jobsHealthAlertingRule.errorMessagesRecoveredMessage',
{
defaultMessage:
'No errors in the {count, plural, one {job} other {jobs}} messages.',
values: { count },
}
)
: i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.errorMessagesMessage', {
defaultMessage:
'{count, plural, one {Job} other {Jobs}} {jobsString} {count, plural, one {contains} other {contain}} errors in the messages.',
values: { count, jobsString },
});
results.push({
isHealthy,
name: HEALTH_CHECK_NAMES.errorMessages.name,
context: {
results: response,
message: isHealthy
? i18n.translate(
'xpack.ml.alertTypes.jobsHealthAlertingRule.errorMessagesRecoveredMessage',
{
defaultMessage:
'No errors in the {count, plural, one {job} other {jobs}} messages.',
values: { count },
}
)
: i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.errorMessagesMessage', {
defaultMessage:
'{count, plural, one {Job} other {Jobs}} {jobsString} {count, plural, one {contains} other {contain}} errors in the messages.',
values: { count, jobsString },
}),
message,
},
payload: {
[ALERT_REASON]: message,
[ALERT_JOB_ERRORS_RESULTS]: response,
},
});
}

View file

@ -6,8 +6,10 @@
*/
import { i18n } from '@kbn/i18n';
import { KibanaRequest, DEFAULT_APP_CATEGORIES } from '@kbn/core/server';
import { DEFAULT_APP_CATEGORIES, KibanaRequest } from '@kbn/core/server';
import type {
ByteSize,
DateTime,
MlDatafeedState,
MlJobState,
MlJobStats,
@ -19,11 +21,18 @@ import type {
RecoveredActionGroupId,
RuleTypeState,
} from '@kbn/alerting-plugin/common';
import { AlertsClientError, DEFAULT_AAD_CONFIG } from '@kbn/alerting-plugin/server';
import type { RuleExecutorOptions } from '@kbn/alerting-plugin/server';
import type { DefaultAlert } from '@kbn/alerts-as-data-utils';
import { AlertsClientError, IRuleTypeAlerts } from '@kbn/alerting-plugin/server';
import { MlAnomalyDetectionHealthAlert } from '@kbn/alerts-as-data-utils';
import { ALERT_REASON } from '@kbn/rule-data-utils';
import { ML_ALERT_TYPES } from '../../../common/constants/alerts';
import { ES_FIELD_TYPES } from '@kbn/field-types';
import {
ALERT_DATAFEED_RESULTS,
ALERT_DELAYED_DATA_RESULTS,
ALERT_JOB_ERRORS_RESULTS,
ALERT_MML_RESULTS,
ML_ALERT_TYPES,
} from '../../../common/constants/alerts';
import { PLUGIN_ID } from '../../../common/constants/app';
import { MINIMUM_FULL_LICENSE } from '../../../common/license';
import {
@ -38,13 +47,23 @@ type ModelSizeStats = MlJobStats['model_size_stats'];
export interface MmlTestResponse {
job_id: string;
memory_status: ModelSizeStats['memory_status'];
log_time: ModelSizeStats['log_time'];
log_time: string;
model_bytes: string;
model_bytes_memory_limit: string;
peak_model_bytes: string;
model_bytes_exceeded: string;
}
export interface MmlTestPayloadResponse {
job_id: string;
memory_status: ModelSizeStats['memory_status'];
log_time: ModelSizeStats['log_time'];
model_bytes: ByteSize;
model_bytes_memory_limit: ByteSize;
peak_model_bytes: ByteSize;
model_bytes_exceeded: ByteSize;
}
export interface NotStartedDatafeedResponse {
datafeed_id: string;
datafeed_state: MlDatafeedState;
@ -62,6 +81,15 @@ export interface DelayedDataResponse {
end_timestamp: string;
}
export interface DelayedDataPayloadResponse {
job_id: string;
/** Annotation string */
annotation: string;
/** Number of missed documents */
missed_docs_count: number;
end_timestamp: DateTime;
}
export interface JobsErrorsResponse {
job_id: string;
errors: Array<Omit<JobMessage, 'timestamp'> & { timestamp: string }>;
@ -78,6 +106,15 @@ export type AnomalyDetectionJobsHealthAlertContext = {
message: string;
} & AlertInstanceContext;
export type AnomalyDetectionJobHealthAlertPayload = {
[ALERT_REASON]: string;
} & (
| { [ALERT_MML_RESULTS]: MmlTestPayloadResponse[] }
| { [ALERT_DATAFEED_RESULTS]: NotStartedDatafeedResponse[] }
| { [ALERT_DELAYED_DATA_RESULTS]: DelayedDataPayloadResponse[] }
| { [ALERT_JOB_ERRORS_RESULTS]: JobsErrorsResponse[] }
);
export const ANOMALY_DETECTION_JOB_REALTIME_ISSUE = 'anomaly_detection_realtime_issue';
export type AnomalyDetectionJobRealtimeIssue = typeof ANOMALY_DETECTION_JOB_REALTIME_ISSUE;
@ -95,9 +132,69 @@ export type JobsHealthExecutorOptions = RuleExecutorOptions<
Record<string, unknown>,
AnomalyDetectionJobsHealthAlertContext,
AnomalyDetectionJobRealtimeIssue,
DefaultAlert
MlAnomalyDetectionHealthAlert
>;
export const ANOMALY_DETECTION_HEALTH_AAD_INDEX_NAME = 'ml.anomaly-detection-health';
export const ANOMALY_DETECTION_HEALTH_AAD_CONFIG: IRuleTypeAlerts<MlAnomalyDetectionHealthAlert> = {
context: ANOMALY_DETECTION_HEALTH_AAD_INDEX_NAME,
mappings: {
fieldMap: {
[ALERT_MML_RESULTS]: {
type: ES_FIELD_TYPES.OBJECT,
array: true,
required: false,
dynamic: false,
properties: {
job_id: { type: ES_FIELD_TYPES.KEYWORD },
memory_status: { type: ES_FIELD_TYPES.KEYWORD },
log_time: { type: ES_FIELD_TYPES.DATE },
model_bytes: { type: ES_FIELD_TYPES.LONG },
model_bytes_memory_limit: { type: ES_FIELD_TYPES.LONG },
peak_model_bytes: { type: ES_FIELD_TYPES.LONG },
model_bytes_exceeded: { type: ES_FIELD_TYPES.LONG },
},
},
[ALERT_DATAFEED_RESULTS]: {
type: ES_FIELD_TYPES.OBJECT,
array: true,
required: false,
dynamic: false,
properties: {
job_id: { type: ES_FIELD_TYPES.KEYWORD },
job_state: { type: ES_FIELD_TYPES.KEYWORD },
datafeed_id: { type: ES_FIELD_TYPES.KEYWORD },
datafeed_state: { type: ES_FIELD_TYPES.KEYWORD },
},
},
[ALERT_DELAYED_DATA_RESULTS]: {
type: ES_FIELD_TYPES.OBJECT,
array: true,
required: false,
dynamic: false,
properties: {
job_id: { type: ES_FIELD_TYPES.KEYWORD },
annotation: { type: ES_FIELD_TYPES.TEXT },
missed_docs_count: { type: ES_FIELD_TYPES.LONG },
end_timestamp: { type: ES_FIELD_TYPES.DATE },
},
},
[ALERT_JOB_ERRORS_RESULTS]: {
type: ES_FIELD_TYPES.OBJECT,
array: true,
required: false,
dynamic: false,
properties: {
job_id: { type: ES_FIELD_TYPES.KEYWORD },
errors: { type: ES_FIELD_TYPES.OBJECT },
},
},
},
},
shouldWrite: true,
};
export function registerJobsMonitoringRuleType({
alerting,
mlServicesProviders,
@ -111,7 +208,7 @@ export function registerJobsMonitoringRuleType({
AnomalyDetectionJobsHealthAlertContext,
AnomalyDetectionJobRealtimeIssue,
RecoveredActionGroupId,
DefaultAlert
MlAnomalyDetectionHealthAlert
>({
id: ML_ALERT_TYPES.AD_JOBS_HEALTH,
name: i18n.translate('xpack.ml.jobsHealthAlertingRule.name', {
@ -155,7 +252,7 @@ export function registerJobsMonitoringRuleType({
minimumLicenseRequired: MINIMUM_FULL_LICENSE,
isExportable: true,
doesSetRecoveryContext: true,
alerts: DEFAULT_AAD_CONFIG,
alerts: ANOMALY_DETECTION_HEALTH_AAD_CONFIG,
async executor(options) {
const {
services,
@ -185,14 +282,12 @@ export function registerJobsMonitoringRuleType({
.join(', ')}`
);
unhealthyTests.forEach(({ name: alertName, context }) => {
unhealthyTests.forEach(({ name: alertName, context, payload }) => {
alertsClient.report({
id: alertName,
actionGroup: ANOMALY_DETECTION_JOB_REALTIME_ISSUE,
context,
payload: {
[ALERT_REASON]: context.message,
},
payload,
});
});
}
@ -205,9 +300,7 @@ export function registerJobsMonitoringRuleType({
alertsClient.setAlertData({
id: recoveredAlertId,
context: testResult.context,
payload: {
[ALERT_REASON]: testResult.context.message,
},
payload: testResult.payload,
});
}
}

View file

@ -8,6 +8,7 @@
import { i18n } from '@kbn/i18n';
import { LicenseType } from '@kbn/licensing-plugin/common/types';
import { ALERT_NAMESPACE } from '@kbn/rule-data-utils';
import { TransformHealthTests } from './types/alerting';
export const DEFAULT_REFRESH_INTERVAL_MS = 30000;
@ -163,10 +164,15 @@ export const TRANSFORM_FUNCTION = {
export type TransformFunction = typeof TRANSFORM_FUNCTION[keyof typeof TRANSFORM_FUNCTION];
// Alerting
export const TRANSFORM_RULE_TYPE = {
TRANSFORM_HEALTH: 'transform_health',
} as const;
const TRANSFORM_ALERT_NAMESPACE = ALERT_NAMESPACE;
export const TRANSFORM_HEALTH_RESULTS = `${TRANSFORM_ALERT_NAMESPACE}.results` as const;
export const ALL_TRANSFORMS_SELECTION = '*';
export const TRANSFORM_HEALTH_CHECK_NAMES: Record<

View file

@ -14,12 +14,18 @@ import type {
RecoveredActionGroupId,
RuleTypeState,
} from '@kbn/alerting-plugin/common';
import { AlertsClientError, DEFAULT_AAD_CONFIG, RuleType } from '@kbn/alerting-plugin/server';
import { AlertsClientError, IRuleTypeAlerts, RuleType } from '@kbn/alerting-plugin/server';
import type { PluginSetupContract as AlertingSetup } from '@kbn/alerting-plugin/server';
import type { FieldFormatsStart } from '@kbn/field-formats-plugin/server';
import type { DefaultAlert } from '@kbn/alerts-as-data-utils';
import type { TransformHealthAlert } from '@kbn/alerts-as-data-utils';
import { ALERT_REASON } from '@kbn/rule-data-utils';
import { PLUGIN, type TransformHealth, TRANSFORM_RULE_TYPE } from '../../../../common/constants';
import { ES_FIELD_TYPES } from '@kbn/field-types';
import {
PLUGIN,
type TransformHealth,
TRANSFORM_RULE_TYPE,
TRANSFORM_HEALTH_RESULTS,
} from '../../../../common/constants';
import { transformHealthRuleParams, TransformHealthRuleParams } from './schema';
import { transformHealthServiceProvider } from './transform_health_service';
@ -35,6 +41,9 @@ export interface TransformStateReportResponse extends BaseTransformAlertResponse
node_name?: string;
}
/**
* @deprecated This health check is no longer in use
*/
export interface ErrorMessagesTransformResponse extends BaseTransformAlertResponse {
error_messages: Array<{ message: string; timestamp: number; node_name?: string }>;
}
@ -63,6 +72,31 @@ interface RegisterParams {
getFieldFormatsStart: () => FieldFormatsStart;
}
export const TRANSFORM_HEALTH_AAD_INDEX_NAME = 'transform.health';
export const TRANSFORM_HEALTH_AAD_CONFIG: IRuleTypeAlerts<TransformHealthAlert> = {
context: TRANSFORM_HEALTH_AAD_INDEX_NAME,
mappings: {
fieldMap: {
[TRANSFORM_HEALTH_RESULTS]: {
type: ES_FIELD_TYPES.OBJECT,
array: true,
required: false,
dynamic: false,
properties: {
transform_id: { type: ES_FIELD_TYPES.KEYWORD },
description: { type: ES_FIELD_TYPES.TEXT },
health_status: { type: ES_FIELD_TYPES.KEYWORD },
issues: { type: ES_FIELD_TYPES.OBJECT },
transform_state: { type: ES_FIELD_TYPES.KEYWORD },
node_name: { type: ES_FIELD_TYPES.KEYWORD },
},
},
},
},
shouldWrite: true,
};
export function registerTransformHealthRuleType(params: RegisterParams) {
const { alerting } = params;
alerting.registerType(getTransformHealthRuleType(params.getFieldFormatsStart));
@ -78,7 +112,7 @@ export function getTransformHealthRuleType(
TransformHealthAlertContext,
TransformIssue,
RecoveredActionGroupId,
DefaultAlert
TransformHealthAlert
> {
return {
id: TRANSFORM_RULE_TYPE.TRANSFORM_HEALTH,
@ -121,7 +155,7 @@ export function getTransformHealthRuleType(
minimumLicenseRequired: PLUGIN.MINIMUM_LICENSE_REQUIRED,
isExportable: true,
doesSetRecoveryContext: true,
alerts: DEFAULT_AAD_CONFIG,
alerts: TRANSFORM_HEALTH_AAD_CONFIG,
async executor(options) {
const {
services: { scopedClusterClient, alertsClient, uiSettingsClient },
@ -153,6 +187,7 @@ export function getTransformHealthRuleType(
context,
payload: {
[ALERT_REASON]: context.message,
[TRANSFORM_HEALTH_RESULTS]: context.results,
},
});
});
@ -168,6 +203,7 @@ export function getTransformHealthRuleType(
context: testResult.context,
payload: {
[ALERT_REASON]: testResult.context.message,
[TRANSFORM_HEALTH_RESULTS]: testResult.context.results,
},
});
}

View file

@ -154,6 +154,7 @@ export function transformHealthServiceProvider({
},
/**
* Returns report about transforms that contain error messages
* @deprecated This health check is no longer in use
* @param transformIds
*/
async getErrorMessagesReport(

View file

@ -19,6 +19,7 @@ import {
ALERT_STATUS,
EVENT_ACTION,
} from '@kbn/rule-data-utils';
import { TRANSFORM_HEALTH_RESULTS } from '@kbn/transform-plugin/common/constants';
import { FtrProviderContext } from '../../../../../../common/ftr_provider_context';
import { getUrlPrefix, ObjectRemover } from '../../../../../../common/lib';
import { Spaces } from '../../../../../scenarios';
@ -76,7 +77,7 @@ export default function ruleTests({ getService }: FtrProviderContext) {
const esTestIndexToolAAD = new ESTestIndexTool(
es,
retry,
`.internal.alerts-default.alerts-default-000001`
`.internal.alerts-transform.health.alerts-default-000001`
);
describe('rule', async () => {
@ -132,6 +133,9 @@ export default function ruleTests({ getService }: FtrProviderContext) {
const aadDocs = await getAllAADDocs(1);
const alertDoc = aadDocs.body.hits.hits[0]._source;
expect(alertDoc[ALERT_REASON]).to.be(`Transform test_transform_01 is not started.`);
expect(alertDoc[TRANSFORM_HEALTH_RESULTS]).to.eql([
{ transform_id: 'test_transform_01', transform_state: 'stopped', health_status: 'green' },
]);
expect(alertDoc[ALERT_RULE_CATEGORY]).to.be(`Transform health`);
expect(alertDoc[ALERT_RULE_NAME]).to.be(`Test all transforms`);
expect(alertDoc[ALERT_RULE_TYPE_ID]).to.be(`transform_health`);

View file

@ -133,7 +133,7 @@ export default function ({ getService }: FtrProviderContext) {
await new Promise((resolve) => setTimeout(resolve, 3000));
await retry.tryForTime(10000, async () => {
await retry.tryForTime(15000, async () => {
const resp2 = await supertest
.post(`/internal/search/ese/${id}`)
.set(ELASTIC_HTTP_VERSION_HEADER, '1')