[ML] AIOps: Adds log rate analysis to alert details page contextual insight. (#187690)

## Summary

Part of #178501.

This adds log rate analysis results to the prompt used for contextual
insights on alert details pages:

<img width="1149" alt="image"
src="https://github.com/user-attachments/assets/80b0f8e6-1ea1-4dbf-86ff-82c2ef175aa6">

<img width="1129" alt="image"
src="https://github.com/user-attachments/assets/a538cc3c-6f13-43e4-ad7d-8a93a779d349">


### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios
- [x] This was checked for breaking API changes and was [labeled
appropriately](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
Walter Rafelsberger 2024-08-28 13:37:15 +02:00 committed by GitHub
parent 1344d3b238
commit 3c2ce3c839
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 387 additions and 44 deletions

View file

@ -11,7 +11,7 @@ export interface GetLogRateAnalysisParametersFromAlertArgs {
alertStartedAt: string;
alertEndedAt?: string;
timeSize?: number;
timeUnit?: moment.unitOfTime.DurationConstructor;
timeUnit?: string;
}
export const getLogRateAnalysisParametersFromAlert = ({
@ -20,12 +20,7 @@ export const getLogRateAnalysisParametersFromAlert = ({
timeSize,
timeUnit,
}: GetLogRateAnalysisParametersFromAlertArgs) => {
// Identify `intervalFactor` to adjust time ranges based on alert settings.
// The default time ranges for `initialAnalysisStart` are suitable for a `1m` lookback.
// If an alert would have a `5m` lookback, this would result in a factor of `5`.
const lookbackDuration =
timeSize && timeUnit ? moment.duration(timeSize, timeUnit) : moment.duration(1, 'm');
const intervalFactor = Math.max(1, lookbackDuration.asSeconds() / 60);
const intervalFactor = getIntervalFactor(timeSize, timeUnit);
const alertStart = moment(alertStartedAt);
const alertEnd = alertEndedAt ? moment(alertEndedAt) : undefined;
@ -43,6 +38,21 @@ export const getLogRateAnalysisParametersFromAlert = ({
};
};
// Identify `intervalFactor` to adjust time ranges based on alert settings.
// The default time ranges for `initialAnalysisStart` are suitable for a `1m` lookback.
// If an alert would have a `5m` lookback, this would result in a factor of `5`.
export const getIntervalFactor = (timeSize?: number, timeUnit?: string) => {
  const lookbackDuration =
    timeSize && timeUnit
      ? moment.duration(
          timeSize,
          // Cast the string based time unit to moment's union of unit literals.
          // A direct assertion is sufficient since `string` overlaps the literal
          // union; the previous `as unknown as ... | undefined` double-cast was
          // unnecessarily wide (the ternary already guarantees `timeUnit` is set).
          timeUnit as moment.unitOfTime.DurationConstructor
        )
      : moment.duration(1, 'm');
  // `Math.max` guards against unexpected units resolving to a zero/short duration.
  return Math.max(1, lookbackDuration.asSeconds() / 60);
};
interface GetParameterHelperArgs {
alertStart: Moment;
intervalFactor: number;

View file

@ -0,0 +1,213 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import moment from 'moment';
import { queue } from 'async';
import { chunk } from 'lodash';
import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { withSpan } from '@kbn/apm-utils';
import type { ElasticsearchClient } from '@kbn/core/server';
import type { SignificantItem } from '@kbn/ml-agg-utils';
import { getSampleProbability } from '@kbn/ml-random-sampler-utils';
import type { AiopsLogRateAnalysisSchema } from '../api/schema';
import { getIntervalFactor } from '../get_log_rate_analysis_parameters_from_alert';
import { getSwappedWindowParameters } from '../get_swapped_window_parameters';
import { getLogRateChange } from '../get_log_rate_change';
import { getBaselineAndDeviationRates } from '../get_baseline_and_deviation_rates';
import { getLogRateAnalysisTypeForCounts } from '../get_log_rate_analysis_type_for_counts';
import { LOG_RATE_ANALYSIS_TYPE } from '../log_rate_analysis_type';
import { fetchIndexInfo } from './fetch_index_info';
import { fetchSignificantCategories } from './fetch_significant_categories';
import { fetchSignificantTermPValues } from './fetch_significant_term_p_values';
const MAX_CONCURRENT_QUERIES = 5;
const CHUNK_SIZE = 50;
interface QueueItem {
fn: typeof fetchSignificantCategories | typeof fetchSignificantTermPValues;
fieldNames: string[];
}
/**
 * Runs log rate analysis data on an index given some alert metadata.
 *
 * The analysis compares a "deviation" time range (one lookback duration before
 * the alert start) against a "baseline" time range (the two lookback durations
 * before that) and returns statistically significant keyword field/value pairs
 * and text field log message categories, adapted for contextual insights.
 *
 * @param esClient Elasticsearch client used to run the analysis queries.
 * @param abortSignal Optional signal to cancel in-flight queries.
 * @param arguments.index Index (pattern) to run the analysis on.
 * @param arguments.alertStartedAt Timestamp of the alert start.
 * @param arguments.alertRuleParameterTimeSize Optional alert rule lookback size.
 * @param arguments.alertRuleParameterTimeUnit Optional alert rule lookback unit.
 * @param arguments.timefield Time field name, defaults to `@timestamp`.
 * @param arguments.searchQuery Optional query to scope the analysis, defaults
 *        to `match_all`.
 * @returns The detected log rate analysis type (spike/dip) and the significant
 *          items sorted by their deviation/baseline doc count ratio.
 */
export async function fetchLogRateAnalysisForAlert({
  esClient,
  abortSignal,
  arguments: args,
}: {
  esClient: ElasticsearchClient;
  abortSignal?: AbortSignal;
  arguments: {
    index: string;
    alertStartedAt: string;
    alertRuleParameterTimeSize?: number;
    alertRuleParameterTimeUnit?: string;
    timefield?: string;
    searchQuery?: estypes.QueryDslQueryContainer;
  };
}) {
  const { alertStartedAt, timefield = '@timestamp' } = args;
  const alertStart = moment(alertStartedAt);
  // Scale the default `1m` lookback window ranges by the alert rule's settings.
  const intervalFactor = getIntervalFactor(
    args.alertRuleParameterTimeSize,
    args.alertRuleParameterTimeUnit
  );
  // The deviation time range is 1 lookback duration before the alert start.
  // The baseline time range is 2 lookback durations before the deviation time range.
  const windowParameters = {
    baselineMin: alertStart
      .clone()
      .subtract(3 * intervalFactor, 'minutes')
      .valueOf(),
    baselineMax: alertStart
      .clone()
      .subtract(1 * intervalFactor, 'minutes')
      .valueOf(),
    deviationMin: alertStart
      .clone()
      .subtract(1 * intervalFactor, 'minutes')
      .valueOf(),
    deviationMax: alertStart.valueOf(),
  };
  const { searchQuery = { match_all: {} } } = args;
  // Step 1: Get field candidates and total doc counts.
  const indexInfoParams: AiopsLogRateAnalysisSchema = {
    index: args.index,
    start: windowParameters.baselineMin,
    end: windowParameters.deviationMax,
    searchQuery: JSON.stringify(searchQuery),
    timeFieldName: timefield,
    ...windowParameters,
  };
  const indexInfo = await withSpan(
    { name: 'fetch_index_info', type: 'aiops-log-rate-analysis-for-alert' },
    () =>
      fetchIndexInfo({
        esClient,
        abortSignal,
        arguments: {
          ...indexInfoParams,
          // Always treat these well-known text fields as candidates.
          textFieldCandidatesOverrides: ['message', 'error.message'],
        },
      })
  );
  const { textFieldCandidates, keywordFieldCandidates } = indexInfo;
  const logRateAnalysisType = getLogRateAnalysisTypeForCounts({
    baselineCount: indexInfo.baselineTotalDocCount,
    deviationCount: indexInfo.deviationTotalDocCount,
    windowParameters,
  });
  // Just in case the log rate analysis type is 'dip', we need to swap
  // the window parameters for the analysis.
  const analysisWindowParameters =
    logRateAnalysisType === LOG_RATE_ANALYSIS_TYPE.SPIKE
      ? windowParameters
      : getSwappedWindowParameters(windowParameters);
  // Step 2: Identify significant items.
  // The following code will fetch significant categories and term p-values
  // using an async queue. The field candidates will be passed on as chunks
  // of 50 fields with up to 5 concurrent queries. This is to prevent running
  // into bucket limit issues if we'd throw possibly hundreds of field candidates
  // into a single query.
  const significantItems: SignificantItem[] = [];
  // Set up the queue: A queue item is an object with the function to call and
  // the field names to be passed to the function. This is done so we can push
  // queries for both keyword fields (using significant_terms/p-values) and
  // text fields (using categorize_text + custom code to identify significance)
  // into the same queue.
  const significantItemsQueue = queue(async function ({ fn, fieldNames }: QueueItem) {
    significantItems.push(
      ...(await fn({
        esClient,
        abortSignal,
        arguments: {
          ...indexInfoParams,
          ...analysisWindowParameters,
          fieldNames,
          sampleProbability: getSampleProbability(
            indexInfo.deviationTotalDocCount + indexInfo.baselineTotalDocCount
          ),
        },
      }))
    );
  }, MAX_CONCURRENT_QUERIES);
  // Push the actual items to the queue. We don't need to chunk the text fields
  // since they are just `message` and `error.message`.
  significantItemsQueue.push(
    [
      { fn: fetchSignificantCategories, fieldNames: textFieldCandidates },
      ...chunk(keywordFieldCandidates, CHUNK_SIZE).map((fieldNames) => ({
        fn: fetchSignificantTermPValues,
        fieldNames,
      })),
    ],
    (err) => {
      // Stop processing the remaining queued items on the first error.
      if (err) significantItemsQueue.kill();
    }
  );
  // Wait for the queue to finish.
  await withSpan(
    { name: 'fetch_significant_items', type: 'aiops-log-rate-analysis-for-alert' },
    () => significantItemsQueue.drain()
  );
  // RETURN DATA
  // Adapt the raw significant items data for contextual insights.
  return {
    logRateAnalysisType,
    significantItems: significantItems
      .map(({ fieldName, fieldValue, type, doc_count: docCount, bg_count: bgCount }) => {
        const { baselineBucketRate, deviationBucketRate } = getBaselineAndDeviationRates(
          logRateAnalysisType,
          // Normalize the amount of baseline buckets based on treating the
          // deviation duration as 1 bucket.
          (windowParameters.baselineMax - windowParameters.baselineMin) /
            (windowParameters.deviationMax - windowParameters.deviationMin),
          1,
          docCount,
          bgCount
        );
        const fieldType = type === 'keyword' ? 'metadata' : 'log message pattern';
        const data = {
          fieldType,
          fieldName,
          // Truncate field values to 140 characters.
          fieldValue: String(fieldValue).substring(0, 140),
          logRateChange: getLogRateChange(
            logRateAnalysisType,
            baselineBucketRate,
            deviationBucketRate
          ).message,
        };
        return {
          // Deviation/baseline doc count ratio, used to sort items below.
          logRateChangeSort: bgCount > 0 ? docCount / bgCount : docCount,
          data,
        };
      })
      .sort((a, b) => b.logRateChangeSort - a.logRateChangeSort)
      .map((d) => d.data),
  };
}

View file

@ -31,5 +31,6 @@
"@kbn/ml-string-hash",
"@kbn/ml-response-stream",
"@kbn/i18n",
"@kbn/apm-utils",
]
}

View file

@ -14,6 +14,7 @@ import { APMEventClient } from '../../../lib/helpers/create_es_client/create_apm
import { PROCESSOR_EVENT, TRACE_ID } from '../../../../common/es_fields/apm';
import { getTypedSearch } from '../../../utils/create_typed_es_client';
import { getDownstreamServiceResource } from '../get_observability_alert_details_context/get_downstream_dependency_name';
import { getShouldMatchOrNotExistFilter } from '../utils/get_should_match_or_not_exist_filter';
export interface LogCategory {
errorCategory: string;
@ -101,7 +102,7 @@ export async function getLogCategories({
categories: {
categorize_text: {
field: 'message',
size: 500,
size: 10,
},
aggs: {
sample: {
@ -147,37 +148,3 @@ export async function getLogCategories({
entities: flattenObject(sampleDoc),
};
}
// field/value pairs should match, or the field should not exist
// Each returned clause is an ES `bool` query satisfied when either the field
// matches the given value exactly, or the field is absent from the document.
// Pairs without a value are skipped entirely.
export function getShouldMatchOrNotExistFilter(
  keyValuePairs: Array<{
    field: string;
    value?: string;
  }>
) {
  return keyValuePairs
    // Drop pairs with a missing (or empty) value — no clause is generated.
    .filter(({ value }) => value)
    .map(({ field, value }) => {
      return {
        bool: {
          should: [
            // Option 1: the field exists and matches the value.
            {
              bool: {
                filter: [{ term: { [field]: value } }],
              },
            },
            // Option 2: the field does not exist on the document.
            {
              bool: {
                must_not: {
                  bool: {
                    filter: [{ exists: { field } }],
                  },
                },
              },
            },
          ],
          minimum_should_match: 1,
        },
      };
    });
}

View file

@ -0,0 +1,63 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import type { CoreRequestHandlerContext } from '@kbn/core/server';
import { aiAssistantLogsIndexPattern } from '@kbn/observability-ai-assistant-plugin/server';
import { fetchLogRateAnalysisForAlert } from '@kbn/aiops-log-rate-analysis/queries/fetch_log_rate_analysis_for_alert';
import { PROCESSOR_EVENT } from '../../../../common/es_fields/apm';
import { getShouldMatchOrNotExistFilter } from '../utils/get_should_match_or_not_exist_filter';
/**
 * Runs log rate analysis for an alert against the logs index pattern
 * configured for the AI assistant, scoped to the alert's entities and
 * excluding APM error documents.
 */
export async function getLogRateAnalysisForAlert({
  esClient,
  coreContext,
  arguments: args,
}: {
  esClient: ElasticsearchClient;
  coreContext: Pick<CoreRequestHandlerContext, 'uiSettings'>;
  arguments: {
    alertStartedAt: string;
    alertRuleParameterTimeSize?: number;
    alertRuleParameterTimeUnit?: string;
    entities: {
      'service.name'?: string;
      'host.name'?: string;
      'container.id'?: string;
      'kubernetes.pod.name'?: string;
    };
  };
}): ReturnType<typeof fetchLogRateAnalysisForAlert> {
  // Resolve the logs index pattern from the AI assistant's UI settings.
  const index = await coreContext.uiSettings.client.get<string>(aiAssistantLogsIndexPattern);

  // Build one "match or not exist" clause per entity field that has a value.
  const entityFilters = getShouldMatchOrNotExistFilter(
    Object.entries(args.entities).map(([field, value]) => ({ field, value }))
  );

  const searchQuery = {
    bool: {
      must_not: [
        // exclude APM errors
        { term: { [PROCESSOR_EVENT]: 'error' } },
      ],
      filter: [...entityFilters],
    },
  };

  return fetchLogRateAnalysisForAlert({
    esClient,
    arguments: {
      index,
      alertStartedAt: args.alertStartedAt,
      alertRuleParameterTimeSize: args.alertRuleParameterTimeSize,
      alertRuleParameterTimeUnit: args.alertRuleParameterTimeUnit,
      searchQuery,
    },
  });
}

View file

@ -22,6 +22,7 @@ import {
APMDownstreamDependency,
getAssistantDownstreamDependencies,
} from '../get_apm_downstream_dependencies';
import { getLogRateAnalysisForAlert } from '../get_log_rate_analysis_for_alert';
import { getLogCategories, LogCategory } from '../get_log_categories';
import { getAnomalies } from '../get_apm_service_summary/get_anomalies';
import { getServiceNameFromSignals } from './get_service_name_from_signals';
@ -160,6 +161,42 @@ export const getAlertDetailsContextHandler = (
});
}
// log rate analysis
dataFetchers.push(async () => {
const { logRateAnalysisType, significantItems } = await getLogRateAnalysisForAlert({
esClient,
coreContext,
arguments: {
alertStartedAt: moment(alertStartedAt).toISOString(),
alertRuleParameterTimeSize: query.alert_rule_parameter_time_size
? parseInt(query.alert_rule_parameter_time_size, 10)
: undefined,
alertRuleParameterTimeUnit: query.alert_rule_parameter_time_unit,
entities: {
'service.name': serviceName,
'host.name': hostName,
'container.id': containerId,
'kubernetes.pod.name': kubernetesPodName,
},
},
});
if (logRateAnalysisType !== 'spike' || significantItems.length === 0) {
return {
key: 'logRateAnalysis',
description:
'Log rate analysis did not identify any significant metadata or log patterns.',
data: [],
};
}
return {
key: 'logRateAnalysis',
description: `Statistically significant log metadata and log message patterns occurring in the lookback period before the alert was triggered.`,
data: significantItems,
};
});
// log categories
dataFetchers.push(async () => {
const downstreamDependencies = await downstreamDependenciesPromise;

View file

@ -5,7 +5,7 @@
* 2.0.
*/
import { getShouldMatchOrNotExistFilter } from '.';
import { getShouldMatchOrNotExistFilter } from './get_should_match_or_not_exist_filter';
describe('getShouldMatchOrNotExistFilter', () => {
describe('when all fields are provided', () => {

View file

@ -0,0 +1,40 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
// field/value pairs should match, or the field should not exist
// For every pair that carries a value, emit an ES `bool` clause that accepts a
// document when the field matches the value OR the field is missing entirely.
// Pairs without a value produce no clause.
export function getShouldMatchOrNotExistFilter(
  keyValuePairs: Array<{
    field: string;
    value?: string;
  }>
) {
  return keyValuePairs.flatMap(({ field, value }) => {
    if (!value) {
      return [];
    }

    const matchesValue = {
      bool: {
        filter: [{ term: { [field]: value } }],
      },
    };

    const fieldIsAbsent = {
      bool: {
        must_not: {
          bool: {
            filter: [{ exists: { field } }],
          },
        },
      },
    };

    return [
      {
        bool: {
          should: [matchesValue, fieldIsAbsent],
          minimum_should_match: 1,
        },
      },
    ];
  });
}

View file

@ -127,7 +127,8 @@
"@kbn/server-route-repository-utils",
"@kbn/core-analytics-browser",
"@kbn/apm-types",
"@kbn/entities-schema"
"@kbn/entities-schema",
"@kbn/aiops-log-rate-analysis"
],
"exclude": [
"target/**/*"

View file

@ -9,6 +9,7 @@ import { EuiFlexGroup, EuiFlexItem } from '@elastic/eui';
import React, { useCallback } from 'react';
import { i18n } from '@kbn/i18n';
import { ALERT_RULE_PARAMETERS } from '@kbn/rule-data-utils';
import dedent from 'dedent';
import { type AlertDetailsContextualInsight } from '../../../server/services';
import { useKibana } from '../../utils/kibana_react';
@ -35,6 +36,12 @@ export function AlertDetailContextualInsights({ alert }: { alert: AlertData | nu
query: {
alert_started_at: new Date(alert.formatted.start).toISOString(),
// alert fields used for log rate analysis
alert_rule_parameter_time_size: alert.formatted.fields[ALERT_RULE_PARAMETERS]
?.timeSize as string | undefined,
alert_rule_parameter_time_unit: alert.formatted.fields[ALERT_RULE_PARAMETERS]
?.timeUnit as string | undefined,
// service fields
'service.name': fields['service.name'],
'service.environment': fields['service.environment'],

View file

@ -20,6 +20,10 @@ export const alertDetailsContextRt = t.intersection([
alert_started_at: t.string,
}),
t.partial({
// alert fields used for log rate analysis
alert_rule_parameter_time_size: t.string,
alert_rule_parameter_time_unit: t.string,
// apm fields
'service.name': t.string,
'service.environment': t.string,