[Detection Engine] Add apm context info for security rule executions (#211088)

## Summary

Adds extra labels and context for security rule executions. Follow-up work will add more context specific to the different security rule types. This PR focuses on capturing information about the configuration in logic shared across all rule types. The info collected is generally intended to help narrow down _why_ a rule might be performing poorly. The `rule_id` param is collected so we can aggregate and identify prebuilt rules that perform poorly across many environments and ask the TRaDE team to help tune the rule query/config.

## Testing

![image](https://github.com/user-attachments/assets/f207265a-47ba-4f1f-a19e-3cfbd85461b1)

1. Spin up an Observability cluster on cloud.elastic.co
2. Set up your local cluster to send APM data to the cloud cluster
   ```
   elastic:
     apm:
       active: true
       serverUrl: <apm url from cloud console>
       secretToken: <secret token>
   ```
   To find the secret token, log in to Kibana on your cloud obs cluster and go to `Management -> Fleet -> Elastic cloud agent policy -> Elastic APM -> Agent Authorization -> Secret token`
3. Run rules in your local test environment. Observe APM data in the cloud cluster

## Uses

In addition to debugging, we can use this new APM data to create dashboards like the screenshot below, showing the slowest rules by `rule_id` (only one rule in the test environment, but in production this would show the slowest rules across all APM-enabled clusters).

![image](https://github.com/user-attachments/assets/59e8af39-49e6-40df-9b1d-1b4005e256d8)

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
2025-04-23 09:19:04 -04:00 · 2025-03-11 15:29:06 -04:00 · 2025-03-11 15:29:06 -04:00 · 323cbdb9cd
commit 323cbdb9cd
parent d01b9c6911
6 changed files with 223 additions and 27 deletions
--- a/x-pack/solutions/security/plugins/lists/server/apm_field_names.ts
+++ b/x-pack/solutions/security/plugins/lists/server/apm_field_names.ts
@ -0,0 +1,21 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * Number of exceptions that do not use a value list.
+ */
+export const SECURITY_NUM_REGULAR_EXCEPTIONS = 'security_exceptions_num_regular';
+
+/**
+ * Number of small value list exceptions.
+ */
+export const SECURITY_NUM_SMALL_LIST_EXCEPTIONS = 'security_exceptions_num_small_list';
+
+/**
+ * Number of large value list exceptions.
+ */
+export const SECURITY_NUM_LARGE_LIST_EXCEPTIONS = 'security_exceptions_num_large_list';
--- a/x-pack/solutions/security/plugins/lists/server/services/exception_lists/build_exception_filter.ts
+++ b/x-pack/solutions/security/plugins/lists/server/services/exception_lists/build_exception_filter.ts
@ -5,6 +5,7 @@
 * 2.0.
 */

+import agent from 'elastic-apm-node';
 import { chunk } from 'lodash/fp';
 import {
  CreateExceptionListItemSchema,
@ -34,6 +35,11 @@ import {
  MAXIMUM_SMALL_VALUE_LIST_SIZE,
 } from '@kbn/securitysolution-list-constants';

+import {
+  SECURITY_NUM_LARGE_LIST_EXCEPTIONS,
+  SECURITY_NUM_REGULAR_EXCEPTIONS,
+  SECURITY_NUM_SMALL_LIST_EXCEPTIONS,
+} from '../../apm_field_names';
 import type { ListClient } from '../..';

 type ExceptionEntry = Entry | EntryNested;
@ -320,6 +326,11 @@ export const buildExceptionFilter = async <
  unprocessedExceptions.push(...unprocessableValueListExceptions);

  if (exceptionsWithoutValueLists.length === 0 && exceptionsWithValueLists.length === 0) {
+    agent.setCustomContext({
+      [SECURITY_NUM_LARGE_LIST_EXCEPTIONS]: unprocessedExceptions.length,
+      [SECURITY_NUM_REGULAR_EXCEPTIONS]: 0,
+      [SECURITY_NUM_SMALL_LIST_EXCEPTIONS]: 0,
+    });
    return { filter: undefined, unprocessedExceptions };
  }
  const { orClauses, unprocessableExceptionItems } = await createOrClauses<T>({
@ -329,6 +340,14 @@ export const buildExceptionFilter = async <
    listClient,
  });

+  agent.setCustomContext({
+    [SECURITY_NUM_LARGE_LIST_EXCEPTIONS]:
+      unprocessableValueListExceptions.length + unprocessableExceptionItems.length,
+    [SECURITY_NUM_REGULAR_EXCEPTIONS]: exceptionsWithoutValueLists.length,
+    [SECURITY_NUM_SMALL_LIST_EXCEPTIONS]:
+      exceptionsWithValueLists.length - unprocessableExceptionItems.length,
+  });
+
  const exceptionFilter: Filter = {
    meta: {
      alias,
--- a/x-pack/solutions/security/plugins/security_solution/server/lib/detection_engine/rule_monitoring/logic/rule_execution_log/client_for_executors/client.ts
+++ b/x-pack/solutions/security/plugins/security_solution/server/lib/detection_engine/rule_monitoring/logic/rule_execution_log/client_for_executors/client.ts
@ -5,6 +5,7 @@
 * 2.0.
 */

+import agent from 'elastic-apm-node';
 import type { Logger } from '@kbn/core/server';
 import { sum } from 'lodash';
 import type { Duration } from 'moment';
@ -39,6 +40,7 @@ import type {
 } from './client_interface';
 import type { RuleExecutionMetrics } from '../../../../../../../common/api/detection_engine/rule_monitoring/model';
 import { LogLevelEnum } from '../../../../../../../common/api/detection_engine/rule_monitoring/model';
+import { SECURITY_RULE_STATUS } from '../../../../rule_types/utils/apm_field_names';

 export const createRuleExecutionLogClientForExecutors = (
  settings: RuleExecutionSettings,
@ -84,6 +86,8 @@ export const createRuleExecutionLogClientForExecutors = (
        const correlationIds = baseCorrelationIds.withStatus(args.newStatus);
        const logMeta = correlationIds.getLogMeta();

+        agent.addLabels({ [SECURITY_RULE_STATUS]: args.newStatus });
+
        try {
          const normalizedArgs = normalizeStatusChangeArgs(args);

--- a/x-pack/solutions/security/plugins/security_solution/server/lib/detection_engine/rule_types/create_security_rule_type_wrapper.ts
+++ b/x-pack/solutions/security/plugins/security_solution/server/lib/detection_engine/rule_types/create_security_rule_type_wrapper.ts
@ -48,6 +48,21 @@ import { TIMESTAMP_RUNTIME_FIELD } from './constants';
 import { buildTimestampRuntimeMapping } from './utils/build_timestamp_runtime_mapping';
 import { alertsFieldMap, rulesFieldMap } from '../../../../common/field_maps';
 import { sendAlertSuppressionTelemetryEvent } from './utils/telemetry/send_alert_suppression_telemetry_event';
+import type { RuleParams } from '../rule_schema';
+import {
+  SECURITY_FROM,
+  SECURITY_IMMUTABLE,
+  SECURITY_INPUT_INDEX,
+  SECURITY_MAX_SIGNALS,
+  SECURITY_MERGE_STRATEGY,
+  SECURITY_NUM_ALERTS_CREATED,
+  SECURITY_NUM_IGNORE_FIELDS_REGEX,
+  SECURITY_NUM_IGNORE_FIELDS_STANDARD,
+  SECURITY_NUM_RANGE_TUPLES,
+  SECURITY_PARAMS,
+  SECURITY_RULE_ID,
+  SECURITY_TO,
+} from './utils/apm_field_names';

 const aliasesFieldMap: FieldMap = {};
 Object.entries(aadFieldConversion).forEach(([key, value]) => {
@ -58,6 +73,19 @@ Object.entries(aadFieldConversion).forEach(([key, value]) => {
  };
 });

+const addApmLabelsFromParams = (params: RuleParams) => {
+  agent.addLabels(
+    {
+      [SECURITY_FROM]: params.from,
+      [SECURITY_IMMUTABLE]: params.immutable,
+      [SECURITY_MAX_SIGNALS]: params.maxSignals,
+      [SECURITY_RULE_ID]: params.ruleId,
+      [SECURITY_TO]: params.to,
+    },
+    false
+  );
+};
+
 export const securityRuleTypeFieldMap = {
  ...technicalRuleFieldMap,
  ...alertsFieldMap,
@ -136,6 +164,9 @@ export const createSecurityRuleTypeWrapper: CreateSecurityRuleTypeWrapper =
            state,
            rule,
          } = options;
+          addApmLabelsFromParams(params);
+          agent.setCustomContext({ [SECURITY_MERGE_STRATEGY]: mergeStrategy });
+          agent.setCustomContext({ [SECURITY_PARAMS]: params });
          let runState = state;
          let inputIndex: string[] = [];
          let runtimeMappings: estypes.MappingRuntimeFields | undefined;
@ -256,6 +287,10 @@ export const createSecurityRuleTypeWrapper: CreateSecurityRuleTypeWrapper =
            }
          }

+          // Make a copy of `inputIndex` or else the APM agent reports it as [Circular] for most rule types because it's the same object
+          // as `index`
+          agent.setCustomContext({ [SECURITY_INPUT_INDEX]: [...inputIndex] });
+
          // check if rule has permissions to access given index pattern
          // move this collection of lines into a function in utils
          // so that we can use it in create rules route, bulk, etc.
@ -332,6 +367,8 @@ export const createSecurityRuleTypeWrapper: CreateSecurityRuleTypeWrapper =
            wrapperWarnings.push(rangeTuplesWarningMessage);
          }

+          agent.setCustomContext({ [SECURITY_NUM_RANGE_TUPLES]: tuples.length });
+
          if (remainingGap.asMilliseconds() > 0) {
            const gapDuration = `${remainingGap.humanize()} (${remainingGap.asMilliseconds()}ms)`;
            const gapErrorMessage = `${gapDuration} were not queried between this rule execution and the last execution, so signals may have been missed. Consider increasing your look behind time or adding more Kibana instances`;
@ -377,6 +414,12 @@ export const createSecurityRuleTypeWrapper: CreateSecurityRuleTypeWrapper =
            ignoreFieldsStandard.forEach((field) => {
              ignoreFieldsObject[field] = true;
            });
+
+            agent.setCustomContext({
+              [SECURITY_NUM_IGNORE_FIELDS_STANDARD]: ignoreFieldsStandard.length,
+              [SECURITY_NUM_IGNORE_FIELDS_REGEX]: ignoreFieldsRegexes.length,
+            });
+
            const intendedTimestamp = startedAtOverridden ? startedAt : undefined;
            const wrapHits = wrapHitsFactory({
              ignoreFields: ignoreFieldsObject,
@ -478,6 +521,8 @@ export const createSecurityRuleTypeWrapper: CreateSecurityRuleTypeWrapper =

            const createdSignalsCount = result.createdSignals.length;

+            agent.setCustomContext({ [SECURITY_NUM_ALERTS_CREATED]: createdSignalsCount });
+
            if (disabledActions.length > 0) {
              const disabledActionsWarning = getDisabledActionsWarningText({
                alertsCreated: createdSignalsCount > 0,
--- a/x-pack/solutions/security/plugins/security_solution/server/lib/detection_engine/rule_types/utils/apm_field_names.ts
+++ b/x-pack/solutions/security/plugins/security_solution/server/lib/detection_engine/rule_types/utils/apm_field_names.ts
@ -0,0 +1,92 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+// Schema field names and descriptions for APM labels set by security rule executors
+
+/**
+ * Copy of `from` value from rule params.
+ */
+export const SECURITY_FROM = 'security_from';
+
+/**
+ * Copy of `to` value from rule params.
+ */
+export const SECURITY_TO = 'security_to';
+
+/**
+ * Difference between `from` and `to`, in seconds.
+ */
+export const SECURITY_QUERY_SPAN_S = 'security_query_span_s';
+/**
+ * Copy of `immutable` value from rule params.
+ */
+export const SECURITY_IMMUTABLE = 'security_immutable';
+
+/**
+ * Copy of `max_signals` value from rule params.
+ */
+export const SECURITY_MAX_SIGNALS = 'security_max_signals';
+
+/**
+ * Copy of `rule_id` value from rule params.
+ */
+export const SECURITY_RULE_ID = 'security_rule_id';
+
+/**
+ * Rule status.
+ */
+export const SECURITY_RULE_STATUS = 'security_rule_status';
+
+// Schema field names and descriptions for APM custom context set by security rule executors
+
+/**
+ * Copy of all rule parameters.
+ */
+export const SECURITY_PARAMS = 'security_params';
+
+/**
+ * Index patterns queried by the rule (if applicable to rule type). Either a copy of the `index` rule param or the
+ * index patterns loaded from the rule's data view.
+ */
+export const SECURITY_INPUT_INDEX = 'security_input_index';
+
+/**
+ * Number of separate time intervals the rule will query. Rules query 1 interval if no gaps are detected, or more than
+ * 1 if gaps are detected.
+ */
+export const SECURITY_NUM_RANGE_TUPLES = 'security_num_range_tuples';
+
+/**
+ * Number of "ignore fields" - fields that are stripped from `fields` part of search response before merging `fields` into _source. "Standard"
+ * means field names that are normal strings.
+ */
+export const SECURITY_NUM_IGNORE_FIELDS_STANDARD = 'security_ignore_fields_num_standard';
+
+/**
+ * Number of "ignore fields" regular expressions.
+ */
+export const SECURITY_NUM_IGNORE_FIELDS_REGEX = 'security_ignore_fields_num_regex';
+
+/**
+ * Number of exception items.
+ */
+export const SECURITY_NUM_EXCEPTION_ITEMS = 'security_exceptions_num_total';
+
+/**
+ * Merge strategy used by the rule to combine `_source` and `fields` when building alerts.
+ */
+export const SECURITY_MERGE_STRATEGY = 'security_merge_strategy';
+
+/**
+ * Number of alerts generated by the rule execution.
+ */
+export const SECURITY_NUM_ALERTS_CREATED = 'security_num_alerts_created';
+
+/**
+ * Number of concrete indices matching index pattern.
+ */
+export const SECURITY_NUM_INDICES_MATCHING_PATTERN = 'security_num_indices_matching_pattern';
--- a/x-pack/solutions/security/plugins/security_solution/server/lib/detection_engine/rule_types/utils/utils.ts
+++ b/x-pack/solutions/security/plugins/security_solution/server/lib/detection_engine/rule_types/utils/utils.ts
@ -4,6 +4,8 @@
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */
+
+import agent from 'elastic-apm-node';
 import { createHash } from 'crypto';
 import { chunk, get, invert, isEmpty, merge, partition } from 'lodash';
 import moment from 'moment';
@ -90,6 +92,11 @@ import type {
 import type { BuildReasonMessage } from './reason_formatters';
 import { getSuppressionTerms } from './suppression_utils';
 import { robustGet } from './source_fields_merging/utils/robust_field_access';
+import {
+  SECURITY_NUM_EXCEPTION_ITEMS,
+  SECURITY_NUM_INDICES_MATCHING_PATTERN,
+  SECURITY_QUERY_SPAN_S,
+} from './apm_field_names';

 export const MAX_RULE_GAP_RATIO = 4;

@ -144,6 +151,10 @@ export const hasTimestampFields = async (args: {
  const { timestampField, timestampFieldCapsResponse, inputIndices, ruleExecutionLogger } = args;
  const { ruleName } = ruleExecutionLogger.context;

+  agent.setCustomContext({
+    [SECURITY_NUM_INDICES_MATCHING_PATTERN]: timestampFieldCapsResponse.body.indices?.length,
+  });
+
  if (isEmpty(timestampFieldCapsResponse.body.indices)) {
    const errorString = `This rule is attempting to query data from Elasticsearch indices listed in the "Index patterns" section of the rule definition, however no index matching: ${JSON.stringify(
      inputIndices
@ -276,36 +287,39 @@ export const getExceptions = async ({
  client: ExceptionListClient;
  lists: ListArray;
 }): Promise<ExceptionListItemSchema[]> => {
-  if (lists.length > 0) {
-    try {
-      const listIds = lists.map(({ list_id: listId }) => listId);
-      const namespaceTypes = lists.map(({ namespace_type: namespaceType }) => namespaceType);
+  return withSecuritySpan('getExceptions', async () => {
+    if (lists.length > 0) {
+      try {
+        const listIds = lists.map(({ list_id: listId }) => listId);
+        const namespaceTypes = lists.map(({ namespace_type: namespaceType }) => namespaceType);

-      // Stream the results from the Point In Time (PIT) finder into this array
-      let items: ExceptionListItemSchema[] = [];
-      const executeFunctionOnStream = (response: FoundExceptionListItemSchema): void => {
-        items = [...items, ...response.data];
-      };
+        // Stream the results from the Point In Time (PIT) finder into this array
+        let items: ExceptionListItemSchema[] = [];
+        const executeFunctionOnStream = (response: FoundExceptionListItemSchema): void => {
+          items = [...items, ...response.data];
+        };

-      await client.findExceptionListsItemPointInTimeFinder({
-        executeFunctionOnStream,
-        listId: listIds,
-        namespaceType: namespaceTypes,
-        perPage: 1_000, // See https://github.com/elastic/kibana/issues/93770 for choice of 1k
-        filter: [],
-        maxSize: undefined, // NOTE: This is unbounded when it is "undefined"
-        sortOrder: undefined,
-        sortField: undefined,
-      });
-      return items;
-    } catch (e) {
-      throw new Error(
-        `unable to fetch exception list items, message: "${e.message}" full error: "${e}"`
-      );
+        await client.findExceptionListsItemPointInTimeFinder({
+          executeFunctionOnStream,
+          listId: listIds,
+          namespaceType: namespaceTypes,
+          perPage: 1_000, // See https://github.com/elastic/kibana/issues/93770 for choice of 1k
+          filter: [],
+          maxSize: undefined, // NOTE: This is unbounded when it is "undefined"
+          sortOrder: undefined,
+          sortField: undefined,
+        });
+        agent.setCustomContext({ [SECURITY_NUM_EXCEPTION_ITEMS]: items.length });
+        return items;
+      } catch (e) {
+        throw new Error(
+          `unable to fetch exception list items, message: "${e.message}" full error: "${e}"`
+        );
+      }
+    } else {
+      return [];
    }
-  } else {
-    return [];
-  }
+  });
 };

 export const generateId = (
@ -385,6 +399,7 @@ export const getGapBetweenRuns = ({
    return moment.duration(0);
  }
  const driftTolerance = moment.duration(originalTo.diff(originalFrom));
+  agent.addLabels({ [SECURITY_QUERY_SPAN_S]: driftTolerance.asSeconds() }, false);
  const currentDuration = moment.duration(moment(startedAt).diff(previousStartedAt));
  return currentDuration.subtract(driftTolerance);
 };