[Profiling] Reduce CPU usage for flamegraph and TopN function (#141017)

* Add instrumentation for flamegraph

I inlined the construction of the flamegraph into the respective route
so that we could add fine-grained instrumentation. We now use APM and
console logging to understand how long flamegraph construction takes.
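
The route diff below contains the full change; condensed, the instrumentation pattern looks like this (names taken from that diff):

```ts
// Condensed sketch of the pattern used in the route below: wrap the whole
// phase in an APM span and log the duration of each step.
const flamegraph = await withProfilingSpan('create_flamegraph', async () => {
  const t0 = Date.now();
  const graph = createCallerCalleeGraph(stackTraceEvents, stackTraces, stackFrames, executables);
  logger.info(`creating caller-callee graph took ${Date.now() - t0} ms`);

  const t1 = Date.now();
  const fg = createFlameGraph(createColumnarCallerCallee(graph));
  logger.info(`creating flamegraph took ${Date.now() - t1} ms`);
  return fg;
});
```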

* Remove unnecessary Set usage

* Remove superfluous clone

This was likely added to avoid infinite recursion when serializing to
JSON. It no longer serves a useful purpose.
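
For context, a clone like this is the usual defence against cycles before JSON serialization; a purely hypothetical illustration of the problem it once solved:

```ts
// Hypothetical illustration only: a node with a back-reference to its parent
// forms a cycle, and JSON.stringify throws on circular structures.
interface TreeNode {
  name: string;
  children: TreeNode[];
  parent?: TreeNode; // the back-reference that creates the cycle
}

const root: TreeNode = { name: 'root', children: [] };
const child: TreeNode = { name: 'child', children: [], parent: root };
root.children.push(child);

// JSON.stringify(root) -> TypeError: Converting circular structure to JSON.
// Cloning into a plain, parent-free shape was one way to avoid that.
```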

* Pass in pre-calculated frame group info

I noticed that we were creating frame group info multiple times, so I
added it as a parameter to the intermediate node.
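
A rough sketch of the shape of this change; the type and helper names here are simplified stand-ins, not the plugin's real ones:

```ts
// Illustrative only. Before, each construction re-derived the frame group
// from the frame metadata; now the caller computes it once and passes it in.
interface FrameMetadata {
  fileID: string;
  addressOrLine: number;
  functionName: string;
}

function createIntermediateNode(frame: FrameMetadata, frameGroupID: string) {
  return {
    frame,
    frameGroupID, // supplied by the caller instead of being recomputed here
    samples: 0,
    callees: new Map<string, unknown>(),
  };
}
```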

* Sort callees in one place

Callees should be sorted first by samples in decreasing order and then
by frame group. Combining the two sorts makes the post-processing
clearer for future readers and maintainers.
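
A minimal sketch of such a combined comparator (the field names are assumptions for illustration):

```ts
// Sketch: a single comparator replaces two separate sort passes.
interface CalleeNode {
  samples: number;
  frameGroupID: string;
}

function compareCallees(a: CalleeNode, b: CalleeNode): number {
  // Primary key: samples, descending.
  if (a.samples !== b.samples) {
    return b.samples - a.samples;
  }
  // Tie-breaker: frame group ID, for a deterministic order between siblings.
  return a.frameGroupID < b.frameGroupID ? -1 : a.frameGroupID > b.frameGroupID ? 1 : 0;
}
```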

* Capitalize fields in preparation for merging

* Align both node data structures

* Pass metadata instead of copying fields

* Refactor frame label method

* Use pre-calculated array length

* Use pre-allocated array
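
Both of the last two items follow the same idea: when the output length is known in advance, allocate once and assign by index instead of growing with `push`. A generic sketch:

```ts
// Sketch: pre-allocate when the final length is known up front.
function sampleCounts(nodes: Array<{ samples: number }>): number[] {
  const counts = new Array<number>(nodes.length); // sized once
  for (let i = 0; i < nodes.length; i++) {
    counts[i] = nodes[i].samples; // assign by index, no re-growth
  }
  return counts;
}
```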

* Refactor intermediate node

* Remove intermediate node structure

* Move if statement out of for loop
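
The usual shape of this change is hoisting a loop-invariant condition so it is evaluated once; an illustrative sketch (the callbacks are placeholders):

```ts
// Illustrative only: branch once outside the loop instead of on every iteration.
function processNodes(
  nodes: string[],
  includeMetadata: boolean,
  attachMetadata: (node: string) => void,
  accumulate: (node: string) => void
) {
  if (includeMetadata) {
    for (const node of nodes) {
      attachMetadata(node);
      accumulate(node);
    }
  } else {
    for (const node of nodes) {
      accumulate(node);
    }
  }
}
```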

* Fix comments

* Sort sibling nodes by frame group ID

* Calculate graph size during creation
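
One way to do this is to keep a counter at the single place where nodes are inserted, so no second traversal is needed; a hedged sketch with assumed names:

```ts
// Sketch only: count nodes as they are created rather than walking the
// finished graph afterwards.
class GraphBuilder {
  public size = 0;
  private nodes = new Map<string, { samples: number }>();

  getOrCreateNode(frameGroupID: string): { samples: number } {
    let node = this.nodes.get(frameGroupID);
    if (node === undefined) {
      node = { samples: 0 };
      this.nodes.set(frameGroupID, node);
      this.size++; // graph size is maintained during creation
    }
    return node;
  }
}
```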

* Add missing groupStackFrameMetadataByStackTrace

* Fix formatting

* Fix generated callee source

* Fix creation of frame group

* Fix test

* Remove filter for relevant traces

* Stop passing frame group

* Create root node inside createCallerCalleeGraph

* Fix timestamps

* Remove frame group comparator

* Add instrumentation for topN functions

* Allow for missing stacktraces

* Use Date.now instead
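
`Date.now()` yields the same epoch-millisecond value as `new Date().getTime()` without allocating a throwaway `Date` object on each measurement:

```ts
// Equivalent timestamps; Date.now() skips the intermediate Date allocation.
const legacy = new Date().getTime();
const preferred = Date.now();
```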
Author: Joseph Crail, 2022-09-22 09:37:45 -07:00 (committed by GitHub)
Commit: d8901857aa (parent: 0fcfaec2dd)
14 changed files with 413 additions and 809 deletions


@@ -8,7 +8,12 @@
import { schema } from '@kbn/config-schema';
import { RouteRegisterParameters } from '.';
import { getRoutePaths } from '../../common';
import { FlameGraph } from '../../common/flamegraph';
import { createCallerCalleeGraph } from '../../common/callercallee';
import {
createColumnarCallerCallee,
createFlameGraph,
ElasticFlameGraph,
} from '../../common/flamegraph';
import { createProfilingEsClient } from '../utils/create_profiling_es_client';
import { withProfilingSpan } from '../utils/with_profiling_span';
import { getClient } from './compat';
@@ -39,6 +44,7 @@ export function registerFlameChartSearchRoute({ router, logger }: RouteRegisterP
timeTo,
kuery,
});
const totalSeconds = timeTo - timeFrom;
const { stackTraces, executables, stackFrames, eventsIndex, totalCount, stackTraceEvents } =
await getExecutablesAndStackTraces({
@@ -48,23 +54,47 @@
sampleSize: targetSampleSize,
});
const flamegraph = await withProfilingSpan('collect_flamegraph', async () => {
return new FlameGraph({
sampleRate: eventsIndex.sampleRate,
totalCount,
events: stackTraceEvents,
const flamegraph = await withProfilingSpan('create_flamegraph', async () => {
const t0 = Date.now();
const graph = createCallerCalleeGraph(
stackTraceEvents,
stackTraces,
stackFrames,
executables,
totalSeconds: timeTo - timeFrom,
}).toElastic();
executables
);
logger.info(`creating caller-callee graph took ${Date.now() - t0} ms`);
const t1 = Date.now();
const columnar = createColumnarCallerCallee(graph);
logger.info(`creating columnar caller-callee graph took ${Date.now() - t1} ms`);
const t2 = Date.now();
const fg = createFlameGraph(columnar);
logger.info(`creating flamegraph took ${Date.now() - t2} ms`);
return fg;
});
// sampleRate is 1/5^N, with N being the downsampled index the events were fetched from.
// N=0: full events table (sampleRate is 1)
// N=1: downsampled by 5 (sampleRate is 0.2)
// ...
// totalCount is the sum(Count) of all events in the filter range in the
// downsampled index we were looking at.
// To estimate how many events we have in the full events index: totalCount / sampleRate.
// Do the same for single entries in the events array.
const body: ElasticFlameGraph = {
...flamegraph,
TotalSeconds: totalSeconds,
TotalTraces: Math.floor(totalCount / eventsIndex.sampleRate),
SampledTraces: totalCount,
};
logger.info('returning payload response to client');
return response.ok({
body: flamegraph,
});
return response.ok({ body });
} catch (e) {
logger.error(e);
return response.customError({
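
As a worked example of the estimation described in the comment above (the numbers are assumptions): at downsampling level N = 1 the sample rate is 1 / 5 = 0.2, so 10,000 sampled traces extrapolate to roughly 50,000 total traces.

```ts
// Worked example with assumed numbers (N = 1, so sampleRate = 1 / 5 ** 1).
const sampleRate = 0.2;
const sampledTraces = 10_000; // totalCount from the downsampled index
const estimatedTotalTraces = Math.floor(sampledTraces / sampleRate); // 50000
```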


@@ -54,7 +54,8 @@ export function registerTopNFunctionsSearchRoute({ router, logger }: RouteRegist
sampleSize: targetSampleSize,
});
const topNFunctions = await withProfilingSpan('collect_topn_functions', async () => {
const t0 = Date.now();
const topNFunctions = await withProfilingSpan('create_topn_functions', async () => {
return createTopNFunctions(
stackTraceEvents,
stackTraces,
@@ -64,6 +65,7 @@ export function registerTopNFunctionsSearchRoute({ router, logger }: RouteRegist
endIndex
);
});
logger.info(`creating topN functions took ${Date.now() - t0} ms`);
logger.info('returning payload response to client');


@@ -50,13 +50,13 @@ export async function getExecutablesAndStackTraces({
if (totalCount > sampleSize * 1.1) {
p = sampleSize / totalCount;
logger.info('downsampling events with p=' + p);
const t0 = new Date().getTime();
const t0 = Date.now();
const downsampledTotalCount = downsampleEventsRandomly(
stackTraceEvents,
p,
filter.toString()
);
logger.info(`downsampling events took ${new Date().getTime() - t0} ms`);
logger.info(`downsampling events took ${Date.now() - t0} ms`);
logger.info('downsampled total count: ' + downsampledTotalCount);
logger.info('unique downsampled stacktraces: ' + stackTraceEvents.size);
}


@@ -12,9 +12,9 @@ export async function logExecutionLatency<T>(
activity: string,
func: () => Promise<T>
): Promise<T> {
const start = new Date().getTime();
const start = Date.now();
return await func().then((res) => {
logger.info(activity + ' took ' + (new Date().getTime() - start) + 'ms');
logger.info(activity + ' took ' + (Date.now() - start) + 'ms');
return res;
});
}
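
For reference, a typical call site for this helper looks like the following; the leading `logger` argument is assumed (the hunk starts after the first parameter), and the wrapped search call is a placeholder:

```ts
// Illustrative usage of the helper above; the wrapped function is whatever
// async work should be timed.
const stackTraceEvents = await logExecutionLatency(logger, 'query for events', async () => {
  return await fetchStackTraceEvents(); // placeholder for the real query
});
```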


@@ -269,7 +269,7 @@ export async function mgetStackTraces({
const stackFrameDocIDs = new Set<string>();
const executableDocIDs = new Set<string>();
const t0 = new Date().getTime();
const t0 = Date.now();
// flatMap() is significantly slower than an explicit for loop
for (const res of stackResponses) {
for (const trace of res.docs) {
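
A sketch of the trade-off that comment describes; the document shape here is illustrative:

```ts
// Illustrative only. flatMap allocates an intermediate array per document
// before flattening, while an explicit loop adds straight into one collection.
interface TraceDoc {
  frameIDs: string[];
}

function collectWithFlatMap(responses: TraceDoc[][]): string[] {
  return responses.flatMap((docs) => docs.flatMap((doc) => doc.frameIDs));
}

function collectWithLoop(responses: TraceDoc[][]): Set<string> {
  const ids = new Set<string>();
  for (const docs of responses) {
    for (const doc of docs) {
      for (const id of doc.frameIDs) {
        ids.add(id);
      }
    }
  }
  return ids;
}
```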
@@ -298,7 +298,7 @@
}
}
}
logger.info(`processing data took ${new Date().getTime() - t0} ms`);
logger.info(`processing data took ${Date.now() - t0} ms`);
if (stackTraces.size !== 0) {
logger.info('Average size of stacktrace: ' + totalFrames / stackTraces.size);
@@ -336,7 +336,7 @@ export async function mgetStackFrames({
// Create a lookup map StackFrameID -> StackFrame.
let framesFound = 0;
const t0 = new Date().getTime();
const t0 = Date.now();
const docs = resStackFrames.docs;
for (const frame of docs) {
if ('error' in frame) {
@@ -361,7 +361,7 @@
});
}
}
logger.info(`processing data took ${new Date().getTime() - t0} ms`);
logger.info(`processing data took ${Date.now() - t0} ms`);
logger.info('found ' + framesFound + ' / ' + stackFrameIDs.size + ' frames');
@@ -391,7 +391,7 @@ export async function mgetExecutables({
// Create a lookup map StackFrameID -> StackFrame.
let exeFound = 0;
const t0 = new Date().getTime();
const t0 = Date.now();
const docs = resExecutables.docs;
for (const exe of docs) {
if ('error' in exe) {
@@ -408,7 +408,7 @@
});
}
}
logger.info(`processing data took ${new Date().getTime() - t0} ms`);
logger.info(`processing data took ${Date.now() - t0} ms`);
logger.info('found ' + exeFound + ' / ' + executableIDs.size + ' executables');


@@ -8,7 +8,7 @@
import { schema } from '@kbn/config-schema';
import type { IRouter, Logger } from '@kbn/core/server';
import { RouteRegisterParameters } from '.';
import { fromMapToRecord, getRoutePaths, INDEX_EVENTS } from '../../common';
import { getRoutePaths, INDEX_EVENTS } from '../../common';
import { ProfilingESField } from '../../common/elasticsearch';
import { computeBucketWidthFromTimeRangeAndBucketCount } from '../../common/histogram';
import { groupStackFrameMetadataByStackTrace, StackTraceID } from '../../common/profiling';
@@ -142,9 +142,7 @@ export async function topNElasticSearchQuery({
);
const metadata = await withProfilingSpan('collect_stackframe_metadata', async () => {
return fromMapToRecord(
groupStackFrameMetadataByStackTrace(stackTraces, stackFrames, executables)
);
return groupStackFrameMetadataByStackTrace(stackTraces, stackFrames, executables);
});
logger.info('returning payload response to client');