[Profiling] Reduce CPU usage for flamegraph and TopN function (#141017)

* Add instrumentation for flamegraph

I inlined the construction of the flamegraph into the respective route
so that we could add fine-grained instrumentation. We now use APM and
console logging to understand how long flamegraph construction takes.
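
The route diff below contains the full change; condensed, the instrumentation pattern looks like this (names taken from that diff):

```ts
// Condensed sketch of the pattern used in the route below: wrap the whole
// phase in an APM span and log the duration of each step.
const flamegraph = await withProfilingSpan('create_flamegraph', async () => {
  const t0 = Date.now();
  const graph = createCallerCalleeGraph(stackTraceEvents, stackTraces, stackFrames, executables);
  logger.info(`creating caller-callee graph took ${Date.now() - t0} ms`);

  const t1 = Date.now();
  const fg = createFlameGraph(createColumnarCallerCallee(graph));
  logger.info(`creating flamegraph took ${Date.now() - t1} ms`);
  return fg;
});
```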

* Remove unnecessary Set usage

* Remove superfluous clone

This was likely added to avoid infinite recursion when serializing to
JSON. It no longer serves a useful purpose.
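
For context, a clone like this is the usual defence against cycles before JSON serialization; a purely hypothetical illustration of the problem it once solved:

```ts
// Hypothetical illustration only: a node with a back-reference to its parent
// forms a cycle, and JSON.stringify throws on circular structures.
interface TreeNode {
  name: string;
  children: TreeNode[];
  parent?: TreeNode; // the back-reference that creates the cycle
}

const root: TreeNode = { name: 'root', children: [] };
const child: TreeNode = { name: 'child', children: [], parent: root };
root.children.push(child);

// JSON.stringify(root) -> TypeError: Converting circular structure to JSON.
// Cloning into a plain, parent-free shape was one way to avoid that.
```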

* Pass in pre-calculated frame group info

I noticed that we were creating frame group info multiple times, so I
added it as a parameter to the intermediate node.
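
A rough sketch of the shape of this change; the type and helper names here are simplified stand-ins, not the plugin's real ones:

```ts
// Illustrative only. Before, each construction re-derived the frame group
// from the frame metadata; now the caller computes it once and passes it in.
interface FrameMetadata {
  fileID: string;
  addressOrLine: number;
  functionName: string;
}

function createIntermediateNode(frame: FrameMetadata, frameGroupID: string) {
  return {
    frame,
    frameGroupID, // supplied by the caller instead of being recomputed here
    samples: 0,
    callees: new Map<string, unknown>(),
  };
}
```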

* Sort callees in one place

Callees should be sorted first by samples in decreasing order and then
by frame group. Combining the two sorts makes the post-processing
clearer for future readers and maintainers.
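
A minimal sketch of such a combined comparator (the field names are assumptions for illustration):

```ts
// Sketch: a single comparator replaces two separate sort passes.
interface CalleeNode {
  samples: number;
  frameGroupID: string;
}

function compareCallees(a: CalleeNode, b: CalleeNode): number {
  // Primary key: samples, descending.
  if (a.samples !== b.samples) {
    return b.samples - a.samples;
  }
  // Tie-breaker: frame group ID, for a deterministic order between siblings.
  return a.frameGroupID < b.frameGroupID ? -1 : a.frameGroupID > b.frameGroupID ? 1 : 0;
}
```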

* Capitalize fields in preparation for merging

* Align both node data structures

* Pass metadata instead of copying fields

* Refactor frame label method

* Use pre-calculated array length

* Use pre-allocated array
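
Both of the last two items follow the same idea: when the output length is known in advance, allocate once and assign by index instead of growing with `push`. A generic sketch:

```ts
// Sketch: pre-allocate when the final length is known up front.
function sampleCounts(nodes: Array<{ samples: number }>): number[] {
  const counts = new Array<number>(nodes.length); // sized once
  for (let i = 0; i < nodes.length; i++) {
    counts[i] = nodes[i].samples; // assign by index, no re-growth
  }
  return counts;
}
```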

* Refactor intermediate node

* Remove intermediate node structure

* Move if statement out of for loop
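
The usual shape of this change is hoisting a loop-invariant condition so it is evaluated once; an illustrative sketch (the callbacks are placeholders):

```ts
// Illustrative only: branch once outside the loop instead of on every iteration.
function processNodes(
  nodes: string[],
  includeMetadata: boolean,
  attachMetadata: (node: string) => void,
  accumulate: (node: string) => void
) {
  if (includeMetadata) {
    for (const node of nodes) {
      attachMetadata(node);
      accumulate(node);
    }
  } else {
    for (const node of nodes) {
      accumulate(node);
    }
  }
}
```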

* Fix comments

* Sort sibling nodes by frame group ID

* Calculate graph size during creation
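
One way to do this is to keep a counter at the single place where nodes are inserted, so no second traversal is needed; a hedged sketch with assumed names:

```ts
// Sketch only: count nodes as they are created rather than walking the
// finished graph afterwards.
class GraphBuilder {
  public size = 0;
  private nodes = new Map<string, { samples: number }>();

  getOrCreateNode(frameGroupID: string): { samples: number } {
    let node = this.nodes.get(frameGroupID);
    if (node === undefined) {
      node = { samples: 0 };
      this.nodes.set(frameGroupID, node);
      this.size++; // graph size is maintained during creation
    }
    return node;
  }
}
```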

* Add missing groupStackFrameMetadataByStackTrace

* Fix formatting

* Fix generated callee source

* Fix creation of frame group

* Fix test

* Remove filter for relevant traces

* Stop passing frame group

* Create root node inside createCallerCalleeGraph

* Fix timestamps

* Remove frame group comparator

* Add instrumentation for topN functions

* Allow for missing stacktraces

* Use Date.now instead
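
`Date.now()` yields the same epoch-millisecond value as `new Date().getTime()` without allocating a throwaway `Date` object on each measurement:

```ts
// Equivalent timestamps; Date.now() skips the intermediate Date allocation.
const legacy = new Date().getTime();
const preferred = Date.now();
```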
Author: Joseph Crail, 2022-09-22 09:37:45 -07:00 (committed by GitHub)
Commit: d8901857aa (parent: 0fcfaec2dd)
14 changed files with 413 additions and 809 deletions


@@ -8,7 +8,12 @@
import { schema } from '@kbn/config-schema';
import { RouteRegisterParameters } from '.';
import { getRoutePaths } from '../../common';
import { FlameGraph } from '../../common/flamegraph';
import { createCallerCalleeGraph } from '../../common/callercallee';
import {
createColumnarCallerCallee,
createFlameGraph,
ElasticFlameGraph,
} from '../../common/flamegraph';
import { createProfilingEsClient } from '../utils/create_profiling_es_client';
import { withProfilingSpan } from '../utils/with_profiling_span';
import { getClient } from './compat';
@@ -39,6 +44,7 @@ export function registerFlameChartSearchRoute({ router, logger }: RouteRegisterP
timeTo,
kuery,
});
const totalSeconds = timeTo - timeFrom;
const { stackTraces, executables, stackFrames, eventsIndex, totalCount, stackTraceEvents } =
await getExecutablesAndStackTraces({
@@ -48,23 +54,47 @@
sampleSize: targetSampleSize,
});
const flamegraph = await withProfilingSpan('collect_flamegraph', async () => {
return new FlameGraph({
sampleRate: eventsIndex.sampleRate,
totalCount,
events: stackTraceEvents,
const flamegraph = await withProfilingSpan('create_flamegraph', async () => {
const t0 = Date.now();
const graph = createCallerCalleeGraph(
stackTraceEvents,
stackTraces,
stackFrames,
executables,
totalSeconds: timeTo - timeFrom,
}).toElastic();
executables
);
logger.info(`creating caller-callee graph took ${Date.now() - t0} ms`);
const t1 = Date.now();
const columnar = createColumnarCallerCallee(graph);
logger.info(`creating columnar caller-callee graph took ${Date.now() - t1} ms`);
const t2 = Date.now();
const fg = createFlameGraph(columnar);
logger.info(`creating flamegraph took ${Date.now() - t2} ms`);
return fg;
});
// sampleRate is 1/5^N, with N being the downsampled index the events were fetched from.
// N=0: full events table (sampleRate is 1)
// N=1: downsampled by 5 (sampleRate is 0.2)
// ...
// totalCount is the sum(Count) of all events in the filter range in the
// downsampled index we were looking at.
// To estimate how many events we have in the full events index: totalCount / sampleRate.
// Do the same for single entries in the events array.
const body: ElasticFlameGraph = {
...flamegraph,
TotalSeconds: totalSeconds,
TotalTraces: Math.floor(totalCount / eventsIndex.sampleRate),
SampledTraces: totalCount,
};
logger.info('returning payload response to client');
return response.ok({
body: flamegraph,
});
return response.ok({ body });
} catch (e) {
logger.error(e);
return response.customError({
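
As a worked example of the estimation described in the comment above (the numbers are assumptions): at downsampling level N = 1 the sample rate is 1 / 5 = 0.2, so 10,000 sampled traces extrapolate to roughly 50,000 total traces.

```ts
// Worked example with assumed numbers (N = 1, so sampleRate = 1 / 5 ** 1).
const sampleRate = 0.2;
const sampledTraces = 10_000; // totalCount from the downsampled index
const estimatedTotalTraces = Math.floor(sampledTraces / sampleRate); // 50000
```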


@@ -54,7 +54,8 @@ export function registerTopNFunctionsSearchRoute({ router, logger }: RouteRegist
sampleSize: targetSampleSize,
});
const topNFunctions = await withProfilingSpan('collect_topn_functions', async () => {
const t0 = Date.now();
const topNFunctions = await withProfilingSpan('create_topn_functions', async () => {
return createTopNFunctions(
stackTraceEvents,
stackTraces,
@@ -64,6 +65,7 @@ export function registerTopNFunctionsSearchRoute({ router, logger }: RouteRegist
endIndex
);
});
logger.info(`creating topN functions took ${Date.now() - t0} ms`);
logger.info('returning payload response to client');


@@ -50,13 +50,13 @@ export async function getExecutablesAndStackTraces({
if (totalCount > sampleSize * 1.1) {
p = sampleSize / totalCount;
logger.info('downsampling events with p=' + p);
const t0 = new Date().getTime();
const t0 = Date.now();
const downsampledTotalCount = downsampleEventsRandomly(
stackTraceEvents,
p,
filter.toString()
);
logger.info(`downsampling events took ${new Date().getTime() - t0} ms`);
logger.info(`downsampling events took ${Date.now() - t0} ms`);
logger.info('downsampled total count: ' + downsampledTotalCount);
logger.info('unique downsampled stacktraces: ' + stackTraceEvents.size);
}


@@ -12,9 +12,9 @@ export async function logExecutionLatency<T>(
activity: string,
func: () => Promise<T>
): Promise<T> {
const start = new Date().getTime();
const start = Date.now();
return await func().then((res) => {
logger.info(activity + ' took ' + (new Date().getTime() - start) + 'ms');
logger.info(activity + ' took ' + (Date.now() - start) + 'ms');
return res;
});
}
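
For reference, a typical call site for this helper looks like the following; the leading `logger` argument is assumed (the hunk starts after the first parameter), and the wrapped search call is a placeholder:

```ts
// Illustrative usage of the helper above; the wrapped function is whatever
// async work should be timed.
const stackTraceEvents = await logExecutionLatency(logger, 'query for events', async () => {
  return await fetchStackTraceEvents(); // placeholder for the real query
});
```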


@@ -269,7 +269,7 @@ export async function mgetStackTraces({
const stackFrameDocIDs = new Set<string>();
const executableDocIDs = new Set<string>();
const t0 = new Date().getTime();
const t0 = Date.now();
// flatMap() is significantly slower than an explicit for loop
for (const res of stackResponses) {
for (const trace of res.docs) {
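
A sketch of the trade-off that comment describes; the document shape here is illustrative:

```ts
// Illustrative only. flatMap allocates an intermediate array per document
// before flattening, while an explicit loop adds straight into one collection.
interface TraceDoc {
  frameIDs: string[];
}

function collectWithFlatMap(responses: TraceDoc[][]): string[] {
  return responses.flatMap((docs) => docs.flatMap((doc) => doc.frameIDs));
}

function collectWithLoop(responses: TraceDoc[][]): Set<string> {
  const ids = new Set<string>();
  for (const docs of responses) {
    for (const doc of docs) {
      for (const id of doc.frameIDs) {
        ids.add(id);
      }
    }
  }
  return ids;
}
```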
@@ -298,7 +298,7 @@
}
}
}
logger.info(`processing data took ${new Date().getTime() - t0} ms`);
logger.info(`processing data took ${Date.now() - t0} ms`);
if (stackTraces.size !== 0) {
logger.info('Average size of stacktrace: ' + totalFrames / stackTraces.size);
@@ -336,7 +336,7 @@ export async function mgetStackFrames({
// Create a lookup map StackFrameID -> StackFrame.
let framesFound = 0;
const t0 = new Date().getTime();
const t0 = Date.now();
const docs = resStackFrames.docs;
for (const frame of docs) {
if ('error' in frame) {
@@ -361,7 +361,7 @@
});
}
}
logger.info(`processing data took ${new Date().getTime() - t0} ms`);
logger.info(`processing data took ${Date.now() - t0} ms`);
logger.info('found ' + framesFound + ' / ' + stackFrameIDs.size + ' frames');
@@ -391,7 +391,7 @@ export async function mgetExecutables({
// Create a lookup map StackFrameID -> StackFrame.
let exeFound = 0;
const t0 = new Date().getTime();
const t0 = Date.now();
const docs = resExecutables.docs;
for (const exe of docs) {
if ('error' in exe) {
@@ -408,7 +408,7 @@
});
}
}
logger.info(`processing data took ${new Date().getTime() - t0} ms`);
logger.info(`processing data took ${Date.now() - t0} ms`);
logger.info('found ' + exeFound + ' / ' + executableIDs.size + ' executables');


@@ -8,7 +8,7 @@
import { schema } from '@kbn/config-schema';
import type { IRouter, Logger } from '@kbn/core/server';
import { RouteRegisterParameters } from '.';
import { fromMapToRecord, getRoutePaths, INDEX_EVENTS } from '../../common';
import { getRoutePaths, INDEX_EVENTS } from '../../common';
import { ProfilingESField } from '../../common/elasticsearch';
import { computeBucketWidthFromTimeRangeAndBucketCount } from '../../common/histogram';
import { groupStackFrameMetadataByStackTrace, StackTraceID } from '../../common/profiling';
@@ -142,9 +142,7 @@ export async function topNElasticSearchQuery({
);
const metadata = await withProfilingSpan('collect_stackframe_metadata', async () => {
return fromMapToRecord(
groupStackFrameMetadataByStackTrace(stackTraces, stackFrames, executables)
);
return groupStackFrameMetadataByStackTrace(stackTraces, stackFrames, executables);
});
logger.info('returning payload response to client');