Mirror of https://github.com/elastic/kibana.git
[Profiling] Reduce CPU usage for flamegraph and TopN function (#141017)
* Add instrumentation for flamegraph. I inlined the construction of the flamegraph into the respective route so that we could add fine-grained instrumentation. We now use APM and console logging to understand how long flamegraph construction takes.
* Remove unnecessary Set usage.
* Remove superfluous clone. This was likely added when we needed to avoid infinite recursion when serializing to JSON. It no longer has a useful function.
* Pass in pre-calculated frame group info. I noticed that we were creating frame group info multiple times, so I added it as a parameter for the intermediate node.
* Sort callees in one place. Callees should be sorted first by samples, descending, and then by frame group. Combining the two sorts makes the post-processing clearer to future readers and maintainers (see the comparator sketch below).
* Capitalize fields in preparation for merging.
* Align both node data structures.
* Pass metadata instead of copying fields.
* Refactor frame label method.
* Use pre-calculated array length.
* Use pre-allocated array.
* Refactor intermediate node.
* Remove intermediate node structure.
* Move if statement out of for loop.
* Fix comments.
* Sort sibling nodes by frame group ID.
* Calculate graph size during creation.
* Add missing groupStackFrameMetadataByStackTrace.
* Fix formatting.
* Fix generated callee source.
* Fix creation of frame group.
* Fix test.
* Remove filter for relevant traces.
* Stop passing frame group.
* Create root node inside createCallerCalleeGraph.
* Fix timestamps.
* Remove frame group comparator.
* Add instrumentation for topN functions.
* Allow for missing stacktraces.
* Use Date.now() instead of new Date().getTime().
Parent: 0fcfaec2dd
Commit: d8901857aa
14 changed files with 413 additions and 809 deletions
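The "Sort callees in one place" bullet describes ordering callee nodes first by sample count, descending, and then by frame group. A minimal sketch of such a combined comparator; the CalleeNode shape below is invented for illustration and is not the plugin's actual caller-callee node type:

// Invented node shape for illustration only; the real caller-callee node type
// lives in the plugin's common/callercallee module.
interface CalleeNode {
  frameGroupID: string;
  samples: number;
}

// One combined comparator: samples descending, then frame group ID ascending.
function sortCallees(callees: CalleeNode[]): CalleeNode[] {
  return callees.sort(
    (a, b) => b.samples - a.samples || a.frameGroupID.localeCompare(b.frameGroupID)
  );
}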
@@ -8,7 +8,12 @@
 import { schema } from '@kbn/config-schema';
 import { RouteRegisterParameters } from '.';
 import { getRoutePaths } from '../../common';
-import { FlameGraph } from '../../common/flamegraph';
+import { createCallerCalleeGraph } from '../../common/callercallee';
+import {
+  createColumnarCallerCallee,
+  createFlameGraph,
+  ElasticFlameGraph,
+} from '../../common/flamegraph';
 import { createProfilingEsClient } from '../utils/create_profiling_es_client';
 import { withProfilingSpan } from '../utils/with_profiling_span';
 import { getClient } from './compat';

@@ -39,6 +44,7 @@ export function registerFlameChartSearchRoute({ router, logger }: RouteRegisterP
         timeTo,
         kuery,
       });
+      const totalSeconds = timeTo - timeFrom;

       const { stackTraces, executables, stackFrames, eventsIndex, totalCount, stackTraceEvents } =
         await getExecutablesAndStackTraces({

@@ -48,23 +54,47 @@ export function registerFlameChartSearchRoute({ router, logger }: RouteRegisterP
         sampleSize: targetSampleSize,
       });

-      const flamegraph = await withProfilingSpan('collect_flamegraph', async () => {
-        return new FlameGraph({
-          sampleRate: eventsIndex.sampleRate,
-          totalCount,
-          events: stackTraceEvents,
-          stackTraces,
-          stackFrames,
-          executables,
-          totalSeconds: timeTo - timeFrom,
-        }).toElastic();
-      });
+      const flamegraph = await withProfilingSpan('create_flamegraph', async () => {
+        const t0 = Date.now();
+        const graph = createCallerCalleeGraph(
+          stackTraceEvents,
+          stackTraces,
+          stackFrames,
+          executables
+        );
+        logger.info(`creating caller-callee graph took ${Date.now() - t0} ms`);
+
+        const t1 = Date.now();
+        const columnar = createColumnarCallerCallee(graph);
+        logger.info(`creating columnar caller-callee graph took ${Date.now() - t1} ms`);
+
+        const t2 = Date.now();
+        const fg = createFlameGraph(columnar);
+        logger.info(`creating flamegraph took ${Date.now() - t2} ms`);
+
+        return fg;
+      });
+
+      // sampleRate is 1/5^N, with N being the downsampled index the events were fetched from.
+      // N=0: full events table (sampleRate is 1)
+      // N=1: downsampled by 5 (sampleRate is 0.2)
+      // ...
+
+      // totalCount is the sum(Count) of all events in the filter range in the
+      // downsampled index we were looking at.
+      // To estimate how many events we have in the full events index: totalCount / sampleRate.
+      // Do the same for single entries in the events array.
+
+      const body: ElasticFlameGraph = {
+        ...flamegraph,
+        TotalSeconds: totalSeconds,
+        TotalTraces: Math.floor(totalCount / eventsIndex.sampleRate),
+        SampledTraces: totalCount,
+      };

       logger.info('returning payload response to client');

-      return response.ok({
-        body: flamegraph,
-      });
+      return response.ok({ body });
     } catch (e) {
       logger.error(e);
       return response.customError({

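The sampleRate comment in the hunk above pins down the scaling arithmetic: events are fetched from a downsampled index with sampleRate = 1/5^N, so an estimate for the full events index is totalCount / sampleRate, which is exactly what TotalTraces computes. A worked sketch of that estimate; the helper name is illustrative, not part of the plugin:

// Scale a count observed in a downsampled index (sampleRate = 1/5^N) back to an
// estimate for the full events index, mirroring
// TotalTraces: Math.floor(totalCount / eventsIndex.sampleRate).
function estimateTotalTraces(sampledCount: number, sampleRate: number): number {
  return Math.floor(sampledCount / sampleRate);
}

// Example: 10,000 events read from the N=2 index (sampleRate = 1/25 = 0.04)
// scale up to an estimated 250,000 events in the full index.
console.log(estimateTotalTraces(10_000, 0.04)); // 250000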
@@ -54,7 +54,8 @@ export function registerTopNFunctionsSearchRoute({ router, logger }: RouteRegist
         sampleSize: targetSampleSize,
       });

-      const topNFunctions = await withProfilingSpan('collect_topn_functions', async () => {
+      const t0 = Date.now();
+      const topNFunctions = await withProfilingSpan('create_topn_functions', async () => {
         return createTopNFunctions(
           stackTraceEvents,
           stackTraces,

@@ -64,6 +65,7 @@ export function registerTopNFunctionsSearchRoute({ router, logger }: RouteRegist
           endIndex
         );
       });
+      logger.info(`creating topN functions took ${Date.now() - t0} ms`);

       logger.info('returning payload response to client');

@@ -50,13 +50,13 @@ export async function getExecutablesAndStackTraces({
   if (totalCount > sampleSize * 1.1) {
     p = sampleSize / totalCount;
     logger.info('downsampling events with p=' + p);
-    const t0 = new Date().getTime();
+    const t0 = Date.now();
     const downsampledTotalCount = downsampleEventsRandomly(
       stackTraceEvents,
       p,
       filter.toString()
     );
-    logger.info(`downsampling events took ${new Date().getTime() - t0} ms`);
+    logger.info(`downsampling events took ${Date.now() - t0} ms`);
     logger.info('downsampled total count: ' + downsampledTotalCount);
     logger.info('unique downsampled stacktraces: ' + stackTraceEvents.size);
   }

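The hunk above downsamples when the observed event count exceeds the target sample size by more than 10%, keeping events with probability p = sampleSize / totalCount. A minimal sketch of that idea; the plugin's downsampleEventsRandomly works on a Map of per-trace counts and takes the query filter as a third argument (presumably for seeding), so this array-based version using Math.random() is only an approximation:

// Keep each event with probability p. This is a simplified stand-in for
// downsampleEventsRandomly, not its actual implementation.
function downsampleRandomly<T>(events: T[], p: number): T[] {
  return events.filter(() => Math.random() < p);
}

// Example: aim for roughly 100 of 10,000 events.
const events = Array.from({ length: 10_000 }, (_, i) => i);
const sampled = downsampleRandomly(events, 100 / 10_000);
console.log(sampled.length); // ~100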
@@ -12,9 +12,9 @@ export async function logExecutionLatency<T>(
   activity: string,
   func: () => Promise<T>
 ): Promise<T> {
-  const start = new Date().getTime();
+  const start = Date.now();
   return await func().then((res) => {
-    logger.info(activity + ' took ' + (new Date().getTime() - start) + 'ms');
+    logger.info(activity + ' took ' + (Date.now() - start) + 'ms');
     return res;
   });
 }

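logExecutionLatency wraps an async operation, logs how long it took, and passes the result through. A self-contained sketch of the same pattern, with console.log standing in for the Kibana Logger used by the real helper:

// Timing-wrapper pattern: run an async function, log its duration, return its result.
async function timed<T>(activity: string, func: () => Promise<T>): Promise<T> {
  const start = Date.now();
  const res = await func();
  console.log(activity + ' took ' + (Date.now() - start) + 'ms');
  return res;
}

// Example: time an artificial 100 ms delay.
timed('sleep', () => new Promise<void>((resolve) => setTimeout(resolve, 100)));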
@@ -269,7 +269,7 @@ export async function mgetStackTraces({
   const stackFrameDocIDs = new Set<string>();
   const executableDocIDs = new Set<string>();

-  const t0 = new Date().getTime();
+  const t0 = Date.now();
   // flatMap() is significantly slower than an explicit for loop
   for (const res of stackResponses) {
     for (const trace of res.docs) {

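The comment about flatMap() being slower than an explicit for loop is one of the hot-path choices this PR leans on. A hedged sketch of the two styles for collecting IDs into a Set; the doc shape is invented for illustration:

// Invented doc shape; the real loop walks Elasticsearch mget responses.
interface TraceDoc {
  frameIDs: string[];
}

// flatMap() allocates an intermediate array for every document...
function collectWithFlatMap(docs: TraceDoc[]): Set<string> {
  return new Set(docs.flatMap((doc) => doc.frameIDs));
}

// ...while an explicit nested loop adds IDs to the Set directly.
function collectWithLoop(docs: TraceDoc[]): Set<string> {
  const ids = new Set<string>();
  for (const doc of docs) {
    for (const id of doc.frameIDs) {
      ids.add(id);
    }
  }
  return ids;
}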
@@ -298,7 +298,7 @@ export async function mgetStackTraces({
       }
     }
   }
-  logger.info(`processing data took ${new Date().getTime() - t0} ms`);
+  logger.info(`processing data took ${Date.now() - t0} ms`);

   if (stackTraces.size !== 0) {
     logger.info('Average size of stacktrace: ' + totalFrames / stackTraces.size);

@@ -336,7 +336,7 @@ export async function mgetStackFrames({

   // Create a lookup map StackFrameID -> StackFrame.
   let framesFound = 0;
-  const t0 = new Date().getTime();
+  const t0 = Date.now();
   const docs = resStackFrames.docs;
   for (const frame of docs) {
     if ('error' in frame) {

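The loop above builds the "lookup map StackFrameID -> StackFrame" named in the comment, skipping docs that came back with an error. A minimal sketch of that pattern; the types here are invented stand-ins for the plugin's StackFrame and mget doc shapes:

// Invented stand-ins; the real types come from the plugin's common/profiling module.
type StackFrameID = string;
interface StackFrame {
  fileName: string;
  functionName: string;
}
type MGetDoc =
  | { _id: StackFrameID; found: boolean; _source: StackFrame }
  | { _id: StackFrameID; error: unknown };

// Build the StackFrameID -> StackFrame lookup map, skipping errored docs.
function buildFrameLookup(docs: MGetDoc[]): Map<StackFrameID, StackFrame> {
  const frames = new Map<StackFrameID, StackFrame>();
  for (const doc of docs) {
    if ('error' in doc) {
      continue;
    }
    frames.set(doc._id, doc._source);
  }
  return frames;
}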
@@ -361,7 +361,7 @@ export async function mgetStackFrames({
       });
     }
   }
-  logger.info(`processing data took ${new Date().getTime() - t0} ms`);
+  logger.info(`processing data took ${Date.now() - t0} ms`);

   logger.info('found ' + framesFound + ' / ' + stackFrameIDs.size + ' frames');

@@ -391,7 +391,7 @@ export async function mgetExecutables({

   // Create a lookup map StackFrameID -> StackFrame.
   let exeFound = 0;
-  const t0 = new Date().getTime();
+  const t0 = Date.now();
   const docs = resExecutables.docs;
   for (const exe of docs) {
     if ('error' in exe) {

@@ -408,7 +408,7 @@ export async function mgetExecutables({
       });
     }
   }
-  logger.info(`processing data took ${new Date().getTime() - t0} ms`);
+  logger.info(`processing data took ${Date.now() - t0} ms`);

   logger.info('found ' + exeFound + ' / ' + executableIDs.size + ' executables');

@@ -8,7 +8,7 @@
 import { schema } from '@kbn/config-schema';
 import type { IRouter, Logger } from '@kbn/core/server';
 import { RouteRegisterParameters } from '.';
-import { fromMapToRecord, getRoutePaths, INDEX_EVENTS } from '../../common';
+import { getRoutePaths, INDEX_EVENTS } from '../../common';
 import { ProfilingESField } from '../../common/elasticsearch';
 import { computeBucketWidthFromTimeRangeAndBucketCount } from '../../common/histogram';
 import { groupStackFrameMetadataByStackTrace, StackTraceID } from '../../common/profiling';

@@ -142,9 +142,7 @@ export async function topNElasticSearchQuery({
     );

     const metadata = await withProfilingSpan('collect_stackframe_metadata', async () => {
-      return fromMapToRecord(
-        groupStackFrameMetadataByStackTrace(stackTraces, stackFrames, executables)
-      );
+      return groupStackFrameMetadataByStackTrace(stackTraces, stackFrames, executables);
     });

     logger.info('returning payload response to client');

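The last hunk stops converting the metadata Map into a plain object via fromMapToRecord and returns the Map from groupStackFrameMetadataByStackTrace as-is, skipping a full copy of a potentially large structure. A sketch of the kind of O(n) conversion being avoided; this is an assumed shape for fromMapToRecord, not the plugin's actual implementation:

// Assumed shape of a Map -> Record conversion: every entry is walked and copied.
// Returning the Map directly avoids this extra pass over the metadata.
function mapToRecord<V>(map: Map<string, V>): Record<string, V> {
  const record: Record<string, V> = {};
  for (const [key, value] of map) {
    record[key] = value;
  }
  return record;
}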