[Uptime] Fix performance of summaries overview (#40724)

The scripted_metric aggregation doesn't perform well over large numbers of documents. We can shrink the set of documents it runs over with a preliminary query that restricts the result set to only the most recent documents per agent per monitor.

A good way to check the perf difference is to let a few monitors run for a while with a 1s schedule. This creates quite a bit of data quickly. You'll notice that looking at even a few hours of data without this patch is slow, and looking at a few days can time out the query.
This commit is contained in:
Andrew Cholakian 2019-07-11 13:44:08 -05:00 committed by GitHub
parent be93d9066d
commit 1025f2df7a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 155 additions and 301 deletions

View file

@ -10,4 +10,4 @@ export { CONTEXT_DEFAULTS } from './context_defaults';
export { INDEX_NAMES } from './index_names';
export { INTEGRATED_SOLUTIONS } from './capabilities';
export { PLUGIN } from './plugin';
export { QUERY } from './query';
export { QUERY, LEGACY_STATES_QUERY_SIZE } from './query';

View file

@ -23,3 +23,6 @@ export const QUERY = {
'error.type',
],
};
// Number of results returned for a legacy states query.
// The monitor states adapter uses this as the page size of its composite
// aggregations, as the cap on how many monitors it collects, and as the
// size of its per-monitor terms aggregation.
export const LEGACY_STATES_QUERY_SIZE = 50;

View file

@ -14,7 +14,7 @@ import {
SnapshotCount,
StatesIndexStatus,
} from '../../../../common/graphql/types';
import { INDEX_NAMES } from '../../../../common/constants';
import { INDEX_NAMES, LEGACY_STATES_QUERY_SIZE } from '../../../../common/constants';
import { getHistogramInterval, getFilteredQueryAndStatusFilter } from '../../helper';
type SortChecks = (check: Check) => string[];
@ -29,11 +29,119 @@ interface LegacyMonitorStatesQueryResult {
afterKey: any | null;
}
// Result of the preliminary query that collects the most recent check groups.
interface LegacyMonitorStatesRecentCheckGroupsQueryResult {
// One check group ID per agent seen for each monitor in the fetched page.
checkGroups: string[];
// The composite aggregation's `after_key`, or null when the response has none.
afterKey: any | null;
}
export class ElasticsearchMonitorStatesAdapter implements UMMonitorStatesAdapter {
/**
 * @param database - adapter this class uses to run its searches. The
 * `private readonly` parameter property already assigns it to
 * `this.database`, so no explicit assignment is needed in the body.
 */
constructor(private readonly database: DatabaseAdapter) {}
/**
 * Collects, for one page of monitors, the check group of the most recent
 * completed check reported by each agent.
 *
 * Monitors are paged with a composite aggregation on `monitor.id`. For each
 * monitor the newest documents are fetched with `top_hits`, and the first
 * (newest) document seen for each `agent.id` contributes its
 * `monitor.check_group`.
 *
 * @param request - passed through unchanged to `database.search`
 * @param query - Elasticsearch query clause applied as an additional filter
 * @param searchAfter - composite `after` key of the previous page; omit for the first page
 * @returns the collected check group IDs and the composite `after_key`
 * (null when the response does not contain one)
 */
private async runLegacyMonitorStatesRecentCheckGroupsQuery(
  request: any,
  query: any,
  searchAfter?: any
): Promise<LegacyMonitorStatesRecentCheckGroupsQueryResult> {
  // The idea here is that we want to get enough documents to get all
  // possible agent IDs. Doing that in a deterministic way is hard,
  // but all agent IDs should be represented in the top 50 results in most cases.
  // There's an edge case here where a user has accidentally configured
  // two agents to run on different schedules, but that's an issue on the user side.
  const topHitsPerMonitor = 50;

  const body = {
    query: {
      bool: {
        filter: [
          {
            // We check for summary.up to ensure that the check group
            // is complete. Summary fields are only present on
            // completed check groups.
            exists: {
              field: 'summary.up',
            },
          },
          query,
        ],
      },
    },
    sort: [
      {
        '@timestamp': 'desc',
      },
    ],
    size: 0,
    aggs: {
      monitors: {
        composite: {
          size: LEGACY_STATES_QUERY_SIZE,
          sources: [
            {
              monitor_id: {
                terms: {
                  field: 'monitor.id',
                },
              },
            },
          ],
        },
        aggs: {
          top: {
            top_hits: {
              sort: [
                {
                  '@timestamp': 'desc',
                },
              ],
              _source: {
                includes: ['monitor.check_group', '@timestamp', 'agent.id'],
              },
              size: topHitsPerMonitor,
            },
          },
        },
      },
    },
  };

  if (searchAfter) {
    set(body, 'aggs.monitors.composite.after', searchAfter);
  }

  const params = {
    index: INDEX_NAMES.HEARTBEAT,
    body,
  };

  const result = await this.database.search(request, params);

  // Read the buckets defensively, the same way after_key is read below, so a
  // response without the aggregation yields an empty result instead of a TypeError.
  const buckets = get<any[]>(result, 'aggregations.monitors.buckets', []);
  const checkGroups: string[] = [];

  for (const bucket of buckets) {
    const topHits = get<any[]>(bucket, 'top.hits.hits', []);
    const seenAgentIds = new Set<string>();

    for (const hit of topHits) {
      const agentId = get<string | null>(hit, '_source.agent.id', null);
      const checkGroup = get<string | null>(hit, '_source.monitor.check_group', null);

      // Skip documents that lack either field; they cannot be attributed to
      // an agent, and an undefined check group must not reach the terms filter.
      if (agentId == null || checkGroup == null) {
        continue;
      }

      // Hits are sorted by timestamp descending, so the first hit seen for
      // an agent is that agent's most recent check group.
      if (!seenAgentIds.has(agentId)) {
        seenAgentIds.add(agentId);
        checkGroups.push(checkGroup);
      }
    }
  }

  const afterKey = get<any | null>(result, 'aggregations.monitors.after_key', null);

  return {
    checkGroups,
    afterKey,
  };
}
private async runLegacyMonitorStatesQuery(
request: any,
dateRangeStart: string,
@ -46,15 +154,38 @@ export class ElasticsearchMonitorStatesAdapter implements UMMonitorStatesAdapter
dateRangeEnd,
filters
);
// First we fetch the most recent check groups for this query
// This is a critical performance optimization.
// Without this the expensive scripted_metric agg below will run
// over large numbers of documents.
// It only really needs to run over the latest complete check group for each
// agent.
const { checkGroups, afterKey } = await this.runLegacyMonitorStatesRecentCheckGroupsQuery(
request,
query,
searchAfter
);
const params = {
index: INDEX_NAMES.HEARTBEAT,
body: {
query,
query: {
bool: {
filter: [
{ terms: { 'monitor.check_group': checkGroups } },
// Even though this work is already done when calculating the groups
// this helps the planner
query,
],
},
},
sort: [{ '@timestamp': 'desc' }],
size: 0,
aggs: {
monitors: {
composite: {
size: 100,
size: LEGACY_STATES_QUERY_SIZE,
sources: [
{
monitor_id: {
@ -234,11 +365,7 @@ export class ElasticsearchMonitorStatesAdapter implements UMMonitorStatesAdapter
},
};
if (searchAfter) {
set(params, 'body.aggs.monitors.composite.after', searchAfter);
}
const result = await this.database.search(request, params);
const afterKey = get<any | null>(result, 'aggregations.monitors.after_key', null);
return { afterKey, result, statusFilter };
}
@ -270,7 +397,7 @@ export class ElasticsearchMonitorStatesAdapter implements UMMonitorStatesAdapter
);
monitors.push(...this.getMonitorBuckets(result, statusFilter));
searchAfter = afterKey;
} while (searchAfter !== null && monitors.length < 200);
} while (searchAfter !== null && monitors.length < LEGACY_STATES_QUERY_SIZE);
const monitorIds: string[] = [];
const summaries: MonitorSummary[] = monitors.map((monitor: any) => {
@ -400,7 +527,7 @@ export class ElasticsearchMonitorStatesAdapter implements UMMonitorStatesAdapter
by_id: {
terms: {
field: 'monitor.id',
size: 200,
size: LEGACY_STATES_QUERY_SIZE,
},
aggs: {
histogram: {

View file

@ -30,38 +30,6 @@
"state": {
"agent": null,
"checks": [
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.10.100", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548698895076"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.10.228", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548698323074"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.10.68", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548698607075"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.11.36", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548699431074"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
@ -69,44 +37,12 @@
"monitor": { "ip": "172.217.12.132", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700995077"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.12.164", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700551074"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.6.228", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700851075"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.7.4", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700295075"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.9.228", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548699691076"
}
],
"geo": null,
"observer": { "geo": { "name": [], "location": null } },
"monitor": { "id": null, "name": null, "status": "up", "type": null },
"summary": { "up": 9, "down": 0, "geo": null },
"summary": { "up": 1, "down": 0, "geo": null },
"url": { "full": "https://www.google.com/", "domain": "www.google.com" },
"timestamp": 1548700995077
}
@ -184,38 +120,6 @@
"state": {
"agent": null,
"checks": [
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.10.100", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548698865076"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.10.228", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548698325076"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.10.68", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548698595076"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.11.36", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548699430074"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
@ -223,44 +127,12 @@
"monitor": { "ip": "172.217.12.132", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700995077"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.12.164", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700565078"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.6.228", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700820075"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.7.4", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700295075"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.9.228", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548699705078"
}
],
"geo": null,
"observer": { "geo": { "name": [], "location": null } },
"monitor": { "id": null, "name": null, "status": "up", "type": null },
"summary": { "up": 9, "down": 0, "geo": null },
"summary": { "up": 1, "down": 0, "geo": null },
"url": { "full": "http://www.google.com/", "domain": "www.google.com" },
"timestamp": 1548700995077
}
@ -300,20 +172,12 @@
"monitor": { "ip": "192.30.253.112", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700995077"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "192.30.253.113", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700939074"
}
],
"geo": null,
"observer": { "geo": { "name": [], "location": null } },
"monitor": { "id": null, "name": null, "status": "up", "type": null },
"summary": { "up": 2, "down": 0, "geo": null },
"summary": { "up": 1, "down": 0, "geo": null },
"url": { "full": "https://www.github.com/", "domain": "www.github.com" },
"timestamp": 1548700995077
}
@ -346,14 +210,6 @@
"state": {
"agent": null,
"checks": [
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "184.168.131.233", "name": "", "status": "down" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548699315077"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
@ -366,7 +222,7 @@
"geo": null,
"observer": { "geo": { "name": [], "location": null } },
"monitor": { "id": null, "name": null, "status": "down", "type": null },
"summary": { "up": 0, "down": 2, "geo": null },
"summary": { "up": 0, "down": 1, "geo": null },
"url": { "full": "http://www.example.com/", "domain": "www.example.com" },
"timestamp": 1548700987078
}
@ -405,7 +261,7 @@
"kubernetes": null,
"monitor": { "ip": "208.80.154.224", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700915076"
"timestamp": "1548700975074"
}
],
"geo": null,
@ -413,7 +269,7 @@
"monitor": { "id": null, "name": null, "status": "up", "type": null },
"summary": { "up": 1, "down": 0, "geo": null },
"url": { "full": "https://www.wikipedia.org/", "domain": "www.wikipedia.org" },
"timestamp": 1548700915076
"timestamp": 1548700975074
}
},
{
@ -444,30 +300,6 @@
"state": {
"agent": null,
"checks": [
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": null, "name": "", "status": "down" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548698590077"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "151.101.201.140", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700813076"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "151.101.209.140", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700651074"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
@ -479,8 +311,8 @@
],
"geo": null,
"observer": { "geo": { "name": [], "location": null } },
"monitor": { "id": null, "name": null, "status": "mixed", "type": null },
"summary": { "up": 3, "down": 1, "geo": null },
"monitor": { "id": null, "name": null, "status": "up", "type": null },
"summary": { "up": 1, "down": 0, "geo": null },
"url": { "full": "http://www.reddit.com/", "domain": "www.reddit.com" },
"timestamp": 1548700993074
}
@ -513,22 +345,6 @@
"state": {
"agent": null,
"checks": [
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "151.101.202.217", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700903078"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "151.101.210.217", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700651073"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
@ -541,7 +357,7 @@
"geo": null,
"observer": { "geo": { "name": [], "location": null } },
"monitor": { "id": null, "name": null, "status": "up", "type": null },
"summary": { "up": 3, "down": 0, "geo": null },
"summary": { "up": 1, "down": 0, "geo": null },
"url": { "full": "https://www.elastic.co", "domain": "www.elastic.co" },
"timestamp": 1548700993074
}
@ -574,93 +390,21 @@
"state": {
"agent": null,
"checks": [
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.10.14", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548699775075"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.10.238", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700735077"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.11.14", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548699235073"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.12.142", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548698575076"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.12.174", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700135074"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.3.110", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548699415073"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.6.206", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548698935078"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.6.238", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700375075"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "172.217.9.238", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548699955075"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "216.58.219.206", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700915076"
"timestamp": "1548700975074"
}
],
"geo": null,
"observer": { "geo": { "name": [], "location": null } },
"monitor": { "id": null, "name": null, "status": "up", "type": null },
"summary": { "up": 10, "down": 0, "geo": null },
"summary": { "up": 1, "down": 0, "geo": null },
"url": { "full": "https://news.google.com/", "domain": "news.google.com" },
"timestamp": 1548700915076
"timestamp": 1548700975074
}
},
{

View file

@ -30,22 +30,6 @@
"state": {
"agent": null,
"checks": [
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "151.101.202.217", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700903078"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
"kubernetes": null,
"monitor": { "ip": "151.101.210.217", "name": "", "status": "up" },
"observer": { "geo": { "name": null, "location": null } },
"timestamp": "1548700651073"
},
{
"agent": { "id": "5884d7f7-9a49-4b0e-bff2-72a475aa695f" },
"container": null,
@ -58,7 +42,7 @@
"geo": null,
"observer": { "geo": { "name": [], "location": null } },
"monitor": { "id": null, "name": null, "status": "up", "type": null },
"summary": { "up": 3, "down": 0, "geo": null },
"summary": { "up": 1, "down": 0, "geo": null },
"url": { "full": "https://www.elastic.co", "domain": "www.elastic.co" },
"timestamp": 1548700993074
}

View file

@ -1,5 +1 @@
{
"snapshot": {
"counts": { "down": 2, "mixed": 1, "up": 7, "total": 10 }
}
}
{ "snapshot": { "counts": { "down": 2, "mixed": 0, "up": 8, "total": 10 } } }

View file

@ -1,5 +1,5 @@
{
"snapshot": {
"counts": { "down": 0, "mixed": 0, "up": 7, "total": 7 }
"counts": { "down": 0, "mixed": 0, "up": 8, "total": 8 }
}
}