[8.6] [Fleet] Add Agent logs panic messages from last hour to telemetry (#149825) (#150037)

# Backport

This will backport the following commits from `main` to `8.6`:
- [[Fleet] Add Agent logs panic messages from last hour to telemetry
(#149825)](https://github.com/elastic/kibana/pull/149825)

<!--- Backport version: 8.9.7 -->

### Questions?
Please refer to the [Backport tool documentation](https://github.com/sqren/backport).
### Summary of the original PR (#149825)

Closes https://github.com/elastic/ingest-dev/issues/1486.

Adds an `agent_logs_panics_last_hour` telemetry field containing the message and timestamp of every Agent log message from the last hour that contains the word "panic", capped at 100 messages.
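
For orientation, the new field lands in the reported Fleet usage payload with roughly the following shape. This is an illustrative sketch only: the messages are taken from the test fixtures further down, and the timestamps are invented.

```ts
// Illustrative shape of the new telemetry field; timestamps are made up.
const exampleUsageFragment = {
  agent_logs_panics_last_hour: [
    { timestamp: '2023-01-30T16:45:12.000Z', message: 'stderr panic close of closed channel' },
    { timestamp: '2023-01-30T16:50:03.000Z', message: 'stderr panic some other panic' },
  ],
};
```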
Commit daf9cfe5a8 (parent 5d8aa894ff), authored by Mark Hopkin and committed via GitHub on 2023-02-01 13:46:26 +00:00.
5 changed files with 142 additions and 2 deletions

New file: the Agent logs panic-messages collector (getPanicLogsLastHour).

@@ -0,0 +1,65 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';

const AGENT_LOGS_INDEX_PATTERN = 'logs-elastic_agent-*';
const MAX_MESSAGE_COUNT = 100;

export interface AgentPanicLogsData {
  agent_logs_panics_last_hour: Array<{ message: string; timestamp: string }>;
}

interface MaybeLogsDoc {
  message?: string;
  '@timestamp'?: string;
}

const DEFAULT_LOGS_DATA = {
  agent_logs_panics_last_hour: [],
};

export async function getPanicLogsLastHour(
  esClient?: ElasticsearchClient
): Promise<AgentPanicLogsData> {
  if (!esClient) {
    return DEFAULT_LOGS_DATA;
  }

  const res = await esClient.search<MaybeLogsDoc>({
    index: AGENT_LOGS_INDEX_PATTERN,
    size: MAX_MESSAGE_COUNT,
    sort: [{ '@timestamp': 'desc' }],
    _source: ['message', '@timestamp'],
    query: {
      bool: {
        filter: [
          {
            range: {
              '@timestamp': {
                gte: 'now-1h',
              },
            },
          },
          {
            match: {
              message: 'panic',
            },
          },
        ],
      },
    },
  });

  const panicLogsLastHour = res.hits.hits.map((hit) => ({
    message: hit._source?.message || '',
    timestamp: hit._source?.['@timestamp'] || '',
  }));

  return {
    agent_logs_panics_last_hour: panicLogsLastHour,
  };
}
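
As a quick sketch of how this collector behaves, the snippet below exercises getPanicLogsLastHour with a hand-rolled stub in place of a real ElasticsearchClient. The stub and its canned hit are assumptions made for the example, not Fleet test utilities.

```ts
import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import { getPanicLogsLastHour } from './agent_logs_panics';

// Minimal stub: only implements the single `search` call the collector makes,
// returning one canned hit. Cast through `unknown` because it is not a full client.
const stubEsClient = {
  search: async () => ({
    hits: {
      hits: [
        {
          _source: {
            message: 'stderr panic close of closed channel',
            '@timestamp': '2023-01-30T17:00:00.000Z',
          },
        },
      ],
    },
  }),
} as unknown as ElasticsearchClient;

async function demo() {
  // With a client: resolves to the mapped panic messages (at most 100, per the query size).
  const withClient = await getPanicLogsLastHour(stubEsClient);
  // => { agent_logs_panics_last_hour: [{ message: 'stderr panic close of closed channel', timestamp: '2023-01-30T17:00:00.000Z' }] }

  // Without a client: falls back to the empty default.
  const withoutClient = await getPanicLogsLastHour(undefined);
  // => { agent_logs_panics_last_hour: [] }

  console.log(withClient, withoutClient);
}

void demo();
```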

Fleet usage collector: wire the panic-logs data into FleetUsage and fetchFleetUsage.

@@ -19,6 +19,8 @@ import type { PackageUsage } from './package_collectors';
import { getFleetServerUsage, getFleetServerConfig } from './fleet_server_collector';
import type { FleetServerUsage } from './fleet_server_collector';
import { getAgentPoliciesUsage } from './agent_policies';
import type { AgentPanicLogsData } from './agent_logs_panics';
import { getPanicLogsLastHour } from './agent_logs_panics';
export interface Usage {
agents_enabled: boolean;
@@ -39,6 +41,7 @@ export interface FleetUsage extends Usage {
degraded: number;
};
agents_per_policy: number[];
agent_logs_panics_last_hour: AgentPanicLogsData['agent_logs_panics_last_hour'];
agent_logs_top_errors?: string[];
fleet_server_logs_top_errors?: string[];
}
@@ -47,7 +50,7 @@ export const fetchFleetUsage = async (
core: CoreSetup,
config: FleetConfigType,
abortController: AbortController
) => {
): Promise<FleetUsage | undefined> => {
const [soClient, esClient] = await getInternalClients(core);
if (!soClient || !esClient) {
return;
@@ -60,6 +63,7 @@ export const fetchFleetUsage = async (
...(await getAgentData(esClient, abortController)),
fleet_server_config: await getFleetServerConfig(soClient),
agent_policies: await getAgentPoliciesUsage(esClient, abortController),
...(await getPanicLogsLastHour(esClient)),
// TODO removed top errors telemetry as it causes this issue: https://github.com/elastic/kibana/issues/148976
// ...(await getAgentLogsTopErrors(esClient)),
};
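
Because getPanicLogsLastHour resolves to an object keyed by agent_logs_panics_last_hour, spreading its result merges the field straight into the usage object built above. A minimal sketch of that pattern, with a hypothetical otherCollectedFields placeholder standing in for the real collectors:

```ts
import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import { getPanicLogsLastHour } from './agent_logs_panics';

export async function buildUsageFragment(esClient: ElasticsearchClient) {
  // Placeholder for the other collected fields (hypothetical, for illustration only).
  const otherCollectedFields = { agents_enabled: true };

  // Spreading the collector's result contributes agent_logs_panics_last_hour directly.
  return {
    ...otherCollectedFields,
    ...(await getPanicLogsLastHour(esClient)),
  };
}
```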

Fleet usage telemetry integration test: seed panic log documents and assert they show up in the reported usage.

@@ -176,7 +176,7 @@ describe('fleet usage telemetry', () => {
await esClient.create({
index: 'logs-elastic_agent-default',
id: 'log1',
id: 'panic1',
body: {
log: {
level: 'error',
@@ -187,6 +187,45 @@ describe('fleet usage telemetry', () => {
refresh: 'wait_for',
});
await esClient.create({
index: 'logs-elastic_agent-default',
id: 'panic2',
body: {
log: {
level: 'error',
},
'@timestamp': new Date(Date.now() - 1000 * 60).toISOString(),
message: 'stderr panic some other panic',
},
refresh: 'wait_for',
});
await esClient.create({
index: 'logs-elastic_agent-default',
id: 'not-panic',
body: {
log: {
level: 'error',
},
'@timestamp': new Date().toISOString(),
message: 'this should not be included in metrics',
},
refresh: 'wait_for',
});
await esClient.create({
index: 'logs-elastic_agent-default',
id: 'panic-outside-time-range',
body: {
log: {
level: 'error',
},
'@timestamp': new Date(Date.now() - 2000 * 60 * 60).toISOString(),
message: 'stderr panic this should not be included in metrics',
},
refresh: 'wait_for',
});
await esClient.create({
index: 'logs-elastic_agent.fleet_server-default',
id: 'log2',
@@ -279,6 +318,16 @@ describe('fleet usage telemetry', () => {
],
},
agent_policies: { count: 3, output_types: ['elasticsearch'] },
agent_logs_panics_last_hour: [
{
timestamp: expect.any(String),
message: 'stderr panic close of closed channel',
},
{
timestamp: expect.any(String),
message: 'stderr panic some other panic',
},
],
// agent_logs_top_errors: ['stderr panic close of closed channel'],
// fleet_server_logs_top_errors: ['failed to unenroll offline agents'],
})

Fleet usages telemetry schema: describe the new agent_logs_panics_last_hour field.

@@ -177,4 +177,26 @@ export const fleetUsagesSchema: RootSchema<any> = {
},
},
},
agent_logs_panics_last_hour: {
type: 'array',
_meta: {
description: 'Array of log messages containing the word panic from the last hour',
},
items: {
properties: {
timestamp: {
type: 'date',
_meta: {
description: 'Timestamp of the log message containing the word panic',
},
},
message: {
type: 'text',
_meta: {
description: 'Log message containing the word panic',
},
},
},
},
},
};
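
For context, a RootSchema like this is normally registered with Kibana's core analytics service so that reported events can be validated against it. The sketch below is hedged: it assumes the core analytics registerEventType API, and both the 'fleet_usage' event type string and the './fleet_usages_schema' import path are illustrative guesses, not the actual Fleet wiring.

```ts
import type { CoreSetup } from '@kbn/core/server';
import { fleetUsagesSchema } from './fleet_usages_schema'; // assumed path for illustration

export function registerFleetUsageEventType(core: CoreSetup) {
  // Register the schema under an assumed event type name so events of this
  // shape (including agent_logs_panics_last_hour) can be reported.
  core.analytics.registerEventType({
    eventType: 'fleet_usage', // assumption, not necessarily Fleet's real event type
    schema: fleetUsagesSchema,
  });
}
```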