[8.9][Fleet] added agent logs top errors from 100 hits (#162135) (#163278)

Backport https://github.com/elastic/kibana/pull/162135
This commit is contained in:
Julia Bardi 2023-08-07 13:07:49 +02:00 committed by GitHub
parent 99b0b1dfa1
commit 40743390be
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 134 additions and 25 deletions

View file

@ -0,0 +1,84 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import { getAgentLogsTopErrors } from './agent_logs_top_errors';
/**
 * Unit test for getAgentLogsTopErrors: the Elasticsearch client is replaced by a mock whose
 * `search` answers with canned hits, and the returned top-error lists are compared against the
 * expected ordering (most frequent message first).
 */
describe('getAgentLogsTopErrors', () => {
  // Wraps plain log messages in the only part of a search response this test supplies:
  // `hits.hits[]._source.message`.
  const asSearchResponse = (messages: string[]) => ({
    hits: {
      hits: messages.map((message) => ({ _source: { message } })),
    },
  });

  it('should return top 3 errors from 100 hits', async () => {
    // 'error 3' occurs three times, 'error 2' twice and 'error 1' once.
    const agentLogMessages = ['error 2', 'error 2', 'error 3', 'error 3', 'error 3', 'error 1'];
    // 'fleet server error 2' occurs twice, 'fleet server error 1' once.
    const fleetServerLogMessages = [
      'fleet server error 2',
      'fleet server error 2',
      'fleet server error 1',
    ];

    // The agent index pattern gets the agent messages; any other index gets the fleet server ones.
    const search = jest
      .fn()
      .mockImplementation((params) =>
        params.index === 'logs-elastic_agent-*'
          ? asSearchResponse(agentLogMessages)
          : asSearchResponse(fleetServerLogMessages)
      );
    const esClientMock = { search } as unknown as ElasticsearchClient;

    const topErrors = await getAgentLogsTopErrors(esClientMock);

    expect(topErrors).toEqual({
      agent_logs_top_errors: ['error 3', 'error 2', 'error 1'],
      fleet_server_logs_top_errors: ['fleet server error 2', 'fleet server error 1'],
    });
  });
});

View file

@ -7,6 +7,10 @@
import type { ElasticsearchClient } from '@kbn/core-elasticsearch-server';
import { sortBy } from 'lodash';
import { DATA_TIERS } from '../../common/constants';
import { appContextService } from '../services';
export interface AgentLogsData {
@ -29,10 +33,16 @@ export async function getAgentLogsTopErrors(
const queryTopMessages = (index: string) =>
esClient.search({
index,
size: 0,
size: 100,
_source: ['message'],
query: {
bool: {
filter: [
{
terms: {
_tier: DATA_TIERS,
},
},
{
term: {
'log.level': 'error',
@ -48,35 +58,32 @@ export async function getAgentLogsTopErrors(
],
},
},
aggs: {
message_sample: {
sampler: {
shard_size: 200,
},
aggs: {
categories: {
categorize_text: {
field: 'message',
size: 10,
},
},
},
},
},
});
const transformBuckets = (resp: any) =>
((resp?.aggregations?.message_sample as any)?.categories?.buckets ?? [])
/**
 * Picks the most frequent error messages out of a search response.
 *
 * Every hit's `_source.message` is tallied, and the (at most) three messages with the highest
 * number of occurrences are returned, most frequent first. Messages with equal counts are
 * returned in reverse order of first appearance.
 *
 * @param resp - Search response; only `hits.hits[]._source.message` is read. A missing response,
 *   or a response without `hits.hits`, yields an empty result.
 * @returns Up to three distinct messages ordered by descending occurrence count.
 */
const getTopErrors = (resp: any): string[] => {
  // A Map is used instead of a plain object so that messages such as "constructor" or
  // "__proto__" cannot collide with inherited object members, and so that numeric-looking
  // messages keep their first-seen position.
  const counts = new Map<string, number>();
  for (const hit of resp?.hits?.hits ?? []) {
    const message = hit?._source?.message;
    // A hit without a string message has nothing to report; counting it would surface the
    // literal text "undefined" as an error message.
    if (typeof message !== 'string') {
      continue;
    }
    counts.set(message, (counts.get(message) ?? 0) + 1);
  }

  // Sort ascending by count (Array.prototype.sort is stable), flip to descending, and only then
  // keep three entries. Truncating before the flip would keep the three LEAST frequent messages
  // whenever more than three distinct messages are present.
  return [...counts.entries()]
    .sort(([, countA], [, countB]) => countA - countB)
    .reverse()
    .slice(0, 3)
    .map(([message]) => message);
};
const agentResponse = await queryTopMessages('logs-elastic_agent-*');
const fleetServerResponse = await queryTopMessages('logs-elastic_agent.fleet_server-*');
return {
agent_logs_top_errors: transformBuckets(agentResponse),
fleet_server_logs_top_errors: transformBuckets(fleetServerResponse),
agent_logs_top_errors: getTopErrors(agentResponse),
fleet_server_logs_top_errors: getTopErrors(fleetServerResponse),
};
} catch (error) {
if (error.statusCode === 404) {

View file

@ -21,6 +21,7 @@ import type { FleetServerUsage } from './fleet_server_collector';
import { getAgentPoliciesUsage } from './agent_policies';
import type { AgentPanicLogsData } from './agent_logs_panics';
import { getPanicLogsLastHour } from './agent_logs_panics';
import { getAgentLogsTopErrors } from './agent_logs_top_errors';
export interface Usage {
agents_enabled: boolean;
@ -64,8 +65,7 @@ export const fetchFleetUsage = async (
fleet_server_config: await getFleetServerConfig(soClient),
agent_policies: await getAgentPoliciesUsage(soClient),
...(await getPanicLogsLastHour(esClient)),
// TODO removed top errors telemetry as it causes this issue: https://github.com/elastic/kibana/issues/148976
// ...(await getAgentLogsTopErrors(esClient)),
...(await getAgentLogsTopErrors(esClient)),
};
return usage;
};

View file

@ -382,8 +382,12 @@ describe('fleet usage telemetry', () => {
message: 'stderr panic some other panic',
},
],
// agent_logs_top_errors: ['stderr panic close of closed channel'],
// fleet_server_logs_top_errors: ['failed to unenroll offline agents'],
agent_logs_top_errors: [
'stderr panic some other panic',
'stderr panic close of closed channel',
'this should not be included in metrics',
],
fleet_server_logs_top_errors: ['failed to unenroll offline agents'],
})
);
});

View file

@ -211,4 +211,18 @@ export const fleetUsagesSchema: RootSchema<any> = {
},
},
},
agent_logs_top_errors: {
type: 'array',
items: {
type: 'text',
_meta: { description: 'Top messages from agent error logs' },
},
},
fleet_server_logs_top_errors: {
type: 'array',
items: {
type: 'text',
_meta: { description: 'Top messages from fleet server error logs' },
},
},
};