mirror of
https://github.com/elastic/kibana.git
synced 2025-04-24 09:48:58 -04:00
[Fleet] Add unhealthy reason (input/output/other) to agent metrics (#178605)
## Summary Closes https://github.com/elastic/ingest-dev/issues/2522 Added an `unhealthy_reason` aggregation when querying agent metrics. The [mapping change](https://github.com/elastic/elasticsearch/pull/106246) and [fleet-server change](https://github.com/elastic/fleet-server/pull/3338) need to be merged first to verify this end to end. Steps to verify: - enroll an agent with docker - add endpoint integration, expect an input and output unit error status on the agent doc - wait a few seconds so that the agent metrics are published - verify that the agent metrics include `unhealthy_reason`, using the query below ``` GET metrics-fleet_server.agent_status-default/_search { "_source": ["fleet.agents"] } "hits": [ { "_index": ".ds-metrics-fleet_server.agent_status-default-2024.03.11-000001", "_id": "3JdPioUh-9j8DxQrAAABjjclRhU", "_score": 1, "_source": { "fleet": { "agents": { "enrolled": 12, "healthy": 0, "inactive": 0, "offline": 11, "total": 13, "unenrolled": 1, "unhealthy": 1, "updating": 0, "upgrading_step": { "downloading": 0, "extracting": 0, "failed": 0, "replacing": 0, "requested": 0, "restarting": 0, "rollback": 0, "scheduled": 0, "watching": 0 }, "unhealthy_reason": { "input": 1, "output": 1 } } } } }, ``` ### Checklist - [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios
This commit is contained in:
parent
300b8ee368
commit
e34a6d54e6
6 changed files with 104 additions and 1 deletions
|
@ -107,6 +107,13 @@ interface AgentBase {
|
|||
tags?: string[];
|
||||
components?: FleetServerAgentComponent[];
|
||||
agent?: FleetServerAgentMetadata;
|
||||
unhealthy_reason?: UnhealthyReason[];
|
||||
}
|
||||
|
||||
/**
 * Unhealthy reason values that can be attached to an agent: input, output, other.
 *
 * Each member is a string enum entry whose value is the lowercase form of its name.
 */
export enum UnhealthyReason {
  /** Serialized as `'input'`. */
  INPUT = 'input',
  /** Serialized as `'output'`. */
  OUTPUT = 'output',
  /** Serialized as `'other'`. */
  OTHER = 'other',
}
|
||||
|
||||
export interface AgentMetrics {
|
||||
|
@ -336,6 +343,11 @@ export interface FleetServerAgent {
|
|||
* Outputs map
|
||||
*/
|
||||
outputs?: OutputMap;
|
||||
|
||||
/**
|
||||
* Unhealthy reason: input, output, other
|
||||
*/
|
||||
unhealthy_reason?: UnhealthyReason[];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -92,6 +92,7 @@ export function searchHitToAgent(
|
|||
// key-value pairs
|
||||
user_provided_metadata: hit._source?.user_provided_metadata!,
|
||||
local_metadata: hit._source?.local_metadata!,
|
||||
unhealthy_reason: hit._source?.unhealthy_reason,
|
||||
};
|
||||
|
||||
if (!hit.fields?.status?.length) {
|
||||
|
|
|
@ -71,6 +71,22 @@ describe('fetchAgentMetrics', () => {
|
|||
},
|
||||
],
|
||||
},
|
||||
unhealthy_reason: {
|
||||
buckets: [
|
||||
{
|
||||
key: 'input',
|
||||
doc_count: 2,
|
||||
},
|
||||
{
|
||||
key: 'output',
|
||||
doc_count: 1,
|
||||
},
|
||||
{
|
||||
key: 'other',
|
||||
doc_count: 3,
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
|
@ -95,6 +111,11 @@ describe('fetchAgentMetrics', () => {
|
|||
scheduled: 0,
|
||||
watching: 0,
|
||||
},
|
||||
unhealthy_reason: {
|
||||
input: 2,
|
||||
output: 1,
|
||||
other: 3,
|
||||
},
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
@ -20,6 +20,7 @@ export interface AgentMetrics {
|
|||
agents: AgentUsage;
|
||||
agents_per_version: AgentPerVersion[];
|
||||
upgrading_step: UpgradingSteps;
|
||||
unhealthy_reason: UnhealthyReason;
|
||||
}
|
||||
|
||||
export interface UpgradingSteps {
|
||||
|
@ -34,6 +35,12 @@ export interface UpgradingSteps {
|
|||
failed: number;
|
||||
}
|
||||
|
||||
/**
 * Per-reason counters reported under `unhealthy_reason` in the agent metrics.
 */
export interface UnhealthyReason {
  /** Count for the `input` reason. */
  input: number;
  /** Count for the `output` reason. */
  output: number;
  /** Count for the `other` reason. */
  other: number;
}
|
||||
|
||||
export const fetchAgentMetrics = async (
|
||||
core: CoreSetup,
|
||||
abortController: AbortController
|
||||
|
@ -63,6 +70,7 @@ export const fetchAgentMetrics = async (
|
|||
agents: await getAgentUsage(soClient, esClient),
|
||||
agents_per_version: await getAgentsPerVersion(esClient, abortController),
|
||||
upgrading_step: await getUpgradingSteps(esClient, abortController),
|
||||
unhealthy_reason: await getUnhealthyReason(esClient, abortController),
|
||||
};
|
||||
return usage;
|
||||
};
|
||||
|
@ -195,3 +203,53 @@ export const getUpgradingSteps = async (
|
|||
return upgradingSteps;
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Counts agents per unhealthy reason by running a `terms` aggregation on the
 * `unhealthy_reason` field of the agents index.
 *
 * Reasons that are missing from the response are reported as 0, and bucket keys
 * other than `input`, `output` and `other` are ignored.
 *
 * @param esClient Elasticsearch client used to run the search.
 * @param abortController Controller whose signal is passed to the search request.
 * @returns Counts for the `input`, `output` and `other` reasons; all zeros when the
 *   search fails with a 404 status code.
 * @throws Rethrows any error raised by the search whose `statusCode` is not 404.
 */
export const getUnhealthyReason = async (
  esClient: ElasticsearchClient,
  abortController: AbortController
): Promise<UnhealthyReason> => {
  const unhealthyReason: UnhealthyReason = { input: 0, output: 0, other: 0 };

  try {
    const response = await retryTransientEsErrors(() =>
      esClient.search(
        {
          index: AGENTS_INDEX,
          // Only the aggregation is needed, no hits.
          size: 0,
          aggs: {
            unhealthy_reason: {
              terms: { field: 'unhealthy_reason' },
            },
          },
        },
        { signal: abortController.signal }
      )
    );

    // Narrow structural view of the terms aggregation result instead of `any`.
    const aggregation = response?.aggregations?.unhealthy_reason as
      | { buckets?: Array<{ key: unknown; doc_count: number }> }
      | undefined;

    for (const { key, doc_count: docCount } of aggregation?.buckets ?? []) {
      // Explicit key comparison (rather than an `in` check) so that unexpected
      // bucket keys can never write to anything but the three known counters.
      if (key === 'input' || key === 'output' || key === 'other') {
        unhealthyReason[key] = docCount;
      }
    }
  } catch (error) {
    // Optional chaining keeps a thrown null/undefined from raising a secondary
    // TypeError here; anything that is not a 404 is propagated unchanged.
    if ((error as { statusCode?: number } | null | undefined)?.statusCode !== 404) {
      throw error;
    }
    appContextService.getLogger().debug('Index .fleet-agents does not exist yet.');
  }

  return unhealthyReason;
};
|
||||
|
|
|
@ -104,6 +104,11 @@ describe('fleet metrics task', () => {
|
|||
count: 2,
|
||||
},
|
||||
],
|
||||
unhealthy_reason: {
|
||||
input: 2,
|
||||
output: 1,
|
||||
other: 3,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -149,6 +154,11 @@ describe('fleet metrics task', () => {
|
|||
scheduled: 1,
|
||||
requested: 1,
|
||||
},
|
||||
unhealthy_reason: {
|
||||
input: 2,
|
||||
output: 1,
|
||||
other: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
}),
|
||||
|
|
|
@ -20,7 +20,7 @@ import { appContextService } from '../app_context';
|
|||
import type { AgentMetrics } from './fetch_agent_metrics';
|
||||
|
||||
export const TYPE = 'Fleet-Metrics-Task';
|
||||
export const VERSION = '1.0.0';
|
||||
export const VERSION = '1.1.0';
|
||||
const TITLE = 'Fleet Metrics Task';
|
||||
const TIMEOUT = '1m';
|
||||
const SCOPE = ['fleet'];
|
||||
|
@ -115,6 +115,7 @@ export class FleetMetricsTask {
|
|||
unhealthy: agents.unhealthy,
|
||||
inactive: agents.inactive,
|
||||
upgrading_step: agentMetrics.upgrading_step,
|
||||
unhealthy_reason: agentMetrics.unhealthy_reason,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue