[Fleet] Add unhealthy reason (input/output/other) to agent metrics (#178605)

## Summary

Closes https://github.com/elastic/ingest-dev/issues/2522

Added `unhealthy_reason` aggregation when querying agent metrics.

The [mapping
change](https://github.com/elastic/elasticsearch/pull/106246) and
[fleet-server change](https://github.com/elastic/fleet-server/pull/3338)
need to be merged first before this can be verified end to end.

Steps to verify:
- enroll an agent with docker
- add endpoint integration, expect an input and output unit error status
on the agent doc
- wait a few seconds so that the agent metrics are published
- verify that the agent metrics include `unhealthy_reason`, using the
query below

```
GET metrics-fleet_server.agent_status-default/_search
{
  "_source": ["fleet.agents"]
}

  "hits": [
      {
        "_index": ".ds-metrics-fleet_server.agent_status-default-2024.03.11-000001",
        "_id": "3JdPioUh-9j8DxQrAAABjjclRhU",
        "_score": 1,
        "_source": {
          "fleet": {
            "agents": {
              "enrolled": 12,
              "healthy": 0,
              "inactive": 0,
              "offline": 11,
              "total": 13,
              "unenrolled": 1,
              "unhealthy": 1,
              "updating": 0,
              "upgrading_step": {
                "downloading": 0,
                "extracting": 0,
                "failed": 0,
                "replacing": 0,
                "requested": 0,
                "restarting": 0,
                "rollback": 0,
                "scheduled": 0,
                "watching": 0
              },
              "unhealthy_reason": {
                  "input": 1,
                  "output": 1
                }
            }
          }
        }
      },
```


### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios
This commit is contained in:
Julia Bardi 2024-03-18 11:33:18 +01:00 committed by GitHub
parent 300b8ee368
commit e34a6d54e6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 104 additions and 1 deletions

View file

@ -107,6 +107,13 @@ interface AgentBase {
tags?: string[];
components?: FleetServerAgentComponent[];
agent?: FleetServerAgentMetadata;
unhealthy_reason?: UnhealthyReason[];
}
/**
 * Reason categories reported for an unhealthy agent.
 *
 * The string values are what is stored in the `unhealthy_reason` field of an
 * agent document, so they must not be renamed.
 */
export enum UnhealthyReason {
  /** Stored as `'input'`. */
  INPUT = 'input',
  /** Stored as `'output'`. */
  OUTPUT = 'output',
  /** Stored as `'other'`. */
  OTHER = 'other',
}
export interface AgentMetrics {
@ -336,6 +343,11 @@ export interface FleetServerAgent {
* Outputs map
*/
outputs?: OutputMap;
/**
* Unhealthy reason: input, output, other
*/
unhealthy_reason?: UnhealthyReason[];
}
/**

View file

@ -92,6 +92,7 @@ export function searchHitToAgent(
// key-value pairs
user_provided_metadata: hit._source?.user_provided_metadata!,
local_metadata: hit._source?.local_metadata!,
unhealthy_reason: hit._source?.unhealthy_reason,
};
if (!hit.fields?.status?.length) {

View file

@ -71,6 +71,22 @@ describe('fetchAgentMetrics', () => {
},
],
},
unhealthy_reason: {
buckets: [
{
key: 'input',
doc_count: 2,
},
{
key: 'output',
doc_count: 1,
},
{
key: 'other',
doc_count: 3,
},
],
},
},
});
@ -95,6 +111,11 @@ describe('fetchAgentMetrics', () => {
scheduled: 0,
watching: 0,
},
unhealthy_reason: {
input: 2,
output: 1,
other: 3,
},
});
});
});

View file

@ -20,6 +20,7 @@ export interface AgentMetrics {
agents: AgentUsage;
agents_per_version: AgentPerVersion[];
upgrading_step: UpgradingSteps;
unhealthy_reason: UnhealthyReason;
}
export interface UpgradingSteps {
@ -34,6 +35,12 @@ export interface UpgradingSteps {
failed: number;
}
/**
 * Agent counts broken down by unhealthy reason, as published in the agent
 * metrics under `unhealthy_reason`.
 */
export interface UnhealthyReason {
  /** Count for the `input` reason. */
  input: number;
  /** Count for the `output` reason. */
  output: number;
  /** Count for the `other` reason. */
  other: number;
}
export const fetchAgentMetrics = async (
core: CoreSetup,
abortController: AbortController
@ -63,6 +70,7 @@ export const fetchAgentMetrics = async (
agents: await getAgentUsage(soClient, esClient),
agents_per_version: await getAgentsPerVersion(esClient, abortController),
upgrading_step: await getUpgradingSteps(esClient, abortController),
unhealthy_reason: await getUnhealthyReason(esClient, abortController),
};
return usage;
};
@ -195,3 +203,53 @@ export const getUpgradingSteps = async (
return upgradingSteps;
}
};
/**
 * Counts agents per unhealthy reason by running a terms aggregation on the
 * `unhealthy_reason` field of the agents index.
 *
 * @param esClient Elasticsearch client used to run the search.
 * @param abortController Its signal is forwarded to the search request.
 * @returns Counts for `input`, `output` and `other`. A reason with no bucket
 * stays at 0, and all counts are 0 when the search fails with a 404.
 * @throws Rethrows any search failure whose `statusCode` is not 404.
 */
export const getUnhealthyReason = async (
  esClient: ElasticsearchClient,
  abortController: AbortController
): Promise<UnhealthyReason> => {
  const unhealthyReason: UnhealthyReason = { input: 0, output: 0, other: 0 };

  try {
    const response = await retryTransientEsErrors(() =>
      esClient.search(
        {
          index: AGENTS_INDEX,
          // Only the aggregation is needed, not the matching documents.
          size: 0,
          aggs: {
            unhealthy_reason: {
              terms: { field: 'unhealthy_reason' },
            },
          },
        },
        { signal: abortController.signal }
      )
    );

    // Describe only the part of the terms aggregation that is read here,
    // instead of opting out of type checking with `any`.
    const aggregation = response?.aggregations?.unhealthy_reason as
      | { buckets?: Array<{ key: unknown; doc_count: number }> }
      | undefined;

    for (const bucket of aggregation?.buckets ?? []) {
      switch (bucket.key) {
        case 'input':
          unhealthyReason.input = bucket.doc_count;
          break;
        case 'output':
          unhealthyReason.output = bucket.doc_count;
          break;
        case 'other':
          unhealthyReason.other = bucket.doc_count;
          break;
        default:
          // Keys outside the three known reasons are not reported.
          break;
      }
    }
  } catch (error: unknown) {
    // Narrow before reading `statusCode` so that a non-object rejection is
    // rethrown as-is instead of being masked by a TypeError.
    const statusCode =
      typeof error === 'object' && error !== null
        ? (error as { statusCode?: unknown }).statusCode
        : undefined;

    if (statusCode !== 404) {
      throw error;
    }
    // A 404 is tolerated: the metrics fall back to zero counts.
    appContextService.getLogger().debug('Index .fleet-agents does not exist yet.');
  }

  return unhealthyReason;
};

View file

@ -104,6 +104,11 @@ describe('fleet metrics task', () => {
count: 2,
},
],
unhealthy_reason: {
input: 2,
output: 1,
other: 3,
},
});
});
@ -149,6 +154,11 @@ describe('fleet metrics task', () => {
scheduled: 1,
requested: 1,
},
unhealthy_reason: {
input: 2,
output: 1,
other: 3,
},
},
},
}),

View file

@ -20,7 +20,7 @@ import { appContextService } from '../app_context';
import type { AgentMetrics } from './fetch_agent_metrics';
export const TYPE = 'Fleet-Metrics-Task';
export const VERSION = '1.0.0';
export const VERSION = '1.1.0';
const TITLE = 'Fleet Metrics Task';
const TIMEOUT = '1m';
const SCOPE = ['fleet'];
@ -115,6 +115,7 @@ export class FleetMetricsTask {
unhealthy: agents.unhealthy,
inactive: agents.inactive,
upgrading_step: agentMetrics.upgrading_step,
unhealthy_reason: agentMetrics.unhealthy_reason,
},
},
};