[Stack Monitoring] fix ccr read_exceptions alert (#153888)

## Summary

Closes https://github.com/elastic/kibana/issues/153298

## Testing
- Set up CCR ([see how
to](https://github.com/elastic/kibana/blob/main/x-pack/plugins/monitoring/dev_docs/how_to/running_components_from_source.md#multi-cluster-tests-for-ccrccs-or-listing)).
I used two cloud clusters and am happy to provide credentials to reviewers so
they can skip the setup.
- Set up CCR between two clusters, create a follower index and replicate
some data
- Intentionally break the remote cluster connection on the follower
cluster (update the cluster endpoint). At this point read_exceptions
should appear in ccr documents
- Start local stack
- The local Elasticsearch should include [this
change](https://github.com/elastic/elasticsearch/pull/94875). See [how to
run components from
source](https://github.com/elastic/kibana/blob/main/x-pack/plugins/monitoring/dev_docs/how_to/running_components_from_source.md#single-cluster-testing)
- Enable stack monitoring default rules
- Start metricbeat collection of the follower cluster
- Ensure the ccr alert triggers with metricbeat 7.x (this won't work against
a cloud cluster because its license is not supported in 7.x; support
was added later, and I'll see if we can [backport this
change](https://github.com/elastic/beats/pull/34105))
- Ensure ccr alert triggers with metricbeat 8.x (build should include
[this change](https://github.com/elastic/beats/pull/34957))
This commit is contained in:
Kevin Lacabane 2023-04-03 18:17:53 +02:00 committed by GitHub
parent ed56403817
commit d1e5dbc5c9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 209 additions and 6 deletions

View file

@ -29,12 +29,14 @@ describe('fetchCCReadExceptions', () => {
},
},
};
const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser;
esClient.search.mockResponse(
// @ts-expect-error not full response interface
esRes
);
it('should call ES with correct query', async () => {
const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser;
esClient.search.mockResponse(
// @ts-expect-error not full response interface
esRes
);
await fetchCCRReadExceptions(esClient, 1643306331418, 1643309869056, 10000);
expect(esClient.search).toHaveBeenCalledWith({
index:
@ -125,7 +127,13 @@ describe('fetchCCReadExceptions', () => {
},
});
});
it('should call ES with correct query when ccs disabled', async () => {
const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser;
esClient.search.mockResponse(
// @ts-expect-error not full response interface
esRes
);
// @ts-ignore
Globals.app.config.ui.ccs.enabled = false;
let params = null;
@ -139,4 +147,199 @@ describe('fetchCCReadExceptions', () => {
// @ts-ignore
expect(params.index).toBe('.monitoring-es-*,metrics-elasticsearch.stack_monitoring.ccr-*');
});
// Verifies that fetchCCRReadExceptions maps a legacy-format document onto the
// flat result object asserted at the end of this test.
it('should return read exceptions from legacy documents', async () => {
// Mocked search response: one remote cluster bucket ('secondary') containing one
// follower index bucket ('foobar_follower') with a single hit.
const legacyRes = {
aggregations: {
remote_clusters: {
buckets: [
{
key: 'secondary',
doc_count: 21,
follower_indices: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{
key: 'foobar_follower',
doc_count: 21,
hits: {
hits: {
total: { value: 21, relation: 'eq' },
max_score: null,
hits: [
{
_index: '.monitoring-es-7-mb-2023.03.30',
_id: '0YmUM4cBxRuN6VWqFo3H',
_score: null,
// In this fixture the CCR fields live under `ccr_stats` with snake_case keys
// (`shard_id`, `read_exceptions`, `leader_index`) and `cluster_uuid` sits at
// the top level of _source.
_source: {
ccr_stats: {
shard_id: 0,
read_exceptions: [
{
retries: 1,
exception: {
reason:
'java.lang.IllegalArgumentException: unknown host [secondary.es.us-central1.gcp.cloud.es.ioe]',
caused_by: {
type: 'illegal_argument_exception',
reason:
'unknown host [secondary.es.us-central1.gcp.cloud.es.ioe]',
caused_by: {
type: 'unknown_host_exception',
reason: 'secondary.es.us-central1.gcp.cloud.es.ioe',
},
},
type: 'exception',
},
from_seq_no: 28,
},
],
leader_index: 'foobar',
},
cluster_uuid: 'jRHXRb4pSnySw_JEBv_dHg',
},
sort: [1680197555160],
},
],
},
},
},
],
},
},
],
},
},
};
// Fresh client per test so the mocked response does not leak between cases.
const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser;
esClient.search.mockResponse(
// @ts-expect-error not full response interface
legacyRes
);
const result = await fetchCCRReadExceptions(esClient, 1643306331418, 1643309869056, 10000);
// Expected mapping: bucket keys become remoteCluster / followerIndex, `leader_index`
// becomes leaderIndex, `shard_id` becomes shardId, and the `exception` object of the
// single read_exceptions entry becomes lastReadException.
// NOTE(review): `ccs: null` is presumably derived from the hit's _index having no
// remote-cluster prefix; confirm against the implementation.
expect(result).toStrictEqual([
{
clusterUuid: 'jRHXRb4pSnySw_JEBv_dHg',
remoteCluster: 'secondary',
followerIndex: 'foobar_follower',
leaderIndex: 'foobar',
shardId: 0,
lastReadException: {
type: 'exception',
reason:
'java.lang.IllegalArgumentException: unknown host [secondary.es.us-central1.gcp.cloud.es.ioe]',
caused_by: {
type: 'illegal_argument_exception',
reason: 'unknown host [secondary.es.us-central1.gcp.cloud.es.ioe]',
caused_by: {
type: 'unknown_host_exception',
reason: 'secondary.es.us-central1.gcp.cloud.es.ioe',
},
},
},
ccs: null,
},
]);
});
// Verifies that fetchCCRReadExceptions maps an ECS-format document (fields nested
// under `elasticsearch.*`) onto the same flat result shape.
it('should return read exceptions from ecs documents', async () => {
// Mocked search response: one remote cluster bucket ('secondary') containing one
// follower index bucket ('foobar_follower') with a single hit.
const ecsRes = {
aggregations: {
remote_clusters: {
buckets: [
{
key: 'secondary',
doc_count: 44,
follower_indices: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{
key: 'foobar_follower',
doc_count: 44,
hits: {
hits: {
total: { value: 44, relation: 'eq' },
max_score: null,
hits: [
{
_index: '.ds-.monitoring-es-8-mb-2023.03.30-000001',
_id: '6YmAM4cBxRuN6VWqx4Sg',
_score: null,
// In this fixture the cluster id is at `elasticsearch.cluster.id`, the CCR data
// is under `elasticsearch.ccr`, and the leader index is nested as `leader.index`.
// No shard id field is present in the document.
_source: {
elasticsearch: {
cluster: { id: 'jRHXRb4pSnySw_JEBv_dHg' },
ccr: {
read_exceptions: [
{
from_seq_no: 28,
retries: 1,
exception: {
type: 'exception',
reason:
'java.lang.IllegalArgumentException: unknown host [secondary.es.us-central1.gcp.cloud.es.ioe]',
caused_by: {
type: 'illegal_argument_exception',
reason:
'unknown host [secondary.es.us-central1.gcp.cloud.es.ioe]',
caused_by: {
type: 'unknown_host_exception',
reason: 'secondary.es.us-central1.gcp.cloud.es.ioe',
},
},
},
},
],
leader: { index: 'foobar' },
},
},
},
sort: [1680196289074],
},
],
},
},
},
],
},
},
],
},
},
};
// Fresh client per test so the mocked response does not leak between cases.
const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser;
esClient.search.mockResponse(
// @ts-expect-error not full response interface
ecsRes
);
const result = await fetchCCRReadExceptions(esClient, 1643306331418, 1643309869056, 10000);
// Expected mapping: `leader.index` becomes leaderIndex and `cluster.id` becomes
// clusterUuid. The fixture carries no shard id, so shardId is asserted as an
// explicit `undefined` (toStrictEqual distinguishes this from a missing key).
// NOTE(review): `ccs: null` is presumably derived from the hit's _index having no
// remote-cluster prefix; confirm against the implementation.
expect(result).toStrictEqual([
{
clusterUuid: 'jRHXRb4pSnySw_JEBv_dHg',
remoteCluster: 'secondary',
followerIndex: 'foobar_follower',
leaderIndex: 'foobar',
shardId: undefined,
lastReadException: {
type: 'exception',
reason:
'java.lang.IllegalArgumentException: unknown host [secondary.es.us-central1.gcp.cloud.es.ioe]',
caused_by: {
type: 'illegal_argument_exception',
reason: 'unknown host [secondary.es.us-central1.gcp.cloud.es.ioe]',
caused_by: {
type: 'unknown_host_exception',
reason: 'secondary.es.us-central1.gcp.cloud.es.ioe',
},
},
},
ccs: null,
},
]);
});
});

View file

@ -162,7 +162,7 @@ export async function fetchCCRReadExceptions(
const { read_exceptions: readExceptions, shard_id: shardId } = ccrStats;
const leaderIndex = ccrStats.leaderIndex || ccrStats.leader.index;
const leaderIndex = ccrStats.leader_index || ccrStats.leader.index;
const { exception: lastReadException } = readExceptions[readExceptions.length - 1];