[8.12] [monitoring] Revert CPU Usage rule changes (#172913) (#172959)

# Backport

This will backport the following commits from `main` to `8.12`:
- [[monitoring] Revert CPU Usage rule changes
(#172913)](https://github.com/elastic/kibana/pull/172913)

<!--- Backport version: 8.9.7 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"Milton
Hultgren","email":"milton.hultgren@elastic.co"},"sourceCommit":{"committedDate":"2023-12-08T15:25:23Z","message":"[monitoring]
Revert CPU Usage rule changes (#172913)\n\nReverts
https://github.com/elastic/kibana/pull/159351\r\nReverts
https://github.com/elastic/kibana/pull/167244\r\n\r\nDue to the many
unexpected issues that these changes introduced we've\r\ndecided to
revert these changes until we have better solutions for the\r\nproblems
we've learnt about.\r\n\r\nProblems:\r\n- Gaps in data cause alerts to
fire (see next point)\r\n- Normal CPU rescaling causes alerts to
fire\r\nhttps://github.com/elastic/kibana/issues/160905\r\n- Any error
fires an alert (since there is no other way to inform the\r\nuser about
the problems faced by the rule executor)\r\n- Many assumptions about
cgroups only being for container users are\r\nwrong\r\n\r\nTo address
some of these issues we also need more functionality in the\r\nalerting
framework to be able to register secondary actions so that we\r\nmay
trigger non-oncall workflows for when a rule faces issues
with\r\nevaluating the stats.\r\n\r\nOriginal issue
https://github.com/elastic/kibana/issues/116128","sha":"55bc6d505977e8831633cc76e0f46b2ca66ef559","branchLabelMapping":{"^v8.13.0$":"main","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:fix","backport:prev-minor","v8.12.0","v8.13.0"],"number":172913,"url":"https://github.com/elastic/kibana/pull/172913","mergeCommit":{"message":"[monitoring]
Revert CPU Usage rule changes (#172913)\n\nReverts
https://github.com/elastic/kibana/pull/159351\r\nReverts
https://github.com/elastic/kibana/pull/167244\r\n\r\nDue to the many
unexpected issues that these changes introduced we've\r\ndecided to
revert these changes until we have better solutions for the\r\nproblems
we've learnt about.\r\n\r\nProblems:\r\n- Gaps in data cause alerts to
fire (see next point)\r\n- Normal CPU rescaling causes alerts to
fire\r\nhttps://github.com/elastic/kibana/issues/160905\r\n- Any error
fires an alert (since there is no other way to inform the\r\nuser about
the problems faced by the rule executor)\r\n- Many assumptions about
cgroups only being for container users are\r\nwrong\r\n\r\nTo address
some of these issues we also need more functionality in the\r\nalerting
framework to be able to register secondary actions so that we\r\nmay
trigger non-oncall workflows for when a rule faces issues
with\r\nevaluating the stats.\r\n\r\nOriginal issue
https://github.com/elastic/kibana/issues/116128","sha":"55bc6d505977e8831633cc76e0f46b2ca66ef559"}},"sourceBranch":"main","suggestedTargetBranches":["8.12"],"targetPullRequestStates":[{"branch":"8.12","label":"v8.12.0","labelRegex":"^v(\\d+).(\\d+).\\d+$","isSourceBranch":false,"state":"NOT_CREATED"},{"branch":"main","label":"v8.13.0","labelRegex":"^v8.13.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/172913","number":172913,"mergeCommit":{"message":"[monitoring]
Revert CPU Usage rule changes (#172913)\n\nReverts
https://github.com/elastic/kibana/pull/159351\r\nReverts
https://github.com/elastic/kibana/pull/167244\r\n\r\nDue to the many
unexpected issues that these changes introduced we've\r\ndecided to
revert these changes until we have better solutions for the\r\nproblems
we've learnt about.\r\n\r\nProblems:\r\n- Gaps in data cause alerts to
fire (see next point)\r\n- Normal CPU rescaling causes alerts to
fire\r\nhttps://github.com/elastic/kibana/issues/160905\r\n- Any error
fires an alert (since there is no other way to inform the\r\nuser about
the problems faced by the rule executor)\r\n- Many assumptions about
cgroups only being for container users are\r\nwrong\r\n\r\nTo address
some of these issues we also need more functionality in the\r\nalerting
framework to be able to register secondary actions so that we\r\nmay
trigger non-oncall workflows for when a rule faces issues
with\r\nevaluating the stats.\r\n\r\nOriginal issue
https://github.com/elastic/kibana/issues/116128","sha":"55bc6d505977e8831633cc76e0f46b2ca66ef559"}}]}]
BACKPORT-->

Co-authored-by: Milton Hultgren <milton.hultgren@elastic.co>
This commit is contained in:
Kibana Machine 2023-12-08 11:46:31 -05:00 committed by GitHub
parent 8506d96358
commit b79c4b3db9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 466 additions and 1453 deletions

View file

@ -169,9 +169,10 @@ export interface AlertNodeStats {
}
export interface AlertCpuUsageNodeStats extends AlertNodeStats {
cpuUsage?: number;
limitsChanged?: boolean;
unexpectedLimits?: boolean;
cpuUsage: number;
containerUsage: number;
containerPeriods: number;
containerQuota: number;
}
export interface AlertThreadPoolRejectionsStats {

View file

@ -42,7 +42,7 @@ describe('CpuUsageRule', () => {
expect(rule.ruleOptions.throttle).toBe('1d');
expect(rule.ruleOptions.defaultParams).toStrictEqual({ threshold: 85, duration: '5m' });
expect(rule.ruleOptions.actionVariables).toStrictEqual([
{ name: 'node', description: 'The node reporting high CPU usage.' },
{ name: 'node', description: 'The node reporting high cpu usage.' },
{
name: 'internalShortMessage',
description: 'The short internal message generated by Elastic.',
@ -114,7 +114,7 @@ describe('CpuUsageRule', () => {
getState.mockReset();
});
it('should fire actions when threshold is exceeded', async () => {
it('should fire actions', async () => {
const rule = new CpuUsageRule();
const type = rule.getRuleType();
await type.executor({
@ -122,7 +122,6 @@ describe('CpuUsageRule', () => {
params: rule.ruleOptions.defaultParams,
} as any);
const count = 1;
const threshold = rule.ruleOptions.defaultParams?.threshold;
expect(replaceState).toHaveBeenCalledWith({
alertStates: [
{
@ -135,14 +134,13 @@ describe('CpuUsageRule', () => {
cpuUsage,
nodeId,
nodeName,
threshold,
},
nodeId,
nodeName,
ui: {
isFiring: true,
message: {
text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`,
text: `Node #start_link${nodeName}#end_link is reporting cpu usage of ${cpuUsage}% at #absolute`,
nextSteps: [
{
text: '#start_linkCheck hot threads#end_link',
@ -170,12 +168,6 @@ describe('CpuUsageRule', () => {
},
],
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'link',
url: 'elasticsearch/nodes/myNodeId',
},
{
startToken: '#absolute',
type: 'time',
@ -183,6 +175,12 @@ describe('CpuUsageRule', () => {
isRelative: false,
timestamp: 1,
},
{
startToken: '#start_link',
endToken: '#end_link',
type: 'link',
url: 'elasticsearch/nodes/myNodeId',
},
],
},
severity: 'danger',
@ -193,10 +191,10 @@ describe('CpuUsageRule', () => {
],
});
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`,
internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. Verify CPU level of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
actionPlain: 'Verify CPU usage of node.',
actionPlain: 'Verify CPU level of node.',
clusterName,
count,
nodes: `${nodeName}:${cpuUsage}`,
@ -244,10 +242,10 @@ describe('CpuUsageRule', () => {
} as any);
const count = 1;
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:${ccs}))`,
internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`,
internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:${ccs}))`,
internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. Verify CPU level of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:testCluster))`,
actionPlain: 'Verify CPU usage of node.',
actionPlain: 'Verify CPU level of node.',
clusterName,
count,
nodes: `${nodeName}:${cpuUsage}`,
@ -255,324 +253,5 @@ describe('CpuUsageRule', () => {
state: 'firing',
});
});
it('should fire actions when resource limits are missing', async () => {
(fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => {
return [stat];
});
const rule = new CpuUsageRule();
const type = rule.getRuleType();
await type.executor({
...executorOptions,
params: rule.ruleOptions.defaultParams,
} as any);
const count = 1;
const threshold = rule.ruleOptions.defaultParams?.threshold;
expect(replaceState).toHaveBeenCalledWith({
alertStates: [
{
ccs: undefined,
cluster: { clusterUuid, clusterName },
cpuUsage,
itemLabel: undefined,
meta: {
clusterUuid,
cpuUsage,
nodeId,
nodeName,
threshold,
},
nodeId,
nodeName,
ui: {
isFiring: true,
message: {
text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`,
nextSteps: [
{
text: '#start_linkCheck hot threads#end_link',
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'docLink',
partialUrl:
'{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/cluster-nodes-hot-threads.html',
},
],
},
{
text: '#start_linkCheck long running tasks#end_link',
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'docLink',
partialUrl:
'{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html',
},
],
},
],
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'link',
url: 'elasticsearch/nodes/myNodeId',
},
{
startToken: '#absolute',
type: 'time',
isAbsolute: true,
isRelative: false,
timestamp: 1,
},
],
},
severity: 'danger',
triggeredMS: 1,
lastCheckedMS: 0,
},
},
],
});
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
actionPlain: 'Verify CPU usage of node.',
clusterName,
count,
nodes: `${nodeName}:${cpuUsage}`,
node: `${nodeName}:${cpuUsage}`,
state: 'firing',
});
});
it('should fire actions when resource limits have changed', async () => {
(fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => {
return [
{
...stat,
limitsChanged: true,
},
];
});
const rule = new CpuUsageRule();
const type = rule.getRuleType();
await type.executor({
...executorOptions,
params: rule.ruleOptions.defaultParams,
} as any);
const count = 1;
const threshold = rule.ruleOptions.defaultParams?.threshold;
expect(replaceState).toHaveBeenCalledWith({
alertStates: [
{
ccs: undefined,
cluster: { clusterUuid, clusterName },
cpuUsage,
itemLabel: undefined,
meta: {
clusterUuid,
cpuUsage,
nodeId,
nodeName,
threshold,
limitsChanged: true,
},
nodeId,
nodeName,
ui: {
isFiring: true,
message: {
text: 'Resource limits for node #start_linkmyNodeName#end_link has changed within the look back window, unable to confidently calculate CPU usage for alerting. Please monitor the usage until the window has moved. Last checked at #absolute',
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'link',
url: 'elasticsearch/nodes/myNodeId',
},
{
startToken: '#absolute',
type: 'time',
isAbsolute: true,
isRelative: false,
timestamp: 1,
},
],
},
severity: 'danger',
triggeredMS: 1,
lastCheckedMS: 0,
},
},
],
});
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
actionPlain: 'Verify CPU usage of node.',
clusterName,
count,
nodes: `${nodeName}:${cpuUsage}`,
node: `${nodeName}:${cpuUsage}`,
state: 'firing',
});
});
it('should fire actions when resource limits are set but not expected', async () => {
(fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => {
return [
{
...stat,
unexpectedLimits: true,
},
];
});
const rule = new CpuUsageRule();
const type = rule.getRuleType();
await type.executor({
...executorOptions,
params: rule.ruleOptions.defaultParams,
} as any);
const count = 1;
const threshold = rule.ruleOptions.defaultParams?.threshold;
expect(replaceState).toHaveBeenCalledWith({
alertStates: [
{
ccs: undefined,
cluster: { clusterUuid, clusterName },
cpuUsage,
itemLabel: undefined,
meta: {
clusterUuid,
cpuUsage,
nodeId,
nodeName,
threshold,
unexpectedLimits: true,
},
nodeId,
nodeName,
ui: {
isFiring: true,
message: {
text: `Kibana is configured for non-containerized workloads but node #start_linkmyNodeName#end_link has resource limits configured. Node reports usage of ${cpuUsage}%. Last checked at #absolute`,
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'link',
url: 'elasticsearch/nodes/myNodeId',
},
{
startToken: '#absolute',
type: 'time',
isAbsolute: true,
isRelative: false,
timestamp: 1,
},
],
},
severity: 'danger',
triggeredMS: 1,
lastCheckedMS: 0,
},
},
],
});
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
actionPlain: 'Verify CPU usage of node.',
clusterName,
count,
nodes: `${nodeName}:${cpuUsage}`,
node: `${nodeName}:${cpuUsage}`,
state: 'firing',
});
});
it('should fire actions when it fails to calculate CPU usage', async () => {
(fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => {
return [
{
...stat,
cpuUsage: undefined,
},
];
});
const rule = new CpuUsageRule();
const type = rule.getRuleType();
await type.executor({
...executorOptions,
params: rule.ruleOptions.defaultParams,
} as any);
const count = 1;
const threshold = rule.ruleOptions.defaultParams?.threshold;
expect(replaceState).toHaveBeenCalledWith({
alertStates: [
{
ccs: undefined,
cluster: { clusterUuid, clusterName },
cpuUsage: undefined,
itemLabel: undefined,
meta: {
clusterUuid,
cpuUsage: undefined,
nodeId,
nodeName,
threshold,
},
nodeId,
nodeName,
ui: {
isFiring: true,
message: {
text: 'Failed to compute CPU usage for node #start_linkmyNodeName#end_link. Please check the Kibana logs for more details. Last checked at #absolute',
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'link',
url: 'elasticsearch/nodes/myNodeId',
},
{
startToken: '#absolute',
type: 'time',
isAbsolute: true,
isRelative: false,
timestamp: 1,
},
],
},
severity: 'warning',
triggeredMS: 1,
lastCheckedMS: 0,
},
},
],
});
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
actionPlain: 'Verify CPU usage of node.',
clusterName,
count,
nodes: `${nodeName}:undefined`,
node: `${nodeName}:undefined`,
state: 'firing',
});
});
});
});

View file

@ -11,7 +11,6 @@ import { ElasticsearchClient } from '@kbn/core/server';
import { Alert } from '@kbn/alerting-plugin/server';
import { RawAlertInstance, SanitizedRule } from '@kbn/alerting-plugin/common';
import { parseDuration } from '@kbn/alerting-plugin/common/parse_duration';
import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types';
import { BaseRule } from './base_rule';
import {
AlertData,
@ -47,7 +46,7 @@ export class CpuUsageRule extends BaseRule {
{
name: 'node',
description: i18n.translate('xpack.monitoring.alerts.cpuUsage.actionVariables.node', {
defaultMessage: 'The node reporting high CPU usage.',
defaultMessage: 'The node reporting high cpu usage.',
}),
},
...Object.values(AlertingDefaults.ALERT_TYPE.context),
@ -63,52 +62,28 @@ export class CpuUsageRule extends BaseRule {
const duration = parseDuration(params.duration);
const endMs = +new Date();
const startMs = endMs - duration;
let filterQuery;
if (params.filterQuery) {
try {
filterQuery = JSON.parse(params.filterQuery) as QueryDslQueryContainer;
} catch (error) {
throw new Error(`Failed to parse filter query in CPU usage rule ${error}`);
}
}
const stats = await fetchCpuUsageNodeStats(
{
esClient,
clusterUuids: clusters.map((cluster) => cluster.clusterUuid),
startMs,
endMs,
filterQuery,
logger: this.scopedLogger,
},
Globals.app.config
esClient,
clusters,
startMs,
endMs,
Globals.app.config.ui.max_bucket_size,
params.filterQuery
);
return stats.map((stat) => ({
clusterUuid: stat.clusterUuid,
...this.outcomeAndSeverity(stat, params.threshold!),
meta: {
...stat,
threshold: params.threshold!,
},
ccs: stat.ccs,
}));
}
private outcomeAndSeverity(
stat: AlertCpuUsageNodeStats,
threshold: number
): { shouldFire: boolean; severity: AlertSeverity } {
if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
let severity = AlertSeverity.Warning;
if (stat.cpuUsage && stat.cpuUsage > threshold) {
severity = AlertSeverity.Danger;
return stats.map((stat) => {
if (Globals.app.config.ui.container.elasticsearch.enabled) {
stat.cpuUsage =
(stat.containerUsage / (stat.containerPeriods * stat.containerQuota * 1000)) * 100;
}
return { shouldFire: true, severity };
}
return { shouldFire: stat.cpuUsage > threshold, severity: AlertSeverity.Danger };
return {
clusterUuid: stat.clusterUuid,
shouldFire: stat.cpuUsage > params.threshold!,
severity: AlertSeverity.Danger,
meta: stat,
ccs: stat.ccs,
};
});
}
protected filterAlertInstance(alertInstance: RawAlertInstance, filters: CommonAlertFilter[]) {
@ -127,67 +102,13 @@ export class CpuUsageRule extends BaseRule {
}
protected getUiMessage(alertState: AlertState, item: AlertData): AlertMessage {
const stat = item.meta as AlertCpuUsageNodeStats & Pick<CommonAlertParams, 'threshold'>;
const tokens = [
{
startToken: '#start_link',
endToken: '#end_link',
type: AlertMessageTokenType.Link,
url: `elasticsearch/nodes/${stat.nodeId}`,
} as AlertMessageLinkToken,
{
startToken: '#absolute',
type: AlertMessageTokenType.Time,
isAbsolute: true,
isRelative: false,
timestamp: alertState.ui.triggeredMS,
} as AlertMessageTimeToken,
];
if (stat.unexpectedLimits) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', {
defaultMessage: `Kibana is configured for non-containerized workloads but node #start_link{nodeName}#end_link has resource limits configured. Node reports usage of {cpuUsage}%. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
},
}),
tokens,
};
}
if (stat.limitsChanged) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.limitsChanged', {
defaultMessage: `Resource limits for node #start_link{nodeName}#end_link has changed within the look back window, unable to confidently calculate CPU usage for alerting. Please monitor the usage until the window has moved. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
},
}),
tokens,
};
}
if (stat.cpuUsage === undefined) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage', {
defaultMessage: `Failed to compute CPU usage for node #start_link{nodeName}#end_link. Please check the Kibana logs for more details. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
},
}),
tokens,
};
}
const stat = item.meta as AlertCpuUsageNodeStats;
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.firingMessage', {
defaultMessage: `Node #start_link{nodeName}#end_link is reporting CPU usage of {cpuUsage}% which is above the configured threshold of {threshold}%. Last checked at #absolute`,
defaultMessage: `Node #start_link{nodeName}#end_link is reporting cpu usage of {cpuUsage}% at #absolute`,
values: {
nodeName: stat.nodeName,
cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
threshold: stat.threshold,
},
}),
nextSteps: [
@ -204,7 +125,21 @@ export class CpuUsageRule extends BaseRule {
`{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html`
),
],
tokens,
tokens: [
{
startToken: '#absolute',
type: AlertMessageTokenType.Time,
isAbsolute: true,
isRelative: false,
timestamp: alertState.ui.triggeredMS,
} as AlertMessageTimeToken,
{
startToken: '#start_link',
endToken: '#end_link',
type: AlertMessageTokenType.Link,
url: `elasticsearch/nodes/${stat.nodeId}`,
} as AlertMessageLinkToken,
],
};
}
@ -222,7 +157,7 @@ export class CpuUsageRule extends BaseRule {
return;
}
const shortActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.shortAction', {
defaultMessage: 'Verify CPU usage of node.',
defaultMessage: 'Verify CPU level of node.',
});
const fullActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.fullAction', {
defaultMessage: 'View node',
@ -234,8 +169,28 @@ export class CpuUsageRule extends BaseRule {
ccs
);
const action = `[${fullActionText}](${globalStateLink})`;
const internalShortMessage = this.getMessage(firingNode, cluster.clusterName, shortActionText);
const internalFullMessage = this.getMessage(firingNode, cluster.clusterName, action);
const internalShortMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage',
{
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {shortActionText}`,
values: {
clusterName: cluster.clusterName,
nodeName: firingNode.nodeName,
shortActionText,
},
}
);
const internalFullMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage',
{
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {action}`,
values: {
clusterName: cluster.clusterName,
nodeName: firingNode.nodeName,
action,
},
}
);
instance.scheduleActions('default', {
internalShortMessage,
internalFullMessage: Globals.app.isCloud ? internalShortMessage : internalFullMessage,
@ -251,28 +206,4 @@ export class CpuUsageRule extends BaseRule {
actionPlain: shortActionText,
});
}
private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) {
const stat = state.meta as AlertCpuUsageNodeStats;
if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', {
defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`,
values: {
clusterName,
nodeName: state.nodeName,
action,
},
});
}
return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessage', {
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster {clusterName}. {action}`,
values: {
clusterName,
nodeName: state.nodeName,
action,
},
});
}
}

View file

@ -1,247 +0,0 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
exports[`fetchCpuUsageNodeStats when running in a container calculates the containerized CPU usage 1`] = `
Object {
"aggs": Object {
"clusters": Object {
"aggs": Object {
"nodes": Object {
"aggs": Object {
"average_cpu_usage_percent": Object {
"avg": Object {
"field": "node_stats.process.cpu.percent",
},
},
"index": Object {
"terms": Object {
"field": "_index",
"size": 1,
},
},
"max_periods": Object {
"max": Object {
"field": "node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods",
},
},
"max_usage_nanos": Object {
"max": Object {
"field": "node_stats.os.cgroup.cpuacct.usage_nanos",
},
},
"min_periods": Object {
"min": Object {
"field": "node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods",
},
},
"min_usage_nanos": Object {
"min": Object {
"field": "node_stats.os.cgroup.cpuacct.usage_nanos",
},
},
"name": Object {
"terms": Object {
"field": "source_node.name",
"size": 1,
},
},
"quota_micros_max": Object {
"max": Object {
"field": "node_stats.os.cgroup.cpu.cfs_quota_micros",
},
},
"quota_micros_min": Object {
"min": Object {
"field": "node_stats.os.cgroup.cpu.cfs_quota_micros",
},
},
},
"terms": Object {
"field": "node_stats.node_id",
"size": 10,
},
},
},
"terms": Object {
"field": "cluster_uuid",
"size": 10,
},
},
},
"filter_path": Array [
"aggregations",
],
"index": ".monitoring-es-*,metrics-elasticsearch.stack_monitoring.node_stats-*",
"query": Object {
"bool": Object {
"filter": Array [
Object {
"bool": Object {
"minimum_should_match": 1,
"should": Array [
Object {
"term": Object {
"type": "node_stats",
},
},
Object {
"term": Object {
"metricset.name": "node_stats",
},
},
Object {
"term": Object {
"data_stream.dataset": "elasticsearch.stack_monitoring.node_stats",
},
},
],
},
},
Object {
"terms": Object {
"cluster_uuid": Array [
"my-test-cluster",
],
},
},
Object {
"range": Object {
"timestamp": Object {
"format": "epoch_millis",
"gte": 0,
"lte": 10,
},
},
},
Object {
"bool": Object {
"minimum_should_match": 1,
"should": Array [
Object {
"term": Object {
"cluster_uuid": Object {
"value": "my-test-cluster",
},
},
},
],
},
},
],
},
},
"size": 0,
}
`;
exports[`fetchCpuUsageNodeStats when running outside a container calculates the CPU usage 1`] = `
Object {
"aggs": Object {
"clusters": Object {
"aggs": Object {
"nodes": Object {
"aggs": Object {
"average_cpu": Object {
"avg": Object {
"field": "node_stats.process.cpu.percent",
},
},
"index": Object {
"terms": Object {
"field": "_index",
"size": 1,
},
},
"name": Object {
"terms": Object {
"field": "source_node.name",
"size": 1,
},
},
"quota_micros_max": Object {
"max": Object {
"field": "node_stats.os.cgroup.cpu.cfs_quota_micros",
},
},
"quota_micros_min": Object {
"min": Object {
"field": "node_stats.os.cgroup.cpu.cfs_quota_micros",
},
},
},
"terms": Object {
"field": "node_stats.node_id",
"size": 10,
},
},
},
"terms": Object {
"field": "cluster_uuid",
"size": 10,
},
},
},
"filter_path": Array [
"aggregations",
],
"index": ".monitoring-es-*,metrics-elasticsearch.stack_monitoring.node_stats-*",
"query": Object {
"bool": Object {
"filter": Array [
Object {
"bool": Object {
"minimum_should_match": 1,
"should": Array [
Object {
"term": Object {
"type": "node_stats",
},
},
Object {
"term": Object {
"metricset.name": "node_stats",
},
},
Object {
"term": Object {
"data_stream.dataset": "elasticsearch.stack_monitoring.node_stats",
},
},
],
},
},
Object {
"terms": Object {
"cluster_uuid": Array [
"my-test-cluster",
],
},
},
Object {
"range": Object {
"timestamp": Object {
"format": "epoch_millis",
"gte": 0,
"lte": 10,
},
},
},
Object {
"bool": Object {
"minimum_should_match": 1,
"should": Array [
Object {
"term": Object {
"cluster_uuid": Object {
"value": "my-test-cluster",
},
},
},
],
},
},
],
},
},
"size": 0,
}
`;

View file

@ -5,74 +5,63 @@
* 2.0.
*/
import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { elasticsearchClientMock } from '@kbn/core-elasticsearch-client-server-mocks';
import { loggerMock } from '@kbn/logging-mocks';
import { fetchCpuUsageNodeStats } from './fetch_cpu_usage_node_stats';
jest.mock('../../static_globals', () => ({
Globals: {
app: {
config: {
ui: {
ccs: { enabled: true },
},
},
},
},
}));
describe('fetchCpuUsageNodeStats', () => {
describe('when running outside a container', () => {
const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser;
const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser;
const clusters = [
{
clusterUuid: 'abc123',
clusterName: 'test',
},
];
const startMs = 0;
const endMs = 0;
const size = 10;
const configSlice: any = {
ui: {
ccs: { enabled: false },
container: {
elasticsearch: {
enabled: false,
},
},
max_bucket_size: 10,
},
};
const filterQuery = {
bool: {
should: [
{
term: {
cluster_uuid: {
value: 'my-test-cluster',
},
},
},
],
minimum_should_match: 1,
},
};
it('calculates the CPU usage', async () => {
esClient.search.mockResponse({
it('fetch normal stats', async () => {
esClient.search.mockResponse(
// @ts-expect-error not full response interface
{
aggregations: {
clusters: {
buckets: [
{
key: 'my-test-cluster',
key: clusters[0].clusterUuid,
nodes: {
buckets: [
{
key: 'my-test-node',
average_cpu: {
value: 45,
},
quota_micros_max: {
value: null,
},
quota_micros_min: {
value: null,
key: 'theNodeId',
index: {
buckets: [
{
key: '.monitoring-es-TODAY',
},
],
},
name: {
buckets: [
{
key: 'test-node',
key: 'theNodeName',
},
],
},
index: {
buckets: [
{
key: 'a-local-index',
},
],
average_cpu: {
value: 10,
},
},
],
@ -81,377 +70,217 @@ describe('fetchCpuUsageNodeStats', () => {
],
},
},
} as any);
const stats = await fetchCpuUsageNodeStats(
{
esClient,
clusterUuids: ['my-test-cluster'],
startMs: 0,
endMs: 10,
filterQuery,
logger: loggerMock.create(),
},
configSlice
);
expect(stats).toEqual([
{
clusterUuid: 'my-test-cluster',
nodeId: 'my-test-node',
nodeName: 'test-node',
ccs: undefined,
cpuUsage: 45,
unexpectedLimits: false,
},
]);
// If this check fails, it means the query has changed which `might` mean the response shape has changed and
// the test data needs to be updated to reflect the new format.
expect(esClient.search.mock.calls[0][0]).toMatchSnapshot();
});
it('warns about container metrics being present', async () => {
esClient.search.mockResponse({
aggregations: {
clusters: {
buckets: [
{
key: 'my-test-cluster',
nodes: {
buckets: [
{
key: 'my-test-node',
average_cpu: {
value: 45,
},
quota_micros_max: {
value: 2000,
},
quota_micros_min: {
value: 2000,
},
name: {
buckets: [
{
key: 'test-node',
},
],
},
index: {
buckets: [
{
key: 'a-local-index',
},
],
},
},
],
},
},
],
},
},
} as any);
const stats = await fetchCpuUsageNodeStats(
{
esClient,
clusterUuids: ['my-test-cluster'],
startMs: 0,
endMs: 10,
filterQuery,
logger: loggerMock.create(),
},
configSlice
);
expect(stats).toEqual([
{
unexpectedLimits: true,
clusterUuid: 'my-test-cluster',
nodeId: 'my-test-node',
nodeName: 'test-node',
ccs: undefined,
cpuUsage: 45,
},
]);
});
}
);
const result = await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size);
expect(result).toEqual([
{
clusterUuid: clusters[0].clusterUuid,
nodeName: 'theNodeName',
nodeId: 'theNodeId',
cpuUsage: 10,
containerUsage: undefined,
containerPeriods: undefined,
containerQuota: undefined,
ccs: null,
},
]);
});
describe('when running in a container', () => {
const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser;
const configSlice: any = {
ui: {
ccs: { enabled: false },
container: {
elasticsearch: {
enabled: true,
it('fetch container stats', async () => {
esClient.search.mockResponse(
// @ts-expect-error not full response interface
{
aggregations: {
clusters: {
buckets: [
{
key: clusters[0].clusterUuid,
nodes: {
buckets: [
{
key: 'theNodeId',
index: {
buckets: [
{
key: '.monitoring-es-TODAY',
},
],
},
name: {
buckets: [
{
key: 'theNodeName',
},
],
},
histo: {
buckets: [
null,
{
usage_deriv: {
normalized_value: 10,
},
periods_deriv: {
normalized_value: 5,
},
},
],
},
average_quota: {
value: 50,
},
},
],
},
},
],
},
},
max_bucket_size: 10,
}
);
const result = await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size);
expect(result).toEqual([
{
clusterUuid: clusters[0].clusterUuid,
nodeName: 'theNodeName',
nodeId: 'theNodeId',
cpuUsage: undefined,
containerUsage: 10,
containerPeriods: 5,
containerQuota: 50,
ccs: null,
},
};
]);
});
const filterQuery = {
bool: {
should: [
{
term: {
cluster_uuid: {
value: 'my-test-cluster',
it('fetch properly return ccs', async () => {
esClient.search.mockResponse(
// @ts-expect-error not full response interface
{
aggregations: {
clusters: {
buckets: [
{
key: clusters[0].clusterUuid,
nodes: {
buckets: [
{
key: 'theNodeId',
index: {
buckets: [
{
key: 'foo:.monitoring-es-TODAY',
},
],
},
name: {
buckets: [
{
key: 'theNodeName',
},
],
},
average_usage: {
value: 10,
},
average_periods: {
value: 5,
},
average_quota: {
value: 50,
},
},
],
},
},
],
},
},
}
);
const result = await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size);
expect(result[0].ccs).toBe('foo');
});
it('should use consistent params', async () => {
let params = null;
esClient.search.mockImplementation((...args) => {
params = args[0];
return Promise.resolve({} as estypes.SearchResponse);
});
const filterQuery =
'{"bool":{"should":[{"exists":{"field":"cluster_uuid"}}],"minimum_should_match":1}}';
await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size, filterQuery);
expect(params).toStrictEqual({
index:
'*:.monitoring-es-*,.monitoring-es-*,*:metrics-elasticsearch.stack_monitoring.node_stats-*,metrics-elasticsearch.stack_monitoring.node_stats-*',
filter_path: ['aggregations'],
body: {
size: 0,
query: {
bool: {
filter: [
{ terms: { cluster_uuid: ['abc123'] } },
{
bool: {
should: [
{ term: { type: 'node_stats' } },
{ term: { 'metricset.name': 'node_stats' } },
{
term: { 'data_stream.dataset': 'elasticsearch.stack_monitoring.node_stats' },
},
],
minimum_should_match: 1,
},
},
{ range: { timestamp: { format: 'epoch_millis', gte: 0, lte: 0 } } },
{
bool: { should: [{ exists: { field: 'cluster_uuid' } }], minimum_should_match: 1 },
},
],
},
},
aggs: {
clusters: {
terms: { field: 'cluster_uuid', size: 10, include: ['abc123'] },
aggs: {
nodes: {
terms: { field: 'node_stats.node_id', size: 10 },
aggs: {
index: { terms: { field: '_index', size: 1 } },
average_cpu: { avg: { field: 'node_stats.process.cpu.percent' } },
average_quota: { avg: { field: 'node_stats.os.cgroup.cpu.cfs_quota_micros' } },
name: { terms: { field: 'source_node.name', size: 1 } },
histo: {
date_histogram: { field: 'timestamp', fixed_interval: '0m' },
aggs: {
average_periods: {
max: { field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods' },
},
average_usage: { max: { field: 'node_stats.os.cgroup.cpuacct.usage_nanos' } },
usage_deriv: {
derivative: {
buckets_path: 'average_usage',
gap_policy: 'skip',
unit: '1s',
},
},
periods_deriv: {
derivative: {
buckets_path: 'average_periods',
gap_policy: 'skip',
unit: '1s',
},
},
},
},
},
},
},
},
],
minimum_should_match: 1,
},
},
};
it('calculates the containerized CPU usage', async () => {
// 45% CPU usage
const maxPeriods = 1000;
const quotaMicros = 100000;
const usageLimitNanos = maxPeriods * quotaMicros * 1000;
const maxUsageNanos = 0.45 * usageLimitNanos;
esClient.search.mockResponse({
aggregations: {
clusters: {
buckets: [
{
key: 'my-test-cluster',
nodes: {
buckets: [
{
key: 'my-test-node',
min_usage_nanos: {
value: 0,
},
max_usage_nanos: {
value: maxUsageNanos,
},
min_periods: {
value: 0,
},
max_periods: {
value: maxPeriods,
},
quota_micros_min: {
value: quotaMicros,
},
quota_micros_max: {
value: quotaMicros,
},
average_cpu_usage_percent: {
value: 45,
},
name: {
buckets: [
{
key: 'test-node',
},
],
},
index: {
buckets: [
{
key: 'a-local-index',
},
],
},
},
],
},
},
],
},
},
} as any);
const stats = await fetchCpuUsageNodeStats(
{
esClient,
clusterUuids: ['my-test-cluster'],
startMs: 0,
endMs: 10,
filterQuery,
logger: loggerMock.create(),
},
configSlice
);
expect(stats).toEqual([
{
clusterUuid: 'my-test-cluster',
nodeId: 'my-test-node',
nodeName: 'test-node',
ccs: undefined,
cpuUsage: 45,
},
]);
// If this check fails, it means the query has changed which `might` mean the response shape has changed and
// the test data needs to be updated to reflect the new format.
expect(esClient.search.mock.calls[0][0]).toMatchSnapshot();
});
it('warns about resource usage limits not being set', async () => {
esClient.search.mockResponse({
aggregations: {
clusters: {
buckets: [
{
key: 'my-test-cluster',
nodes: {
buckets: [
{
key: 'my-test-node',
min_usage_nanos: {
value: 0,
},
max_usage_nanos: {
value: 1000,
},
min_periods: {
value: 0,
},
max_periods: {
value: 100,
},
quota_micros_min: {
value: -1,
},
quota_micros_max: {
value: -1,
},
average_cpu_usage_percent: {
value: 45,
},
name: {
buckets: [
{
key: 'test-node',
},
],
},
index: {
buckets: [
{
key: 'a-local-index',
},
],
},
},
],
},
},
],
},
},
} as any);
const stats = await fetchCpuUsageNodeStats(
{
esClient,
clusterUuids: ['my-test-cluster'],
startMs: 0,
endMs: 10,
filterQuery,
logger: loggerMock.create(),
},
configSlice
);
expect(stats).toEqual([
{
clusterUuid: 'my-test-cluster',
nodeId: 'my-test-node',
nodeName: 'test-node',
ccs: undefined,
cpuUsage: 45,
},
]);
});
it('warns about resource usage limits being changed', async () => {
esClient.search.mockResponse({
aggregations: {
clusters: {
buckets: [
{
key: 'my-test-cluster',
nodes: {
buckets: [
{
key: 'my-test-node',
min_usage_nanos: {
value: 0,
},
max_usage_nanos: {
value: 1000,
},
min_periods: {
value: 0,
},
max_periods: {
value: 100,
},
quota_micros_min: {
value: -1,
},
quota_micros_max: {
value: 10000,
},
average_cpu_usage_percent: {
value: 45,
},
name: {
buckets: [
{
key: 'test-node',
},
],
},
index: {
buckets: [
{
key: 'a-local-index',
},
],
},
},
],
},
},
],
},
},
} as any);
const stats = await fetchCpuUsageNodeStats(
{
esClient,
clusterUuids: ['my-test-cluster'],
startMs: 0,
endMs: 10,
filterQuery,
logger: loggerMock.create(),
},
configSlice
);
expect(stats).toEqual([
{
limitsChanged: true,
clusterUuid: 'my-test-cluster',
nodeId: 'my-test-node',
nodeName: 'test-node',
ccs: undefined,
cpuUsage: undefined,
},
]);
});
});
});

View file

@ -5,136 +5,139 @@
* 2.0.
*/
import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types';
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { InferSearchResponseOf } from '@kbn/es-types';
import { CCS_REMOTE_PATTERN } from '../../../common/constants';
import { AlertCpuUsageNodeStats } from '../../../common/types/alerts';
import { MonitoringConfig } from '../../config';
import { getElasticsearchDataset, getIndexPatterns } from '../cluster/get_index_patterns';
import { ElasticsearchClient } from '@kbn/core/server';
import { get } from 'lodash';
import moment from 'moment';
import { NORMALIZED_DERIVATIVE_UNIT } from '../../../common/constants';
import { AlertCluster, AlertCpuUsageNodeStats } from '../../../common/types/alerts';
import { createDatasetFilter } from './create_dataset_query_filter';
import { getIndexPatterns, getElasticsearchDataset } from '../cluster/get_index_patterns';
import { Globals } from '../../static_globals';
import { CCS_REMOTE_PATTERN } from '../../../common/constants';
interface Options {
esClient: ElasticsearchClient;
clusterUuids: string[];
startMs: number;
endMs: number;
filterQuery?: QueryDslQueryContainer;
logger: Logger;
interface NodeBucketESResponse {
key: string;
average_cpu: { value: number };
}
interface ClusterBucketESResponse {
key: string;
nodes: {
buckets: NodeBucketESResponse[];
};
}
export async function fetchCpuUsageNodeStats(
options: Options,
config: MonitoringConfig
esClient: ElasticsearchClient,
clusters: AlertCluster[],
startMs: number,
endMs: number,
size: number,
filterQuery?: string
): Promise<AlertCpuUsageNodeStats[]> {
if (config.ui.container.elasticsearch.enabled) {
options.logger.debug('CPU usage rule: Computing usage for containerized clusters');
return fetchContainerStats(options, config);
}
// Using pure MS didn't seem to work well with the date_histogram interval
// but minutes does
const intervalInMinutes = moment.duration(endMs - startMs).asMinutes();
options.logger.debug('CPU usage rule: Computing usage for non-containerized clusters');
return fetchNonContainerStats(options, config);
}
async function fetchContainerStats(
{ esClient, startMs, endMs, clusterUuids, filterQuery }: Options,
config: MonitoringConfig
) {
const indexPatterns = getIndexPatterns({
config,
config: Globals.app.config,
moduleType: 'elasticsearch',
dataset: 'node_stats',
ccs: CCS_REMOTE_PATTERN,
});
const params = {
index: indexPatterns,
filter_path: ['aggregations'],
size: 0,
query: {
bool: {
filter: [
createDatasetFilter('node_stats', 'node_stats', getElasticsearchDataset('node_stats')),
{
terms: {
cluster_uuid: clusterUuids,
},
},
{
range: {
timestamp: {
format: 'epoch_millis',
gte: startMs,
lte: endMs,
body: {
size: 0,
query: {
bool: {
filter: [
{
terms: {
cluster_uuid: clusters.map((cluster) => cluster.clusterUuid),
},
},
},
],
},
},
aggs: {
clusters: {
terms: {
field: 'cluster_uuid',
size: config.ui.max_bucket_size,
createDatasetFilter('node_stats', 'node_stats', getElasticsearchDataset('node_stats')),
{
range: {
timestamp: {
format: 'epoch_millis',
gte: startMs,
lte: endMs,
},
},
},
],
},
aggs: {
nodes: {
terms: {
field: 'node_stats.node_id',
size: config.ui.max_bucket_size,
},
aggs: {
name: {
terms: {
field: 'source_node.name',
size: 1,
},
},
aggs: {
clusters: {
terms: {
field: 'cluster_uuid',
size,
include: clusters.map((cluster) => cluster.clusterUuid),
},
aggs: {
nodes: {
terms: {
field: 'node_stats.node_id',
size,
},
// Used to check for CCS and get the remote cluster name
index: {
terms: {
field: '_index',
size: 1,
aggs: {
index: {
terms: {
field: '_index',
size: 1,
},
},
},
// Fallback value in case container limits are not specified
average_cpu_usage_percent: {
avg: {
field: 'node_stats.process.cpu.percent',
average_cpu: {
avg: {
field: 'node_stats.process.cpu.percent',
},
},
},
// Container limit min and max, to calculate usage and detect config changes
quota_micros_max: {
max: {
field: 'node_stats.os.cgroup.cpu.cfs_quota_micros',
average_quota: {
avg: {
field: 'node_stats.os.cgroup.cpu.cfs_quota_micros',
},
},
},
quota_micros_min: {
min: {
field: 'node_stats.os.cgroup.cpu.cfs_quota_micros',
name: {
terms: {
field: 'source_node.name',
size: 1,
},
},
},
// Usage to calculate delta
max_usage_nanos: {
max: {
field: 'node_stats.os.cgroup.cpuacct.usage_nanos',
},
},
min_usage_nanos: {
min: {
field: 'node_stats.os.cgroup.cpuacct.usage_nanos',
},
},
// Periods to calculate delta
max_periods: {
max: {
field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods',
},
},
min_periods: {
min: {
field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods',
histo: {
date_histogram: {
field: 'timestamp',
fixed_interval: `${intervalInMinutes}m`,
},
aggs: {
average_periods: {
max: {
field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods',
},
},
average_usage: {
max: {
field: 'node_stats.os.cgroup.cpuacct.usage_nanos',
},
},
usage_deriv: {
derivative: {
buckets_path: 'average_usage',
gap_policy: 'skip' as const,
unit: NORMALIZED_DERIVATIVE_UNIT,
},
},
periods_deriv: {
derivative: {
buckets_path: 'average_periods',
gap_policy: 'skip' as const,
unit: NORMALIZED_DERIVATIVE_UNIT,
},
},
},
},
},
},
@ -144,211 +147,38 @@ async function fetchContainerStats(
},
};
if (filterQuery) {
(params.query!.bool!.filter! as QueryDslQueryContainer[]).push(filterQuery);
try {
if (filterQuery) {
const filterQueryObject = JSON.parse(filterQuery);
params.body.query.bool.filter.push(filterQueryObject);
}
} catch (e) {
// meh
}
const response = (await esClient.search<unknown>(params)) as unknown as InferSearchResponseOf<
unknown,
typeof params
>;
if (!response.aggregations) {
throw new Error('Failed to resolve needed aggregations for CPU Usage Rule');
}
return response.aggregations.clusters.buckets.flatMap((cluster) => {
return cluster.nodes.buckets.map((node): AlertCpuUsageNodeStats => {
let nodeName;
if (node.name.buckets.length) {
nodeName = node.name.buckets[0].key as string;
}
let ccs;
if (node.index.buckets.length) {
const index = node.index.buckets[0].key as string;
ccs = index.includes(':') ? index.split(':')[0] : undefined;
}
const nodeStats = {
clusterUuid: cluster.key as string,
nodeId: node.key as string,
nodeName,
ccs,
const response = await esClient.search(params);
const stats: AlertCpuUsageNodeStats[] = [];
const clusterBuckets = get(
response,
'aggregations.clusters.buckets',
[]
) as ClusterBucketESResponse[];
for (const clusterBucket of clusterBuckets) {
for (const node of clusterBucket.nodes.buckets) {
const lastBucket = get(node, 'histo.buckets[1]', {});
const indexName = get(node, 'index.buckets[0].key', '');
const stat = {
clusterUuid: clusterBucket.key,
nodeId: node.key,
nodeName: get(node, 'name.buckets[0].key'),
cpuUsage: get(node, 'average_cpu.value'),
containerUsage: get(lastBucket, 'usage_deriv.normalized_value'),
containerPeriods: get(lastBucket, 'periods_deriv.normalized_value'),
containerQuota: get(node, 'average_quota.value'),
ccs: indexName.includes(':') ? indexName.split(':')[0] : null,
};
const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1;
if (
limitsNotSet ||
node.max_usage_nanos.value === null ||
node.min_usage_nanos.value === null ||
node.max_periods.value === null ||
node.min_periods.value === null ||
node.quota_micros_max.value === null
) {
return {
...nodeStats,
cpuUsage: node.average_cpu_usage_percent.value ?? undefined,
};
}
if (node.quota_micros_min.value !== node.quota_micros_max.value) {
return {
...nodeStats,
limitsChanged: true,
cpuUsage: undefined,
};
}
const usageDeltaNanos = node.max_usage_nanos.value - node.min_usage_nanos.value;
const periodsDelta = node.max_periods.value - node.min_periods.value;
const cpuUsage = computeCfsPercentCpuUsage(
usageDeltaNanos,
node.quota_micros_max.value,
periodsDelta
);
return {
...nodeStats,
cpuUsage: Math.round(cpuUsage * 100) / 100,
};
});
});
}
function computeCfsPercentCpuUsage(usageNanos: number, quotaMicros: number, periods: number) {
// See https://github.com/elastic/kibana/pull/159351 for an explanation of this formula
const quotaNanos = quotaMicros * 1000;
const limitNanos = quotaNanos * periods;
const usageAsFactor = usageNanos / limitNanos;
return usageAsFactor * 100;
}
async function fetchNonContainerStats(
{ esClient, startMs, endMs, clusterUuids, filterQuery }: Options,
config: MonitoringConfig
) {
const indexPatterns = getIndexPatterns({
config,
moduleType: 'elasticsearch',
dataset: 'node_stats',
ccs: CCS_REMOTE_PATTERN,
});
const params = {
index: indexPatterns,
filter_path: ['aggregations'],
size: 0,
query: {
bool: {
filter: [
createDatasetFilter('node_stats', 'node_stats', getElasticsearchDataset('node_stats')),
{
terms: {
cluster_uuid: clusterUuids,
},
},
{
range: {
timestamp: {
format: 'epoch_millis',
gte: startMs,
lte: endMs,
},
},
},
],
},
},
aggs: {
clusters: {
terms: {
field: 'cluster_uuid',
size: config.ui.max_bucket_size,
},
aggs: {
nodes: {
terms: {
field: 'node_stats.node_id',
size: config.ui.max_bucket_size,
},
aggs: {
name: {
terms: {
field: 'source_node.name',
size: 1,
},
},
// Used to check for CCS and get the remote cluster name
index: {
terms: {
field: '_index',
size: 1,
},
},
average_cpu: {
avg: {
field: 'node_stats.process.cpu.percent',
},
},
// Container limit min and max, to detect possible config errors
quota_micros_max: {
max: {
field: 'node_stats.os.cgroup.cpu.cfs_quota_micros',
},
},
quota_micros_min: {
min: {
field: 'node_stats.os.cgroup.cpu.cfs_quota_micros',
},
},
},
},
},
},
},
};
if (filterQuery) {
(params.query!.bool!.filter! as QueryDslQueryContainer[]).push(filterQuery);
stats.push(stat);
}
}
const response = (await esClient.search<unknown>(params)) as unknown as InferSearchResponseOf<
unknown,
typeof params
>;
if (!response.aggregations) {
throw new Error('Failed to resolve needed aggregations for CPU Usage Rule');
}
return response.aggregations.clusters.buckets.flatMap((cluster) => {
return cluster.nodes.buckets.map((node): AlertCpuUsageNodeStats => {
let nodeName;
if (node.name.buckets.length) {
nodeName = node.name.buckets[0].key as string;
}
let ccs;
if (node.index.buckets.length) {
const index = node.index.buckets[0].key as string;
ccs = index.includes(':') ? index.split(':')[0] : undefined;
}
const runningInAContainerWithLimits =
(node.quota_micros_min.value !== null && node.quota_micros_min.value !== -1) ||
(node.quota_micros_max.value !== null && node.quota_micros_max.value !== -1);
return {
clusterUuid: cluster.key as string,
nodeId: node.key as string,
cpuUsage: node.average_cpu.value ?? undefined,
nodeName,
ccs,
unexpectedLimits: runningInAContainerWithLimits,
};
});
});
return stats;
}

View file

@ -41,7 +41,6 @@
"@kbn/shared-ux-router",
"@kbn/observability-shared-plugin",
"@kbn/shared-ux-link-redirect-app",
"@kbn/es-types",
"@kbn/logs-shared-plugin",
],
"exclude": [

View file

@ -27206,12 +27206,9 @@
"xpack.monitoring.alerts.clusterHealth.firing.internalShortMessage": "L'alerte d'intégrité de cluster se déclenche pour {clusterName}. L'intégrité actuelle est {health}. {actionText}",
"xpack.monitoring.alerts.clusterHealth.ui.firingMessage": "L'intégrité du cluster Elasticsearch est {health}.",
"xpack.monitoring.alerts.clusterHealth.ui.nextSteps.message1": "{message}. #start_linkView now#end_link",
"xpack.monitoring.alerts.cpuUsage.firing.internalMessage": "L'alerte d'utilisation CPU se déclenche pour le nœud {nodeName} dans les cluster {clusterName}. {action}",
"xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure": "L'alerte d'utilisation CPU pour le nœud {nodeName} dans le cluster {clusterName} a rencontré des problèmes lors de l'évaluation de l'utilisation. {action}",
"xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage": "Le calcul de l'utilisation du CPU pour le nœud #start_link{nodeName}#end_link a échoué. Pour en savoir plus, veuillez consulter les logs Kibana. Dernière vérification : #absolute",
"xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "Le nœud #start_link{nodeName}#end_link signale une utilisation du CPU de {cpuUsage} %, ce qui est supérieur au seuil configuré de {threshold} %. Dernière vérification : #absolute",
"xpack.monitoring.alerts.cpuUsage.ui.limitsChanged": "Les limites de ressources pour le nœud #start_link{nodeName}#end_link ont changé dans la fenêtre de visualisation. Impossible de calculer avec assurance l'utilisation du CPU pour les alertes. Veuillez monitorer l'utilisation jusqu'à ce que la fenêtre soit déplacée. Dernière vérification : #absolute",
"xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits": "Kibana est configuré pour les charges de travail non conteneurisées mais le nœud #start_link{nodeName}#end_link dispose de limites de ressources configurées. Le nœud signale une utilisation de {cpuUsage} %. Dernière vérification : #absolute",
"xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage": "L'alerte d'utilisation CPU se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {action}",
"xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage": "L'alerte d'utilisation CPU se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {shortActionText}",
"xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "Le nœud #start_link{nodeName}#end_link signale une utilisation CPU de {cpuUsage} % à #absolute",
"xpack.monitoring.alerts.diskUsage.firing.internalFullMessage": "L'alerte d'utilisation du disque se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {action}",
"xpack.monitoring.alerts.diskUsage.firing.internalShortMessage": "L'alerte d'utilisation du disque se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {shortActionText}",
"xpack.monitoring.alerts.diskUsage.ui.firingMessage": "Le nœud #start_link{nodeName}#end_link signale une utilisation du disque de {diskUsage} % à #absolute",

View file

@ -27206,12 +27206,9 @@
"xpack.monitoring.alerts.clusterHealth.firing.internalShortMessage": "クラスター正常性アラートが{clusterName}に対して作動しています。現在のヘルスは{health}です。{actionText}",
"xpack.monitoring.alerts.clusterHealth.ui.firingMessage": "Elasticsearchクラスターの正常性は{health}です。",
"xpack.monitoring.alerts.clusterHealth.ui.nextSteps.message1": "{message}. #start_linkView now#end_link",
"xpack.monitoring.alerts.cpuUsage.firing.internalMessage": "クラスター{clusterName}のノード{nodeName}について、CPU使用率のアラートが発生しています。{action}",
"xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure": "クラスター{clusterName}のノード{nodeName}のCPU使用率アラートでは、使用率の評価中に問題が発生しました。{action}",
"xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage": "ノード#start_link{nodeName}#end_linkのCPU使用率の計算に失敗しました。詳細については、Kibanaログを確認してください。最終確認 #absolute",
"xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "ノード#start_link{nodeName}#end_linkのCPU使用率が{cpuUsage}%で、設定されたしきい値{threshold}%を超えています。最終確認 #absolute",
"xpack.monitoring.alerts.cpuUsage.ui.limitsChanged": "ノード#start_link{nodeName}#end_linkのリソース制限がルックバックウィンドウ内で変更されたため、アラート用のCPU使用率を正確に計算できません。ウィンドウが移動するまで、使用状況を監視してください。最終確認 #absolute",
"xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits": "Kibanaはコンテナー化されていないワークロード用に構成されていますが、ード#start_link{nodeName}#end_linkにはリソース制限が設定されています。ードは使用率{cpuUsage}%を報告しています。最終確認 #absolute",
"xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage": "クラスター{clusterName}のノード{nodeName}について、CPU使用率のアラートが発生しています。{action}",
"xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage": "クラスター{clusterName}のノード{nodeName}について、CPU使用率のアラートが発生しています。{shortActionText}",
"xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "ノード#start_link{nodeName}#end_linkは、#absoluteでCPU使用率{cpuUsage}%を報告しています",
"xpack.monitoring.alerts.diskUsage.firing.internalFullMessage": "クラスター{clusterName}のノード{nodeName}について、ディスク使用率のアラートが発生しています。{action}",
"xpack.monitoring.alerts.diskUsage.firing.internalShortMessage": "クラスター{clusterName}のノード{nodeName}について、ディスク使用率のアラートが発生しています。{shortActionText}",
"xpack.monitoring.alerts.diskUsage.ui.firingMessage": "ノード#start_link{nodeName}#end_linkは、#absoluteでディスク使用率{diskUsage}%を報告しています",

View file

@ -27204,12 +27204,9 @@
"xpack.monitoring.alerts.clusterHealth.firing.internalShortMessage": "为 {clusterName} 触发了集群运行状况告警。当前运行状况为 {health}。{actionText}",
"xpack.monitoring.alerts.clusterHealth.ui.firingMessage": "Elasticsearch 集群运行状况为 {health}。",
"xpack.monitoring.alerts.clusterHealth.ui.nextSteps.message1": "{message}. #start_linkView now#end_link",
"xpack.monitoring.alerts.cpuUsage.firing.internalMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了 CPU 使用率告警。{action}",
"xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure": "评估使用率时,集群 {clusterName} 中节点 {nodeName} 的 CPU 使用率告警出现问题。{action}",
"xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage": "无法计算节点 #start_link{nodeName}#end_link 的 CPU 使用率。请检查 Kibana 日志了解更多详情。上次检查时间为 #absolute",
"xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "节点 #start_link{nodeName}#end_link 报告 CPU 使用率为 {cpuUsage}%,这超出了配置的阈值 {threshold}%。上次检查时间为 #absolute",
"xpack.monitoring.alerts.cpuUsage.ui.limitsChanged": "节点 #start_link{nodeName}#end_link 的资源限制已在回溯时间窗口内更改,无法放心用于计算 CPU 使用率以进行告警。请监测使用率,直到时间窗口已过去。上次检查时间为 #absolute",
"xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits": "已为非容器化工作负载配置 Kibana但节点 #start_link{nodeName}#end_link 具有配置的资源限制。节点报告使用率为 {cpuUsage}%。上次检查时间为 #absolute",
"xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了 CPU 使用率告警。{action}",
"xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了 CPU 使用率告警。{shortActionText}",
"xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "节点 #start_link{nodeName}#end_link 于 #absolute报告 cpu 使用率为 {cpuUsage}%",
"xpack.monitoring.alerts.diskUsage.firing.internalFullMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了磁盘使用率告警。{action}",
"xpack.monitoring.alerts.diskUsage.firing.internalShortMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了磁盘使用率告警。{shortActionText}",
"xpack.monitoring.alerts.diskUsage.ui.firingMessage": "节点 #start_link{nodeName}#end_link 于 #absolute 报告磁盘使用率为 {diskUsage}%",