Make TaskManager health status error/warning reasons more visible (#154045)

Resolves: #152289

With this PR we make some of the `debug` logs `warn` and return the
message to the health API as reason to add in the status summary
message.
This commit is contained in:
Ersin Erdal 2023-04-18 14:56:57 +02:00 committed by GitHub
parent dec97d4129
commit c76a68bd89
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 126 additions and 85 deletions

View file

@ -1,14 +0,0 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
const createCalculateHealthStatusMock = () => {
return jest.fn();
};
export const calculateHealthStatusMock = {
create: createCalculateHealthStatusMock,
};

View file

@ -16,7 +16,7 @@ export function calculateHealthStatus(
config: TaskManagerConfig,
shouldRunTasks: boolean,
logger: Logger
): HealthStatus {
): { status: HealthStatus; reason?: string } {
const now = Date.now();
// if "hot" health stats are any more stale than monitored_stats_required_freshness
@ -28,27 +28,35 @@ export function calculateHealthStatus(
const requiredColdStatsFreshness: number = config.monitored_aggregated_stats_refresh_rate * 1.5;
if (hasStatus(summarizedStats.stats, HealthStatus.Error)) {
return HealthStatus.Error;
return {
status: HealthStatus.Error,
reason: summarizedStats.stats.capacity_estimation?.reason,
};
}
// Hot timestamps look at runtime stats which are not available when tasks are not running
if (shouldRunTasks) {
if (hasExpiredHotTimestamps(summarizedStats, now, requiredHotStatsFreshness)) {
logger.debug('setting HealthStatus.Error because of expired hot timestamps');
return HealthStatus.Error;
const reason = 'setting HealthStatus.Error because of expired hot timestamps';
logger.warn(reason);
return { status: HealthStatus.Error, reason };
}
}
if (hasExpiredColdTimestamps(summarizedStats, now, requiredColdStatsFreshness)) {
logger.debug('setting HealthStatus.Error because of expired cold timestamps');
return HealthStatus.Error;
const reason = 'setting HealthStatus.Error because of expired cold timestamps';
logger.warn(reason);
return { status: HealthStatus.Error, reason };
}
if (hasStatus(summarizedStats.stats, HealthStatus.Warning)) {
return HealthStatus.Warning;
return {
status: HealthStatus.Warning,
reason: summarizedStats.stats.capacity_estimation?.reason,
};
}
return HealthStatus.OK;
return { status: HealthStatus.OK };
}
function hasStatus(stats: RawMonitoringStats['stats'], status: HealthStatus): boolean {

View file

@ -41,16 +41,30 @@ describe('logHealthMetrics', () => {
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
// We must change from OK to Warning
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.OK,
}));
logHealthMetrics(health, logger, config, true, docLinks);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.Warning,
}));
logHealthMetrics(health, logger, config, true, docLinks);
// We must change from OK to Error
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.OK,
}));
logHealthMetrics(health, logger, config, true, docLinks);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.Error,
}));
logHealthMetrics(health, logger, config, true, docLinks);
const debugCalls = (logger as jest.Mocked<Logger>).debug.mock.calls;
@ -175,9 +189,11 @@ describe('logHealthMetrics', () => {
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.Warning,
}));
logHealthMetrics(health, logger, config, true, docLinks);
@ -201,7 +217,11 @@ describe('logHealthMetrics', () => {
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.Error,
}));
logHealthMetrics(health, logger, config, true, docLinks);

View file

@ -40,12 +40,9 @@ export function logHealthMetrics(
capacity_estimation: undefined,
},
};
const statusWithoutCapacity = calculateHealthStatus(
healthWithoutCapacity,
config,
shouldRunTasks,
logger
);
const healthStatus = calculateHealthStatus(healthWithoutCapacity, config, shouldRunTasks, logger);
const statusWithoutCapacity = healthStatus?.status;
if (statusWithoutCapacity === HealthStatus.Warning) {
logLevel = LogLevel.Warn;
} else if (statusWithoutCapacity === HealthStatus.Error && !isEmpty(monitoredHealth.stats)) {

View file

@ -184,13 +184,14 @@ export function estimateCapacity(
averageCapacityUsedByNonRecurringAndEphemeralTasksPerKibana +
averageRecurringRequiredPerMinute / assumedKibanaInstances;
const status = getHealthStatus(logger, {
const { status, reason } = getHealthStatus(logger, {
assumedRequiredThroughputPerMinutePerKibana,
assumedAverageRecurringRequiredThroughputPerMinutePerKibana,
capacityPerMinutePerKibana,
});
return {
status,
reason,
timestamp: new Date().toISOString(),
value: {
observed: mapValues(
@ -231,27 +232,28 @@ interface GetHealthStatusParams {
capacityPerMinutePerKibana: number;
}
function getHealthStatus(logger: Logger, params: GetHealthStatusParams): HealthStatus {
function getHealthStatus(
logger: Logger,
params: GetHealthStatusParams
): { status: HealthStatus; reason?: string } {
const {
assumedRequiredThroughputPerMinutePerKibana,
assumedAverageRecurringRequiredThroughputPerMinutePerKibana,
capacityPerMinutePerKibana,
} = params;
if (assumedRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana) {
return HealthStatus.OK;
return { status: HealthStatus.OK };
}
if (assumedAverageRecurringRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana) {
logger.debug(
`setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) < capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`
);
return HealthStatus.Warning;
const reason = `setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) < capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`;
logger.warn(reason);
return { status: HealthStatus.Warning, reason };
}
logger.debug(
`setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (${assumedRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana}) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`
);
return HealthStatus.Error;
const reason = `setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (${assumedRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana}) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`;
logger.warn(reason);
return { status: HealthStatus.Error, reason };
}
export function withCapacityEstimate(

View file

@ -68,6 +68,7 @@ export interface MonitoredStat<T> {
}
export type RawMonitoredStat<T extends JsonObject> = MonitoredStat<T> & {
status: HealthStatus;
reason?: string;
};
export interface RawMonitoringStats {

View file

@ -375,24 +375,24 @@ describe('Task Run Statistics', () => {
{ Success: 40, RetryScheduled: 40, Failed: 20, status: 'OK' },
]);
expect(logger.debug).toHaveBeenCalledTimes(5);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenCalledTimes(5);
expect(logger.warn).toHaveBeenNthCalledWith(
1,
'Health Status warn threshold has been exceeded, resultFrequencySummary.Failed (40) is greater than warn_threshold (39)'
);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenNthCalledWith(
2,
'Health Status error threshold has been exceeded, resultFrequencySummary.Failed (60) is greater than error_threshold (59)'
);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenNthCalledWith(
3,
'Health Status error threshold has been exceeded, resultFrequencySummary.Failed (60) is greater than error_threshold (59)'
);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenNthCalledWith(
4,
'Health Status error threshold has been exceeded, resultFrequencySummary.Failed (60) is greater than error_threshold (59)'
);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenNthCalledWith(
5,
'Health Status warn threshold has been exceeded, resultFrequencySummary.Failed (40) is greater than warn_threshold (39)'
);

View file

@ -433,11 +433,11 @@ function getHealthStatus(
): HealthStatus {
if (resultFrequencySummary.Failed > executionErrorThreshold.warn_threshold) {
if (resultFrequencySummary.Failed > executionErrorThreshold.error_threshold) {
logger.debug(
logger.warn(
`Health Status error threshold has been exceeded, resultFrequencySummary.Failed (${resultFrequencySummary.Failed}) is greater than error_threshold (${executionErrorThreshold.error_threshold})`
);
} else {
logger.debug(
logger.warn(
`Health Status warn threshold has been exceeded, resultFrequencySummary.Failed (${resultFrequencySummary.Failed}) is greater than warn_threshold (${executionErrorThreshold.warn_threshold})`
);
}

View file

@ -15,15 +15,9 @@ import { mockHandlerArguments } from './_mock_handler_arguments';
import { sleep } from '../test_utils';
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { usageCountersServiceMock } from '@kbn/usage-collection-plugin/server/usage_counters/usage_counters_service.mock';
import {
HealthStatus,
MonitoringStats,
RawMonitoringStats,
summarizeMonitoringStats,
} from '../monitoring';
import { MonitoringStats, RawMonitoringStats, summarizeMonitoringStats } from '../monitoring';
import { ServiceStatusLevels, Logger } from '@kbn/core/server';
import { configSchema, TaskManagerConfig } from '../config';
import { calculateHealthStatusMock } from '../lib/calculate_health_status.mock';
import { FillPoolResult } from '../lib/fill_pool';
jest.mock('../lib/log_health_metrics', () => ({
@ -191,8 +185,6 @@ describe('healthRoute', () => {
it('logs the Task Manager stats at a fixed interval', async () => {
const router = httpServiceMock.createRouter();
const calculateHealthStatus = calculateHealthStatusMock.create();
calculateHealthStatus.mockImplementation(() => HealthStatus.OK);
const { logHealthMetrics } = jest.requireMock('../lib/log_health_metrics');
const mockStat = mockHealthStats();
@ -253,8 +245,6 @@ describe('healthRoute', () => {
it(`logs at a warn level if the status is warning`, async () => {
const router = httpServiceMock.createRouter();
const calculateHealthStatus = calculateHealthStatusMock.create();
calculateHealthStatus.mockImplementation(() => HealthStatus.Warning);
const { logHealthMetrics } = jest.requireMock('../lib/log_health_metrics');
const warnRuntimeStat = mockHealthStats();
@ -265,7 +255,7 @@ describe('healthRoute', () => {
const stats$ = new Subject<MonitoringStats>();
const id = uuidv4();
healthRoute({
const { serviceStatus$ } = healthRoute({
router,
monitoringStats$: stats$,
logger,
@ -287,6 +277,8 @@ describe('healthRoute', () => {
docLinks,
});
const serviceStatus = getLatest(serviceStatus$);
stats$.next(warnRuntimeStat);
await sleep(1001);
stats$.next(warnConfigurationStat);
@ -295,6 +287,12 @@ describe('healthRoute', () => {
await sleep(1001);
stats$.next(warnEphemeralStat);
expect(await serviceStatus).toMatchObject({
level: ServiceStatusLevels.degraded,
summary:
'Task Manager is unhealthy - Reason: setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (78.28472222222223) < capacityPerMinutePerKibana (200)',
});
expect(logHealthMetrics).toBeCalledTimes(4);
expect(logHealthMetrics.mock.calls[0][0]).toMatchObject({
id,
@ -332,8 +330,6 @@ describe('healthRoute', () => {
it(`logs at an error level if the status is error`, async () => {
const router = httpServiceMock.createRouter();
const calculateHealthStatus = calculateHealthStatusMock.create();
calculateHealthStatus.mockImplementation(() => HealthStatus.Error);
const { logHealthMetrics } = jest.requireMock('../lib/log_health_metrics');
const errorRuntimeStat = mockHealthStats();
@ -344,7 +340,7 @@ describe('healthRoute', () => {
const stats$ = new Subject<MonitoringStats>();
const id = uuidv4();
healthRoute({
const { serviceStatus$ } = healthRoute({
router,
monitoringStats$: stats$,
logger,
@ -366,6 +362,8 @@ describe('healthRoute', () => {
docLinks,
});
const serviceStatus = getLatest(serviceStatus$);
stats$.next(errorRuntimeStat);
await sleep(1001);
stats$.next(errorConfigurationStat);
@ -374,6 +372,12 @@ describe('healthRoute', () => {
await sleep(1001);
stats$.next(errorEphemeralStat);
expect(await serviceStatus).toMatchObject({
level: ServiceStatusLevels.degraded,
summary:
'Task Manager is unhealthy - Reason: setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (78.28472222222223) < capacityPerMinutePerKibana (200)',
});
expect(logHealthMetrics).toBeCalledTimes(4);
expect(logHealthMetrics.mock.calls[0][0]).toMatchObject({
id,
@ -481,12 +485,13 @@ describe('healthRoute', () => {
expect(await serviceStatus).toMatchObject({
level: ServiceStatusLevels.degraded,
summary: 'Task Manager is unhealthy',
summary:
'Task Manager is unhealthy - Reason: setting HealthStatus.Error because of expired hot timestamps',
});
const debugCalls = (logger as jest.Mocked<Logger>).debug.mock.calls as string[][];
const warnCalls = (logger as jest.Mocked<Logger>).warn.mock.calls as string[][];
const warnMessage =
/^setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana/;
const found = debugCalls
const found = warnCalls
.map((arr) => arr[0])
.find((message) => message.match(warnMessage) != null);
expect(found).toMatch(warnMessage);
@ -497,7 +502,7 @@ describe('healthRoute', () => {
const stats$ = new Subject<MonitoringStats>();
healthRoute({
const { serviceStatus$ } = healthRoute({
router,
monitoringStats$: stats$,
logger,
@ -514,6 +519,8 @@ describe('healthRoute', () => {
docLinks,
});
const serviceStatus = getLatest(serviceStatus$);
await sleep(0);
const lastUpdateOfWorkload = new Date(Date.now() - 120000).toISOString();
@ -533,6 +540,11 @@ describe('healthRoute', () => {
await sleep(2000);
expect(await serviceStatus).toMatchObject({
level: ServiceStatusLevels.degraded,
summary:
'Task Manager is unhealthy - Reason: setting HealthStatus.Error because of expired cold timestamps',
});
expect(await handler(context, req, res)).toMatchObject({
body: {
status: 'error',
@ -572,7 +584,7 @@ describe('healthRoute', () => {
const router = httpServiceMock.createRouter();
const stats$ = new Subject<MonitoringStats>();
healthRoute({
const { serviceStatus$ } = healthRoute({
router,
monitoringStats$: stats$,
logger,
@ -588,7 +600,7 @@ describe('healthRoute', () => {
shouldRunTasks: true,
docLinks,
});
const serviceStatus = getLatest(serviceStatus$);
await sleep(0);
// eslint-disable-next-line @typescript-eslint/naming-convention
@ -611,6 +623,11 @@ describe('healthRoute', () => {
const [context, req, res] = mockHandlerArguments({}, {}, ['ok']);
expect(await serviceStatus).toMatchObject({
level: ServiceStatusLevels.degraded,
summary:
'Task Manager is unhealthy - Reason: setting HealthStatus.Error because of expired hot timestamps',
});
expect(await handler(context, req, res)).toMatchObject({
body: {
status: 'error',

View file

@ -30,6 +30,7 @@ import { calculateHealthStatus } from '../lib/calculate_health_status';
export type MonitoredHealth = RawMonitoringStats & {
id: string;
reason?: string;
status: HealthStatus;
timestamp: string;
};
@ -87,10 +88,15 @@ export function healthRoute(params: HealthRouteParams): {
function getHealthStatus(monitoredStats: MonitoringStats) {
const summarizedStats = summarizeMonitoringStats(logger, monitoredStats, config);
const status = calculateHealthStatus(summarizedStats, config, shouldRunTasks, logger);
const { status, reason } = calculateHealthStatus(
summarizedStats,
config,
shouldRunTasks,
logger
);
const now = Date.now();
const timestamp = new Date(now).toISOString();
return { id: taskManagerId, timestamp, status, ...summarizedStats };
return { id: taskManagerId, timestamp, status, reason, ...summarizedStats };
}
const serviceStatus$: Subject<TaskManagerServiceStatus> = new Subject<TaskManagerServiceStatus>();
@ -106,7 +112,7 @@ export function healthRoute(params: HealthRouteParams): {
tap((stats) => {
lastMonitoredStats = stats;
}),
// Only calculate the summerized stats (calculates all runnign averages and evaluates state)
// Only calculate the summarized stats (calculates all running averages and evaluates state)
// when needed by throttling down to the requiredHotStatsFreshness
map((stats) => withServiceStatus(getHealthStatus(stats)))
)
@ -174,15 +180,19 @@ export function healthRoute(params: HealthRouteParams): {
export function withServiceStatus(
monitoredHealth: MonitoredHealth
): [MonitoredHealth, TaskManagerServiceStatus] {
const { reason, status } = monitoredHealth;
const level =
monitoredHealth.status === HealthStatus.OK
? ServiceStatusLevels.available
: ServiceStatusLevels.degraded;
status === HealthStatus.OK ? ServiceStatusLevels.available : ServiceStatusLevels.degraded;
const defaultMessage = LEVEL_SUMMARY[level.toString()];
const summary = reason ? `${defaultMessage} - Reason: ${reason}` : defaultMessage;
return [
monitoredHealth,
{
level,
summary: LEVEL_SUMMARY[level.toString()],
summary,
},
];
}