mirror of
https://github.com/elastic/kibana.git
synced 2025-04-23 01:13:23 -04:00
Make TaskManager health status error/warning reasons more visible (#154045)
Resolves: #152289 With this PR we make some of the `debug` logs `warn` and return the message to the health API as reason to add in the status summary message.
This commit is contained in:
parent
dec97d4129
commit
c76a68bd89
10 changed files with 126 additions and 85 deletions
|
@ -1,14 +0,0 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License
|
||||
* 2.0; you may not use this file except in compliance with the Elastic License
|
||||
* 2.0.
|
||||
*/
|
||||
|
||||
const createCalculateHealthStatusMock = () => {
|
||||
return jest.fn();
|
||||
};
|
||||
|
||||
export const calculateHealthStatusMock = {
|
||||
create: createCalculateHealthStatusMock,
|
||||
};
|
|
@ -16,7 +16,7 @@ export function calculateHealthStatus(
|
|||
config: TaskManagerConfig,
|
||||
shouldRunTasks: boolean,
|
||||
logger: Logger
|
||||
): HealthStatus {
|
||||
): { status: HealthStatus; reason?: string } {
|
||||
const now = Date.now();
|
||||
|
||||
// if "hot" health stats are any more stale than monitored_stats_required_freshness
|
||||
|
@ -28,27 +28,35 @@ export function calculateHealthStatus(
|
|||
const requiredColdStatsFreshness: number = config.monitored_aggregated_stats_refresh_rate * 1.5;
|
||||
|
||||
if (hasStatus(summarizedStats.stats, HealthStatus.Error)) {
|
||||
return HealthStatus.Error;
|
||||
return {
|
||||
status: HealthStatus.Error,
|
||||
reason: summarizedStats.stats.capacity_estimation?.reason,
|
||||
};
|
||||
}
|
||||
|
||||
// Hot timestamps look at runtime stats which are not available when tasks are not running
|
||||
if (shouldRunTasks) {
|
||||
if (hasExpiredHotTimestamps(summarizedStats, now, requiredHotStatsFreshness)) {
|
||||
logger.debug('setting HealthStatus.Error because of expired hot timestamps');
|
||||
return HealthStatus.Error;
|
||||
const reason = 'setting HealthStatus.Error because of expired hot timestamps';
|
||||
logger.warn(reason);
|
||||
return { status: HealthStatus.Error, reason };
|
||||
}
|
||||
}
|
||||
|
||||
if (hasExpiredColdTimestamps(summarizedStats, now, requiredColdStatsFreshness)) {
|
||||
logger.debug('setting HealthStatus.Error because of expired cold timestamps');
|
||||
return HealthStatus.Error;
|
||||
const reason = 'setting HealthStatus.Error because of expired cold timestamps';
|
||||
logger.warn(reason);
|
||||
return { status: HealthStatus.Error, reason };
|
||||
}
|
||||
|
||||
if (hasStatus(summarizedStats.stats, HealthStatus.Warning)) {
|
||||
return HealthStatus.Warning;
|
||||
return {
|
||||
status: HealthStatus.Warning,
|
||||
reason: summarizedStats.stats.capacity_estimation?.reason,
|
||||
};
|
||||
}
|
||||
|
||||
return HealthStatus.OK;
|
||||
return { status: HealthStatus.OK };
|
||||
}
|
||||
|
||||
function hasStatus(stats: RawMonitoringStats['stats'], status: HealthStatus): boolean {
|
||||
|
|
|
@ -41,16 +41,30 @@ describe('logHealthMetrics', () => {
|
|||
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
|
||||
|
||||
// We must change from OK to Warning
|
||||
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
|
||||
(
|
||||
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
|
||||
).mockImplementation(() => ({
|
||||
status: HealthStatus.OK,
|
||||
}));
|
||||
logHealthMetrics(health, logger, config, true, docLinks);
|
||||
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
|
||||
() => HealthStatus.Warning
|
||||
);
|
||||
(
|
||||
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
|
||||
).mockImplementation(() => ({
|
||||
status: HealthStatus.Warning,
|
||||
}));
|
||||
logHealthMetrics(health, logger, config, true, docLinks);
|
||||
// We must change from OK to Error
|
||||
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
|
||||
(
|
||||
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
|
||||
).mockImplementation(() => ({
|
||||
status: HealthStatus.OK,
|
||||
}));
|
||||
logHealthMetrics(health, logger, config, true, docLinks);
|
||||
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
|
||||
(
|
||||
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
|
||||
).mockImplementation(() => ({
|
||||
status: HealthStatus.Error,
|
||||
}));
|
||||
logHealthMetrics(health, logger, config, true, docLinks);
|
||||
|
||||
const debugCalls = (logger as jest.Mocked<Logger>).debug.mock.calls;
|
||||
|
@ -175,9 +189,11 @@ describe('logHealthMetrics', () => {
|
|||
});
|
||||
const health = getMockMonitoredHealth();
|
||||
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
|
||||
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
|
||||
() => HealthStatus.Warning
|
||||
);
|
||||
(
|
||||
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
|
||||
).mockImplementation(() => ({
|
||||
status: HealthStatus.Warning,
|
||||
}));
|
||||
|
||||
logHealthMetrics(health, logger, config, true, docLinks);
|
||||
|
||||
|
@ -201,7 +217,11 @@ describe('logHealthMetrics', () => {
|
|||
});
|
||||
const health = getMockMonitoredHealth();
|
||||
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
|
||||
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
|
||||
(
|
||||
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
|
||||
).mockImplementation(() => ({
|
||||
status: HealthStatus.Error,
|
||||
}));
|
||||
|
||||
logHealthMetrics(health, logger, config, true, docLinks);
|
||||
|
||||
|
|
|
@ -40,12 +40,9 @@ export function logHealthMetrics(
|
|||
capacity_estimation: undefined,
|
||||
},
|
||||
};
|
||||
const statusWithoutCapacity = calculateHealthStatus(
|
||||
healthWithoutCapacity,
|
||||
config,
|
||||
shouldRunTasks,
|
||||
logger
|
||||
);
|
||||
const healthStatus = calculateHealthStatus(healthWithoutCapacity, config, shouldRunTasks, logger);
|
||||
|
||||
const statusWithoutCapacity = healthStatus?.status;
|
||||
if (statusWithoutCapacity === HealthStatus.Warning) {
|
||||
logLevel = LogLevel.Warn;
|
||||
} else if (statusWithoutCapacity === HealthStatus.Error && !isEmpty(monitoredHealth.stats)) {
|
||||
|
|
|
@ -184,13 +184,14 @@ export function estimateCapacity(
|
|||
averageCapacityUsedByNonRecurringAndEphemeralTasksPerKibana +
|
||||
averageRecurringRequiredPerMinute / assumedKibanaInstances;
|
||||
|
||||
const status = getHealthStatus(logger, {
|
||||
const { status, reason } = getHealthStatus(logger, {
|
||||
assumedRequiredThroughputPerMinutePerKibana,
|
||||
assumedAverageRecurringRequiredThroughputPerMinutePerKibana,
|
||||
capacityPerMinutePerKibana,
|
||||
});
|
||||
return {
|
||||
status,
|
||||
reason,
|
||||
timestamp: new Date().toISOString(),
|
||||
value: {
|
||||
observed: mapValues(
|
||||
|
@ -231,27 +232,28 @@ interface GetHealthStatusParams {
|
|||
capacityPerMinutePerKibana: number;
|
||||
}
|
||||
|
||||
function getHealthStatus(logger: Logger, params: GetHealthStatusParams): HealthStatus {
|
||||
function getHealthStatus(
|
||||
logger: Logger,
|
||||
params: GetHealthStatusParams
|
||||
): { status: HealthStatus; reason?: string } {
|
||||
const {
|
||||
assumedRequiredThroughputPerMinutePerKibana,
|
||||
assumedAverageRecurringRequiredThroughputPerMinutePerKibana,
|
||||
capacityPerMinutePerKibana,
|
||||
} = params;
|
||||
if (assumedRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana) {
|
||||
return HealthStatus.OK;
|
||||
return { status: HealthStatus.OK };
|
||||
}
|
||||
|
||||
if (assumedAverageRecurringRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana) {
|
||||
logger.debug(
|
||||
`setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) < capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`
|
||||
);
|
||||
return HealthStatus.Warning;
|
||||
const reason = `setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) < capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`;
|
||||
logger.warn(reason);
|
||||
return { status: HealthStatus.Warning, reason };
|
||||
}
|
||||
|
||||
logger.debug(
|
||||
`setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (${assumedRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana}) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`
|
||||
);
|
||||
return HealthStatus.Error;
|
||||
const reason = `setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (${assumedRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana}) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`;
|
||||
logger.warn(reason);
|
||||
return { status: HealthStatus.Error, reason };
|
||||
}
|
||||
|
||||
export function withCapacityEstimate(
|
||||
|
|
|
@ -68,6 +68,7 @@ export interface MonitoredStat<T> {
|
|||
}
|
||||
export type RawMonitoredStat<T extends JsonObject> = MonitoredStat<T> & {
|
||||
status: HealthStatus;
|
||||
reason?: string;
|
||||
};
|
||||
|
||||
export interface RawMonitoringStats {
|
||||
|
|
|
@ -375,24 +375,24 @@ describe('Task Run Statistics', () => {
|
|||
{ Success: 40, RetryScheduled: 40, Failed: 20, status: 'OK' },
|
||||
]);
|
||||
|
||||
expect(logger.debug).toHaveBeenCalledTimes(5);
|
||||
expect(logger.debug).toHaveBeenNthCalledWith(
|
||||
expect(logger.warn).toHaveBeenCalledTimes(5);
|
||||
expect(logger.warn).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
'Health Status warn threshold has been exceeded, resultFrequencySummary.Failed (40) is greater than warn_threshold (39)'
|
||||
);
|
||||
expect(logger.debug).toHaveBeenNthCalledWith(
|
||||
expect(logger.warn).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
'Health Status error threshold has been exceeded, resultFrequencySummary.Failed (60) is greater than error_threshold (59)'
|
||||
);
|
||||
expect(logger.debug).toHaveBeenNthCalledWith(
|
||||
expect(logger.warn).toHaveBeenNthCalledWith(
|
||||
3,
|
||||
'Health Status error threshold has been exceeded, resultFrequencySummary.Failed (60) is greater than error_threshold (59)'
|
||||
);
|
||||
expect(logger.debug).toHaveBeenNthCalledWith(
|
||||
expect(logger.warn).toHaveBeenNthCalledWith(
|
||||
4,
|
||||
'Health Status error threshold has been exceeded, resultFrequencySummary.Failed (60) is greater than error_threshold (59)'
|
||||
);
|
||||
expect(logger.debug).toHaveBeenNthCalledWith(
|
||||
expect(logger.warn).toHaveBeenNthCalledWith(
|
||||
5,
|
||||
'Health Status warn threshold has been exceeded, resultFrequencySummary.Failed (40) is greater than warn_threshold (39)'
|
||||
);
|
||||
|
|
|
@ -433,11 +433,11 @@ function getHealthStatus(
|
|||
): HealthStatus {
|
||||
if (resultFrequencySummary.Failed > executionErrorThreshold.warn_threshold) {
|
||||
if (resultFrequencySummary.Failed > executionErrorThreshold.error_threshold) {
|
||||
logger.debug(
|
||||
logger.warn(
|
||||
`Health Status error threshold has been exceeded, resultFrequencySummary.Failed (${resultFrequencySummary.Failed}) is greater than error_threshold (${executionErrorThreshold.error_threshold})`
|
||||
);
|
||||
} else {
|
||||
logger.debug(
|
||||
logger.warn(
|
||||
`Health Status warn threshold has been exceeded, resultFrequencySummary.Failed (${resultFrequencySummary.Failed}) is greater than warn_threshold (${executionErrorThreshold.warn_threshold})`
|
||||
);
|
||||
}
|
||||
|
|
|
@ -15,15 +15,9 @@ import { mockHandlerArguments } from './_mock_handler_arguments';
|
|||
import { sleep } from '../test_utils';
|
||||
import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
|
||||
import { usageCountersServiceMock } from '@kbn/usage-collection-plugin/server/usage_counters/usage_counters_service.mock';
|
||||
import {
|
||||
HealthStatus,
|
||||
MonitoringStats,
|
||||
RawMonitoringStats,
|
||||
summarizeMonitoringStats,
|
||||
} from '../monitoring';
|
||||
import { MonitoringStats, RawMonitoringStats, summarizeMonitoringStats } from '../monitoring';
|
||||
import { ServiceStatusLevels, Logger } from '@kbn/core/server';
|
||||
import { configSchema, TaskManagerConfig } from '../config';
|
||||
import { calculateHealthStatusMock } from '../lib/calculate_health_status.mock';
|
||||
import { FillPoolResult } from '../lib/fill_pool';
|
||||
|
||||
jest.mock('../lib/log_health_metrics', () => ({
|
||||
|
@ -191,8 +185,6 @@ describe('healthRoute', () => {
|
|||
|
||||
it('logs the Task Manager stats at a fixed interval', async () => {
|
||||
const router = httpServiceMock.createRouter();
|
||||
const calculateHealthStatus = calculateHealthStatusMock.create();
|
||||
calculateHealthStatus.mockImplementation(() => HealthStatus.OK);
|
||||
const { logHealthMetrics } = jest.requireMock('../lib/log_health_metrics');
|
||||
|
||||
const mockStat = mockHealthStats();
|
||||
|
@ -253,8 +245,6 @@ describe('healthRoute', () => {
|
|||
|
||||
it(`logs at a warn level if the status is warning`, async () => {
|
||||
const router = httpServiceMock.createRouter();
|
||||
const calculateHealthStatus = calculateHealthStatusMock.create();
|
||||
calculateHealthStatus.mockImplementation(() => HealthStatus.Warning);
|
||||
const { logHealthMetrics } = jest.requireMock('../lib/log_health_metrics');
|
||||
|
||||
const warnRuntimeStat = mockHealthStats();
|
||||
|
@ -265,7 +255,7 @@ describe('healthRoute', () => {
|
|||
const stats$ = new Subject<MonitoringStats>();
|
||||
|
||||
const id = uuidv4();
|
||||
healthRoute({
|
||||
const { serviceStatus$ } = healthRoute({
|
||||
router,
|
||||
monitoringStats$: stats$,
|
||||
logger,
|
||||
|
@ -287,6 +277,8 @@ describe('healthRoute', () => {
|
|||
docLinks,
|
||||
});
|
||||
|
||||
const serviceStatus = getLatest(serviceStatus$);
|
||||
|
||||
stats$.next(warnRuntimeStat);
|
||||
await sleep(1001);
|
||||
stats$.next(warnConfigurationStat);
|
||||
|
@ -295,6 +287,12 @@ describe('healthRoute', () => {
|
|||
await sleep(1001);
|
||||
stats$.next(warnEphemeralStat);
|
||||
|
||||
expect(await serviceStatus).toMatchObject({
|
||||
level: ServiceStatusLevels.degraded,
|
||||
summary:
|
||||
'Task Manager is unhealthy - Reason: setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (78.28472222222223) < capacityPerMinutePerKibana (200)',
|
||||
});
|
||||
|
||||
expect(logHealthMetrics).toBeCalledTimes(4);
|
||||
expect(logHealthMetrics.mock.calls[0][0]).toMatchObject({
|
||||
id,
|
||||
|
@ -332,8 +330,6 @@ describe('healthRoute', () => {
|
|||
|
||||
it(`logs at an error level if the status is error`, async () => {
|
||||
const router = httpServiceMock.createRouter();
|
||||
const calculateHealthStatus = calculateHealthStatusMock.create();
|
||||
calculateHealthStatus.mockImplementation(() => HealthStatus.Error);
|
||||
const { logHealthMetrics } = jest.requireMock('../lib/log_health_metrics');
|
||||
|
||||
const errorRuntimeStat = mockHealthStats();
|
||||
|
@ -344,7 +340,7 @@ describe('healthRoute', () => {
|
|||
const stats$ = new Subject<MonitoringStats>();
|
||||
|
||||
const id = uuidv4();
|
||||
healthRoute({
|
||||
const { serviceStatus$ } = healthRoute({
|
||||
router,
|
||||
monitoringStats$: stats$,
|
||||
logger,
|
||||
|
@ -366,6 +362,8 @@ describe('healthRoute', () => {
|
|||
docLinks,
|
||||
});
|
||||
|
||||
const serviceStatus = getLatest(serviceStatus$);
|
||||
|
||||
stats$.next(errorRuntimeStat);
|
||||
await sleep(1001);
|
||||
stats$.next(errorConfigurationStat);
|
||||
|
@ -374,6 +372,12 @@ describe('healthRoute', () => {
|
|||
await sleep(1001);
|
||||
stats$.next(errorEphemeralStat);
|
||||
|
||||
expect(await serviceStatus).toMatchObject({
|
||||
level: ServiceStatusLevels.degraded,
|
||||
summary:
|
||||
'Task Manager is unhealthy - Reason: setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (78.28472222222223) < capacityPerMinutePerKibana (200)',
|
||||
});
|
||||
|
||||
expect(logHealthMetrics).toBeCalledTimes(4);
|
||||
expect(logHealthMetrics.mock.calls[0][0]).toMatchObject({
|
||||
id,
|
||||
|
@ -481,12 +485,13 @@ describe('healthRoute', () => {
|
|||
|
||||
expect(await serviceStatus).toMatchObject({
|
||||
level: ServiceStatusLevels.degraded,
|
||||
summary: 'Task Manager is unhealthy',
|
||||
summary:
|
||||
'Task Manager is unhealthy - Reason: setting HealthStatus.Error because of expired hot timestamps',
|
||||
});
|
||||
const debugCalls = (logger as jest.Mocked<Logger>).debug.mock.calls as string[][];
|
||||
const warnCalls = (logger as jest.Mocked<Logger>).warn.mock.calls as string[][];
|
||||
const warnMessage =
|
||||
/^setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana/;
|
||||
const found = debugCalls
|
||||
const found = warnCalls
|
||||
.map((arr) => arr[0])
|
||||
.find((message) => message.match(warnMessage) != null);
|
||||
expect(found).toMatch(warnMessage);
|
||||
|
@ -497,7 +502,7 @@ describe('healthRoute', () => {
|
|||
|
||||
const stats$ = new Subject<MonitoringStats>();
|
||||
|
||||
healthRoute({
|
||||
const { serviceStatus$ } = healthRoute({
|
||||
router,
|
||||
monitoringStats$: stats$,
|
||||
logger,
|
||||
|
@ -514,6 +519,8 @@ describe('healthRoute', () => {
|
|||
docLinks,
|
||||
});
|
||||
|
||||
const serviceStatus = getLatest(serviceStatus$);
|
||||
|
||||
await sleep(0);
|
||||
|
||||
const lastUpdateOfWorkload = new Date(Date.now() - 120000).toISOString();
|
||||
|
@ -533,6 +540,11 @@ describe('healthRoute', () => {
|
|||
|
||||
await sleep(2000);
|
||||
|
||||
expect(await serviceStatus).toMatchObject({
|
||||
level: ServiceStatusLevels.degraded,
|
||||
summary:
|
||||
'Task Manager is unhealthy - Reason: setting HealthStatus.Error because of expired cold timestamps',
|
||||
});
|
||||
expect(await handler(context, req, res)).toMatchObject({
|
||||
body: {
|
||||
status: 'error',
|
||||
|
@ -572,7 +584,7 @@ describe('healthRoute', () => {
|
|||
const router = httpServiceMock.createRouter();
|
||||
|
||||
const stats$ = new Subject<MonitoringStats>();
|
||||
healthRoute({
|
||||
const { serviceStatus$ } = healthRoute({
|
||||
router,
|
||||
monitoringStats$: stats$,
|
||||
logger,
|
||||
|
@ -588,7 +600,7 @@ describe('healthRoute', () => {
|
|||
shouldRunTasks: true,
|
||||
docLinks,
|
||||
});
|
||||
|
||||
const serviceStatus = getLatest(serviceStatus$);
|
||||
await sleep(0);
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/naming-convention
|
||||
|
@ -611,6 +623,11 @@ describe('healthRoute', () => {
|
|||
|
||||
const [context, req, res] = mockHandlerArguments({}, {}, ['ok']);
|
||||
|
||||
expect(await serviceStatus).toMatchObject({
|
||||
level: ServiceStatusLevels.degraded,
|
||||
summary:
|
||||
'Task Manager is unhealthy - Reason: setting HealthStatus.Error because of expired hot timestamps',
|
||||
});
|
||||
expect(await handler(context, req, res)).toMatchObject({
|
||||
body: {
|
||||
status: 'error',
|
||||
|
|
|
@ -30,6 +30,7 @@ import { calculateHealthStatus } from '../lib/calculate_health_status';
|
|||
|
||||
export type MonitoredHealth = RawMonitoringStats & {
|
||||
id: string;
|
||||
reason?: string;
|
||||
status: HealthStatus;
|
||||
timestamp: string;
|
||||
};
|
||||
|
@ -87,10 +88,15 @@ export function healthRoute(params: HealthRouteParams): {
|
|||
|
||||
function getHealthStatus(monitoredStats: MonitoringStats) {
|
||||
const summarizedStats = summarizeMonitoringStats(logger, monitoredStats, config);
|
||||
const status = calculateHealthStatus(summarizedStats, config, shouldRunTasks, logger);
|
||||
const { status, reason } = calculateHealthStatus(
|
||||
summarizedStats,
|
||||
config,
|
||||
shouldRunTasks,
|
||||
logger
|
||||
);
|
||||
const now = Date.now();
|
||||
const timestamp = new Date(now).toISOString();
|
||||
return { id: taskManagerId, timestamp, status, ...summarizedStats };
|
||||
return { id: taskManagerId, timestamp, status, reason, ...summarizedStats };
|
||||
}
|
||||
|
||||
const serviceStatus$: Subject<TaskManagerServiceStatus> = new Subject<TaskManagerServiceStatus>();
|
||||
|
@ -106,7 +112,7 @@ export function healthRoute(params: HealthRouteParams): {
|
|||
tap((stats) => {
|
||||
lastMonitoredStats = stats;
|
||||
}),
|
||||
// Only calculate the summerized stats (calculates all runnign averages and evaluates state)
|
||||
// Only calculate the summarized stats (calculates all running averages and evaluates state)
|
||||
// when needed by throttling down to the requiredHotStatsFreshness
|
||||
map((stats) => withServiceStatus(getHealthStatus(stats)))
|
||||
)
|
||||
|
@ -174,15 +180,19 @@ export function healthRoute(params: HealthRouteParams): {
|
|||
export function withServiceStatus(
|
||||
monitoredHealth: MonitoredHealth
|
||||
): [MonitoredHealth, TaskManagerServiceStatus] {
|
||||
const { reason, status } = monitoredHealth;
|
||||
|
||||
const level =
|
||||
monitoredHealth.status === HealthStatus.OK
|
||||
? ServiceStatusLevels.available
|
||||
: ServiceStatusLevels.degraded;
|
||||
status === HealthStatus.OK ? ServiceStatusLevels.available : ServiceStatusLevels.degraded;
|
||||
|
||||
const defaultMessage = LEVEL_SUMMARY[level.toString()];
|
||||
const summary = reason ? `${defaultMessage} - Reason: ${reason}` : defaultMessage;
|
||||
|
||||
return [
|
||||
monitoredHealth,
|
||||
{
|
||||
level,
|
||||
summary: LEVEL_SUMMARY[level.toString()],
|
||||
summary,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue