[Response Ops][Task Manager] Returning status as OK immediately when not running background tasks (#177631)

Resolves https://github.com/elastic/kibana-team/issues/764

## Summary

We've been seeing `taskManager plugin is now unavailable: Status check
timed out after 30s` a lot in the serverless logs. After some
investigation, these were all coming from the UI-only Kibana nodes. It
looks like for background task nodes, TM status is emitted at the
polling interval (default `3s`) whereas for the UI nodes, TM status is
emitted less frequently (every `1m`). The first emission always shows as
unavailable because monitoring stats are not yet available. For the UI
nodes, this means TM emits `unavailable` and then doesn't emit
`available` until one minute later, which exceeds the 30s status check
timeout.

This PR checks whether a node is UI-only and, if so, emits `OK`
immediately, skipping the unavailable state, since UI nodes don't really
depend on TM monitoring stats for availability.

## To Verify
1. Start ES locally
2. Add `node.roles: ["migrator"]` to your Kibana config and start Kibana
(the migrator node must run first if we're running with split roles)
3. Update config to use `node.roles: ["ui"]` and restart Kibana. You
should see Kibana start up with no errors in the logs about Task Manager
being unavailable.

---------

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
Ying Mao 2024-03-05 13:27:07 -05:00 committed by GitHub
parent f2de74259f
commit a925d25711
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 74 additions and 13 deletions

View file

@ -260,7 +260,7 @@ describe('calculateHealthStatus', () => {
});
});
test('should return OK status when stats are not yet populated', () => {
test('should return Uninitialized status when stats are not yet populated and shouldRunTasks = true', () => {
expect(
calculateHealthStatus(
{
@ -274,6 +274,20 @@ describe('calculateHealthStatus', () => {
).toEqual({ status: HealthStatus.Uninitialized, reason: `no health stats available` });
});
// Counterpart to the `shouldRunTasks = true` case above: with an empty `stats` object and
// `false` passed as the third argument (shouldRunTasks, per the test title), the result is
// expected to equal exactly { status: HealthStatus.OK } -- i.e. without a `reason` field.
test('should return OK status when stats are not yet populated and shouldRunTasks = false', () => {
expect(
calculateHealthStatus(
{
last_update: '2023-05-09T12:59:57.000Z',
stats: {},
},
config,
false,
logger
)
).toEqual({ status: HealthStatus.OK });
});
test('should return error status if any stat has status error', () => {
const errorReason = `setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (222.85972222222222) >= capacityPerMinutePerKibana (200) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (222.85972222222222) >= capacityPerMinutePerKibana (200)`;
const stats = getStatsWithTimestamp();

View file

@ -21,7 +21,11 @@ export function calculateHealthStatus(
// if stats are empty, return a warning
if (isEmpty(summarizedStats.stats)) {
return { status: HealthStatus.Uninitialized, reason: `no health stats available` };
if (!shouldRunTasks) {
return { status: HealthStatus.OK };
} else {
return { status: HealthStatus.Uninitialized, reason: `no health stats available` };
}
}
// if "hot" health stats are any more stale than monitored_stats_required_freshness

View file

@ -5,8 +5,7 @@
* 2.0.
*/
import { Observable, of, Subject } from 'rxjs';
import { take } from 'rxjs/operators';
import { firstValueFrom, of, Subject } from 'rxjs';
import { merge } from 'lodash';
import { v4 as uuidv4 } from 'uuid';
import { httpServiceMock, docLinksServiceMock } from '@kbn/core/server/mocks';
@ -277,7 +276,7 @@ describe('healthRoute', () => {
docLinks,
});
const serviceStatus = getLatest(serviceStatus$);
const serviceStatus = firstValueFrom(serviceStatus$);
stats$.next(warnRuntimeStat);
await sleep(1001);
@ -362,7 +361,7 @@ describe('healthRoute', () => {
docLinks,
});
const serviceStatus = getLatest(serviceStatus$);
const serviceStatus = firstValueFrom(serviceStatus$);
stats$.next(errorRuntimeStat);
await sleep(1001);
@ -435,7 +434,7 @@ describe('healthRoute', () => {
docLinks,
});
const serviceStatus = getLatest(serviceStatus$);
const serviceStatus = firstValueFrom(serviceStatus$);
const [, handler] = router.get.mock.calls[0];
@ -519,7 +518,7 @@ describe('healthRoute', () => {
docLinks,
});
const serviceStatus = getLatest(serviceStatus$);
const serviceStatus = firstValueFrom(serviceStatus$);
await sleep(0);
@ -600,7 +599,7 @@ describe('healthRoute', () => {
shouldRunTasks: true,
docLinks,
});
const serviceStatus = getLatest(serviceStatus$);
const serviceStatus = firstValueFrom(serviceStatus$);
await sleep(0);
// eslint-disable-next-line @typescript-eslint/naming-convention
@ -662,6 +661,54 @@ describe('healthRoute', () => {
},
});
});
// Verifies that when healthRoute is created with `shouldRunTasks: false`, empty monitoring
// stats produce an `available` service status and an 'OK' health response body.
it('returns a OK status for empty if shouldRunTasks is false', async () => {
const router = httpServiceMock.createRouter();
// A Subject lets the test decide exactly when monitoring stats are emitted.
const stats$ = new Subject<MonitoringStats>();
const { serviceStatus$ } = healthRoute({
router,
monitoringStats$: stats$,
logger,
taskManagerId: uuidv4(),
config: getTaskManagerConfig({
monitored_stats_required_freshness: 1000,
monitored_aggregated_stats_refresh_rate: 60000,
}),
kibanaVersion: '8.0',
kibanaIndexName: '.kibana',
getClusterClient: () => Promise.resolve(elasticsearchServiceMock.createClusterClient()),
usageCounter: mockUsageCounter,
shouldRunTasks: false,
docLinks,
});
// Start waiting for the first service status before any stats are pushed below.
const serviceStatus = firstValueFrom(serviceStatus$);
// NOTE(review): `sleep` is defined outside this view; presumably this just yields to the
// event loop before stats are emitted -- confirm against the helper.
await sleep(0);
const lastUpdate = new Date().toISOString();
// Emit stats whose `stats` object is empty -- the case under test.
stats$.next({
last_update: lastUpdate,
stats: {},
});
// The second argument of the first `router.get` registration is the route handler.
const [, handler] = router.get.mock.calls[0];
const [context, req, res] = mockHandlerArguments({}, {}, ['ok']);
// The first service status observed should be `available`.
expect(await serviceStatus).toMatchObject({
level: ServiceStatusLevels.available,
summary: 'Task Manager is healthy',
});
// The route response should report 'OK' and echo back the emitted (empty) stats.
expect(await handler(context, req, res)).toMatchObject({
body: {
id: expect.any(String),
timestamp: expect.any(String),
status: 'OK',
last_update: lastUpdate,
stats: {},
},
});
});
});
function ignoreCapacityEstimation(stats: RawMonitoringStats) {
@ -754,10 +801,6 @@ function mockHealthStats(overrides = {}) {
return merge(stub, overrides) as unknown as MonitoringStats;
}
/**
 * Returns a promise that resolves with the first value emitted by `stream$`
 * (the stream is limited with `take(1)`).
 *
 * Only a `next` callback is passed to `subscribe`, so nothing in this helper
 * rejects the promise.
 */
async function getLatest<T>(stream$: Observable<T>) {
return new Promise<T>((resolve) => stream$.pipe(take(1)).subscribe((stats) => resolve(stats)));
}
const getTaskManagerConfig = (overrides: Partial<TaskManagerConfig> = {}) =>
configSchema.validate(
overrides.monitored_stats_required_freshness