mirror of
https://github.com/elastic/kibana.git
synced 2025-04-24 01:38:56 -04:00
[Response Ops][Task Manager] Returning status as OK
immediately when not running background tasks (#177631)
Resolves https://github.com/elastic/kibana-team/issues/764

## Summary

We've been seeing `taskManager plugin is now unavailable: Status check timed out after 30s` a lot in the serverless logs. After some investigation, these were all coming from the UI-only Kibana nodes. It looks like for background task nodes, the Task Manager (TM) status is emitted at the polling interval (default `3s`), whereas for the UI nodes, TM status is emitted less frequently (every `1m`). The first emission always shows as unavailable because monitoring stats are not yet available. For the UI nodes, this means TM emits `unavailable` and then doesn't emit `available` until one minute later, which exceeds the status check timeout.

This PR checks whether a node is UI-only and, if so, emits `OK` immediately, skipping the unavailable state, since UI nodes don't really depend on TM monitoring stats for availability.

## To Verify

1. Start ES locally.
2. Add `node.roles: ["migrator"]` to your Kibana config and start Kibana (the migrator node must run first if we're running with split roles).
3. Update the config to use `node.roles: ["ui"]` and restart Kibana. You should see Kibana start up with no errors in the logs about Task Manager being unavailable.

---------

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
This commit is contained in:
parent
f2de74259f
commit
a925d25711
3 changed files with 74 additions and 13 deletions
|
@ -260,7 +260,7 @@ describe('calculateHealthStatus', () => {
|
|||
});
|
||||
});
|
||||
|
||||
test('should return OK status when stats are not yet populated', () => {
|
||||
test('should return Uninitialized status when stats are not yet populated and shouldRunTasks = true', () => {
|
||||
expect(
|
||||
calculateHealthStatus(
|
||||
{
|
||||
|
@ -274,6 +274,20 @@ describe('calculateHealthStatus', () => {
|
|||
).toEqual({ status: HealthStatus.Uninitialized, reason: `no health stats available` });
|
||||
});
|
||||
|
||||
test('should return OK status when stats are not yet populated and shouldRunTasks = false', () => {
|
||||
expect(
|
||||
calculateHealthStatus(
|
||||
{
|
||||
last_update: '2023-05-09T12:59:57.000Z',
|
||||
stats: {},
|
||||
},
|
||||
config,
|
||||
false,
|
||||
logger
|
||||
)
|
||||
).toEqual({ status: HealthStatus.OK });
|
||||
});
|
||||
|
||||
test('should return error status if any stat has status error', () => {
|
||||
const errorReason = `setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (222.85972222222222) >= capacityPerMinutePerKibana (200) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (222.85972222222222) >= capacityPerMinutePerKibana (200)`;
|
||||
const stats = getStatsWithTimestamp();
|
||||
|
|
|
@ -21,7 +21,11 @@ export function calculateHealthStatus(
|
|||
|
||||
// if stats are empty, return a warning
|
||||
if (isEmpty(summarizedStats.stats)) {
|
||||
return { status: HealthStatus.Uninitialized, reason: `no health stats available` };
|
||||
if (!shouldRunTasks) {
|
||||
return { status: HealthStatus.OK };
|
||||
} else {
|
||||
return { status: HealthStatus.Uninitialized, reason: `no health stats available` };
|
||||
}
|
||||
}
|
||||
|
||||
// if "hot" health stats are any more stale than monitored_stats_required_freshness
|
||||
|
|
|
@ -5,8 +5,7 @@
|
|||
* 2.0.
|
||||
*/
|
||||
|
||||
import { Observable, of, Subject } from 'rxjs';
|
||||
import { take } from 'rxjs/operators';
|
||||
import { firstValueFrom, of, Subject } from 'rxjs';
|
||||
import { merge } from 'lodash';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import { httpServiceMock, docLinksServiceMock } from '@kbn/core/server/mocks';
|
||||
|
@ -277,7 +276,7 @@ describe('healthRoute', () => {
|
|||
docLinks,
|
||||
});
|
||||
|
||||
const serviceStatus = getLatest(serviceStatus$);
|
||||
const serviceStatus = firstValueFrom(serviceStatus$);
|
||||
|
||||
stats$.next(warnRuntimeStat);
|
||||
await sleep(1001);
|
||||
|
@ -362,7 +361,7 @@ describe('healthRoute', () => {
|
|||
docLinks,
|
||||
});
|
||||
|
||||
const serviceStatus = getLatest(serviceStatus$);
|
||||
const serviceStatus = firstValueFrom(serviceStatus$);
|
||||
|
||||
stats$.next(errorRuntimeStat);
|
||||
await sleep(1001);
|
||||
|
@ -435,7 +434,7 @@ describe('healthRoute', () => {
|
|||
docLinks,
|
||||
});
|
||||
|
||||
const serviceStatus = getLatest(serviceStatus$);
|
||||
const serviceStatus = firstValueFrom(serviceStatus$);
|
||||
|
||||
const [, handler] = router.get.mock.calls[0];
|
||||
|
||||
|
@ -519,7 +518,7 @@ describe('healthRoute', () => {
|
|||
docLinks,
|
||||
});
|
||||
|
||||
const serviceStatus = getLatest(serviceStatus$);
|
||||
const serviceStatus = firstValueFrom(serviceStatus$);
|
||||
|
||||
await sleep(0);
|
||||
|
||||
|
@ -600,7 +599,7 @@ describe('healthRoute', () => {
|
|||
shouldRunTasks: true,
|
||||
docLinks,
|
||||
});
|
||||
const serviceStatus = getLatest(serviceStatus$);
|
||||
const serviceStatus = firstValueFrom(serviceStatus$);
|
||||
await sleep(0);
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/naming-convention
|
||||
|
@ -662,6 +661,54 @@ describe('healthRoute', () => {
|
|||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('returns a OK status for empty if shouldRunTasks is false', async () => {
|
||||
const router = httpServiceMock.createRouter();
|
||||
|
||||
const stats$ = new Subject<MonitoringStats>();
|
||||
const { serviceStatus$ } = healthRoute({
|
||||
router,
|
||||
monitoringStats$: stats$,
|
||||
logger,
|
||||
taskManagerId: uuidv4(),
|
||||
config: getTaskManagerConfig({
|
||||
monitored_stats_required_freshness: 1000,
|
||||
monitored_aggregated_stats_refresh_rate: 60000,
|
||||
}),
|
||||
kibanaVersion: '8.0',
|
||||
kibanaIndexName: '.kibana',
|
||||
getClusterClient: () => Promise.resolve(elasticsearchServiceMock.createClusterClient()),
|
||||
usageCounter: mockUsageCounter,
|
||||
shouldRunTasks: false,
|
||||
docLinks,
|
||||
});
|
||||
const serviceStatus = firstValueFrom(serviceStatus$);
|
||||
await sleep(0);
|
||||
|
||||
const lastUpdate = new Date().toISOString();
|
||||
stats$.next({
|
||||
last_update: lastUpdate,
|
||||
stats: {},
|
||||
});
|
||||
|
||||
const [, handler] = router.get.mock.calls[0];
|
||||
|
||||
const [context, req, res] = mockHandlerArguments({}, {}, ['ok']);
|
||||
|
||||
expect(await serviceStatus).toMatchObject({
|
||||
level: ServiceStatusLevels.available,
|
||||
summary: 'Task Manager is healthy',
|
||||
});
|
||||
expect(await handler(context, req, res)).toMatchObject({
|
||||
body: {
|
||||
id: expect.any(String),
|
||||
timestamp: expect.any(String),
|
||||
status: 'OK',
|
||||
last_update: lastUpdate,
|
||||
stats: {},
|
||||
},
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
function ignoreCapacityEstimation(stats: RawMonitoringStats) {
|
||||
|
@ -754,10 +801,6 @@ function mockHealthStats(overrides = {}) {
|
|||
return merge(stub, overrides) as unknown as MonitoringStats;
|
||||
}
|
||||
|
||||
async function getLatest<T>(stream$: Observable<T>) {
|
||||
return new Promise<T>((resolve) => stream$.pipe(take(1)).subscribe((stats) => resolve(stats)));
|
||||
}
|
||||
|
||||
const getTaskManagerConfig = (overrides: Partial<TaskManagerConfig> = {}) =>
|
||||
configSchema.validate(
|
||||
overrides.monitored_stats_required_freshness
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue