kibana/x-pack/plugins/task_manager/server/polling_lifecycle.ts
Mike Côté cb2e28d1e4
Fix task manager polling flow controls (#153491)
Fixes https://github.com/elastic/kibana/issues/151938

In this PR, I'm rewriting the Task Manager poller so it doesn't run
concurrently when timeouts occur, while also fixing the issue where
polling requests would pile up when polling takes time. To support this,
I've also made the following changes:
- Removed the observable monitor and the
`xpack.task_manager.max_poll_inactivity_cycles` setting
- Made the task store `search` and `updateByQuery` functions perform no
retries. This prevents each request from retrying up to 5x whenever a timeout
occurs, which caused each call to take up to 2.5 minutes before Kibana saw
the error (now down to 30s each). We rely on polling to manage retries in
these situations (see the sketch after this list).
- Switched the task poller tests to use `sinon` for faking timers
- Removed the `assertStillInSetup` checks on plugin setup. They felt like a
maintenance burden that wasn't worth keeping up to date with my code changes.

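For context on the retry change, here is a minimal sketch of the timeout/retry math, assuming a standalone `@elastic/elasticsearch` client (the real `TaskStore` receives its client from Kibana core, so the names and values below are illustrative):

```ts
import { Client } from '@elastic/elasticsearch';

// Illustrative client; the real TaskStore gets a scoped client from Kibana core.
const client = new Client({ node: 'http://localhost:9200' });

// With retries enabled, a timed-out request is retried, so the caller can wait up to
// requestTimeout * (1 + maxRetries) before seeing the error.
// With maxRetries: 0, the error surfaces after a single requestTimeout (30s here),
// and the next polling cycle effectively acts as the retry.
const esClientWithoutRetries = client.child({
  maxRetries: 0,
  requestTimeout: 30_000, // 30s, illustrative value
});
```
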
The main code changes are within these files (please review thoroughly so
the polling cycle doesn't suddenly stop; see the simplified loop sketch after this list):
- x-pack/plugins/task_manager/server/polling/task_poller.ts
- x-pack/plugins/task_manager/server/polling_lifecycle.ts (easier to
review if you disregard whitespace with `?w=1`)
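
The intent described above is that polling cycles no longer overlap. A minimal, hypothetical sketch of such a non-overlapping loop (not the actual `task_poller.ts` implementation) could look like this:

```ts
// A new cycle is only scheduled after the current work settles, so a slow or
// timed-out request can no longer pile up behind new polling requests.
async function runPollingLoop(
  work: () => Promise<void>,
  pollInterval: number,
  isRunning: () => boolean
) {
  while (isRunning()) {
    try {
      // includes any request timeout (e.g. 30s with client retries disabled)
      await work();
    } catch (e) {
      // errors are surfaced through logs/events; the loop simply tries again next cycle
    }
    await new Promise((resolve) => setTimeout(resolve, pollInterval));
  }
}
```
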

## To verify
1. Tasks run normally (create a rule or something that goes through task
manager regularly).
2. When the update by query takes a while, the request is cancelled
after 30s (or the manually configured timeout).
3. When the search for claimed tasks query takes a while, the request is
cancelled after 30s (or the manually configured timeout).

**Tips:**
<details><summary>how to slow down search for claimed tasks queries</summary>

```diff
diff --git a/x-pack/plugins/task_manager/server/queries/task_claiming.ts b/x-pack/plugins/task_manager/server/queries/task_claiming.ts
index 07042650a37..2caefd63672 100644
--- a/x-pack/plugins/task_manager/server/queries/task_claiming.ts
+++ b/x-pack/plugins/task_manager/server/queries/task_claiming.ts
@@ -247,7 +247,7 @@ export class TaskClaiming {
         taskTypes,
       });

-    const docs = tasksUpdated > 0 ? await this.sweepForClaimedTasks(taskTypes, size) : [];
+    const docs = await this.sweepForClaimedTasks(taskTypes, size);

     this.emitEvents(docs.map((doc) => asTaskClaimEvent(doc.id, asOk(doc))));

@@ -346,6 +346,13 @@ export class TaskClaiming {
       size,
       sort: SortByRunAtAndRetryAt,
       seq_no_primary_term: true,
+      aggs: {
+        delay: {
+          shard_delay: {
+            value: '40s',
+          },
+        },
+      },
     });

     return docs;
```
</details>

<details><summary>how to slow down update by query calls</summary>

Not the cleanest way, but you'll see occasional request timeouts from the
`updateByQuery` calls. I had more luck after creating rules that run every 1s.

```diff
diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts
index a06ee7b918a..07aa81e5388 100644
--- a/x-pack/plugins/task_manager/server/task_store.ts
+++ b/x-pack/plugins/task_manager/server/task_store.ts
@@ -126,6 +126,7 @@ export class TaskStore {
       // Timeouts are retried and make requests timeout after (requestTimeout * (1 + maxRetries))
       // The poller doesn't need retry logic because it will try again at the next polling cycle
       maxRetries: 0,
+      requestTimeout: 900,
     });
   }

@@ -458,6 +459,7 @@ export class TaskStore {
           ignore_unavailable: true,
           refresh: true,
           conflicts: 'proceed',
+          requests_per_second: 1,
           body: {
             ...opts,
             max_docs,
```
</details>

---------

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
2023-05-03 09:33:10 -04:00


/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { Subject, Observable } from 'rxjs';
import { pipe } from 'fp-ts/lib/pipeable';
import { map as mapOptional } from 'fp-ts/lib/Option';
import { tap } from 'rxjs/operators';
import { UsageCounter } from '@kbn/usage-collection-plugin/server';
import type { Logger, ExecutionContextStart } from '@kbn/core/server';
import { Result, asErr, mapErr, asOk, map, mapOk } from './lib/result_type';
import { ManagedConfiguration } from './lib/create_managed_configuration';
import { TaskManagerConfig } from './config';
import {
  TaskMarkRunning,
  TaskRun,
  TaskClaim,
  TaskRunRequest,
  asTaskRunRequestEvent,
  TaskPollingCycle,
  asTaskPollingCycleEvent,
  TaskManagerStat,
  asTaskManagerStatEvent,
  EphemeralTaskRejectedDueToCapacity,
} from './task_events';
import { fillPool, FillPoolResult, TimedFillPoolResult } from './lib/fill_pool';
import { Middleware } from './lib/middleware';
import { intervalFromNow } from './lib/intervals';
import { ConcreteTaskInstance } from './task';
import { createTaskPoller, PollingError, PollingErrorType } from './polling';
import { TaskPool } from './task_pool';
import { TaskManagerRunner, TaskRunner } from './task_running';
import { TaskStore } from './task_store';
import { identifyEsError, isEsCannotExecuteScriptError } from './lib/identify_es_error';
import { BufferedTaskStore } from './buffered_task_store';
import { TaskTypeDictionary } from './task_type_dictionary';
import { delayOnClaimConflicts } from './polling';
import { TaskClaiming, ClaimOwnershipResult } from './queries/task_claiming';

export type TaskPollingLifecycleOpts = {
  logger: Logger;
  definitions: TaskTypeDictionary;
  unusedTypes: string[];
  taskStore: TaskStore;
  config: TaskManagerConfig;
  middleware: Middleware;
  elasticsearchAndSOAvailability$: Observable<boolean>;
  executionContext: ExecutionContextStart;
  usageCounter?: UsageCounter;
} & ManagedConfiguration;

export type TaskLifecycleEvent =
  | TaskMarkRunning
  | TaskRun
  | TaskClaim
  | TaskRunRequest
  | TaskPollingCycle
  | TaskManagerStat
  | EphemeralTaskRejectedDueToCapacity;
/**
 * The public interface into the task manager system.
 */
export class TaskPollingLifecycle {
  private definitions: TaskTypeDictionary;
  private store: TaskStore;
  private taskClaiming: TaskClaiming;
  private bufferedStore: BufferedTaskStore;
  private readonly executionContext: ExecutionContextStart;
  private logger: Logger;
  public pool: TaskPool;
  // all task related events (task claimed, task marked as running, etc.) are emitted through events$
  private events$ = new Subject<TaskLifecycleEvent>();
  private middleware: Middleware;
  private usageCounter?: UsageCounter;
  private config: TaskManagerConfig;

  /**
   * Initializes the task manager, preventing any further addition of middleware,
   * enabling the task manipulation methods, and beginning the background polling
   * mechanism.
   */
  constructor({
    logger,
    middleware,
    maxWorkersConfiguration$,
    pollIntervalConfiguration$,
    // Elasticsearch and SavedObjects availability status
    elasticsearchAndSOAvailability$,
    config,
    taskStore,
    definitions,
    unusedTypes,
    executionContext,
    usageCounter,
  }: TaskPollingLifecycleOpts) {
    this.logger = logger;
    this.middleware = middleware;
    this.definitions = definitions;
    this.store = taskStore;
    this.executionContext = executionContext;
    this.usageCounter = usageCounter;
    this.config = config;

    const emitEvent = (event: TaskLifecycleEvent) => this.events$.next(event);

    this.bufferedStore = new BufferedTaskStore(this.store, {
      bufferMaxOperations: config.max_workers,
      logger,
    });

    this.pool = new TaskPool({
      logger,
      maxWorkers$: maxWorkersConfiguration$,
    });
    this.pool.load.subscribe(emitEvent);

    this.taskClaiming = new TaskClaiming({
      taskStore,
      maxAttempts: config.max_attempts,
      excludedTaskTypes: config.unsafe.exclude_task_types,
      definitions,
      unusedTypes,
      logger: this.logger,
      getCapacity: (taskType?: string) =>
        taskType && this.definitions.get(taskType)?.maxConcurrency
          ? Math.max(
              Math.min(
                this.pool.availableWorkers,
                this.definitions.get(taskType)!.maxConcurrency! -
                  this.pool.getOccupiedWorkersByType(taskType)
              ),
              0
            )
          : this.pool.availableWorkers,
    });
    // pipe taskClaiming events into the lifecycle event stream
    this.taskClaiming.events.subscribe(emitEvent);

    const { poll_interval: pollInterval } = config;

    const pollIntervalDelay$ = delayOnClaimConflicts(
      maxWorkersConfiguration$,
      pollIntervalConfiguration$,
      this.events$,
      config.version_conflict_threshold,
      config.monitored_stats_running_average_window
    ).pipe(tap((delay) => emitEvent(asTaskManagerStatEvent('pollingDelay', asOk(delay)))));

    const poller = createTaskPoller<string, TimedFillPoolResult>({
      logger,
      initialPollInterval: pollInterval,
      pollInterval$: pollIntervalConfiguration$,
      pollIntervalDelay$,
      getCapacity: () => {
        const capacity = this.pool.availableWorkers;
        if (!capacity) {
          // if there isn't capacity, emit a load event so that we can expose how often
          // high load causes the poller to skip work (work isn't called when there is no capacity)
          this.emitEvent(asTaskManagerStatEvent('load', asOk(this.pool.workerLoad)));
          // Emit event indicating task manager utilization
          this.emitEvent(asTaskManagerStatEvent('workerUtilization', asOk(this.pool.workerLoad)));
        }
        return capacity;
      },
      work: this.pollForWork,
    });

    this.subscribeToPoller(poller.events$);

    elasticsearchAndSOAvailability$.subscribe((areESAndSOAvailable) => {
      if (areESAndSOAvailable) {
        // start polling for work
        poller.start();
      } else if (!areESAndSOAvailable) {
        poller.stop();
        this.pool.cancelRunningTasks();
      }
    });
  }

  public get events(): Observable<TaskLifecycleEvent> {
    return this.events$;
  }

  private emitEvent = (event: TaskLifecycleEvent) => {
    this.events$.next(event);
  };

  private createTaskRunnerForTask = (instance: ConcreteTaskInstance) => {
    return new TaskManagerRunner({
      logger: this.logger,
      instance,
      store: this.bufferedStore,
      definitions: this.definitions,
      beforeRun: this.middleware.beforeRun,
      beforeMarkRunning: this.middleware.beforeMarkRunning,
      onTaskEvent: this.emitEvent,
      defaultMaxAttempts: this.taskClaiming.maxAttempts,
      executionContext: this.executionContext,
      usageCounter: this.usageCounter,
      eventLoopDelayConfig: { ...this.config.event_loop_delay },
    });
  };

  private pollForWork = async (): Promise<TimedFillPoolResult> => {
    return fillPool(
      // claim available tasks
      () => {
        return claimAvailableTasks(this.taskClaiming, this.logger).pipe(
          tap(
            mapOk(({ timing }: ClaimOwnershipResult) => {
              if (timing) {
                this.emitEvent(
                  asTaskManagerStatEvent('claimDuration', asOk(timing.stop - timing.start))
                );
              }
            })
          )
        );
      },
      // wrap each task in a Task Runner
      this.createTaskRunnerForTask,
      // place tasks in the Task Pool
      async (tasks: TaskRunner[]) => {
        const tasksToRun = [];
        const removeTaskPromises = [];
        for (const task of tasks) {
          if (task.isAdHocTaskAndOutOfAttempts) {
            this.logger.debug(`Removing ${task} because the max attempts have been reached.`);
            removeTaskPromises.push(task.removeTask());
          } else {
            tasksToRun.push(task);
          }
        }
        // Wait for all the promises at once to speed up the polling cycle
        const [result] = await Promise.all([this.pool.run(tasksToRun), ...removeTaskPromises]);
        // Emit the load after fetching tasks, giving us a good metric for evaluating how
        // busy Task manager tends to be in this Kibana instance
        this.emitEvent(asTaskManagerStatEvent('load', asOk(this.pool.workerLoad)));
        return result;
      }
    );
  };

  private subscribeToPoller(
    poller$: Observable<Result<TimedFillPoolResult, PollingError<string>>>
  ) {
    return poller$
      .pipe(
        tap(
          mapErr((error: PollingError<string>) => {
            if (error.type === PollingErrorType.RequestCapacityReached) {
              pipe(
                error.data,
                mapOptional((id) => this.emitEvent(asTaskRunRequestEvent(id, asErr(error))))
              );
            }
            this.logger.error(error.message);
          })
        )
      )
      .pipe(
        tap(
          mapOk(() => {
            // Emit event indicating task manager utilization % at the end of a polling cycle
            this.emitEvent(asTaskManagerStatEvent('workerUtilization', asOk(this.pool.workerLoad)));
          })
        )
      )
      .subscribe((result: Result<TimedFillPoolResult, PollingError<string>>) => {
        this.emitEvent(
          map(
            result,
            ({ timing, ...event }) => asTaskPollingCycleEvent<string>(asOk(event), timing),
            (event) => asTaskPollingCycleEvent<string>(asErr(event))
          )
        );
      });
  }
}

export function claimAvailableTasks(
  taskClaiming: TaskClaiming,
  logger: Logger
): Observable<Result<ClaimOwnershipResult, FillPoolResult>> {
  return new Observable((observer) => {
    taskClaiming
      .claimAvailableTasksIfCapacityIsAvailable({
        claimOwnershipUntil: intervalFromNow('30s')!,
      })
      .subscribe(
        (claimResult) => {
          observer.next(claimResult);
        },
        (ex) => {
          // if the `taskClaiming` stream errors out we want to catch it and see if
          // we can identify the reason
          // if we can - we emit a FillPoolResult error rather than erroring out the wrapping Observable
          // returned by `claimAvailableTasks`
          if (isEsCannotExecuteScriptError(ex)) {
            logger.warn(
              `Task Manager cannot operate when inline scripts are disabled in Elasticsearch`
            );
            observer.next(asErr(FillPoolResult.Failed));
            observer.complete();
          } else {
            const esError = identifyEsError(ex);
            // as we couldn't identify the reason - we'll error out the wrapping Observable too
            observer.error(esError.length > 0 ? esError : ex);
          }
        },
        () => {
          observer.complete();
        }
      );
  });
}