kibana/x-pack/plugins/task_manager/server/config.ts
Mike Côté cb2e28d1e4
Fix task manager polling flow controls (#153491)
Fixes https://github.com/elastic/kibana/issues/151938

In this PR, I'm re-writing the Task Manager poller so it doesn't run
concurrently when timeouts occur while also fixing the issue where
polling requests would pile up when polling takes time. To support this,
I've also made the following changes:
- Removed the observable monitor and the
`xpack.task_manager.max_poll_inactivity_cycles` setting
- Make the task store `search` and `updateByQuery` functions have no
retries. This prevents the request from retrying 5x whenever a timeout
occurs, causing each call taking up to 2 1/2 minutes before Kibana sees
the error (now down to 30s each). We have polling to manage retries in
these situations.
- Switch the task poller tests to use `sinon` for faking timers
- Removing the `assertStillInSetup` checks on plugin setup. Felt like a
maintenance burden that wasn't necessary to fix with my code changes.

The main code changes are within these files (to review thoroughly so
the polling cycle doesn't suddenly stop):
- x-pack/plugins/task_manager/server/polling/task_poller.ts
- x-pack/plugins/task_manager/server/polling_lifecycle.ts (easier to
review if you disregard whitespace `?w=1`)

## To verify
1. Tasks run normally (create a rule or something that goes through task
manager regularly).
2. When the update by query takes a while, the request is cancelled
after 30s or the time manually configured.
4. When the search for claimed tasks query takes a while, the request is
cancelled after 30s or the time manually configured.

**Tips:**
<details><summary>how to slowdown search for claimed task
queries</summary>

```
diff --git a/x-pack/plugins/task_manager/server/queries/task_claiming.ts b/x-pack/plugins/task_manager/server/queries/task_claiming.ts
index 07042650a37..2caefd63672 100644
--- a/x-pack/plugins/task_manager/server/queries/task_claiming.ts
+++ b/x-pack/plugins/task_manager/server/queries/task_claiming.ts
@@ -247,7 +247,7 @@ export class TaskClaiming {
         taskTypes,
       });

-    const docs = tasksUpdated > 0 ? await this.sweepForClaimedTasks(taskTypes, size) : [];
+    const docs = await this.sweepForClaimedTasks(taskTypes, size);

     this.emitEvents(docs.map((doc) => asTaskClaimEvent(doc.id, asOk(doc))));

@@ -346,6 +346,13 @@ export class TaskClaiming {
       size,
       sort: SortByRunAtAndRetryAt,
       seq_no_primary_term: true,
+      aggs: {
+        delay: {
+          shard_delay: {
+            value: '40s',
+          },
+        },
+      },
     });

     return docs;
```
</details>

<details><summary>how to slow down update by queries</summary>
Not the cleanest way but you'll see occasional request timeouts from the
updateByQuery calls. I had more luck creating rules running every 1s.

```
diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts
index a06ee7b918a..07aa81e5388 100644
--- a/x-pack/plugins/task_manager/server/task_store.ts
+++ b/x-pack/plugins/task_manager/server/task_store.ts
@@ -126,6 +126,7 @@ export class TaskStore {
       // Timeouts are retried and make requests timeout after (requestTimeout * (1 + maxRetries))
       // The poller doesn't need retry logic because it will try again at the next polling cycle
       maxRetries: 0,
+      requestTimeout: 900,
     });
   }

@@ -458,6 +459,7 @@ export class TaskStore {
           ignore_unavailable: true,
           refresh: true,
           conflicts: 'proceed',
+          requests_per_second: 1,
           body: {
             ...opts,
             max_docs,
```
</details>

---------

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
2023-05-03 09:33:10 -04:00

155 lines
5.9 KiB
TypeScript

/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { schema, TypeOf } from '@kbn/config-schema';
export const MAX_WORKERS_LIMIT = 100;
export const DEFAULT_MAX_WORKERS = 10;
export const DEFAULT_POLL_INTERVAL = 3000;
export const DEFAULT_VERSION_CONFLICT_THRESHOLD = 80;
export const DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY = MAX_WORKERS_LIMIT;
// Monitoring Constants
// ===================
// Refresh aggregated monitored stats at a default rate of once a minute
export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000;
export const DEFAULT_MONITORING_STATS_RUNNING_AVERAGE_WINDOW = 50;
export const DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS = 60;
// At the default poll interval of 3sec, this averages over the last 15sec.
export const DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW = 5;
export const taskExecutionFailureThresholdSchema = schema.object(
{
error_threshold: schema.number({
defaultValue: 90,
min: 0,
}),
warn_threshold: schema.number({
defaultValue: 80,
min: 0,
}),
},
{
validate(config) {
if (config.error_threshold < config.warn_threshold) {
return `warn_threshold (${config.warn_threshold}) must be less than, or equal to, error_threshold (${config.error_threshold})`;
}
},
}
);
const eventLoopDelaySchema = schema.object({
monitor: schema.boolean({ defaultValue: true }),
warn_threshold: schema.number({
defaultValue: 5000,
min: 10,
}),
});
export const configSchema = schema.object(
{
/* The maximum number of times a task will be attempted before being abandoned as failed */
max_attempts: schema.number({
defaultValue: 3,
min: 1,
}),
/* How often, in milliseconds, the task manager will look for more work. */
poll_interval: schema.number({
defaultValue: DEFAULT_POLL_INTERVAL,
min: 100,
}),
/* How many requests can Task Manager buffer before it rejects new requests. */
request_capacity: schema.number({
// a nice round contrived number, feel free to change as we learn how it behaves
defaultValue: 1000,
min: 1,
}),
/* The maximum number of tasks that this Kibana instance will run simultaneously. */
max_workers: schema.number({
defaultValue: DEFAULT_MAX_WORKERS,
// disable the task manager rather than trying to specify it with 0 workers
min: 1,
}),
/* The threshold percenatge for workers experiencing version conflicts for shifting the polling interval. */
version_conflict_threshold: schema.number({
defaultValue: DEFAULT_VERSION_CONFLICT_THRESHOLD,
min: 50,
max: 100,
}),
/* The rate at which we emit fresh monitored stats. By default we'll use the poll_interval (+ a slight buffer) */
monitored_stats_required_freshness: schema.number({
defaultValue: (config?: unknown) =>
((config as { poll_interval: number })?.poll_interval ?? DEFAULT_POLL_INTERVAL) + 1000,
min: 100,
}),
/* The rate at which we refresh monitored stats that require aggregation queries against ES. */
monitored_aggregated_stats_refresh_rate: schema.number({
defaultValue: DEFAULT_MONITORING_REFRESH_RATE,
/* don't run monitored stat aggregations any faster than once every 5 seconds */
min: 5000,
}),
/* The size of the running average window for monitored stats. */
monitored_stats_running_average_window: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_RUNNING_AVERAGE_WINDOW,
max: 100,
min: 10,
}),
/* Task Execution result warn & error thresholds. */
monitored_task_execution_thresholds: schema.object({
default: taskExecutionFailureThresholdSchema,
custom: schema.recordOf(schema.string(), taskExecutionFailureThresholdSchema, {
defaultValue: {},
}),
}),
monitored_stats_health_verbose_log: schema.object({
enabled: schema.boolean({ defaultValue: false }),
level: schema.oneOf([schema.literal('debug'), schema.literal('info')], {
defaultValue: 'debug',
}),
/* The amount of seconds we allow a task to delay before printing a warning server log */
warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
}),
}),
ephemeral_tasks: schema.object({
enabled: schema.boolean({ defaultValue: false }),
/* How many requests can Task Manager buffer before it rejects new requests. */
request_capacity: schema.number({
// a nice round contrived number, feel free to change as we learn how it behaves
defaultValue: 10,
min: 1,
max: DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY,
}),
}),
event_loop_delay: eventLoopDelaySchema,
worker_utilization_running_average_window: schema.number({
defaultValue: DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW,
max: 100,
min: 1,
}),
/* These are not designed to be used by most users. Please use caution when changing these */
unsafe: schema.object({
exclude_task_types: schema.arrayOf(schema.string(), { defaultValue: [] }),
}),
},
{
validate: (config) => {
if (
config.monitored_stats_required_freshness &&
config.poll_interval &&
config.monitored_stats_required_freshness < config.poll_interval
) {
return `The specified monitored_stats_required_freshness (${config.monitored_stats_required_freshness}) is invalid, as it is below the poll_interval (${config.poll_interval})`;
}
},
}
);
export type TaskManagerConfig = TypeOf<typeof configSchema>;
export type TaskExecutionFailureThreshold = TypeOf<typeof taskExecutionFailureThresholdSchema>;
export type EventLoopDelayConfig = TypeOf<typeof eventLoopDelaySchema>;