kibana/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts
Mike Côté cb2e28d1e4
Fix task manager polling flow controls (#153491)
Fixes https://github.com/elastic/kibana/issues/151938

In this PR, I'm re-writing the Task Manager poller so it doesn't run
concurrently when timeouts occur while also fixing the issue where
polling requests would pile up when polling takes time. To support this,
I've also made the following changes:
- Removed the observable monitor and the
`xpack.task_manager.max_poll_inactivity_cycles` setting
- Make the task store `search` and `updateByQuery` functions have no
retries. This prevents the request from retrying 5x whenever a timeout
occurs, causing each call to take up to 2 1/2 minutes before Kibana sees
the error (now down to 30s each). We have polling to manage retries in
these situations.
- Switch the task poller tests to use `sinon` for faking timers
- Removed the `assertStillInSetup` checks on plugin setup. They felt like
a maintenance burden that wasn't necessary to fix with my code changes.

The main code changes are within these files (to review thoroughly so
the polling cycle doesn't suddenly stop):
- x-pack/plugins/task_manager/server/polling/task_poller.ts
- x-pack/plugins/task_manager/server/polling_lifecycle.ts (easier to
review if you disregard whitespace `?w=1`)

## To verify
1. Tasks run normally (create a rule or something that goes through task
manager regularly).
2. When the update by query takes a while, the request is cancelled
after 30s or the time manually configured.
3. When the search for claimed tasks query takes a while, the request is
cancelled after 30s or the time manually configured.

**Tips:**
<details><summary>how to slow down search for claimed task
queries</summary>

```
diff --git a/x-pack/plugins/task_manager/server/queries/task_claiming.ts b/x-pack/plugins/task_manager/server/queries/task_claiming.ts
index 07042650a37..2caefd63672 100644
--- a/x-pack/plugins/task_manager/server/queries/task_claiming.ts
+++ b/x-pack/plugins/task_manager/server/queries/task_claiming.ts
@@ -247,7 +247,7 @@ export class TaskClaiming {
         taskTypes,
       });

-    const docs = tasksUpdated > 0 ? await this.sweepForClaimedTasks(taskTypes, size) : [];
+    const docs = await this.sweepForClaimedTasks(taskTypes, size);

     this.emitEvents(docs.map((doc) => asTaskClaimEvent(doc.id, asOk(doc))));

@@ -346,6 +346,13 @@ export class TaskClaiming {
       size,
       sort: SortByRunAtAndRetryAt,
       seq_no_primary_term: true,
+      aggs: {
+        delay: {
+          shard_delay: {
+            value: '40s',
+          },
+        },
+      },
     });

     return docs;
```
</details>

<details><summary>how to slow down update by queries</summary>
Not the cleanest way but you'll see occasional request timeouts from the
updateByQuery calls. I had more luck creating rules running every 1s.

```
diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts
index a06ee7b918a..07aa81e5388 100644
--- a/x-pack/plugins/task_manager/server/task_store.ts
+++ b/x-pack/plugins/task_manager/server/task_store.ts
@@ -126,6 +126,7 @@ export class TaskStore {
       // Timeouts are retried and make requests timeout after (requestTimeout * (1 + maxRetries))
       // The poller doesn't need retry logic because it will try again at the next polling cycle
       maxRetries: 0,
+      requestTimeout: 900,
     });
   }

@@ -458,6 +459,7 @@ export class TaskStore {
           ignore_unavailable: true,
           refresh: true,
           conflicts: 'proceed',
+          requests_per_second: 1,
           body: {
             ...opts,
             max_docs,
```
</details>

---------

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
2023-05-03 09:33:10 -04:00

401 lines
14 KiB
TypeScript

/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { Subject } from 'rxjs';
import { TaskLifecycleEvent } from './polling_lifecycle';
import { createInitialMiddleware } from './lib/middleware';
import { TaskTypeDictionary } from './task_type_dictionary';
import { mockLogger } from './test_utils';
import { asErr, asOk } from './lib/result_type';
import { FillPoolResult } from './lib/fill_pool';
import { EphemeralTaskLifecycle, EphemeralTaskLifecycleOpts } from './ephemeral_task_lifecycle';
import { v4 as uuidv4 } from 'uuid';
import { asTaskPollingCycleEvent, asTaskRunEvent, TaskPersistence } from './task_events';
import { TaskRunResult } from './task_running';
import { TaskPoolRunResult } from './task_pool';
import { TaskPoolMock } from './task_pool.mock';
import { executionContextServiceMock } from '@kbn/core/server/mocks';
import { taskManagerMock } from './mocks';
// Mock the concurrency allow-list so the `report` task type used in these
// tests is treated as a concurrency-limited task type.
jest.mock('./constants', () => ({
CONCURRENCY_ALLOW_LIST_BY_TASK_TYPE: ['report'],
}));
// Shared execution-context mock passed into every lifecycle instance under test.
const executionContext = executionContextServiceMock.createSetupContract();
describe('EphemeralTaskLifecycle', () => {
/**
 * Builds the options object passed to the EphemeralTaskLifecycle constructor,
 * along with the mocks and subjects the tests use to drive and observe it.
 *
 * @param config - partial overrides merged into the default task manager config
 * @param optOverrides - overrides for any other EphemeralTaskLifecycleOpts field
 * @returns the mocks (poolCapacity, lifecycleEvent$, pool,
 *          elasticsearchAndSOAvailability$) and the assembled opts
 */
function initTaskLifecycleParams({
  config,
  ...optOverrides
}: {
  config?: Partial<EphemeralTaskLifecycleOpts['config']>;
} & Partial<Omit<EphemeralTaskLifecycleOpts, 'config'>> = {}) {
  const taskManagerLogger = mockLogger();
  const poolCapacity = jest.fn();
  const pool = TaskPoolMock.create(poolCapacity);
  const lifecycleEvent$ = new Subject<TaskLifecycleEvent>();
  const elasticsearchAndSOAvailability$ = new Subject<boolean>();
  const opts: EphemeralTaskLifecycleOpts = {
    logger: taskManagerLogger,
    definitions: new TaskTypeDictionary(taskManagerLogger),
    executionContext,
    config: {
      max_workers: 10,
      max_attempts: 9,
      // Very long interval so polling never fires on its own during a test;
      // tests drive cycles explicitly through lifecycleEvent$.
      poll_interval: 6000000,
      version_conflict_threshold: 80,
      request_capacity: 1000,
      monitored_aggregated_stats_refresh_rate: 5000,
      monitored_stats_required_freshness: 5000,
      monitored_stats_running_average_window: 50,
      monitored_stats_health_verbose_log: {
        enabled: true,
        level: 'debug',
        warn_delayed_task_start_in_seconds: 60,
      },
      monitored_task_execution_thresholds: {
        default: {
          error_threshold: 90,
          warn_threshold: 80,
        },
        custom: {},
      },
      ephemeral_tasks: {
        enabled: true,
        request_capacity: 10,
      },
      unsafe: {
        exclude_task_types: [],
      },
      event_loop_delay: {
        monitor: true,
        warn_threshold: 5000,
      },
      worker_utilization_running_average_window: 5,
      ...config,
    },
    elasticsearchAndSOAvailability$,
    pool,
    lifecycleEvent: lifecycleEvent$,
    middleware: createInitialMiddleware(),
    ...optOverrides,
  };
  // `foo` matches the default task type produced by taskManagerMock.createTask().
  opts.definitions.registerTaskDefinitions({
    foo: {
      title: 'foo',
      createTaskRunner: jest.fn(),
    },
  });
  // mockResolvedValue already wraps its argument in a resolved promise, so the
  // previous `Promise.resolve(...)` wrapper was redundant; pass the value directly.
  pool.run.mockResolvedValue(TaskPoolRunResult.RunningAllClaimedTasks);
  return { poolCapacity, lifecycleEvent$, pool, elasticsearchAndSOAvailability$, opts };
}
describe('constructor', () => {
// When the ephemeral-task feature is disabled, attemptToRun always rejects.
// (Fixes the typo "unnecesery" in the reported test title.)
test('avoids unnecessary subscription if ephemeral tasks are disabled', () => {
  const { opts } = initTaskLifecycleParams({
    config: {
      ephemeral_tasks: {
        enabled: false,
        request_capacity: 10,
      },
    },
  });
  const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
  const task = taskManagerMock.createTask();
  // The rejected task is echoed back wrapped in an Err result.
  expect(ephemeralTaskLifecycle.attemptToRun(task)).toMatchObject(asErr(task));
});
test('queues up tasks when ephemeral tasks are enabled', () => {
const { opts } = initTaskLifecycleParams();
const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
const task = taskManagerMock.createTask();
expect(ephemeralTaskLifecycle.attemptToRun(task)).toMatchObject(asOk(task));
});
test('rejects tasks when ephemeral tasks are enabled and queue is full', () => {
const { opts } = initTaskLifecycleParams({
config: { ephemeral_tasks: { enabled: true, request_capacity: 2 } },
});
const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
const task = taskManagerMock.createTask();
expect(ephemeralTaskLifecycle.attemptToRun(task)).toMatchObject(asOk(task));
const task2 = taskManagerMock.createTask();
expect(ephemeralTaskLifecycle.attemptToRun(task2)).toMatchObject(asOk(task2));
const rejectedTask = taskManagerMock.createTask();
expect(ephemeralTaskLifecycle.attemptToRun(rejectedTask)).toMatchObject(asErr(rejectedTask));
});
test('pulls tasks off queue when a polling cycle completes', () => {
const { pool, poolCapacity, opts, lifecycleEvent$ } = initTaskLifecycleParams();
const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
const task = taskManagerMock.createTask({ id: `my-phemeral-task` });
expect(ephemeralTaskLifecycle.attemptToRun(task)).toMatchObject(asOk(task));
poolCapacity.mockReturnValue({
availableWorkers: 10,
});
lifecycleEvent$.next(
asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed }))
);
expect(pool.run).toHaveBeenCalledTimes(1);
const taskRunners = pool.run.mock.calls[0][0];
expect(taskRunners).toHaveLength(1);
expect(`${taskRunners[0]}`).toMatchInlineSnapshot(`"foo \\"my-phemeral-task\\" (Ephemeral)"`);
});
test('pulls tasks off queue when a task run completes', () => {
const { pool, poolCapacity, opts, lifecycleEvent$ } = initTaskLifecycleParams();
const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
const task = taskManagerMock.createTask({ id: `my-phemeral-task` });
expect(ephemeralTaskLifecycle.attemptToRun(task)).toMatchObject(asOk(task));
poolCapacity.mockReturnValue({
availableWorkers: 10,
});
lifecycleEvent$.next(
asTaskRunEvent(
uuidv4(),
asOk({
task: taskManagerMock.createTask(),
result: TaskRunResult.Success,
persistence: TaskPersistence.Ephemeral,
})
)
);
expect(pool.run).toHaveBeenCalledTimes(1);
const taskRunners = pool.run.mock.calls[0][0];
expect(taskRunners).toHaveLength(1);
expect(`${taskRunners[0]}`).toMatchInlineSnapshot(`"foo \\"my-phemeral-task\\" (Ephemeral)"`);
});
test('pulls as many tasks off queue as it has capacity for', () => {
const { pool, poolCapacity, opts, lifecycleEvent$ } = initTaskLifecycleParams();
const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
const tasks = [
taskManagerMock.createTask(),
taskManagerMock.createTask(),
taskManagerMock.createTask(),
];
expect(ephemeralTaskLifecycle.attemptToRun(tasks[0])).toMatchObject(asOk(tasks[0]));
expect(ephemeralTaskLifecycle.attemptToRun(tasks[1])).toMatchObject(asOk(tasks[1]));
expect(ephemeralTaskLifecycle.attemptToRun(tasks[2])).toMatchObject(asOk(tasks[2]));
poolCapacity.mockReturnValue({
availableWorkers: 2,
});
lifecycleEvent$.next(
asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed }))
);
expect(pool.run).toHaveBeenCalledTimes(1);
const taskRunners = pool.run.mock.calls[0][0];
expect(taskRunners).toHaveLength(2);
expect(`${taskRunners[0]}`).toEqual(`foo "${tasks[0].id}" (Ephemeral)`);
expect(`${taskRunners[1]}`).toEqual(`foo "${tasks[1].id}" (Ephemeral)`);
});
test('pulls only as many tasks of the same type as is allowed by maxConcurrency', () => {
const { pool, poolCapacity, opts, lifecycleEvent$ } = initTaskLifecycleParams();
opts.definitions.registerTaskDefinitions({
report: {
title: 'report',
maxConcurrency: 1,
createTaskRunner: jest.fn(),
},
});
const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
const firstLimitedTask = taskManagerMock.createTask({ taskType: 'report' });
const secondLimitedTask = taskManagerMock.createTask({ taskType: 'report' });
// both are queued
expect(ephemeralTaskLifecycle.attemptToRun(firstLimitedTask)).toMatchObject(
asOk(firstLimitedTask)
);
expect(ephemeralTaskLifecycle.attemptToRun(secondLimitedTask)).toMatchObject(
asOk(secondLimitedTask)
);
// pool has capacity for both
poolCapacity.mockReturnValue({
availableWorkers: 10,
});
pool.getOccupiedWorkersByType.mockReturnValue(0);
lifecycleEvent$.next(
asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed }))
);
expect(pool.run).toHaveBeenCalledTimes(1);
const taskRunners = pool.run.mock.calls[0][0];
expect(taskRunners).toHaveLength(1);
expect(`${taskRunners[0]}`).toEqual(`report "${firstLimitedTask.id}" (Ephemeral)`);
});
test('when pulling tasks from the queue, it takes into account the maxConcurrency of tasks that are already in the pool', () => {
const { pool, poolCapacity, opts, lifecycleEvent$ } = initTaskLifecycleParams();
opts.definitions.registerTaskDefinitions({
report: {
title: 'report',
maxConcurrency: 1,
createTaskRunner: jest.fn(),
},
});
const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
const firstLimitedTask = taskManagerMock.createTask({ taskType: 'report' });
const secondLimitedTask = taskManagerMock.createTask({ taskType: 'report' });
// both are queued
expect(ephemeralTaskLifecycle.attemptToRun(firstLimitedTask)).toMatchObject(
asOk(firstLimitedTask)
);
expect(ephemeralTaskLifecycle.attemptToRun(secondLimitedTask)).toMatchObject(
asOk(secondLimitedTask)
);
// pool has capacity in general
poolCapacity.mockReturnValue({
availableWorkers: 2,
});
// but when we ask how many it has occupied by type - wee always have one worker already occupied by that type
pool.getOccupiedWorkersByType.mockReturnValue(1);
lifecycleEvent$.next(
asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed }))
);
expect(pool.run).toHaveBeenCalledTimes(0);
// now we release the worker in the pool and cause another cycle in the epheemral queue
pool.getOccupiedWorkersByType.mockReturnValue(0);
lifecycleEvent$.next(
asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed }))
);
expect(pool.run).toHaveBeenCalledTimes(1);
const taskRunners = pool.run.mock.calls[0][0];
expect(taskRunners).toHaveLength(1);
expect(`${taskRunners[0]}`).toEqual(`report "${firstLimitedTask.id}" (Ephemeral)`);
});
});
test('pulls tasks with both maxConcurrency and unlimited concurrency', () => {
const { pool, poolCapacity, opts, lifecycleEvent$ } = initTaskLifecycleParams();
opts.definitions.registerTaskDefinitions({
report: {
title: 'report',
maxConcurrency: 1,
createTaskRunner: jest.fn(),
},
});
const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
const fooTasks = [
taskManagerMock.createTask(),
taskManagerMock.createTask(),
taskManagerMock.createTask(),
];
expect(ephemeralTaskLifecycle.attemptToRun(fooTasks[0])).toMatchObject(asOk(fooTasks[0]));
const firstLimitedTask = taskManagerMock.createTask({ taskType: 'report' });
expect(ephemeralTaskLifecycle.attemptToRun(firstLimitedTask)).toMatchObject(
asOk(firstLimitedTask)
);
expect(ephemeralTaskLifecycle.attemptToRun(fooTasks[1])).toMatchObject(asOk(fooTasks[1]));
const secondLimitedTask = taskManagerMock.createTask({ taskType: 'report' });
expect(ephemeralTaskLifecycle.attemptToRun(secondLimitedTask)).toMatchObject(
asOk(secondLimitedTask)
);
expect(ephemeralTaskLifecycle.attemptToRun(fooTasks[2])).toMatchObject(asOk(fooTasks[2]));
// pool has capacity for all
poolCapacity.mockReturnValue({
availableWorkers: 10,
});
pool.getOccupiedWorkersByType.mockReturnValue(0);
lifecycleEvent$.next(asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed })));
expect(pool.run).toHaveBeenCalledTimes(1);
const taskRunners = pool.run.mock.calls[0][0];
expect(taskRunners).toHaveLength(4);
const asStrings = taskRunners.map((taskRunner) => `${taskRunner}`);
expect(asStrings).toContain(`foo "${fooTasks[0].id}" (Ephemeral)`);
expect(asStrings).toContain(`report "${firstLimitedTask.id}" (Ephemeral)`);
expect(asStrings).toContain(`foo "${fooTasks[1].id}" (Ephemeral)`);
expect(asStrings).toContain(`foo "${fooTasks[2].id}" (Ephemeral)`);
});
test('properly removes from the queue after pulled', () => {
const { poolCapacity, opts, lifecycleEvent$ } = initTaskLifecycleParams();
const ephemeralTaskLifecycle = new EphemeralTaskLifecycle(opts);
const tasks = [
taskManagerMock.createTask(),
taskManagerMock.createTask(),
taskManagerMock.createTask(),
];
expect(ephemeralTaskLifecycle.attemptToRun(tasks[0])).toMatchObject(asOk(tasks[0]));
expect(ephemeralTaskLifecycle.attemptToRun(tasks[1])).toMatchObject(asOk(tasks[1]));
expect(ephemeralTaskLifecycle.attemptToRun(tasks[2])).toMatchObject(asOk(tasks[2]));
expect(ephemeralTaskLifecycle.queuedTasks).toBe(3);
poolCapacity.mockReturnValue({
availableWorkers: 1,
});
lifecycleEvent$.next(asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed })));
expect(ephemeralTaskLifecycle.queuedTasks).toBe(2);
poolCapacity.mockReturnValue({
availableWorkers: 1,
});
lifecycleEvent$.next(asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed })));
expect(ephemeralTaskLifecycle.queuedTasks).toBe(1);
poolCapacity.mockReturnValue({
availableWorkers: 1,
});
lifecycleEvent$.next(asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed })));
expect(ephemeralTaskLifecycle.queuedTasks).toBe(0);
});
});