[8.x] Add more logs to Task Manager poller (#194741) (#194865)

# Backport

This will backport the following commits from `main` to `8.x`:
- [Add more logs to Task Manager poller
(#194741)](https://github.com/elastic/kibana/pull/194741)

<!--- Backport version: 9.4.3 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"Mike
Côté","email":"mikecote@users.noreply.github.com"},"sourceCommit":{"committedDate":"2024-10-03T17:12:22Z","message":"Add
more logs to Task Manager poller (#194741)\n\nIn this PR, I'm adding a
few more logs to the task poller to indicate\r\ncritical events to the
task poller.\r\n\r\n## To verify\r\n1. Startup Elasticsearch and Kibana
(and ensure Elasticsearch data is\r\npersisted somewhere `yarn es
snapshot -E path.data=...`)\r\n2. Observe the `Starting the task poller`
message on startup\r\n3. Shut down Elasticsearch\r\n4. Observe the
following messages:\r\n- `Stopping the task poller because Elasticsearch
and/or saved-objects\r\nservice became unavailable`\r\n - `Stopping the
task poller`\r\n - `Task poller finished running its last cycle`\r\n5.
Startup Elasticsearch again\r\n6. Wait a while and observe the `Starting
the task poller`
message","sha":"790c5ce7c01fabde140675a24fd852d3f86f7ebc","branchLabelMapping":{"^v9.0.0$":"main","^v8.16.0$":"8.x","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:skip","Feature:Task
Manager","Team:ResponseOps","v9.0.0","backport:prev-minor","v8.16.0"],"title":"Add
more logs to Task Manager
poller","number":194741,"url":"https://github.com/elastic/kibana/pull/194741","mergeCommit":{"message":"Add
more logs to Task Manager poller (#194741)\n\nIn this PR, I'm adding a
few more logs to the task poller to indicate\r\ncritical events to the
task poller.\r\n\r\n## To verify\r\n1. Startup Elasticsearch and Kibana
(and ensure Elasticsearch data is\r\npersisted somewhere `yarn es
snapshot -E path.data=...`)\r\n2. Observe the `Starting the task poller`
message on startup\r\n3. Shut down Elasticsearch\r\n4. Observe the
following messages:\r\n- `Stopping the task poller because Elasticsearch
and/or saved-objects\r\nservice became unavailable`\r\n - `Stopping the
task poller`\r\n - `Task poller finished running its last cycle`\r\n5.
Startup Elasticsearch again\r\n6. Wait a while and observe the `Starting
the task poller`
message","sha":"790c5ce7c01fabde140675a24fd852d3f86f7ebc"}},"sourceBranch":"main","suggestedTargetBranches":["8.x"],"targetPullRequestStates":[{"branch":"main","label":"v9.0.0","branchLabelMappingKey":"^v9.0.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/194741","number":194741,"mergeCommit":{"message":"Add
more logs to Task Manager poller (#194741)\n\nIn this PR, I'm adding a
few more logs to the task poller to indicate\r\ncritical events to the
task poller.\r\n\r\n## To verify\r\n1. Startup Elasticsearch and Kibana
(and ensure Elasticsearch data is\r\npersisted somewhere `yarn es
snapshot -E path.data=...`)\r\n2. Observe the `Starting the task poller`
message on startup\r\n3. Shut down Elasticsearch\r\n4. Observe the
following messages:\r\n- `Stopping the task poller because Elasticsearch
and/or saved-objects\r\nservice became unavailable`\r\n - `Stopping the
task poller`\r\n - `Task poller finished running its last cycle`\r\n5.
Startup Elasticsearch again\r\n6. Wait a while and observe the `Starting
the task poller`
message","sha":"790c5ce7c01fabde140675a24fd852d3f86f7ebc"}},{"branch":"8.x","label":"v8.16.0","branchLabelMappingKey":"^v8.16.0$","isSourceBranch":false,"state":"NOT_CREATED"}]}]
BACKPORT-->

Co-authored-by: Mike Côté <mikecote@users.noreply.github.com>
Kibana Machine 2024-10-04 04:55:31 +10:00 committed by GitHub
parent 6a782989f8
commit e422533e3d
4 changed files with 36 additions and 13 deletions

View file

@@ -239,13 +239,14 @@ describe('TaskPoller', () => {
const pollInterval = 100;
const handler = jest.fn();
const workError = new Error('failed to work');
const poller = createTaskPoller<string, string[]>({
initialPollInterval: pollInterval,
logger: loggingSystemMock.create().get(),
pollInterval$: of(pollInterval),
pollIntervalDelay$: of(0),
work: async (...args) => {
throw new Error('failed to work');
throw workError;
},
getCapacity: () => 5,
});
@@ -256,12 +257,13 @@ describe('TaskPoller', () => {
await new Promise((resolve) => setImmediate(resolve));
const expectedError = new PollingError<string>(
'Failed to poll for work: Error: failed to work',
'Failed to poll for work: failed to work',
PollingErrorType.WorkError,
none
);
expect(handler).toHaveBeenCalledWith(asErr(expectedError));
expect(handler.mock.calls[0][0].error.type).toEqual(PollingErrorType.WorkError);
expect(handler.mock.calls[0][0].error.stack).toContain(workError.stack);
});
test('continues polling after work fails', async () => {
@@ -269,10 +271,11 @@ describe('TaskPoller', () => {
const handler = jest.fn();
let callCount = 0;
const workError = new Error('failed to work');
const work = jest.fn(async () => {
callCount++;
if (callCount === 2) {
throw new Error('failed to work');
throw workError;
}
return callCount;
});
@@ -296,12 +299,13 @@ describe('TaskPoller', () => {
await new Promise((resolve) => setImmediate(resolve));
const expectedError = new PollingError<string>(
'Failed to poll for work: Error: failed to work',
'Failed to poll for work: failed to work',
PollingErrorType.WorkError,
none
);
expect(handler).toHaveBeenCalledWith(asErr(expectedError));
expect(handler.mock.calls[1][0].error.type).toEqual(PollingErrorType.WorkError);
expect(handler.mock.calls[1][0].error.stack).toContain(workError.stack);
expect(handler).not.toHaveBeenCalledWith(asOk(2));
clock.tick(pollInterval);
@@ -342,7 +346,7 @@ describe('TaskPoller', () => {
await new Promise((resolve) => setImmediate(resolve));
const expectedError = new PollingError<string>(
'Failed to poll for work: Error: error getting capacity',
'Failed to poll for work: error getting capacity',
PollingErrorType.WorkError,
none
);

View file

@@ -65,6 +65,8 @@ export function createTaskPoller<T, H>({
if (hasCapacity()) {
const result = await work();
subject.next(asOk(result));
} else {
logger.debug('Skipping polling cycle because there is no capacity available');
}
} catch (e) {
subject.next(asPollingError<T>(e, PollingErrorType.WorkError));
@@ -73,11 +75,16 @@ export function createTaskPoller<T, H>({
if (running) {
// Set the next runCycle call
timeoutId = setTimeout(
() => runCycle().catch(() => {}),
() =>
runCycle().catch((e) => {
subject.next(asPollingError(e, PollingErrorType.PollerError));
}),
Math.max(pollInterval - (Date.now() - start) + (pollIntervalDelay % pollInterval), 0)
);
// Reset delay, it's designed to shuffle only once
pollIntervalDelay = 0;
} else {
logger.info('Task poller finished running its last cycle');
}
}
@@ -113,14 +120,18 @@ export function createTaskPoller<T, H>({
events$: subject,
start: () => {
if (!running) {
logger.info('Starting the task poller');
running = true;
runCycle().catch(() => {});
runCycle().catch((e) => {
subject.next(asPollingError(e, PollingErrorType.PollerError));
});
// We need to subscribe shortly after start. Otherwise, the observables start emitting events
// too soon for the task run statistics module to capture.
setTimeout(() => subscribe(), 0);
}
},
stop: () => {
logger.info('Stopping the task poller');
if (timeoutId) {
clearTimeout(timeoutId);
timeoutId = null;
@@ -134,21 +145,25 @@ export enum PollingErrorType {
WorkError,
WorkTimeout,
RequestCapacityReached,
PollerError,
}
function asPollingError<T>(err: string | Error, type: PollingErrorType, data: Option<T> = none) {
return asErr(new PollingError<T>(`Failed to poll for work: ${err}`, type, data));
function asPollingError<T>(err: Error, type: PollingErrorType, data: Option<T> = none) {
return asErr(new PollingError<T>(`Failed to poll for work: ${err.message}`, type, data, err));
}
export class PollingError<T> extends Error {
public readonly type: PollingErrorType;
public readonly data: Option<T>;
public readonly source: TaskErrorSource;
constructor(message: string, type: PollingErrorType, data: Option<T>) {
super(message);
constructor(message: string, type: PollingErrorType, data: Option<T>, cause?: Error) {
super(message, { cause });
Object.setPrototypeOf(this, new.target.prototype);
this.type = type;
this.data = data;
this.source = TaskErrorSource.FRAMEWORK;
if (cause) {
this.stack = `${this.stack}\nCaused by:\n${cause.stack}`;
}
}
}
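
To make the stack-chaining behaviour of the updated `PollingError` easier to follow outside the Kibana codebase, here is a simplified, self-contained sketch. The `Option`/`none` stand-ins and the omission of `TaskErrorSource` are simplifications for brevity, not the actual module; the real class lives alongside `createTaskPoller` above.

```ts
// Simplified stand-ins so the snippet runs on its own; the real code uses
// fp-ts Option and TaskErrorSource from the task manager plugin.
type Option<T> = { tag: 'none' } | { tag: 'some'; value: T };
const none: Option<never> = { tag: 'none' };

enum PollingErrorType {
  WorkError,
  WorkTimeout,
  RequestCapacityReached,
  PollerError,
}

class PollingError<T> extends Error {
  public readonly type: PollingErrorType;
  public readonly data: Option<T>;

  constructor(message: string, type: PollingErrorType, data: Option<T>, cause?: Error) {
    super(message, { cause });
    Object.setPrototypeOf(this, new.target.prototype);
    this.type = type;
    this.data = data;
    if (cause) {
      // Append the original error's stack so sinks that only keep
      // `error.stack` still show where the work actually failed.
      this.stack = `${this.stack}\nCaused by:\n${cause.stack}`;
    }
  }
}

// Mirrors the updated asPollingError helper: only the cause's message ends up
// in the wrapper's message, while the full stack travels via `cause`.
const workError = new Error('failed to work');
const wrapped = new PollingError(
  `Failed to poll for work: ${workError.message}`,
  PollingErrorType.WorkError,
  none,
  workError
);

console.log(wrapped.message); // Failed to poll for work: failed to work
console.log(wrapped.stack?.includes(workError.stack ?? '')); // true
```

This matches the updated test assertions above, which check both the shortened message (no more `Error:` prefix) and that the wrapped error's stack contains the original work error's stack.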

View file

@@ -515,9 +515,10 @@ describe('TaskPollingLifecycle', () => {
(event: TaskLifecycleEvent) => event.type === TaskEventType.TASK_POLLING_CYCLE
);
expect(pollingCycleEvent!.event.tag).toEqual('err');
expect(pollingCycleEvent!.event).toEqual({
tag: 'err',
error: new Error(`Failed to poll for work: Error: booo`),
error: new Error(`Failed to poll for work: booo`),
});
});

View file

@@ -199,6 +199,9 @@ export class TaskPollingLifecycle implements ITaskEventEmitter<TaskLifecycleEven
// start polling for work
poller.start();
} else if (!areESAndSOAvailable) {
this.logger.info(
`Stopping the task poller because Elasticsearch and/or saved-objects service became unavailable`
);
poller.stop();
this.pool.cancelRunningTasks();
}
@@ -285,7 +288,7 @@ export class TaskPollingLifecycle implements ITaskEventEmitter<TaskLifecycleEven
mapOptional((id) => this.emitEvent(asTaskRunRequestEvent(id, asErr(error))))
);
}
this.logger.error(error.message);
this.logger.error(error.message, { error: { stack_trace: error.stack } });
// Emit event indicating task manager utilization % at the end of a polling cycle
// Because there was a polling error, no tasks were claimed so this represents the number of workers busy
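
As a rough illustration of the last hunk, the second argument to `logger.error` carries the stack as structured metadata. The `Logger` and `LogMeta` shapes below are local stand-ins assumed for the sketch (Kibana's real logging types are richer); the point is that the stack, which now includes the `Caused by:` section from the `PollingError` change above, reaches the log backend alongside the message.

```ts
// Local stand-ins, assumed for illustration only; Kibana's Logger and LogMeta
// interfaces are richer than this.
interface LogMeta {
  error?: { stack_trace?: string };
}
interface Logger {
  error(message: string, meta?: LogMeta): void;
}

const logger: Logger = {
  error: (message, meta) => console.error(message, meta?.error?.stack_trace ?? ''),
};

// With the PollingError change above, `error.stack` already contains the
// "Caused by:" section from the underlying work error, so passing it as
// structured metadata surfaces the original failure in the logs as well.
const error = new Error('Failed to poll for work: failed to work');
logger.error(error.message, { error: { stack_trace: error.stack } });
```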