Fix that gap can be stuck "in-progress" (#221473)

## Summary


[[Issue](https://github.com/elastic/kibana/issues/221111)](https://github.com/elastic/kibana/issues/221111)

Gaps can get stuck in the `in-progress` state if a rule is
backfill-executed with failures.

### Current behavior:

Let's say we have a gap from `12:00–13:00`.

When the gap is initially detected, it has the following state:

```
filled_intervals: []
unfilled_intervals: [12:00–13:00]
in_progress_intervals: []
```

When a backfill starts, we set `in_progress_intervals` to the range that
overlaps with the backfill. We also remove that range from
`unfilled_intervals`:

```
filled_intervals: []
unfilled_intervals: []
in_progress_intervals: [12:00–13:00]
```

After the backfill is successfully executed, we move the range to
`filled_intervals` and clear `in_progress_intervals`:

```
filled_intervals: [12:00–13:00]
unfilled_intervals: []
in_progress_intervals: []
```

However, if the backfill fails, we want to remove the range from
`in_progress_intervals` and move it back to `unfilled_intervals`. The
problem is that we cannot simply do this because there might be other
overlapping backfills still in progress for the same gap. In the case of
a successful execution, this isn’t an issue, as the range is moved to
`filled_intervals`.

When a backfill fails, we refetch all overlapping backfills for the gap
to recalculate the `in_progress_intervals`.

### Problem

In the current implementation, we're updating the gaps **before**
deleting the failed backfill. This causes the recalculated
`in_progress_intervals` to still include the failed backfill’s range,
resulting in a stale state.

### Fix

We should **first delete** the failed backfill, and **then** update the
gap. This ensures that the recalculated `in_progress_intervals` reflect
only the remaining active backfills.

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
This commit is contained in:
Khristinin Nikita 2025-06-17 08:47:01 +02:00 committed by GitHub
parent d9a3e1f400
commit dfd783e12a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 15 additions and 10 deletions

View file

@ -900,9 +900,14 @@ describe('Ad Hoc Task Runner', () => {
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
'abc',
{ refresh: false, namespace: undefined }
{ refresh: true, namespace: undefined }
);
// Verify that updateGaps was called after delete
const deleteCallOrder = internalSavedObjectsRepository.delete.mock.invocationCallOrder[0];
const updateGapsCallOrder = mockUpdateGaps.mock.invocationCallOrder[0];
expect(updateGapsCallOrder).toBeGreaterThan(deleteCallOrder);
expect(mockUpdateGaps).toHaveBeenCalledWith({
ruleId: RULE_ID,
start: new Date(mockedAdHocRunSO.attributes.start),
@ -981,7 +986,7 @@ describe('Ad Hoc Task Runner', () => {
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
'abc',
{ refresh: false, namespace: undefined }
{ refresh: true, namespace: undefined }
);
testAlertingEventLogCalls({
@ -1043,7 +1048,7 @@ describe('Ad Hoc Task Runner', () => {
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
'abc',
{ refresh: false, namespace: undefined }
{ refresh: true, namespace: undefined }
);
testAlertingEventLogCalls({
@ -1105,7 +1110,7 @@ describe('Ad Hoc Task Runner', () => {
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
'abc',
{ refresh: false, namespace: undefined }
{ refresh: true, namespace: undefined }
);
testAlertingEventLogCalls({
@ -1169,7 +1174,7 @@ describe('Ad Hoc Task Runner', () => {
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
'abc',
{ refresh: false, namespace: undefined }
{ refresh: true, namespace: undefined }
);
testAlertingEventLogCalls({
@ -1326,7 +1331,7 @@ describe('Ad Hoc Task Runner', () => {
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
'abc',
{ refresh: false, namespace: undefined }
{ refresh: true, namespace: undefined }
);
testAlertingEventLogCalls({
@ -1475,7 +1480,7 @@ describe('Ad Hoc Task Runner', () => {
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
mockedAdHocRunSO.id,
{ namespace: undefined, refresh: false }
{ namespace: undefined, refresh: true }
);
testAlertingEventLogCalls({

View file

@ -625,17 +625,17 @@ export class AdHocTaskRunner implements CancellableTask {
async cleanup() {
if (!this.shouldDeleteTask) return;
await this.updateGapsAfterBackfillComplete();
try {
await this.internalSavedObjectsRepository.delete(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
this.taskInstance.params.adHocRunParamsId,
{
refresh: false,
refresh: true,
namespace: this.context.spaceIdToNamespace(this.taskInstance.params.spaceId),
}
);
await this.updateGapsAfterBackfillComplete();
} catch (e) {
// Log error only, we shouldn't fail the task because of an error here (if ever there's retry logic)
this.logger.error(