mirror of
https://github.com/elastic/kibana.git
synced 2025-06-28 11:05:39 -04:00
Fix that gap can be stuck "in-progress" (#221473)
## Summary [[Issue](https://github.com/elastic/kibana/issues/221111)](https://github.com/elastic/kibana/issues/221111) Gaps can get stuck in the `in-progress` state if a rule is backfill-executed with failures. ### Current behavior: Let's say we have a gap from `12:00–13:00`. When the gap is initially detected, it has the following state: ``` filled_intervals: [] unfilled_intervals: [12:00–13:00] in_progress_intervals: [] ``` When a backfill starts, we set `in_progress_intervals` to the range that overlaps with the backfill. We also remove that range from `unfilled_intervals`: ``` filled_intervals: [] unfilled_intervals: [] in_progress_intervals: [12:00–13:00] ``` After the backfill is successfully executed, we move the range to `filled_intervals` and clear `in_progress_intervals`: ``` filled_intervals: [12:00–13:00] unfilled_intervals: [] in_progress_intervals: [] ``` However, if the backfill fails, we want to remove the range from `in_progress_intervals` and move it back to `unfilled_intervals`. The problem is that we cannot simply do this because there might be other overlapping backfills still in progress for the same gap. In the case of a successful execution, this isn’t an issue, as the range is moved to `filled_intervals`. When a backfill fails, we refetch all overlapping backfills for the gap to recalculate the `in_progress_intervals`. ### Problem In the current implementation, we're updating the gaps **before** deleting the failed backfill. This causes the recalculated `in_progress_intervals` to still include the failed backfill’s range, resulting in a stale state. ### Fix We should **first delete** the failed backfill, and **then** update the gap. This ensures that the recalculated `in_progress_intervals` reflect only the remaining active backfills. --------- Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
This commit is contained in:
parent
d9a3e1f400
commit
dfd783e12a
2 changed files with 15 additions and 10 deletions
|
@ -900,9 +900,14 @@ describe('Ad Hoc Task Runner', () => {
|
|||
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
|
||||
AD_HOC_RUN_SAVED_OBJECT_TYPE,
|
||||
'abc',
|
||||
{ refresh: false, namespace: undefined }
|
||||
{ refresh: true, namespace: undefined }
|
||||
);
|
||||
|
||||
// Verify that updateGaps was called after delete
|
||||
const deleteCallOrder = internalSavedObjectsRepository.delete.mock.invocationCallOrder[0];
|
||||
const updateGapsCallOrder = mockUpdateGaps.mock.invocationCallOrder[0];
|
||||
expect(updateGapsCallOrder).toBeGreaterThan(deleteCallOrder);
|
||||
|
||||
expect(mockUpdateGaps).toHaveBeenCalledWith({
|
||||
ruleId: RULE_ID,
|
||||
start: new Date(mockedAdHocRunSO.attributes.start),
|
||||
|
@ -981,7 +986,7 @@ describe('Ad Hoc Task Runner', () => {
|
|||
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
|
||||
AD_HOC_RUN_SAVED_OBJECT_TYPE,
|
||||
'abc',
|
||||
{ refresh: false, namespace: undefined }
|
||||
{ refresh: true, namespace: undefined }
|
||||
);
|
||||
|
||||
testAlertingEventLogCalls({
|
||||
|
@ -1043,7 +1048,7 @@ describe('Ad Hoc Task Runner', () => {
|
|||
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
|
||||
AD_HOC_RUN_SAVED_OBJECT_TYPE,
|
||||
'abc',
|
||||
{ refresh: false, namespace: undefined }
|
||||
{ refresh: true, namespace: undefined }
|
||||
);
|
||||
|
||||
testAlertingEventLogCalls({
|
||||
|
@ -1105,7 +1110,7 @@ describe('Ad Hoc Task Runner', () => {
|
|||
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
|
||||
AD_HOC_RUN_SAVED_OBJECT_TYPE,
|
||||
'abc',
|
||||
{ refresh: false, namespace: undefined }
|
||||
{ refresh: true, namespace: undefined }
|
||||
);
|
||||
|
||||
testAlertingEventLogCalls({
|
||||
|
@ -1169,7 +1174,7 @@ describe('Ad Hoc Task Runner', () => {
|
|||
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
|
||||
AD_HOC_RUN_SAVED_OBJECT_TYPE,
|
||||
'abc',
|
||||
{ refresh: false, namespace: undefined }
|
||||
{ refresh: true, namespace: undefined }
|
||||
);
|
||||
|
||||
testAlertingEventLogCalls({
|
||||
|
@ -1326,7 +1331,7 @@ describe('Ad Hoc Task Runner', () => {
|
|||
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
|
||||
AD_HOC_RUN_SAVED_OBJECT_TYPE,
|
||||
'abc',
|
||||
{ refresh: false, namespace: undefined }
|
||||
{ refresh: true, namespace: undefined }
|
||||
);
|
||||
|
||||
testAlertingEventLogCalls({
|
||||
|
@ -1475,7 +1480,7 @@ describe('Ad Hoc Task Runner', () => {
|
|||
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
|
||||
AD_HOC_RUN_SAVED_OBJECT_TYPE,
|
||||
mockedAdHocRunSO.id,
|
||||
{ namespace: undefined, refresh: false }
|
||||
{ namespace: undefined, refresh: true }
|
||||
);
|
||||
|
||||
testAlertingEventLogCalls({
|
||||
|
|
|
@ -625,17 +625,17 @@ export class AdHocTaskRunner implements CancellableTask {
|
|||
async cleanup() {
|
||||
if (!this.shouldDeleteTask) return;
|
||||
|
||||
await this.updateGapsAfterBackfillComplete();
|
||||
|
||||
try {
|
||||
await this.internalSavedObjectsRepository.delete(
|
||||
AD_HOC_RUN_SAVED_OBJECT_TYPE,
|
||||
this.taskInstance.params.adHocRunParamsId,
|
||||
{
|
||||
refresh: false,
|
||||
refresh: true,
|
||||
namespace: this.context.spaceIdToNamespace(this.taskInstance.params.spaceId),
|
||||
}
|
||||
);
|
||||
|
||||
await this.updateGapsAfterBackfillComplete();
|
||||
} catch (e) {
|
||||
// Log error only, we shouldn't fail the task because of an error here (if ever there's retry logic)
|
||||
this.logger.error(
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue