[ML] Fixes polling for blocked anomaly detection jobs (#178246)

Fixes two issues:
- When a job is in a blocked state (resetting, deleting, reverting) but
the underlying task [cannot be
found](https://github.com/elastic/elasticsearch/issues/105928), the task
polling fails to start correctly and instead enters a loop where the
tasks are checked as fast as possible.
- Some tasks can legitimately take a long time to run, but we still poll
at the same 2 second rate.

This PR fixes the feedback loop and adds a check so that, once a poll
has been running for over a minute, the poll interval is increased to 2
minutes.

Related to https://github.com/elastic/kibana/issues/171626
This commit is contained in:
James Gowdy 2024-03-12 15:17:54 +00:00 committed by GitHub
parent 834371ebb2
commit a66d42b50c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 31 additions and 11 deletions

View file

@ -7,6 +7,8 @@
// Timings for refreshing and polling the jobs list. All values are in milliseconds.
export const DEFAULT_REFRESH_INTERVAL_MS = 30_000;
export const MINIMUM_REFRESH_INTERVAL_MS = 1_000;
export const DELETING_JOBS_REFRESH_INTERVAL_MS = 2_000;
// Poll interval used while jobs are blocked.
export const BLOCKED_JOBS_REFRESH_INTERVAL_MS = 2_000;
// Slower poll interval (2 minutes), used once the threshold below has elapsed.
export const BLOCKED_JOBS_REFRESH_INTERVAL_SLOW_MS = 120_000;
// Time spent blocked (1 minute) after which polling switches to the slow interval.
export const BLOCKED_JOBS_REFRESH_THRESHOLD_MS = 60_000;
export const RESETTING_JOBS_REFRESH_INTERVAL_MS = 1_000;
export const PROGRESS_JOBS_REFRESH_INTERVAL_MS = 2_000;

View file

@ -24,7 +24,7 @@ import {
import { i18n } from '@kbn/i18n';
import { deleteJobs } from '../utils';
import { DELETING_JOBS_REFRESH_INTERVAL_MS } from '../../../../../../common/constants/jobs_list';
import { BLOCKED_JOBS_REFRESH_INTERVAL_MS } from '../../../../../../common/constants/jobs_list';
import { DeleteSpaceAwareItemCheckModal } from '../../../../components/delete_space_aware_item_check_modal';
import type { MlSummaryJob } from '../../../../../../common/types/anomaly_detection_jobs';
import { isManagedJob } from '../../../jobs_utils';
@ -91,7 +91,7 @@ export const DeleteJobModal: FC<Props> = ({ setShowFunction, unsetShowFunction,
setTimeout(() => {
closeModal();
refreshJobs();
}, DELETING_JOBS_REFRESH_INTERVAL_MS);
}, BLOCKED_JOBS_REFRESH_INTERVAL_MS);
}, [jobIds, deleteUserAnnotations, deleteAlertingRules, closeModal, refreshJobs]);
if (modalVisible === false || jobIds.length === 0) {

View file

@ -26,7 +26,11 @@ import { JobsAwaitingNodeWarning } from '../../../../components/jobs_awaiting_no
import { SavedObjectsWarning } from '../../../../components/saved_objects_warning';
import { UpgradeWarning } from '../../../../components/upgrade';
import { DELETING_JOBS_REFRESH_INTERVAL_MS } from '../../../../../../common/constants/jobs_list';
import {
BLOCKED_JOBS_REFRESH_INTERVAL_MS,
BLOCKED_JOBS_REFRESH_INTERVAL_SLOW_MS,
BLOCKED_JOBS_REFRESH_THRESHOLD_MS,
} from '../../../../../../common/constants/jobs_list';
import { JobListMlAnomalyAlertFlyout } from '../../../../../alerting/ml_alerting_flyout';
import { StopDatafeedsConfirmModal } from '../confirm_modals/stop_datafeeds_confirm_modal';
import { CloseJobsConfirmModal } from '../confirm_modals/close_jobs_confirm_modal';
@ -49,6 +53,7 @@ export class JobsListView extends Component {
itemIdToExpandedRowMap: {},
filterClauses: [],
blockingJobIds: [],
blockingJobsFirstFoundMs: null,
jobsAwaitingNodeCount: 0,
};
@ -350,14 +355,17 @@ export class JobsListView extends Component {
});
this.isDoneRefreshing();
if (
blockingJobsRefreshTimeout === null &&
jobsSummaryList.some((j) => j.blocked !== undefined)
) {
if (jobsSummaryList.some((j) => j.blocked !== undefined)) {
// if there are some jobs in a deleting state, start polling for
// deleting jobs so we can update the jobs list once the
// deleting tasks are over
this.checkBlockingJobTasks(true);
if (this.state.blockingJobsFirstFoundMs === null) {
// keep a record of when the first blocked job was found
this.setState({ blockingJobsFirstFoundMs: Date.now() });
}
} else {
this.setState({ blockingJobsFirstFoundMs: null });
}
} catch (error) {
console.error(error);
@ -366,7 +374,7 @@ export class JobsListView extends Component {
}
async checkBlockingJobTasks(forceRefresh = false) {
if (this._isMounted === false) {
if (this._isMounted === false || blockingJobsRefreshTimeout !== null) {
return;
}
@ -384,14 +392,24 @@ export class JobsListView extends Component {
this.refreshJobSummaryList();
}
if (blockingJobIds.length > 0 && blockingJobsRefreshTimeout === null) {
if (this.state.blockingJobsFirstFoundMs !== null || blockingJobIds.length > 0) {
blockingJobsRefreshTimeout = setTimeout(() => {
blockingJobsRefreshTimeout = null;
this.checkBlockingJobTasks();
}, DELETING_JOBS_REFRESH_INTERVAL_MS);
}, this.getBlockedJobsRefreshInterval());
}
}
getBlockedJobsRefreshInterval() {
const runningTimeMs = Date.now() - this.state.blockingJobsFirstFoundMs;
if (runningTimeMs > BLOCKED_JOBS_REFRESH_THRESHOLD_MS) {
// if the jobs have been in a blocked state for more than a minute
// increase the polling interval
return BLOCKED_JOBS_REFRESH_INTERVAL_SLOW_MS;
}
return BLOCKED_JOBS_REFRESH_INTERVAL_MS;
}
renderJobsListComponents() {
const { isRefreshing, loading, jobsSummaryList, jobsAwaitingNodeCount } = this.state;
const jobIds = jobsSummaryList.map((j) => j.id);