Log pending cluster state tasks after cluster cleanup timeout (#119186)

The Cluster Health API call in the `ESRestTestCase` cleanup waits for
all cluster state tasks to be finished. We frequently see that API call
timing out with plenty of tasks still in the queue. With this commit, we
retrieve the pending tasks and log them - only if the Cluster Health API
call timed out. This will hopefully give some more insight into what
(kind of) tasks are still pending when the API call times out.
This commit is contained in:
Niels Bauman 2024-12-23 17:10:49 +01:00 committed by GitHub
parent d92233c346
commit e01d956dc4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -852,8 +852,7 @@ public abstract class ESRestTestCase extends ESTestCase {
} }
private void wipeCluster() throws Exception { private void wipeCluster() throws Exception {
logger.info("Waiting for all cluster updates up to this moment to be processed"); waitForClusterUpdates();
assertOK(adminClient().performRequest(new Request("GET", "_cluster/health?wait_for_events=languid")));
// Cleanup rollup before deleting indices. A rollup job might have bulks in-flight, // Cleanup rollup before deleting indices. A rollup job might have bulks in-flight,
// so we need to fully shut them down first otherwise a job might stall waiting // so we need to fully shut them down first otherwise a job might stall waiting
@ -991,6 +990,38 @@ public abstract class ESRestTestCase extends ESTestCase {
deleteAllNodeShutdownMetadata(); deleteAllNodeShutdownMetadata();
} }
private void waitForClusterUpdates() throws Exception {
logger.info("Waiting for all cluster updates up to this moment to be processed");
try {
assertOK(adminClient().performRequest(new Request("GET", "_cluster/health?wait_for_events=languid")));
} catch (ResponseException e) {
if (e.getResponse().getStatusLine().getStatusCode() == HttpStatus.SC_REQUEST_TIMEOUT) {
final var pendingTasks = getPendingClusterStateTasks();
if (pendingTasks != null) {
logger.error("Timed out waiting for cluster updates to be processed, {}", pendingTasks);
}
}
throw e;
}
}
private static String getPendingClusterStateTasks() {
try {
Response response = adminClient().performRequest(new Request("GET", "/_cluster/pending_tasks"));
List<?> tasks = (List<?>) entityAsMap(response).get("tasks");
if (false == tasks.isEmpty()) {
StringBuilder message = new StringBuilder("there are still running tasks:");
for (Object task : tasks) {
message.append('\n').append(task.toString());
}
return message.toString();
}
} catch (IOException e) {
fail(e, "Failed to retrieve pending tasks in the cluster during cleanup");
}
return null;
}
/** /**
* This method checks whether ILM policies or templates get recreated after they have been deleted. If so, we are probably deleting * This method checks whether ILM policies or templates get recreated after they have been deleted. If so, we are probably deleting
* them unnecessarily, potentially causing test performance problems. This could happen for example if someone adds a new standard ILM * them unnecessarily, potentially causing test performance problems. This could happen for example if someone adds a new standard ILM
@ -1461,18 +1492,9 @@ public abstract class ESRestTestCase extends ESTestCase {
*/ */
private static void waitForClusterStateUpdatesToFinish() throws Exception { private static void waitForClusterStateUpdatesToFinish() throws Exception {
assertBusy(() -> { assertBusy(() -> {
try { final var pendingTasks = getPendingClusterStateTasks();
Response response = adminClient().performRequest(new Request("GET", "/_cluster/pending_tasks")); if (pendingTasks != null) {
List<?> tasks = (List<?>) entityAsMap(response).get("tasks"); fail(pendingTasks);
if (false == tasks.isEmpty()) {
StringBuilder message = new StringBuilder("there are still running tasks:");
for (Object task : tasks) {
message.append('\n').append(task.toString());
}
fail(message.toString());
}
} catch (IOException e) {
fail("cannot get cluster's pending tasks: " + e.getMessage());
} }
}, 30, TimeUnit.SECONDS); }, 30, TimeUnit.SECONDS);
} }