diff --git a/docs/changelog/118016.yaml b/docs/changelog/118016.yaml new file mode 100644 index 000000000000..7ee78b901b19 --- /dev/null +++ b/docs/changelog/118016.yaml @@ -0,0 +1,6 @@ +pr: 118016 +summary: Propagate status codes from shard failures appropriately +area: Search +type: enhancement +issues: + - 118482 diff --git a/server/src/main/java/org/elasticsearch/action/search/SearchPhaseExecutionException.java b/server/src/main/java/org/elasticsearch/action/search/SearchPhaseExecutionException.java index 2d798f2da3a4..fc79e85f9326 100644 --- a/server/src/main/java/org/elasticsearch/action/search/SearchPhaseExecutionException.java +++ b/server/src/main/java/org/elasticsearch/action/search/SearchPhaseExecutionException.java @@ -73,15 +73,21 @@ public class SearchPhaseExecutionException extends ElasticsearchException { // on coordinator node. so get the status from cause instead of returning SERVICE_UNAVAILABLE blindly return getCause() == null ? RestStatus.SERVICE_UNAVAILABLE : ExceptionsHelper.status(getCause()); } - RestStatus status = shardFailures[0].status(); - if (shardFailures.length > 1) { - for (int i = 1; i < shardFailures.length; i++) { - if (shardFailures[i].status().getStatus() >= RestStatus.INTERNAL_SERVER_ERROR.getStatus()) { - status = shardFailures[i].status(); - } + RestStatus status = null; + for (ShardSearchFailure shardFailure : shardFailures) { + RestStatus shardStatus = shardFailure.status(); + int statusCode = shardStatus.getStatus(); + + // Return if it's an error that can be retried. + // These currently take precedence over other status code(s). + if (statusCode >= 502 && statusCode <= 504) { + return shardStatus; + } else if (statusCode >= 500) { + status = shardStatus; } } - return status; + + return status == null ? shardFailures[0].status() : status; } public ShardSearchFailure[] shardFailures() { diff --git a/server/src/test/java/org/elasticsearch/action/search/SearchPhaseExecutionExceptionTests.java b/server/src/test/java/org/elasticsearch/action/search/SearchPhaseExecutionExceptionTests.java index 50ce97a2b73a..3e483b1f2cae 100644 --- a/server/src/test/java/org/elasticsearch/action/search/SearchPhaseExecutionExceptionTests.java +++ b/server/src/test/java/org/elasticsearch/action/search/SearchPhaseExecutionExceptionTests.java @@ -22,6 +22,7 @@ import org.elasticsearch.indices.InvalidIndexTemplateException; import org.elasticsearch.rest.RestStatus; import org.elasticsearch.search.SearchShardTarget; import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.transport.NodeDisconnectedException; import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContent; import org.elasticsearch.xcontent.XContentParser; @@ -31,6 +32,7 @@ import java.io.IOException; import static org.hamcrest.CoreMatchers.hasItem; import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.is; public class SearchPhaseExecutionExceptionTests extends ESTestCase { @@ -168,4 +170,57 @@ public class SearchPhaseExecutionExceptionTests extends ESTestCase { assertEquals(actual.status(), RestStatus.BAD_REQUEST); } + + public void testOnlyWithCodesThatDoNotRequirePrecedence() { + int pickedIndex = randomIntBetween(0, 1); + + // Pick one of these exceptions randomly. + var searchExceptions = new ElasticsearchException[] { + new ElasticsearchException("simulated"), + new NodeDisconnectedException(null, "unused message", "unused action", null) }; + + // Status codes that map to searchExceptions. + var expectedStatusCodes = new RestStatus[] { RestStatus.INTERNAL_SERVER_ERROR, RestStatus.BAD_GATEWAY }; + + ShardSearchFailure shardFailure1 = new ShardSearchFailure( + searchExceptions[pickedIndex], + new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null) + ); + + ShardSearchFailure shardFailure2 = new ShardSearchFailure( + searchExceptions[pickedIndex], + new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null) + ); + + SearchPhaseExecutionException ex = new SearchPhaseExecutionException( + "search", + "all shards failed", + new ShardSearchFailure[] { shardFailure1, shardFailure2 } + ); + + assertThat(ex.status(), is(expectedStatusCodes[pickedIndex])); + } + + public void testWithRetriableCodesThatTakePrecedence() { + // Maps to a 500. + ShardSearchFailure shardFailure1 = new ShardSearchFailure( + new ElasticsearchException("simulated"), + new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null) + ); + + // Maps to a 502. + ShardSearchFailure shardFailure2 = new ShardSearchFailure( + new NodeDisconnectedException(null, "unused message", "unused action", null), + new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null) + ); + + SearchPhaseExecutionException ex = new SearchPhaseExecutionException( + "search", + "all shards failed", + new ShardSearchFailure[] { shardFailure1, shardFailure2 } + ); + + // The 502 takes precedence over 500. + assertThat(ex.status(), is(RestStatus.BAD_GATEWAY)); + } }