Propagate status codes from shard failures appropriately (#118016)

* Return 502 if the underlying error is `NodeNotConnectedException`

* Traverse through the cause stack trace and check for `NodeNotConnectedException` and undo `status()` override in `NodeDisconnectedException`

* Rewrite `while` condition

* Fix: precommit

* Let status codes propagate rather than walking the stacktrace explicitly

Walking the stacktrace explicitly and looking for a specific error (node
connection-related errors in this case) is a workaround rather than a
proper fix. Instead, let the status codes propagate all the way to the
top so that they can be reported as-is.

* Fix: unused import

* Fix null deref

* Do not map descendants of `ConnectTransportException` to `502`

* Fix: precommit

* Do not account for 4xx status codes

4xx status codes are not likely to appear along with 5xx status codes.
As a result, we do not need to account for them when looking at shard
failures' status codes.

* Remove unnecessary `switch` case

In reference to the previous commit: this case is no longer needed.

* Rewrite code comment

* Address review comments

* [CI] Auto commit changes from spotless

* Update docs/changelog/118016.yaml

---------

Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
This commit is contained in:
Pawan Kartik 2024-12-23 09:19:38 +00:00 committed by GitHub
parent c662aecac3
commit 22990df7f2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 74 additions and 7 deletions

View file

@ -0,0 +1,6 @@
pr: 118016
summary: Propagate status codes from shard failures appropriately
area: Search
type: enhancement
issues:
- 118482

View file

@ -73,15 +73,21 @@ public class SearchPhaseExecutionException extends ElasticsearchException {
// on coordinator node. so get the status from cause instead of returning SERVICE_UNAVAILABLE blindly
return getCause() == null ? RestStatus.SERVICE_UNAVAILABLE : ExceptionsHelper.status(getCause());
}
RestStatus status = shardFailures[0].status();
if (shardFailures.length > 1) {
for (int i = 1; i < shardFailures.length; i++) {
if (shardFailures[i].status().getStatus() >= RestStatus.INTERNAL_SERVER_ERROR.getStatus()) {
status = shardFailures[i].status();
}
RestStatus status = null;
for (ShardSearchFailure shardFailure : shardFailures) {
RestStatus shardStatus = shardFailure.status();
int statusCode = shardStatus.getStatus();
// Return if it's an error that can be retried.
// These currently take precedence over other status code(s).
if (statusCode >= 502 && statusCode <= 504) {
return shardStatus;
} else if (statusCode >= 500) {
status = shardStatus;
}
}
return status;
return status == null ? shardFailures[0].status() : status;
}
public ShardSearchFailure[] shardFailures() {

View file

@ -22,6 +22,7 @@ import org.elasticsearch.indices.InvalidIndexTemplateException;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchShardTarget;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.transport.NodeDisconnectedException;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.XContent;
import org.elasticsearch.xcontent.XContentParser;
@ -31,6 +32,7 @@ import java.io.IOException;
import static org.hamcrest.CoreMatchers.hasItem;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
public class SearchPhaseExecutionExceptionTests extends ESTestCase {
@ -168,4 +170,57 @@ public class SearchPhaseExecutionExceptionTests extends ESTestCase {
assertEquals(actual.status(), RestStatus.BAD_REQUEST);
}
public void testOnlyWithCodesThatDoNotRequirePrecedence() {
int pickedIndex = randomIntBetween(0, 1);
// Pick one of these exceptions randomly.
var searchExceptions = new ElasticsearchException[] {
new ElasticsearchException("simulated"),
new NodeDisconnectedException(null, "unused message", "unused action", null) };
// Status codes that map to searchExceptions.
var expectedStatusCodes = new RestStatus[] { RestStatus.INTERNAL_SERVER_ERROR, RestStatus.BAD_GATEWAY };
ShardSearchFailure shardFailure1 = new ShardSearchFailure(
searchExceptions[pickedIndex],
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null)
);
ShardSearchFailure shardFailure2 = new ShardSearchFailure(
searchExceptions[pickedIndex],
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null)
);
SearchPhaseExecutionException ex = new SearchPhaseExecutionException(
"search",
"all shards failed",
new ShardSearchFailure[] { shardFailure1, shardFailure2 }
);
assertThat(ex.status(), is(expectedStatusCodes[pickedIndex]));
}
public void testWithRetriableCodesThatTakePrecedence() {
// Maps to a 500.
ShardSearchFailure shardFailure1 = new ShardSearchFailure(
new ElasticsearchException("simulated"),
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null)
);
// Maps to a 502.
ShardSearchFailure shardFailure2 = new ShardSearchFailure(
new NodeDisconnectedException(null, "unused message", "unused action", null),
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null)
);
SearchPhaseExecutionException ex = new SearchPhaseExecutionException(
"search",
"all shards failed",
new ShardSearchFailure[] { shardFailure1, shardFailure2 }
);
// The 502 takes precedence over 500.
assertThat(ex.status(), is(RestStatus.BAD_GATEWAY));
}
}