mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-04-24 23:27:25 -04:00
Extend default probe connect/handshake timeouts (#68059)
Today the discovery phase has a short 1-second timeout for handshaking with a remote node after connecting, which allows it to quickly move on and retry in the case of connecting to something that doesn't respond straight away (e.g. it isn't an Elasticsearch node). This short timeout was necessary when the component was first developed because each connection attempt would block a thread. Since #42636 the connection attempt is now nonblocking so we can apply a more relaxed timeout. If transport security is enabled then our handshake timeout applies to the TLS handshake followed by the Elasticsearch handshake. If the TLS handshake alone takes over a second then the whole handshake times out with a `ConnectTransportException`, but this does not tell us which of the two individual handshakes took so long. TLS handshakes have their own 10-second timeout, which if reached yields a `SslHandshakeTimeoutException` that allows us to distinguish a problem at the TLS level from one at the Elasticsearch level. Therefore this commit extends the discovery probe timeouts.
This commit is contained in:
parent
06c213f20e
commit
9c100cdeae
3 changed files with 14 additions and 7 deletions
|
@ -73,12 +73,12 @@ Defaults to `1s`.
|
|||
`discovery.probe.connect_timeout`::
|
||||
(<<static-cluster-setting,Static>>)
|
||||
Sets how long to wait when attempting to connect to each address. Defaults to
|
||||
`3s`.
|
||||
`30s`.
|
||||
|
||||
`discovery.probe.handshake_timeout`::
|
||||
(<<static-cluster-setting,Static>>)
|
||||
Sets how long to wait when attempting to identify the remote node via a
|
||||
handshake. Defaults to `1s`.
|
||||
handshake. Defaults to `30s`.
|
||||
|
||||
`discovery.request_peers_timeout`::
|
||||
(<<static-cluster-setting,Static>>)
|
||||
|
|
|
@ -51,11 +51,11 @@ public class HandshakingTransportAddressConnector implements TransportAddressCon
|
|||
// connection timeout for probes
|
||||
public static final Setting<TimeValue> PROBE_CONNECT_TIMEOUT_SETTING =
|
||||
Setting.timeSetting("discovery.probe.connect_timeout",
|
||||
TimeValue.timeValueMillis(3000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
|
||||
TimeValue.timeValueSeconds(30), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
|
||||
// handshake timeout for probes
|
||||
public static final Setting<TimeValue> PROBE_HANDSHAKE_TIMEOUT_SETTING =
|
||||
Setting.timeSetting("discovery.probe.handshake_timeout",
|
||||
TimeValue.timeValueMillis(1000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
|
||||
TimeValue.timeValueSeconds(30), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
|
||||
|
||||
private final TransportService transportService;
|
||||
private final TimeValue probeConnectTimeout;
|
||||
|
|
|
@ -33,6 +33,7 @@ import org.elasticsearch.common.Nullable;
|
|||
import org.elasticsearch.common.logging.Loggers;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.transport.TransportAddress;
|
||||
import org.elasticsearch.common.unit.TimeValue;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.MockLogAppender;
|
||||
import org.elasticsearch.test.junit.annotations.TestLogging;
|
||||
|
@ -53,6 +54,7 @@ import java.util.concurrent.TimeUnit;
|
|||
import static java.util.Collections.emptyMap;
|
||||
import static java.util.Collections.emptySet;
|
||||
import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING;
|
||||
import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_CONNECT_TIMEOUT_SETTING;
|
||||
import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_HANDSHAKE_TIMEOUT_SETTING;
|
||||
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
|
@ -79,6 +81,7 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
|
|||
final Settings settings = Settings.builder()
|
||||
.put(NODE_NAME_SETTING.getKey(), "node")
|
||||
.put(CLUSTER_NAME_SETTING.getKey(), "local-cluster")
|
||||
.put(PROBE_HANDSHAKE_TIMEOUT_SETTING.getKey(), "1s") // shorter than default for the sake of test speed
|
||||
.build();
|
||||
threadPool = new TestThreadPool("node", settings);
|
||||
|
||||
|
@ -211,6 +214,11 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
|
|||
failureListener.assertFailure();
|
||||
}
|
||||
|
||||
public void testTimeoutDefaults() {
|
||||
assertThat(PROBE_HANDSHAKE_TIMEOUT_SETTING.get(Settings.EMPTY), equalTo(TimeValue.timeValueSeconds(30)));
|
||||
assertThat(PROBE_CONNECT_TIMEOUT_SETTING.get(Settings.EMPTY), equalTo(TimeValue.timeValueSeconds(30)));
|
||||
}
|
||||
|
||||
public void testHandshakeTimesOut() throws InterruptedException {
|
||||
remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||
discoveryAddress = getDiscoveryAddress();
|
||||
|
@ -219,7 +227,6 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
|
|||
|
||||
FailureListener failureListener = new FailureListener();
|
||||
handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener);
|
||||
Thread.sleep(PROBE_HANDSHAKE_TIMEOUT_SETTING.get(Settings.EMPTY).millis());
|
||||
failureListener.assertFailure();
|
||||
}
|
||||
|
||||
|
@ -227,7 +234,7 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
|
|||
return randomBoolean() ? remoteNode.getAddress() : buildNewFakeTransportAddress();
|
||||
}
|
||||
|
||||
private class FailureListener implements ActionListener<DiscoveryNode> {
|
||||
private static class FailureListener implements ActionListener<DiscoveryNode> {
|
||||
final CountDownLatch completionLatch = new CountDownLatch(1);
|
||||
|
||||
@Override
|
||||
|
@ -241,7 +248,7 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
|
|||
}
|
||||
|
||||
void assertFailure() throws InterruptedException {
|
||||
assertTrue(completionLatch.await(30, TimeUnit.SECONDS));
|
||||
assertTrue(completionLatch.await(15, TimeUnit.SECONDS));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue