From 9c100cdeae3a4ef60d08db42a409ffada653c7cf Mon Sep 17 00:00:00 2001 From: David Turner Date: Wed, 27 Jan 2021 16:41:44 +0000 Subject: [PATCH] Extend default probe connect/handshake timeouts (#68059) Today the discovery phase has a short 1-second timeout for handshaking with a remote node after connecting, which allows it to quickly move on and retry in the case of connecting to something that doesn't respond straight away (e.g. it isn't an Elasticsearch node). This short timeout was necessary when the component was first developed because each connection attempt would block a thread. Since #42636 the connection attempt is now nonblocking so we can apply a more relaxed timeout. If transport security is enabled then our handshake timeout applies to the TLS handshake followed by the Elasticsearch handshake. If the TLS handshake alone takes over a second then the whole handshake times out with a `ConnectTransportException`, but this does not tell us which of the two individual handshakes took so long. TLS handshakes have their own 10-second timeout, which if reached yields a `SslHandshakeTimeoutException` that allows us to distinguish a problem at the TLS level from one at the Elasticsearch level. Therefore this commit extends the discovery probe timeouts. --- .../modules/discovery/discovery-settings.asciidoc | 4 ++-- .../HandshakingTransportAddressConnector.java | 4 ++-- .../HandshakingTransportAddressConnectorTests.java | 13 ++++++++++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/docs/reference/modules/discovery/discovery-settings.asciidoc b/docs/reference/modules/discovery/discovery-settings.asciidoc index e97ff503d1ef..9aa11163225c 100644 --- a/docs/reference/modules/discovery/discovery-settings.asciidoc +++ b/docs/reference/modules/discovery/discovery-settings.asciidoc @@ -73,12 +73,12 @@ Defaults to `1s`. `discovery.probe.connect_timeout`:: (<>) Sets how long to wait when attempting to connect to each address. Defaults to -`3s`. +`30s`. `discovery.probe.handshake_timeout`:: (<>) Sets how long to wait when attempting to identify the remote node via a -handshake. Defaults to `1s`. +handshake. Defaults to `30s`. `discovery.request_peers_timeout`:: (<>) diff --git a/server/src/main/java/org/elasticsearch/discovery/HandshakingTransportAddressConnector.java b/server/src/main/java/org/elasticsearch/discovery/HandshakingTransportAddressConnector.java index 24004cb00b22..86a52b3519b7 100644 --- a/server/src/main/java/org/elasticsearch/discovery/HandshakingTransportAddressConnector.java +++ b/server/src/main/java/org/elasticsearch/discovery/HandshakingTransportAddressConnector.java @@ -51,11 +51,11 @@ public class HandshakingTransportAddressConnector implements TransportAddressCon // connection timeout for probes public static final Setting PROBE_CONNECT_TIMEOUT_SETTING = Setting.timeSetting("discovery.probe.connect_timeout", - TimeValue.timeValueMillis(3000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope); + TimeValue.timeValueSeconds(30), TimeValue.timeValueMillis(1), Setting.Property.NodeScope); // handshake timeout for probes public static final Setting PROBE_HANDSHAKE_TIMEOUT_SETTING = Setting.timeSetting("discovery.probe.handshake_timeout", - TimeValue.timeValueMillis(1000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope); + TimeValue.timeValueSeconds(30), TimeValue.timeValueMillis(1), Setting.Property.NodeScope); private final TransportService transportService; private final TimeValue probeConnectTimeout; diff --git a/server/src/test/java/org/elasticsearch/discovery/HandshakingTransportAddressConnectorTests.java b/server/src/test/java/org/elasticsearch/discovery/HandshakingTransportAddressConnectorTests.java index e5b6fb596303..66b6e14c1b38 100644 --- a/server/src/test/java/org/elasticsearch/discovery/HandshakingTransportAddressConnectorTests.java +++ b/server/src/test/java/org/elasticsearch/discovery/HandshakingTransportAddressConnectorTests.java @@ -33,6 +33,7 @@ import org.elasticsearch.common.Nullable; import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.transport.TransportAddress; +import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.MockLogAppender; import org.elasticsearch.test.junit.annotations.TestLogging; @@ -53,6 +54,7 @@ import java.util.concurrent.TimeUnit; import static java.util.Collections.emptyMap; import static java.util.Collections.emptySet; import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING; +import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_CONNECT_TIMEOUT_SETTING; import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_HANDSHAKE_TIMEOUT_SETTING; import static org.elasticsearch.node.Node.NODE_NAME_SETTING; import static org.hamcrest.Matchers.equalTo; @@ -79,6 +81,7 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase { final Settings settings = Settings.builder() .put(NODE_NAME_SETTING.getKey(), "node") .put(CLUSTER_NAME_SETTING.getKey(), "local-cluster") + .put(PROBE_HANDSHAKE_TIMEOUT_SETTING.getKey(), "1s") // shorter than default for the sake of test speed .build(); threadPool = new TestThreadPool("node", settings); @@ -211,6 +214,11 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase { failureListener.assertFailure(); } + public void testTimeoutDefaults() { + assertThat(PROBE_HANDSHAKE_TIMEOUT_SETTING.get(Settings.EMPTY), equalTo(TimeValue.timeValueSeconds(30))); + assertThat(PROBE_CONNECT_TIMEOUT_SETTING.get(Settings.EMPTY), equalTo(TimeValue.timeValueSeconds(30))); + } + public void testHandshakeTimesOut() throws InterruptedException { remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), Version.CURRENT); discoveryAddress = getDiscoveryAddress(); @@ -219,7 +227,6 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase { FailureListener failureListener = new FailureListener(); handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener); - Thread.sleep(PROBE_HANDSHAKE_TIMEOUT_SETTING.get(Settings.EMPTY).millis()); failureListener.assertFailure(); } @@ -227,7 +234,7 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase { return randomBoolean() ? remoteNode.getAddress() : buildNewFakeTransportAddress(); } - private class FailureListener implements ActionListener { + private static class FailureListener implements ActionListener { final CountDownLatch completionLatch = new CountDownLatch(1); @Override @@ -241,7 +248,7 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase { } void assertFailure() throws InterruptedException { - assertTrue(completionLatch.await(30, TimeUnit.SECONDS)); + assertTrue(completionLatch.await(15, TimeUnit.SECONDS)); } } }