Extend default probe connect/handshake timeouts (#68059)

Today the discovery phase has a short 1-second timeout for handshaking
with a remote node after connecting, which allows it to quickly move on
and retry in the case of connecting to something that doesn't respond
straight away (e.g. it isn't an Elasticsearch node).

This short timeout was necessary when the component was first developed
because each connection attempt would block a thread. Since #42636 the
connection attempt is now nonblocking so we can apply a more relaxed
timeout.

If transport security is enabled then our handshake timeout applies to
the TLS handshake followed by the Elasticsearch handshake. If the TLS
handshake alone takes over a second then the whole handshake times out
with a `ConnectTransportException`, but this does not tell us which of
the two individual handshakes took so long.

TLS handshakes have their own 10-second timeout, which if reached yields
a `SslHandshakeTimeoutException` that allows us to distinguish a problem
at the TLS level from one at the Elasticsearch level. Therefore this
commit extends the discovery probe timeouts.
This commit is contained in:
David Turner 2021-01-27 16:41:44 +00:00 committed by GitHub
parent 06c213f20e
commit 9c100cdeae
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 14 additions and 7 deletions

View file

@ -73,12 +73,12 @@ Defaults to `1s`.
`discovery.probe.connect_timeout`:: `discovery.probe.connect_timeout`::
(<<static-cluster-setting,Static>>) (<<static-cluster-setting,Static>>)
Sets how long to wait when attempting to connect to each address. Defaults to Sets how long to wait when attempting to connect to each address. Defaults to
`3s`. `30s`.
`discovery.probe.handshake_timeout`:: `discovery.probe.handshake_timeout`::
(<<static-cluster-setting,Static>>) (<<static-cluster-setting,Static>>)
Sets how long to wait when attempting to identify the remote node via a Sets how long to wait when attempting to identify the remote node via a
handshake. Defaults to `1s`. handshake. Defaults to `30s`.
`discovery.request_peers_timeout`:: `discovery.request_peers_timeout`::
(<<static-cluster-setting,Static>>) (<<static-cluster-setting,Static>>)

View file

@ -51,11 +51,11 @@ public class HandshakingTransportAddressConnector implements TransportAddressCon
// connection timeout for probes // connection timeout for probes
public static final Setting<TimeValue> PROBE_CONNECT_TIMEOUT_SETTING = public static final Setting<TimeValue> PROBE_CONNECT_TIMEOUT_SETTING =
Setting.timeSetting("discovery.probe.connect_timeout", Setting.timeSetting("discovery.probe.connect_timeout",
TimeValue.timeValueMillis(3000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope); TimeValue.timeValueSeconds(30), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
// handshake timeout for probes // handshake timeout for probes
public static final Setting<TimeValue> PROBE_HANDSHAKE_TIMEOUT_SETTING = public static final Setting<TimeValue> PROBE_HANDSHAKE_TIMEOUT_SETTING =
Setting.timeSetting("discovery.probe.handshake_timeout", Setting.timeSetting("discovery.probe.handshake_timeout",
TimeValue.timeValueMillis(1000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope); TimeValue.timeValueSeconds(30), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
private final TransportService transportService; private final TransportService transportService;
private final TimeValue probeConnectTimeout; private final TimeValue probeConnectTimeout;

View file

@ -33,6 +33,7 @@ import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress; import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.MockLogAppender; import org.elasticsearch.test.MockLogAppender;
import org.elasticsearch.test.junit.annotations.TestLogging; import org.elasticsearch.test.junit.annotations.TestLogging;
@ -53,6 +54,7 @@ import java.util.concurrent.TimeUnit;
import static java.util.Collections.emptyMap; import static java.util.Collections.emptyMap;
import static java.util.Collections.emptySet; import static java.util.Collections.emptySet;
import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING; import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING;
import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_CONNECT_TIMEOUT_SETTING;
import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_HANDSHAKE_TIMEOUT_SETTING; import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_HANDSHAKE_TIMEOUT_SETTING;
import static org.elasticsearch.node.Node.NODE_NAME_SETTING; import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
@ -79,6 +81,7 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
final Settings settings = Settings.builder() final Settings settings = Settings.builder()
.put(NODE_NAME_SETTING.getKey(), "node") .put(NODE_NAME_SETTING.getKey(), "node")
.put(CLUSTER_NAME_SETTING.getKey(), "local-cluster") .put(CLUSTER_NAME_SETTING.getKey(), "local-cluster")
.put(PROBE_HANDSHAKE_TIMEOUT_SETTING.getKey(), "1s") // shorter than default for the sake of test speed
.build(); .build();
threadPool = new TestThreadPool("node", settings); threadPool = new TestThreadPool("node", settings);
@ -211,6 +214,11 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
failureListener.assertFailure(); failureListener.assertFailure();
} }
public void testTimeoutDefaults() {
assertThat(PROBE_HANDSHAKE_TIMEOUT_SETTING.get(Settings.EMPTY), equalTo(TimeValue.timeValueSeconds(30)));
assertThat(PROBE_CONNECT_TIMEOUT_SETTING.get(Settings.EMPTY), equalTo(TimeValue.timeValueSeconds(30)));
}
public void testHandshakeTimesOut() throws InterruptedException { public void testHandshakeTimesOut() throws InterruptedException {
remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), Version.CURRENT); remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), Version.CURRENT);
discoveryAddress = getDiscoveryAddress(); discoveryAddress = getDiscoveryAddress();
@ -219,7 +227,6 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
FailureListener failureListener = new FailureListener(); FailureListener failureListener = new FailureListener();
handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener); handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener);
Thread.sleep(PROBE_HANDSHAKE_TIMEOUT_SETTING.get(Settings.EMPTY).millis());
failureListener.assertFailure(); failureListener.assertFailure();
} }
@ -227,7 +234,7 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
return randomBoolean() ? remoteNode.getAddress() : buildNewFakeTransportAddress(); return randomBoolean() ? remoteNode.getAddress() : buildNewFakeTransportAddress();
} }
private class FailureListener implements ActionListener<DiscoveryNode> { private static class FailureListener implements ActionListener<DiscoveryNode> {
final CountDownLatch completionLatch = new CountDownLatch(1); final CountDownLatch completionLatch = new CountDownLatch(1);
@Override @Override
@ -241,7 +248,7 @@ public class HandshakingTransportAddressConnectorTests extends ESTestCase {
} }
void assertFailure() throws InterruptedException { void assertFailure() throws InterruptedException {
assertTrue(completionLatch.await(30, TimeUnit.SECONDS)); assertTrue(completionLatch.await(15, TimeUnit.SECONDS));
} }
} }
} }