mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-04-24 23:27:25 -04:00
Increase PeerFinder verbosity on persistent failure (#73128)
If a node is partitioned away from the rest of the cluster then the `ClusterFormationFailureHelper` periodically reports that it cannot discover the expected collection of nodes, but does not indicate why. To prove it's a connectivity problem, users must today restart the node with `DEBUG` logging on `org.elasticsearch.discovery.PeerFinder` to see further details. With this commit we log messages at `WARN` level if the node remains disconnected for longer than a configurable timeout, which defaults to 5 minutes. Relates #72968
This commit is contained in:
parent
09d1b97b5f
commit
eabe2d1b34
3 changed files with 146 additions and 12 deletions
|
@ -85,6 +85,12 @@ handshake. Defaults to `30s`.
|
|||
Sets how long a node will wait after asking its peers again before considering
|
||||
the request to have failed. Defaults to `3s`.
|
||||
|
||||
`discovery.find_peers_warning_timeout`::
|
||||
(<<static-cluster-setting,Static>>)
|
||||
Sets how long a node will attempt to discover its peers before it starts to log
|
||||
verbose messages describing why the connection attempts are failing. Defaults
|
||||
to `5m`.
|
||||
|
||||
`discovery.seed_resolver.max_concurrent_resolvers`::
|
||||
(<<static-cluster-setting,Static>>)
|
||||
Specifies how many concurrent DNS lookups to perform when resolving the
|
||||
|
|
|
@ -56,8 +56,17 @@ public abstract class PeerFinder {
|
|||
Setting.timeSetting("discovery.request_peers_timeout",
|
||||
TimeValue.timeValueMillis(3000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
|
||||
|
||||
// We do not log connection failures immediately: some failures are expected, especially if the hosts list isn't perfectly up-to-date
|
||||
// or contains some unnecessary junk. However if the node cannot find a master for an extended period of time then it is helpful to
|
||||
// users to describe in more detail why we cannot connect to the remote nodes. This setting defines how long we wait without discovering
|
||||
// the master before we start to emit more verbose logs.
|
||||
public static final Setting<TimeValue> VERBOSITY_INCREASE_TIMEOUT_SETTING =
|
||||
Setting.timeSetting("discovery.find_peers_warning_timeout",
|
||||
TimeValue.timeValueMinutes(5), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
|
||||
|
||||
private final TimeValue findPeersInterval;
|
||||
private final TimeValue requestPeersTimeout;
|
||||
private final TimeValue verbosityIncreaseTimeout;
|
||||
|
||||
private final Object mutex = new Object();
|
||||
private final TransportService transportService;
|
||||
|
@ -66,6 +75,7 @@ public abstract class PeerFinder {
|
|||
|
||||
private volatile long currentTerm;
|
||||
private boolean active;
|
||||
private long activatedAtMillis;
|
||||
private DiscoveryNodes lastAcceptedNodes;
|
||||
private final Map<TransportAddress, Peer> peersByAddress = new LinkedHashMap<>();
|
||||
private Optional<DiscoveryNode> leader = Optional.empty();
|
||||
|
@ -75,6 +85,7 @@ public abstract class PeerFinder {
|
|||
ConfiguredHostsResolver configuredHostsResolver) {
|
||||
findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(settings);
|
||||
requestPeersTimeout = DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING.get(settings);
|
||||
verbosityIncreaseTimeout = VERBOSITY_INCREASE_TIMEOUT_SETTING.get(settings);
|
||||
this.transportService = transportService;
|
||||
this.transportAddressConnector = transportAddressConnector;
|
||||
this.configuredHostsResolver = configuredHostsResolver;
|
||||
|
@ -90,6 +101,7 @@ public abstract class PeerFinder {
|
|||
synchronized (mutex) {
|
||||
assert assertInactiveWithNoKnownPeers();
|
||||
active = true;
|
||||
activatedAtMillis = transportService.getThreadPool().relativeTimeInMillis();
|
||||
this.lastAcceptedNodes = lastAcceptedNodes;
|
||||
leader = Optional.empty();
|
||||
handleWakeUp(); // return value discarded: there are no known peers, so none can be disconnected
|
||||
|
@ -193,7 +205,7 @@ public abstract class PeerFinder {
|
|||
|
||||
public interface ConfiguredHostsResolver {
|
||||
/**
|
||||
* Attempt to resolve the configured unicast hosts list to a list of transport addresses.
|
||||
* Attempt to resolve the configured hosts list to a list of transport addresses.
|
||||
*
|
||||
* @param consumer Consumer for the resolved list. May not be called if an error occurs or if another resolution attempt is in
|
||||
* progress.
|
||||
|
@ -293,7 +305,7 @@ public abstract class PeerFinder {
|
|||
|
||||
private class Peer {
|
||||
private final TransportAddress transportAddress;
|
||||
private SetOnce<DiscoveryNode> discoveryNode = new SetOnce<>();
|
||||
private final SetOnce<DiscoveryNode> discoveryNode = new SetOnce<>();
|
||||
private volatile boolean peersRequestInFlight;
|
||||
|
||||
Peer(TransportAddress transportAddress) {
|
||||
|
@ -334,6 +346,9 @@ public abstract class PeerFinder {
|
|||
assert getDiscoveryNode() == null : "unexpectedly connected to " + getDiscoveryNode();
|
||||
assert active;
|
||||
|
||||
final boolean verboseFailureLogging
|
||||
= transportService.getThreadPool().relativeTimeInMillis() - activatedAtMillis > verbosityIncreaseTimeout.millis();
|
||||
|
||||
logger.trace("{} attempting connection", this);
|
||||
transportAddressConnector.connectToRemoteMasterNode(transportAddress, new ActionListener<DiscoveryNode>() {
|
||||
@Override
|
||||
|
@ -356,7 +371,25 @@ public abstract class PeerFinder {
|
|||
|
||||
@Override
|
||||
public void onFailure(Exception e) {
|
||||
logger.debug(() -> new ParameterizedMessage("{} connection failed", Peer.this), e);
|
||||
if (verboseFailureLogging) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
// log message at level WARN, but since DEBUG logging is enabled we include the full stack trace
|
||||
logger.warn(new ParameterizedMessage("{} connection failed", Peer.this), e);
|
||||
} else {
|
||||
final StringBuilder messageBuilder = new StringBuilder();
|
||||
Throwable cause = e;
|
||||
while (cause != null && messageBuilder.length() <= 1024) {
|
||||
messageBuilder.append(": ").append(cause.getMessage());
|
||||
cause = cause.getCause();
|
||||
}
|
||||
final String message = messageBuilder.length() < 1024
|
||||
? messageBuilder.toString()
|
||||
: (messageBuilder.substring(0, 1023) + "...");
|
||||
logger.warn("{} connection failed{}", Peer.this, message);
|
||||
}
|
||||
} else {
|
||||
logger.debug(new ParameterizedMessage("{} connection failed", Peer.this), e);
|
||||
}
|
||||
synchronized (mutex) {
|
||||
peersByAddress.remove(transportAddress);
|
||||
}
|
||||
|
@ -413,7 +446,7 @@ public abstract class PeerFinder {
|
|||
@Override
|
||||
public void handleException(TransportException exp) {
|
||||
peersRequestInFlight = false;
|
||||
logger.debug(new ParameterizedMessage("{} peers request failed", Peer.this), exp);
|
||||
logger.warn(new ParameterizedMessage("{} peers request failed", Peer.this), exp);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -429,11 +462,7 @@ public abstract class PeerFinder {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Peer{" +
|
||||
"transportAddress=" + transportAddress +
|
||||
", discoveryNode=" + discoveryNode.get() +
|
||||
", peersRequestInFlight=" + peersRequestInFlight +
|
||||
'}';
|
||||
return "address [" + transportAddress + "], node [" + discoveryNode.get() + "], requesting [" + peersRequestInFlight + "]";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,6 +8,9 @@
|
|||
|
||||
package org.elasticsearch.discovery;
|
||||
|
||||
import org.apache.logging.log4j.Level;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.core.LogEvent;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.action.ActionListener;
|
||||
|
@ -17,10 +20,13 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
|
|||
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||
import org.elasticsearch.cluster.node.DiscoveryNodes.Builder;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.logging.Loggers;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.transport.TransportAddress;
|
||||
import org.elasticsearch.discovery.PeerFinder.TransportAddressConnector;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.MockLogAppender;
|
||||
import org.elasticsearch.test.junit.annotations.TestLogging;
|
||||
import org.elasticsearch.test.transport.CapturingTransport;
|
||||
import org.elasticsearch.test.transport.CapturingTransport.CapturedRequest;
|
||||
import org.elasticsearch.test.transport.StubbableConnectionManager;
|
||||
|
@ -55,6 +61,7 @@ import static java.util.Collections.emptyMap;
|
|||
import static java.util.Collections.emptySet;
|
||||
import static java.util.Collections.singletonList;
|
||||
import static org.elasticsearch.discovery.PeerFinder.REQUEST_PEERS_ACTION_NAME;
|
||||
import static org.elasticsearch.discovery.PeerFinder.VERBOSITY_INCREASE_TIMEOUT_SETTING;
|
||||
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||
import static org.hamcrest.Matchers.contains;
|
||||
import static org.hamcrest.Matchers.empty;
|
||||
|
@ -74,13 +81,13 @@ public class PeerFinderTests extends ESTestCase {
|
|||
private List<TransportAddress> providedAddresses;
|
||||
private long addressResolveDelay; // -1 means address resolution fails
|
||||
|
||||
private Set<DiscoveryNode> disconnectedNodes = new HashSet<>();
|
||||
private Set<DiscoveryNode> connectedNodes = new HashSet<>();
|
||||
private final Set<DiscoveryNode> disconnectedNodes = new HashSet<>();
|
||||
private final Set<DiscoveryNode> connectedNodes = new HashSet<>();
|
||||
private DiscoveryNodes lastAcceptedNodes;
|
||||
private TransportService transportService;
|
||||
private Iterable<DiscoveryNode> foundPeersFromNotification;
|
||||
|
||||
private static long CONNECTION_TIMEOUT_MILLIS = 30000;
|
||||
private static final long CONNECTION_TIMEOUT_MILLIS = 30000;
|
||||
|
||||
class MockTransportAddressConnector implements TransportAddressConnector {
|
||||
final Map<TransportAddress, DiscoveryNode> reachableNodes = new HashMap<>();
|
||||
|
@ -734,6 +741,98 @@ public class PeerFinderTests extends ESTestCase {
|
|||
assertFoundPeers(nodeToFind, otherNode);
|
||||
}
|
||||
|
||||
@TestLogging(reason = "testing logging at WARN level", value = "org.elasticsearch.discovery:WARN")
|
||||
public void testLogsWarningsIfActiveForLongEnough() throws IllegalAccessException {
|
||||
final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");
|
||||
|
||||
providedAddresses.add(otherNode.getAddress());
|
||||
transportAddressConnector.unreachableAddresses.add(otherNode.getAddress());
|
||||
|
||||
peerFinder.activate(lastAcceptedNodes);
|
||||
final long endTime
|
||||
= deterministicTaskQueue.getCurrentTimeMillis() + VERBOSITY_INCREASE_TIMEOUT_SETTING.get(Settings.EMPTY).millis();
|
||||
|
||||
MockLogAppender appender = new MockLogAppender();
|
||||
try {
|
||||
appender.start();
|
||||
Loggers.addAppender(LogManager.getLogger("org.elasticsearch.discovery.PeerFinder"), appender);
|
||||
|
||||
appender.addExpectation(new MockLogAppender.SeenEventExpectation(
|
||||
"connection failed",
|
||||
"org.elasticsearch.discovery.PeerFinder",
|
||||
Level.WARN,
|
||||
"address [" + otherNode.getAddress() + "]* connection failed: cannot connect to*")
|
||||
{
|
||||
@Override
|
||||
public boolean innerMatch(LogEvent event) {
|
||||
return event.getThrown() == null; // no stack trace at this log level
|
||||
}
|
||||
});
|
||||
while (deterministicTaskQueue.getCurrentTimeMillis() <= endTime) {
|
||||
deterministicTaskQueue.advanceTime();
|
||||
runAllRunnableTasks();
|
||||
}
|
||||
appender.assertAllExpectationsMatched();
|
||||
|
||||
} finally {
|
||||
Loggers.removeAppender(LogManager.getLogger("org.elasticsearch.discovery.PeerFinder"), appender);
|
||||
appender.stop();
|
||||
}
|
||||
}
|
||||
|
||||
@TestLogging(reason = "testing logging at DEBUG level", value = "org.elasticsearch.discovery:DEBUG")
|
||||
public void testLogsStackTraceInConnectionFailedMessages() throws IllegalAccessException {
|
||||
final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");
|
||||
|
||||
providedAddresses.add(otherNode.getAddress());
|
||||
transportAddressConnector.unreachableAddresses.add(otherNode.getAddress());
|
||||
|
||||
peerFinder.activate(lastAcceptedNodes);
|
||||
final long endTime
|
||||
= deterministicTaskQueue.getCurrentTimeMillis() + VERBOSITY_INCREASE_TIMEOUT_SETTING.get(Settings.EMPTY).millis();
|
||||
|
||||
MockLogAppender appender = new MockLogAppender();
|
||||
try {
|
||||
appender.start();
|
||||
Loggers.addAppender(LogManager.getLogger("org.elasticsearch.discovery.PeerFinder"), appender);
|
||||
appender.addExpectation(new MockLogAppender.SeenEventExpectation(
|
||||
"connection failed",
|
||||
"org.elasticsearch.discovery.PeerFinder",
|
||||
Level.DEBUG,
|
||||
"address [" + otherNode.getAddress() + "]* connection failed*") {
|
||||
@Override
|
||||
public boolean innerMatch(LogEvent event) {
|
||||
return event.getThrown() instanceof IOException && event.getThrown().getMessage().startsWith("cannot connect to");
|
||||
}
|
||||
});
|
||||
|
||||
deterministicTaskQueue.advanceTime();
|
||||
runAllRunnableTasks();
|
||||
appender.assertAllExpectationsMatched();
|
||||
|
||||
appender.addExpectation(new MockLogAppender.SeenEventExpectation(
|
||||
"connection failed",
|
||||
"org.elasticsearch.discovery.PeerFinder",
|
||||
Level.WARN,
|
||||
"address [" + otherNode.getAddress() + "]* connection failed*")
|
||||
{
|
||||
@Override
|
||||
public boolean innerMatch(LogEvent event) {
|
||||
return event.getThrown() instanceof IOException && event.getThrown().getMessage().startsWith("cannot connect to");
|
||||
}
|
||||
});
|
||||
while (deterministicTaskQueue.getCurrentTimeMillis() <= endTime) {
|
||||
deterministicTaskQueue.advanceTime();
|
||||
runAllRunnableTasks();
|
||||
}
|
||||
appender.assertAllExpectationsMatched();
|
||||
|
||||
} finally {
|
||||
Loggers.removeAppender(LogManager.getLogger("org.elasticsearch.discovery.PeerFinder"), appender);
|
||||
appender.stop();
|
||||
}
|
||||
}
|
||||
|
||||
public void testReconnectsToDisconnectedNodes() {
|
||||
final DiscoveryNode otherNode = newDiscoveryNode("original-node");
|
||||
providedAddresses.add(otherNode.getAddress());
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue