Set recovery rate for dedicated cold nodes (#68480)

This commit sets the recovery rate for dedicated cold nodes. The goal
here is to enhance performance of recovery in a dedicated cold tier, where
we expect such nodes to be predominantly using searchable snapshots to
back the indices located on them. This commit follows a simple approach
where we increase the recovery rate as a function of the node size, for
nodes that appear to be dedicated cold nodes.
This commit is contained in:
Jason Tedor 2021-02-04 10:36:07 -05:00 committed by GitHub
parent fdb147ad6a
commit 6e94e67ae9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 65 additions and 3 deletions

View file

@ -18,7 +18,21 @@ You can view a list of in-progress and completed recoveries using the
`indices.recovery.max_bytes_per_sec`::
(<<cluster-update-settings,Dynamic>>) Limits total inbound and outbound
recovery traffic for each node. Applies to both peer recoveries as well
as snapshot recoveries (i.e., restores from a snapshot). Defaults to `40mb`
unless the node is a <<cold-tier, dedicated cold node>> in which case the
default relates to the total memory available to the node:
.Recovery Rate for Cold Nodes
[options="header"]
|======
|total memory | default value
|<= 4 GB | 40 MB/s
|> 4 GB and <= 8 GB | 60 MB/s
|> 8 GB and <= 16 GB | 90 MB/s
|> 16 GB and <= 32 GB | 125 MB/s
|> 32 GB | 250 MB/s
|======
+
This limit applies to each node separately. If multiple nodes in a cluster
perform recoveries at the same time, the cluster's total recovery traffic may

View file

@ -12,6 +12,8 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.apache.lucene.store.RateLimiter; import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.store.RateLimiter.SimpleRateLimiter; import org.apache.lucene.store.RateLimiter.SimpleRateLimiter;
import org.elasticsearch.bootstrap.JavaVersion;
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property; import org.elasticsearch.common.settings.Setting.Property;
@ -19,14 +21,60 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit; import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.monitor.os.OsProbe;
import org.elasticsearch.node.NodeRoleSettings;
import java.util.List;
import java.util.stream.Collectors;
public class RecoverySettings { public class RecoverySettings {
private static final Logger logger = LogManager.getLogger(RecoverySettings.class); private static final Logger logger = LogManager.getLogger(RecoverySettings.class);
public static final Setting<ByteSizeValue> INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING = public static final Setting<ByteSizeValue> INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING =
Setting.byteSizeSetting("indices.recovery.max_bytes_per_sec", new ByteSizeValue(40, ByteSizeUnit.MB), Setting.byteSizeSetting(
Property.Dynamic, Property.NodeScope); "indices.recovery.max_bytes_per_sec",
s -> {
final ByteSizeValue defaultMaxBytesPerSec = new ByteSizeValue(40, ByteSizeUnit.MB);
final List<DiscoveryNodeRole> roles = NodeRoleSettings.NODE_ROLES_SETTING.get(s);
final List<DiscoveryNodeRole> dataRoles =
roles.stream().filter(DiscoveryNodeRole::canContainData).collect(Collectors.toUnmodifiableList());
if (dataRoles.isEmpty()) {
// if the node is not a data node, this value doesn't matter, use the default
return defaultMaxBytesPerSec.getStringRep();
}
if ((dataRoles.size() > 1 || dataRoles.get(0).roleName().equals("data_cold") == false) ||
roles.contains(DiscoveryNodeRole.MASTER_ROLE)) {
// if the node is not a dedicated cold node, use the default
return defaultMaxBytesPerSec.getStringRep();
}
/*
* Now we are looking at a node that has a single data role, that data role is the cold data role, and the node does not
* have the master role. In this case, we are going to set the recovery size as a function of the memory size. We are making
* an assumption here that the size of the instance is correlated with I/O resources. That is we are assuming that the
* larger the instance, the more disk and networking capacity it has available.
*/
if (JavaVersion.current().compareTo(JavaVersion.parse("14")) < 0) {
// prior to JDK 14, the JDK did not take into consideration container memory limits when reporting total system memory
return defaultMaxBytesPerSec.getStringRep();
}
final ByteSizeValue totalPhysicalMemory = new ByteSizeValue(OsProbe.getInstance().getTotalPhysicalMemorySize());
final ByteSizeValue maxBytesPerSec;
if (totalPhysicalMemory.compareTo(new ByteSizeValue(4, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(40, ByteSizeUnit.MB);
} else if (totalPhysicalMemory.compareTo(new ByteSizeValue(8, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(60, ByteSizeUnit.MB);
} else if (totalPhysicalMemory.compareTo(new ByteSizeValue(16, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(90, ByteSizeUnit.MB);
} else if (totalPhysicalMemory.compareTo(new ByteSizeValue(32, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(125, ByteSizeUnit.MB);
} else {
maxBytesPerSec = new ByteSizeValue(250, ByteSizeUnit.MB);
}
return maxBytesPerSec.getStringRep();
},
Property.Dynamic,
Property.NodeScope);
/** /**
* Controls the maximum number of file chunk requests that can be sent concurrently from the source node to the target node. * Controls the maximum number of file chunk requests that can be sent concurrently from the source node to the target node.