Skip zone/host awareness with auto-expand replicas (#69334)

Today if an index is set to `auto_expand_replicas: N-all` then we will
try and create a shard copy on every node that matches the applicable
allocation filters. This conflits with shard allocation awareness and
the same-host allocation decider if there is an uneven distribution of
nodes across zones or hosts, since these deciders prevent shard copies
from being allocated unevenly and may therefore leave some unassigned
shards.

The point of these two deciders is to improve resilience given a limited
number of shard copies but there is no need for this behaviour when the
number of shard copies is not limited, so this commit supresses them in
that case.

Closes #54151
Closes #2869
This commit is contained in:
David Turner 2021-02-22 16:53:58 +00:00 committed by GitHub
parent f27da75a6e
commit bb3ea99850
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 142 additions and 20 deletions

View file

@ -139,16 +139,19 @@ specific index module:
The number of replicas each primary shard has. Defaults to 1.
`index.auto_expand_replicas`::
Auto-expand the number of replicas based on the number of data nodes in the cluster.
Set to a dash delimited lower and upper bound (e.g. `0-5`) or use `all`
for the upper bound (e.g. `0-all`). Defaults to `false` (i.e. disabled).
Note that the auto-expanded number of replicas only takes
<<shard-allocation-filtering,allocation filtering>> rules into account, but ignores
any other allocation rules such as <<shard-allocation-awareness,shard allocation awareness>>
and <<allocation-total-shards,total shards per node>>, and this can lead to the
cluster health becoming `YELLOW` if the applicable rules prevent all the replicas
from being allocated.
Auto-expand the number of replicas based on the number of data nodes in the
cluster. Set to a dash delimited lower and upper bound (e.g. `0-5`) or use `all`
for the upper bound (e.g. `0-all`). Defaults to `false` (i.e. disabled). Note
that the auto-expanded number of replicas only takes
<<shard-allocation-filtering,allocation filtering>> rules into account, but
ignores other allocation rules such as <<allocation-total-shards,total shards
per node>>, and this can lead to the cluster health becoming `YELLOW` if the
applicable rules prevent all the replicas from being allocated.
+
If the upper bound is `all` then <<shard-allocation-awareness,shard allocation
awareness>> and
<<cluster-routing-allocation-same-shard-host,`cluster.routing.allocation.same_shard.host`>>
are ignored for this index.
`index.search.idle.after`::
How long a shard can not receive a search or get request until it's considered

View file

@ -45,7 +45,7 @@ one of the active allocation ids in the cluster state.
These should be fast so more initial primary recoveries can happen in
parallel on the same node. Defaults to `4`.
[[cluster-routing-allocation-same-shard-host]]
`cluster.routing.allocation.same_shard.host`::
(<<dynamic-cluster-setting,Dynamic>>)
Allows to perform a check to prevent allocation of multiple instances of

View file

@ -91,6 +91,10 @@ public final class AutoExpandReplicas {
return Math.min(maxReplicas, numDataNodes-1);
}
public boolean expandToAllNodes() {
return maxReplicas == Integer.MAX_VALUE;
}
private OptionalInt getDesiredNumberOfReplicas(IndexMetadata indexMetadata, RoutingAllocation allocation) {
if (enabled) {
int numMatchingDataNodes = 0;

View file

@ -8,11 +8,6 @@
package org.elasticsearch.cluster.routing.allocation.decider;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import com.carrotsearch.hppc.ObjectIntHashMap;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.routing.RoutingNode;
@ -24,7 +19,13 @@ import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import static java.util.Collections.emptyList;
import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING;
/**
* This {@link AllocationDecider} controls shard allocation based on
@ -118,6 +119,9 @@ public class AwarenessAllocationDecider extends AllocationDecider {
"allocation awareness is not enabled, set cluster setting ["
+ CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey() + "] to enable it");
private static final Decision YES_AUTO_EXPAND_ALL = Decision.single(Decision.Type.YES, NAME,
"allocation awareness is ignored, this index is set to auto-expand to all nodes");
private static final Decision YES_ALL_MET =
Decision.single(Decision.Type.YES, NAME, "node meets all awareness attribute requirements");
@ -128,6 +132,11 @@ public class AwarenessAllocationDecider extends AllocationDecider {
final boolean debug = allocation.debugDecision();
IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index());
if (INDEX_AUTO_EXPAND_REPLICAS_SETTING.get(indexMetadata.getSettings()).expandToAllNodes()) {
return YES_AUTO_EXPAND_ALL;
}
int shardCount = indexMetadata.getNumberOfReplicas() + 1; // 1 for primary
for (String awarenessAttribute : awarenessAttributes) {
// the node the shard exists on must be associated with an awareness attribute

View file

@ -17,6 +17,8 @@ import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING;
/**
* An allocation decider that prevents multiple instances of the same shard to
* be allocated on the same {@code node}.
@ -58,6 +60,9 @@ public class SameShardAllocationDecider extends AllocationDecider {
private static final Decision YES_NONE_HOLD_COPY =
Decision.single(Decision.Type.YES, NAME, "none of the nodes on this host hold a copy of this shard");
private static final Decision YES_AUTO_EXPAND_ALL = Decision.single(Decision.Type.YES, NAME,
"same-host allocation is ignored, this index is set to auto-expand to all nodes");
@Override
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
Iterable<ShardRouting> assignedShards = allocation.routingNodes().assignedShards(shardRouting.shardId());
@ -66,6 +71,10 @@ public class SameShardAllocationDecider extends AllocationDecider {
// if its already a NO decision looking at the node, or we aren't configured to look at the host, return the decision
return decision;
}
if (INDEX_AUTO_EXPAND_REPLICAS_SETTING.get(
allocation.metadata().getIndexSafe(shardRouting.index()).getSettings()).expandToAllNodes()) {
return YES_AUTO_EXPAND_ALL;
}
if (node.node() != null) {
for (RoutingNode checkNode : allocation.routingNodes()) {
if (checkNode.node() == null) {

View file

@ -50,16 +50,19 @@ public class AutoExpandReplicasTests extends ESTestCase {
assertEquals(0, autoExpandReplicas.getMinReplicas());
assertEquals(5, autoExpandReplicas.getMaxReplicas(8));
assertEquals(2, autoExpandReplicas.getMaxReplicas(3));
assertFalse(autoExpandReplicas.expandToAllNodes());
autoExpandReplicas = AutoExpandReplicas.SETTING.get(Settings.builder().put("index.auto_expand_replicas", "0-all").build());
assertEquals(0, autoExpandReplicas.getMinReplicas());
assertEquals(5, autoExpandReplicas.getMaxReplicas(6));
assertEquals(2, autoExpandReplicas.getMaxReplicas(3));
assertTrue(autoExpandReplicas.expandToAllNodes());
autoExpandReplicas = AutoExpandReplicas.SETTING.get(Settings.builder().put("index.auto_expand_replicas", "1-all").build());
assertEquals(1, autoExpandReplicas.getMinReplicas());
assertEquals(5, autoExpandReplicas.getMaxReplicas(6));
assertEquals(2, autoExpandReplicas.getMaxReplicas(3));
assertTrue(autoExpandReplicas.expandToAllNodes());
}

View file

@ -11,6 +11,7 @@ package org.elasticsearch.cluster.routing.allocation;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ESAllocationTestCase;
import org.elasticsearch.cluster.metadata.IndexMetadata;
@ -34,6 +35,7 @@ import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;
import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.sameInstance;
@ -770,8 +772,13 @@ public class AwarenessAllocationTests extends ESAllocationTestCase {
logger.info("Building initial routing table for 'testUnassignedShardsWithUnbalancedZones'");
final Settings.Builder indexSettings = settings(Version.CURRENT);
if (randomBoolean()) {
indexSettings.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-4");
}
Metadata metadata = Metadata.builder()
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(4))
.put(IndexMetadata.builder("test").settings(indexSettings).numberOfShards(1).numberOfReplicas(4))
.build();
RoutingTable initialRoutingTable = RoutingTable.builder()
@ -865,4 +872,36 @@ public class AwarenessAllocationTests extends ESAllocationTestCase {
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(2));
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0));
}
public void testDisabledByAutoExpandReplicas() {
final Settings settings = Settings.builder()
.put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey(), "zone")
.build();
final AllocationService strategy = createAllocationService(settings);
final Metadata metadata = Metadata.builder()
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 99)
.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-all")))
.build();
final ClusterState clusterState = applyStartedShardsUntilNoChange(
ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.get(Settings.EMPTY))
.metadata(metadata)
.routingTable(RoutingTable.builder()
.addAsNew(metadata.index("test"))
.build())
.nodes(DiscoveryNodes.builder()
.add(newNode("A-0", singletonMap("zone", "a")))
.add(newNode("A-1", singletonMap("zone", "a")))
.add(newNode("A-2", singletonMap("zone", "a")))
.add(newNode("A-3", singletonMap("zone", "a")))
.add(newNode("A-4", singletonMap("zone", "a")))
.add(newNode("B-0", singletonMap("zone", "b")))
).build(), strategy);
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED), empty());
}
}

View file

@ -13,7 +13,6 @@ import org.apache.logging.log4j.Logger;
import org.elasticsearch.Version;
import org.elasticsearch.action.support.replication.ClusterStateCreationUtils;
import org.elasticsearch.cluster.ClusterInfo;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ESAllocationTestCase;
import org.elasticsearch.cluster.metadata.IndexMetadata;
@ -37,8 +36,11 @@ import org.elasticsearch.snapshots.SnapshotShardSizeInfo;
import java.util.Collections;
import static java.util.Collections.emptyMap;
import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING;
import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
import static org.elasticsearch.cluster.routing.allocation.RoutingNodesUtils.numberOfShardsOfType;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.equalTo;
public class SameShardRoutingTests extends ESAllocationTestCase {
@ -48,14 +50,19 @@ public class SameShardRoutingTests extends ESAllocationTestCase {
AllocationService strategy = createAllocationService(
Settings.builder().put(SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING.getKey(), true).build());
final Settings.Builder indexSettings = settings(Version.CURRENT);
if (randomBoolean()) {
indexSettings.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-1");
}
Metadata metadata = Metadata.builder()
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(2).numberOfReplicas(1))
.put(IndexMetadata.builder("test").settings(indexSettings).numberOfShards(2).numberOfReplicas(1))
.build();
RoutingTable routingTable = RoutingTable.builder()
.addAsNew(metadata.index("test"))
.build();
ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata)
ClusterState clusterState = ClusterState.builder(CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata)
.routingTable(routingTable).build();
logger.info("--> adding two nodes with the same host");
@ -88,6 +95,54 @@ public class SameShardRoutingTests extends ESAllocationTestCase {
}
}
public void testSameHostCheckDisabledByAutoExpandReplicas() {
final AllocationService strategy = createAllocationService(
Settings.builder().put(SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING.getKey(), true).build());
final Metadata metadata = Metadata.builder()
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 99)
.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-all")))
.build();
final DiscoveryNode node1 = new DiscoveryNode(
"node1",
"node1",
"node1",
"test1",
"test1",
buildNewFakeTransportAddress(),
emptyMap(),
MASTER_DATA_ROLES,
Version.CURRENT);
final DiscoveryNode node2 = new DiscoveryNode(
"node2",
"node2",
"node2",
"test1",
"test1",
buildNewFakeTransportAddress(),
emptyMap(),
MASTER_DATA_ROLES,
Version.CURRENT);
final ClusterState clusterState = applyStartedShardsUntilNoChange(ClusterState
.builder(CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
.metadata(metadata)
.routingTable(RoutingTable.builder()
.addAsNew(metadata.index("test"))
.build())
.nodes(
DiscoveryNodes.builder()
.add(node1)
.add(node2)).build(), strategy);
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED), empty());
}
public void testForceAllocatePrimaryOnSameNodeNotAllowed() {
SameShardAllocationDecider decider = new SameShardAllocationDecider(
Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS));