Add ephemeral node id to shutdown metadata (#118722)

Shutdown metadata is keyed on node id. This makes sense since only one
node with a given node id can exist within a cluster. However, it is
possible that shutdown was initiated for one instance of a node, but
that node is restarted. This commit adds the ephemeral node id to
shutdown metadata so that nodes with the same id but different ephemeral
id can be distinguished.
This commit is contained in:
Ryan Ernst 2024-12-30 10:13:58 -08:00 committed by GitHub
parent 12acc89539
commit 7fb6ca447a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 133 additions and 36 deletions

View file

@ -72,6 +72,7 @@ including the status of shard migration, task migration, and plugin cleanup:
"nodes": [
{
"node_id": "USpTGYaBSIKbgSUJR2Z9lg",
"node_ephemeral_id": null,
"type": "RESTART",
"reason": "Demonstrating how the node shutdown API works",
"shutdown_startedmillis": 1624406108685,

View file

@ -753,11 +753,16 @@ public class SnapshotShutdownIT extends AbstractSnapshotIntegTestCase {
clusterService.submitUnbatchedStateUpdateTask("mark node for removal", new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
final var nodeId = currentState.nodes().resolveNode(nodeName).getId();
final var node = currentState.nodes().resolveNode(nodeName);
return currentState.copyAndUpdateMetadata(
mdb -> mdb.putCustom(
NodesShutdownMetadata.TYPE,
new NodesShutdownMetadata(Map.of(nodeId, shutdownMetadataBuilder.setNodeId(nodeId).build()))
new NodesShutdownMetadata(
Map.of(
node.getId(),
shutdownMetadataBuilder.setNodeId(node.getId()).setNodeEphemeralId(node.getEphemeralId()).build()
)
)
)
);
}

View file

@ -1235,15 +1235,16 @@ public class SnapshotStressTestsIT extends AbstractSnapshotIntegTestCase {
Strings.toString(currentState),
currentState.metadata().nodeShutdowns().getAll().isEmpty()
);
final var nodeId = currentState.nodes().resolveNode(node.nodeName).getId();
final var discoveryNode = currentState.nodes().resolveNode(node.nodeName);
return currentState.copyAndUpdateMetadata(
mdb -> mdb.putCustom(
NodesShutdownMetadata.TYPE,
new NodesShutdownMetadata(
Map.of(
nodeId,
discoveryNode.getId(),
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId)
.setNodeId(discoveryNode.getId())
.setNodeEphemeralId(discoveryNode.getEphemeralId())
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
.setStartedAtMillis(clusterService.threadPool().absoluteTimeInMillis())
.setReason("test")

View file

@ -147,6 +147,7 @@ public class TransportVersions {
public static final TransportVersion FAILURE_STORE_ENABLED_BY_CLUSTER_SETTING = def(8_812_00_0);
public static final TransportVersion SIMULATE_IGNORED_FIELDS = def(8_813_00_0);
public static final TransportVersion TRANSFORMS_UPGRADE_MODE = def(8_814_00_0);
public static final TransportVersion NODE_SHUTDOWN_EPHEMERAL_ID_ADDED = def(8_815_00_0);
/*
* STOP! READ THIS FIRST! No, really,

View file

@ -28,6 +28,7 @@ import java.io.IOException;
import java.util.Locale;
import java.util.Objects;
import static org.elasticsearch.TransportVersions.NODE_SHUTDOWN_EPHEMERAL_ID_ADDED;
import static org.elasticsearch.core.Strings.format;
/**
@ -40,6 +41,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
public static final TransportVersion GRACE_PERIOD_ADDED_VERSION = TransportVersions.V_8_9_X;
public static final ParseField NODE_ID_FIELD = new ParseField("node_id");
public static final ParseField NODE_EPHEMERAL_ID_FIELD = new ParseField("node_ephemeral_id");
public static final ParseField TYPE_FIELD = new ParseField("type");
public static final ParseField REASON_FIELD = new ParseField("reason");
public static final String STARTED_AT_READABLE_FIELD = "shutdown_started";
@ -53,18 +55,25 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
"node_shutdown_info",
a -> new SingleNodeShutdownMetadata(
(String) a[0],
Type.valueOf((String) a[1]),
(String) a[2],
(long) a[3],
(boolean) a[4],
(TimeValue) a[5],
(String) a[6],
(TimeValue) a[7]
(String) a[1],
Type.valueOf((String) a[2]),
(String) a[3],
(long) a[4],
(boolean) a[5],
(TimeValue) a[6],
(String) a[7],
(TimeValue) a[8]
)
);
static {
PARSER.declareString(ConstructingObjectParser.constructorArg(), NODE_ID_FIELD);
PARSER.declareField(
ConstructingObjectParser.optionalConstructorArg(),
(p, c) -> p.textOrNull(),
NODE_EPHEMERAL_ID_FIELD,
ObjectParser.ValueType.STRING_OR_NULL
);
PARSER.declareString(ConstructingObjectParser.constructorArg(), TYPE_FIELD);
PARSER.declareString(ConstructingObjectParser.constructorArg(), REASON_FIELD);
PARSER.declareLong(ConstructingObjectParser.constructorArg(), STARTED_AT_MILLIS_FIELD);
@ -91,6 +100,8 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
public static final TimeValue DEFAULT_RESTART_SHARD_ALLOCATION_DELAY = TimeValue.timeValueMinutes(5);
private final String nodeId;
@Nullable
private final String nodeEphemeralId;
private final Type type;
private final String reason;
private final long startedAtMillis;
@ -110,6 +121,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
*/
private SingleNodeShutdownMetadata(
String nodeId,
@Nullable String nodeEphemeralId,
Type type,
String reason,
long startedAtMillis,
@ -119,6 +131,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
@Nullable TimeValue gracePeriod
) {
this.nodeId = Objects.requireNonNull(nodeId, "node ID must not be null");
this.nodeEphemeralId = nodeEphemeralId;
this.type = Objects.requireNonNull(type, "shutdown type must not be null");
this.reason = Objects.requireNonNull(reason, "shutdown reason must not be null");
this.startedAtMillis = startedAtMillis;
@ -157,6 +170,11 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
public SingleNodeShutdownMetadata(StreamInput in) throws IOException {
this.nodeId = in.readString();
if (in.getTransportVersion().onOrAfter(NODE_SHUTDOWN_EPHEMERAL_ID_ADDED)) {
this.nodeEphemeralId = in.readOptionalString();
} else {
this.nodeEphemeralId = null; // empty when talking to old nodes, meaning the persistent node id is the only differentiator
}
this.type = in.readEnum(Type.class);
this.reason = in.readString();
this.startedAtMillis = in.readVLong();
@ -181,6 +199,15 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
return nodeId;
}
/**
* @return The ephemeral ID of the node this {@link SingleNodeShutdownMetadata} concerns, or
* {@code null} if the ephemeral id is unknown.
*/
@Nullable
public String getNodeEphemeralId() {
return nodeEphemeralId;
}
/**
* @return The type of shutdown this is (shutdown vs. permanent).
*/
@ -241,6 +268,9 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(nodeId);
if (out.getTransportVersion().onOrAfter(NODE_SHUTDOWN_EPHEMERAL_ID_ADDED)) {
out.writeOptionalString(nodeEphemeralId);
}
if ((out.getTransportVersion().before(REPLACE_SHUTDOWN_TYPE_ADDED_VERSION) && this.type == SingleNodeShutdownMetadata.Type.REPLACE)
|| (out.getTransportVersion().before(SIGTERM_ADDED_VERSION) && this.type == Type.SIGTERM)) {
out.writeEnum(SingleNodeShutdownMetadata.Type.REMOVE);
@ -264,6 +294,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
builder.startObject();
{
builder.field(NODE_ID_FIELD.getPreferredName(), nodeId);
builder.field(NODE_EPHEMERAL_ID_FIELD.getPreferredName(), nodeEphemeralId);
builder.field(TYPE_FIELD.getPreferredName(), type);
builder.field(REASON_FIELD.getPreferredName(), reason);
builder.timestampFieldsFromUnixEpochMillis(
@ -295,6 +326,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
return getStartedAtMillis() == that.getStartedAtMillis()
&& getNodeSeen() == that.getNodeSeen()
&& getNodeId().equals(that.getNodeId())
&& Objects.equals(getNodeEphemeralId(), that.getNodeEphemeralId())
&& getType() == that.getType()
&& getReason().equals(that.getReason())
&& Objects.equals(getAllocationDelay(), that.getAllocationDelay())
@ -306,6 +338,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
public int hashCode() {
return Objects.hash(
getNodeId(),
getNodeEphemeralId(),
getType(),
getReason(),
getStartedAtMillis(),
@ -322,6 +355,8 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
stringBuilder.append("{")
.append("nodeId=[")
.append(nodeId)
.append("], nodeEphemeralId=[")
.append(nodeEphemeralId)
.append(']')
.append(", type=[")
.append(type)
@ -350,6 +385,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
return builder();
}
return new Builder().setNodeId(original.getNodeId())
.setNodeEphemeralId(original.getNodeEphemeralId())
.setType(original.getType())
.setReason(original.getReason())
.setStartedAtMillis(original.getStartedAtMillis())
@ -359,6 +395,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
public static class Builder {
private String nodeId;
private String nodeEphemeralId;
private Type type;
private String reason;
private long startedAtMillis = -1;
@ -378,6 +415,15 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
return this;
}
/**
* @param nodeEphemeralId The node ephemeral ID this metadata refers to.
* @return This builder.
*/
public Builder setNodeEphemeralId(String nodeEphemeralId) {
this.nodeEphemeralId = nodeEphemeralId;
return this;
}
/**
* @param type The type of shutdown.
* @return This builder.
@ -444,6 +490,7 @@ public class SingleNodeShutdownMetadata implements SimpleDiffable<SingleNodeShut
return new SingleNodeShutdownMetadata(
nodeId,
nodeEphemeralId,
type,
reason,
startedAtMillis,

View file

@ -78,6 +78,7 @@ public class NodesShutdownMetadataTests extends ChunkedToXContentDiffableSeriali
"this_node",
SingleNodeShutdownMetadata.builder()
.setNodeId("this_node")
.setNodeEphemeralId("this_node")
.setReason("shutdown for a unit test")
.setType(type)
.setStartedAtMillis(randomNonNegativeLong())
@ -106,6 +107,7 @@ public class NodesShutdownMetadataTests extends ChunkedToXContentDiffableSeriali
public void testSigtermIsRemoveInOlderVersions() throws IOException {
SingleNodeShutdownMetadata metadata = SingleNodeShutdownMetadata.builder()
.setNodeId("myid")
.setNodeEphemeralId("myid")
.setType(SingleNodeShutdownMetadata.Type.SIGTERM)
.setReason("myReason")
.setStartedAtMillis(0L)
@ -127,6 +129,7 @@ public class NodesShutdownMetadataTests extends ChunkedToXContentDiffableSeriali
SingleNodeShutdownMetadata.Type type;
SingleNodeShutdownMetadata.Builder builder = SingleNodeShutdownMetadata.builder()
.setNodeId("thenode")
.setNodeEphemeralId("thenode")
.setReason("myReason")
.setStartedAtMillis(0L);
switch (type = randomFrom(SingleNodeShutdownMetadata.Type.values())) {
@ -182,6 +185,7 @@ public class NodesShutdownMetadataTests extends ChunkedToXContentDiffableSeriali
final SingleNodeShutdownMetadata.Type type = randomFrom(SingleNodeShutdownMetadata.Type.values());
final SingleNodeShutdownMetadata.Builder builder = SingleNodeShutdownMetadata.builder()
.setNodeId(randomAlphaOfLength(5))
.setNodeEphemeralId(randomAlphaOfLength(5))
.setType(type)
.setReason(randomAlphaOfLength(5))
.setStartedAtMillis(randomNonNegativeLong());

View file

@ -638,6 +638,7 @@ public class UnassignedInfoTests extends ESAllocationTestCase {
shutdowns = shutdowns.putSingleNodeMetadata(
SingleNodeShutdownMetadata.builder()
.setNodeId(randomValueOtherThan(lastNodeId, () -> randomAlphaOfLengthBetween(5, 10)))
.setNodeEphemeralId(randomValueOtherThan(lastNodeId, () -> randomAlphaOfLengthBetween(5, 10)))
.setReason(this.getTestName())
.setStartedAtMillis(randomNonNegativeLong())
.setType(type)
@ -658,6 +659,7 @@ public class UnassignedInfoTests extends ESAllocationTestCase {
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
SingleNodeShutdownMetadata.builder()
.setNodeId(lastNodeId)
.setNodeEphemeralId(lastNodeId)
.setReason(this.getTestName())
.setStartedAtMillis(randomNonNegativeLong())
.setType(type)
@ -678,6 +680,7 @@ public class UnassignedInfoTests extends ESAllocationTestCase {
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
SingleNodeShutdownMetadata.builder()
.setNodeId(lastNodeId)
.setNodeEphemeralId(lastNodeId)
.setReason(this.getTestName())
.setStartedAtMillis(randomNonNegativeLong())
.setType(SingleNodeShutdownMetadata.Type.RESTART)
@ -700,6 +703,7 @@ public class UnassignedInfoTests extends ESAllocationTestCase {
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
SingleNodeShutdownMetadata.builder()
.setNodeId(lastNodeId)
.setNodeEphemeralId(lastNodeId)
.setReason(this.getTestName())
.setStartedAtMillis(randomNonNegativeLong())
.setType(SingleNodeShutdownMetadata.Type.RESTART)
@ -728,6 +732,7 @@ public class UnassignedInfoTests extends ESAllocationTestCase {
NodesShutdownMetadata shutdowns = NodesShutdownMetadata.EMPTY.putSingleNodeMetadata(
SingleNodeShutdownMetadata.builder()
.setNodeId(lastNodeId)
.setNodeEphemeralId(lastNodeId)
.setReason(this.getTestName())
.setStartedAtMillis(randomNonNegativeLong())
.setType(SingleNodeShutdownMetadata.Type.RESTART)

View file

@ -909,6 +909,7 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
sourceNode,
SingleNodeShutdownMetadata.builder()
.setNodeId(sourceNode)
.setNodeEphemeralId(sourceNode)
.setReason("testing")
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
.setTargetNodeName(targetNode)
@ -1293,6 +1294,7 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
"node1",
SingleNodeShutdownMetadata.builder()
.setNodeId("node1")
.setNodeEphemeralId("node1")
.setReason("testing")
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
.setTargetNodeName("node3")

View file

@ -922,6 +922,7 @@ public class DesiredBalanceReconcilerTests extends ESAllocationTestCase {
"node-0",
SingleNodeShutdownMetadata.builder()
.setNodeId("node-0")
.setNodeEphemeralId("node-0")
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
.setTargetNodeName("node-2")
.setStartedAtMillis(System.currentTimeMillis())
@ -1280,6 +1281,7 @@ public class DesiredBalanceReconcilerTests extends ESAllocationTestCase {
var builder = SingleNodeShutdownMetadata.builder()
.setType(type)
.setNodeId("data-node-1")
.setNodeEphemeralId("data-node-1")
.setStartedAtMillis(randomNonNegativeLong())
.setReason("test");
if (type.equals(SingleNodeShutdownMetadata.Type.SIGTERM)) {

View file

@ -868,6 +868,7 @@ public class DesiredBalanceShardsAllocatorTests extends ESAllocationTestCase {
final var shutdownType = randomFrom(Type.SIGTERM, Type.REMOVE, Type.REPLACE);
final var singleShutdownMetadataBuilder = SingleNodeShutdownMetadata.builder()
.setNodeId(node2.getId())
.setNodeEphemeralId(node2.getEphemeralId())
.setReason("test")
.setType(shutdownType)
.setStartedAtMillis(randomNonNegativeLong());

View file

@ -420,6 +420,7 @@ public class NodeReplacementAllocationDeciderTests extends ESAllocationTestCase
return new NodesShutdownMetadata(new HashMap<>()).putSingleNodeMetadata(
SingleNodeShutdownMetadata.builder()
.setNodeId(sourceNodeId)
.setNodeEphemeralId(sourceNodeId)
.setTargetNodeName(targetNodeName)
.setType(SingleNodeShutdownMetadata.Type.REPLACE)
.setReason(this.getTestName())

View file

@ -227,6 +227,7 @@ public class NodeShutdownAllocationDeciderTests extends ESAllocationTestCase {
return new NodesShutdownMetadata(new HashMap<>()).putSingleNodeMetadata(
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId)
.setNodeEphemeralId(nodeId)
.setType(shutdownType)
.setReason(this.getTestName())
.setStartedAtMillis(1L)

View file

@ -216,6 +216,7 @@ public class SnapshotInProgressAllocationDeciderTests extends ESTestCase {
nodeId,
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId)
.setNodeEphemeralId(nodeId)
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
.setStartedAtMillis(randomNonNegativeLong())
.setReason("test")

View file

@ -2254,6 +2254,7 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
it -> it.nodeId,
it -> SingleNodeShutdownMetadata.builder()
.setNodeId(it.nodeId)
.setNodeEphemeralId(it.nodeId)
.setType(it.type)
.setReason("test")
.setNodeSeen(true)

View file

@ -196,6 +196,7 @@ public class HealthNodeTaskExecutorTests extends ESTestCase {
localNodeId,
SingleNodeShutdownMetadata.builder()
.setNodeId(localNodeId)
.setNodeEphemeralId(localNodeId)
.setReason("shutdown for a unit test")
.setType(type)
.setStartedAtMillis(randomNonNegativeLong())

View file

@ -628,6 +628,7 @@ public class PersistentTasksClusterServiceTests extends ESTestCase {
DiscoveryNode::getId,
node -> SingleNodeShutdownMetadata.builder()
.setNodeId(node.getId())
.setNodeEphemeralId(node.getEphemeralId())
.setReason("shutdown for a unit test")
.setType(type)
.setStartedAtMillis(randomNonNegativeLong())

View file

@ -246,6 +246,7 @@ public class ReadinessServiceTests extends ESTestCase implements ReadinessClient
httpTransport.node.getId(),
SingleNodeShutdownMetadata.builder()
.setNodeId(httpTransport.node.getId())
.setNodeEphemeralId(httpTransport.node.getEphemeralId())
.setReason("testing")
.setType(SingleNodeShutdownMetadata.Type.RESTART)
.setStartedAtMillis(randomNonNegativeLong())

View file

@ -88,6 +88,7 @@ public class SnapshotsInProgressSerializationTests extends SimpleDiffableWireSer
nodeId -> SingleNodeShutdownMetadata.builder()
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
.setNodeId(nodeId)
.setNodeEphemeralId(nodeId)
.setStartedAtMillis(randomNonNegativeLong())
.setReason(getTestName())
.build()
@ -476,6 +477,7 @@ public class SnapshotsInProgressSerializationTests extends SimpleDiffableWireSer
"node-id",
SingleNodeShutdownMetadata.builder()
.setNodeId("node-id")
.setNodeEphemeralId("node-id")
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
.setStartedAtMillis(randomNonNegativeLong())
.setReason("test")

View file

@ -759,27 +759,15 @@ public class DataTierAllocationDeciderTests extends ESAllocationTestCase {
SingleNodeShutdownMetadata.Type.REPLACE,
SingleNodeShutdownMetadata.Type.REMOVE
);
var builder = SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId)
.setNodeEphemeralId(nodeId)
.setType(type)
.setReason(this.getTestName());
return switch (type) {
case REMOVE -> SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId)
.setType(type)
.setReason(this.getTestName())
.setStartedAtMillis(randomNonNegativeLong())
.build();
case REPLACE -> SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId)
.setType(type)
.setTargetNodeName(randomAlphaOfLength(10))
.setReason(this.getTestName())
.setStartedAtMillis(randomNonNegativeLong())
.build();
case SIGTERM -> SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId)
.setType(type)
.setGracePeriod(randomTimeValue())
.setReason(this.getTestName())
.setStartedAtMillis(randomNonNegativeLong())
.build();
case REMOVE -> builder.setStartedAtMillis(randomNonNegativeLong()).build();
case REPLACE -> builder.setTargetNodeName(randomAlphaOfLength(10)).setStartedAtMillis(randomNonNegativeLong()).build();
case SIGTERM -> builder.setGracePeriod(randomTimeValue()).setStartedAtMillis(randomNonNegativeLong()).build();
case RESTART -> throw new AssertionError("bad randomization, this method only generates removal type shutdowns");
};
}
@ -1019,6 +1007,7 @@ public class DataTierAllocationDeciderTests extends ESAllocationTestCase {
nodeId,
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId)
.setNodeEphemeralId(nodeId)
.setType(SingleNodeShutdownMetadata.Type.RESTART)
.setReason(this.getTestName())
.setStartedAtMillis(randomNonNegativeLong())

View file

@ -465,6 +465,7 @@ public class CheckShrinkReadyStepTests extends AbstractStepTestCase<CheckShrinkR
.setStartedAtMillis(randomNonNegativeLong())
.setReason("test")
.setNodeId("node1")
.setNodeEphemeralId("node1")
.setTargetNodeName(targetNodeName)
.setGracePeriod(grace)
.build()
@ -544,6 +545,7 @@ public class CheckShrinkReadyStepTests extends AbstractStepTestCase<CheckShrinkR
.setStartedAtMillis(randomNonNegativeLong())
.setReason("test")
.setNodeId("node1")
.setNodeEphemeralId("node1")
.setTargetNodeName(targetNodeName)
.setGracePeriod(grace)
.build()

View file

@ -601,6 +601,7 @@ public class IndexLifecycleServiceTests extends ESTestCase {
"shutdown_node",
SingleNodeShutdownMetadata.builder()
.setNodeId("shutdown_node")
.setNodeEphemeralId("shutdown_node")
.setReason("shut down for test")
.setStartedAtMillis(randomNonNegativeLong())
.setType(SingleNodeShutdownMetadata.Type.RESTART)
@ -632,6 +633,7 @@ public class IndexLifecycleServiceTests extends ESTestCase {
"shutdown_node",
SingleNodeShutdownMetadata.builder()
.setNodeId("shutdown_node")
.setNodeEphemeralId("shutdown_node")
.setReason("shut down for test")
.setStartedAtMillis(randomNonNegativeLong())
.setType(type)

View file

@ -1556,6 +1556,7 @@ public class TrainedModelAssignmentClusterServiceTests extends ESTestCase {
nodeId1,
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId1)
.setNodeEphemeralId(nodeId1)
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
.setStartedAtMillis(System.currentTimeMillis())
.setReason("test")
@ -1599,6 +1600,7 @@ public class TrainedModelAssignmentClusterServiceTests extends ESTestCase {
nodeId1,
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId1)
.setNodeEphemeralId(nodeId1)
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
.setStartedAtMillis(System.currentTimeMillis())
.setReason("test")
@ -1645,6 +1647,7 @@ public class TrainedModelAssignmentClusterServiceTests extends ESTestCase {
nodeId3,
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId3)
.setNodeEphemeralId(nodeId3)
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
.setStartedAtMillis(System.currentTimeMillis())
.setReason("test")
@ -1703,6 +1706,7 @@ public class TrainedModelAssignmentClusterServiceTests extends ESTestCase {
nodeId3,
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId3)
.setNodeEphemeralId(nodeId3)
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
.setStartedAtMillis(System.currentTimeMillis())
.setReason("test")
@ -1763,6 +1767,7 @@ public class TrainedModelAssignmentClusterServiceTests extends ESTestCase {
nodeId3,
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeId3)
.setNodeEphemeralId(nodeId3)
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
.setStartedAtMillis(System.currentTimeMillis())
.setReason("test")
@ -1993,6 +1998,7 @@ public class TrainedModelAssignmentClusterServiceTests extends ESTestCase {
nodeToShutdown.getId(),
SingleNodeShutdownMetadata.builder()
.setNodeId(nodeToShutdown.getId())
.setNodeEphemeralId(nodeToShutdown.getEphemeralId())
.setStartedAtMillis(1L)
.setType(SingleNodeShutdownMetadata.Type.RESTART)
.setReason("because this cannot be null")
@ -2049,6 +2055,7 @@ public class TrainedModelAssignmentClusterServiceTests extends ESTestCase {
.setStartedAtMillis(randomNonNegativeLong())
.setReason("tests")
.setNodeId(nodeId)
.setNodeEphemeralId(nodeId)
.build()
)
);

View file

@ -118,6 +118,7 @@ public class SingleNodeShutdownStatus implements Writeable, ChunkedToXContentObj
public Iterator<? extends ToXContent> toXContentChunked(ToXContent.Params params) {
return Iterators.concat(startObject(), singleChunk((builder, p) -> {
builder.field(SingleNodeShutdownMetadata.NODE_ID_FIELD.getPreferredName(), metadata.getNodeId());
builder.field(SingleNodeShutdownMetadata.NODE_EPHEMERAL_ID_FIELD.getPreferredName(), metadata.getNodeEphemeralId());
builder.field(SingleNodeShutdownMetadata.TYPE_FIELD.getPreferredName(), metadata.getType());
builder.field(SingleNodeShutdownMetadata.REASON_FIELD.getPreferredName(), metadata.getReason());
if (metadata.getAllocationDelay() != null) {

View file

@ -21,6 +21,7 @@ import org.elasticsearch.cluster.block.ClusterBlockLevel;
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
import org.elasticsearch.cluster.metadata.NodesShutdownMetadata;
import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.allocation.AllocationService;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.cluster.service.MasterService;
@ -49,7 +50,8 @@ public class TransportPutShutdownNodeAction extends AcknowledgedTransportMasterN
private static boolean putShutdownNodeState(
Map<String, SingleNodeShutdownMetadata> shutdownMetadata,
Predicate<String> nodeExists,
Request request
Request request,
String nodeEphemeralId
) {
if (isNoop(shutdownMetadata, request)) {
return false;
@ -58,6 +60,7 @@ public class TransportPutShutdownNodeAction extends AcknowledgedTransportMasterN
final boolean nodeSeen = nodeExists.test(request.getNodeId());
SingleNodeShutdownMetadata newNodeMetadata = SingleNodeShutdownMetadata.builder()
.setNodeId(request.getNodeId())
.setNodeEphemeralId(nodeEphemeralId)
.setType(request.getType())
.setReason(request.getReason())
.setStartedAtMillis(System.currentTimeMillis())
@ -103,8 +106,13 @@ public class TransportPutShutdownNodeAction extends AcknowledgedTransportMasterN
boolean needsReroute = false;
for (final var taskContext : batchExecutionContext.taskContexts()) {
var request = taskContext.getTask().request();
String nodeEphemeralId = null;
DiscoveryNode discoveryNode = initialState.nodes().getNodes().get(request.getNodeId());
if (discoveryNode != null) {
nodeEphemeralId = discoveryNode.getEphemeralId();
}
try (var ignored = taskContext.captureResponseHeaders()) {
changed |= putShutdownNodeState(shutdownMetadata, nodeExistsPredicate, request);
changed |= putShutdownNodeState(shutdownMetadata, nodeExistsPredicate, request, nodeEphemeralId);
} catch (Exception e) {
taskContext.onFailure(e);
continue;

View file

@ -55,6 +55,7 @@ public class GetShutdownStatusResponseTests extends AbstractWireSerializingTestC
final TimeValue gracefulShutdown = type == SIGTERM ? randomPositiveTimeValue() : null;
return SingleNodeShutdownMetadata.builder()
.setNodeId(randomAlphaOfLength(5))
.setNodeEphemeralId(randomAlphaOfLength(5))
.setType(type)
.setReason(randomAlphaOfLength(5))
.setStartedAtMillis(randomNonNegativeLong())

View file

@ -603,6 +603,7 @@ public class TransportGetShutdownStatusActionTests extends ESTestCase {
.setStartedAtMillis(randomNonNegativeLong())
.setReason(this.getTestName())
.setNodeId(bogusNodeId)
.setNodeEphemeralId(bogusNodeId)
.build()
)
)
@ -866,6 +867,7 @@ public class TransportGetShutdownStatusActionTests extends ESTestCase {
.setStartedAtMillis(randomNonNegativeLong())
.setReason(this.getTestName())
.setNodeId(SHUTTING_DOWN_NODE_ID)
.setNodeEphemeralId(SHUTTING_DOWN_NODE_ID)
.build()
)
)

View file

@ -15,6 +15,8 @@ import org.elasticsearch.cluster.ClusterStateTaskExecutor;
import org.elasticsearch.cluster.ClusterStateTaskExecutor.TaskContext;
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata.Type;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.allocation.AllocationService;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.cluster.service.MasterServiceTaskQueue;
@ -32,6 +34,8 @@ import org.mockito.MockitoAnnotations;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import static org.elasticsearch.core.Strings.format;
import static org.hamcrest.Matchers.containsString;
@ -96,14 +100,16 @@ public class TransportPutShutdownNodeActionTests extends ESTestCase {
targetNodeName,
null
);
action.masterOperation(null, request, ClusterState.EMPTY_STATE, ActionListener.noop());
var dummyNode = new DiscoveryNode(targetNodeName, "node1", "eph-node1", "abc", "abc", null, Map.of(), Set.of(), null);
var state = ClusterState.builder(ClusterState.EMPTY_STATE).nodes(DiscoveryNodes.builder().add(dummyNode).build()).build();
action.masterOperation(null, request, state, ActionListener.noop());
var updateTask = ArgumentCaptor.forClass(PutShutdownNodeTask.class);
var taskExecutor = ArgumentCaptor.forClass(PutShutdownNodeExecutor.class);
verify(clusterService).createTaskQueue(any(), any(), taskExecutor.capture());
verify(taskQueue).submitTask(any(), updateTask.capture(), any());
when(taskContext.getTask()).thenReturn(updateTask.getValue());
ClusterState stableState = taskExecutor.getValue()
.execute(new ClusterStateTaskExecutor.BatchExecutionContext<>(ClusterState.EMPTY_STATE, List.of(taskContext), () -> null));
.execute(new ClusterStateTaskExecutor.BatchExecutionContext<>(state, List.of(taskContext), () -> null));
// run the request again, there should be no call to submit an update task
clearTaskQueueInvocations();

View file

@ -88,6 +88,7 @@ public class TransformNodeTests extends ESTestCase {
SHUTTING_DOWN_ID,
SingleNodeShutdownMetadata.builder()
.setNodeId(SHUTTING_DOWN_ID)
.setNodeEphemeralId(SHUTTING_DOWN_ID)
.setReason("shutdown for a unit test")
.setType(SingleNodeShutdownMetadata.Type.RESTART)
.setStartedAtMillis(randomNonNegativeLong())