Fix deleting index during snapshot finalization (#103817)

Today, if an index is deleted during a very specific order of snapshot
finalizations, it's possible we'll miscalculate the latest shard
generations for the shards in that index, causing the deletion of a
shard-level `index-UUID` blob, which prevents further snapshots of that
shard.

Closes #101029
David Turner authored on 2024-01-15 12:31:03 +00:00, committed by GitHub
commit d049273ce1 (parent ad28dc9a6c)
6 changed files with 169 additions and 13 deletions
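
The heart of the change (see the ShardGenerations and SnapshotsService hunks below) is a new guard: a shard generation from the just-finalized snapshot is only carried over to still-queued snapshots if finalization actually wrote a generation for that shard. The following is a minimal, self-contained sketch of that check; the types here are simplified, hypothetical stand-ins, not the real Elasticsearch classes.

import java.util.Collections;
import java.util.List;
import java.util.Map;

public class ShardGenGuardSketch {

    // Hypothetical stand-in: identifies one shard of one index within a repository.
    record RepositoryShardId(String indexName, int shardId) {}

    // Hypothetical stand-in: maps index name -> per-shard generation UUIDs written
    // by the finalized snapshot.
    record ShardGenerations(Map<String, List<String>> generations) {

        // Mirrors the new ShardGenerations#hasShardGen added in this commit: a shard
        // generation only "exists" if finalization actually wrote one for that shard.
        boolean hasShardGen(RepositoryShardId id) {
            List<String> gens = generations.getOrDefault(id.indexName(), Collections.emptyList());
            return id.shardId() < gens.size() && gens.get(id.shardId()) != null;
        }
    }

    public static void main(String[] args) {
        var gens = new ShardGenerations(Map.of("index-0", List.of("gen-uuid-0")));
        // The deleted index never had a generation written during this finalization,
        // so with the guard its stale generation is not reused for queued snapshots.
        System.out.println(gens.hasShardGen(new RepositoryShardId("index-0", 0)));         // true
        System.out.println(gens.hasShardGen(new RepositoryShardId("index-to-delete", 0))); // false
    }
}

Checking against the generations that were actually written (rather than assuming every successful in-progress shard entry has one) is what stops a generation for a just-deleted index from being treated as current and its shard-level blobs from being cleaned up out from under later snapshots, per the commit message above.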

Changelog entry (new file):

@@ -0,0 +1,6 @@
pr: 103817
summary: Fix deleting index during snapshot finalization
area: Snapshot/Restore
type: bug
issues:
- 101029

ConcurrentSnapshotsIT.java:

@@ -17,17 +17,24 @@ import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsRequest;
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexClusterStateUpdateRequest;
import org.elasticsearch.action.support.ActionTestUtils;
import org.elasticsearch.action.support.GroupedActionListener;
import org.elasticsearch.action.support.PlainActionFuture;
import org.elasticsearch.action.support.SubscribableListener;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.cluster.SnapshotDeletionsInProgress;
import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.metadata.MetadataDeleteIndexService;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.ListenableFuture;
import org.elasticsearch.common.util.concurrent.UncategorizedExecutionException;
import org.elasticsearch.core.PathUtils;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.discovery.AbstractDisruptionTestCase;
import org.elasticsearch.index.Index;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.repositories.IndexId;
import org.elasticsearch.repositories.RepositoryConflictException;
@@ -36,6 +43,7 @@ import org.elasticsearch.repositories.RepositoryException;
import org.elasticsearch.repositories.ShardGenerations;
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
import org.elasticsearch.snapshots.mockstore.MockRepository;
import org.elasticsearch.test.ClusterServiceUtils;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.disruption.NetworkDisruption;
import org.elasticsearch.test.transport.MockTransportService;
@@ -48,10 +56,12 @@ import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFileExists;
@@ -2060,6 +2070,106 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
assertSuccessful(partialFuture);
}
public void testDeleteIndexWithOutOfOrderFinalization() {
final var indexToDelete = "index-to-delete";
final var indexNames = List.of(indexToDelete, "index-0", "index-1", "index-2");
for (final var indexName : indexNames) {
assertAcked(prepareCreate(indexName, indexSettingsNoReplicas(1)));
}
final var repoName = "test-repo";
createRepository(repoName, "fs");
// block the update-shard-snapshot-status requests so we can execute them in a specific order
final var masterTransportService = MockTransportService.getInstance(internalCluster().getMasterName());
final Map<String, SubscribableListener<Void>> otherIndexSnapshotListeners = indexNames.stream()
.collect(Collectors.toMap(k -> k, k -> new SubscribableListener<>()));
masterTransportService.<UpdateIndexShardSnapshotStatusRequest>addRequestHandlingBehavior(
SnapshotsService.UPDATE_SNAPSHOT_STATUS_ACTION_NAME,
(handler, request, channel, task) -> {
final var indexName = request.shardId().getIndexName();
if (indexName.equals(indexToDelete)) {
handler.messageReceived(request, channel, task);
} else {
final var listener = otherIndexSnapshotListeners.get(indexName);
assertNotNull(indexName, listener);
listener.addListener(
ActionTestUtils.assertNoFailureListener(ignored -> handler.messageReceived(request, channel, task))
);
}
}
);
// start the snapshots, each targeting index-to-delete and one other index so we can control their finalization order
final var snapshotCompleters = new HashMap<String, Runnable>();
for (final var blockingIndex : List.of("index-0", "index-1", "index-2")) {
final var snapshotName = "snapshot-with-" + blockingIndex;
final var snapshotFuture = clusterAdmin().prepareCreateSnapshot(repoName, snapshotName)
.setWaitForCompletion(true)
.setPartial(true)
.setIndices(indexToDelete, blockingIndex)
.execute();
// ensure each snapshot has really started before moving on to the next one
safeAwait(
ClusterServiceUtils.addTemporaryStateListener(
internalCluster().getInstance(ClusterService.class),
cs -> SnapshotsInProgress.get(cs)
.forRepo(repoName)
.stream()
.anyMatch(e -> e.snapshot().getSnapshotId().getName().equals(snapshotName))
)
);
snapshotCompleters.put(blockingIndex, () -> {
assertFalse(snapshotFuture.isDone());
otherIndexSnapshotListeners.get(blockingIndex).onResponse(null);
assertEquals(SnapshotState.SUCCESS, snapshotFuture.actionGet(10, TimeUnit.SECONDS).getSnapshotInfo().state());
});
}
// set up to delete the index at a very specific moment during finalization
final var masterDeleteIndexService = internalCluster().getCurrentMasterNodeInstance(MetadataDeleteIndexService.class);
final var indexRecreatedListener = ClusterServiceUtils
// wait until the snapshot has entered finalization
.addTemporaryStateListener(
internalCluster().getInstance(ClusterService.class),
cs -> SnapshotsInProgress.get(cs)
.forRepo(repoName)
.stream()
.anyMatch(e -> e.snapshot().getSnapshotId().getName().equals("snapshot-with-index-1") && e.state().completed())
)
// execute the index deletion _directly on the master_ so it happens before the snapshot finalization executes
.andThen((l, ignored) -> masterDeleteIndexService.deleteIndices(new DeleteIndexClusterStateUpdateRequest(l.map(r -> {
assertTrue(r.isAcknowledged());
return null;
})).indices(new Index[] { internalCluster().clusterService().state().metadata().index(indexToDelete).getIndex() })
.ackTimeout(TimeValue.timeValueSeconds(10))
.masterNodeTimeout(TimeValue.timeValueSeconds(10))))
// ultimately create the index again so that taking a full snapshot will pick up any missing shard gen blob, and deleting that
// full snapshot will clean up all dangling shard-level blobs
.andThen((l, ignored) -> prepareCreate(indexToDelete, indexSettingsNoReplicas(1)).execute(l.map(r -> {
assertTrue(r.isAcknowledged());
return null;
})));
// release the snapshots to be finalized, in this order
for (final var blockingIndex : List.of("index-1", "index-2", "index-0")) {
snapshotCompleters.get(blockingIndex).run();
}
safeAwait(indexRecreatedListener);
masterTransportService.clearAllRules();
// create a full snapshot to verify that the repo is still ok
createFullSnapshot(repoName, "final-full-snapshot");
// delete the full snapshot to clean up the leftover shard-level metadata (which trips repo consistency assertions otherwise)
startDeleteSnapshot(repoName, "final-full-snapshot").actionGet(10, TimeUnit.SECONDS);
}
private static void assertSnapshotStatusCountOnRepo(String otherBlockedRepoName, int count) {
final SnapshotsStatusResponse snapshotsStatusResponse = clusterAdmin().prepareSnapshotStatus(otherBlockedRepoName).get();
final List<SnapshotStatus> snapshotStatuses = snapshotsStatusResponse.getSnapshots();

FinalizeSnapshotContext.java:

@@ -100,7 +100,7 @@ public final class FinalizeSnapshotContext extends DelegatingActionListener<Repo
}
public ClusterState updatedClusterState(ClusterState state) {
final ClusterState updatedState = SnapshotsService.stateWithoutSnapshot(state, snapshotInfo.snapshot());
final ClusterState updatedState = SnapshotsService.stateWithoutSnapshot(state, snapshotInfo.snapshot(), updatedShardGenerations);
obsoleteGenerations.set(
SnapshotsInProgress.get(updatedState).obsoleteGenerations(snapshotInfo.repository(), SnapshotsInProgress.get(state))
);

ShardGenerations.java:

@@ -142,6 +142,11 @@ public final class ShardGenerations {
return generations.get(shardId);
}
public boolean hasShardGen(RepositoryShardId repositoryShardId) {
final var indexShardGens = getGens(repositoryShardId.index());
return repositoryShardId.shardId() < indexShardGens.size() && indexShardGens.get(repositoryShardId.shardId()) != null;
}
public List<ShardGeneration> getGens(IndexId indexId) {
return shardGenerations.getOrDefault(indexId, Collections.emptyList());
}

BlobStoreRepository.java:

@@ -1741,6 +1741,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
(indexId, gens) -> gens.forEach(
(shardId, oldGen) -> toDelete.add(
shardPath(indexId, shardId).buildAsString().substring(prefixPathLen) + INDEX_FILE_PREFIX + oldGen
.toBlobNamePart()
)
)
);

SnapshotsService.java:

@@ -450,7 +450,7 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
endingSnapshots.add(targetSnapshot);
initializingClones.remove(targetSnapshot);
logger.info(() -> "Failed to start snapshot clone [" + cloneEntry + "]", e);
removeFailedSnapshotFromClusterState(targetSnapshot, e, null);
removeFailedSnapshotFromClusterState(targetSnapshot, e, null, ShardGenerations.EMPTY);
};
// 1. step, load SnapshotInfo to make sure that source snapshot was successful for the indices we want to clone
@@ -1312,7 +1312,12 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
if (entry.isClone() && entry.state() == State.FAILED) {
logger.debug("Removing failed snapshot clone [{}] from cluster state", entry);
if (newFinalization) {
removeFailedSnapshotFromClusterState(snapshot, new SnapshotException(snapshot, entry.failure()), null);
removeFailedSnapshotFromClusterState(
snapshot,
new SnapshotException(snapshot, entry.failure()),
null,
ShardGenerations.EMPTY
);
}
return;
}
@@ -1496,7 +1501,15 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
// a fatal like e.g. this node stopped being the master node
snapshotListeners.onResponse(endAndGetListenersToResolve(snapshot));
runNextQueuedOperation(updatedRepositoryData, repository, true);
}, e -> handleFinalizationFailure(e, snapshot, repositoryData)),
},
e -> handleFinalizationFailure(
e,
snapshot,
repositoryData,
// we might have written the new root blob before failing here, so we must use the updated shardGenerations
shardGenerations
)
),
snInfo -> snapshotListeners.addListener(new ActionListener<>() {
@Override
public void onResponse(List<ActionListener<SnapshotInfo>> actionListeners) {
@@ -1512,11 +1525,20 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
})
)
);
}, e -> handleFinalizationFailure(e, snapshot, repositoryData)));
},
e -> handleFinalizationFailure(
e,
snapshot,
repositoryData,
// a failure here means the root blob was not updated, but the updated shard generation blobs are all in place so we can
// use the updated shardGenerations for all pending shard snapshots
shardGenerations
)
));
} catch (Exception e) {
logger.error(Strings.format("unexpected failure finalizing %s", snapshot), e);
assert false : new AssertionError("unexpected failure finalizing " + snapshot, e);
handleFinalizationFailure(e, snapshot, repositoryData);
handleFinalizationFailure(e, snapshot, repositoryData, ShardGenerations.EMPTY);
}
}
@@ -1568,7 +1590,12 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
* @param snapshot snapshot that failed to finalize
* @param repositoryData current repository data for the snapshot's repository
*/
private void handleFinalizationFailure(Exception e, Snapshot snapshot, RepositoryData repositoryData) {
private void handleFinalizationFailure(
Exception e,
Snapshot snapshot,
RepositoryData repositoryData,
ShardGenerations shardGenerations
) {
if (ExceptionsHelper.unwrap(e, NotMasterException.class, FailedToCommitClusterStateException.class) != null) {
// Failure due to not being master any more, don't try to remove snapshot from cluster state the next master
// will try ending this snapshot again
@@ -1581,7 +1608,7 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
failAllListenersOnMasterFailOver(e);
} else {
logger.warn(() -> "[" + snapshot + "] failed to finalize snapshot", e);
removeFailedSnapshotFromClusterState(snapshot, e, repositoryData);
removeFailedSnapshotFromClusterState(snapshot, e, repositoryData, shardGenerations);
}
}
@@ -1701,7 +1728,7 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
* @param snapshot snapshot for which to remove the snapshot operation
* @return updated cluster state
*/
public static ClusterState stateWithoutSnapshot(ClusterState state, Snapshot snapshot) {
public static ClusterState stateWithoutSnapshot(ClusterState state, Snapshot snapshot, ShardGenerations shardGenerations) {
final SnapshotsInProgress snapshots = SnapshotsInProgress.get(state);
ClusterState result = state;
int indexOfEntry = -1;
@@ -1762,7 +1789,8 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
final ShardSnapshotStatus shardState = finishedShardEntry.getValue();
final RepositoryShardId repositoryShardId = finishedShardEntry.getKey();
if (shardState.state() != ShardState.SUCCESS
|| previousEntry.shardsByRepoShardId().containsKey(repositoryShardId) == false) {
|| previousEntry.shardsByRepoShardId().containsKey(repositoryShardId) == false
|| shardGenerations.hasShardGen(finishedShardEntry.getKey()) == false) {
continue;
}
updatedShardAssignments = maybeAddUpdatedAssignment(
@@ -1779,7 +1807,8 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
.entrySet()) {
final ShardSnapshotStatus shardState = finishedShardEntry.getValue();
if (shardState.state() == ShardState.SUCCESS
&& previousEntry.shardsByRepoShardId().containsKey(finishedShardEntry.getKey())) {
&& previousEntry.shardsByRepoShardId().containsKey(finishedShardEntry.getKey())
&& shardGenerations.hasShardGen(finishedShardEntry.getKey())) {
updatedShardAssignments = maybeAddUpdatedAssignment(
updatedShardAssignments,
shardState,
@@ -1862,13 +1891,18 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
* @param repositoryData repository data if the next finalization operation on the repository should be attempted or {@code null} if
* no further actions should be executed
*/
private void removeFailedSnapshotFromClusterState(Snapshot snapshot, Exception failure, @Nullable RepositoryData repositoryData) {
private void removeFailedSnapshotFromClusterState(
Snapshot snapshot,
Exception failure,
@Nullable RepositoryData repositoryData,
ShardGenerations shardGenerations
) {
assert failure != null : "Failure must be supplied";
submitUnbatchedTask(REMOVE_SNAPSHOT_METADATA_TASK_SOURCE, new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
final ClusterState updatedState = stateWithoutSnapshot(currentState, snapshot);
final ClusterState updatedState = stateWithoutSnapshot(currentState, snapshot, shardGenerations);
assert updatedState == currentState || endingSnapshots.contains(snapshot)
: "did not track [" + snapshot + "] in ending snapshots while removing it from the cluster state";
// now check if there are any delete operations that refer to the just failed snapshot and remove the snapshot from them