Descriptive logs with docLinks for cluster shard limit exceeded (#132072)

* Descriptive logs with docLinks for cluster shard limit exceeded * Integration test for isClusterShardLimitExceeded * Fix jest test snapshots * Apply suggestions from code review Co-authored-by: gchaps <33642766+gchaps@users.noreply.github.com> * PR feedback * PR feedback * Unit tests for isClusterShardLimitExceeded * Use constants for repeated strings Co-authored-by: gchaps <33642766+gchaps@users.noreply.github.com>
2025-04-24 09:48:58 -04:00 · 2022-05-16 22:26:58 +02:00 · 2022-05-16 22:26:58 +02:00 · e79ab079fa
commit e79ab079fa
parent dea9159c5d
14 changed files with 395 additions and 110 deletions
--- a/docs/setup/upgrade/resolving-migration-failures.asciidoc
+++ b/docs/setup/upgrade/resolving-migration-failures.asciidoc
@ -187,3 +187,10 @@ PUT /_cluster/settings
  }
 }
 --------------------------------------------
+
+[float]
+[[cluster-shard-limit-exceeded]]
+==== {es} cluster shard limit exceeded
+When upgrading, {kib} creates new indices requiring a small number of new shards. If the number of open {es} shards approaches or exceeds the {es} `cluster.max_shards_per_node` setting, {kib} is unable to complete the upgrade. Ensure that {kib} is able to add at least 10 more shards by removing indices to free up resources, or by increasing the `cluster.max_shards_per_node` setting.
+
+For more information, refer to the documentation on {ref}/allocation-total-shards.html[total shards per node].
--- a/packages/kbn-doc-links/src/get_doc_links.ts
+++ b/packages/kbn-doc-links/src/get_doc_links.ts
@ -653,6 +653,7 @@ export const getDocLinks = ({ kibanaBranch }: GetDocLinkOptions): DocLinks => {
      resolveMigrationFailures: `${KIBANA_DOCS}resolve-migrations-failures.html`,
      repeatedTimeoutRequests: `${KIBANA_DOCS}resolve-migrations-failures.html#_repeated_time_out_requests_that_eventually_fail`,
      routingAllocationDisabled: `${KIBANA_DOCS}resolve-migrations-failures.html#routing-allocation-disabled`,
+      clusterShardLimitExceeded: `${KIBANA_DOCS}resolve-migrations-failures.html#cluster-shard-limit-exceeded`,
    },
  });
 };
--- a/packages/kbn-doc-links/src/types.ts
+++ b/packages/kbn-doc-links/src/types.ts
@ -408,5 +408,6 @@ export interface DocLinks {
    readonly resolveMigrationFailures: string;
    readonly repeatedTimeoutRequests: string;
    readonly routingAllocationDisabled: string;
+    readonly clusterShardLimitExceeded: string;
  };
 }
--- a/src/core/server/saved_objects/migrations/snapshots/migrations_state_action_machine.test.ts.snap
+++ b/src/core/server/saved_objects/migrations/snapshots/migrations_state_action_machine.test.ts.snap
@ -33,6 +33,7 @@ Object {
              ],
              "maxBatchSizeBytes": 100000000,
              "migrationDocLinks": Object {
+                "clusterShardLimitExceeded": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#cluster-shard-limit-exceeded",
                "repeatedTimeoutRequests": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#_repeated_time_out_requests_that_eventually_fail",
                "resolveMigrationFailures": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html",
                "routingAllocationDisabled": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#routing-allocation-disabled",
@ -204,6 +205,7 @@ Object {
              ],
              "maxBatchSizeBytes": 100000000,
              "migrationDocLinks": Object {
+                "clusterShardLimitExceeded": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#cluster-shard-limit-exceeded",
                "repeatedTimeoutRequests": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#_repeated_time_out_requests_that_eventually_fail",
                "resolveMigrationFailures": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html",
                "routingAllocationDisabled": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#routing-allocation-disabled",
@ -379,6 +381,7 @@ Object {
              ],
              "maxBatchSizeBytes": 100000000,
              "migrationDocLinks": Object {
+                "clusterShardLimitExceeded": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#cluster-shard-limit-exceeded",
                "repeatedTimeoutRequests": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#_repeated_time_out_requests_that_eventually_fail",
                "resolveMigrationFailures": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html",
                "routingAllocationDisabled": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#routing-allocation-disabled",
@ -558,6 +561,7 @@ Object {
              ],
              "maxBatchSizeBytes": 100000000,
              "migrationDocLinks": Object {
+                "clusterShardLimitExceeded": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#cluster-shard-limit-exceeded",
                "repeatedTimeoutRequests": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#_repeated_time_out_requests_that_eventually_fail",
                "resolveMigrationFailures": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html",
                "routingAllocationDisabled": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#routing-allocation-disabled",
@ -763,6 +767,7 @@ Object {
              ],
              "maxBatchSizeBytes": 100000000,
              "migrationDocLinks": Object {
+                "clusterShardLimitExceeded": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#cluster-shard-limit-exceeded",
                "repeatedTimeoutRequests": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#_repeated_time_out_requests_that_eventually_fail",
                "resolveMigrationFailures": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html",
                "routingAllocationDisabled": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#routing-allocation-disabled",
@ -945,6 +950,7 @@ Object {
              ],
              "maxBatchSizeBytes": 100000000,
              "migrationDocLinks": Object {
+                "clusterShardLimitExceeded": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#cluster-shard-limit-exceeded",
                "repeatedTimeoutRequests": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#_repeated_time_out_requests_that_eventually_fail",
                "resolveMigrationFailures": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html",
                "routingAllocationDisabled": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#routing-allocation-disabled",
--- a/src/core/server/saved_objects/migrations/actions/clone_index.ts
+++ b/src/core/server/saved_objects/migrations/actions/clone_index.ts
@ -23,6 +23,8 @@ import {
  INDEX_NUMBER_OF_SHARDS,
  WAIT_FOR_ALL_SHARDS_TO_BE_ACTIVE,
 } from './constants';
+import { isClusterShardLimitExceeded } from './es_errors';
+import { ClusterShardLimitExceeded } from './create_index';
 export type CloneIndexResponse = AcknowledgeResponse;

 /** @internal */
@ -49,11 +51,11 @@ export const cloneIndex = ({
  target,
  timeout = DEFAULT_TIMEOUT,
 }: CloneIndexParams): TaskEither.TaskEither<
-  RetryableEsClientError | IndexNotFound | IndexNotYellowTimeout,
+  RetryableEsClientError | IndexNotFound | IndexNotYellowTimeout | ClusterShardLimitExceeded,
  CloneIndexResponse
 > => {
  const cloneTask: TaskEither.TaskEither<
-    RetryableEsClientError | IndexNotFound,
+    RetryableEsClientError | IndexNotFound | ClusterShardLimitExceeded,
    AcknowledgeResponse
  > = () => {
    return client.indices
@ -113,6 +115,10 @@ export const cloneIndex = ({
            acknowledged: true,
            shardsAcknowledged: false,
          });
+        } else if (isClusterShardLimitExceeded(error?.body?.error)) {
+          return Either.left({
+            type: 'cluster_shard_limit_exceeded' as const,
+          });
        } else {
          throw error;
        }
--- a/src/core/server/saved_objects/migrations/actions/create_index.ts
+++ b/src/core/server/saved_objects/migrations/actions/create_index.ts
@ -23,6 +23,7 @@ import {
  WAIT_FOR_ALL_SHARDS_TO_BE_ACTIVE,
 } from './constants';
 import { IndexNotYellowTimeout, waitForIndexStatusYellow } from './wait_for_index_status_yellow';
+import { isClusterShardLimitExceeded } from './es_errors';

 function aliasArrayToRecord(aliases: string[]): Record<string, estypes.IndicesAlias> {
  const result: Record<string, estypes.IndicesAlias> = {};
@ -32,6 +33,11 @@ function aliasArrayToRecord(aliases: string[]): Record<string, estypes.IndicesAl
  return result;
 }

+/** @internal */
+export interface ClusterShardLimitExceeded {
+  type: 'cluster_shard_limit_exceeded';
+}
+
 /** @internal */
 export interface CreateIndexParams {
  client: ElasticsearchClient;
@ -55,11 +61,11 @@ export const createIndex = ({
  mappings,
  aliases = [],
 }: CreateIndexParams): TaskEither.TaskEither<
-  RetryableEsClientError | IndexNotYellowTimeout,
+  RetryableEsClientError | IndexNotYellowTimeout | ClusterShardLimitExceeded,
  'create_index_succeeded'
 > => {
  const createIndexTask: TaskEither.TaskEither<
-    RetryableEsClientError,
+    RetryableEsClientError | ClusterShardLimitExceeded,
    AcknowledgeResponse
  > = () => {
    const aliasesObject = aliasArrayToRecord(aliases);
@ -120,6 +126,10 @@ export const createIndex = ({
            acknowledged: true,
            shardsAcknowledged: false,
          });
+        } else if (isClusterShardLimitExceeded(error?.body?.error)) {
+          return Either.left({
+            type: 'cluster_shard_limit_exceeded' as const,
+          });
        } else {
          throw error;
        }
@ -129,7 +139,11 @@ export const createIndex = ({

  return pipe(
    createIndexTask,
-    TaskEither.chain((res) => {
+    TaskEither.chain<
+      RetryableEsClientError | IndexNotYellowTimeout | ClusterShardLimitExceeded,
+      AcknowledgeResponse,
+      'create_index_succeeded'
+    >((res) => {
      if (res.acknowledged && res.shardsAcknowledged) {
        // If the cluster state was updated and all shards ackd we're done
        return TaskEither.right('create_index_succeeded');
--- a/src/core/server/saved_objects/migrations/actions/es_errors.test.ts
+++ b/src/core/server/saved_objects/migrations/actions/es_errors.test.ts
@ -6,7 +6,11 @@
 * Side Public License, v 1.
 */

-import { isIncompatibleMappingException, isWriteBlockException } from './es_errors';
+import {
+  isClusterShardLimitExceeded,
+  isIncompatibleMappingException,
+  isWriteBlockException,
+} from './es_errors';

 describe('isWriteBlockError', () => {
  it('returns true for a `index write` cluster_block_exception', () => {
@ -54,3 +58,23 @@ describe('isIncompatibleMappingExceptionError', () => {
    ).toEqual(true);
  });
 });
+
+describe('isClusterShardLimitExceeded', () => {
+  it('returns true with validation_exception and reason is maximum normal shards open', () => {
+    expect(
+      isClusterShardLimitExceeded({
+        type: 'validation_exception',
+        reason:
+          'Validation Failed: 1: this action would add [2] shards, but this cluster currently has [3]/[1] maximum normal shards open;',
+      })
+    ).toEqual(true);
+  });
+  it('returns false for validation_exception with another reason', () => {
+    expect(
+      isClusterShardLimitExceeded({
+        type: 'validation_exception',
+        reason: 'Validation Failed: 1: this action would do something its not allowed to do',
+      })
+    ).toEqual(false);
+  });
+});
--- a/src/core/server/saved_objects/migrations/actions/es_errors.ts
+++ b/src/core/server/saved_objects/migrations/actions/es_errors.ts
@ -21,3 +21,12 @@ export const isIncompatibleMappingException = ({ type }: estypes.ErrorCause): bo
 export const isIndexNotFoundException = ({ type }: estypes.ErrorCause): boolean => {
  return type === 'index_not_found_exception';
 };
+
+/**
+ * Detects the Elasticsearch error returned when an action (such as creating or
+ * cloning an index) would add more shards than the cluster currently allows.
+ *
+ * @param errorCause - The `error` object of an Elasticsearch error response.
+ *   Callers pass `error?.body?.error`, which is `undefined` when the failure
+ *   carries no response body, so a missing cause is tolerated.
+ * @returns `true` only for a `validation_exception` whose reason matches the
+ *   "maximum normal shards open" message; `false` otherwise, including when
+ *   the cause or its reason is missing.
+ */
+export const isClusterShardLimitExceeded = (errorCause?: estypes.ErrorCause): boolean => {
+  // Never throw from this predicate: it runs inside catch handlers, and a
+  // TypeError raised here would mask the original error being inspected.
+  if (errorCause?.type !== 'validation_exception' || typeof errorCause.reason !== 'string') {
+    return false;
+  }
+  return /this action would add .* shards, but this cluster currently has .* maximum normal shards open/.test(
+    errorCause.reason
+  );
+};
--- a/src/core/server/saved_objects/migrations/actions/index.ts
+++ b/src/core/server/saved_objects/migrations/actions/index.ts
@ -88,6 +88,7 @@ export { updateAndPickupMappings } from './update_and_pickup_mappings';

 import type { UnknownDocsFound } from './check_for_unknown_docs';
 import type { IncompatibleClusterRoutingAllocation } from './initialize_action';
+import { ClusterShardLimitExceeded } from './create_index';

 export type {
  CheckForUnknownDocsParams,
@ -153,6 +154,7 @@ export interface ActionErrorTypeMap {
  unknown_docs_found: UnknownDocsFound;
  incompatible_cluster_routing_allocation: IncompatibleClusterRoutingAllocation;
  index_not_yellow_timeout: IndexNotYellowTimeout;
+  cluster_shard_limit_exceeded: ClusterShardLimitExceeded;
 }

 /**
--- a/src/core/server/saved_objects/migrations/actions/integration_tests/actions.test.ts
+++ b/src/core/server/saved_objects/migrations/actions/integration_tests/actions.test.ts
@ -425,6 +425,10 @@ describe('migration actions', () => {
  describe('cloneIndex', () => {
    afterAll(async () => {
      try {
+        // Restore the default setting of 1000 shards per node
+        await client.cluster.putSettings({
+          persistent: { cluster: { max_shards_per_node: null } },
+        });
        await client.indices.delete({ index: 'clone_*' });
      } catch (e) {
        /** ignore */
@ -577,6 +581,23 @@ describe('migration actions', () => {
          }
      `);
    });
+    it('resolves left cluster_shard_limit_exceeded when the action would exceed the maximum normal open shards', async () => {
+      // Set the max shards per node really low so that any new index that's created would exceed the maximum open shards for this cluster
+      await client.cluster.putSettings({ persistent: { cluster: { max_shards_per_node: 1 } } });
+      const cloneIndexPromise = cloneIndex({
+        client,
+        source: 'existing_index_with_write_block',
+        target: 'clone_target_4',
+      })();
+      await expect(cloneIndexPromise).resolves.toMatchInlineSnapshot(`
+        Object {
+          "_tag": "Left",
+          "left": Object {
+            "type": "cluster_shard_limit_exceeded",
+          },
+        }
+      `);
+    });
  });

  // Reindex doesn't return any errors on it's own, so we have to test
@ -1565,6 +1586,10 @@ describe('migration actions', () => {
  });

  describe('createIndex', () => {
+    afterEach(async () => {
+      // Restore the default setting of 1000 shards per node
+      await client.cluster.putSettings({ persistent: { cluster: { max_shards_per_node: null } } });
+    });
    afterAll(async () => {
      await client.indices.delete({ index: 'red_then_yellow_index' });
    });
@ -1615,13 +1640,30 @@ describe('migration actions', () => {
        // Assert that the promise didn't resolve before the index became green
        expect(indexYellow).toBe(true);
        expect(res).toMatchInlineSnapshot(`
-                Object {
-                  "_tag": "Right",
-                  "right": "create_index_succeeded",
-                }
-              `);
+          Object {
+            "_tag": "Right",
+            "right": "create_index_succeeded",
+          }
+        `);
      });
    });
+    it('resolves left cluster_shard_limit_exceeded when the action would exceed the maximum normal open shards', async () => {
+      // Set the max shards per node really low so that any new index that's created would exceed the maximum open shards for this cluster
+      await client.cluster.putSettings({ persistent: { cluster: { max_shards_per_node: 1 } } });
+      const createIndexPromise = createIndex({
+        client,
+        indexName: 'red_then_yellow_index_1',
+        mappings: undefined as any,
+      })();
+      await expect(createIndexPromise).resolves.toMatchInlineSnapshot(`
+        Object {
+          "_tag": "Left",
+          "left": Object {
+            "type": "cluster_shard_limit_exceeded",
+          },
+        }
+      `);
+    });
    it('rejects when there is an unexpected error creating the index', async () => {
      // Creating an index with the same name as an existing alias to induce
      // failure
@ -1646,11 +1688,11 @@ describe('migration actions', () => {
      });

      await expect(task()).resolves.toMatchInlineSnapshot(`
-                      Object {
-                        "_tag": "Right",
-                        "right": "bulk_index_succeeded",
-                      }
-                  `);
+          Object {
+            "_tag": "Right",
+            "right": "bulk_index_succeeded",
+          }
+      `);
    });
    it('resolves right even if there were some version_conflict_engine_exception', async () => {
      const existingDocs = (
@ -1671,11 +1713,11 @@ describe('migration actions', () => {
        refresh: 'wait_for',
      });
      await expect(task()).resolves.toMatchInlineSnapshot(`
-                Object {
-                  "_tag": "Right",
-                  "right": "bulk_index_succeeded",
-                }
-              `);
+        Object {
+          "_tag": "Right",
+          "right": "bulk_index_succeeded",
+        }
+      `);
    });
    it('resolves left target_index_had_write_block if there are write_block errors', async () => {
      const newDocs = [
@ -1691,13 +1733,13 @@ describe('migration actions', () => {
          refresh: 'wait_for',
        })()
      ).resolves.toMatchInlineSnapshot(`
-                      Object {
-                        "_tag": "Left",
-                        "left": Object {
-                          "type": "target_index_had_write_block",
-                        },
-                      }
-                  `);
+          Object {
+            "_tag": "Left",
+            "left": Object {
+              "type": "target_index_had_write_block",
+            },
+          }
+      `);
    });

    it('resolves left request_entity_too_large_exception when the payload is too large', async () => {
@ -1713,13 +1755,13 @@ describe('migration actions', () => {
        transformedDocs: newDocs,
      });
      await expect(task()).resolves.toMatchInlineSnapshot(`
-                      Object {
-                        "_tag": "Left",
-                        "left": Object {
-                          "type": "request_entity_too_large_exception",
-                        },
-                      }
-                  `);
+        Object {
+          "_tag": "Left",
+          "left": Object {
+            "type": "request_entity_too_large_exception",
+          },
+        }
+      `);
    });
  });
 });
--- a/src/core/server/saved_objects/migrations/actions/integration_tests/es_errors.test.ts
+++ b/src/core/server/saved_objects/migrations/actions/integration_tests/es_errors.test.ts
@ -10,7 +10,7 @@ import { ElasticsearchClient } from '../../../..';
 import { InternalCoreStart } from '../../../../internal_types';
 import * as kbnTestServer from '../../../../../test_helpers/kbn_server';
 import { Root } from '../../../../root';
-import { isWriteBlockException } from '../es_errors';
+import { isWriteBlockException, isClusterShardLimitExceeded } from '../es_errors';
 import { createIndex } from '../create_index';
 import { setWriteBlock } from '../set_write_block';

@ -127,4 +127,36 @@ describe('Elasticsearch Errors', () => {
      expect(isWriteBlockException(cause)).toEqual(true);
    });
  });
+  describe('isClusterShardLimitExceeded', () => {
+    beforeAll(async () => {
+      await client.cluster.putSettings({ persistent: { cluster: { max_shards_per_node: 1 } } });
+    });
+    afterAll(async () => {
+      await client.cluster.putSettings({ persistent: { cluster: { max_shards_per_node: null } } });
+    });
+
+    it('correctly identify errors from create index operation', async () => {
+      const res = await client.indices.create(
+        {
+          index: 'new_test_index',
+        },
+        { ignore: [400] }
+      );
+
+      // @ts-expect-error @elastic/elasticsearch doesn't declare error on response
+      expect(isClusterShardLimitExceeded(res.error)).toEqual(true);
+    });
+    it('correctly identify errors from clone index operation', async () => {
+      const res = await client.indices.clone(
+        {
+          index: 'existing_index_with_write_block',
+          target: 'new_test_index_2',
+        },
+        { ignore: [400] }
+      );
+
+      // @ts-expect-error @elastic/elasticsearch doesn't declare error on response
+      expect(isClusterShardLimitExceeded(res.error)).toEqual(true);
+    });
+  });
 });
--- a/src/core/server/saved_objects/migrations/initial_state.test.ts
+++ b/src/core/server/saved_objects/migrations/initial_state.test.ts
@ -42,86 +42,161 @@ describe('createInitialState', () => {
        typeRegistry,
        docLinks,
      })
-    ).toEqual({
-      batchSize: 1000,
-      maxBatchSizeBytes: ByteSizeValue.parse('100mb').getValueInBytes(),
-      controlState: 'INIT',
-      currentAlias: '.kibana_task_manager',
-      excludeFromUpgradeFilterHooks: {},
-      indexPrefix: '.kibana_task_manager',
-      kibanaVersion: '8.1.0',
-      knownTypes: [],
-      legacyIndex: '.kibana_task_manager',
-      logs: [],
-      outdatedDocumentsQuery: {
-        bool: {
-          should: [],
+    ).toMatchInlineSnapshot(`
+      Object {
+        "batchSize": 1000,
+        "controlState": "INIT",
+        "currentAlias": ".kibana_task_manager",
+        "excludeFromUpgradeFilterHooks": Object {},
+        "indexPrefix": ".kibana_task_manager",
+        "kibanaVersion": "8.1.0",
+        "knownTypes": Array [],
+        "legacyIndex": ".kibana_task_manager",
+        "logs": Array [],
+        "maxBatchSizeBytes": 104857600,
+        "migrationDocLinks": Object {
+          "clusterShardLimitExceeded": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#cluster-shard-limit-exceeded",
+          "repeatedTimeoutRequests": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#_repeated_time_out_requests_that_eventually_fail",
+          "resolveMigrationFailures": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html",
+          "routingAllocationDisabled": "https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#routing-allocation-disabled",
        },
-      },
-      preMigrationScript: {
-        _tag: 'None',
-      },
-      retryAttempts: 15,
-      retryCount: 0,
-      retryDelay: 0,
-      targetIndexMappings: {
-        dynamic: 'strict',
-        properties: {
-          my_type: {
-            properties: {
-              title: {
-                type: 'text',
+        "outdatedDocumentsQuery": Object {
+          "bool": Object {
+            "should": Array [],
+          },
+        },
+        "preMigrationScript": Object {
+          "_tag": "None",
+        },
+        "retryAttempts": 15,
+        "retryCount": 0,
+        "retryDelay": 0,
+        "targetIndexMappings": Object {
+          "dynamic": "strict",
+          "properties": Object {
+            "my_type": Object {
+              "properties": Object {
+                "title": Object {
+                  "type": "text",
+                },
              },
            },
          },
        },
-      },
-      tempIndex: '.kibana_task_manager_8.1.0_reindex_temp',
-      tempIndexMappings: {
-        dynamic: false,
-        properties: {
-          migrationVersion: {
-            dynamic: 'true',
-            type: 'object',
-          },
-          type: {
-            type: 'keyword',
-          },
-        },
-      },
-      unusedTypesQuery: {
-        bool: {
-          must_not: expect.arrayContaining([
-            {
-              bool: {
-                must: [
-                  {
-                    match: {
-                      type: 'search-session',
-                    },
-                  },
-                  {
-                    match: {
-                      'search-session.persisted': false,
-                    },
-                  },
-                ],
-              },
+        "tempIndex": ".kibana_task_manager_8.1.0_reindex_temp",
+        "tempIndexMappings": Object {
+          "dynamic": false,
+          "properties": Object {
+            "migrationVersion": Object {
+              "dynamic": "true",
+              "type": "object",
            },
-          ]),
+            "type": Object {
+              "type": "keyword",
+            },
+          },
        },
-      },
-      versionAlias: '.kibana_task_manager_8.1.0',
-      versionIndex: '.kibana_task_manager_8.1.0_001',
-      migrationDocLinks: {
-        resolveMigrationFailures:
-          'https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html',
-        repeatedTimeoutRequests:
-          'https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#_repeated_time_out_requests_that_eventually_fail',
-        routingAllocationDisabled:
-          'https://www.elastic.co/guide/en/kibana/test-branch/resolve-migrations-failures.html#routing-allocation-disabled',
-      },
-    });
+        "unusedTypesQuery": Object {
+          "bool": Object {
+            "must_not": Array [
+              Object {
+                "term": Object {
+                  "type": "apm-services-telemetry",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "background-session",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "cases-sub-case",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "file-upload-telemetry",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "fleet-agent-actions",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "fleet-agent-events",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "fleet-agents",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "fleet-enrollment-api-keys",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "ml-telemetry",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "osquery-usage-metric",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "server",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "siem-detection-engine-rule-status",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "timelion-sheet",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "tsvb-validation-telemetry",
+                },
+              },
+              Object {
+                "term": Object {
+                  "type": "ui-counter",
+                },
+              },
+              Object {
+                "bool": Object {
+                  "must": Array [
+                    Object {
+                      "match": Object {
+                        "type": "search-session",
+                      },
+                    },
+                    Object {
+                      "match": Object {
+                        "search-session.persisted": false,
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+        },
+        "versionAlias": ".kibana_task_manager_8.1.0",
+        "versionIndex": ".kibana_task_manager_8.1.0_001",
+      }
+    `);
  });

  it('returns state with the correct `knownTypes`', () => {
--- a/src/core/server/saved_objects/migrations/model/model.test.ts
+++ b/src/core/server/saved_objects/migrations/model/model.test.ts
@ -98,6 +98,7 @@ describe('migrations v2 model', () => {
      resolveMigrationFailures: 'resolveMigrationFailures',
      repeatedTimeoutRequests: 'repeatedTimeoutRequests',
      routingAllocationDisabled: 'routingAllocationDisabled',
+      clusterShardLimitExceeded: 'clusterShardLimitExceeded',
    },
  };

@ -599,6 +600,16 @@ describe('migrations v2 model', () => {
        expect(newState.retryCount).toEqual(0);
        expect(newState.retryDelay).toEqual(0);
      });
+      test('LEGACY_CREATE_REINDEX_TARGET -> FATAL if action fails with cluster_shard_limit_exceeded', () => {
+        const res: ResponseType<'LEGACY_CREATE_REINDEX_TARGET'> = Either.left({
+          type: 'cluster_shard_limit_exceeded',
+        });
+        const newState = model(legacyCreateReindexTargetState, res);
+        expect(newState.controlState).toEqual('FATAL');
+        expect((newState as FatalState).reason).toMatchInlineSnapshot(
+          `"[cluster_shard_limit_exceeded] Upgrading Kibana requires adding a small number of new shards. Ensure that Kibana is able to add 10 more shards by increasing the cluster.max_shards_per_node setting, or removing indices to clear up resources. See clusterShardLimitExceeded"`
+        );
+      });
    });

    describe('LEGACY_REINDEX', () => {
@ -997,6 +1008,16 @@ describe('migrations v2 model', () => {
        expect(newState.retryCount).toEqual(0);
        expect(newState.retryDelay).toEqual(0);
      });
+      test('CREATE_REINDEX_TEMP -> FATAL if action fails with cluster_shard_limit_exceeded', () => {
+        const res: ResponseType<'CREATE_REINDEX_TEMP'> = Either.left({
+          type: 'cluster_shard_limit_exceeded',
+        });
+        const newState = model(state, res);
+        expect(newState.controlState).toEqual('FATAL');
+        expect((newState as FatalState).reason).toMatchInlineSnapshot(
+          `"[cluster_shard_limit_exceeded] Upgrading Kibana requires adding a small number of new shards. Ensure that Kibana is able to add 10 more shards by increasing the cluster.max_shards_per_node setting, or removing indices to clear up resources. See clusterShardLimitExceeded"`
+        );
+      });
    });

    describe('REINDEX_SOURCE_TO_TEMP_OPEN_PIT', () => {
@ -1325,7 +1346,7 @@ describe('migrations v2 model', () => {
          }
        `);
      });
-      it('CREATE_NEW_TARGET -> MARK_VERSION_INDEX_READY resets the retry count and delay', () => {
+      it('CLONE_TEMP_TO_TARGET -> MARK_VERSION_INDEX_READY resets the retry count and delay', () => {
        const res: ResponseType<'CLONE_TEMP_TO_TARGET'> = Either.right({
          acknowledged: true,
          shardsAcknowledged: true,
@ -1340,6 +1361,16 @@ describe('migrations v2 model', () => {
        expect(newState.retryCount).toBe(0);
        expect(newState.retryDelay).toBe(0);
      });
+      test('CLONE_TEMP_TO_TARGET -> FATAL if action fails with cluster_shard_limit_exceeded', () => {
+        // Feed the model a shard-limit failure; it should end in FATAL with remediation guidance.
+        const shardLimitFailure: ResponseType<'CLONE_TEMP_TO_TARGET'> = Either.left({
+          type: 'cluster_shard_limit_exceeded',
+        });
+        const fatalState = model(state, shardLimitFailure) as FatalState;
+        expect(fatalState.controlState).toEqual('FATAL');
+        expect(fatalState.reason).toMatchInlineSnapshot(
+          `"[cluster_shard_limit_exceeded] Upgrading Kibana requires adding a small number of new shards. Ensure that Kibana is able to add 10 more shards by increasing the cluster.max_shards_per_node setting, or removing indices to clear up resources. See clusterShardLimitExceeded"`
+        );
+      });
    });

    describe('OUTDATED_DOCUMENTS_SEARCH_OPEN_PIT', () => {
@ -1849,6 +1880,16 @@ describe('migrations v2 model', () => {
        expect(newState.retryCount).toEqual(0);
        expect(newState.retryDelay).toEqual(0);
      });
+      test('CREATE_NEW_TARGET -> FATAL if action fails with cluster_shard_limit_exceeded', () => {
+        // Feed the model a shard-limit failure; it should end in FATAL with remediation guidance.
+        const shardLimitFailure: ResponseType<'CREATE_NEW_TARGET'> = Either.left({
+          type: 'cluster_shard_limit_exceeded',
+        });
+        const fatalState = model(createNewTargetState, shardLimitFailure) as FatalState;
+        expect(fatalState.controlState).toEqual('FATAL');
+        expect(fatalState.reason).toMatchInlineSnapshot(
+          `"[cluster_shard_limit_exceeded] Upgrading Kibana requires adding a small number of new shards. Ensure that Kibana is able to add 10 more shards by increasing the cluster.max_shards_per_node setting, or removing indices to clear up resources. See clusterShardLimitExceeded"`
+        );
+      });
    });

    describe('MARK_VERSION_INDEX_READY', () => {
--- a/src/core/server/saved_objects/migrations/model/model.ts
+++ b/src/core/server/saved_objects/migrations/model/model.ts
@ -38,6 +38,7 @@ import {
 import { createBatches } from './create_batches';

 export const FATAL_REASON_REQUEST_ENTITY_TOO_LARGE = `While indexing a batch of saved objects, Elasticsearch returned a 413 Request Entity Too Large exception. Ensure that the Kibana configuration option 'migrations.maxBatchSizeBytes' is set to a value that is lower than or equal to the Elasticsearch 'http.max_content_length' configuration option.`;
+/**
+ * Fatal reason reported when an action fails with `cluster_shard_limit_exceeded`.
+ * The model appends a "See <doc link>" suffix before storing it as the FATAL state's reason.
+ * Exported for consistency with FATAL_REASON_REQUEST_ENTITY_TOO_LARGE.
+ */
+export const CLUSTER_SHARD_LIMIT_EXCEEDED_REASON = `[cluster_shard_limit_exceeded] Upgrading Kibana requires adding a small number of new shards. Ensure that Kibana is able to add 10 more shards by increasing the cluster.max_shards_per_node setting, or removing indices to clear up resources.`;

 export const model = (currentState: State, resW: ResponseType<AllActionStates>): State => {
  // The action response `resW` is weakly typed, the type includes all action
@ -230,6 +231,12 @@ export const model = (currentState: State, resW: ResponseType<AllActionStates>):
        // continue to timeout and eventually lead to a failed migration.
        const retryErrorMessage = `${left.message} Refer to ${stateP.migrationDocLinks.repeatedTimeoutRequests} for information on how to resolve the issue.`;
        return delayRetryState(stateP, retryErrorMessage, stateP.retryAttempts);
+      } else if (isLeftTypeof(left, 'cluster_shard_limit_exceeded')) {
+        return {
+          ...stateP,
+          controlState: 'FATAL',
+          reason: `${CLUSTER_SHARD_LIMIT_EXCEEDED_REASON} See ${stateP.migrationDocLinks.clusterShardLimitExceeded}`,
+        };
      } else {
        return throwBadResponse(stateP, left);
      }
@ -447,6 +454,12 @@ export const model = (currentState: State, resW: ResponseType<AllActionStates>):
        // continue to timeout and eventually lead to a failed migration.
        const retryErrorMessage = `${left.message} Refer to ${stateP.migrationDocLinks.repeatedTimeoutRequests} for information on how to resolve the issue.`;
        return delayRetryState(stateP, retryErrorMessage, stateP.retryAttempts);
+      } else if (isLeftTypeof(left, 'cluster_shard_limit_exceeded')) {
+        return {
+          ...stateP,
+          controlState: 'FATAL',
+          reason: `${CLUSTER_SHARD_LIMIT_EXCEEDED_REASON} See ${stateP.migrationDocLinks.clusterShardLimitExceeded}`,
+        };
      } else {
        return throwBadResponse(stateP, left);
      }
@ -682,6 +695,12 @@ export const model = (currentState: State, resW: ResponseType<AllActionStates>):
        // continue to timeout and eventually lead to a failed migration.
        const retryErrorMessage = `${left.message} Refer to ${stateP.migrationDocLinks.repeatedTimeoutRequests} for information on how to resolve the issue.`;
        return delayRetryState(stateP, retryErrorMessage, stateP.retryAttempts);
+      } else if (isLeftTypeof(left, 'cluster_shard_limit_exceeded')) {
+        return {
+          ...stateP,
+          controlState: 'FATAL',
+          reason: `${CLUSTER_SHARD_LIMIT_EXCEEDED_REASON} See ${stateP.migrationDocLinks.clusterShardLimitExceeded}`,
+        };
      } else {
        throwBadResponse(stateP, left);
      }
@ -937,6 +956,12 @@ export const model = (currentState: State, resW: ResponseType<AllActionStates>):
        // continue to timeout and eventually lead to a failed migration.
        const retryErrorMessage = `${left.message} Refer to ${stateP.migrationDocLinks.repeatedTimeoutRequests} for information on how to resolve the issue.`;
        return delayRetryState(stateP, retryErrorMessage, stateP.retryAttempts);
+      } else if (isLeftTypeof(left, 'cluster_shard_limit_exceeded')) {
+        return {
+          ...stateP,
+          controlState: 'FATAL',
+          reason: `${CLUSTER_SHARD_LIMIT_EXCEEDED_REASON} See ${stateP.migrationDocLinks.clusterShardLimitExceeded}`,
+        };
      } else {
        return throwBadResponse(stateP, left);
      }