[ML] Trained Models: Cannot deploy a model after a failed deployment (#211459)

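After the recent changes in https://github.com/elastic/kibana/pull/205699, if a deployment failed, the error was handled correctly at the level of that single deployment; however, the deployment pipeline itself would break, so no further deployments could proceed. The error is now handled with `catchError`, which returns `of(null)` so that the stream continues and later deployments can still be started.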
2025-04-24 09:48:58 -04:00 · 2025-02-19 10:45:54 +01:00 · 2025-02-19 10:45:54 +01:00 · 58cea843e9
commit 58cea843e9
parent a1fde97765
2 changed files with 107 additions and 54 deletions
--- a/x-pack/platform/plugins/shared/ml/public/application/model_management/trained_models_service.test.ts
+++ b/x-pack/platform/plugins/shared/ml/public/application/model_management/trained_models_service.test.ts
@ -31,6 +31,38 @@ describe('TrainedModelsService', () => {
  let scheduledDeploymentsSubject: BehaviorSubject<StartAllocationParams[]>;
  let mockSetScheduledDeployments: jest.Mock<any, any>;

+  const startModelAllocationResponseMock = {
+    assignment: {
+      task_parameters: {
+        model_id: 'deploy-model',
+        model_bytes: 1000,
+        allocation_id: 'test-allocation',
+        priority: 'normal',
+        number_of_allocations: 1,
+        threads_per_allocation: 1,
+        queue_capacity: 1024,
+        deployment_id: 'my-deployment-id',
+        cache_size: '1mb',
+      },
+      node_count: 1,
+      routing_table: {
+        'node-1': {
+          routing_state: 'started',
+          reason: '',
+          current_allocations: 1,
+          target_allocations: 1,
+        },
+      },
+      assignment_state: 'started',
+      start_time: 1234567890,
+      adaptive_allocations: {
+        enabled: true,
+        min_number_of_allocations: 1,
+        max_number_of_allocations: 4,
+      },
+    } as const,
+  };
+
  const mockDisplayErrorToast = jest.fn();
  const mockDisplaySuccessToast = jest.fn();

@ -189,37 +221,7 @@ describe('TrainedModelsService', () => {
    mockTrainedModelsApiService.getTrainedModelsList.mockResolvedValueOnce([mockModel]);

    mockTrainedModelsApiService.startModelAllocation.mockReturnValueOnce(
-      of({
-        assignment: {
-          task_parameters: {
-            model_id: 'deploy-model',
-            model_bytes: 1000,
-            allocation_id: 'test-allocation',
-            priority: 'normal',
-            number_of_allocations: 1,
-            threads_per_allocation: 1,
-            queue_capacity: 1024,
-            deployment_id: 'my-deployment-id',
-            cache_size: '1mb',
-          },
-          node_count: 1,
-          routing_table: {
-            'node-1': {
-              routing_state: 'started',
-              reason: '',
-              current_allocations: 1,
-              target_allocations: 1,
-            },
-          },
-          assignment_state: 'started',
-          start_time: 1234567890,
-          adaptive_allocations: {
-            enabled: true,
-            min_number_of_allocations: 1,
-            max_number_of_allocations: 4,
-          },
-        },
-      })
+      of(startModelAllocationResponseMock)
    );

    // Start deployment
@ -345,4 +347,53 @@ describe('TrainedModelsService', () => {
      })
    );
  });
+
+  it('allows new deployments after a failed deployment', async () => {
+    const mockModel = {
+      model_id: 'test-model',
+      state: MODEL_STATE.DOWNLOADED,
+      type: ['pytorch'],
+    } as unknown as TrainedModelUIItem;
+
+    mockTrainedModelsApiService.getTrainedModelsList.mockResolvedValue([mockModel]);
+
+    mockTrainedModelsApiService.startModelAllocation
+      .mockReturnValueOnce(throwError(() => new Error('First deployment failed')))
+      .mockReturnValueOnce(of(startModelAllocationResponseMock));
+
+    // First deployment
+    trainedModelsService.startModelDeployment('test-model', {
+      deployment_id: 'first-deployment',
+      priority: 'low',
+      threads_per_allocation: 1,
+    });
+
+    jest.advanceTimersByTime(100);
+    await flushPromises();
+
+    expect(mockDisplayErrorToast).toHaveBeenCalledWith(
+      expect.any(Error),
+      expect.stringContaining('first-deployment')
+    );
+
+    jest.advanceTimersByTime(100);
+    await flushPromises();
+
+    // Second deployment
+    trainedModelsService.startModelDeployment('test-model', {
+      deployment_id: 'second-deployment',
+      priority: 'low',
+      threads_per_allocation: 1,
+    });
+
+    jest.advanceTimersByTime(100);
+    await flushPromises();
+
+    expect(mockTrainedModelsApiService.startModelAllocation).toHaveBeenCalledTimes(2);
+    expect(mockDisplaySuccessToast).toHaveBeenCalledWith(
+      expect.objectContaining({
+        text: expect.stringContaining('second-deployment'),
+      })
+    );
+  });
 });
--- a/x-pack/platform/plugins/shared/ml/public/application/model_management/trained_models_service.ts
+++ b/x-pack/platform/plugins/shared/ml/public/application/model_management/trained_models_service.ts
@ -441,29 +441,31 @@ export class TrainedModelsService {
                  }),
                });
              },
-              error: (error) => {
-                this.displayErrorToast?.(
-                  error,
-                  i18n.translate('xpack.ml.trainedModels.modelsList.startFailed', {
-                    defaultMessage: 'Failed to start "{deploymentId}"',
-                    values: {
-                      deploymentId: deployment.deploymentParams.deployment_id,
-                    },
-                  })
-                );
-              },
-              finalize: () => {
-                this.removeScheduledDeployments({
-                  deploymentId: deployment.deploymentParams.deployment_id!,
-                });
-                // Manually update the BehaviorSubject to ensure proper cleanup
-                // if user navigates away, as localStorage hook won't be available to handle updates
-                const updatedDeployments = this._scheduledDeployments$
-                  .getValue()
-                  .filter((d) => d.modelId !== deployment.modelId);
-                this._scheduledDeployments$.next(updatedDeployments);
-                this.fetchModels();
-              },
+            }),
+            catchError((error) => {
+              this.displayErrorToast?.(
+                error,
+                i18n.translate('xpack.ml.trainedModels.modelsList.startFailed', {
+                  defaultMessage: 'Failed to start "{deploymentId}"',
+                  values: {
+                    deploymentId: deployment.deploymentParams.deployment_id,
+                  },
+                })
+              );
+              // Return null to allow stream to continue
+              return of(null);
+            }),
+            finalize(() => {
+              this.removeScheduledDeployments({
+                deploymentId: deployment.deploymentParams.deployment_id!,
+              });
+              // Manually update the BehaviorSubject to ensure proper cleanup
+              // if user navigates away, as localStorage hook won't be available to handle updates
+              const updatedDeployments = this._scheduledDeployments$
+                .getValue()
+                .filter((d) => d.modelId !== deployment.modelId);
+              this._scheduledDeployments$.next(updatedDeployments);
+              this.fetchModels();
            })
          )
        );