[ML] Trained Models: Cannot deploy a model after a failed deployment (#211459)

After the recent changes in
https://github.com/elastic/kibana/pull/205699
a failed deployment is handled correctly at the level of that single
deployment, but the error breaks the pipeline, so subsequent
deployments are never processed.
This commit is contained in:
Robert Jaszczurek 2025-02-19 10:45:54 +01:00 committed by GitHub
parent a1fde97765
commit 58cea843e9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 107 additions and 54 deletions

View file

@ -31,6 +31,38 @@ describe('TrainedModelsService', () => {
let scheduledDeploymentsSubject: BehaviorSubject<StartAllocationParams[]>;
let mockSetScheduledDeployments: jest.Mock<any, any>;
// Canned response emitted by the stubbed `startModelAllocation` call in the
// deployment tests, shared so each test does not have to repeat the literal.
const startModelAllocationResponseMock = {
  // `as const` (at the end of this object) keeps every nested field at its
  // literal type, e.g. `priority` stays 'normal' rather than widening to string.
  assignment: {
    task_parameters: {
      model_id: 'deploy-model',
      model_bytes: 1000,
      allocation_id: 'test-allocation',
      priority: 'normal',
      number_of_allocations: 1,
      threads_per_allocation: 1,
      queue_capacity: 1024,
      deployment_id: 'my-deployment-id',
      cache_size: '1mb',
    },
    node_count: 1,
    // One routing entry, keyed 'node-1', reporting a started allocation.
    routing_table: {
      'node-1': {
        routing_state: 'started',
        reason: '',
        current_allocations: 1,
        target_allocations: 1,
      },
    },
    assignment_state: 'started',
    start_time: 1234567890,
    adaptive_allocations: {
      enabled: true,
      min_number_of_allocations: 1,
      max_number_of_allocations: 4,
    },
  } as const,
};
const mockDisplayErrorToast = jest.fn();
const mockDisplaySuccessToast = jest.fn();
@ -189,37 +221,7 @@ describe('TrainedModelsService', () => {
mockTrainedModelsApiService.getTrainedModelsList.mockResolvedValueOnce([mockModel]);
mockTrainedModelsApiService.startModelAllocation.mockReturnValueOnce(
of({
assignment: {
task_parameters: {
model_id: 'deploy-model',
model_bytes: 1000,
allocation_id: 'test-allocation',
priority: 'normal',
number_of_allocations: 1,
threads_per_allocation: 1,
queue_capacity: 1024,
deployment_id: 'my-deployment-id',
cache_size: '1mb',
},
node_count: 1,
routing_table: {
'node-1': {
routing_state: 'started',
reason: '',
current_allocations: 1,
target_allocations: 1,
},
},
assignment_state: 'started',
start_time: 1234567890,
adaptive_allocations: {
enabled: true,
min_number_of_allocations: 1,
max_number_of_allocations: 4,
},
},
})
of(startModelAllocationResponseMock)
);
// Start deployment
@ -345,4 +347,53 @@ describe('TrainedModelsService', () => {
})
);
});
it('allows new deployments after a failed deployment', async () => {
  // Regression test: a failed allocation request must not stop later
  // deployment requests from being processed.
  const modelId = 'test-model';
  const downloadedModel = {
    model_id: modelId,
    state: MODEL_STATE.DOWNLOADED,
    type: ['pytorch'],
  } as unknown as TrainedModelUIItem;

  // Advance the fake timers, then let pending promise callbacks run.
  const settle = async () => {
    jest.advanceTimersByTime(100);
    await flushPromises();
  };

  // Request a deployment of the test model under the given deployment id.
  const requestDeployment = (deploymentId: string) =>
    trainedModelsService.startModelDeployment(modelId, {
      deployment_id: deploymentId,
      priority: 'low',
      threads_per_allocation: 1,
    });

  mockTrainedModelsApiService.getTrainedModelsList.mockResolvedValue([downloadedModel]);

  // The first allocation request errors, the second one succeeds.
  mockTrainedModelsApiService.startModelAllocation
    .mockReturnValueOnce(throwError(() => new Error('First deployment failed')))
    .mockReturnValueOnce(of(startModelAllocationResponseMock));

  // First deployment: expected to fail and surface an error toast.
  requestDeployment('first-deployment');
  await settle();

  expect(mockDisplayErrorToast).toHaveBeenCalledWith(
    expect.any(Error),
    expect.stringContaining('first-deployment')
  );

  await settle();

  // Second deployment: must still be picked up after the earlier failure.
  requestDeployment('second-deployment');
  await settle();

  expect(mockTrainedModelsApiService.startModelAllocation).toHaveBeenCalledTimes(2);
  expect(mockDisplaySuccessToast).toHaveBeenCalledWith(
    expect.objectContaining({
      text: expect.stringContaining('second-deployment'),
    })
  );
});
});

View file

@ -441,29 +441,31 @@ export class TrainedModelsService {
}),
});
},
error: (error) => {
this.displayErrorToast?.(
error,
i18n.translate('xpack.ml.trainedModels.modelsList.startFailed', {
defaultMessage: 'Failed to start "{deploymentId}"',
values: {
deploymentId: deployment.deploymentParams.deployment_id,
},
})
);
},
finalize: () => {
this.removeScheduledDeployments({
deploymentId: deployment.deploymentParams.deployment_id!,
});
// Manually update the BehaviorSubject to ensure proper cleanup
// if user navigates away, as localStorage hook won't be available to handle updates
const updatedDeployments = this._scheduledDeployments$
.getValue()
.filter((d) => d.modelId !== deployment.modelId);
this._scheduledDeployments$.next(updatedDeployments);
this.fetchModels();
},
}),
catchError((error) => {
this.displayErrorToast?.(
error,
i18n.translate('xpack.ml.trainedModels.modelsList.startFailed', {
defaultMessage: 'Failed to start "{deploymentId}"',
values: {
deploymentId: deployment.deploymentParams.deployment_id,
},
})
);
// Return null to allow stream to continue
return of(null);
}),
finalize(() => {
this.removeScheduledDeployments({
deploymentId: deployment.deploymentParams.deployment_id!,
});
// Manually update the BehaviorSubject to ensure proper cleanup
// if user navigates away, as localStorage hook won't be available to handle updates
const updatedDeployments = this._scheduledDeployments$
.getValue()
.filter((d) => d.modelId !== deployment.modelId);
this._scheduledDeployments$.next(updatedDeployments);
this.fetchModels();
})
)
);