mirror of
https://github.com/elastic/kibana.git
synced 2025-04-24 09:48:58 -04:00
[ML] Trained Models: Cannot deploy a model after a failed deployment (#211459)
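After the recent changes in https://github.com/elastic/kibana/pull/205699, if a deployment fails, the error is handled correctly at the level of that single deployment; however, the deployment pipeline itself breaks, so further deployments are not processed. This change catches the error inside each deployment's stream so that the pipeline keeps running and later deployments can proceed.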
This commit is contained in:
parent
a1fde97765
commit
58cea843e9
2 changed files with 107 additions and 54 deletions
|
@ -31,6 +31,38 @@ describe('TrainedModelsService', () => {
|
|||
let scheduledDeploymentsSubject: BehaviorSubject<StartAllocationParams[]>;
|
||||
let mockSetScheduledDeployments: jest.Mock<any, any>;
|
||||
|
||||
const startModelAllocationResponseMock = {
|
||||
assignment: {
|
||||
task_parameters: {
|
||||
model_id: 'deploy-model',
|
||||
model_bytes: 1000,
|
||||
allocation_id: 'test-allocation',
|
||||
priority: 'normal',
|
||||
number_of_allocations: 1,
|
||||
threads_per_allocation: 1,
|
||||
queue_capacity: 1024,
|
||||
deployment_id: 'my-deployment-id',
|
||||
cache_size: '1mb',
|
||||
},
|
||||
node_count: 1,
|
||||
routing_table: {
|
||||
'node-1': {
|
||||
routing_state: 'started',
|
||||
reason: '',
|
||||
current_allocations: 1,
|
||||
target_allocations: 1,
|
||||
},
|
||||
},
|
||||
assignment_state: 'started',
|
||||
start_time: 1234567890,
|
||||
adaptive_allocations: {
|
||||
enabled: true,
|
||||
min_number_of_allocations: 1,
|
||||
max_number_of_allocations: 4,
|
||||
},
|
||||
} as const,
|
||||
};
|
||||
|
||||
const mockDisplayErrorToast = jest.fn();
|
||||
const mockDisplaySuccessToast = jest.fn();
|
||||
|
||||
|
@ -189,37 +221,7 @@ describe('TrainedModelsService', () => {
|
|||
mockTrainedModelsApiService.getTrainedModelsList.mockResolvedValueOnce([mockModel]);
|
||||
|
||||
mockTrainedModelsApiService.startModelAllocation.mockReturnValueOnce(
|
||||
of({
|
||||
assignment: {
|
||||
task_parameters: {
|
||||
model_id: 'deploy-model',
|
||||
model_bytes: 1000,
|
||||
allocation_id: 'test-allocation',
|
||||
priority: 'normal',
|
||||
number_of_allocations: 1,
|
||||
threads_per_allocation: 1,
|
||||
queue_capacity: 1024,
|
||||
deployment_id: 'my-deployment-id',
|
||||
cache_size: '1mb',
|
||||
},
|
||||
node_count: 1,
|
||||
routing_table: {
|
||||
'node-1': {
|
||||
routing_state: 'started',
|
||||
reason: '',
|
||||
current_allocations: 1,
|
||||
target_allocations: 1,
|
||||
},
|
||||
},
|
||||
assignment_state: 'started',
|
||||
start_time: 1234567890,
|
||||
adaptive_allocations: {
|
||||
enabled: true,
|
||||
min_number_of_allocations: 1,
|
||||
max_number_of_allocations: 4,
|
||||
},
|
||||
},
|
||||
})
|
||||
of(startModelAllocationResponseMock)
|
||||
);
|
||||
|
||||
// Start deployment
|
||||
|
@ -345,4 +347,53 @@ describe('TrainedModelsService', () => {
|
|||
})
|
||||
);
|
||||
});
|
||||
|
||||
it('allows new deployments after a failed deployment', async () => {
|
||||
const mockModel = {
|
||||
model_id: 'test-model',
|
||||
state: MODEL_STATE.DOWNLOADED,
|
||||
type: ['pytorch'],
|
||||
} as unknown as TrainedModelUIItem;
|
||||
|
||||
mockTrainedModelsApiService.getTrainedModelsList.mockResolvedValue([mockModel]);
|
||||
|
||||
mockTrainedModelsApiService.startModelAllocation
|
||||
.mockReturnValueOnce(throwError(() => new Error('First deployment failed')))
|
||||
.mockReturnValueOnce(of(startModelAllocationResponseMock));
|
||||
|
||||
// First deployment
|
||||
trainedModelsService.startModelDeployment('test-model', {
|
||||
deployment_id: 'first-deployment',
|
||||
priority: 'low',
|
||||
threads_per_allocation: 1,
|
||||
});
|
||||
|
||||
jest.advanceTimersByTime(100);
|
||||
await flushPromises();
|
||||
|
||||
expect(mockDisplayErrorToast).toHaveBeenCalledWith(
|
||||
expect.any(Error),
|
||||
expect.stringContaining('first-deployment')
|
||||
);
|
||||
|
||||
jest.advanceTimersByTime(100);
|
||||
await flushPromises();
|
||||
|
||||
// Second deployment
|
||||
trainedModelsService.startModelDeployment('test-model', {
|
||||
deployment_id: 'second-deployment',
|
||||
priority: 'low',
|
||||
threads_per_allocation: 1,
|
||||
});
|
||||
|
||||
jest.advanceTimersByTime(100);
|
||||
await flushPromises();
|
||||
|
||||
expect(mockTrainedModelsApiService.startModelAllocation).toHaveBeenCalledTimes(2);
|
||||
expect(mockDisplaySuccessToast).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
text: expect.stringContaining('second-deployment'),
|
||||
})
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
|
@ -441,29 +441,31 @@ export class TrainedModelsService {
|
|||
}),
|
||||
});
|
||||
},
|
||||
error: (error) => {
|
||||
this.displayErrorToast?.(
|
||||
error,
|
||||
i18n.translate('xpack.ml.trainedModels.modelsList.startFailed', {
|
||||
defaultMessage: 'Failed to start "{deploymentId}"',
|
||||
values: {
|
||||
deploymentId: deployment.deploymentParams.deployment_id,
|
||||
},
|
||||
})
|
||||
);
|
||||
},
|
||||
finalize: () => {
|
||||
this.removeScheduledDeployments({
|
||||
deploymentId: deployment.deploymentParams.deployment_id!,
|
||||
});
|
||||
// Manually update the BehaviorSubject to ensure proper cleanup
|
||||
// if user navigates away, as localStorage hook won't be available to handle updates
|
||||
const updatedDeployments = this._scheduledDeployments$
|
||||
.getValue()
|
||||
.filter((d) => d.modelId !== deployment.modelId);
|
||||
this._scheduledDeployments$.next(updatedDeployments);
|
||||
this.fetchModels();
|
||||
},
|
||||
}),
|
||||
catchError((error) => {
|
||||
this.displayErrorToast?.(
|
||||
error,
|
||||
i18n.translate('xpack.ml.trainedModels.modelsList.startFailed', {
|
||||
defaultMessage: 'Failed to start "{deploymentId}"',
|
||||
values: {
|
||||
deploymentId: deployment.deploymentParams.deployment_id,
|
||||
},
|
||||
})
|
||||
);
|
||||
// Return null to allow stream to continue
|
||||
return of(null);
|
||||
}),
|
||||
finalize(() => {
|
||||
this.removeScheduledDeployments({
|
||||
deploymentId: deployment.deploymentParams.deployment_id!,
|
||||
});
|
||||
// Manually update the BehaviorSubject to ensure proper cleanup
|
||||
// if user navigates away, as localStorage hook won't be available to handle updates
|
||||
const updatedDeployments = this._scheduledDeployments$
|
||||
.getValue()
|
||||
.filter((d) => d.modelId !== deployment.modelId);
|
||||
this._scheduledDeployments$.next(updatedDeployments);
|
||||
this.fetchModels();
|
||||
})
|
||||
)
|
||||
);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue