From b48f69961d89aa3b2e9d5005b08b29ba699ee565 Mon Sep 17 00:00:00 2001 From: Pat Whelan Date: Fri, 13 Jun 2025 17:13:45 -0400 Subject: [PATCH] [ML] Add Telemetry for models without adaptive allocations (#129161) Added min and max allocations as attributes to the telemetry for trained models with adaptive allocations enabled. Added telemetry for models with adaptive allocations disabled or never set. --- docs/changelog/129161.yaml | 5 +++ .../org/elasticsearch/xpack/ml/MlMetrics.java | 40 +++++++++++++++++-- .../AdaptiveAllocationsScaler.java | 8 ++++ .../AdaptiveAllocationsScalerService.java | 15 ++++++- .../xpack/ml/MlMetricsTests.java | 21 +++++++++- 5 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 docs/changelog/129161.yaml diff --git a/docs/changelog/129161.yaml b/docs/changelog/129161.yaml new file mode 100644 index 000000000000..a871fff01c9d --- /dev/null +++ b/docs/changelog/129161.yaml @@ -0,0 +1,5 @@ +pr: 129161 +summary: Add Telemetry for models without adaptive allocations +area: Machine Learning +type: enhancement +issues: [] diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java index 65d24a13564b..d55ad014c5ed 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java @@ -272,6 +272,25 @@ public final class MlMetrics extends AbstractLifecycleComponent implements Clust () -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFailedAllocations, isMasterMap) ) ); + metrics.add( + meterRegistry.registerLongGauge( + "es.ml.trained_models.deployment.fixed_allocations.current", + "Sum of current trained model allocations that do not use adaptive allocations (either enabled or disabled)", + "allocations", + () -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithFixedAllocations, isMasterMap) + ) + ); + /* + * AdaptiveAllocationsScalerService tracks the number of allocations with adaptive allocations enabled. + */ + metrics.add( + meterRegistry.registerLongGauge( + "es.ml.trained_models.deployment.disabled_adaptive_allocations.current", + "Sum of current trained model allocations that have adaptive allocations disabled", + "allocations", + () -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithDisabledAdaptiveAllocations, isMasterMap) + ) + ); } @Override @@ -484,17 +503,28 @@ public final class MlMetrics extends AbstractLifecycleComponent implements Clust int trainedModelsTargetAllocations = 0; int trainedModelsCurrentAllocations = 0; int trainedModelsFailedAllocations = 0; + int deploymentsWithFixedAllocations = 0; + int deploymentsWithDisabledAdaptiveAllocations = 0; for (TrainedModelAssignment trainedModelAssignment : metadata.allAssignments().values()) { trainedModelsTargetAllocations += trainedModelAssignment.totalTargetAllocations(); - trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations(); trainedModelsFailedAllocations += trainedModelAssignment.totalFailedAllocations(); + trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations(); + + if (trainedModelAssignment.getAdaptiveAllocationsSettings() == null) { + deploymentsWithFixedAllocations += 1; + } else if ((trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == null) + || (trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == false)) { + deploymentsWithDisabledAdaptiveAllocations += 1; + } } return new TrainedModelAllocationCounts( trainedModelsTargetAllocations, trainedModelsCurrentAllocations, - trainedModelsFailedAllocations + trainedModelsFailedAllocations, + deploymentsWithFixedAllocations, + deploymentsWithDisabledAdaptiveAllocations ); } @@ -556,8 +586,10 @@ public final class MlMetrics extends AbstractLifecycleComponent implements Clust record TrainedModelAllocationCounts( int trainedModelsTargetAllocations, int trainedModelsCurrentAllocations, - int trainedModelsFailedAllocations + int trainedModelsFailedAllocations, + int deploymentsWithFixedAllocations, + int deploymentsWithDisabledAdaptiveAllocations ) { - static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0); + static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0, 0, 0); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java index 28e2380c2a9d..537d65dca925 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java @@ -239,4 +239,12 @@ public class AdaptiveAllocationsScaler { public Long getLastMeasuredQueueSize() { return lastMeasuredQueueSize; } + + public Integer getMinNumberOfAllocations() { + return minNumberOfAllocations; + } + + public Integer getMaxNumberOfAllocations() { + return maxNumberOfAllocations; + } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java index 4ca3d2f02a02..a7812a5dfa0b 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java @@ -105,7 +105,7 @@ public class AdaptiveAllocationsScalerService implements ClusterStateListener { "es.ml.trained_models.adaptive_allocations.actual_number_of_allocations.current", "the actual number of allocations", "", - () -> observeLong(AdaptiveAllocationsScaler::getNumberOfAllocations) + this::observeAllocationCount ) ); metrics.add( @@ -179,6 +179,19 @@ public class AdaptiveAllocationsScalerService implements ClusterStateListener { } return observations; } + + Collection observeAllocationCount() { + return scalers.values().stream().map(scaler -> { + var value = scaler.getNumberOfAllocations(); + var min = scaler.getMinNumberOfAllocations(); + var scalesToZero = min == null || min == 0; + + return new LongWithAttributes( + value, + Map.ofEntries(Map.entry("deployment_id", scaler.getDeploymentId()), Map.entry("scales_to_zero", scalesToZero)) + ); + }).toList(); + } } /** diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java index 5fb1381b881e..60d9074959f5 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java @@ -21,6 +21,7 @@ import org.elasticsearch.xpack.core.ml.action.StartTrainedModelDeploymentAction; import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState; import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig; import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState; +import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings; import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo; import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState; import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment; @@ -146,11 +147,27 @@ public class MlMetricsTests extends ESTestCase { TrainedModelAssignment.Builder.empty(mock(StartTrainedModelDeploymentAction.TaskParams.class), null) .addRoutingEntry("node2", new RoutingInfo(0, 1, RoutingState.STARTING, "")) ); + metadataBuilder.addNewAssignment( + "model4", + TrainedModelAssignment.Builder.empty( + mock(StartTrainedModelDeploymentAction.TaskParams.class), + new AdaptiveAllocationsSettings(true, 0, 1) + ).addRoutingEntry("node1", new RoutingInfo(0, 0, RoutingState.STARTING, "")) + ); + metadataBuilder.addNewAssignment( + "model5", + TrainedModelAssignment.Builder.empty( + mock(StartTrainedModelDeploymentAction.TaskParams.class), + new AdaptiveAllocationsSettings(false, 1, 1) + ).addRoutingEntry("node1", new RoutingInfo(1, 1, RoutingState.STARTING, "")) + ); MlMetrics.TrainedModelAllocationCounts counts = MlMetrics.findTrainedModelAllocationCounts(metadataBuilder.build()); - assertThat(counts.trainedModelsTargetAllocations(), is(5)); - assertThat(counts.trainedModelsCurrentAllocations(), is(3)); + assertThat(counts.trainedModelsTargetAllocations(), is(6)); + assertThat(counts.trainedModelsCurrentAllocations(), is(4)); assertThat(counts.trainedModelsFailedAllocations(), is(1)); + assertThat(counts.deploymentsWithFixedAllocations(), is(3)); + assertThat(counts.deploymentsWithDisabledAdaptiveAllocations(), is(1)); } public void testFindNativeMemoryFree() {