[ML] Add Telemetry for models without adaptive allocations (#129161)

Added min and max allocations as attributes to the telemetry for trained
models with adaptive allocations enabled.

Added telemetry for models with adaptive allocations disabled or never
set.
This commit is contained in:
Pat Whelan 2025-06-13 17:13:45 -04:00 committed by GitHub
parent c4f7b9710d
commit b48f69961d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 82 additions and 7 deletions

View file

@ -0,0 +1,5 @@
pr: 129161
summary: Add Telemetry for models without adaptive allocations
area: Machine Learning
type: enhancement
issues: []

View file

@ -272,6 +272,25 @@ public final class MlMetrics extends AbstractLifecycleComponent implements Clust
() -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFailedAllocations, isMasterMap)
)
);
// Gauge reporting how many trained model deployments have no adaptive allocations settings at all.
// The backing counter is incremented once per deployment, so the description speaks of deployments
// rather than a sum of allocations.
metrics.add(
    meterRegistry.registerLongGauge(
        "es.ml.trained_models.deployment.fixed_allocations.current",
        "Number of current trained model deployments that have no adaptive allocations settings (fixed allocations)",
        "allocations",
        () -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithFixedAllocations, isMasterMap)
    )
);
/*
 * AdaptiveAllocationsScalerService tracks the number of allocations with adaptive allocations enabled,
 * so only the "disabled" case is reported from here.
 * The backing counter is incremented once per deployment, hence the description counts deployments.
 */
metrics.add(
    meterRegistry.registerLongGauge(
        "es.ml.trained_models.deployment.disabled_adaptive_allocations.current",
        "Number of current trained model deployments that have adaptive allocations disabled",
        "allocations",
        () -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithDisabledAdaptiveAllocations, isMasterMap)
    )
);
}
@Override
@ -484,17 +503,28 @@ public final class MlMetrics extends AbstractLifecycleComponent implements Clust
int trainedModelsTargetAllocations = 0;
int trainedModelsCurrentAllocations = 0;
int trainedModelsFailedAllocations = 0;
int deploymentsWithFixedAllocations = 0;
int deploymentsWithDisabledAdaptiveAllocations = 0;
for (TrainedModelAssignment trainedModelAssignment : metadata.allAssignments().values()) {
trainedModelsTargetAllocations += trainedModelAssignment.totalTargetAllocations();
trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();
trainedModelsFailedAllocations += trainedModelAssignment.totalFailedAllocations();
trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();
if (trainedModelAssignment.getAdaptiveAllocationsSettings() == null) {
deploymentsWithFixedAllocations += 1;
} else if ((trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == null)
|| (trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == false)) {
deploymentsWithDisabledAdaptiveAllocations += 1;
}
}
return new TrainedModelAllocationCounts(
trainedModelsTargetAllocations,
trainedModelsCurrentAllocations,
trainedModelsFailedAllocations
trainedModelsFailedAllocations,
deploymentsWithFixedAllocations,
deploymentsWithDisabledAdaptiveAllocations
);
}
@ -556,8 +586,10 @@ public final class MlMetrics extends AbstractLifecycleComponent implements Clust
record TrainedModelAllocationCounts(
int trainedModelsTargetAllocations,
int trainedModelsCurrentAllocations,
int trainedModelsFailedAllocations
int trainedModelsFailedAllocations,
int deploymentsWithFixedAllocations,
int deploymentsWithDisabledAdaptiveAllocations
) {
static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0);
static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0, 0, 0);
}
}

View file

@ -239,4 +239,12 @@ public class AdaptiveAllocationsScaler {
public Long getLastMeasuredQueueSize() {
return lastMeasuredQueueSize;
}
/**
 * Returns the minimum number of allocations held by this scaler.
 *
 * @return the current value of {@code minNumberOfAllocations}; the boxed return type means this
 *         may be {@code null} (NOTE(review): presumably when no minimum was configured; confirm)
 */
public Integer getMinNumberOfAllocations() {
    return this.minNumberOfAllocations;
}
/**
 * Returns the maximum number of allocations held by this scaler.
 *
 * @return the current value of {@code maxNumberOfAllocations}; the boxed return type means this
 *         may be {@code null} (NOTE(review): presumably when no maximum was configured; confirm)
 */
public Integer getMaxNumberOfAllocations() {
    return this.maxNumberOfAllocations;
}
}

View file

@ -105,7 +105,7 @@ public class AdaptiveAllocationsScalerService implements ClusterStateListener {
"es.ml.trained_models.adaptive_allocations.actual_number_of_allocations.current",
"the actual number of allocations",
"",
() -> observeLong(AdaptiveAllocationsScaler::getNumberOfAllocations)
this::observeAllocationCount
)
);
metrics.add(
@ -179,6 +179,19 @@ public class AdaptiveAllocationsScalerService implements ClusterStateListener {
}
return observations;
}
/**
 * Produces one observation per scaler currently held in {@code scalers}.
 * <p>
 * Each observation carries the scaler's number of allocations as its value and two attributes:
 * {@code deployment_id} (the scaler's deployment id) and {@code scales_to_zero}, which is
 * {@code true} when the scaler's minimum number of allocations is {@code null} or {@code 0}.
 *
 * @return the observations, one per scaler
 */
Collection<LongWithAttributes> observeAllocationCount() {
    return scalers.values().stream().map(deploymentScaler -> {
        var allocationCount = deploymentScaler.getNumberOfAllocations();
        Integer minAllocations = deploymentScaler.getMinNumberOfAllocations();
        // A missing minimum is treated the same as an explicit minimum of zero.
        boolean canScaleToZero = minAllocations == null || minAllocations == 0;
        return new LongWithAttributes(
            allocationCount,
            Map.of("deployment_id", deploymentScaler.getDeploymentId(), "scales_to_zero", canScaleToZero)
        );
    }).toList();
}
}
/**

View file

@ -21,6 +21,7 @@ import org.elasticsearch.xpack.core.ml.action.StartTrainedModelDeploymentAction;
import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo;
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState;
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment;
@ -146,11 +147,27 @@ public class MlMetricsTests extends ESTestCase {
TrainedModelAssignment.Builder.empty(mock(StartTrainedModelDeploymentAction.TaskParams.class), null)
.addRoutingEntry("node2", new RoutingInfo(0, 1, RoutingState.STARTING, ""))
);
metadataBuilder.addNewAssignment(
"model4",
TrainedModelAssignment.Builder.empty(
mock(StartTrainedModelDeploymentAction.TaskParams.class),
new AdaptiveAllocationsSettings(true, 0, 1)
).addRoutingEntry("node1", new RoutingInfo(0, 0, RoutingState.STARTING, ""))
);
metadataBuilder.addNewAssignment(
"model5",
TrainedModelAssignment.Builder.empty(
mock(StartTrainedModelDeploymentAction.TaskParams.class),
new AdaptiveAllocationsSettings(false, 1, 1)
).addRoutingEntry("node1", new RoutingInfo(1, 1, RoutingState.STARTING, ""))
);
MlMetrics.TrainedModelAllocationCounts counts = MlMetrics.findTrainedModelAllocationCounts(metadataBuilder.build());
assertThat(counts.trainedModelsTargetAllocations(), is(5));
assertThat(counts.trainedModelsCurrentAllocations(), is(3));
assertThat(counts.trainedModelsTargetAllocations(), is(6));
assertThat(counts.trainedModelsCurrentAllocations(), is(4));
assertThat(counts.trainedModelsFailedAllocations(), is(1));
assertThat(counts.deploymentsWithFixedAllocations(), is(3));
assertThat(counts.deploymentsWithDisabledAdaptiveAllocations(), is(1));
}
public void testFindNativeMemoryFree() {