mirror of
https://github.com/elastic/elasticsearch.git
synced 2025-06-28 09:28:55 -04:00
[ML] Add Telemetry for models without adaptive allocations (#129161)
Added min and max allocations as attributes to the telemetry for trained models with adaptive allocations enabled. Added telemetry for models with adaptive allocations disabled or never set.
This commit is contained in:
parent
c4f7b9710d
commit
b48f69961d
5 changed files with 82 additions and 7 deletions
5
docs/changelog/129161.yaml
Normal file
5
docs/changelog/129161.yaml
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
pr: 129161
|
||||||
|
summary: Add Telemetry for models without adaptive allocations
|
||||||
|
area: Machine Learning
|
||||||
|
type: enhancement
|
||||||
|
issues: []
|
|
@ -272,6 +272,25 @@ public final class MlMetrics extends AbstractLifecycleComponent implements Clust
|
||||||
() -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFailedAllocations, isMasterMap)
|
() -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFailedAllocations, isMasterMap)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
metrics.add(
|
||||||
|
meterRegistry.registerLongGauge(
|
||||||
|
"es.ml.trained_models.deployment.fixed_allocations.current",
|
||||||
|
"Sum of current trained model allocations that do not use adaptive allocations (either enabled or disabled)",
|
||||||
|
"allocations",
|
||||||
|
() -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithFixedAllocations, isMasterMap)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
/*
|
||||||
|
* AdaptiveAllocationsScalerService already tracks the number of allocations for deployments with adaptive allocations enabled, so only deployments with adaptive allocations disabled are reported here.
|
||||||
|
*/
|
||||||
|
metrics.add(
|
||||||
|
meterRegistry.registerLongGauge(
|
||||||
|
"es.ml.trained_models.deployment.disabled_adaptive_allocations.current",
|
||||||
|
"Sum of current trained model allocations that have adaptive allocations disabled",
|
||||||
|
"allocations",
|
||||||
|
() -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithDisabledAdaptiveAllocations, isMasterMap)
|
||||||
|
)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -484,17 +503,28 @@ public final class MlMetrics extends AbstractLifecycleComponent implements Clust
|
||||||
int trainedModelsTargetAllocations = 0;
|
int trainedModelsTargetAllocations = 0;
|
||||||
int trainedModelsCurrentAllocations = 0;
|
int trainedModelsCurrentAllocations = 0;
|
||||||
int trainedModelsFailedAllocations = 0;
|
int trainedModelsFailedAllocations = 0;
|
||||||
|
int deploymentsWithFixedAllocations = 0;
|
||||||
|
int deploymentsWithDisabledAdaptiveAllocations = 0;
|
||||||
|
|
||||||
for (TrainedModelAssignment trainedModelAssignment : metadata.allAssignments().values()) {
|
for (TrainedModelAssignment trainedModelAssignment : metadata.allAssignments().values()) {
|
||||||
trainedModelsTargetAllocations += trainedModelAssignment.totalTargetAllocations();
|
trainedModelsTargetAllocations += trainedModelAssignment.totalTargetAllocations();
|
||||||
trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();
|
|
||||||
trainedModelsFailedAllocations += trainedModelAssignment.totalFailedAllocations();
|
trainedModelsFailedAllocations += trainedModelAssignment.totalFailedAllocations();
|
||||||
|
trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();
|
||||||
|
|
||||||
|
if (trainedModelAssignment.getAdaptiveAllocationsSettings() == null) {
|
||||||
|
deploymentsWithFixedAllocations += 1;
|
||||||
|
} else if ((trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == null)
|
||||||
|
|| (trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == false)) {
|
||||||
|
deploymentsWithDisabledAdaptiveAllocations += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new TrainedModelAllocationCounts(
|
return new TrainedModelAllocationCounts(
|
||||||
trainedModelsTargetAllocations,
|
trainedModelsTargetAllocations,
|
||||||
trainedModelsCurrentAllocations,
|
trainedModelsCurrentAllocations,
|
||||||
trainedModelsFailedAllocations
|
trainedModelsFailedAllocations,
|
||||||
|
deploymentsWithFixedAllocations,
|
||||||
|
deploymentsWithDisabledAdaptiveAllocations
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -556,8 +586,10 @@ public final class MlMetrics extends AbstractLifecycleComponent implements Clust
|
||||||
record TrainedModelAllocationCounts(
|
record TrainedModelAllocationCounts(
|
||||||
int trainedModelsTargetAllocations,
|
int trainedModelsTargetAllocations,
|
||||||
int trainedModelsCurrentAllocations,
|
int trainedModelsCurrentAllocations,
|
||||||
int trainedModelsFailedAllocations
|
int trainedModelsFailedAllocations,
|
||||||
|
int deploymentsWithFixedAllocations,
|
||||||
|
int deploymentsWithDisabledAdaptiveAllocations
|
||||||
) {
|
) {
|
||||||
static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0);
|
static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0, 0, 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -239,4 +239,12 @@ public class AdaptiveAllocationsScaler {
|
||||||
public Long getLastMeasuredQueueSize() {
|
public Long getLastMeasuredQueueSize() {
|
||||||
return lastMeasuredQueueSize;
|
return lastMeasuredQueueSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns this scaler's {@code minNumberOfAllocations} value.
 *
 * NOTE(review): the return type is a boxed {@code Integer}, and the caller
 * {@code observeAllocationCount()} checks the result for {@code null};
 * presumably {@code null} means no minimum is configured. Confirm.
 *
 * @return the minimum number of allocations, possibly {@code null}
 */
public Integer getMinNumberOfAllocations() {
|
||||||
|
return minNumberOfAllocations;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Returns this scaler's {@code maxNumberOfAllocations} value.
 *
 * NOTE(review): the return type is a boxed {@code Integer}; presumably it
 * is {@code null} when no maximum is configured. Confirm.
 *
 * @return the maximum number of allocations, possibly {@code null}
 */
public Integer getMaxNumberOfAllocations() {
|
||||||
|
return maxNumberOfAllocations;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -105,7 +105,7 @@ public class AdaptiveAllocationsScalerService implements ClusterStateListener {
|
||||||
"es.ml.trained_models.adaptive_allocations.actual_number_of_allocations.current",
|
"es.ml.trained_models.adaptive_allocations.actual_number_of_allocations.current",
|
||||||
"the actual number of allocations",
|
"the actual number of allocations",
|
||||||
"",
|
"",
|
||||||
() -> observeLong(AdaptiveAllocationsScaler::getNumberOfAllocations)
|
this::observeAllocationCount
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
metrics.add(
|
metrics.add(
|
||||||
|
@ -179,6 +179,19 @@ public class AdaptiveAllocationsScalerService implements ClusterStateListener {
|
||||||
}
|
}
|
||||||
return observations;
|
return observations;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds one observation per entry in {@code scalers}, used as the observer
 * for the "actual_number_of_allocations" gauge.
 *
 * Each observation carries the scaler's current number of allocations as its
 * value and two attributes: {@code deployment_id} (from
 * {@code scaler.getDeploymentId()}) and {@code scales_to_zero}.
 *
 * @return a list with one {@code LongWithAttributes} per scaler
 */
Collection<LongWithAttributes> observeAllocationCount() {
|
||||||
|
return scalers.values().stream().map(scaler -> {
|
||||||
|
// NOTE(review): if getNumberOfAllocations() returns a boxed type that can be null,
|
// passing it to LongWithAttributes below would fail on unboxing - confirm.
var value = scaler.getNumberOfAllocations();
|
||||||
|
var min = scaler.getMinNumberOfAllocations();
|
||||||
|
// A missing (null) or zero minimum is reported as scales_to_zero = true.
var scalesToZero = min == null || min == 0;
|
||||||
|
|
||||||
|
return new LongWithAttributes(
|
||||||
|
value,
|
||||||
|
Map.ofEntries(Map.entry("deployment_id", scaler.getDeploymentId()), Map.entry("scales_to_zero", scalesToZero))
|
||||||
|
);
|
||||||
|
}).toList();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -21,6 +21,7 @@ import org.elasticsearch.xpack.core.ml.action.StartTrainedModelDeploymentAction;
|
||||||
import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState;
|
import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState;
|
||||||
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
|
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
|
||||||
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
|
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
|
||||||
|
import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
|
||||||
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo;
|
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo;
|
||||||
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState;
|
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState;
|
||||||
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment;
|
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment;
|
||||||
|
@ -146,11 +147,27 @@ public class MlMetricsTests extends ESTestCase {
|
||||||
TrainedModelAssignment.Builder.empty(mock(StartTrainedModelDeploymentAction.TaskParams.class), null)
|
TrainedModelAssignment.Builder.empty(mock(StartTrainedModelDeploymentAction.TaskParams.class), null)
|
||||||
.addRoutingEntry("node2", new RoutingInfo(0, 1, RoutingState.STARTING, ""))
|
.addRoutingEntry("node2", new RoutingInfo(0, 1, RoutingState.STARTING, ""))
|
||||||
);
|
);
|
||||||
|
metadataBuilder.addNewAssignment(
|
||||||
|
"model4",
|
||||||
|
TrainedModelAssignment.Builder.empty(
|
||||||
|
mock(StartTrainedModelDeploymentAction.TaskParams.class),
|
||||||
|
new AdaptiveAllocationsSettings(true, 0, 1)
|
||||||
|
).addRoutingEntry("node1", new RoutingInfo(0, 0, RoutingState.STARTING, ""))
|
||||||
|
);
|
||||||
|
metadataBuilder.addNewAssignment(
|
||||||
|
"model5",
|
||||||
|
TrainedModelAssignment.Builder.empty(
|
||||||
|
mock(StartTrainedModelDeploymentAction.TaskParams.class),
|
||||||
|
new AdaptiveAllocationsSettings(false, 1, 1)
|
||||||
|
).addRoutingEntry("node1", new RoutingInfo(1, 1, RoutingState.STARTING, ""))
|
||||||
|
);
|
||||||
|
|
||||||
MlMetrics.TrainedModelAllocationCounts counts = MlMetrics.findTrainedModelAllocationCounts(metadataBuilder.build());
|
MlMetrics.TrainedModelAllocationCounts counts = MlMetrics.findTrainedModelAllocationCounts(metadataBuilder.build());
|
||||||
assertThat(counts.trainedModelsTargetAllocations(), is(5));
|
assertThat(counts.trainedModelsTargetAllocations(), is(6));
|
||||||
assertThat(counts.trainedModelsCurrentAllocations(), is(3));
|
assertThat(counts.trainedModelsCurrentAllocations(), is(4));
|
||||||
assertThat(counts.trainedModelsFailedAllocations(), is(1));
|
assertThat(counts.trainedModelsFailedAllocations(), is(1));
|
||||||
|
assertThat(counts.deploymentsWithFixedAllocations(), is(3));
|
||||||
|
assertThat(counts.deploymentsWithDisabledAdaptiveAllocations(), is(1));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFindNativeMemoryFree() {
|
public void testFindNativeMemoryFree() {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue