[ResponseOps] Updating TM metrics to handle when capacity estimation returns NaN (#207116)

Resolves https://github.com/elastic/kibana/issues/204467

## Summary

`assumedRequiredThroughputPerMinutePerKibana` is `NaN` when
`capacityStats.runtime.value.load.p90` is undefined. This PR adds a
check that throws an error when `load.p90` is undefined, so that the
capacity estimation is skipped instead of being calculated.


### Checklist

- [ ] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios

### To verify
I was not able to reproduce this locally without changing the code, so
here is how I tested it. I am definitely open to suggestions on how to
test this better.

1. Update the code to set `capacityStats.runtime.value.load.p90:
undefined`. I set it
[here](286c9e2ddb/x-pack/platform/plugins/shared/task_manager/server/monitoring/capacity_estimation.ts (L55)),
but there are other places upstream where you could set it to
`undefined`.
2. Start Kibana
3. Verify that you see the following log message:
```
 Task manager had an issue calculating capacity estimation. averageLoadPercentage: undefined
```
This commit is contained in:
Alexi Doak 2025-02-12 10:16:35 -08:00 committed by GitHub
parent 7ed105d715
commit 8bff766095
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 33 additions and 4 deletions

View file

@ -8,6 +8,7 @@
import { CapacityEstimationParams, estimateCapacity } from './capacity_estimation';
import { HealthStatus, RawMonitoringStats } from './monitoring_stats_stream';
import { mockLogger } from '../test_utils';
import { AveragedStat } from './task_run_calculators';
describe('estimateCapacity', () => {
const logger = mockLogger();
@ -833,6 +834,24 @@ describe('estimateCapacity', () => {
max_throughput_per_minute: 200,
});
});
// With no `p90` on the runtime load, estimateCapacity is expected to throw
// instead of returning an estimate.
test('throws an error when the runtime load is undefined', async () => {
  const runtimeWithoutLoad = { load: {} as AveragedStat };
  const readObservedEstimate = () =>
    estimateCapacity(logger, mockStats({}, {}, runtimeWithoutLoad), 2).value.observed;

  expect(readObservedEstimate).toThrowErrorMatchingInlineSnapshot(
    `"Task manager had an issue calculating capacity estimation. averageLoadPercentage: undefined"`
  );
});
});
function mockStats(

View file

@ -65,6 +65,11 @@ export function estimateCapacity(
capacity: { config: configuredCapacity },
} = capacityStats.configuration.value;
// Bail out only when the load reading is missing or NaN: those are the values
// that poison the throughput math below. A load of 0 is a legitimate reading
// (an idle instance) and must not be treated as an error, which a plain
// falsy check would do.
if (typeof averageLoadPercentage !== 'number' || Number.isNaN(averageLoadPercentage)) {
  throw new Error(
    `Task manager had an issue calculating capacity estimation. averageLoadPercentage: ${averageLoadPercentage}`
  );
}
/**
* On average, how many polling cycles does it take to execute a task?
* If this is higher than the polling cycle, then a whole cycle is wasted as
@ -265,10 +270,15 @@ export function withCapacityEstimate(
assumedKibanaInstances: number
): RawMonitoringStats['stats'] {
if (isCapacityEstimationParams(monitoredStats)) {
return {
...monitoredStats,
capacity_estimation: estimateCapacity(logger, monitoredStats, assumedKibanaInstances),
};
try {
return {
...monitoredStats,
capacity_estimation: estimateCapacity(logger, monitoredStats, assumedKibanaInstances),
};
} catch (e) {
// Return monitoredStats without capacity estimation
logger.error(e.message);
}
}
return monitoredStats;
}