[ResponseOps] Updating TM metrics to handle when capacity estimation returns NaN (#207116)

Resolves https://github.com/elastic/kibana/issues/204467 ## Summary `assumedRequiredThroughputPerMinutePerKibana` is `NaN` when `capacityStats.runtime.value.load.p90` is undefined. This PR adds a check that throws an error when `load.p90` is undefined, and catches that error in `withCapacityEstimate` so that it is logged and the stats are returned without a capacity estimation. ### Checklist - [ ] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios ### To verify I was not able to reproduce this locally without changing the code, so here is how I tested it; I am open to suggestions for a better way to test this. 1. Update the code to set `capacityStats.runtime.value.load.p90` to `undefined`. I set it [here](286c9e2ddb/x-pack/platform/plugins/shared/task_manager/server/monitoring/capacity_estimation.ts (L55)), but there are other places upstream where you could set it to `undefined`. 2. Start Kibana. 3. Verify that you see the following log message: ``` Task manager had an issue calculating capacity estimation. averageLoadPercentage: undefined ```
2025-04-24 17:59:23 -04:00 · 2025-02-12 10:16:35 -08:00 · 2025-02-12 10:16:35 -08:00 · 8bff766095
commit 8bff766095
parent 7ed105d715
2 changed files with 33 additions and 4 deletions
--- a/x-pack/platform/plugins/shared/task_manager/server/monitoring/capacity_estimation.test.ts
+++ b/x-pack/platform/plugins/shared/task_manager/server/monitoring/capacity_estimation.test.ts
@ -8,6 +8,7 @@
 import { CapacityEstimationParams, estimateCapacity } from './capacity_estimation';
 import { HealthStatus, RawMonitoringStats } from './monitoring_stats_stream';
 import { mockLogger } from '../test_utils';
+import { AveragedStat } from './task_run_calculators';

 describe('estimateCapacity', () => {
  const logger = mockLogger();
@ -833,6 +834,24 @@ describe('estimateCapacity', () => {
      max_throughput_per_minute: 200,
    });
  });
+  test('throws an error when the runtime load is undefined', async () => {
+    expect(
+      () =>
+        estimateCapacity(
+          logger,
+          mockStats(
+            {},
+            {},
+            {
+              load: {} as AveragedStat,
+            }
+          ),
+          2
+        ).value.observed
+    ).toThrowErrorMatchingInlineSnapshot(
+      `"Task manager had an issue calculating capacity estimation. averageLoadPercentage: undefined"`
+    );
+  });
 });

 function mockStats(
--- a/x-pack/platform/plugins/shared/task_manager/server/monitoring/capacity_estimation.ts
+++ b/x-pack/platform/plugins/shared/task_manager/server/monitoring/capacity_estimation.ts
@ -65,6 +65,11 @@ export function estimateCapacity(
    capacity: { config: configuredCapacity },
  } = capacityStats.configuration.value;

+  if (!averageLoadPercentage) {
+    throw new Error(
+      `Task manager had an issue calculating capacity estimation. averageLoadPercentage: ${averageLoadPercentage}`
+    );
+  }
  /**
   * On average, how many polling cycles does it take to execute a task?
   * If this is higher than the polling cycle, then a whole cycle is wasted as
@ -265,10 +270,15 @@ export function withCapacityEstimate(
  assumedKibanaInstances: number
 ): RawMonitoringStats['stats'] {
  if (isCapacityEstimationParams(monitoredStats)) {
-    return {
-      ...monitoredStats,
-      capacity_estimation: estimateCapacity(logger, monitoredStats, assumedKibanaInstances),
-    };
+    try {
+      return {
+        ...monitoredStats,
+        capacity_estimation: estimateCapacity(logger, monitoredStats, assumedKibanaInstances),
+      };
+    } catch (e) {
+      // Return monitoredStats without the capacity estimation
+      logger.error(e.message);
+    }
  }
  return monitoredStats;
 }