From abaa9379fd9a3414be65bcf5287fbd0ea415f8b0 Mon Sep 17 00:00:00 2001
From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com>
Date: Thu, 20 Mar 2025 05:34:53 -0700
Subject: [PATCH] tests: make integration split quantity configurable (#17219)
 (#17368)

* tests: make integration split quantity configurable

Refactors shared splitter bash function to take a list of files on stdin
and split into a configurable number of partitions, emitting only those from
the currently-selected partition to stdout.

Also refactors the only caller in the integration_tests launcher script to
accept an optional partition_count parameter (defaulting to `2` for backward-
compatibility), to provide the list of specs to the function's stdin, and to
output relevant information about the quantity of partition splits and which
was selected.

* ci: run integration tests in 3 parts

(cherry picked from commit 3e0f488df29c3602a6890a071f34f05e34463cb2)

Co-authored-by: Rye Biesemeyer <yaauie@users.noreply.github.com>
---
 .buildkite/aarch64_pipeline.yml               | 47 ++++++++---
 .buildkite/pull_request_pipeline.yml          | 61 ++++++++++++---
 .../jdk-matrix-tests/generate-steps.py        | 50 +++++-------
 ci/get-test-half.sh                           | 27 -------
 ci/integration_tests.sh                       | 20 +++--
 ci/partition-files.lib.sh                     | 78 +++++++++++++++++++
 6 files changed, 192 insertions(+), 91 deletions(-)
 delete mode 100644 ci/get-test-half.sh
 create mode 100755 ci/partition-files.lib.sh

diff --git a/.buildkite/aarch64_pipeline.yml b/.buildkite/aarch64_pipeline.yml
index fbbe1260f..8c66116fe 100644
--- a/.buildkite/aarch64_pipeline.yml
+++ b/.buildkite/aarch64_pipeline.yml
@@ -35,48 +35,71 @@ steps:
           automatic:
             - limit: 3
 
-      - label: ":lab_coat: Integration Tests / part 1"
-        key: "integration-tests-part-1"
+      - label: ":lab_coat: Integration Tests / part 1-of-3"
+        key: "integration-tests-part-1-of-3"
         command: |
           set -euo pipefail
 
           source .buildkite/scripts/common/vm-agent.sh
-          ci/integration_tests.sh split 0
+          ci/integration_tests.sh split 0 3
         retry:
           automatic:
             - limit: 3
 
-      - label: ":lab_coat: Integration Tests / part 2"
-        key: "integration-tests-part-2"
+      - label: ":lab_coat: Integration Tests / part 2-of-3"
+        key: "integration-tests-part-2-of-3"
         command: |
           set -euo pipefail
 
           source .buildkite/scripts/common/vm-agent.sh
-          ci/integration_tests.sh split 1
+          ci/integration_tests.sh split 1 3
         retry:
           automatic:
             - limit: 3
 
-      - label: ":lab_coat: IT Persistent Queues / part 1"
-        key: "integration-tests-qa-part-1"
+      - label: ":lab_coat: Integration Tests / part 3-of-3"
+        key: "integration-tests-part-3-of-3"
+        command: |
+          set -euo pipefail
+
+          source .buildkite/scripts/common/vm-agent.sh
+          ci/integration_tests.sh split 2 3
+        retry:
+          automatic:
+            - limit: 3
+
+      - label: ":lab_coat: IT Persistent Queues / part 1-of-3"
+        key: "integration-tests-qa-part-1-of-3"
         command: |
           set -euo pipefail
 
           source .buildkite/scripts/common/vm-agent.sh
           export FEATURE_FLAG=persistent_queues
-          ci/integration_tests.sh split 0
+          ci/integration_tests.sh split 0 3
         retry:
           automatic:
             - limit: 3
 
-      - label: ":lab_coat: IT Persistent Queues / part 2"
-        key: "integration-tests-qa-part-2"
+      - label: ":lab_coat: IT Persistent Queues / part 2-of-3"
+        key: "integration-tests-qa-part-2-of-3"
         command: |
           set -euo pipefail
 
           source .buildkite/scripts/common/vm-agent.sh
           export FEATURE_FLAG=persistent_queues
-          ci/integration_tests.sh split 1
+          ci/integration_tests.sh split 1 3
+        retry:
+          automatic:
+            - limit: 3
+
+      - label: ":lab_coat: IT Persistent Queues / part 3-of-3"
+        key: "integration-tests-qa-part-3-of-3"
+        command: |
+          set -euo pipefail
+
+          source .buildkite/scripts/common/vm-agent.sh
+          export FEATURE_FLAG=persistent_queues
+          ci/integration_tests.sh split 2 3
         retry:
           automatic:
             - limit: 3
diff --git a/.buildkite/pull_request_pipeline.yml b/.buildkite/pull_request_pipeline.yml
index 9209144da..6cf3ed356 100644
--- a/.buildkite/pull_request_pipeline.yml
+++ b/.buildkite/pull_request_pipeline.yml
@@ -79,8 +79,8 @@ steps:
       manual:
         allowed: true
 
-  - label: ":lab_coat: Integration Tests / part 1"
-    key: "integration-tests-part-1"
+  - label: ":lab_coat: Integration Tests / part 1-of-3"
+    key: "integration-tests-part-1-of-3"
     agents:
       image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
       cpu: "8"
@@ -95,10 +95,10 @@ steps:
       set -euo pipefail
 
       source .buildkite/scripts/common/container-agent.sh
-      ci/integration_tests.sh split 0
+      ci/integration_tests.sh split 0 3
 
-  - label: ":lab_coat: Integration Tests / part 2"
-    key: "integration-tests-part-2"
+  - label: ":lab_coat: Integration Tests / part 2-of-3"
+    key: "integration-tests-part-2-of-3"
     agents:
       image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
       cpu: "8"
@@ -113,10 +113,28 @@ steps:
       set -euo pipefail
 
       source .buildkite/scripts/common/container-agent.sh
-      ci/integration_tests.sh split 1
+      ci/integration_tests.sh split 1 3
 
-  - label: ":lab_coat: IT Persistent Queues / part 1"
-    key: "integration-tests-qa-part-1"
+  - label: ":lab_coat: Integration Tests / part 3-of-3"
+    key: "integration-tests-part-3-of-3"
+    agents:
+      image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
+      cpu: "8"
+      memory: "16Gi"
+      ephemeralStorage: "100Gi"
+      # Run as a non-root user
+      imageUID: "1002"
+    retry:
+      automatic:
+        - limit: 3
+    command: |
+      set -euo pipefail
+
+      source .buildkite/scripts/common/container-agent.sh
+      ci/integration_tests.sh split 2 3
+
+  - label: ":lab_coat: IT Persistent Queues / part 1-of-3"
+    key: "integration-tests-qa-part-1-of-3"
     agents:
       image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
       cpu: "8"
@@ -132,10 +150,10 @@ steps:
 
       source .buildkite/scripts/common/container-agent.sh
       export FEATURE_FLAG=persistent_queues
-      ci/integration_tests.sh split 0
+      ci/integration_tests.sh split 0 3
 
-  - label: ":lab_coat: IT Persistent Queues / part 2"
-    key: "integration-tests-qa-part-2"
+  - label: ":lab_coat: IT Persistent Queues / part 2-of-3"
+    key: "integration-tests-qa-part-2-of-3"
     agents:
       image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
       cpu: "8"
@@ -151,7 +169,26 @@ steps:
 
       source .buildkite/scripts/common/container-agent.sh
       export FEATURE_FLAG=persistent_queues
-      ci/integration_tests.sh split 1
+      ci/integration_tests.sh split 1 3
+
+  - label: ":lab_coat: IT Persistent Queues / part 3-of-3"
+    key: "integration-tests-qa-part-3-of-3"
+    agents:
+      image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
+      cpu: "8"
+      memory: "16Gi"
+      ephemeralStorage: "100Gi"
+      # Run as non root (logstash) user. UID is hardcoded in image.
+      imageUID: "1002"
+    retry:
+      automatic:
+        - limit: 3
+    command: |
+      set -euo pipefail
+
+      source .buildkite/scripts/common/container-agent.sh
+      export FEATURE_FLAG=persistent_queues
+      ci/integration_tests.sh split 2 3
 
   - label: ":lab_coat: x-pack unit tests"
     key: "x-pack-unit-tests"
diff --git a/.buildkite/scripts/jdk-matrix-tests/generate-steps.py b/.buildkite/scripts/jdk-matrix-tests/generate-steps.py
index 681272b85..b948f0740 100644
--- a/.buildkite/scripts/jdk-matrix-tests/generate-steps.py
+++ b/.buildkite/scripts/jdk-matrix-tests/generate-steps.py
@@ -177,17 +177,15 @@ class LinuxJobs(Jobs):
       super().__init__(os=os, jdk=jdk, group_key=group_key, agent=agent)
 
     def all_jobs(self) -> list[typing.Callable[[], JobRetValues]]:
-        return [
-            self.init_annotation,
-            self.java_unit_test,
-            self.ruby_unit_test,
-            self.integration_tests_part_1,
-            self.integration_tests_part_2,
-            self.pq_integration_tests_part_1,
-            self.pq_integration_tests_part_2,
-            self.x_pack_unit_tests,
-            self.x_pack_integration,
-        ]
+        jobs=list()
+        jobs.append(self.init_annotation)
+        jobs.append(self.java_unit_test)
+        jobs.append(self.ruby_unit_test)
+        jobs.extend(self.integration_test_parts(3))
+        jobs.extend(self.pq_integration_test_parts(3))
+        jobs.append(self.x_pack_unit_tests)
+        jobs.append(self.x_pack_integration)
+        return jobs
 
     def prepare_shell(self) -> str:
         jdk_dir = f"/opt/buildkite-agent/.java/{self.jdk}"
@@ -259,17 +257,14 @@ ci/unit_tests.sh ruby
             retry=copy.deepcopy(ENABLED_RETRIES),
         )
 
-    def integration_tests_part_1(self) -> JobRetValues:
-        return self.integration_tests(part=1)
+    def integration_test_parts(self, parts) -> list[typing.Callable[[], JobRetValues]]:  # one deferred job per part, as all_jobs() lists callables
+        return [lambda part=idx+1: self.integration_tests(part, parts) for idx in range(parts)]  # default arg pins each lambda's own part number
 
-    def integration_tests_part_2(self) -> JobRetValues:
-        return self.integration_tests(part=2)
-
-    def integration_tests(self, part: int) -> JobRetValues:
-        step_name_human = f"Integration Tests - {part}"
-        step_key = f"{self.group_key}-integration-tests-{part}"
+    def integration_tests(self, part: int, parts: int) -> JobRetValues:
+        step_name_human = f"Integration Tests - {part}/{parts}"
+        step_key = f"{self.group_key}-integration-tests-{part}-of-{parts}"
         test_command = f"""
-ci/integration_tests.sh split {part-1}
+ci/integration_tests.sh split {part-1} {parts}
         """
 
         return JobRetValues(
@@ -281,18 +276,15 @@ ci/integration_tests.sh split {part-1}
             retry=copy.deepcopy(ENABLED_RETRIES),
         )
 
-    def pq_integration_tests_part_1(self) -> JobRetValues:
-        return self.pq_integration_tests(part=1)
+    def pq_integration_test_parts(self, parts) -> list[typing.Callable[[], JobRetValues]]:  # one deferred job per part, as all_jobs() lists callables
+        return [lambda part=idx+1: self.pq_integration_tests(part, parts) for idx in range(parts)]  # default arg pins each lambda's own part number
 
-    def pq_integration_tests_part_2(self) -> JobRetValues:
-        return self.pq_integration_tests(part=2)
-
-    def pq_integration_tests(self, part: int) -> JobRetValues:
-        step_name_human = f"IT Persistent Queues - {part}"
-        step_key = f"{self.group_key}-it-persistent-queues-{part}"
+    def pq_integration_tests(self, part: int, parts: int) -> JobRetValues:
+        step_name_human = f"IT Persistent Queues - {part}/{parts}"
+        step_key = f"{self.group_key}-it-persistent-queues-{part}-of-{parts}"
         test_command = f"""
 export FEATURE_FLAG=persistent_queues
-ci/integration_tests.sh split {part-1}
+ci/integration_tests.sh split {part-1} {parts}
         """
 
         return JobRetValues(
diff --git a/ci/get-test-half.sh b/ci/get-test-half.sh
deleted file mode 100644
index 147722540..000000000
--- a/ci/get-test-half.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# get_test_half returns either the first or second half of integration tests
-# Usage: get_test_half <half_number>
-# half_number: 0 for first half, 1 for second half
-get_test_half() {
-    local half_number=$1
-    # Ensure only spec files go to stdout
-    pushd qa/integration >/dev/null 2>&1
-    
-    # Collect all spec files
-    local glob1=(specs/*spec.rb)
-    local glob2=(specs/**/*spec.rb)
-    local all_specs=("${glob1[@]}" "${glob2[@]}")
-    
-    # Calculate the split point
-    local split_point=$((${#all_specs[@]} / 2))
-    
-    # Get the requested half (:: is "up to", : is "from")
-    if [[ $half_number -eq 0 ]]; then
-        local specs="${all_specs[@]::$split_point}"
-    else
-        local specs="${all_specs[@]:$split_point}"
-    fi
-    popd >/dev/null 2>&1
-    echo "$specs"
-}
\ No newline at end of file
diff --git a/ci/integration_tests.sh b/ci/integration_tests.sh
index 43573341f..318660bc9 100755
--- a/ci/integration_tests.sh
+++ b/ci/integration_tests.sh
@@ -10,9 +10,6 @@ export GRADLE_OPTS="-Xmx2g -Dorg.gradle.jvmargs=-Xmx2g -Dorg.gradle.daemon=false
 export SPEC_OPTS="--order rand --format documentation"
 export CI=true
 
-# Source shared function for splitting integration tests
-source "$(dirname "${BASH_SOURCE[0]}")/get-test-half.sh"
-
 if [ -n "$BUILD_JAVA_HOME" ]; then
   GRADLE_OPTS="$GRADLE_OPTS -Dorg.gradle.java.home=$BUILD_JAVA_HOME"
 fi
@@ -22,14 +19,15 @@ if [[ $1 = "setup" ]]; then
  exit 0
 
 elif [[ $1 == "split" ]]; then
-    if [[ $2 =~ ^[01]$ ]]; then
-        specs=$(get_test_half "$2")
-        echo "Running half $2 of integration specs: $specs"
-        ./gradlew runIntegrationTests -PrubyIntegrationSpecs="$specs" --console=plain
-    else
-       echo "Error, must specify 0 or 1 after the split. For example ci/integration_tests.sh split 0"
-       exit 1
-    fi
+  # Source shared function for splitting integration tests
+  source "$(dirname "${BASH_SOURCE[0]}")/partition-files.lib.sh"
+  index="${2:?index}"
+  count="${3:-2}"
+  # plain assignment keeps the exit status an array literal would discard: abort rather than run gradle with no specs
+  spec_list="$(cd qa/integration && find specs -name '*_spec.rb' | partition_files "${index}" "${count}")" || exit 1
+  read -r -d '' -a specs <<<"${spec_list}" || true  # word-split without globbing; read reports EOF, hence || true
+  echo "Running integration tests partition[${index}] of ${count}: ${specs[*]}"
+  ./gradlew runIntegrationTests -PrubyIntegrationSpecs="${specs[*]}" --console=plain
 
 elif [[ !  -z  $@  ]]; then
     echo "Running integration tests 'rspec $@'"
diff --git a/ci/partition-files.lib.sh b/ci/partition-files.lib.sh
new file mode 100755
index 000000000..c974921da
--- /dev/null
+++ b/ci/partition-files.lib.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+# partition_files emits a deterministic partition of the filenames given on stdin
+# Usage: partition_files <partition_index> [partition_count=2] < <(ls files)
+# partition_index: the zero-based index of the partition to select `[0,partition_count)`
+# partition_count: the number of partitions `[2,#files]`; defaults to 2 when omitted
+# Prints the selected names one per line; invalid input prints ERROR to stderr and returns 1
+partition_files() (
+    set -e  # the body is a subshell, so neither this nor `exit` leaks into the caller
+    local files
+    # sort+dedupe under LC_ALL=C so every agent, whatever its locale, derives the same
+    # order and therefore disjoint partitions; read reports EOF as failure, hence || true
+    IFS=$'\n' read -r -d '' -a files <<<"$(LC_ALL=C sort -u)" || true
+
+    local partition_index="${1:?}"
+    local partition_count="${2:-2}"
+
+    _error () { >&2 echo "ERROR: ${1:-UNSPECIFIED}"; exit 1; }
+
+    # safeguard against nonsense invocations (10# forces base 10, so 08 is not parsed as octal)
+    if (( ${#files[@]} < 2 )); then
+      _error "#files(${#files[@]}) must be at least 2 in order to partition"
+    elif ! [[ "${partition_count}" =~ ^[0-9]+$ ]] || (( 10#${partition_count} < 2 || 10#${partition_count} > ${#files[@]} )); then
+      _error "partition_count(${partition_count}) must be a number that is at least 2 and not greater than #files(${#files[@]})"
+    elif ! [[ "${partition_index}" =~ ^[0-9]+$ ]] || (( 10#${partition_index} >= 10#${partition_count} )); then
+      _error "partition_index(${partition_index}) must be a number that is at least 0 and less than partition_count(${partition_count})"
+    fi
+
+    # round-robin emit those in our selected partition
+    for index in "${!files[@]}"; do
+      if (( index % 10#${partition_count} == 10#${partition_index} )); then
+        printf '%s\n' "${files[index]}"
+      fi
+    done
+)
+
+# When executed rather than sourced: `test` runs a self-check; anything else partitions stdin per the given args.
+if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
+  if [[ "$1" == "test" ]]; then
+    status=0
+
+    SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+    file_list="$( cd "${SCRIPT_DIR}"; find . -type f )"
+
+    # for any legal partitioning into N partitions, we ensure that
+    # the combined output of `partition_files I N` where `I` is all numbers in
+    # the range `[0,N)` produces no repeats and no omissions, even if the
+    # input list is not consistently ordered.
+    for n in $(seq 2 $(wc -l <<<"${file_list}")); do
+      result=""
+      for i in $(seq 0 $(( n - 1 ))); do
+        # append each partition verbatim (one name per line) so names containing spaces are not word-split
+        result+="$(partition_files "$i" "$n" <<<"$( shuf <<<"${file_list}" )")"$'\n'
+      done
+
+      # any non-empty output is a failure; `printf "$x" | wc -l` counted a lone unterminated name as 0 lines
+      repeated="$( sort <<<"${result}" | uniq -d )"
+      if [[ -n "${repeated}" ]]; then
+        status=1
+        echo "[n=${n}]FAIL(repeated):"$'\n'"${repeated}"
+      fi
+
+      missing=$( comm -23 <(sort <<<"${file_list}") <( sort <<<"${result}" ) )
+      if [[ -n "${missing}" ]]; then
+        status=1
+        echo "[n=${n}]FAIL(omitted):"$'\n'"${missing}"
+      fi
+    done
+
+    if (( status > 0 )); then
+      echo "There were failures. The input list was:"
+      echo "${file_list}"
+    fi
+    exit "${status}"
+  else
+    partition_files "$@"
+  fi
+fi
\ No newline at end of file