tests: make integration split quantity configurable (#17219)

* tests: make integration split quantity configurable Refactors shared splitter bash function to take a list of files on stdin and split into a configurable number of partitions, emitting only those from the currently-selected partition to stdout. Also refactors the only caller in the integration_tests launcher script to accept an optional partition_count parameter (defaulting to `2` for backward-compatibility), to provide the list of specs to the function's stdin, and to output relevant information about the quantity of partition splits and which was selected. * ci: run integration tests in 3 parts
2025-06-27 17:08:55 -04:00 · 2025-03-19 16:37:27 -07:00 · 2025-03-19 16:37:27 -07:00 · 3e0f488df2
commit 3e0f488df2
parent 7683983168
6 changed files with 192 additions and 91 deletions
--- a/.buildkite/aarch64_pipeline.yml
+++ b/.buildkite/aarch64_pipeline.yml
@ -35,48 +35,71 @@ steps:
          automatic:
            - limit: 3

-      - label: ":lab_coat: Integration Tests / part 1"
-        key: "integration-tests-part-1"
+      - label: ":lab_coat: Integration Tests / part 1-of-3"
+        key: "integration-tests-part-1-of-3"
        command: |
          set -euo pipefail

          source .buildkite/scripts/common/vm-agent.sh
-          ci/integration_tests.sh split 0
+          ci/integration_tests.sh split 0 3
        retry:
          automatic:
            - limit: 3

-      - label: ":lab_coat: Integration Tests / part 2"
-        key: "integration-tests-part-2"
+      - label: ":lab_coat: Integration Tests / part 2-of-3"
+        key: "integration-tests-part-2-of-3"
        command: |
          set -euo pipefail

          source .buildkite/scripts/common/vm-agent.sh
-          ci/integration_tests.sh split 1
+          ci/integration_tests.sh split 1 3
        retry:
          automatic:
            - limit: 3

-      - label: ":lab_coat: IT Persistent Queues / part 1"
-        key: "integration-tests-qa-part-1"
+      - label: ":lab_coat: Integration Tests / part 3-of-3"
+        key: "integration-tests-part-3-of-3"
+        command: |
+          set -euo pipefail
+
+          source .buildkite/scripts/common/vm-agent.sh
+          ci/integration_tests.sh split 2 3
+        retry:
+          automatic:
+            - limit: 3
+
+      - label: ":lab_coat: IT Persistent Queues / part 1-of-3"
+        key: "integration-tests-qa-part-1-of-3"
        command: |
          set -euo pipefail

          source .buildkite/scripts/common/vm-agent.sh
          export FEATURE_FLAG=persistent_queues
-          ci/integration_tests.sh split 0
+          ci/integration_tests.sh split 0 3
        retry:
          automatic:
            - limit: 3

-      - label: ":lab_coat: IT Persistent Queues / part 2"
-        key: "integration-tests-qa-part-2"
+      - label: ":lab_coat: IT Persistent Queues / part 2-of-3"
+        key: "integration-tests-qa-part-2-of-3"
        command: |
          set -euo pipefail

          source .buildkite/scripts/common/vm-agent.sh
          export FEATURE_FLAG=persistent_queues
-          ci/integration_tests.sh split 1
+          ci/integration_tests.sh split 1 3
+        retry:
+          automatic:
+            - limit: 3
+
+      - label: ":lab_coat: IT Persistent Queues / part 3-of-3"
+        key: "integration-tests-qa-part-3-of-3"
+        command: |
+          set -euo pipefail
+
+          source .buildkite/scripts/common/vm-agent.sh
+          export FEATURE_FLAG=persistent_queues
+          ci/integration_tests.sh split 2 3
        retry:
          automatic:
            - limit: 3
--- a/.buildkite/pull_request_pipeline.yml
+++ b/.buildkite/pull_request_pipeline.yml
@ -81,8 +81,8 @@ steps:
      manual:
        allowed: true

-  - label: ":lab_coat: Integration Tests / part 1"
-    key: "integration-tests-part-1"
+  - label: ":lab_coat: Integration Tests / part 1-of-3"
+    key: "integration-tests-part-1-of-3"
    agents:
      image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
      cpu: "8"
@ -97,10 +97,10 @@ steps:
      set -euo pipefail

      source .buildkite/scripts/common/container-agent.sh
-      ci/integration_tests.sh split 0
+      ci/integration_tests.sh split 0 3

-  - label: ":lab_coat: Integration Tests / part 2"
-    key: "integration-tests-part-2"
+  - label: ":lab_coat: Integration Tests / part 2-of-3"
+    key: "integration-tests-part-2-of-3"
    agents:
      image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
      cpu: "8"
@ -115,10 +115,28 @@ steps:
      set -euo pipefail

      source .buildkite/scripts/common/container-agent.sh
-      ci/integration_tests.sh split 1
+      ci/integration_tests.sh split 1 3

-  - label: ":lab_coat: IT Persistent Queues / part 1"
-    key: "integration-tests-qa-part-1"
+  - label: ":lab_coat: Integration Tests / part 3-of-3"
+    key: "integration-tests-part-3-of-3"
+    agents:
+      image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
+      cpu: "8"
+      memory: "16Gi"
+      ephemeralStorage: "100Gi"
+      # Run as a non-root user
+      imageUID: "1002"
+    retry:
+      automatic:
+        - limit: 3
+    command: |
+      set -euo pipefail
+
+      source .buildkite/scripts/common/container-agent.sh
+      ci/integration_tests.sh split 2 3
+
+  - label: ":lab_coat: IT Persistent Queues / part 1-of-3"
+    key: "integration-tests-qa-part-1-of-3"
    agents:
      image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
      cpu: "8"
@ -134,10 +152,10 @@ steps:

      source .buildkite/scripts/common/container-agent.sh
      export FEATURE_FLAG=persistent_queues
-      ci/integration_tests.sh split 0
+      ci/integration_tests.sh split 0 3

-  - label: ":lab_coat: IT Persistent Queues / part 2"
-    key: "integration-tests-qa-part-2"
+  - label: ":lab_coat: IT Persistent Queues / part 2-of-3"
+    key: "integration-tests-qa-part-2-of-3"
    agents:
      image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
      cpu: "8"
@ -153,7 +171,26 @@ steps:

      source .buildkite/scripts/common/container-agent.sh
      export FEATURE_FLAG=persistent_queues
-      ci/integration_tests.sh split 1
+      ci/integration_tests.sh split 1 3
+
+  - label: ":lab_coat: IT Persistent Queues / part 3-of-3"
+    key: "integration-tests-qa-part-3-of-3"
+    agents:
+      image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-logstash-ci-no-root"
+      cpu: "8"
+      memory: "16Gi"
+      ephemeralStorage: "100Gi"
+      # Run as non root (logstash) user. UID is hardcoded in image.
+      imageUID: "1002"
+    retry:
+      automatic:
+        - limit: 3
+    command: |
+      set -euo pipefail
+
+      source .buildkite/scripts/common/container-agent.sh
+      export FEATURE_FLAG=persistent_queues
+      ci/integration_tests.sh split 2 3

  - label: ":lab_coat: x-pack unit tests"
    key: "x-pack-unit-tests"
--- a/.buildkite/scripts/jdk-matrix-tests/generate-steps.py
+++ b/.buildkite/scripts/jdk-matrix-tests/generate-steps.py
@ -177,17 +177,15 @@ class LinuxJobs(Jobs):
      super().__init__(os=os, jdk=jdk, group_key=group_key, agent=agent)

    def all_jobs(self) -> list[typing.Callable[[], JobRetValues]]:
-        return [
-            self.init_annotation,
-            self.java_unit_test,
-            self.ruby_unit_test,
-            self.integration_tests_part_1,
-            self.integration_tests_part_2,
-            self.pq_integration_tests_part_1,
-            self.pq_integration_tests_part_2,
-            self.x_pack_unit_tests,
-            self.x_pack_integration,
-        ]
+        # Build the job list incrementally so the two partitioned test groups
+        # (each requested as 3 parts) can be spliced in with extend() while
+        # keeping the original ordering of the surrounding jobs.
+        jobs=list()
+        jobs.append(self.init_annotation)
+        jobs.append(self.java_unit_test)
+        jobs.append(self.ruby_unit_test)
+        jobs.extend(self.integration_test_parts(3))
+        jobs.extend(self.pq_integration_test_parts(3))
+        jobs.append(self.x_pack_unit_tests)
+        jobs.append(self.x_pack_integration)
+        return jobs

    def prepare_shell(self) -> str:
        jdk_dir = f"/opt/buildkite-agent/.java/{self.jdk}"
@ -259,17 +257,14 @@ ci/unit_tests.sh ruby
            retry=copy.deepcopy(ENABLED_RETRIES),
        )

-    def integration_tests_part_1(self) -> JobRetValues:
-        return self.integration_tests(part=1)
+    def integration_test_parts(self, parts) -> list[typing.Callable[[], JobRetValues]]:
+        """Return one zero-argument job callable per integration-test partition.
+
+        `parts` is the total number of partitions; the callable for the 1-based
+        partition `part` invokes `self.integration_tests(part, parts)`.
+        """
+        # Bind `part` as a default argument so each lambda keeps its own value
+        # instead of all of them seeing the last `idx` of the loop.
+        return [lambda part=idx + 1: self.integration_tests(part, parts) for idx in range(parts)]

-    def integration_tests_part_2(self) -> JobRetValues:
-        return self.integration_tests(part=2)
-
-    def integration_tests(self, part: int) -> JobRetValues:
-        step_name_human = f"Integration Tests - {part}"
-        step_key = f"{self.group_key}-integration-tests-{part}"
+    def integration_tests(self, part: int, parts: int) -> JobRetValues:
+        step_name_human = f"Integration Tests - {part}/{parts}"
+        step_key = f"{self.group_key}-integration-tests-{part}-of-{parts}"
        test_command = f"""
-ci/integration_tests.sh split {part-1}
+ci/integration_tests.sh split {part-1} {parts}
        """

        return JobRetValues(
@ -281,18 +276,15 @@ ci/integration_tests.sh split {part-1}
            retry=copy.deepcopy(ENABLED_RETRIES),
        )

-    def pq_integration_tests_part_1(self) -> JobRetValues:
-        return self.pq_integration_tests(part=1)
+    def pq_integration_test_parts(self, parts) -> list[typing.Callable[[], JobRetValues]]:
+        """Return one zero-argument job callable per persistent-queue IT partition.
+
+        `parts` is the total number of partitions; the callable for the 1-based
+        partition `part` invokes `self.pq_integration_tests(part, parts)`.
+        """
+        # Bind `part` as a default argument so each lambda keeps its own value
+        # instead of all of them seeing the last `idx` of the loop.
+        return [lambda part=idx + 1: self.pq_integration_tests(part, parts) for idx in range(parts)]

-    def pq_integration_tests_part_2(self) -> JobRetValues:
-        return self.pq_integration_tests(part=2)
-
-    def pq_integration_tests(self, part: int) -> JobRetValues:
-        step_name_human = f"IT Persistent Queues - {part}"
-        step_key = f"{self.group_key}-it-persistent-queues-{part}"
+    def pq_integration_tests(self, part: int, parts: int) -> JobRetValues:
+        step_name_human = f"IT Persistent Queues - {part}/{parts}"
+        step_key = f"{self.group_key}-it-persistent-queues-{part}-of-{parts}"
        test_command = f"""
 export FEATURE_FLAG=persistent_queues
-ci/integration_tests.sh split {part-1}
+ci/integration_tests.sh split {part-1} {parts}
        """

        return JobRetValues(
--- a/ci/get-test-half.sh
+++ b/ci/get-test-half.sh
@ -1,27 +0,0 @@
-#!/bin/bash
-
-# get_test_half returns either the first or second half of integration tests
-# Usage: get_test_half <half_number>
-# half_number: 0 for first half, 1 for second half
-get_test_half() {
-    local half_number=$1
-    # Ensure only spec files go to stdout
-    pushd qa/integration >/dev/null 2>&1
-    
-    # Collect all spec files
-    local glob1=(specs/*spec.rb)
-    local glob2=(specs/**/*spec.rb)
-    local all_specs=("${glob1[@]}" "${glob2[@]}")
-    
-    # Calculate the split point
-    local split_point=$((${#all_specs[@]} / 2))
-    
-    # Get the requested half (:: is "up to", : is "from")
-    if [[ $half_number -eq 0 ]]; then
-        local specs="${all_specs[@]::$split_point}"
-    else
-        local specs="${all_specs[@]:$split_point}"
-    fi
-    popd >/dev/null 2>&1
-    echo "$specs"
-}
--- a/ci/integration_tests.sh
+++ b/ci/integration_tests.sh
@ -10,9 +10,6 @@ export GRADLE_OPTS="-Xmx2g -Dorg.gradle.jvmargs=-Xmx2g -Dorg.gradle.daemon=false
 export SPEC_OPTS="--order rand --format documentation"
 export CI=true

-# Source shared function for splitting integration tests
-source "$(dirname "${BASH_SOURCE[0]}")/get-test-half.sh"
-
 if [ -n "$BUILD_JAVA_HOME" ]; then
  GRADLE_OPTS="$GRADLE_OPTS -Dorg.gradle.java.home=$BUILD_JAVA_HOME"
 fi
@ -22,14 +19,15 @@ if [[ $1 = "setup" ]]; then
 exit 0

 elif [[ $1 == "split" ]]; then
-    if [[ $2 =~ ^[01]$ ]]; then
-        specs=$(get_test_half "$2")
-        echo "Running half $2 of integration specs: $specs"
-        ./gradlew runIntegrationTests -PrubyIntegrationSpecs="$specs" --console=plain
-    else
-       echo "Error, must specify 0 or 1 after the split. For example ci/integration_tests.sh split 0"
-       exit 1
-    fi
+  # Source shared function for splitting integration tests
+  source "$(dirname "${BASH_SOURCE[0]}")/partition-files.lib.sh"
+
+  # $2 is the zero-based partition index (required); $3 is the number of
+  # partitions and defaults to 2 when omitted.
+  index="${2:?index}"
+  count="${3:-2}"
+
+  # Capture the selection with a plain assignment first: unlike
+  # `specs=($(...))`, this surfaces a failing `cd` or partition_files, so an
+  # invalid index/count aborts here instead of invoking gradle with an empty
+  # spec list.
+  selected_specs="$(cd qa/integration && find specs -name '*_spec.rb' | partition_files "${index}" "${count}")" || exit 1
+  # Split on newlines only; `read -d ''` returns non-zero at end of input,
+  # hence the `|| true`.
+  IFS=$'\n' read -r -d '' -a specs <<<"${selected_specs}" || true
+
+  echo "Running integration tests partition[${index}] of ${count}: ${specs[*]}"
+  ./gradlew runIntegrationTests -PrubyIntegrationSpecs="${specs[*]}" --console=plain

 elif [[ !  -z  $@  ]]; then
    echo "Running integration tests 'rspec $@'"
--- a/ci/partition-files.lib.sh
+++ b/ci/partition-files.lib.sh
@ -0,0 +1,78 @@
+#!/bin/bash
+
+# partition_files deterministically splits the filenames given on stdin (one
+# per line) into partition_count round-robin partitions and prints the members
+# of the selected partition to stdout, one per line.
+# Usage: partition_files <partition_index> <partition_count> < <(ls files)
+# partition_index: the zero-based index of the partition to select `[0,partition_count)`
+# partition_count: the number of partitions `[2,#files]`
+# Returns non-zero, with a message on stderr, when fewer than 2 distinct files
+# are given or when either argument is missing or out of range.
+# The body runs in a subshell, so `set -e` and the `_error` helper do not leak
+# into the caller's shell.
+partition_files() (
+    set -e
+
+    local files
+    # Sort and de-duplicate so every invocation sees the same ordering however
+    # the input was ordered; LC_ALL=C pins the collation so the partitioning is
+    # identical on hosts with different locales.
+    # `read -d ''` returns non-zero at end of input, hence the `|| true`.
+    IFS=$'\n' read -ra files -d '' <<<"$(LC_ALL=C sort -u)" || true
+
+    local partition_index="${1:?}"
+    local partition_count="${2:?}"
+
+    _error () { >&2 echo "ERROR: ${1:-UNSPECIFIED}"; exit 1; }
+
+    # safeguard against nonsense invocations
+    if (( ${#files[@]} < 2 )); then
+      _error "#files(${#files[@]}) must be at least 2 in order to partition"
+    elif ! [[ "${partition_count}" =~ ^[0-9]+$ ]] || (( partition_count < 2 )) || (( partition_count > ${#files[@]} )); then
+      _error "partition_count(${partition_count}) must be a number that is at least 2 and not greater than #files(${#files[@]})"
+    elif ! [[ "${partition_index}" =~ ^[0-9]+$ ]] || (( partition_index >= partition_count )); then
+      _error "partition_index(${partition_index}) must be a number that is at least 0 and less than partition_count(${partition_count})"
+    fi
+
+    # round-robin: emit the files whose position maps to the selected partition
+    local position
+    for position in "${!files[@]}"; do
+      if (( position % partition_count == partition_index )); then
+        # printf rather than echo so a name such as `-n` is printed verbatim
+        printf '%s\n' "${files[position]}"
+      fi
+    done
+)
+
+# When this file is executed directly (rather than sourced): `test` runs the
+# self-test below and exits with its status; any other arguments are forwarded
+# to partition_files.
+if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
+  if [[ "$1" == "test" ]]; then
+    status=0
+
+    # The files that live next to this script serve as the sample input.
+    SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+    file_list="$( cd "${SCRIPT_DIR}" && find . -type f )"
+
+    # for any legal partitioning into N partitions, we ensure that
+    # the combined output of `partition_files I N` where `I` is all numbers in
+    # the range `[0,N)` produces no repeats and no omissions, even if the
+    # input list is not consistently ordered.
+    for n in $(seq 2 $(wc -l <<<"${file_list}")); do
+      result=""
+      for i in $(seq 0 $(( n - 1 ))); do
+        for file in $(partition_files "$i" "$n" <<<"$( shuf <<<"${file_list}" )"); do
+          result+="${file}"$'\n'
+        done
+      done
+
+      # Check for non-empty output instead of counting newlines: command
+      # substitution strips the trailing newline, so a single offending file
+      # would be counted as zero lines and slip through unnoticed.
+      repeated="$( sort <<<"${result%$'\n'}" | uniq -d )"
+      if [[ -n "${repeated}" ]]; then
+        status=1
+        echo "[n=${n}]FAIL(repeated):"$'\n'"${repeated}"
+      fi
+
+      missing=$( comm -23 <(sort <<<"${file_list}") <( sort <<<"${result%$'\n'}" ) )
+      if [[ -n "${missing}" ]]; then
+        status=1
+        echo "[n=${n}]FAIL(omitted):"$'\n'"${missing}"
+      fi
+    done
+
+    if (( status > 0 )); then
+      echo "There were failures. The input list was:"
+      echo "${file_list}"
+    fi
+
+    exit "${status}"
+  else
+    # Quote "$@" so each argument reaches partition_files as a single word.
+    partition_files "$@"
+  fi
+fi