[CI] Add docker retries (#191824)

## Summary
We've seen several issues stemming from docker failures, like:
```
Error response from daemon: Get "https://docker.elastic.co/v2/": ...
```

This happens because of rolling docker updates, which should affect
each node only for a short time, while connections are draining. The
suggestion was to implement retry logic for docker operations. This PR
adds retries to most of them (docker login, pull and push).
This commit is contained in:
Alex Szabo 2024-09-02 13:56:07 +02:00 committed by GitHub
parent 00174635b0
commit 8b3b314bed
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 68 additions and 24 deletions

View file

@ -8,6 +8,8 @@ if [[ "$(type -t vault_get)" != "function" ]]; then
source .buildkite/scripts/common/vault_fns.sh
fi
source .buildkite/scripts/common/util.sh
# Set up general-purpose tokens and credentials
{
BUILDKITE_TOKEN="$(vault_get buildkite-ci buildkite_token_all_jobs)"
@ -18,9 +20,12 @@ fi
KIBANA_DOCKER_USERNAME="$(vault_get container-registry username)"
KIBANA_DOCKER_PASSWORD="$(vault_get container-registry password)"
if (command -v docker && docker version) &> /dev/null; then
echo "$KIBANA_DOCKER_PASSWORD" | docker login -u "$KIBANA_DOCKER_USERNAME" --password-stdin docker.elastic.co
fi
# Log in to docker.elastic.co with the Kibana container-registry credentials.
#
# Globals:
#   KIBANA_DOCKER_USERNAME (read) - registry user name
#   KIBANA_DOCKER_PASSWORD (read) - registry password, sent on stdin
# Returns:
#   0 without doing anything when `docker` is not on PATH or `docker version`
#   fails; otherwise the exit status of `docker login`.
function docker_login() {
  # Nothing to log in to when there is no usable docker; treat it as success.
  if ! (command -v docker && docker version) &> /dev/null; then
    return 0
  fi

  # printf instead of echo: echo would interpret a password such as "-n" or
  # "-e" as one of its own options and send the wrong bytes to docker.
  printf '%s\n' "$KIBANA_DOCKER_PASSWORD" \
    | docker login -u "$KIBANA_DOCKER_USERNAME" --password-stdin docker.elastic.co
}
retry 5 15 docker_login
}
# Set up a custom ES Snapshot Manifest if one has been specified for this build

View file

@ -178,3 +178,32 @@ print_if_dry_run() {
echo "DRY_RUN is enabled."
fi
}
# Run a docker subcommand, retrying it when it fails.
#
# Usage:
#   docker_with_retry <subcommand> [args...]
#   e.g. docker_with_retry pull --platform linux/arm64 "$IMAGE"
#
# Runs `docker <subcommand> [args...]` up to 5 times, sleeping 15 seconds
# between attempts.
#
# Outputs:
#   A success message on stdout; retry/abort diagnostics on stderr.
# Returns:
#   0 as soon as one attempt succeeds.
#   Calls `exit 1` (terminating the calling shell, since this file is sourced)
#   once every attempt has failed or when no subcommand is given.
docker_with_retry () {
  if [ "$#" -eq 0 ]; then
    echo "Usage: docker_with_retry <subcommand> [args...]" >&2
    exit 1
  fi

  # Everything is local so a sourcing script's own variables are not clobbered.
  local cmd=$1
  shift
  local attempt=0
  local -r max_retries=5
  local -r sleep_time=15

  while true; do
    attempt=$((attempt + 1))

    # "$@" (rather than a copied array) stays safe under `set -u` on older
    # bash versions even when no extra arguments were passed.
    if docker "$cmd" "$@"; then
      echo "Docker $cmd successful."
      return 0
    fi

    # Give up right after the last failed attempt instead of sleeping first.
    if [ "$attempt" -ge "$max_retries" ]; then
      echo "Docker $cmd retries exceeded, aborting." >&2
      exit 1
    fi

    echo "Docker $cmd unsuccessful, attempt '$attempt'... Retrying in $sleep_time" >&2
    sleep "$sleep_time"
  done
}

View file

@ -8,6 +8,7 @@ fi
.buildkite/scripts/bootstrap.sh
source .buildkite/scripts/steps/artifacts/env.sh
source .buildkite/scripts/common/util.sh
GIT_ABBREV_COMMIT=${BUILDKITE_COMMIT:0:12}
KIBANA_IMAGE_TAG="sec-sol-qg-$GIT_ABBREV_COMMIT"
@ -23,7 +24,7 @@ if docker manifest inspect $KIBANA_IMAGE &> /dev/null; then
exit 0
fi
docker pull $KIBANA_BASE_IMAGE:latest
docker_with_retry pull $KIBANA_BASE_IMAGE:latest
echo "--- Build images"
node scripts/build \
@ -50,8 +51,8 @@ docker load < "target/kibana-serverless-$BASE_VERSION-docker-image-aarch64.tar.g
docker tag "$KIBANA_IMAGE" "$KIBANA_IMAGE-arm64"
echo "--- Push images"
docker image push "$KIBANA_IMAGE-arm64"
docker image push "$KIBANA_IMAGE-amd64"
docker_with_retry push "$KIBANA_IMAGE-arm64"
docker_with_retry push "$KIBANA_IMAGE-amd64"
echo "--- Create and push manifests"
docker manifest create \

View file

@ -1,5 +1,7 @@
#!/bin/bash
source .buildkite/scripts/common/util.sh
if [ "$KIBANA_MKI_QUALITY_GATE" == "1" ]; then
echo "Triggered by quality gate!"
triggered_by="Serverless Quality Gate."
@ -17,11 +19,11 @@ else
KBN_IMAGE=${KIBANA_LATEST}
fi
docker pull ${KBN_IMAGE}
docker_with_retry pull ${KBN_IMAGE}
build_date=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.build-date"')
vcs_ref=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.vcs-ref"')
vcs_url=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.vcs-url"')
version=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.version"')
version=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.version"')
markdown_text="""
#### $triggered_by
@ -32,9 +34,9 @@ markdown_text="""
---
#### Kibana Container Metadata
- Build Date : $build_date
- Github Commit Hash : $vcs_ref
- Github Repo : $vcs_url
- Build Date : $build_date
- Github Commit Hash : $vcs_ref
- Github Repo : $vcs_url
- Version : $version
"""
echo "${markdown_text//[*\\_]/\\&}" | buildkite-agent annotate --style "info"
echo "${markdown_text//[*\\_]/\\&}" | buildkite-agent annotate --style "info"

View file

@ -28,7 +28,7 @@ docker tag "$KIBANA_BASE_IMAGE" "$KIBANA_TEST_IMAGE"
if docker manifest inspect $KIBANA_TEST_IMAGE &> /dev/null; then
echo "Cloud image already exists, skipping docker push"
else
docker image push "$KIBANA_TEST_IMAGE"
docker_with_retry push "$KIBANA_TEST_IMAGE"
fi
echo "--- Create deployment"

View file

@ -6,6 +6,8 @@ set -euo pipefail
source .buildkite/scripts/steps/artifacts/env.sh
source .buildkite/scripts/common/util.sh
GIT_ABBREV_COMMIT=${BUILDKITE_COMMIT:0:12}
if [[ "${BUILDKITE_PULL_REQUEST:-false}" == "false" ]]; then
KIBANA_IMAGE_TAG="git-$GIT_ABBREV_COMMIT"
@ -50,8 +52,8 @@ if [[ "$SKIP_BUILD" == "false" ]]; then
docker tag "$KIBANA_IMAGE" "$KIBANA_IMAGE-arm64"
echo "--- Push images"
docker image push "$KIBANA_IMAGE-arm64"
docker image push "$KIBANA_IMAGE-amd64"
docker_with_retry push "$KIBANA_IMAGE-arm64"
docker_with_retry push "$KIBANA_IMAGE-amd64"
echo "--- Create and push manifests"
docker manifest create \

View file

@ -54,7 +54,7 @@ chmod -R a+r target/*
chmod -R a+w target
echo "--- Pull latest Release Manager CLI"
docker pull docker.elastic.co/infra/release-manager:latest
docker_with_retry pull docker.elastic.co/infra/release-manager:latest
echo "--- Publish artifacts"
if [[ "$BUILDKITE_BRANCH" == "$KIBANA_BASE_BRANCH" ]] || [[ "${DRY_RUN:-}" =~ ^(1|true)$ ]]; then

View file

@ -3,6 +3,7 @@
set -euo pipefail
source "$(dirname "${0}")/config.sh"
source "$(dirname "${0}")/../../common/util.sh"
"$(dirname "${0}")/auth.sh"
@ -16,7 +17,7 @@ DOCKER_EXPORT_URL=$(curl https://storage.googleapis.com/kibana-ci-es-snapshots-d
curl "$DOCKER_EXPORT_URL" > target/elasticsearch-docker.tar.gz
docker load < target/elasticsearch-docker.tar.gz
docker tag "docker.elastic.co/elasticsearch/elasticsearch:$DEPLOYMENT_VERSION-SNAPSHOT" "$ES_IMAGE"
docker push "$ES_IMAGE"
docker_with_retry push "$ES_IMAGE"
echo '--- Prepare yaml'

View file

@ -4,6 +4,8 @@ set -euo pipefail
.buildkite/scripts/bootstrap.sh
source .buildkite/scripts/common/util.sh
source "$(dirname "${0}")/config.sh"
export KIBANA_IMAGE="gcr.io/elastic-kibana-184716/demo/kibana:$DEPLOYMENT_NAME-$(git rev-parse HEAD)"
@ -15,7 +17,7 @@ echo '--- Build Docker image with example plugins'
cd target/example_plugins
BUILT_IMAGE="docker.elastic.co/kibana/kibana:$DEPLOYMENT_VERSION-SNAPSHOT"
docker build --build-arg BASE_IMAGE="$BUILT_IMAGE" -t "$KIBANA_IMAGE" -f "$KIBANA_DIR/.buildkite/scripts/steps/demo_env/Dockerfile" .
docker push "$KIBANA_IMAGE"
docker_with_retry push "$KIBANA_IMAGE"
cd -
"$(dirname "${0}")/auth.sh"

View file

@ -2,6 +2,8 @@
set -euo pipefail
source .buildkite/scripts/common/util.sh
KIBANA_GITHUB_URL="https://github.com/elastic/kibana"
ES_SERVERLESS_GITHUB_URL="https://github.com/elastic/elasticsearch-serverless"
@ -15,7 +17,7 @@ fi
# Pull the target image
if [[ $ES_SERVERLESS_IMAGE != *":git-"* ]]; then
docker pull "$ES_SERVERLESS_IMAGE"
docker_with_retry pull "$ES_SERVERLESS_IMAGE"
ES_SERVERLESS_VERSION=$(docker inspect --format='{{json .Config.Labels}}' "$ES_SERVERLESS_IMAGE" | jq -r '.["org.opencontainers.image.revision"]' | cut -c1-12)
IMAGE_WITHOUT_TAG=$(echo "$ES_SERVERLESS_IMAGE" | cut -d: -f1)

View file

@ -31,18 +31,18 @@ ARM_64_DIGEST=$(jq -r '.manifests[] | select(.platform.architecture == "arm64")
AMD_64_DIGEST=$(jq -r '.manifests[] | select(.platform.architecture == "amd64") | .digest' manifests.json)
echo docker pull --platform linux/arm64 "$SOURCE_IMAGE@$ARM_64_DIGEST"
docker pull --platform linux/arm64 "$SOURCE_IMAGE@$ARM_64_DIGEST"
docker_with_retry pull --platform linux/arm64 "$SOURCE_IMAGE@$ARM_64_DIGEST"
echo linux/arm64 image pulled, with digest: $ARM_64_DIGEST
echo docker pull --platform linux/amd64 "$SOURCE_IMAGE@$AMD_64_DIGEST"
docker pull --platform linux/amd64 "$SOURCE_IMAGE@$AMD_64_DIGEST"
docker_with_retry pull --platform linux/amd64 "$SOURCE_IMAGE@$AMD_64_DIGEST"
echo linux/amd64 image pulled, with digest: $AMD_64_DIGEST
docker tag "$SOURCE_IMAGE@$ARM_64_DIGEST" "$TARGET_IMAGE-arm64"
docker tag "$SOURCE_IMAGE@$AMD_64_DIGEST" "$TARGET_IMAGE-amd64"
docker push "$TARGET_IMAGE-arm64"
docker push "$TARGET_IMAGE-amd64"
docker_with_retry push "$TARGET_IMAGE-arm64"
docker_with_retry push "$TARGET_IMAGE-amd64"
docker manifest rm "$TARGET_IMAGE" || echo "Nothing to delete"

View file

@ -93,7 +93,7 @@ set +e
echo $ES_CLOUD_ID $ES_CLOUD_VERSION $KIBANA_ES_CLOUD_VERSION $KIBANA_ES_CLOUD_IMAGE
docker tag "$ES_CLOUD_ID" "$KIBANA_ES_CLOUD_IMAGE"
docker image push "$KIBANA_ES_CLOUD_IMAGE"
docker_with_retry push "$KIBANA_ES_CLOUD_IMAGE"
export ELASTICSEARCH_CLOUD_IMAGE="$KIBANA_ES_CLOUD_IMAGE"
export ELASTICSEARCH_CLOUD_IMAGE_CHECKSUM="$(docker images "$KIBANA_ES_CLOUD_IMAGE" --format "{{.Digest}}")"