[CI] Cloud purge: improve logging, add CLOUD_DELETE_ON_ERROR (#208392)

## Summary
When cloud deployments are not created through the official pipeline, they
can end up with malformed names (e.g. `kibana-pr-false`, caused by incorrect
parameterization during testing:
https://buildkite.com/elastic/kibana-migration-pipeline-staging/builds/192).

Such names trip up the purge script. Although the removals still work, the
script exits with status code 1, which fails the pipeline.

This PR makes the script more resilient, improves logging of problematic
cases, and adds a flag that allows problematic deployments to be removed
(when someone manually runs the pipeline with `CLOUD_DELETE_ON_ERROR=1`).

Tested: 
- only fail and log:
https://buildkite.com/elastic/kibana-purge-cloud-deployments/builds/6710
- CLOUD_DELETE_ON_ERROR=1, to remove problematic case
https://buildkite.com/elastic/kibana-purge-cloud-deployments/builds/6712
(this run failed for an unknown reason on a `keep_...` instance; the failure has not recurred since)
- With `-e` removed from the `set` options, and the exit code comparison added:
https://buildkite.com/elastic/kibana-purge-cloud-deployments/builds/6713
This commit is contained in:
Alex Szabo 2025-01-28 10:48:28 +01:00 committed by GitHub
parent 32fcfffaaa
commit 163f7eaa18
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 28 additions and 9 deletions

View file

@ -1,9 +1,17 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -euo pipefail set -uo pipefail
# Note, -e is not set above, so we can capture all errors, and attempt purge individually
echo '--- Purging Cloud deployments' echo '--- Purging Cloud deployments'
ts-node .buildkite/scripts/steps/cloud/purge_deployments.ts ts-node .buildkite/scripts/steps/cloud/purge_deployments.ts
EXIT_CODE_CLOUD=$?
echo '--- Purging Project deployments' echo '--- Purging Project deployments'
ts-node .buildkite/scripts/steps/cloud/purge_projects.ts ts-node .buildkite/scripts/steps/cloud/purge_projects.ts
EXIT_CODE_PROJECTS=$?
if [ $EXIT_CODE_CLOUD -ne 0 ] || [ $EXIT_CODE_PROJECTS -ne 0 ]; then
echo "❌ Purge failed (EXIT_CODE_CLOUD=$EXIT_CODE_CLOUD | EXIT_CODE_PROJECTS=$EXIT_CODE_PROJECTS)"
exit 1
fi

View file

@ -11,20 +11,27 @@ import { execSync } from 'child_process';
import { getKibanaDir } from '#pipeline-utils'; import { getKibanaDir } from '#pipeline-utils';
const deploymentsListJson = execSync('ecctl deployment list --output json').toString(); const deploymentsListJson = execSync('ecctl deployment list --output json').toString();
const { deployments } = JSON.parse(deploymentsListJson); const { deployments } = JSON.parse(deploymentsListJson) as {
deployments: Array<{ name: string; id: string }>;
};
const prDeployments = deployments.filter((deployment: any) => const prDeployments = deployments.filter((deployment) => deployment.name.startsWith('kibana-pr-'));
deployment.name.startsWith('kibana-pr-')
);
const deploymentsToPurge = []; const deploymentsToPurge: typeof deployments = [];
const NOW = new Date().getTime() / 1000; const NOW = new Date().getTime() / 1000;
const DAY_IN_SECONDS = 60 * 60 * 24; const DAY_IN_SECONDS = 60 * 60 * 24;
const CLOUD_DELETE_ON_ERROR = process.env.CLOUD_DELETE_ON_ERROR?.match(/^(true|1)$/i);
for (const deployment of prDeployments) { for (const deployment of prDeployments) {
try { try {
const prNumber = deployment.name.match(/^kibana-pr-([0-9]+)$/)[1]; const prNumber: string | undefined = deployment.name.match(/^kibana-pr-([0-9]+)$/)?.[1];
if (!prNumber) {
throw new Error(
`Invalid deployment name: ${deployment.name}; expected kibana-pr-{PR_NUMBER}).`
);
}
const prJson = execSync(`gh pr view '${prNumber}' --json state,labels,updatedAt`).toString(); const prJson = execSync(`gh pr view '${prNumber}' --json state,labels,updatedAt`).toString();
const pullRequest = JSON.parse(prJson); const pullRequest = JSON.parse(prJson);
const prOpen = pullRequest.state === 'OPEN'; const prOpen = pullRequest.state === 'OPEN';
@ -59,11 +66,15 @@ for (const deployment of prDeployments) {
deploymentsToPurge.push(deployment); deploymentsToPurge.push(deployment);
} }
} catch (ex) { } catch (ex) {
console.error(`Error deleting deployment (${deployment.id}; ${deployment.name})`);
console.error(ex.toString()); console.error(ex.toString());
// deploymentsToPurge.push(deployment); // TODO should we delete on error? if (CLOUD_DELETE_ON_ERROR) {
deploymentsToPurge.push(deployment);
} else {
process.exitCode = 1; process.exitCode = 1;
} }
} }
}
for (const deployment of deploymentsToPurge) { for (const deployment of deploymentsToPurge) {
console.log(`Scheduling deployment for deletion: ${deployment.name} / ${deployment.id}`); console.log(`Scheduling deployment for deletion: ${deployment.name} / ${deployment.id}`);