Merge pull request #259 from balena-io/ab77/operational

Fail early on EC2 instance termination (e.g. when spot capacity is reclaimed)
This commit is contained in:
flowzone-app[bot] 2024-06-24 22:24:10 +00:00 committed by GitHub
commit 9a4bb5317e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -469,6 +469,9 @@ jobs:
echo "::warning::Still working..."
sleep "$(( (RANDOM % 5) + 5 ))s"
aws ec2 wait instance-running --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
aws ec2 wait instance-status-ok --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
done
echo "key_id=${GITHUB_SHA}" >> "${GITHUB_OUTPUT}"
@ -505,6 +508,9 @@ jobs:
echo "::warning::Still working..."
sleep "$(( (RANDOM % 30) + 30 ))s"
aws ec2 wait instance-running --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
aws ec2 wait instance-status-ok --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
done
# wait for Docker healthchecks
@ -516,6 +522,9 @@ jobs:
echo "::warning::Still working..."
sleep "$(( (RANDOM % 30) + 30 ))s"
aws ec2 wait instance-running --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
aws ec2 wait instance-status-ok --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
done
# (TBC) https://www.balena.io/docs/reference/supervisor/docker-compose/
@ -547,11 +556,15 @@ jobs:
while with_backoff ssh-uuid -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
${{ steps.register-test-device.outputs.balena_device_uuid }}.balena \
'balena ps -q | xargs balena inspect \
| jq -r ".[] | select(.State.Health.Status!=null).Name + \":\" + .State.Health.Status"' \
| jq -r ".[]
| select(.State.Health.Status!=null).Name + \":\" + .State.Health.Status"' \
| grep -E ':starting|:unhealthy'; do
echo "::warning::Still working..."
sleep "$(( (RANDOM % 30) + 30 ))s"
aws ec2 wait instance-running --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
aws ec2 wait instance-status-ok --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
done
- name: SUT&DUT (balena)
@ -589,6 +602,9 @@ jobs:
echo "::warning::Still working..."
sleep "$(( ( RANDOM % ${{ env.RETRY }} ) + ${{ env.RETRY }} ))s"
aws ec2 wait instance-running --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
aws ec2 wait instance-status-ok --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
done
# .. once the service reports status == exited, it is assumed to have finished
@ -603,6 +619,9 @@ jobs:
--compressed | jq -r '.[].services.sut.status')"
sleep "$(( ( RANDOM % ${{ env.RETRY }} ) + ${{ env.RETRY }} ))s"
aws ec2 wait instance-running --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
aws ec2 wait instance-status-ok --instance-ids ${{ steps.balena-sut.outputs.instance_id }}
done
# .. check its exit code
@ -879,6 +898,9 @@ jobs:
echo '::info::waiting for composition...'
with_backoff docker compose ls
sleep $(((RANDOM%5) + 5))s
aws ec2 wait instance-running --instance-ids ${{ steps.ubuntu-sut.outputs.instance_id }}
aws ec2 wait instance-status-ok --instance-ids ${{ steps.ubuntu-sut.outputs.instance_id }}
done
echo '::info::composition started'
@ -888,6 +910,9 @@ jobs:
echo "::info::waiting for ${service}..."
with_backoff docker compose ps
sleep $(((RANDOM%5) + 5))s
aws ec2 wait instance-running --instance-ids ${{ steps.ubuntu-sut.outputs.instance_id }}
aws ec2 wait instance-status-ok --instance-ids ${{ steps.ubuntu-sut.outputs.instance_id }}
done
echo "::info::${service} started"
done
@ -901,6 +926,9 @@ jobs:
with_backoff docker compose logs --follow --timestamps sut
echo '::info::still running...'
sleep $(((RANDOM%1) + 1))s
aws ec2 wait instance-running --instance-ids ${{ steps.ubuntu-sut.outputs.instance_id }}
aws ec2 wait instance-status-ok --instance-ids ${{ steps.ubuntu-sut.outputs.instance_id }}
done
aws ssm wait command-executed --command-id "${cid}" --instance-id "${iid}"