From f628ab643577b37b3a7bb6f243b11407ea85814f Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 08:53:18 +0200 Subject: [PATCH] ci(obs): add validate + health assertion steps to release.yml nightly.yml had two observability gates that release.yml lacked: - "Validate observability compose config" (docker compose config --quiet) catches missing env vars and YAML errors before any containers start - "Assert observability stack health" explicitly checks that obs-loki, obs-prometheus, obs-grafana and obs-tempo report healthy after up --wait; the services without healthcheck directives (obs-promtail, obs-cadvisor, obs-node-exporter, obs-glitchtip-worker) count as started once their process runs and are not part of this assertion Mirrors the nightly.yml steps verbatim so the production deploy path is at least as well-verified as the nightly staging path. Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/release.yml | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 48d61147..5e4d12a2 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -115,7 +115,24 @@ jobs: POSTGRES_HOST=archiv-production-db-1 EOF + - name: Validate observability compose config + # Dry-run: resolves all variable substitutions and reports any missing + # required keys before containers start. Catches undefined variables and + # YAML errors in config files updated by the previous step. + # Keep in sync with the equivalent step in nightly.yml. + run: | + docker compose \ + -f /opt/familienarchiv/docker-compose.observability.yml \ + --env-file /opt/familienarchiv/infra/observability/obs.env \ + --env-file /opt/familienarchiv/obs-secrets.env \ + config --quiet + - name: Start observability stack + # Runs with absolute paths so bind mounts resolve to stable host paths + # that survive workspace wipes between runs (see ADR-016). + # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env + # (written fresh from Gitea secrets above). + # Keep in sync with the equivalent step in nightly.yml. 
run: | docker compose \ -f /opt/familienarchiv/docker-compose.observability.yml \ @@ -123,6 +140,26 @@ jobs: --env-file /opt/familienarchiv/obs-secrets.env \ up -d --wait --remove-orphans + - name: Assert observability stack health + # docker compose up --wait covers services WITH healthcheck directives only. + # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have + # no healthcheck — they are considered "started" as soon as the process runs. + # This step explicitly asserts the four healthchecked critical services are + # healthy before the smoke test proceeds. + # Keep in sync with the equivalent step in nightly.yml. + run: | + set -e + unhealthy="" + for svc in obs-loki obs-prometheus obs-grafana obs-tempo; do + status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing") + if [ "$status" != "healthy" ]; then + echo "::error::$svc is not healthy (status: $status)" + unhealthy="$unhealthy $svc" + fi + done + [ -z "$unhealthy" ] || exit 1 + echo "All critical observability services are healthy" + - name: Reload Caddy # See nightly.yml — same rationale and mechanism: DooD job containers # cannot call systemctl directly; nsenter via a privileged sibling