From ad27c1f7570badf022a5b2844ed0cb6300ebd35a Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 2 Jun 2026 19:23:05 +0200 Subject: [PATCH] ci(deploy): wire nightly.yml to composite deploy actions Replaces the four inline obs steps with one uses: ./.gitea/actions/deploy-obs, and the Caddy reload + smoke test with one uses: each (host staging.raddatz.cloud, postgres_host archiv-staging-db-1, STAGING_* secrets). checkout@v4 stays the first step; the #526 /import mount guard stays inline. Co-Authored-By: Claude Opus 4.8 --- .gitea/workflows/nightly.yml | 163 +++++------------------------------ 1 file changed, 21 insertions(+), 142 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index a78637b3..11d47992 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -23,6 +23,11 @@ name: nightly # - host ports: backend 8081, frontend 3001 # - profile: staging (starts mailpit instead of a real SMTP relay) # +# The obs-stack deploy, Caddy reload, and smoke test are shared with +# release.yml via the composite actions under .gitea/actions/ (ADR-029). +# actions/checkout MUST stay the first step: a local `uses: ./…` action +# only exists on disk after checkout. +# # Required Gitea secrets: # STAGING_POSTGRES_PASSWORD # STAGING_MINIO_PASSWORD @@ -55,6 +60,8 @@ jobs: # for the same repo is within that boundary. runs-on: ubuntu-latest steps: + # MUST be first: the composite actions below live under .gitea/actions/ + # and only exist on disk once the repo is checked out (ADR-029). - uses: actions/checkout@v4 - name: Write staging env file @@ -92,6 +99,7 @@ jobs: # `compose config` renders both shorthand and longform mounts as # `target: /import` + `read_only: true`, so we assert against # the rendered form rather than the raw source YAML. + # App-compose check (not obs), nightly-only — stays inline. run: | set -e docker compose \ @@ -128,150 +136,21 @@ jobs: --profile staging \ up -d --wait --remove-orphans - - name: Deploy observability configs - # Copies the compose file and config tree from the workspace checkout - # into /opt/familienarchiv/ — the permanent location that persists - # between CI runs. Containers started in the next step bind-mount - # from there, so a future workspace wipe cannot corrupt a running - # config file. - # - # obs-secrets.env is written fresh from Gitea secrets on every run so - # Gitea is always the single source of truth for secret rotation. - # Non-secret config lives in infra/observability/obs.env (tracked in git). - run: | - rm -rf /opt/familienarchiv/infra/observability - mkdir -p /opt/familienarchiv/infra/observability - cp -r infra/observability/. /opt/familienarchiv/infra/observability/ - cp docker-compose.observability.yml /opt/familienarchiv/ - cat > /opt/familienarchiv/obs-secrets.env <<'EOF' - GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }} - GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }} - GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }} - POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }} - POSTGRES_HOST=archiv-staging-db-1 - EOF - # Note: POSTGRES_HOST is derived from the Compose project name (archiv-staging) - # and service name (db). A project rename requires updating this value. - chmod 600 /opt/familienarchiv/obs-secrets.env + # POSTGRES_HOST is derived from the Compose project name (archiv-staging) + # and service name (db). A project rename requires updating this value. + - uses: ./.gitea/actions/deploy-obs + with: + grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }} + grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }} + glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }} + postgres_password: ${{ secrets.STAGING_POSTGRES_PASSWORD }} + postgres_host: archiv-staging-db-1 - - name: Validate observability compose config - # Dry-run: resolves all variable substitutions and reports any missing - # required keys before containers start. Catches undefined variables and - # YAML errors in config files updated by the previous step. - # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env - # second (CI-written secrets). Later files win on duplicate keys, so - # obs-secrets.env overrides POSTGRES_HOST set in obs.env. - run: | - docker compose \ - -f /opt/familienarchiv/docker-compose.observability.yml \ - --env-file /opt/familienarchiv/infra/observability/obs.env \ - --env-file /opt/familienarchiv/obs-secrets.env \ - config --quiet + - uses: ./.gitea/actions/reload-caddy - - name: Start observability stack - # Runs with absolute paths so bind mounts resolve to stable host paths - # that survive workspace wipes between nightly runs (see ADR-016). - # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env - # (written fresh from Gitea secrets above). --env-file order: obs.env first, - # obs-secrets.env second — later file wins on duplicate keys. - run: | - docker compose \ - -f /opt/familienarchiv/docker-compose.observability.yml \ - --env-file /opt/familienarchiv/infra/observability/obs.env \ - --env-file /opt/familienarchiv/obs-secrets.env \ - up -d --wait --remove-orphans - - - name: Assert observability stack health - # docker compose up --wait covers services WITH healthcheck directives only. - # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have - # no healthcheck — they are considered "started" as soon as the process runs. - # This step explicitly asserts the five healthchecked critical services are - # healthy before the smoke test proceeds. - run: | - set -e - unhealthy="" - for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do - status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing") - if [ "$status" != "healthy" ]; then - echo "::error::$svc is not healthy (status: $status)" - unhealthy="$unhealthy $svc" - fi - done - [ -z "$unhealthy" ] || exit 1 - echo "All critical observability services are healthy" - - - name: Reload Caddy - # Apply any committed Caddyfile changes before smoke-testing the - # public surface. Without this step, a Caddyfile edit lands in the - # repo but Caddy keeps serving the previous config until someone - # reloads it manually — the smoke test would then catch a stale - # header or a still-proxied /actuator route rather than confirming - # the current config is live. - # - # The runner executes job steps inside Docker containers (DooD). - # `systemctl` is not present in container images and cannot reach - # the host's systemd directly. We use the Docker socket (mounted - # into every job container via runner-config.yaml) to spin up a - # privileged sibling container in the host PID namespace; nsenter - # then enters the host's namespaces so systemctl talks to the real - # host systemd daemon. No sudoers entry is required — the Docker - # socket already grants root-equivalent host access. - # - # Alpine is used: ~5 MB vs ~70 MB for ubuntu, no unnecessary - # tooling, and the digest is pinned so any upstream change requires - # an explicit bump PR. util-linux (which ships nsenter) is installed - # at run time; apk add takes ~1 s on the warm VPS cache. - # - # `reload` not `restart`: reload sends SIGHUP so Caddy re-reads its - # config in-process without dropping TLS connections. `restart` - # would briefly stop the service, losing in-flight requests. - # - # If Caddy is not running this step fails fast before the smoke test - # issues a misleading "port 443 refused" error. - run: | - docker run --rm --privileged --pid=host \ - alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \ - sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy' - - - name: Smoke test deployed environment - # Healthchecks confirm containers are healthy; they do NOT confirm the - # public surface works. This step catches: Caddy not reloaded, HSTS - # header dropped, /actuator block bypassed. - # - # --resolve pins staging.raddatz.cloud to the Docker bridge gateway IP - # (the host) so we do NOT depend on hairpin NAT on the host router. - # 127.0.0.1 cannot be used: job containers run in bridge network mode - # (runner-config.yaml), so 127.0.0.1 is the container's loopback, not - # the host's. The bridge gateway IS the host; Caddy binds 0.0.0.0:443 - # and is therefore reachable from the container via that IP. - # SNI still uses the public hostname so the TLS cert validates correctly. - # - # Gateway detection reads /proc/net/route (always present, no package - # required) instead of `ip route` to avoid a dependency on iproute2. - # Field $2=="00000000" is the default route; field $3 is the gateway as - # a little-endian 32-bit hex value which awk decodes to dotted-decimal. - run: | - set -e - HOST="staging.raddatz.cloud" - URL="https://$HOST" - HOST_IP=$(awk 'NR>1 && $2=="00000000"{h=$3;printf "%d.%d.%d.%d\n",strtonum("0x"substr(h,7,2)),strtonum("0x"substr(h,5,2)),strtonum("0x"substr(h,3,2)),strtonum("0x"substr(h,1,2));exit}' /proc/net/route) - [ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via /proc/net/route"; exit 1; } - RESOLVE=(--resolve "$HOST:443:$HOST_IP") - echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)" - curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null - # Pin the preload-list-eligible HSTS value, not just header presence: - # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must - # fail this check rather than pass it silently. - curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \ - | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' - # Permissions-Policy denies APIs the app does not use (camera, - # microphone, geolocation). A regression that loosens or drops the - # header now fails the smoke step. - curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \ - | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)' - status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") - [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } - echo "All smoke checks passed" + - uses: ./.gitea/actions/smoke-test + with: + host: staging.raddatz.cloud - name: Cleanup env file # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011