familienarchiv/.gitea/workflows/nightly.yml at e61409773e337019056150638d7b08269a3016fa

marcel/familienarchiv

Fork 0

Files

Marcel c43f45a472

CI / OCR Service Tests (push) Has been cancelled

Details

CI / Backend Unit Tests (push) Has been cancelled

Details

CI / fail2ban Regex (push) Has been cancelled

Details

CI / Compose Bucket Idempotency (push) Has been cancelled

Details

CI / Unit & Component Tests (push) Has been cancelled

Details

Merge branch 'fix/issue-601-obs-stack-permanent'

2026-05-16 10:19:59 +02:00

281 lines

14 KiB

YAML

Raw Blame History

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

 name: nightly
 # Builds and deploys the staging environment from main every night.
 # Runs on the self-hosted runner using Docker-out-of-Docker (the docker
 # socket is mounted in), so `docker compose build` produces images on
 # the host daemon and `docker compose up` consumes them directly — no
 # registry hop.
 #
 # Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup):
 #
 #   1. Single-tenant self-hosted runner. The "Write staging env file" step
 #      writes every secret to .env.staging on the runner filesystem; the
 #      `if: always()` cleanup step removes it. A multi-tenant runner
 #      would need to avoid the on-disk file instead, e.g. by feeding the env
 #      file through process substitution: docker compose --env-file <(...).
 #
 #   2. Host docker layer cache is authoritative. There is no
 #      actions/cache; we rely on the host daemon to keep Maven and npm
 #      layers warm between runs. A `docker system prune` on the host
 #      will cause the next nightly build to be cold (5–10 min slower).
 #
 # Staging environment isolation:
 #   - project name: archiv-staging
 #   - host ports:   backend 8081, frontend 3001
 #   - profile:      staging (starts mailpit instead of a real SMTP relay)
 #
 # Required Gitea secrets:
 #   STAGING_POSTGRES_PASSWORD
 #   STAGING_MINIO_PASSWORD
 #   STAGING_MINIO_APP_PASSWORD
 #   STAGING_OCR_TRAINING_TOKEN
 #   STAGING_APP_ADMIN_USERNAME
 #   STAGING_APP_ADMIN_PASSWORD
 #   GRAFANA_ADMIN_PASSWORD
 #   GLITCHTIP_SECRET_KEY
 #   SENTRY_DSN                  (set after GlitchTip first-run; empty = Sentry disabled)
 on:
   # Scheduled run: minute 0, hour 2, every day (standard five-field cron).
   schedule:
     - cron: '0 2 * * *'
   # Also allow the deploy to be started manually.
   workflow_dispatch:
 env:
   # BuildKit must be enabled for the backend Dockerfile's
   # `RUN --mount=type=cache` instructions to take effect, which is what
   # keeps the Maven cache alive from one run to the next.
   DOCKER_BUILDKIT: '1'
 jobs:
   deploy-staging:
     # `ubuntu-latest` matches our self-hosted runner's advertised label
     # (the runner has labels: ubuntu-latest / ubuntu-24.04 / ubuntu-22.04).
     # `self-hosted` would never match — no runner advertises it — so the
     # job would park in the queue forever. ADR-011's "single-tenant"
     # promise is at the repo level; sharing this runner between CI and
     # deploys for the same repo is within that boundary.
     runs-on: ubuntu-latest
     steps:
       # Check out the repository so the compose files and the
       # infra/observability tree used by the later steps are present in
       # the workspace.
       - uses: actions/checkout@v4
       - name: Write staging env file
         # Materialises the staging configuration (secrets included) as
         # .env.staging in the workspace; every `docker compose` call against
         # docker-compose.prod.yml below reads it via --env-file.
         #
         # `${{ secrets.* }}` is substituted by the runner BEFORE the shell
         # sees this script, so the heredoc delimiter is quoted ('EOF') to
         # make the shell copy the values verbatim. With an unquoted
         # delimiter, a secret containing `$`, a backtick or a backslash
         # would be expanded (or run as a command substitution) and end up
         # corrupted in the file.
         #
         # `umask 077` + `chmod 600` keep the plain-text secrets readable by
         # the owner only; chmod covers the case where the file already
         # exists, because `>` truncation keeps the old mode.
         run: |
           umask 077
           cat > .env.staging <<'EOF'
           TAG=nightly
           PORT_BACKEND=8081
           PORT_FRONTEND=3001
           APP_DOMAIN=staging.raddatz.cloud
           POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
           MINIO_PASSWORD=${{ secrets.STAGING_MINIO_PASSWORD }}
           MINIO_APP_PASSWORD=${{ secrets.STAGING_MINIO_APP_PASSWORD }}
           OCR_TRAINING_TOKEN=${{ secrets.STAGING_OCR_TRAINING_TOKEN }}
           APP_ADMIN_USERNAME=${{ secrets.STAGING_APP_ADMIN_USERNAME }}
           APP_ADMIN_PASSWORD=${{ secrets.STAGING_APP_ADMIN_PASSWORD }}
           MAIL_HOST=mailpit
           MAIL_PORT=1025
           MAIL_USERNAME=
           MAIL_PASSWORD=
           MAIL_SMTP_AUTH=false
           MAIL_STARTTLS_ENABLE=false
           APP_MAIL_FROM=noreply@staging.raddatz.cloud
           IMPORT_HOST_DIR=/srv/familienarchiv-staging/import
           POSTGRES_USER=archiv
           SENTRY_DSN=${{ secrets.SENTRY_DSN }}
           EOF
           chmod 600 .env.staging
       - name: Verify backend /import:ro mount is wired
         # Regression guard for #526: the /admin/system mass-import card
         # only works when the backend service mounts the host import
         # payload at /import (read-only). If a future "compose cleanup"
         # PR drops the volumes block, mass import silently breaks again.
         # `compose config` renders both shorthand and longform mounts as
         # `target: /import` + `read_only: true`, so we assert against
         # the rendered form rather than the raw source YAML.
         #
         # `compose config` interpolates the values from .env.staging, so the
         # rendered output can contain secrets in plain text. It is therefore
         # written to a private mktemp file that the EXIT trap removes whether
         # the checks pass or fail, instead of a fixed path under /tmp that
         # the final cleanup step does not cover.
         run: |
           set -e
           rendered=$(mktemp)
           trap 'rm -f "$rendered"' EXIT
           docker compose \
             -f docker-compose.prod.yml \
             -p archiv-staging \
             --env-file .env.staging \
             --profile staging \
             config > "$rendered"
           # 1) the mount must exist at all ...
           grep -q '^[[:space:]]*target: /import$' "$rendered" \
             || { echo "::error::backend is missing the /import bind mount (see #526)"; exit 1; }
           # 2) ... and `read_only: true` must appear within the two lines
           #    that follow its `target:` line.
           grep -A2 '^[[:space:]]*target: /import$' "$rendered" \
             | grep -q 'read_only: true' \
             || { echo "::error::backend /import mount is not read-only (see #526)"; exit 1; }
       - name: Build images
         # Builds every image of the staging project on the host daemon.
         # `--pull` re-fetches the pinned base images, so a tag that was
         # re-published with a CVE fix (e.g. node:20.19.0-alpine3.21,
         # postgres:16-alpine) is picked up rather than served from the
         # host's stale Docker layer cache.
         run: |
           # Project selectors go into the positional parameters; the
           # subcommand follows them on the actual invocation.
           set -- \
             -f docker-compose.prod.yml \
             -p archiv-staging \
             --env-file .env.staging \
             --profile staging
           docker compose "$@" build --pull
       - name: Deploy staging
         # Brings the staging project up detached; `--wait` blocks until the
         # services report running/healthy and `--remove-orphans` drops
         # containers for services no longer defined in the compose file.
         run: |
           set -- \
             -f docker-compose.prod.yml \
             -p archiv-staging \
             --env-file .env.staging \
             --profile staging
           docker compose "$@" up -d --wait --remove-orphans
       - name: Deploy observability configs
         # Copies the compose file and config tree from the workspace checkout
         # into /opt/familienarchiv/ — the permanent location that persists
         # between CI runs. Containers started in the next step bind-mount
         # from there, so a future workspace wipe cannot corrupt a running
         # config file.
         #
         # obs-secrets.env is written fresh from Gitea secrets on every run so
         # Gitea is always the single source of truth for secret rotation.
         # Non-secret config lives in infra/observability/obs.env (tracked in git).
         #
         # The heredoc delimiter is quoted ('EOF') so the shell writes the
         # runner-substituted secret values verbatim.
         run: |
           rm -rf /opt/familienarchiv/infra/observability
           mkdir -p /opt/familienarchiv/infra/observability
           cp -r infra/observability/. /opt/familienarchiv/infra/observability/
           cp docker-compose.observability.yml /opt/familienarchiv/
           # Tighten the umask only now, AFTER the config tree was copied with
           # the default mode, so that a newly created obs-secrets.env is never
           # readable by other users, not even between `cat` and `chmod`.
           umask 077
           cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
           GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
           GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
           POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
           POSTGRES_HOST=archiv-staging-db-1
           EOF
           # Note: POSTGRES_HOST is derived from the Compose project name (archiv-staging)
           # and service name (db). A project rename requires updating this value.
           # chmod is still required: `>` keeps the mode of a file left over
           # from an earlier run, which the umask cannot change.
           chmod 600 /opt/familienarchiv/obs-secrets.env
       - name: Validate observability compose config
         # Dry run before anything is started: `config --quiet` resolves all
         # variable substitutions and fails on missing required keys or YAML
         # errors in the files the previous step just installed.
         # --env-file order matters: obs.env (git-tracked defaults) comes
         # first, obs-secrets.env (CI-written secrets) second. Later files win
         # on duplicate keys, so obs-secrets.env overrides the POSTGRES_HOST
         # set in obs.env.
         run: |
           set -- \
             -f /opt/familienarchiv/docker-compose.observability.yml \
             --env-file /opt/familienarchiv/infra/observability/obs.env \
             --env-file /opt/familienarchiv/obs-secrets.env
           docker compose "$@" config --quiet
       - name: Start observability stack
         # Uses absolute paths so bind mounts resolve to stable host paths
         # that survive workspace wipes between nightly runs (see ADR-016).
         # Non-secret config comes from obs.env (git-tracked); secrets come
         # from obs-secrets.env (written fresh from Gitea secrets above).
         # --env-file order: obs.env first, obs-secrets.env second — the later
         # file wins on duplicate keys.
         run: |
           set -- \
             -f /opt/familienarchiv/docker-compose.observability.yml \
             --env-file /opt/familienarchiv/infra/observability/obs.env \
             --env-file /opt/familienarchiv/obs-secrets.env
           docker compose "$@" up -d --wait --remove-orphans
       - name: Assert observability stack health
         # `docker compose up --wait` only covers services WITH a healthcheck
         # directive. obs-promtail, obs-cadvisor, obs-node-exporter and
         # obs-glitchtip-worker have none — they count as "started" as soon as
         # the process runs. This step therefore asserts explicitly that the
         # five healthchecked critical services are healthy before the smoke
         # test proceeds.
         run: |
           set -e
           failures=0
           for container in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
             # A failing inspect (container absent, or no health state to
             # format) is reported as "missing" instead of aborting the loop,
             # so every broken service gets its own error line.
             health=$(docker inspect "$container" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
             case "$health" in
               healthy)
                 ;;
               *)
                 echo "::error::$container is not healthy (status: $health)"
                 failures=$((failures + 1))
                 ;;
             esac
           done
           if [ "$failures" -ne 0 ]; then
             exit 1
           fi
           echo "All critical observability services are healthy"
       - name: Reload Caddy
         # Apply any committed Caddyfile changes before smoke-testing the
         # public surface. Without this step, a Caddyfile edit lands in the
         # repo but Caddy keeps serving the previous config until someone
         # reloads it manually — the smoke test would then catch a stale
         # header or a still-proxied /actuator route rather than confirming
         # the current config is live.
         #
         # The runner executes job steps inside Docker containers (DooD).
         # `systemctl` is not present in container images and cannot reach
         # the host's systemd directly. We use the Docker socket (mounted
         # into every job container via runner-config.yaml) to spin up a
         # privileged sibling container in the host PID namespace; nsenter
         # then targets PID 1 (`-t 1`) and enters its mount, UTS, network,
         # PID and IPC namespaces (`-m -u -n -p -i`) so systemctl talks to the
         # real host systemd daemon. No sudoers entry is required — the Docker
         # socket already grants root-equivalent host access.
         #
         # Alpine is used: ~5 MB vs ~70 MB for ubuntu, no unnecessary
         # tooling, and the digest is pinned so any upstream change requires
         # an explicit bump PR. util-linux (which ships nsenter) is installed
         # at run time; apk add takes ~1 s on the warm VPS cache.
         #
         # `reload` not `restart`: reload asks the running Caddy to apply the
         # new config in-process without dropping TLS connections, whereas
         # `restart` would briefly stop the service and lose in-flight
         # requests. NOTE(review): the exact mechanism is whatever the host's
         # caddy.service defines as ExecReload (presumably `caddy reload`) —
         # confirm against the unit file.
         #
         # If Caddy is not running this step fails fast before the smoke test
         # issues a misleading "port 443 refused" error.
         run: |
           docker run --rm --privileged --pid=host \
             alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
             sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
       - name: Smoke test deployed environment
         # Healthchecks confirm containers are healthy; they do NOT confirm the
         # public surface works. This step catches: Caddy not reloaded, HSTS
         # header dropped, /actuator block bypassed.
         #
         # --resolve pins staging.raddatz.cloud to the Docker bridge gateway IP
         # (the host) so we do NOT depend on hairpin NAT on the host router.
         # 127.0.0.1 cannot be used: job containers run in bridge network mode
         # (runner-config.yaml), so 127.0.0.1 is the container's loopback, not
         # the host's. The bridge gateway IS the host; Caddy binds 0.0.0.0:443
         # and is therefore reachable from the container via that IP.
         # SNI still uses the public hostname so the TLS cert validates correctly.
         #
         # Gateway detection reads /proc/net/route (always present, no package
         # required) instead of `ip route` to avoid a dependency on iproute2.
         # Field $2=="00000000" is the default route; field $3 is the gateway as
         # a little-endian 32-bit hex value, decoded to dotted-decimal by a
         # small awk function. `strtonum` is deliberately NOT used: it is a
         # gawk extension and is missing from mawk and busybox awk, which
         # would reintroduce the package dependency this step avoids.
         run: |
           set -e
           HOST="staging.raddatz.cloud"
           URL="https://$HOST"
           HOST_IP=$(awk '
             # Convert a hex string to its decimal value (portable awk).
             function hex2dec(s,    i, n) {
               s = toupper(s)
               n = 0
               for (i = 1; i <= length(s); i++)
                 n = n * 16 + index("0123456789ABCDEF", substr(s, i, 1)) - 1
               return n
             }
             # Skip the header row; the first default route wins. The bytes
             # are stored least-significant first, hence the 7,5,3,1 order.
             NR > 1 && $2 == "00000000" {
               printf "%d.%d.%d.%d\n", hex2dec(substr($3, 7, 2)), hex2dec(substr($3, 5, 2)), hex2dec(substr($3, 3, 2)), hex2dec(substr($3, 1, 2))
               exit
             }' /proc/net/route)
           [ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
           # RESOLVE holds only the option VALUE. The flag itself is passed as
           # its own word: a quoted "--resolve host:port:addr" would reach curl
           # as a single argument and be rejected as an unknown option.
           RESOLVE="$HOST:443:$HOST_IP"
           echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
           curl -fsS --resolve "$RESOLVE" --max-time 10 "$URL/login" -o /dev/null
           # Pin the preload-list-eligible HSTS value, not just header presence:
           # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
           # fail this check rather than pass it silently.
           curl -fsS --resolve "$RESOLVE" --max-time 10 -I "$URL/" \
             | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
           # Permissions-Policy denies APIs the app does not use (camera,
           # microphone, geolocation). A regression that loosens or drops the
           # header now fails the smoke step.
           curl -fsS --resolve "$RESOLVE" --max-time 10 -I "$URL/" \
             | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
           # No -f here: the HTTP status itself is the assertion. `|| true`
           # keeps `set -e` from aborting on a transport error, so the check
           # below still reports the status (curl prints 000 in that case).
           status=$(curl -s --resolve "$RESOLVE" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") || true
           [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
           echo "All smoke checks passed"
       - name: Cleanup env file
         # LOAD-BEARING: `if: always()` is what the ADR-011 single-tenant
         # runner trust model hinges on. Until this step runs, every secret
         # in .env.staging sits in plain text on the runner filesystem.
         # Dropping `if: always()` in a refactor would leave the env file
         # behind whenever a deploy fails. Do not remove this conditional
         # without first re-evaluating ADR-011.
         if: always()
         run: |
           rm -f .env.staging

281 lines 14 KiB YAML Raw Blame History Unescape Escape

281 lines

14 KiB

YAML

Raw Blame History