name: release

# Builds and deploys the production environment on `v*` tag push.
# Runs on the self-hosted runner via Docker-out-of-Docker; images are
# tagged with the actual git tag (e.g. v1.0.0) so rollback is
#   `TAG=<tag> docker compose -f docker-compose.prod.yml -p archiv-production up -d --wait`
# (for example `TAG=v1.0.0`).
#
# Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup):
#
#   1. Single-tenant self-hosted runner. The "Write production env file"
#      step writes every secret to .env.production on the runner
#      filesystem; the `if: always()` cleanup step removes it. A
#      multi-tenant runner would need to switch to
#      `docker compose --env-file <(stdin)` instead.
#
#   2. Host docker layer cache is authoritative. There is no
#      actions/cache; we rely on the host daemon to keep Maven and npm
#      layers warm between runs. A `docker system prune` on the host
#      will cause the next release build to be cold (5–10 min slower).
#
# Production environment:
#   - project name: archiv-production
#   - host ports:   backend 8080, frontend 3000
#   - profile:      (none) — mailpit is excluded; real SMTP relay is used
#
# Required Gitea secrets:
#   PROD_POSTGRES_PASSWORD
#   PROD_MINIO_PASSWORD
#   PROD_MINIO_APP_PASSWORD
#   PROD_OCR_TRAINING_TOKEN
#   PROD_APP_ADMIN_USERNAME   (CRITICAL: see docs/DEPLOYMENT.md)
#   PROD_APP_ADMIN_PASSWORD   (CRITICAL: locked in on first deploy)
#   MAIL_HOST
#   MAIL_PORT
#   MAIL_USERNAME
#   MAIL_PASSWORD
#   GRAFANA_ADMIN_PASSWORD
#   GLITCHTIP_SECRET_KEY
#   SENTRY_DSN                (set after GlitchTip first-run; empty = Sentry disabled)

on:
  push:
    tags:
      - "v*"

env:
  DOCKER_BUILDKIT: "1"

jobs:
  deploy-production:
    # See nightly.yml — same rationale: `ubuntu-latest` matches the
    # advertised label of our single-tenant self-hosted runner.
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Write production env file
        # NOTE(review): the `cat` line below carries BOTH an input redirect
        # from obs-secrets.env and a here-doc. The here-doc wins as stdin, so
        # as written the observability keys land in .env.production and
        # obs-secrets.env is only opened for reading (the step fails if that
        # file does not exist yet). The later steps pass obs-secrets.env via
        # --env-file and describe it as "written fresh from Gitea secrets
        # above", so this looks like two here-doc steps fused together with
        # the content between them lost (the production env body and the
        # build/deploy steps). The command is kept exactly as found —
        # TODO confirm against the repository copy and restore the missing
        # steps before relying on this workflow.
        run: |
          cat > .env.production < /opt/familienarchiv/obs-secrets.env <<'EOF'
          GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
          GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
          POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }}
          POSTGRES_HOST=archiv-production-db-1
          EOF
          # Note: POSTGRES_HOST is derived from the Compose project name (archiv-production)
          # and service name (db). A project rename requires updating this value.
          chmod 600 /opt/familienarchiv/obs-secrets.env

      - name: Validate observability compose config
        # Dry-run: resolves all variable substitutions and reports any missing
        # required keys before containers start. Catches undefined variables and
        # YAML errors in config files updated by the previous step.
        # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env
        # second (CI-written secrets). Later files win on duplicate keys, so
        # obs-secrets.env overrides POSTGRES_HOST set in obs.env.
        # Keep in sync with the equivalent step in nightly.yml (#603).
        run: |
          docker compose \
            -f /opt/familienarchiv/docker-compose.observability.yml \
            --env-file /opt/familienarchiv/infra/observability/obs.env \
            --env-file /opt/familienarchiv/obs-secrets.env \
            config --quiet

      - name: Start observability stack
        # Runs with absolute paths so bind mounts resolve to stable host paths
        # that survive workspace wipes between runs (see ADR-016).
        # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env
        # (written fresh from Gitea secrets above). --env-file order: obs.env first,
        # obs-secrets.env second — later file wins on duplicate keys.
        # Keep in sync with the equivalent step in nightly.yml (#603).
        run: |
          docker compose \
            -f /opt/familienarchiv/docker-compose.observability.yml \
            --env-file /opt/familienarchiv/infra/observability/obs.env \
            --env-file /opt/familienarchiv/obs-secrets.env \
            up -d --wait --remove-orphans

      - name: Assert observability stack health
        # docker compose up --wait covers services WITH healthcheck directives only.
        # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have
        # no healthcheck — they are considered "started" as soon as the process runs.
        # This step explicitly asserts the five healthchecked critical services are
        # healthy before the smoke test proceeds.
        # Keep in sync with the equivalent step in nightly.yml (#603).
        run: |
          set -e
          unhealthy=""
          for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
            # A container that does not exist makes `docker inspect` fail;
            # that case is reported as "missing" instead of aborting the loop,
            # so every unhealthy service is listed in one run.
            status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
            if [ "$status" != "healthy" ]; then
              echo "::error::$svc is not healthy (status: $status)"
              unhealthy="$unhealthy $svc"
            fi
          done
          [ -z "$unhealthy" ] || exit 1
          echo "All critical observability services are healthy"

      - name: Reload Caddy
        # See nightly.yml — same rationale and mechanism: DooD job containers
        # cannot call systemctl directly; nsenter via a privileged sibling
        # container reaches the host systemd. Must run after deploy (so the
        # latest Caddyfile is on disk) and before the smoke test (so the
        # public surface reflects the current config). Alpine with pinned
        # digest; reload not restart — see nightly.yml for full rationale.
        run: |
          docker run --rm --privileged --pid=host \
            alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
            sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'

      - name: Smoke test deployed environment
        # See nightly.yml — same three checks, against the prod vhost.
        # --resolve pins to the bridge gateway IP (the host), not 127.0.0.1
        # — see nightly.yml for the full network topology explanation.
        run: |
          set -e
          HOST="archiv.raddatz.cloud"
          URL="https://$HOST"
          # `exit` after the first match keeps HOST_IP a single address even
          # when more than one default route is present.
          HOST_IP=$(ip route show default | awk '/default/ {print $3; exit}')
          [ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via 'ip route'"; exit 1; }
          # RESOLVE holds only the VALUE of curl's --resolve option. The flag
          # and its value must reach curl as two separate arguments; storing
          # "--resolve <value>" in one variable and expanding it quoted hands
          # curl a single argument that it rejects as an unknown option.
          RESOLVE="$HOST:443:$HOST_IP"
          echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
          curl -fsS --resolve "$RESOLVE" --max-time 10 "$URL/login" -o /dev/null
          # Pin the preload-list-eligible HSTS value, not just header presence:
          # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
          # fail this check rather than pass it silently.
          curl -fsS --resolve "$RESOLVE" --max-time 10 -I "$URL/" \
            | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
          # Permissions-Policy denies APIs the app does not use (camera,
          # microphone, geolocation). A regression that loosens or drops the
          # header now fails the smoke step.
          curl -fsS --resolve "$RESOLVE" --max-time 10 -I "$URL/" \
            | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
          # The actuator endpoint must not be reachable through the public
          # vhost: anything other than 404 fails the deploy.
          status=$(curl -s --resolve "$RESOLVE" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
          [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
          echo "All smoke checks passed"

      - name: Cleanup env file
        # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
        # single-tenant runner trust model. Every secret in
        # .env.production is plain text on the runner filesystem until
        # this step runs. If a future refactor drops `if: always()`, a
        # failed deploy leaves the env-file behind. Do not remove this
        # conditional without first re-evaluating ADR-011.
        if: always()
        run: rm -f .env.production