# familienarchiv/.gitea/workflows/release.yml

name: release

# Builds and deploys the production environment on `v*` tag push.
# Runs on the self-hosted runner via Docker-out-of-Docker; images are
# tagged with the actual git tag (e.g. v1.0.0) so rollback is
#   `TAG=<previous> docker compose -f docker-compose.prod.yml -p archiv-production up -d --wait`
#
# Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup):
#
#   1. Single-tenant self-hosted runner. The "Write production env file"
#      step writes every secret to .env.production on the runner
#      filesystem; the `if: always()` cleanup step removes it. A
#      multi-tenant runner would need to switch to
#      `docker compose --env-file <(stdin)` instead.
#
#   2. Host docker layer cache is authoritative. There is no
#      actions/cache; we rely on the host daemon to keep Maven and npm
#      layers warm between runs. A `docker system prune` on the host
#      will cause the next release build to be cold (5–10 min slower).
#
# Production environment:
#   - project name: archiv-production
#   - host ports:   backend 8080, frontend 3000
#   - profile:      (none) — mailpit is excluded; real SMTP relay is used
#
# Required Gitea secrets:
#   PROD_POSTGRES_PASSWORD
#   PROD_MINIO_PASSWORD
#   PROD_MINIO_APP_PASSWORD
#   PROD_OCR_TRAINING_TOKEN
#   PROD_APP_ADMIN_USERNAME       (CRITICAL: see docs/DEPLOYMENT.md)
#   PROD_APP_ADMIN_PASSWORD       (CRITICAL: locked in on first deploy)
#   MAIL_HOST
#   MAIL_PORT
#   MAIL_USERNAME
#   MAIL_PASSWORD
#   GRAFANA_ADMIN_PASSWORD
#   GLITCHTIP_SECRET_KEY
#   SENTRY_DSN                    (set after GlitchTip first-run; empty = Sentry disabled)

# Trigger: only tag pushes whose tag name matches the glob `v*`.
# The pattern is quoted because a plain scalar containing `*` is easy to
# misread as a YAML alias.
on:
  push:
    tags:
      - "v*"

# Workflow-level environment, visible to every step of every job below.
# DOCKER_BUILDKIT is quoted so it is passed as the string "1", not an integer.
env:
  DOCKER_BUILDKIT: "1"

jobs:
  # Single job: write env file -> build -> deploy app stack -> deploy and
  # verify observability stack -> reload Caddy -> smoke test -> cleanup.
  deploy-production:
    # See nightly.yml — same rationale: `ubuntu-latest` matches the
    # advertised label of our single-tenant self-hosted runner.
    runs-on: ubuntu-latest
    steps:
      # Check out the repository so the files the later steps reference by
      # relative path (docker-compose.prod.yml, docker-compose.observability.yml,
      # infra/observability/) are present in the workspace.
      - uses: actions/checkout@v4

      - name: Write production env file
        # Renders .env.production, the --env-file consumed by the build and
        # deploy steps below. It holds plain-text secrets and is removed by
        # the final `if: always()` cleanup step.
        #
        # The heredoc delimiter is quoted ('EOF') on purpose: the runner has
        # already substituted every workflow expression before the shell
        # starts, so the body must be written verbatim. With an unquoted
        # delimiter the shell would expand `$`, backticks and `\` inside the
        # substituted values, corrupting a password that contains `$` and
        # executing command substitutions embedded in a crafted tag name.
        #
        # `umask 077` makes the file owner-only from the moment it is
        # created; the trailing `chmod 600` also covers a leftover file from
        # an aborted run, whose mode `>` would otherwise preserve.
        run: |
          umask 077
          cat > .env.production <<'EOF'
          TAG=${{ gitea.ref_name }}
          PORT_BACKEND=8080
          PORT_FRONTEND=3000
          APP_DOMAIN=archiv.raddatz.cloud
          POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }}
          MINIO_PASSWORD=${{ secrets.PROD_MINIO_PASSWORD }}
          MINIO_APP_PASSWORD=${{ secrets.PROD_MINIO_APP_PASSWORD }}
          OCR_TRAINING_TOKEN=${{ secrets.PROD_OCR_TRAINING_TOKEN }}
          APP_ADMIN_USERNAME=${{ secrets.PROD_APP_ADMIN_USERNAME }}
          APP_ADMIN_PASSWORD=${{ secrets.PROD_APP_ADMIN_PASSWORD }}
          MAIL_HOST=${{ secrets.MAIL_HOST }}
          MAIL_PORT=${{ secrets.MAIL_PORT }}
          MAIL_USERNAME=${{ secrets.MAIL_USERNAME }}
          MAIL_PASSWORD=${{ secrets.MAIL_PASSWORD }}
          MAIL_SMTP_AUTH=true
          MAIL_STARTTLS_ENABLE=true
          APP_MAIL_FROM=noreply@raddatz.cloud
          IMPORT_HOST_DIR=/srv/familienarchiv-production/import
          POSTGRES_USER=archiv
          SENTRY_DSN=${{ secrets.SENTRY_DSN }}
          EOF
          chmod 600 .env.production

      - name: Build images
        # Build every image of the production compose project. `--pull`
        # re-fetches the pinned base images, so a tag that was re-published
        # with a CVE fix is used instead of the stale copy in the host's
        # Docker layer cache.
        run: |
          docker compose \
            --file docker-compose.prod.yml \
            --project-name archiv-production \
            --env-file .env.production \
            build --pull

      - name: Deploy production
        # Start (or update) the production project in the background and
        # block until compose reports the services up; containers of
        # services no longer defined in the compose file are removed.
        run: |
          docker compose \
            --file docker-compose.prod.yml \
            --project-name archiv-production \
            --env-file .env.production \
            up --detach --wait --remove-orphans

      - name: Deploy observability configs
        # Same approach as the nightly workflow: replace the config tree and
        # compose file under /opt/familienarchiv/ (a permanent path that
        # survives workspace wipes — ADR-016) with the checked-out versions,
        # then regenerate obs-secrets.env from Gitea secrets.
        # Non-secret settings stay in infra/observability/obs.env (tracked in git).
        run: |
          obs_root=/opt/familienarchiv
          obs_cfg="$obs_root/infra/observability"
          secrets_file="$obs_root/obs-secrets.env"

          # Start from an empty directory so files deleted from git do not linger.
          rm -rf "$obs_cfg"
          mkdir -p "$obs_cfg"
          cp -r infra/observability/. "$obs_cfg/"
          cp docker-compose.observability.yml "$obs_root/"

          # Note: POSTGRES_HOST is derived from the Compose project name (archiv-production)
          # and service name (db). A project rename requires updating this value.
          cat > "$secrets_file" <<'EOF'
          GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
          GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
          POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }}
          POSTGRES_HOST=archiv-production-db-1
          EOF
          chmod 600 "$secrets_file"

      - name: Validate observability compose config
        # Dry run before any container is touched: `config` resolves every
        # variable substitution and fails on missing required keys or YAML
        # errors in the files copied by the previous step; `--quiet`
        # suppresses the rendered output.
        # Env-file precedence: obs.env (git-tracked defaults) is loaded first,
        # obs-secrets.env (CI-written secrets) second. The later file wins on
        # duplicate keys, so obs-secrets.env overrides the POSTGRES_HOST from
        # obs.env.
        # Keep in sync with the equivalent step in nightly.yml (#603).
        run: |
          docker compose \
            --file /opt/familienarchiv/docker-compose.observability.yml \
            --env-file /opt/familienarchiv/infra/observability/obs.env \
            --env-file /opt/familienarchiv/obs-secrets.env \
            config --quiet

      - name: Start observability stack
        # All paths are absolute so bind mounts point at stable host
        # locations that outlive the per-run workspace (see ADR-016).
        # Env-file precedence: obs.env (git-tracked, non-secret) first,
        # obs-secrets.env (freshly written from Gitea secrets) second — the
        # later file wins on duplicate keys.
        # Keep in sync with the equivalent step in nightly.yml (#603).
        run: |
          docker compose \
            --file /opt/familienarchiv/docker-compose.observability.yml \
            --env-file /opt/familienarchiv/infra/observability/obs.env \
            --env-file /opt/familienarchiv/obs-secrets.env \
            up --detach --wait --remove-orphans

      - name: Assert observability stack health
        # `docker compose up --wait` only waits on services that declare a
        # healthcheck. obs-promtail, obs-cadvisor, obs-node-exporter and
        # obs-glitchtip-worker declare none, so they count as "started" the
        # moment the process runs. This step therefore checks the five
        # healthchecked critical containers explicitly, reports every
        # unhealthy one, and fails before the smoke test if any is not healthy.
        # Keep in sync with the equivalent step in nightly.yml (#603).
        run: |
          set -e
          failed=""
          for name in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
            # A failing inspect (e.g. container absent) is reported as "missing".
            health=$(docker inspect --format '{{.State.Health.Status}}' "$name" 2>/dev/null || echo "missing")
            [ "$health" = "healthy" ] && continue
            echo "::error::$name is not healthy (status: $health)"
            failed="$failed $name"
          done
          if [ -n "$failed" ]; then
            exit 1
          fi
          echo "All critical observability services are healthy"

      - name: Reload Caddy
        # See nightly.yml — same rationale and mechanism: DooD job containers
        # cannot call systemctl directly; nsenter via a privileged sibling
        # container reaches the host systemd. Must run after deploy (so the
        # latest Caddyfile is on disk) and before the smoke test (so the
        # public surface reflects the current config). Alpine with pinned
        # digest; reload not restart — see nightly.yml for full rationale.
        #
        # Command breakdown:
        #   --privileged --pid=host  let the helper container see and enter
        #                            the namespaces of host PID 1
        #   apk add util-linux       installs the nsenter binary used below
        #   nsenter -t 1 -m -u -n -p -i
        #                            enter the mount, UTS, network, PID and IPC
        #                            namespaces of PID 1, then run the host's
        #                            /bin/systemctl there
        run: |
          docker run --rm --privileged --pid=host \
            alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
            sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'

      - name: Smoke test deployed environment
        # See nightly.yml — same three checks, against the prod vhost.
        # --resolve pins to the bridge gateway IP (the host), not 127.0.0.1
        # — see nightly.yml for the full network topology explanation.
        #
        # Checks, in order (any failure fails the step via `set -e`):
        #   1. GET /login succeeds (curl -f turns HTTP >= 400 into an error).
        #   2. HSTS header carries the exact preload-eligible value.
        #   3. Permissions-Policy header denies camera/microphone/geolocation.
        #   4. /actuator/health is NOT publicly reachable (must be 404).
        #
        # RESOLVE holds only the `host:port:addr` value and is passed as the
        # argument of an explicit `--resolve` flag. Packing flag and value into
        # one quoted variable hands curl a single argument containing a space,
        # which it rejects as an unknown option.
        run: |
          set -e
          HOST="archiv.raddatz.cloud"
          URL="https://$HOST"
          # `exit` after the first match keeps HOST_IP a single address even
          # if more than one default route is configured.
          HOST_IP=$(ip route show default | awk '/default/ {print $3; exit}')
          [ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via 'ip route'"; exit 1; }
          RESOLVE="$HOST:443:$HOST_IP"
          echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
          curl -fsS --resolve "$RESOLVE" --max-time 10 "$URL/login" -o /dev/null
          # Pin the preload-list-eligible HSTS value, not just header presence:
          # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
          # fail this check rather than pass it silently.
          curl -fsS --resolve "$RESOLVE" --max-time 10 -I "$URL/" \
            | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
          # Permissions-Policy denies APIs the app does not use (camera,
          # microphone, geolocation). A regression that loosens or drops the
          # header now fails the smoke step.
          curl -fsS --resolve "$RESOLVE" --max-time 10 -I "$URL/" \
            | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
          # No -f here: the 404 is the expected outcome and is asserted below.
          status=$(curl -s --resolve "$RESOLVE" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
          [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
          echo "All smoke checks passed"

      - name: Cleanup env file
        # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
        # single-tenant runner trust model. Every secret in
        # .env.production is plain text on the runner filesystem until
        # this step runs. If a future refactor drops `if: always()`, a
        # failed deploy leaves the env-file behind. Do not remove this
        # conditional without first re-evaluating ADR-011.
        #
        # `rm -f` also succeeds when the file does not exist (e.g. the run
        # failed before the env file was written), so the cleanup itself
        # cannot turn such a run red for a second reason.
        # Only .env.production is removed here; this step does not touch
        # /opt/familienarchiv/obs-secrets.env.
        if: always()
        run: rm -f .env.production