# familienarchiv/.gitea/workflows/nightly.yml

# Display name of this workflow.
name: nightly

# Builds and deploys the staging environment from main every night.
# Runs on the self-hosted runner using Docker-out-of-Docker (the docker
# socket is mounted in), so `docker compose build` produces images on
# the host daemon and `docker compose up` consumes them directly — no
# registry hop.
#
# Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup):
#
#   1. Single-tenant self-hosted runner. The "Write staging env file" step
#      writes every secret to .env.staging on the runner filesystem; the
#      `if: always()` "Cleanup env file" step removes it. A multi-tenant
#      runner would instead need to keep the secrets off disk, e.g. by
#      feeding them in via process substitution (`--env-file <(...)`).
#
#   2. Host docker layer cache is authoritative. There is no
#      actions/cache; we rely on the host daemon to keep Maven and npm
#      layers warm between runs. A `docker system prune` on the host
#      will cause the next nightly build to be cold (5–10 min slower).
#
# Staging environment isolation:
#   - project name: archiv-staging
#   - host ports:   backend 8081, frontend 3001
#   - profile:      staging (starts mailpit instead of a real SMTP relay)
#
# Required Gitea secrets:
#   STAGING_POSTGRES_PASSWORD
#   STAGING_MINIO_PASSWORD
#   STAGING_MINIO_APP_PASSWORD
#   STAGING_OCR_TRAINING_TOKEN
#   STAGING_APP_ADMIN_USERNAME
#   STAGING_APP_ADMIN_PASSWORD

on:
  # Scheduled trigger: minute 0, hour 2, every day.
  # NOTE(review): the timezone is whatever the Gitea scheduler uses — confirm.
  schedule:
    - cron: '0 2 * * *'
  # Manual trigger without inputs.
  workflow_dispatch:

env:
  # Workflow-wide: enable BuildKit so the backend Dockerfile's
  # `RUN --mount=type=cache` lines are honoured (the Maven cache then
  # survives between runs). Quoted so the value stays a string.
  DOCKER_BUILDKIT: '1'

jobs:
  # Single job: write the env file, verify the rendered compose config,
  # build, deploy, reload Caddy, smoke-test, and always remove the env file.
  deploy-staging:
    # `ubuntu-latest` matches our self-hosted runner's advertised label
    # (the runner has labels: ubuntu-latest / ubuntu-24.04 / ubuntu-22.04).
    # `self-hosted` would never match — no runner advertises it — so the
    # job would park in the queue forever. ADR-011's "single-tenant" promise
    # is at the repo level; sharing this runner between CI and deploys
    # for the same repo is within that boundary.
    runs-on: ubuntu-latest
    steps:
      # Check out the repository; the compose steps below reference
      # docker-compose.prod.yml relative to the workspace root.
      - uses: actions/checkout@v4

      - name: Write staging env file
        # Renders .env.staging in the workspace; the compose steps below
        # consume it via --env-file and the "Cleanup env file" step
        # removes it again.
        #
        # `${{ secrets.* }}` expressions are substituted by the runner
        # BEFORE the shell sees the script, so the secret values end up
        # verbatim inside the heredoc body. The delimiter is therefore
        # quoted ('EOF'): with an unquoted delimiter the shell would
        # expand `$`, backticks and backslashes inside a password,
        # silently corrupting it (or executing a command substitution).
        #
        # `umask 077` makes the freshly created file mode 0600 so the
        # plain-text secrets are readable only by the runner user. The
        # preceding `rm -f` guarantees the file really is created fresh
        # (umask does not tighten the mode of a pre-existing file).
        #
        # NOTE(review): docker compose applies its own `$VAR`
        # interpolation to unquoted env-file values — confirm that no
        # secret contains a literal `$` or `#`.
        run: |
          umask 077
          rm -f .env.staging
          cat > .env.staging <<'EOF'
          TAG=nightly
          PORT_BACKEND=8081
          PORT_FRONTEND=3001
          APP_DOMAIN=staging.raddatz.cloud
          POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
          MINIO_PASSWORD=${{ secrets.STAGING_MINIO_PASSWORD }}
          MINIO_APP_PASSWORD=${{ secrets.STAGING_MINIO_APP_PASSWORD }}
          OCR_TRAINING_TOKEN=${{ secrets.STAGING_OCR_TRAINING_TOKEN }}
          APP_ADMIN_USERNAME=${{ secrets.STAGING_APP_ADMIN_USERNAME }}
          APP_ADMIN_PASSWORD=${{ secrets.STAGING_APP_ADMIN_PASSWORD }}
          MAIL_HOST=mailpit
          MAIL_PORT=1025
          MAIL_USERNAME=
          MAIL_PASSWORD=
          MAIL_SMTP_AUTH=false
          MAIL_STARTTLS_ENABLE=false
          APP_MAIL_FROM=noreply@staging.raddatz.cloud
          IMPORT_HOST_DIR=/srv/familienarchiv-staging/import
          EOF

      - name: Verify backend /import:ro mount is wired
        # Regression guard for #526: the /admin/system mass-import card
        # only works when the backend service mounts the host import
        # payload at /import (read-only). If a future "compose cleanup"
        # PR drops the volumes block, mass import silently breaks again.
        # `compose config` renders both shorthand and longform mounts as
        # `target: /import` + `read_only: true`, so we assert against
        # the rendered form rather than the raw source YAML.
        #
        # `compose config` resolves ${VAR} references from .env.staging,
        # so the rendered output can contain the staging secrets in plain
        # text. It is therefore written to a private `mktemp` file (mode
        # 0600) and removed by an EXIT trap whether the checks pass or
        # fail — a fixed path under /tmp would outlive the job, and the
        # "Cleanup env file" step only removes .env.staging.
        run: |
          set -e
          rendered="$(mktemp)"
          trap 'rm -f "$rendered"' EXIT
          docker compose \
            -f docker-compose.prod.yml \
            -p archiv-staging \
            --env-file .env.staging \
            --profile staging \
            config > "$rendered"
          # 1) the bind mount exists at all
          grep -q '^[[:space:]]*target: /import$' "$rendered" \
            || { echo "::error::backend is missing the /import bind mount (see #526)"; exit 1; }
          # 2) it is read-only: `read_only: true` must appear within the
          #    two lines following the `target: /import` line
          grep -A2 '^[[:space:]]*target: /import$' "$rendered" \
            | grep -q 'read_only: true' \
            || { echo "::error::backend /import mount is not read-only (see #526)"; exit 1; }

      - name: Build images
        # Builds every image of the staging profile on the host daemon.
        # `--pull` re-fetches the pinned base images so that a CVE
        # re-publication under an unchanged tag (e.g.
        # node:20.19.0-alpine3.21, postgres:16-alpine) is picked up
        # rather than served from the host's stale Docker layer cache.
        run: |
          docker compose \
            --file docker-compose.prod.yml \
            --project-name archiv-staging \
            --env-file .env.staging \
            --profile staging \
            build --pull

      - name: Deploy staging
        # Starts (or replaces) the archiv-staging project in the
        # background. `--wait` blocks until the services are
        # running/healthy, so a broken container fails this step;
        # `--remove-orphans` drops containers of services that are no
        # longer defined in the compose file.
        run: |
          docker compose \
            --file docker-compose.prod.yml \
            --project-name archiv-staging \
            --env-file .env.staging \
            --profile staging \
            up --detach --wait --remove-orphans

      - name: Reload Caddy
        # Apply any committed Caddyfile changes before smoke-testing the
        # public surface. Without this step, a Caddyfile edit lands in the
        # repo but Caddy keeps serving the previous config until someone
        # reloads it manually — the smoke test would then catch a stale
        # header or a still-proxied /actuator route rather than confirming
        # the current config is live.
        #
        # `systemctl reload caddy` runs the unit's reload action; Caddy
        # re-reads /etc/caddy/Caddyfile (symlinked to infra/caddy/Caddyfile)
        # without dropping connections.
        # NOTE(review): this is presumably not a SIGHUP — Caddy v2 documents
        # SIGHUP as ignored, and the stock caddy.service reloads through
        # `caddy reload`. Confirm against the unit file on the host.
        #
        # If Caddy is not running this step fails fast and clearly before the
        # smoke test issues a misleading "port 443 refused" error.
        run: sudo systemctl reload caddy

      - name: Smoke test deployed environment
        # Healthchecks confirm containers are healthy; they do NOT confirm the
        # public surface works. This step catches: Caddy not reloaded, HSTS
        # header dropped, /actuator block bypassed.
        #
        # --resolve pins staging.raddatz.cloud to the runner's loopback so we
        # do NOT depend on the host router doing hairpin NAT (many SOHO
        # routers do not, or do so only after a firmware update). SNI still
        # uses the public hostname so the cert validates correctly.
        #
        # Every check reports WHICH assertion failed through `fail`, using the
        # same `::error::` annotation style as the /import mount verification
        # step. The response headers are fetched once into a variable and
        # then matched: piping `curl -I` straight into `grep -q` can make curl
        # exit 23 (write error) when grep quits at the first match, which
        # fails the pipeline if the shell runs with `pipefail`.
        run: |
          set -e
          HOST="staging.raddatz.cloud"
          URL="https://$HOST"
          # Expanded UNQUOTED on purpose below so it splits into two curl args.
          RESOLVE="--resolve $HOST:443:127.0.0.1"
          fail() { echo "::error::$1"; exit 1; }
          echo "Smoke test: $URL (pinned to 127.0.0.1)"
          # 1) The login page is reachable (-f turns HTTP >= 400 into a failure).
          curl -fsS $RESOLVE --max-time 10 "$URL/login" -o /dev/null \
            || fail "GET $URL/login failed"
          # 2) One HEAD request; both header assertions run against its output.
          headers=$(curl -fsS $RESOLVE --max-time 10 -I "$URL/") \
            || fail "HEAD $URL/ failed"
          # Pin the preload-list-eligible HSTS value, not just header presence:
          # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
          # fail this check rather than pass it silently.
          printf '%s\n' "$headers" \
            | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' \
            || fail "Strict-Transport-Security header is missing or weaker than expected"
          # Permissions-Policy denies APIs the app does not use (camera,
          # microphone, geolocation). A regression that loosens or drops the
          # header now fails the smoke step.
          printf '%s\n' "$headers" \
            | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)' \
            || fail "Permissions-Policy header is missing or looser than expected"
          # 3) /actuator must be blocked at the proxy. `|| true` keeps a
          #    transport failure (curl prints 000) from aborting under `set -e`
          #    before the explicit check below can report it.
          status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") || true
          [ "$status" = "404" ] || fail "expected 404 from /actuator/health, got $status"
          echo "All smoke checks passed"

      - name: Cleanup env file
        # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
        # single-tenant runner trust model. Every secret in .env.staging
        # is plain text on the runner filesystem until this step runs.
        # If a future refactor drops `if: always()`, a failed deploy
        # leaves the env-file behind. Do not remove this conditional
        # without first re-evaluating ADR-011.
        #
        # `rm -f` also succeeds when the file does not exist (e.g. the job
        # failed before the env file was written), so this step does not
        # add a failure of its own in that case.
        if: always()
        run: rm -f .env.staging