familienarchiv/.gitea/workflows/nightly.yml

name: nightly

# Builds and deploys the staging environment from main every night.
# Runs on the self-hosted runner using Docker-out-of-Docker (the docker
# socket is mounted in), so `docker compose build` produces images on
# the host daemon and `docker compose up` consumes them directly — no
# registry hop.
#
# Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup):
#
#   1. Single-tenant self-hosted runner. The "Write staging env file" step
#      writes every secret to .env.staging on the runner filesystem; the
#      `if: always()` cleanup step removes it. A multi-tenant runner
#      would need to switch to docker compose --env-file <(stdin) instead.
#
#   2. Host docker layer cache is authoritative. There is no
#      actions/cache; we rely on the host daemon to keep Maven and npm
#      layers warm between runs. A `docker system prune` on the host
#      will cause the next nightly build to be cold (5–10 min slower).
#
# Staging environment isolation:
#   - project name: archiv-staging
#   - host ports:   backend 8081, frontend 3001
#   - profile:      staging (starts mailpit instead of a real SMTP relay)
#
# The obs-stack deploy, Caddy reload, and smoke test are shared with
# release.yml via the composite actions under .gitea/actions/ (ADR-029).
# actions/checkout MUST stay the first step: a local `uses: ./…` action
# only exists on disk after checkout.
#
# Required Gitea secrets:
#   STAGING_POSTGRES_PASSWORD
#   STAGING_MINIO_PASSWORD
#   STAGING_MINIO_APP_PASSWORD
#   STAGING_OCR_TRAINING_TOKEN
#   STAGING_APP_ADMIN_USERNAME
#   STAGING_APP_ADMIN_PASSWORD
#   GRAFANA_ADMIN_PASSWORD
#   GRAFANA_DB_PASSWORD         (read-only grafana_reader DB role, issue #651)
#   GLITCHTIP_SECRET_KEY
#   SENTRY_DSN                  (set after GlitchTip first-run; empty = Sentry disabled)
#   VITE_SENTRY_DSN             (written to .env.staging as VITE_SENTRY_DSN by the env-file step)

# Triggers: the nightly schedule, plus manual runs.
on:
  schedule:
    # Minute 0 of hour 2, every day.
    - cron: '0 2 * * *'
  # Lets the workflow be started by hand.
  workflow_dispatch:

env:
  # BuildKit has to be on for the backend Dockerfile's
  # `RUN --mount=type=cache` instructions to take effect; that cache mount
  # is what lets the Maven cache persist from one run to the next.
  DOCKER_BUILDKIT: '1'

jobs:
  deploy-staging:
    # Writes the staging env file, builds and starts the archiv-staging
    # compose project, then runs the shared obs-stack, Caddy-reload and
    # smoke-test actions. The env file is removed again in the last step.
    #
    # `ubuntu-latest` matches our self-hosted runner's advertised label
    # (the runner has labels: ubuntu-latest / ubuntu-24.04 / ubuntu-22.04).
    # `self-hosted` would never match — no runner advertises it — so the
    # job parks in the queue forever. ADR-011's "single-tenant" promise
    # is at the repo level; sharing this runner between CI and deploys
    # for the same repo is within that boundary.
    runs-on: ubuntu-latest
    steps:
      # MUST be first: the composite actions below live under .gitea/actions/
      # and only exist on disk once the repo is checked out (ADR-029).
      - uses: actions/checkout@v4

      - name: Write staging env file
        # Renders .env.staging in the workspace; every later
        # `docker compose --env-file .env.staging` call in this job reads it.
        #
        # Secret expressions are substituted into the script text before the
        # shell runs it, so the heredoc delimiter is quoted (<<'EOF'). With
        # an unquoted delimiter the shell would expand any `$`, backtick or
        # backslash sequence that happens to be part of a secret value and
        # write a corrupted value (or run a command substitution).
        #
        # `umask 077` creates the file owner-only; the `chmod 600` covers a
        # pre-existing file, whose mode a `>` redirect would otherwise keep.
        run: |
          umask 077
          cat > .env.staging <<'EOF'
          TAG=nightly
          PORT_BACKEND=8081
          PORT_FRONTEND=3001
          APP_DOMAIN=staging.raddatz.cloud
          POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
          MINIO_PASSWORD=${{ secrets.STAGING_MINIO_PASSWORD }}
          MINIO_APP_PASSWORD=${{ secrets.STAGING_MINIO_APP_PASSWORD }}
          OCR_TRAINING_TOKEN=${{ secrets.STAGING_OCR_TRAINING_TOKEN }}
          APP_ADMIN_USERNAME=${{ secrets.STAGING_APP_ADMIN_USERNAME }}
          APP_ADMIN_PASSWORD=${{ secrets.STAGING_APP_ADMIN_PASSWORD }}
          MAIL_HOST=mailpit
          MAIL_PORT=1025
          MAIL_USERNAME=
          MAIL_PASSWORD=
          MAIL_SMTP_AUTH=false
          MAIL_STARTTLS_ENABLE=false
          APP_MAIL_FROM=noreply@staging.raddatz.cloud
          IMPORT_HOST_DIR=/srv/familienarchiv-staging/import
          POSTGRES_USER=archiv
          SENTRY_DSN=${{ secrets.SENTRY_DSN }}
          VITE_SENTRY_DSN=${{ secrets.VITE_SENTRY_DSN }}
          GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
          EOF
          chmod 600 .env.staging

      - name: Verify backend /import:ro mount is wired
        # Regression guard for #526: the /admin/system mass-import card
        # only works when the backend service mounts the host import
        # payload at /import (read-only). If a future "compose cleanup"
        # PR drops the volumes block, mass import silently breaks again.
        # `compose config` renders both shorthand and longform mounts as
        # `target: /import` + `read_only: true`, so we assert against
        # the rendered form rather than the raw source YAML.
        # App-compose check (not obs), nightly-only — stays inline.
        #
        # `compose config` also interpolates the --env-file values, so the
        # rendered file can hold the staging secrets in plain text. It is
        # therefore created owner-only (umask 077, after removing any
        # leftover copy) and deleted by the EXIT trap on every exit path
        # of this step, pass or fail.
        run: |
          set -e
          umask 077
          trap 'rm -f /tmp/compose-rendered.yml' EXIT
          rm -f /tmp/compose-rendered.yml
          docker compose \
            -f docker-compose.prod.yml \
            -p archiv-staging \
            --env-file .env.staging \
            --profile staging \
            config > /tmp/compose-rendered.yml
          grep -q '^[[:space:]]*target: /import$' /tmp/compose-rendered.yml \
            || { echo "::error::backend is missing the /import bind mount (see #526)"; exit 1; }
          grep -A2 '^[[:space:]]*target: /import$' /tmp/compose-rendered.yml \
            | grep -q 'read_only: true' \
            || { echo "::error::backend /import mount is not read-only (see #526)"; exit 1; }

      - name: Build images
        # Base images are pinned by tag, and a tag can be re-published with
        # a CVE fix (e.g. node:20.19.0-alpine3.21, postgres:16-alpine).
        # `--pull` makes the build re-fetch them instead of reusing the
        # possibly stale copy in the host's Docker layer cache.
        # The folded scalar below is a single command line.
        run: >-
          docker compose
          --file docker-compose.prod.yml
          --project-name archiv-staging
          --env-file .env.staging
          --profile staging
          build --pull

      - name: Deploy staging
        # Brings the archiv-staging project up in the background from the
        # images built in the previous step. The folded scalar below is a
        # single command line.
        run: >-
          docker compose
          --file docker-compose.prod.yml
          --project-name archiv-staging
          --env-file .env.staging
          --profile staging
          up --detach --wait --remove-orphans

      # Obs-stack deploy through the shared composite action.
      # The postgres_host value is built from the Compose project name
      # (archiv-staging) and the service name (db); renaming the project
      # means updating it here as well.
      - uses: ./.gitea/actions/deploy-obs
        with:
          postgres_host: archiv-staging-db-1
          postgres_password: ${{ secrets.STAGING_POSTGRES_PASSWORD }}
          grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
          grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
          glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}

      # Caddy reload through the local composite action under .gitea/actions/.
      - uses: ./.gitea/actions/reload-caddy

      # Smoke test through the local composite action; `host` is the same
      # domain the env-file step writes as APP_DOMAIN.
      - uses: ./.gitea/actions/smoke-test
        with:
          host: staging.raddatz.cloud

      - name: Cleanup env file
        # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
        # single-tenant runner trust model. Every secret in .env.staging
        # is plain text on the runner filesystem until this step runs.
        # If a future refactor drops `if: always()`, a failed deploy
        # leaves the env-file behind. Do not remove this conditional
        # without first re-evaluating ADR-011.
        #
        # /tmp/compose-rendered.yml is removed as well when present: it is
        # `docker compose config` output rendered with --env-file
        # .env.staging, so it can carry the same secrets in plain text.
        if: always()
        run: rm -f .env.staging /tmp/compose-rendered.yml

  npm-audit:
    # Runs `npm audit` against frontend/ and, when it exits non-zero,
    # opens or updates a tracking issue through the Gitea API.
    #
    # Independent parallel job — a deploy failure cannot mask the audit signal
    # and a clean audit cannot hide a broken deploy. Intentionally no `needs:`.
    #
    # Scans dev deps too (no --omit=dev), which is deliberately broader than the
    # PR gate (ci.yml §Security audit) that uses --omit=dev. A nightly broader
    # result is NOT a PR gate failure — it catches dev-tooling advisories (esbuild,
    # Vite, etc.) early. See docs/infrastructure/ci-gitea.md §Nightly audit vs PR gate.
    #
    # Required Gitea secrets:
    #   NIGHTLY_AUDIT_TOKEN  — PAT with issues scope only. An issues-only token
    #                          means a leak via logs/process-args cannot push
    #                          branches, open PRs, or read repo contents (ADR-041).
    runs-on: ubuntu-latest
    steps:
      # Checkout provides frontend/ for the audit step below.
      - uses: actions/checkout@v4

      - name: Assert jq is available
        # The audit step builds and parses every API payload with jq, so
        # install it when the runner image does not ship it.
        # `apt-get update` runs first: with empty or stale package lists
        # `apt-get install` cannot resolve the package. sudo is used only
        # when the step is not already root, because a root container may
        # not have sudo installed.
        run: |
          if ! command -v jq; then
            if [ "$(id -u)" -eq 0 ]; then
              apt-get update
              apt-get install -y jq
            else
              sudo apt-get update
              sudo apt-get install -y jq
            fi
          fi

      - name: Run npm audit and file tracking issue on findings
        # Never run under set -x — NIGHTLY_AUDIT_TOKEN in env would leak to logs.
        env:
          NIGHTLY_AUDIT_TOKEN: ${{ secrets.NIGHTLY_AUDIT_TOKEN }}
        run: |
          # This script depends on errexit: it toggles `set +e` / `set -e`
          # around the audit and expects a failed API call to stop the step.
          # Enable it here instead of inheriting it from whatever shell
          # invocation the runner defaults to. pipefail makes a failing
          # command on the left of a `| jq` pipeline count as a failure too.
          set -eo pipefail

          # Title of the tracking issue; also the key the dedupe step matches
          # against open issue titles.
          MARKER="Nightly npm audit: high-severity advisory"
          GITEA_URL="${{ github.server_url }}"
          REPO="${{ github.repository }}"
          # Link back to this workflow run, embedded in the issue body.
          RUN_URL="${GITEA_URL}/${REPO}/actions/runs/${{ github.run_id }}"

          # --- Gitea API helper ---
          # api METHOD URL [extra curl args...] — authenticated Gitea API call.
          #   stdout: the response body (printed on failure as well)
          #   stderr: an ::error:: annotation whenever the call fails
          #   return: 0 on HTTP 2xx/3xx, 1 otherwise
          # `curl -sf` collapses every HTTP >=400 into a bare "exit 22", which
          # surfaces as an opaque step failure (issue #839). Instead we read the
          # status code and, on a >=400 response, print an actionable ::error::
          # to stderr (so a calling command substitution does not swallow it) and
          # return 1 — `set -e` then still fails the step. The token is never
          # echoed (no set -x; never placed in the message).
          api() {
            local method="$1" url="$2"; shift 2
            local resp http
            # `|| :` matters when api is called directly under errexit (not
            # inside `$(...)` or an `if`): without it a curl transport failure
            # (DNS, connect, TLS, timeout) would abort the script right here,
            # silently, before the case below can print its ::error::.
            resp=$(curl -s -w '\n%{http_code}' -X "$method" \
              -H "Authorization: token $NIGHTLY_AUDIT_TOKEN" "$@" -- "$url") || :
            # The -w suffix puts the status code on the last line; everything
            # before that final newline is the body.
            http=${resp##*$'\n'}
            printf '%s' "${resp%$'\n'*}"
            case "$http" in
              2*|3*) return 0 ;;
              401|403)
                echo "::error::Gitea returned HTTP $http for $method ${url%%\?*} — the NIGHTLY_AUDIT_TOKEN secret is missing, expired, or lacks issue read+write scope; recreate the renovate_bot PAT and update the secret." >&2
                return 1 ;;
              000|"")
                # curl reports 000 when no HTTP response was received at all.
                echo "::error::No HTTP response from Gitea for $method ${url%%\?*} — curl could not complete the request (DNS, connection, TLS or timeout)." >&2
                return 1 ;;
              *)
                echo "::error::Gitea returned HTTP ${http:-(none)} for $method ${url%%\?*}." >&2
                return 1 ;;
            esac
          }

          # --- Self-test (mirrors ci.yml §Assert pattern) ---
          # Executed ahead of every real API call, so a logic regression is
          # reported immediately and by name:
          # (a) the jq title matcher the dedupe step relies on. Only the regex
          #     is proven here; the create-vs-update decision is exercised by
          #     the workflow_dispatch AC.
          # (b) the api helper's HTTP-status handling. curl is replaced by a
          #     shell function inside a subshell, so no network is needed and
          #     the mock cannot leak into the real calls further down. A 2xx
          #     must hand back the body; a >=400 must fail with an ::error::
          #     rather than an opaque exit 22.

          # (a1) the marker title itself must match.
          if ! echo "{\"title\": \"${MARKER}\"}" \
              | jq -e --arg m "$MARKER" '.title | test($m; "i")' > /dev/null; then
            echo "FAIL: self-test — jq test() missed tracking issue title"
            exit 1
          fi

          # (a2) an unrelated security title must not match.
          if ! echo '{"title": "fix(deps): update dependency esbuild (CVE-2025-12345)"}' \
              | jq -e --arg m "$MARKER" '.title | test($m; "i") | not' > /dev/null; then
            echo "FAIL: self-test — jq test() incorrectly matched unrelated title"
            exit 1
          fi

          # (b1) HTTP 200 → body comes back unchanged.
          if ! (
            curl() { printf 'OK\n200'; }
            [ "$(api GET selftest)" = "OK" ]
          ); then
            echo "FAIL: self-test — api helper dropped body on HTTP 200"
            exit 1
          fi

          # (b2) HTTP 401 → non-zero return plus an ::error:: on stderr.
          if ! (
            curl() { printf 'nope\n401'; }
            api GET selftest >/dev/null 2>/tmp/api_selftest_err && exit 1
            grep -q '::error::' /tmp/api_selftest_err
          ); then
            echo "FAIL: self-test — api helper did not emit ::error:: on HTTP 401"
            exit 1
          fi

          echo "Self-test passed."

          # --- Run audit ---
          # No npm ci — audit works from the lockfile, so nothing is installed.
          # It does need the registry: npm submits a description of the
          # dependencies and gets the advisory report back.
          # Remove any report left by an earlier run first, so a failure that
          # never reaches the redirect cannot be judged on stale data.
          rm -f /tmp/audit.json
          set +e
          (cd frontend && npm audit --audit-level=high --json > /tmp/audit.json)
          AUDIT_EXIT=$?
          set -e

          # A non-zero exit only means "advisories found" when npm actually
          # produced an audit report (`vulnerabilities` in the current report
          # format, `advisories` in the older one). A missing npm, an
          # unreachable registry or a lockfile error also exits non-zero, but
          # leaves an empty file or an error object. Fail the step in that
          # case without filing a misleading "high-severity advisory" issue.
          if [ "$AUDIT_EXIT" -ne 0 ] \
            && ! jq -e 'type == "object" and (has("vulnerabilities") or has("advisories"))' \
              /tmp/audit.json > /dev/null 2>&1; then
            echo "::error::npm audit exited with status $AUDIT_EXIT but produced no audit report (npm missing, registry unreachable, or lockfile problem) — not filing a tracking issue."
            exit "$AUDIT_EXIT"
          fi

          # Non-zero audit exit: update the existing tracking issue or open a
          # new one, then fail the step with the audit's own exit status.
          # Zero exit: emit the heartbeat only.
          if [ "$AUDIT_EXIT" -ne 0 ]; then
            # --- Build issue body with jq (never string-concat advisory text) ---
            # Advisory overview/title text is registry-controlled; string-concat
            # would be an injection/escaping vector into the API body. Truncate
            # raw excerpt to 500 chars so a pathological overview can't produce
            # a multi-MB request body.
            # The body lists every package whose severity is high or critical,
            # links this run, and appends the raw excerpt. It is only sent when
            # a new issue is created; the update path appends the run URL alone.
            ISSUE_BODY=$(jq -r \
              --arg run_url "$RUN_URL" \
              '
              (.vulnerabilities // {}) as $vulns |
              ($vulns | to_entries |
                map(select(.value.severity == "high" or .value.severity == "critical")) |
                map("- **" + .key + "** (" + .value.severity + ")") |
                if length > 0 then join("\n") else "_See raw output for details._" end) as $pkg_list |
              "## npm audit: high/critical advisories\n\n" + $pkg_list +
              "\n\n**Run:** " + $run_url +
              "\n\n<details><summary>Raw audit excerpt (first 500 chars)</summary>\n\n```\n" +
              (tostring | .[0:500]) +
              "\n```\n\n</details>"
              ' /tmp/audit.json)

            # --- Dedupe: fetch open security issues, match by title marker ---
            # Renovate vuln PRs also carry the "security" label, so >1 open
            # "security" issue WILL occur. Title-match (not just label) ensures
            # we deduplicate only our own tracking issue.
            # Only one page of up to 50 issues is requested.
            OPEN_ISSUES=$(api GET \
              "${GITEA_URL}/api/v1/repos/${REPO}/issues?state=open&type=issues&labels=security&limit=50")

            # Keep the issues whose title matches the marker, oldest first.
            MATCHED=$(echo "$OPEN_ISSUES" | jq \
              --arg m "$MARKER" \
              '[.[] | select(.title | test($m; "i"))] | sort_by(.created_at)')
            MATCH_COUNT=$(echo "$MATCHED" | jq 'length')

            if [ "$MATCH_COUNT" -gt 0 ]; then
              # Patch the oldest matched issue (append run URL to body).
              ISSUE_NUMBER=$(echo "$MATCHED" | jq -r '.[0].number')
              EXISTING_BODY=$(echo "$MATCHED" | jq -r '.[0].body')
              # jq does the concatenation and the JSON encoding, so the
              # existing body is never spliced into the payload by hand.
              NEW_BODY=$(jq -n \
                --arg existing "$EXISTING_BODY" \
                --arg run_url "$RUN_URL" \
                '$existing + "\n\n---\n\nUpdated by run: " + $run_url')
              PAYLOAD=$(jq -n --arg body "$NEW_BODY" '{"body": $body}')
              api PATCH \
                "${GITEA_URL}/api/v1/repos/${REPO}/issues/${ISSUE_NUMBER}" \
                -H "Content-Type: application/json" \
                -d "$PAYLOAD" > /dev/null
              echo "Updated tracking issue #${ISSUE_NUMBER}"
            else
              # Closed prior issue that recurs → new issue (not reopened).
              # A re-opened issue would obscure when the advisory was re-discovered.
              PAYLOAD=$(jq -n \
                --arg title "$MARKER" \
                --arg body "$ISSUE_BODY" \
                '{"title": $title, "body": $body}')
              CREATED=$(api POST \
                "${GITEA_URL}/api/v1/repos/${REPO}/issues" \
                -H "Content-Type: application/json" \
                -d "$PAYLOAD")
              NEW_NUMBER=$(echo "$CREATED" | jq -r '.number')
              echo "Opened new tracking issue #${NEW_NUMBER}"

              # Labels are ignored on issue create in Gitea — add in a follow-up call.
              # The label names are resolved to ids from one page of up to 50
              # repo labels.
              # NOTE(review): the dedupe query above filters on labels=security.
              # If the "security" label is missing from that page, or this call
              # fails after the issue was created, the next run will presumably
              # not find this issue and will open another — confirm.
              LABEL_IDS=$(api GET \
                "${GITEA_URL}/api/v1/repos/${REPO}/labels?limit=50" \
                | jq '[.[] | select(.name == "security" or .name == "devops" or .name == "P1-high") | .id]')
              api POST \
                "${GITEA_URL}/api/v1/repos/${REPO}/issues/${NEW_NUMBER}/labels" \
                -H "Content-Type: application/json" \
                -d "{\"labels\": $LABEL_IDS}" > /dev/null
            fi

            # Propagate the audit's own exit status so the step fails
            # whenever the audit did.
            exit "$AUDIT_EXIT"

          else
            # --- Heartbeat: proves the job ran and found nothing ---
            # "No issue created" is only meaningful evidence when paired with a
            # visible positive signal. Without this, a never-ran job is
            # indistinguishable from a clean run.
            #
            # $GITHUB_STEP_SUMMARY availability is unproven on this runner
            # (act_runner populates it, but this is the first run to verify it).
            # Guard before use so an unset variable does not fail the clean-path.
            MSG="✅ npm audit clean $(date -u)"
            if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
              echo "$MSG" >> "$GITHUB_STEP_SUMMARY"
            fi
            # Always echoed to the step log as well, summary file or not.
            echo "$MSG"
          fi