2026-06-02 19:57:20 +02:00
9 changed files with 432 additions and 253 deletions
--- a/.gitea/actions/deploy-obs/action.yml
+++ b/.gitea/actions/deploy-obs/action.yml
@@ -0,0 +1,127 @@
 name: Deploy observability stack
 description: >-
  Deploy observability configs + secrets to /opt/familienarchiv, validate the
  compose config, start the stack, and assert the five healthchecked services
  are healthy. Per-environment values arrive as inputs.
 inputs:
  grafana_admin_password:
    description: Grafana admin password (secret)
    required: true
  grafana_db_password:
    description: Read-only grafana_reader DB role password (secret, issue #651)
    required: true
  glitchtip_secret_key:
    description: GlitchTip Django secret key (secret)
    required: true
  postgres_password:
    description: PostgreSQL password for the environment (secret)
    required: true
  postgres_host:
    description: >-
      Compose project + service hostname, e.g. archiv-staging-db-1. Derived
      from the Compose project name and service name — a project rename
      requires updating the caller's value. Plain input, not a secret.
    required: true
 runs:
  using: composite
  steps:
    - name: Deploy observability configs
      shell: bash
      # Copies the compose file and config tree from the workspace checkout
      # into /opt/familienarchiv/ — the permanent location that persists
      # between CI runs. Containers started in a later step bind-mount
      # from there, so a future workspace wipe cannot corrupt a running
      # config file.
      #
      # obs-secrets.env is written fresh from Gitea secrets on every run so
      # Gitea is always the single source of truth for secret rotation.
      # Non-secret config lives in infra/observability/obs.env (tracked in git).
      #
      # secrets.* is NOT available inside a composite action, so the values
      # arrive as inputs mapped to env: below and are referenced as $VAR in
      # the heredoc. The delimiter MUST stay unquoted (<<EOF, not <<'EOF') so
      # the shell expands $VAR — a quoted delimiter would write the literal
      # string "$GRAFANA_ADMIN_PASSWORD" and `config --quiet` would still pass
      # (the var is present, just wrong). Do not stage these into intermediate
      # variables either, or Gitea log masking can be lost.
      env:
        GRAFANA_ADMIN_PASSWORD: ${{ inputs.grafana_admin_password }}
        GRAFANA_DB_PASSWORD: ${{ inputs.grafana_db_password }}
        GLITCHTIP_SECRET_KEY: ${{ inputs.glitchtip_secret_key }}
        POSTGRES_PASSWORD: ${{ inputs.postgres_password }}
        POSTGRES_HOST: ${{ inputs.postgres_host }}
      run: |
        set -euo pipefail
        rm -rf /opt/familienarchiv/infra/observability
        mkdir -p /opt/familienarchiv/infra/observability
        cp -r infra/observability/. /opt/familienarchiv/infra/observability/
        cp docker-compose.observability.yml /opt/familienarchiv/
        # Lock the secrets file down BEFORE any secret is written into it.
        # `cat >` alone would create the file with the default umask (typically
        # world-readable) and leave it that way until a trailing chmod — or
        # forever if the guard below exits 1 first. Truncating first means a
        # leftover file with loose permissions is emptied before chmod runs, and
        # the redirect further down then reuses the already-restricted file.
        # The umask is scoped to a subshell so it does not affect any other
        # file this step creates.
        ( umask 077 && : > /opt/familienarchiv/obs-secrets.env )
        chmod 600 /opt/familienarchiv/obs-secrets.env
        cat > /opt/familienarchiv/obs-secrets.env <<EOF
        GRAFANA_ADMIN_PASSWORD=$GRAFANA_ADMIN_PASSWORD
        GRAFANA_DB_PASSWORD=$GRAFANA_DB_PASSWORD
        GLITCHTIP_SECRET_KEY=$GLITCHTIP_SECRET_KEY
        POSTGRES_PASSWORD=$POSTGRES_PASSWORD
        POSTGRES_HOST=$POSTGRES_HOST
        EOF
        # Five-key non-empty guard: a bare presence check matches an empty
        # `KEY=` line, so assert each key has a value. Fail loudly on any
        # missing/empty key rather than starting the stack with broken auth.
        for key in GRAFANA_ADMIN_PASSWORD GRAFANA_DB_PASSWORD GLITCHTIP_SECRET_KEY POSTGRES_PASSWORD POSTGRES_HOST; do
          grep -Eq "^${key}=.+" /opt/familienarchiv/obs-secrets.env \
            || { echo "::error::obs-secrets.env missing or empty: ${key}"; exit 1; }
        done
        # Belt-and-braces: the file is already owner-only (see above); re-assert
        # the mode as the last operation so the final state is explicit.
        chmod 600 /opt/familienarchiv/obs-secrets.env
    - name: Validate observability compose config
      shell: bash
      # Dry-run before any container starts: `config --quiet` resolves every
      # variable substitution and reports missing required keys, which also
      # surfaces undefined variables and YAML errors in the config files the
      # previous step just copied.
      # Env-file precedence: obs.env (git-tracked defaults) is passed first and
      # obs-secrets.env (CI-written secrets) second; on duplicate keys the later
      # file wins. POSTGRES_HOST is environment-specific and supplied only by
      # obs-secrets.env — obs.env documents it but deliberately does not set a
      # value.
      run: |
        obs_compose=(
          docker compose
          -f /opt/familienarchiv/docker-compose.observability.yml
          --env-file /opt/familienarchiv/infra/observability/obs.env
          --env-file /opt/familienarchiv/obs-secrets.env
        )
        "${obs_compose[@]}" config --quiet
    - name: Start observability stack
      shell: bash
      # Every path is absolute so bind mounts resolve to stable host paths that
      # survive workspace wipes between runs (see ADR-016).
      # Env-file precedence matches the validate step: obs.env (git-tracked,
      # non-secret) first, obs-secrets.env (written fresh from Gitea secrets
      # above) second — the later file wins on duplicate keys.
      run: |
        obs_compose=(
          docker compose
          -f /opt/familienarchiv/docker-compose.observability.yml
          --env-file /opt/familienarchiv/infra/observability/obs.env
          --env-file /opt/familienarchiv/obs-secrets.env
        )
        "${obs_compose[@]}" up -d --wait --remove-orphans
    - name: Assert observability stack health
      shell: bash
      # `docker compose up --wait` only waits on services that declare a
      # healthcheck. obs-promtail, obs-cadvisor, obs-node-exporter, and
      # obs-glitchtip-worker have none — they count as "started" as soon as the
      # process runs. This step therefore asserts explicitly that the five
      # healthchecked critical services report "healthy" before the smoke test
      # proceeds. Every failing service is reported before the step exits, so
      # one run shows the full picture.
      run: |
        set -e
        failed=0
        for container in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
          # A failing inspect (e.g. no such container) is reported as "missing".
          health=$(docker inspect "$container" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
          [ "$health" = "healthy" ] && continue
          echo "::error::$container is not healthy (status: $health)"
          failed=1
        done
        if [ "$failed" -ne 0 ]; then
          exit 1
        fi
        echo "All critical observability services are healthy"
--- a/.gitea/actions/reload-caddy/action.yml
+++ b/.gitea/actions/reload-caddy/action.yml
@@ -0,0 +1,41 @@
 name: Reload Caddy
 description: >-
  Reload the host Caddy service from a DooD job container via a privileged
  sibling container and nsenter. No inputs.
 runs:
  using: composite
  steps:
    - name: Reload Caddy
      shell: bash
      # Purpose: make the host's Caddy service pick up the current config
      # before the public surface is smoke-tested.
      #
      # Apply any committed Caddyfile changes before smoke-testing the
      # public surface. Without this step, a Caddyfile edit lands in the
      # repo but Caddy keeps serving the previous config until someone
      # reloads it manually — the smoke test would then catch a stale
      # header or a still-proxied /actuator route rather than confirming
      # the current config is live.
      #
      # The runner executes job steps inside Docker containers (DooD).
      # `systemctl` is not present in container images and cannot reach
      # the host's systemd directly. We use the Docker socket (mounted
      # into every job container via runner-config.yaml) to spin up a
      # privileged sibling container in the host PID namespace; nsenter
      # then enters the host's namespaces so systemctl talks to the real
      # host systemd daemon. No sudoers entry is required — the Docker
      # socket already grants root-equivalent host access.
      #
      # nsenter flags: `-t 1` targets PID 1 (the host init, visible because of
      # --pid=host); `-m -u -n -p -i` enter its mount, UTS, network, PID and
      # IPC namespaces respectively.
      #
      # Alpine is used: ~5 MB vs ~70 MB for ubuntu, no unnecessary
      # tooling, and the digest is pinned so any upstream change requires
      # an explicit bump PR. util-linux (which ships nsenter) is installed
      # at run time; apk add takes ~1 s on the warm VPS cache.
      #
      # `reload` not `restart`: reload asks the running Caddy to re-read its
      # config in-process without dropping TLS connections. `restart`
      # would briefly stop the service, losing in-flight requests.
      # NOTE(review): the exact mechanism (signal vs. `caddy reload` through
      # the admin API) is whatever the host unit's ExecReload defines — confirm
      # against the installed caddy.service.
      #
      # If Caddy is not running this step fails fast before the smoke test
      # issues a misleading "port 443 refused" error.
      run: |
        docker run --rm --privileged --pid=host \
          alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
          sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
--- a/.gitea/actions/smoke-test/action.yml
+++ b/.gitea/actions/smoke-test/action.yml
@@ -0,0 +1,58 @@
 name: Smoke test
 description: >-
  Verify the deployed public surface (login reachable, HSTS pinned,
  Permissions-Policy present, /actuator blocked) against a given vhost.
 inputs:
  host:
    description: Public vhost to smoke-test, e.g. staging.raddatz.cloud
    required: true
 runs:
  using: composite
  steps:
    - name: Smoke test deployed environment
      shell: bash
      # Healthchecks confirm containers are healthy; they do NOT confirm the
      # public surface works. This step catches: Caddy not reloaded, HSTS
      # header dropped, /actuator block bypassed.
      #
      # --resolve pins the public host to the Docker bridge gateway IP
      # (the host) so we do NOT depend on hairpin NAT on the host router.
      # 127.0.0.1 cannot be used: job containers run in bridge network mode
      # (runner-config.yaml), so 127.0.0.1 is the container's loopback, not
      # the host's. The bridge gateway IS the host; Caddy binds 0.0.0.0:443
      # and is therefore reachable from the container via that IP.
      # SNI still uses the public hostname so the TLS cert validates correctly.
      #
      # --resolve is stored as a Bash array so "${RESOLVE[@]}" expands to two
      # separate arguments; a quoted string would pass the flag and its value
      # as one token and curl would reject it as an unknown option.
      #
      # Gateway detection reads /proc/net/route (no package required) instead
      # of `ip route` to avoid a dependency on iproute2. Field $2=="00000000"
      # is the default route; field $3 is the gateway as a little-endian 32-bit
      # hex value. awk only extracts that field; the hex-to-dotted-decimal
      # decode is done with Bash's printf so the step does not rely on
      # strtonum(), which is a gawk extension and is missing from mawk and
      # busybox awk.
      #
      # Every check prints its own ::error:: annotation before exiting; a bare
      # `set -e` abort would fail the step without saying which check broke.
      env:
        HOST: ${{ inputs.host }}
      run: |
        set -e
        [ -n "$HOST" ] || { echo "::error::smoke-test: the 'host' input is empty"; exit 1; }
        URL="https://$HOST"
        # `|| true` keeps set -e from aborting silently if awk fails; the
        # format guard on the next line then reports the problem.
        GW_HEX=$(awk 'NR>1 && $2=="00000000"{print $3; exit}' /proc/net/route || true)
        [[ "$GW_HEX" =~ ^[0-9A-Fa-f]{8}$ ]] || { echo "::error::could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
        # Little-endian: the last hex byte is the first octet.
        HOST_IP=$(printf '%d.%d.%d.%d' "0x${GW_HEX:6:2}" "0x${GW_HEX:4:2}" "0x${GW_HEX:2:2}" "0x${GW_HEX:0:2}")
        RESOLVE=(--resolve "$HOST:443:$HOST_IP")
        echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
        curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null \
          || { echo "::error::$URL/login is not reachable"; exit 1; }
        # Fetch the response headers of / once and run both header checks
        # against the same response.
        HEADERS=$(curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/") \
          || { echo "::error::could not fetch response headers from $URL/"; exit 1; }
        # Pin the preload-list-eligible HSTS value, not just header presence:
        # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
        # fail this check rather than pass it silently.
        grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' <<<"$HEADERS" \
          || { echo "::error::Strict-Transport-Security header missing or weaker than max-age=31536000; includeSubDomains; preload"; exit 1; }
        # Permissions-Policy denies APIs the app does not use (camera,
        # microphone, geolocation). A regression that loosens or drops the
        # header now fails the smoke step.
        grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)' <<<"$HEADERS" \
          || { echo "::error::Permissions-Policy header missing or does not deny camera, microphone and geolocation"; exit 1; }
        # `|| true` inside the substitution: on a transport failure curl still
        # prints 000 via -w, and the check below reports it instead of set -e
        # killing the script at the assignment.
        status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health" || true)
        [ "$status" = "404" ] || { echo "::error::expected 404 from /actuator/health, got $status"; exit 1; }
        echo "All smoke checks passed"
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -108,6 +108,32 @@ jobs:
            exit 1
          fi
      - name: Assert deploy-obs writes obs-secrets.env via an unquoted heredoc (#603)
        shell: bash
        run: |
          # Inside a composite action, secrets arrive as $VAR from env: (secrets.*
          # is unavailable there), so the obs-secrets.env heredoc MUST use an
          # unquoted delimiter (<<EOF) for $VAR to expand. A quoted delimiter
          # (<<'EOF') would write the literal string "$GRAFANA_ADMIN_PASSWORD",
          # and the action's five-key non-empty guard would STILL pass (the line
          # is present, just wrong). This guard enforces the invariant in CI so a
          # future re-quote cannot ship broken obs auth green. See ADR-029 / #603.
          action='.gitea/actions/deploy-obs/action.yml'
          # PCRE (grep -P): "obs-secrets.env", optional whitespace, "<<" or "<<-",
          # optional whitespace, then a single (\x27) or double (\x22) quote that
          # opens the delimiter. Hex escapes avoid nesting literal quote
          # characters inside this single-quoted shell string.
          quoted='obs-secrets\.env\s*<<-?\s*[\x27\x22]'
          # Self-test: the regex must catch a quoted delimiter and ignore the unquoted one.
          printf "obs-secrets.env <<'EOF'\n" | grep -qP "$quoted" \
            || { echo "FAIL: guard self-test — regex missed the quoted <<'EOF' form"; exit 1; }
          # -v selects NON-matching lines, so this succeeds only when the
          # unquoted sample line is not matched by the regex.
          printf 'obs-secrets.env <<EOF\n' | grep -qvP "$quoted" \
            || { echo "FAIL: guard self-test — regex wrongly flagged the unquoted <<EOF form"; exit 1; }
          # Positive: the unquoted heredoc must be present at all.
          grep -qP 'obs-secrets\.env\s*<<-?EOF\b' "$action" \
            || { echo "::error::$action no longer writes obs-secrets.env via an unquoted <<EOF heredoc (ADR-029 / #603)"; exit 1; }
          # Negative: never a quoted delimiter on the obs-secrets.env heredoc.
          # -n (without -q) prints the offending line and its number to the log.
          if grep -nP "$quoted" "$action"; then
            echo "::error::$action writes obs-secrets.env with a quoted heredoc delimiter — secrets would be written as literal \$VAR strings. Use unquoted <<EOF (ADR-029 / #603)."
            exit 1
          fi
      - name: Run unit and component tests with coverage
        shell: bash
        run: |
--- a/.gitea/workflows/nightly.yml
+++ b/.gitea/workflows/nightly.yml
@@ -23,6 +23,11 @@ name: nightly
 #   - host ports:   backend 8081, frontend 3001
 #   - profile:      staging (starts mailpit instead of a real SMTP relay)
 #
 # The obs-stack deploy, Caddy reload, and smoke test are shared with
 # release.yml via the composite actions under .gitea/actions/ (ADR-029).
 # actions/checkout MUST stay the first step: a local `uses: ./…` action
 # only exists on disk after checkout.
 #
 # Required Gitea secrets:
 #   STAGING_POSTGRES_PASSWORD
 #   STAGING_MINIO_PASSWORD
@@ -55,6 +60,8 @@ jobs:
    # for the same repo is within that boundary.
    runs-on: ubuntu-latest
    steps:
      # MUST be first: the composite actions below live under .gitea/actions/
      # and only exist on disk once the repo is checked out (ADR-029).
      - uses: actions/checkout@v4
      - name: Write staging env file
@@ -92,6 +99,7 @@ jobs:
        # `compose config` renders both shorthand and longform mounts as
        # `target: /import` + `read_only: true`, so we assert against
        # the rendered form rather than the raw source YAML.
        # App-compose check (not obs), nightly-only — stays inline.
        run: |
          set -e
          docker compose \
@@ -128,150 +136,21 @@ jobs:
            --profile staging \
            up -d --wait --remove-orphans
-      - name: Deploy observability configs
+      # POSTGRES_HOST is derived from the Compose project name (archiv-staging)
-        # Copies the compose file and config tree from the workspace checkout
+      # and service name (db). A project rename requires updating this value.
-        # into /opt/familienarchiv/ — the permanent location that persists
+      - uses: ./.gitea/actions/deploy-obs
-        # between CI runs. Containers started in the next step bind-mount
+        with:
-        # from there, so a future workspace wipe cannot corrupt a running
+          grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
-        # config file.
+          grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
-        #
+          glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}
-        # obs-secrets.env is written fresh from Gitea secrets on every run so
+          postgres_password: ${{ secrets.STAGING_POSTGRES_PASSWORD }}
-        # Gitea is always the single source of truth for secret rotation.
+          postgres_host: archiv-staging-db-1
        # Non-secret config lives in infra/observability/obs.env (tracked in git).
        run: |
          rm -rf /opt/familienarchiv/infra/observability
          mkdir -p /opt/familienarchiv/infra/observability
          cp -r infra/observability/. /opt/familienarchiv/infra/observability/
          cp docker-compose.observability.yml /opt/familienarchiv/
          cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
          GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
          GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
          GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
          POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
          POSTGRES_HOST=archiv-staging-db-1
          EOF
          # Note: POSTGRES_HOST is derived from the Compose project name (archiv-staging)
          # and service name (db). A project rename requires updating this value.
          chmod 600 /opt/familienarchiv/obs-secrets.env
-      - name: Validate observability compose config
+      - uses: ./.gitea/actions/reload-caddy
        # Dry-run: resolves all variable substitutions and reports any missing
        # required keys before containers start. Catches undefined variables and
        # YAML errors in config files updated by the previous step.
        # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env
        # second (CI-written secrets). Later files win on duplicate keys, so
        # obs-secrets.env overrides POSTGRES_HOST set in obs.env.
        run: |
          docker compose \
            -f /opt/familienarchiv/docker-compose.observability.yml \
            --env-file /opt/familienarchiv/infra/observability/obs.env \
            --env-file /opt/familienarchiv/obs-secrets.env \
            config --quiet
-      - name: Start observability stack
+      - uses: ./.gitea/actions/smoke-test
-        # Runs with absolute paths so bind mounts resolve to stable host paths
+        with:
-        # that survive workspace wipes between nightly runs (see ADR-016).
+          host: staging.raddatz.cloud
        # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env
        # (written fresh from Gitea secrets above). --env-file order: obs.env first,
        # obs-secrets.env second — later file wins on duplicate keys.
        run: |
          docker compose \
            -f /opt/familienarchiv/docker-compose.observability.yml \
            --env-file /opt/familienarchiv/infra/observability/obs.env \
            --env-file /opt/familienarchiv/obs-secrets.env \
            up -d --wait --remove-orphans
      - name: Assert observability stack health
        # docker compose up --wait covers services WITH healthcheck directives only.
        # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have
        # no healthcheck — they are considered "started" as soon as the process runs.
        # This step explicitly asserts the five healthchecked critical services are
        # healthy before the smoke test proceeds.
        run: |
          set -e
          unhealthy=""
          for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
            status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
            if [ "$status" != "healthy" ]; then
              echo "::error::$svc is not healthy (status: $status)"
              unhealthy="$unhealthy $svc"
            fi
          done
          [ -z "$unhealthy" ] || exit 1
          echo "All critical observability services are healthy"
      - name: Reload Caddy
        # Apply any committed Caddyfile changes before smoke-testing the
        # public surface. Without this step, a Caddyfile edit lands in the
        # repo but Caddy keeps serving the previous config until someone
        # reloads it manually — the smoke test would then catch a stale
        # header or a still-proxied /actuator route rather than confirming
        # the current config is live.
        #
        # The runner executes job steps inside Docker containers (DooD).
        # `systemctl` is not present in container images and cannot reach
        # the host's systemd directly. We use the Docker socket (mounted
        # into every job container via runner-config.yaml) to spin up a
        # privileged sibling container in the host PID namespace; nsenter
        # then enters the host's namespaces so systemctl talks to the real
        # host systemd daemon. No sudoers entry is required — the Docker
        # socket already grants root-equivalent host access.
        #
        # Alpine is used: ~5 MB vs ~70 MB for ubuntu, no unnecessary
        # tooling, and the digest is pinned so any upstream change requires
        # an explicit bump PR. util-linux (which ships nsenter) is installed
        # at run time; apk add takes ~1 s on the warm VPS cache.
        #
        # `reload` not `restart`: reload sends SIGHUP so Caddy re-reads its
        # config in-process without dropping TLS connections. `restart`
        # would briefly stop the service, losing in-flight requests.
        #
        # If Caddy is not running this step fails fast before the smoke test
        # issues a misleading "port 443 refused" error.
        run: |
          docker run --rm --privileged --pid=host \
            alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
            sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
      - name: Smoke test deployed environment
        # Healthchecks confirm containers are healthy; they do NOT confirm the
        # public surface works. This step catches: Caddy not reloaded, HSTS
        # header dropped, /actuator block bypassed.
        #
        # --resolve pins staging.raddatz.cloud to the Docker bridge gateway IP
        # (the host) so we do NOT depend on hairpin NAT on the host router.
        # 127.0.0.1 cannot be used: job containers run in bridge network mode
        # (runner-config.yaml), so 127.0.0.1 is the container's loopback, not
        # the host's. The bridge gateway IS the host; Caddy binds 0.0.0.0:443
        # and is therefore reachable from the container via that IP.
        # SNI still uses the public hostname so the TLS cert validates correctly.
        #
        # Gateway detection reads /proc/net/route (always present, no package
        # required) instead of `ip route` to avoid a dependency on iproute2.
        # Field $2=="00000000" is the default route; field $3 is the gateway as
        # a little-endian 32-bit hex value which awk decodes to dotted-decimal.
        run: |
          set -e
          HOST="staging.raddatz.cloud"
          URL="https://$HOST"
          HOST_IP=$(awk 'NR>1 && $2=="00000000"{h=$3;printf "%d.%d.%d.%d\n",strtonum("0x"substr(h,7,2)),strtonum("0x"substr(h,5,2)),strtonum("0x"substr(h,3,2)),strtonum("0x"substr(h,1,2));exit}' /proc/net/route)
          [ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
          RESOLVE=(--resolve "$HOST:443:$HOST_IP")
          echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
          curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null
          # Pin the preload-list-eligible HSTS value, not just header presence:
          # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
          # fail this check rather than pass it silently.
          curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
            | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
          # Permissions-Policy denies APIs the app does not use (camera,
          # microphone, geolocation). A regression that loosens or drops the
          # header now fails the smoke step.
          curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
            | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
          status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
          [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
          echo "All smoke checks passed"
      - name: Cleanup env file
        # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
--- a/.gitea/workflows/release.yml
+++ b/.gitea/workflows/release.yml
@@ -23,6 +23,11 @@ name: release
 #   - host ports:   backend 8080, frontend 3000
 #   - profile:      (none) — mailpit is excluded; real SMTP relay is used
 #
 # The obs-stack deploy, Caddy reload, and smoke test are shared with
 # nightly.yml via the composite actions under .gitea/actions/ (ADR-029).
 # actions/checkout MUST stay the first step: a local `uses: ./…` action
 # only exists on disk after checkout.
 #
 # Required Gitea secrets:
 #   PROD_POSTGRES_PASSWORD
 #   PROD_MINIO_PASSWORD
@@ -53,6 +58,8 @@ jobs:
    # advertised label of our single-tenant self-hosted runner.
    runs-on: ubuntu-latest
    steps:
      # MUST be first: the composite actions below live under .gitea/actions/
      # and only exist on disk once the repo is checked out (ADR-029).
      - uses: actions/checkout@v4
      - name: Write production env file
@@ -100,117 +107,21 @@ jobs:
            --env-file .env.production \
            up -d --wait --remove-orphans
-      - name: Deploy observability configs
+      # POSTGRES_HOST is derived from the Compose project name (archiv-production)
-        # Mirrors the nightly approach: copies obs compose file and config tree
+      # and service name (db). A project rename requires updating this value.
-        # to /opt/familienarchiv/ (permanent path, survives workspace wipes — ADR-016),
+      - uses: ./.gitea/actions/deploy-obs
-        # then writes obs-secrets.env fresh from Gitea secrets.
+        with:
-        # Non-secret config lives in infra/observability/obs.env (tracked in git).
+          grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
-        run: |
+          grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
-          rm -rf /opt/familienarchiv/infra/observability
+          glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}
-          mkdir -p /opt/familienarchiv/infra/observability
+          postgres_password: ${{ secrets.PROD_POSTGRES_PASSWORD }}
-          cp -r infra/observability/. /opt/familienarchiv/infra/observability/
+          postgres_host: archiv-production-db-1
          cp docker-compose.observability.yml /opt/familienarchiv/
          cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
          GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
          GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
          GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
          POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }}
          POSTGRES_HOST=archiv-production-db-1
          EOF
          # Note: POSTGRES_HOST is derived from the Compose project name (archiv-production)
          # and service name (db). A project rename requires updating this value.
          chmod 600 /opt/familienarchiv/obs-secrets.env
-      - name: Validate observability compose config
+      - uses: ./.gitea/actions/reload-caddy
        # Dry-run: resolves all variable substitutions and reports any missing
        # required keys before containers start. Catches undefined variables and
        # YAML errors in config files updated by the previous step.
        # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env
        # second (CI-written secrets). Later files win on duplicate keys, so
        # obs-secrets.env overrides POSTGRES_HOST set in obs.env.
        # Keep in sync with the equivalent step in nightly.yml (#603).
        run: |
          docker compose \
            -f /opt/familienarchiv/docker-compose.observability.yml \
            --env-file /opt/familienarchiv/infra/observability/obs.env \
            --env-file /opt/familienarchiv/obs-secrets.env \
            config --quiet
-      - name: Start observability stack
+      - uses: ./.gitea/actions/smoke-test
-        # Runs with absolute paths so bind mounts resolve to stable host paths
+        with:
-        # that survive workspace wipes between runs (see ADR-016).
+          host: archiv.raddatz.cloud
        # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env
        # (written fresh from Gitea secrets above). --env-file order: obs.env first,
        # obs-secrets.env second — later file wins on duplicate keys.
        # Keep in sync with the equivalent step in nightly.yml (#603).
        run: |
          docker compose \
            -f /opt/familienarchiv/docker-compose.observability.yml \
            --env-file /opt/familienarchiv/infra/observability/obs.env \
            --env-file /opt/familienarchiv/obs-secrets.env \
            up -d --wait --remove-orphans
      - name: Assert observability stack health
        # docker compose up --wait covers services WITH healthcheck directives only.
        # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have
        # no healthcheck — they are considered "started" as soon as the process runs.
        # This step explicitly asserts the five healthchecked critical services are
        # healthy before the smoke test proceeds.
        # Keep in sync with the equivalent step in nightly.yml (#603).
        run: |
          set -e
          unhealthy=""
          for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
            status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
            if [ "$status" != "healthy" ]; then
              echo "::error::$svc is not healthy (status: $status)"
              unhealthy="$unhealthy $svc"
            fi
          done
          [ -z "$unhealthy" ] || exit 1
          echo "All critical observability services are healthy"
      - name: Reload Caddy
        # See nightly.yml — same rationale and mechanism: DooD job containers
        # cannot call systemctl directly; nsenter via a privileged sibling
        # container reaches the host systemd. Must run after deploy (so the
        # latest Caddyfile is on disk) and before the smoke test (so the
        # public surface reflects the current config). Alpine with pinned
        # digest; reload not restart — see nightly.yml for full rationale.
        run: |
          docker run --rm --privileged --pid=host \
            alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
            sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
      - name: Smoke test deployed environment
        # See nightly.yml — same three checks, against the prod vhost.
        # --resolve stored as a Bash array so "${RESOLVE[@]}" expands to two
        # separate arguments; a quoted string would pass the flag and its value
        # as one token and curl would reject it as an unknown option.
        # Gateway detection via /proc/net/route — no iproute2 dependency.
        # See nightly.yml for the full network topology explanation.
        run: |
          set -e
          HOST="archiv.raddatz.cloud"
          URL="https://$HOST"
          HOST_IP=$(awk 'NR>1 && $2=="00000000"{h=$3;printf "%d.%d.%d.%d\n",strtonum("0x"substr(h,7,2)),strtonum("0x"substr(h,5,2)),strtonum("0x"substr(h,3,2)),strtonum("0x"substr(h,1,2));exit}' /proc/net/route)
          [ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
          RESOLVE=(--resolve "$HOST:443:$HOST_IP")
          echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
          curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null
          # Pin the preload-list-eligible HSTS value, not just header presence:
          # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
          # fail this check rather than pass it silently.
          curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
            | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
          # Permissions-Policy denies APIs the app does not use (camera,
          # microphone, geolocation). A regression that loosens or drops the
          # header now fails the smoke step.
          curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
            | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
          status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
          [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
          echo "All smoke checks passed"
      - name: Cleanup env file
        # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
--- a/docs/adr/029-composite-actions-for-cross-workflow-deploy-logic.md
+++ b/docs/adr/029-composite-actions-for-cross-workflow-deploy-logic.md
@@ -0,0 +1,69 @@
 # ADR-029: Composite actions for cross-workflow deploy logic
 ## Status
 Accepted
 ## Context
 The `nightly.yml` (staging) and `release.yml` (production) workflows shared three
 blocks of deploy logic verbatim: the four observability-stack steps (deploy configs,
 validate, start, assert health), the Caddy reload step, and the public-surface smoke
 test. The only per-environment differences were secret names (`STAGING_*` vs `PROD_*`),
 the `POSTGRES_HOST` value, and the smoke-test hostname.
 This duplication was held together by `# Keep in sync with nightly.yml` comments — an
 honour-system invariant. Any change (a new healthchecked service, a different rsync flag,
 a new secret) had to be applied in two places, and nothing enforced that it was. Issue #603
 documents a real instance: the obs secret set had grown to five keys while a refactor draft
 listed only four.
 ### Decision drivers
 1. Cross-workflow deploy logic must have a single definition, enforced — not a
   discipline-based "keep in sync" promise.
 2. Per-environment variation must be expressed as explicit, typed inputs, not by forking
   the whole step block.
 3. The mechanism must work on the existing single-tenant self-hosted Gitea runner with no
   new infrastructure.
 ### Alternatives considered
 **A: Reusable workflow (`workflow_call`)** — Gitea supports called workflows. Rejected for
 this case: reusable workflows run as a separate job with their own runner context, which
 breaks the in-job, sequential `deploy → reload → smoke` ordering these steps rely on and
 complicates passing the already-checked-out workspace. Composite actions run inline in the
 calling job, preserving step order and the workspace.
 **B: Shared shell script invoked from both workflows** — e.g. `scripts/deploy-obs.sh`.
 Rejected: loses the typed-input contract and per-step CI log sections, and reintroduces
 manual argument threading that is as error-prone as the duplication it replaces.
 **C: Keep the `# Keep in sync` comments** — status quo. Rejected: unenforced; issue #603
 is direct evidence it fails.
 ## Decision
 Extract the shared logic into three single-responsibility Gitea composite actions under
 `.gitea/actions/`: `deploy-obs` (five inputs), `reload-caddy` (no inputs), and `smoke-test`
 (`host` input). Both workflows invoke each via a single `uses: ./.gitea/actions/<name>` call,
 passing per-environment values as `with:` inputs. This is the repository's first composite
 action and sets the convention; `docs/infrastructure/ci-gitea.md` documents it.
 ## Consequences
 **Positive:**
 - Shared deploy logic has one enforced definition; a change lands once and both
  environments get it. The `# Keep in sync` comments are deleted.
 - Per-environment variation is a typed input contract, not a forked block.
 - Runs inline on the existing runner — no reusable-workflow job context, no new
  infrastructure.
 **Negative / constraints:**
 - Workflows now depend on a checked-out `.gitea/actions/` tree: `actions/checkout` MUST run
  before the first `uses: ./…` (a local action does not exist on disk until checkout).
 - Secrets cannot be read from the `secrets.*` context inside a composite action; they must
  be passed as inputs and mapped to `env:`. The `obs-secrets.env` heredoc therefore uses an
  unquoted delimiter so `$VAR` expands at the shell layer.
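 - The `reload-caddy` pinned alpine digest now lives in the action, not the workflow file —
  Renovate's privileged-digest rule must therefore match `.gitea/actions/**` as well as
  `.gitea/workflows/**` (see `renovate.json`), otherwise the digest goes stale unnoticed.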
--- a/docs/infrastructure/ci-gitea.md
+++ b/docs/infrastructure/ci-gitea.md
@@ -68,6 +68,8 @@ Job containers are unprivileged and do not share the host's PID/mount/network na
 Alpine is used instead of Ubuntu: ~5 MB vs ~70 MB, and the digest is pinned to a specific sha256 so any upstream change requires an explicit Renovate bump PR. `util-linux` (which ships `nsenter`) is not part of the Alpine base image but is installed at run time in ~1 s from the warm VPS cache.
 This exact step now lives in the `reload-caddy` composite action (see [Composite actions](#composite-actions) below); both deploy workflows call it via `uses: ./.gitea/actions/reload-caddy`. The pinned digest moved with it, so Renovate's privileged-digest watch covers `.gitea/actions/**` as well as `.gitea/workflows/**`.
 #### Why not `sudo systemctl` in the job container?
 Job containers run as root inside an unprivileged Docker namespace. There is no systemd PID 1 inside the container — `systemctl` would attempt to reach a socket that does not exist. `sudo` is not present in container images and would not help even if it were.
@@ -170,6 +172,72 @@ See `docs/DEPLOYMENT.md §3.1` and ADR-015 for the full setup rationale.
 ---
 ## Composite actions
 The `nightly.yml` (staging) and `release.yml` (production) deploy workflows share their observability-stack deploy, Caddy reload, and smoke-test logic through three single-responsibility composite actions under `.gitea/actions/` (ADR-029). Before this, the shared logic was duplicated in both workflows and held together by `# Keep in sync with nightly.yml` comments — an unenforced honour-system invariant.
 | Action | Inputs | Purpose |
 |---|---|---|
 | `deploy-obs` | `grafana_admin_password`, `grafana_db_password`, `glitchtip_secret_key`, `postgres_password`, `postgres_host` | Deploy obs configs + secrets to `/opt/familienarchiv`, validate the compose config, start the stack, assert the five healthchecked services |
 | `reload-caddy` | — | Reload host Caddy via the privileged-sibling + nsenter pattern |
 | `smoke-test` | `host` | Verify the public surface (login reachable, HSTS pinned, Permissions-Policy present, `/actuator/health → 404`) |
 A workflow calls them by relative path, passing per-environment values as `with:` inputs:
 ```yaml
 - uses: ./.gitea/actions/deploy-obs
  with:
    grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
    grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
    glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}
    postgres_password: ${{ secrets.STAGING_POSTGRES_PASSWORD }}
    postgres_host: archiv-staging-db-1
 - uses: ./.gitea/actions/reload-caddy
 - uses: ./.gitea/actions/smoke-test
  with:
    host: staging.raddatz.cloud
 ```
 ### Checkout-first ordering rule
 A local composite action (`uses: ./…`) only exists on disk **after** the repo is checked out. `actions/checkout@v4` MUST therefore be the **first step** of any job that calls one — if a future reorder moves checkout later, every `uses: ./.gitea/actions/…` call fails because the action file is not yet on disk. Both deploy workflows pin checkout as step 1 for exactly this reason.
 ### Secrets inside composite actions
 The `secrets.*` context is **not** available inside a composite action. Secrets are passed in as `inputs`, mapped to an `env:` block, and referenced as `$VAR`:
 ```yaml
 inputs:
  grafana_admin_password:
    required: true        # no default — a missing secret must fail loudly, never fall back to empty
 runs:
  using: composite
  steps:
    - shell: bash         # composite steps do NOT default the shell — always declare it
      env:
        GRAFANA_ADMIN_PASSWORD: ${{ inputs.grafana_admin_password }}
      run: |
        cat > obs-secrets.env <<EOF   # unquoted EOF — $VAR expands at the shell layer
        GRAFANA_ADMIN_PASSWORD=$GRAFANA_ADMIN_PASSWORD
        EOF
 ```
 Two load-bearing details:
 - **Unquoted heredoc delimiter (`<<EOF`, not `<<'EOF'`).** With a quoted delimiter the shell writes the literal string `$GRAFANA_ADMIN_PASSWORD`, and `docker compose config --quiet` still passes (the variable is *present, just wrong*). The `deploy-obs` action backstops the secrets file with a five-key **non-empty** check (`grep -Eq "^KEY=.+"`) immediately after writing `obs-secrets.env`; that check catches a missing or empty secret, but a literal `$VAR` is itself non-empty and would pass it, so the unquoted delimiter must be preserved on its own merits. `chmod 600` is the action's final operation so the file is never world-readable.
 - **Every `run:` step declares `shell: bash`.** Composite actions do not inherit the workflow's default shell; a step without it fails to run.
 ### Adding an input to an action
 To thread a new per-environment value (e.g. a new secret) through `deploy-obs`:
 1. Add it under `inputs:` in `.gitea/actions/deploy-obs/action.yml` with `required: true` and **no `default:`**.
 2. Map it in the relevant step's `env:` block: `NEW_KEY: ${{ inputs.new_key }}`.
 3. Reference it as `$NEW_KEY` in the `run:` script — add a `NEW_KEY=$NEW_KEY` line to the heredoc **and** a matching entry to the five-key guard loop.
 4. Pass it from **both** workflows' `with:` blocks. That is the whole point of the action: the contract lives in one place, so neither environment can silently drift.
 ---
 ## Gitea vs GitHub Actions Differences
 ### Context Variable Names
--- a/renovate.json
+++ b/renovate.json
@@ -14,8 +14,8 @@
      "automerge": false
    },
    {
-      "description": "Digest bumps for images used in privileged CI steps (--privileged --pid=host) must be reviewed manually — a compromised image has root-equivalent host access.",
+      "description": "Digest bumps for images used in privileged CI steps (--privileged --pid=host) must be reviewed manually — a compromised image has root-equivalent host access. Covers .gitea/actions/** too: the reload-caddy alpine digest now lives in a composite action (#603).",
-      "matchPaths": [".gitea/workflows/**"],
+      "matchPaths": [".gitea/workflows/**", ".gitea/actions/**"],
      "matchUpdateTypes": ["digest"],
      "automerge": false,
      "reviewersFromCodeOwners": false