2026-06-02 19:57:20 +02:00
9 changed files with 432 additions and 253 deletions
--- a/.gitea/actions/deploy-obs/action.yml
+++ b/.gitea/actions/deploy-obs/action.yml
@@ -0,0 +1,127 @@
+name: Deploy observability stack
+description: >-
+  Deploy observability configs + secrets to /opt/familienarchiv, validate the
+  compose config, start the stack, and assert the five healthchecked services
+  are healthy. Per-environment values arrive as inputs.
+
+# All five inputs are forwarded to the first step as environment variables
+# and written to /opt/familienarchiv/obs-secrets.env.
+# NOTE(review): GitHub documents that `required: true` does not by itself
+# fail a run when an input is omitted; presumably the Gitea runner behaves
+# the same — confirm. The non-empty guard in the first step is what actually
+# rejects a missing or empty value.
+inputs:
+  grafana_admin_password:
+    description: Grafana admin password (secret)
+    required: true
+  grafana_db_password:
+    description: Read-only grafana_reader DB role password (secret, issue #651)
+    required: true
+  glitchtip_secret_key:
+    description: GlitchTip Django secret key (secret)
+    required: true
+  postgres_password:
+    description: PostgreSQL password for the environment (secret)
+    required: true
+  postgres_host:
+    description: >-
+      Compose project + service hostname, e.g. archiv-staging-db-1. Derived
+      from the Compose project name and service name — a project rename
+      requires updating the caller's value. Plain input, not a secret.
+    required: true
+
+runs:
+  using: composite
+  steps:
+    - name: Deploy observability configs
+      shell: bash
+      # Copies the compose file and config tree from the workspace checkout
+      # into /opt/familienarchiv/ — the permanent location that persists
+      # between CI runs. Containers started in the next step bind-mount
+      # from there, so a future workspace wipe cannot corrupt a running
+      # config file.
+      #
+      # obs-secrets.env is written fresh from Gitea secrets on every run so
+      # Gitea is always the single source of truth for secret rotation.
+      # Non-secret config lives in infra/observability/obs.env (tracked in git).
+      #
+      # secrets.* is NOT available inside a composite action, so the values
+      # arrive as inputs mapped to env: below and are referenced as $VAR in
+      # the heredoc. The delimiter MUST stay unquoted (<<EOF, not <<'EOF') so
+      # the shell expands $VAR — a quoted delimiter would write the literal
+      # string "$GRAFANA_ADMIN_PASSWORD" and `config --quiet` would still pass
+      # (the var is present, just wrong). Do not stage these into intermediate
+      # variables either, or Gitea log masking can be lost.
+      env:
+        GRAFANA_ADMIN_PASSWORD: ${{ inputs.grafana_admin_password }}
+        GRAFANA_DB_PASSWORD: ${{ inputs.grafana_db_password }}
+        GLITCHTIP_SECRET_KEY: ${{ inputs.glitchtip_secret_key }}
+        POSTGRES_PASSWORD: ${{ inputs.postgres_password }}
+        POSTGRES_HOST: ${{ inputs.postgres_host }}
+      run: |
+        set -euo pipefail
+        rm -rf /opt/familienarchiv/infra/observability
+        mkdir -p /opt/familienarchiv/infra/observability
+        cp -r infra/observability/. /opt/familienarchiv/infra/observability/
+        cp docker-compose.observability.yml /opt/familienarchiv/
+        # Create (or replace) the secrets file as an EMPTY 0600 file before any
+        # secret is written. A bare `cat >` would create it under the default
+        # umask and leave the secrets readable by other users until the chmod
+        # at the end of this script. Redirecting into an existing file only
+        # truncates it and keeps its mode, so the heredoc below writes into an
+        # already-private file. `install` does not touch the process umask, so
+        # the config tree copied above keeps its permissions.
+        install -m 600 /dev/null /opt/familienarchiv/obs-secrets.env
+        cat > /opt/familienarchiv/obs-secrets.env <<EOF
+        GRAFANA_ADMIN_PASSWORD=$GRAFANA_ADMIN_PASSWORD
+        GRAFANA_DB_PASSWORD=$GRAFANA_DB_PASSWORD
+        GLITCHTIP_SECRET_KEY=$GLITCHTIP_SECRET_KEY
+        POSTGRES_PASSWORD=$POSTGRES_PASSWORD
+        POSTGRES_HOST=$POSTGRES_HOST
+        EOF
+        # Five-key non-empty guard: a bare presence check matches an empty
+        # `KEY=` line, so assert each key has a value. Fail loudly on any
+        # missing/empty key rather than starting the stack with broken auth.
+        for key in GRAFANA_ADMIN_PASSWORD GRAFANA_DB_PASSWORD GLITCHTIP_SECRET_KEY POSTGRES_PASSWORD POSTGRES_HOST; do
+          grep -Eq "^${key}=.+" /opt/familienarchiv/obs-secrets.env \
+            || { echo "::error::obs-secrets.env missing or empty: ${key}"; exit 1; }
+        done
+        # Defence in depth: the file is already 0600 from the `install` above;
+        # re-assert the mode as the final operation so the step always ends
+        # with a private secrets file.
+        chmod 600 /opt/familienarchiv/obs-secrets.env
+
+    - name: Validate observability compose config
+      shell: bash
+      # Dry-run render of the deployed compose file: every variable
+      # substitution is resolved, so an undefined variable, a missing required
+      # key, or a YAML error in the config updated by the previous step is
+      # reported before any container starts.
+      # The two env files are passed in a fixed order — obs.env (git-tracked
+      # defaults) first, obs-secrets.env (CI-written secrets) second — because
+      # the later file wins on duplicate keys. POSTGRES_HOST is
+      # environment-specific and comes only from obs-secrets.env; obs.env
+      # documents it but deliberately does not set a value.
+      run: |
+        obs_root=/opt/familienarchiv
+        compose_args=(
+          -f "$obs_root/docker-compose.observability.yml"
+          --env-file "$obs_root/infra/observability/obs.env"
+          --env-file "$obs_root/obs-secrets.env"
+        )
+        docker compose "${compose_args[@]}" config --quiet
+
+    - name: Start observability stack
+      shell: bash
+      # Every path is absolute so bind mounts resolve to stable host paths
+      # that survive workspace wipes between runs (see ADR-016).
+      # obs.env (git-tracked) supplies the non-secret config and
+      # obs-secrets.env (written fresh from Gitea secrets above) supplies the
+      # secrets. Order matters: obs.env first, obs-secrets.env second, because
+      # the later file wins on duplicate keys.
+      run: |
+        obs_root=/opt/familienarchiv
+        compose_args=(
+          -f "$obs_root/docker-compose.observability.yml"
+          --env-file "$obs_root/infra/observability/obs.env"
+          --env-file "$obs_root/obs-secrets.env"
+        )
+        docker compose "${compose_args[@]}" up -d --wait --remove-orphans
+
+    - name: Assert observability stack health
+      shell: bash
+      # `docker compose up --wait` only waits on services that declare a
+      # healthcheck. obs-promtail, obs-cadvisor, obs-node-exporter, and
+      # obs-glitchtip-worker declare none, so they count as "started" as soon
+      # as the process runs. This step therefore checks the five healthchecked
+      # critical services one by one, reports every unhealthy one, and fails
+      # before the smoke test if any of them is not healthy.
+      run: |
+        set -e
+        critical=(obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip)
+        failed=0
+        for name in "${critical[@]}"; do
+          # A failing inspect (e.g. no such container) is reported as "missing".
+          health=$(docker inspect "$name" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
+          [ "$health" = "healthy" ] && continue
+          echo "::error::$name is not healthy (status: $health)"
+          failed=1
+        done
+        if [ "$failed" -ne 0 ]; then
+          exit 1
+        fi
+        echo "All critical observability services are healthy"
--- a/.gitea/actions/reload-caddy/action.yml
+++ b/.gitea/actions/reload-caddy/action.yml
@@ -0,0 +1,41 @@
+name: Reload Caddy
+description: >-
+  Reload the host Caddy service from a DooD job container via a privileged
+  sibling container and nsenter. No inputs.
+
+runs:
+  using: composite
+  steps:
+    - name: Reload Caddy
+      shell: bash
+      # Apply any committed Caddyfile changes before smoke-testing the
+      # public surface. Without this step, a Caddyfile edit lands in the
+      # repo but Caddy keeps serving the previous config until someone
+      # reloads it manually — the smoke test would then catch a stale
+      # header or a still-proxied /actuator route rather than confirming
+      # the current config is live.
+      #
+      # The runner executes job steps inside Docker containers (DooD).
+      # `systemctl` is not present in container images and cannot reach
+      # the host's systemd directly. We use the Docker socket (mounted
+      # into every job container via runner-config.yaml) to spin up a
+      # privileged sibling container in the host PID namespace; nsenter
+      # then enters the host's namespaces so systemctl talks to the real
+      # host systemd daemon. No sudoers entry is required — the Docker
+      # socket already grants root-equivalent host access.
+      #
+      # Alpine is used: ~5 MB vs ~70 MB for ubuntu, no unnecessary
+      # tooling, and the digest is pinned so any upstream change requires
+      # an explicit bump PR. util-linux (which ships nsenter) is installed
+      # at run time; apk add takes ~1 s on the warm VPS cache.
+      # NOTE(review): the run-time `apk add` makes this step depend on the
+      # Alpine package mirror being reachable. Alpine's BusyBox may already
+      # provide an `nsenter` applet supporting the flags used below — confirm
+      # before dropping the install.
+      #
+      # nsenter flags: `-t 1` targets PID 1 (the host init, visible because
+      # of --pid=host); `-m -u -n -p -i` enter its mount, UTS, network, PID,
+      # and IPC namespaces respectively.
+      #
+      # `reload` not `restart`: reload sends SIGHUP so Caddy re-reads its
+      # config in-process without dropping TLS connections. `restart`
+      # would briefly stop the service, losing in-flight requests.
+      #
+      # If Caddy is not running this step fails fast before the smoke test
+      # issues a misleading "port 443 refused" error.
+      run: |
+        docker run --rm --privileged --pid=host \
+          alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
+          sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
--- a/.gitea/actions/smoke-test/action.yml
+++ b/.gitea/actions/smoke-test/action.yml
@@ -0,0 +1,58 @@
+name: Smoke test
+description: >-
+  Verify the deployed public surface (login reachable, HSTS pinned,
+  Permissions-Policy present, /actuator blocked) against a given vhost.
+
+inputs:
+  host:
+    description: Public vhost to smoke-test, e.g. staging.raddatz.cloud
+    required: true
+
+runs:
+  using: composite
+  steps:
+    - name: Smoke test deployed environment
+      shell: bash
+      # Healthchecks confirm containers are healthy; they do NOT confirm the
+      # public surface works. This step catches: Caddy not reloaded, HSTS
+      # header dropped, /actuator block bypassed.
+      #
+      # --resolve pins the public host to the Docker bridge gateway IP
+      # (the host) so we do NOT depend on hairpin NAT on the host router.
+      # 127.0.0.1 cannot be used: job containers run in bridge network mode
+      # (runner-config.yaml), so 127.0.0.1 is the container's loopback, not
+      # the host's. The bridge gateway IS the host; Caddy binds 0.0.0.0:443
+      # and is therefore reachable from the container via that IP.
+      # SNI still uses the public hostname so the TLS cert validates correctly.
+      #
+      # --resolve is stored as a Bash array so "${RESOLVE[@]}" expands to two
+      # separate arguments; a quoted string would pass the flag and its value
+      # as one token and curl would reject it as an unknown option.
+      #
+      # Gateway detection reads /proc/net/route (always present, no package
+      # required) instead of `ip route` to avoid a dependency on iproute2.
+      # Field $2=="00000000" is the default route; field $3 is the gateway as
+      # a little-endian 32-bit hex value which awk decodes to dotted-decimal.
+      #
+      # Every check emits its own ::error:: annotation so a failed run names
+      # the check that failed instead of exiting silently under `set -e`.
+      env:
+        HOST: ${{ inputs.host }}
+      run: |
+        set -e
+        # Reject an empty input up front; otherwise every check below would
+        # run against the meaningless URL "https://".
+        [ -n "$HOST" ] || { echo "::error::smoke-test input 'host' is empty"; exit 1; }
+        URL="https://$HOST"
+        HOST_IP=$(awk 'NR>1 && $2=="00000000"{h=$3;printf "%d.%d.%d.%d\n",strtonum("0x"substr(h,7,2)),strtonum("0x"substr(h,5,2)),strtonum("0x"substr(h,3,2)),strtonum("0x"substr(h,1,2));exit}' /proc/net/route)
+        [ -n "$HOST_IP" ] || { echo "::error::could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
+        RESOLVE=(--resolve "$HOST:443:$HOST_IP")
+        echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
+        curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null \
+          || { echo "::error::$URL/login is not reachable"; exit 1; }
+        # Fetch the response headers once and run both header checks against
+        # the same response.
+        headers=$(curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/") \
+          || { echo "::error::could not fetch response headers from $URL/"; exit 1; }
+        # Pin the preload-list-eligible HSTS value, not just header presence:
+        # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
+        # fail this check rather than pass it silently.
+        grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' <<<"$headers" \
+          || { echo "::error::Strict-Transport-Security header missing or degraded on $URL/"; exit 1; }
+        # Permissions-Policy denies APIs the app does not use (camera,
+        # microphone, geolocation). A regression that loosens or drops the
+        # header now fails the smoke step.
+        grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)' <<<"$headers" \
+          || { echo "::error::Permissions-Policy header missing or loosened on $URL/"; exit 1; }
+        # `|| true`: on a connection failure curl exits non-zero, which
+        # under `set -e` would abort here without a message. Let the status
+        # comparison below report the failure instead.
+        status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") || true
+        [ "$status" = "404" ] || { echo "::error::expected 404 from /actuator/health, got $status"; exit 1; }
+        echo "All smoke checks passed"
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -108,6 +108,32 @@ jobs:
            exit 1
          fi

+      - name: Assert deploy-obs writes obs-secrets.env via an unquoted heredoc (#603)
+        shell: bash
+        run: |
+          # A composite action cannot read secrets.*; the secrets reach the
+          # deploy-obs script as $VAR from env:, so the heredoc that writes
+          # obs-secrets.env has to keep an unquoted delimiter (<<EOF) for the
+          # shell to expand them. With a quoted delimiter (<<'EOF') the file
+          # would contain the literal text "$GRAFANA_ADMIN_PASSWORD", and the
+          # action's own five-key non-empty guard would still pass because the
+          # line is present, just wrong. This CI check pins the invariant so a
+          # later re-quote cannot ship broken obs auth on a green build.
+          # See ADR-029 / #603.
+          target='.gitea/actions/deploy-obs/action.yml'
+          quoted_re='obs-secrets\.env\s*<<-?\s*[\x27\x22]'
+          unquoted_re='obs-secrets\.env\s*<<-?EOF\b'
+          # Self-test 1: the quoted form must be detected by the regex.
+          if ! printf "obs-secrets.env <<'EOF'\n" | grep -qP "$quoted_re"; then
+            echo "FAIL: guard self-test — regex missed the quoted <<'EOF' form"
+            exit 1
+          fi
+          # Self-test 2: the unquoted form must NOT be detected by the regex.
+          if ! printf 'obs-secrets.env <<EOF\n' | grep -qvP "$quoted_re"; then
+            echo "FAIL: guard self-test — regex wrongly flagged the unquoted <<EOF form"
+            exit 1
+          fi
+          # Positive check: the action must contain the unquoted heredoc.
+          if ! grep -qP "$unquoted_re" "$target"; then
+            echo "::error::$target no longer writes obs-secrets.env via an unquoted <<EOF heredoc (ADR-029 / #603)"
+            exit 1
+          fi
+          # Negative check: no quoted delimiter on the obs-secrets.env heredoc.
+          # Offending lines are printed with their line numbers.
+          if grep -nP "$quoted_re" "$target"; then
+            echo "::error::$target writes obs-secrets.env with a quoted heredoc delimiter — secrets would be written as literal \$VAR strings. Use unquoted <<EOF (ADR-029 / #603)."
+            exit 1
+          fi
+
      - name: Run unit and component tests with coverage
        shell: bash
        run: |
--- a/.gitea/workflows/nightly.yml
+++ b/.gitea/workflows/nightly.yml
@@ -23,6 +23,11 @@ name: nightly
 #   - host ports:   backend 8081, frontend 3001
 #   - profile:      staging (starts mailpit instead of a real SMTP relay)
 #
+# The obs-stack deploy, Caddy reload, and smoke test are shared with
+# release.yml via the composite actions under .gitea/actions/ (ADR-029).
+# actions/checkout MUST stay the first step: a local `uses: ./…` action
+# only exists on disk after checkout.
+#
 # Required Gitea secrets:
 #   STAGING_POSTGRES_PASSWORD
 #   STAGING_MINIO_PASSWORD
@@ -55,6 +60,8 @@ jobs:
    # for the same repo is within that boundary.
    runs-on: ubuntu-latest
    steps:
+      # MUST be first: the composite actions below live under .gitea/actions/
+      # and only exist on disk once the repo is checked out (ADR-029).
      - uses: actions/checkout@v4

      - name: Write staging env file
@@ -92,6 +99,7 @@ jobs:
        # `compose config` renders both shorthand and longform mounts as
        # `target: /import` + `read_only: true`, so we assert against
        # the rendered form rather than the raw source YAML.
+        # App-compose check (not obs), nightly-only — stays inline.
        run: |
          set -e
          docker compose \
@@ -128,150 +136,21 @@ jobs:
            --profile staging \
            up -d --wait --remove-orphans

-      - name: Deploy observability configs
-        # Copies the compose file and config tree from the workspace checkout
-        # into /opt/familienarchiv/ — the permanent location that persists
-        # between CI runs. Containers started in the next step bind-mount
-        # from there, so a future workspace wipe cannot corrupt a running
-        # config file.
-        #
-        # obs-secrets.env is written fresh from Gitea secrets on every run so
-        # Gitea is always the single source of truth for secret rotation.
-        # Non-secret config lives in infra/observability/obs.env (tracked in git).
-        run: |
-          rm -rf /opt/familienarchiv/infra/observability
-          mkdir -p /opt/familienarchiv/infra/observability
-          cp -r infra/observability/. /opt/familienarchiv/infra/observability/
-          cp docker-compose.observability.yml /opt/familienarchiv/
-          cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
-          GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
-          GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
-          GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
-          POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
-          POSTGRES_HOST=archiv-staging-db-1
-          EOF
-          # Note: POSTGRES_HOST is derived from the Compose project name (archiv-staging)
-          # and service name (db). A project rename requires updating this value.
-          chmod 600 /opt/familienarchiv/obs-secrets.env
+      # POSTGRES_HOST is derived from the Compose project name (archiv-staging)
+      # and service name (db). A project rename requires updating this value.
+      - uses: ./.gitea/actions/deploy-obs
+        with:
+          grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
+          grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
+          glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}
+          postgres_password: ${{ secrets.STAGING_POSTGRES_PASSWORD }}
+          postgres_host: archiv-staging-db-1

-      - name: Validate observability compose config
-        # Dry-run: resolves all variable substitutions and reports any missing
-        # required keys before containers start. Catches undefined variables and
-        # YAML errors in config files updated by the previous step.
-        # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env
-        # second (CI-written secrets). Later files win on duplicate keys, so
-        # obs-secrets.env overrides POSTGRES_HOST set in obs.env.
-        run: |
-          docker compose \
-            -f /opt/familienarchiv/docker-compose.observability.yml \
-            --env-file /opt/familienarchiv/infra/observability/obs.env \
-            --env-file /opt/familienarchiv/obs-secrets.env \
-            config --quiet
+      - uses: ./.gitea/actions/reload-caddy

-      - name: Start observability stack
-        # Runs with absolute paths so bind mounts resolve to stable host paths
-        # that survive workspace wipes between nightly runs (see ADR-016).
-        # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env
-        # (written fresh from Gitea secrets above). --env-file order: obs.env first,
-        # obs-secrets.env second — later file wins on duplicate keys.
-        run: |
-          docker compose \
-            -f /opt/familienarchiv/docker-compose.observability.yml \
-            --env-file /opt/familienarchiv/infra/observability/obs.env \
-            --env-file /opt/familienarchiv/obs-secrets.env \
-            up -d --wait --remove-orphans
-
-      - name: Assert observability stack health
-        # docker compose up --wait covers services WITH healthcheck directives only.
-        # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have
-        # no healthcheck — they are considered "started" as soon as the process runs.
-        # This step explicitly asserts the five healthchecked critical services are
-        # healthy before the smoke test proceeds.
-        run: |
-          set -e
-          unhealthy=""
-          for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
-            status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
-            if [ "$status" != "healthy" ]; then
-              echo "::error::$svc is not healthy (status: $status)"
-              unhealthy="$unhealthy $svc"
-            fi
-          done
-          [ -z "$unhealthy" ] || exit 1
-          echo "All critical observability services are healthy"
-
-      - name: Reload Caddy
-        # Apply any committed Caddyfile changes before smoke-testing the
-        # public surface. Without this step, a Caddyfile edit lands in the
-        # repo but Caddy keeps serving the previous config until someone
-        # reloads it manually — the smoke test would then catch a stale
-        # header or a still-proxied /actuator route rather than confirming
-        # the current config is live.
-        #
-        # The runner executes job steps inside Docker containers (DooD).
-        # `systemctl` is not present in container images and cannot reach
-        # the host's systemd directly. We use the Docker socket (mounted
-        # into every job container via runner-config.yaml) to spin up a
-        # privileged sibling container in the host PID namespace; nsenter
-        # then enters the host's namespaces so systemctl talks to the real
-        # host systemd daemon. No sudoers entry is required — the Docker
-        # socket already grants root-equivalent host access.
-        #
-        # Alpine is used: ~5 MB vs ~70 MB for ubuntu, no unnecessary
-        # tooling, and the digest is pinned so any upstream change requires
-        # an explicit bump PR. util-linux (which ships nsenter) is installed
-        # at run time; apk add takes ~1 s on the warm VPS cache.
-        #
-        # `reload` not `restart`: reload sends SIGHUP so Caddy re-reads its
-        # config in-process without dropping TLS connections. `restart`
-        # would briefly stop the service, losing in-flight requests.
-        #
-        # If Caddy is not running this step fails fast before the smoke test
-        # issues a misleading "port 443 refused" error.
-        run: |
-          docker run --rm --privileged --pid=host \
-            alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
-            sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
-
-      - name: Smoke test deployed environment
-        # Healthchecks confirm containers are healthy; they do NOT confirm the
-        # public surface works. This step catches: Caddy not reloaded, HSTS
-        # header dropped, /actuator block bypassed.
-        #
-        # --resolve pins staging.raddatz.cloud to the Docker bridge gateway IP
-        # (the host) so we do NOT depend on hairpin NAT on the host router.
-        # 127.0.0.1 cannot be used: job containers run in bridge network mode
-        # (runner-config.yaml), so 127.0.0.1 is the container's loopback, not
-        # the host's. The bridge gateway IS the host; Caddy binds 0.0.0.0:443
-        # and is therefore reachable from the container via that IP.
-        # SNI still uses the public hostname so the TLS cert validates correctly.
-        #
-        # Gateway detection reads /proc/net/route (always present, no package
-        # required) instead of `ip route` to avoid a dependency on iproute2.
-        # Field $2=="00000000" is the default route; field $3 is the gateway as
-        # a little-endian 32-bit hex value which awk decodes to dotted-decimal.
-        run: |
-          set -e
-          HOST="staging.raddatz.cloud"
-          URL="https://$HOST"
-          HOST_IP=$(awk 'NR>1 && $2=="00000000"{h=$3;printf "%d.%d.%d.%d\n",strtonum("0x"substr(h,7,2)),strtonum("0x"substr(h,5,2)),strtonum("0x"substr(h,3,2)),strtonum("0x"substr(h,1,2));exit}' /proc/net/route)
-          [ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
-          RESOLVE=(--resolve "$HOST:443:$HOST_IP")
-          echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
-          curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null
-          # Pin the preload-list-eligible HSTS value, not just header presence:
-          # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
-          # fail this check rather than pass it silently.
-          curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
-            | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
-          # Permissions-Policy denies APIs the app does not use (camera,
-          # microphone, geolocation). A regression that loosens or drops the
-          # header now fails the smoke step.
-          curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
-            | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
-          status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
-          [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
-          echo "All smoke checks passed"
+      - uses: ./.gitea/actions/smoke-test
+        with:
+          host: staging.raddatz.cloud

      - name: Cleanup env file
        # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
--- a/.gitea/workflows/release.yml
+++ b/.gitea/workflows/release.yml
@@ -23,6 +23,11 @@ name: release
 #   - host ports:   backend 8080, frontend 3000
 #   - profile:      (none) — mailpit is excluded; real SMTP relay is used
 #
+# The obs-stack deploy, Caddy reload, and smoke test are shared with
+# nightly.yml via the composite actions under .gitea/actions/ (ADR-029).
+# actions/checkout MUST stay the first step: a local `uses: ./…` action
+# only exists on disk after checkout.
+#
 # Required Gitea secrets:
 #   PROD_POSTGRES_PASSWORD
 #   PROD_MINIO_PASSWORD
@@ -53,6 +58,8 @@ jobs:
    # advertised label of our single-tenant self-hosted runner.
    runs-on: ubuntu-latest
    steps:
+      # MUST be first: the composite actions below live under .gitea/actions/
+      # and only exist on disk once the repo is checked out (ADR-029).
      - uses: actions/checkout@v4

      - name: Write production env file
@@ -100,117 +107,21 @@ jobs:
            --env-file .env.production \
            up -d --wait --remove-orphans

-      - name: Deploy observability configs
-        # Mirrors the nightly approach: copies obs compose file and config tree
-        # to /opt/familienarchiv/ (permanent path, survives workspace wipes — ADR-016),
-        # then writes obs-secrets.env fresh from Gitea secrets.
-        # Non-secret config lives in infra/observability/obs.env (tracked in git).
-        run: |
-          rm -rf /opt/familienarchiv/infra/observability
-          mkdir -p /opt/familienarchiv/infra/observability
-          cp -r infra/observability/. /opt/familienarchiv/infra/observability/
-          cp docker-compose.observability.yml /opt/familienarchiv/
-          cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
-          GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
-          GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
-          GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
-          POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }}
-          POSTGRES_HOST=archiv-production-db-1
-          EOF
-          # Note: POSTGRES_HOST is derived from the Compose project name (archiv-production)
-          # and service name (db). A project rename requires updating this value.
-          chmod 600 /opt/familienarchiv/obs-secrets.env
+      # POSTGRES_HOST is derived from the Compose project name (archiv-production)
+      # and service name (db). A project rename requires updating this value.
+      - uses: ./.gitea/actions/deploy-obs
+        with:
+          grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
+          grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
+          glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}
+          postgres_password: ${{ secrets.PROD_POSTGRES_PASSWORD }}
+          postgres_host: archiv-production-db-1

-      - name: Validate observability compose config
-        # Dry-run: resolves all variable substitutions and reports any missing
-        # required keys before containers start. Catches undefined variables and
-        # YAML errors in config files updated by the previous step.
-        # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env
-        # second (CI-written secrets). Later files win on duplicate keys, so
-        # obs-secrets.env overrides POSTGRES_HOST set in obs.env.
-        # Keep in sync with the equivalent step in nightly.yml (#603).
-        run: |
-          docker compose \
-            -f /opt/familienarchiv/docker-compose.observability.yml \
-            --env-file /opt/familienarchiv/infra/observability/obs.env \
-            --env-file /opt/familienarchiv/obs-secrets.env \
-            config --quiet
+      - uses: ./.gitea/actions/reload-caddy

-      - name: Start observability stack
-        # Runs with absolute paths so bind mounts resolve to stable host paths
-        # that survive workspace wipes between runs (see ADR-016).
-        # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env
-        # (written fresh from Gitea secrets above). --env-file order: obs.env first,
-        # obs-secrets.env second — later file wins on duplicate keys.
-        # Keep in sync with the equivalent step in nightly.yml (#603).
-        run: |
-          docker compose \
-            -f /opt/familienarchiv/docker-compose.observability.yml \
-            --env-file /opt/familienarchiv/infra/observability/obs.env \
-            --env-file /opt/familienarchiv/obs-secrets.env \
-            up -d --wait --remove-orphans
-
-      - name: Assert observability stack health
-        # docker compose up --wait covers services WITH healthcheck directives only.
-        # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have
-        # no healthcheck — they are considered "started" as soon as the process runs.
-        # This step explicitly asserts the five healthchecked critical services are
-        # healthy before the smoke test proceeds.
-        # Keep in sync with the equivalent step in nightly.yml (#603).
-        run: |
-          set -e
-          unhealthy=""
-          for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
-            status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
-            if [ "$status" != "healthy" ]; then
-              echo "::error::$svc is not healthy (status: $status)"
-              unhealthy="$unhealthy $svc"
-            fi
-          done
-          [ -z "$unhealthy" ] || exit 1
-          echo "All critical observability services are healthy"
-
-      - name: Reload Caddy
-        # See nightly.yml — same rationale and mechanism: DooD job containers
-        # cannot call systemctl directly; nsenter via a privileged sibling
-        # container reaches the host systemd. Must run after deploy (so the
-        # latest Caddyfile is on disk) and before the smoke test (so the
-        # public surface reflects the current config). Alpine with pinned
-        # digest; reload not restart — see nightly.yml for full rationale.
-        run: |
-          docker run --rm --privileged --pid=host \
-            alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
-            sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
-
-      - name: Smoke test deployed environment
-        # See nightly.yml — same three checks, against the prod vhost.
-        # --resolve stored as a Bash array so "${RESOLVE[@]}" expands to two
-        # separate arguments; a quoted string would pass the flag and its value
-        # as one token and curl would reject it as an unknown option.
-        # Gateway detection via /proc/net/route — no iproute2 dependency.
-        # See nightly.yml for the full network topology explanation.
-        run: |
-          set -e
-          HOST="archiv.raddatz.cloud"
-          URL="https://$HOST"
-          HOST_IP=$(awk 'NR>1 && $2=="00000000"{h=$3;printf "%d.%d.%d.%d\n",strtonum("0x"substr(h,7,2)),strtonum("0x"substr(h,5,2)),strtonum("0x"substr(h,3,2)),strtonum("0x"substr(h,1,2));exit}' /proc/net/route)
-          [ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
-          RESOLVE=(--resolve "$HOST:443:$HOST_IP")
-          echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
-          curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null
-          # Pin the preload-list-eligible HSTS value, not just header presence:
-          # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
-          # fail this check rather than pass it silently.
-          curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
-            | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
-          # Permissions-Policy denies APIs the app does not use (camera,
-          # microphone, geolocation). A regression that loosens or drops the
-          # header now fails the smoke step.
-          curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
-            | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
-          status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
-          [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
-          echo "All smoke checks passed"
+      - uses: ./.gitea/actions/smoke-test
+        with:
+          host: archiv.raddatz.cloud

      - name: Cleanup env file
        # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
--- a/docs/adr/029-composite-actions-for-cross-workflow-deploy-logic.md
+++ b/docs/adr/029-composite-actions-for-cross-workflow-deploy-logic.md
@@ -0,0 +1,69 @@
+# ADR-029: Composite actions for cross-workflow deploy logic
+
+## Status
+
+Accepted
+
+## Context
+
+The `nightly.yml` (staging) and `release.yml` (production) workflows shared three
+blocks of deploy logic almost verbatim: the four observability-stack steps (deploy configs,
+validate, start, assert health), the Caddy reload step, and the public-surface smoke
+test. The only per-environment differences were secret names (`STAGING_*` vs `PROD_*`),
+the `POSTGRES_HOST` value, and the smoke-test hostname.
+
+This duplication was held together by `# Keep in sync with nightly.yml` comments — an
+honour-system invariant. Any change (a new healthchecked service, a different rsync flag,
+a new secret) had to be applied in two places, and nothing enforced that it was. Issue #603
+documents a real instance: the obs secret set had grown to five keys while a refactor draft
+listed only four.
+
+### Decision drivers
+
+1. Cross-workflow deploy logic must have a single definition, enforced — not a
+   discipline-based "keep in sync" promise.
+2. Per-environment variation must be expressed as explicit, typed inputs, not by forking
+   the whole step block.
+3. The mechanism must work on the existing single-tenant self-hosted Gitea runner with no
+   new infrastructure.
+
+### Alternatives considered
+
+**A: Reusable workflow (`workflow_call`)** — Gitea supports called workflows. Rejected for
+this case: reusable workflows run as a separate job with their own runner context, which
+breaks the in-job, sequential `deploy → reload → smoke` ordering these steps rely on and
+complicates passing the already-checked-out workspace. Composite actions run inline in the
+calling job, preserving step order and the workspace.
+
+**B: Shared shell script invoked from both workflows** — e.g. `scripts/deploy-obs.sh`.
+Rejected: loses the typed-input contract and per-step CI log sections, and reintroduces
+manual argument threading that is as error-prone as the duplication it replaces.
+
+**C: Keep the `# Keep in sync` comments** — status quo. Rejected: unenforced; issue #603
+is direct evidence it fails.
+
+## Decision
+
+Extract the shared logic into three single-responsibility Gitea composite actions under
+`.gitea/actions/`: `deploy-obs` (five inputs), `reload-caddy` (no inputs), and `smoke-test`
+(`host` input). Both workflows invoke each via a single `uses: ./.gitea/actions/<name>` call,
+passing per-environment values as `with:` inputs. These are the repository's first composite
+actions and set the convention; `docs/infrastructure/ci-gitea.md` documents it.
+
+## Consequences
+
+**Positive:**
+- Shared deploy logic has one enforced definition; a change lands once and both
+  environments get it. The `# Keep in sync` comments are deleted.
+- Per-environment variation is a typed input contract, not a forked block.
+- Runs inline on the existing runner — no reusable-workflow job context, no new
+  infrastructure.
+
+**Negative / constraints:**
+- Workflows now depend on a checked-out `.gitea/actions/` tree: `actions/checkout` MUST run
+  before the first `uses: ./…` (a local action does not exist on disk until checkout).
+- Secrets cannot be read from the `secrets.*` context inside a composite action; they must
+  be passed as inputs and mapped to `env:`. The `obs-secrets.env` heredoc therefore uses an
+  unquoted delimiter so `$VAR` expands at the shell layer.
+- The `reload-caddy` pinned alpine digest now lives in the action, not the workflow file —
+  it must be added to Renovate's watch list so it does not go stale.
--- a/docs/infrastructure/ci-gitea.md
+++ b/docs/infrastructure/ci-gitea.md
@@ -68,6 +68,8 @@ Job containers are unprivileged and do not share the host's PID/mount/network na

 Alpine is used instead of Ubuntu: ~5 MB vs ~70 MB, and the digest is pinned to a specific sha256 so any upstream change requires an explicit Renovate bump PR. `util-linux` (which ships `nsenter`) is not part of the Alpine base image but is installed at run time in ~1 s from the warm VPS cache.

+This exact step now lives in the `reload-caddy` composite action (see [Composite actions](#composite-actions) below); both deploy workflows call it via `uses: ./.gitea/actions/reload-caddy`. The pinned digest moved with it, so Renovate's privileged-digest watch covers `.gitea/actions/**` as well as `.gitea/workflows/**`.
+
 #### Why not `sudo systemctl` in the job container?

 Job containers run as root inside an unprivileged Docker namespace. There is no systemd PID 1 inside the container — `systemctl` would attempt to reach a socket that does not exist. `sudo` is not present in container images and would not help even if it were.
@@ -170,6 +172,72 @@ See `docs/DEPLOYMENT.md §3.1` and ADR-015 for the full setup rationale.

 ---

+## Composite actions
+
+The `nightly.yml` (staging) and `release.yml` (production) deploy workflows share their observability-stack deploy, Caddy reload, and smoke-test logic through three single-responsibility composite actions under `.gitea/actions/` (ADR-029). Before this, the shared logic was duplicated in both workflows and held together by `# Keep in sync with nightly.yml` comments — an unenforced honour-system invariant.
+
+| Action | Inputs | Purpose |
+|---|---|---|
+| `deploy-obs` | `grafana_admin_password`, `grafana_db_password`, `glitchtip_secret_key`, `postgres_password`, `postgres_host` | Deploy obs configs + secrets to `/opt/familienarchiv`, validate the compose config, start the stack, assert the five healthchecked services |
+| `reload-caddy` | — | Reload host Caddy via the privileged-sibling + nsenter pattern |
+| `smoke-test` | `host` | Verify the public surface (login reachable, HSTS pinned, Permissions-Policy present, `/actuator/health → 404`) |
+
+A workflow calls them by relative path, passing per-environment values as `with:` inputs:
+
+```yaml
+- uses: ./.gitea/actions/deploy-obs
+  with:
+    grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
+    grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
+    glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}
+    postgres_password: ${{ secrets.STAGING_POSTGRES_PASSWORD }}
+    postgres_host: archiv-staging-db-1
+- uses: ./.gitea/actions/reload-caddy
+- uses: ./.gitea/actions/smoke-test
+  with:
+    host: staging.raddatz.cloud
+```
+
+### Checkout-first ordering rule
+
+A local composite action (`uses: ./…`) only exists on disk **after** the repo is checked out. `actions/checkout@v4` MUST therefore be the **first step** of any job that calls one — if a future reorder moves checkout later, every `uses: ./.gitea/actions/…` call fails because the action file is not yet on disk. Both deploy workflows pin checkout as step 1 for exactly this reason.
+
+### Secrets inside composite actions
+
+The `secrets.*` context is **not** available inside a composite action. Secrets are passed in as `inputs`, mapped to an `env:` block, and referenced as `$VAR`:
+
+```yaml
+inputs:
+  grafana_admin_password:
+    required: true        # no default — a missing secret must fail loudly, never fall back to empty
+runs:
+  using: composite
+  steps:
+    - shell: bash         # composite steps do NOT default the shell — always declare it
+      env:
+        GRAFANA_ADMIN_PASSWORD: ${{ inputs.grafana_admin_password }}
+      run: |
+        cat > obs-secrets.env <<EOF   # unquoted EOF — $VAR expands at the shell layer
+        GRAFANA_ADMIN_PASSWORD=$GRAFANA_ADMIN_PASSWORD
+        EOF
+```
+
+Two load-bearing details:
+
+- **Unquoted heredoc delimiter (`<<EOF`, not `<<'EOF'`).** With a quoted delimiter the shell writes the literal string `$GRAFANA_ADMIN_PASSWORD`, and `docker compose config --quiet` still passes (the variable is *present, just wrong*). The `deploy-obs` action guards against this with a five-key **non-empty** check (`grep -Eq "^KEY=.+"`) immediately after writing `obs-secrets.env`. `chmod 600` is the action's final operation so the file is never left world-readable.
+- **Every `run:` step declares `shell: bash`.** Composite actions do not inherit the workflow's default shell; a step without it fails to run.
+
+### Adding an input to an action
+
+To thread a new per-environment value (e.g. a new secret) through `deploy-obs`:
+
+1. Add it under `inputs:` in `.gitea/actions/deploy-obs/action.yml` with `required: true` and **no `default:`**.
+2. Map it in the relevant step's `env:` block: `NEW_KEY: ${{ inputs.new_key }}`.
+3. Reference it as `$NEW_KEY` in the `run:` script — add a `NEW_KEY=$NEW_KEY` line to the heredoc **and** a matching entry to the five-key guard loop.
+4. Pass it from **both** workflows' `with:` blocks. That is the whole point of the action: the contract lives in one place, so neither environment can silently drift.
+
+---
+
 ## Gitea vs GitHub Actions Differences

 ### Context Variable Names
--- a/renovate.json
+++ b/renovate.json
@@ -14,8 +14,8 @@
      "automerge": false
    },
    {
-      "description": "Digest bumps for images used in privileged CI steps (--privileged --pid=host) must be reviewed manually — a compromised image has root-equivalent host access.",
-      "matchPaths": [".gitea/workflows/**"],
+      "description": "Digest bumps for images used in privileged CI steps (--privileged --pid=host) must be reviewed manually — a compromised image has root-equivalent host access. Covers .gitea/actions/** too: the reload-caddy alpine digest now lives in a composite action (#603).",
+      "matchPaths": [".gitea/workflows/**", ".gitea/actions/**"],
      "matchUpdateTypes": ["digest"],
      "automerge": false,
      "reviewersFromCodeOwners": false