From 83f022ff4bbfd2c6f8fb0991ed47e2309557b313 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:33:39 +0200 Subject: [PATCH 01/39] feat(security): trust X-Forwarded-Proto behind reverse proxy Adds server.forward-headers-strategy: native so that Jetty honours X-Forwarded-{Proto,For,Host} from Caddy. Without this, getScheme(), redirect URLs, and Spring Session "Secure" cookies reflect the internal http hop instead of the original https client request. Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- backend/src/main/resources/application.yaml | 6 +++ .../ForwardHeadersConfigurationTest.java | 37 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml index 1cdd7673..6e12b9f6 100644 --- a/backend/src/main/resources/application.yaml +++ b/backend/src/main/resources/application.yaml @@ -38,6 +38,12 @@ spring: starttls: enable: true +server: + # Behind Caddy/reverse proxy: trust X-Forwarded-{Proto,For,Host} so that + # request.getScheme(), redirect URLs, and Spring Session "Secure" cookies + # reflect the original https client request, not the http hop from Caddy. + forward-headers-strategy: native + management: health: mail: diff --git a/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java new file mode 100644 index 00000000..b97f5ff0 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java @@ -0,0 +1,37 @@ +package org.raddatz.familienarchiv.config; + +import org.junit.jupiter.api.Test; +import org.raddatz.familienarchiv.PostgresContainerConfig; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.ActiveProfiles; +import org.springframework.test.context.bean.override.mockito.MockitoBean; +import software.amazon.awssdk.services.s3.S3Client; + +import static org.assertj.core.api.Assertions.assertThat; + +@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.NONE) +@ActiveProfiles("test") +@Import(PostgresContainerConfig.class) +class ForwardHeadersConfigurationTest { + + @MockitoBean + S3Client s3Client; + + @Autowired + @Value("${server.forward-headers-strategy:}") + String forwardHeadersStrategy; + + @Test + void forward_headers_strategy_is_native_for_reverse_proxy_deployment() { + // Caddy terminates TLS and forwards X-Forwarded-Proto: https. + // Spring must trust those headers so that AppUser-facing redirect URLs, + // Spring Session cookies (Secure flag), and HttpServletRequest.getScheme() + // reflect the original client-facing scheme rather than the internal http hop. + assertThat(forwardHeadersStrategy) + .as("server.forward-headers-strategy must be 'native' so Jetty honours X-Forwarded-Proto behind Caddy") + .isEqualTo("native"); + } +} -- 2.49.1 From ebd0f671f9bfdf610fe9505eeb1386bd3c21daba Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:50:53 +0200 Subject: [PATCH 02/39] fix(auth): mark /hilfe/transkription as public for prerender The route exports prerender = true and is listed in svelte.config.js's prerender.entries. 
Until now the auth hook redirected unauthenticated requests to /login, so the prerender crawler hit a 302 and the build failed with "marked as prerenderable, but were not prerendered". Adding the path to PUBLIC_PATHS lets the crawler render the static HTML; consistent with the route's intent as a public help page. Surfaced by #497 (the production Docker build is the first place npm run build runs in CI). Co-Authored-By: Claude Sonnet 4.6 --- frontend/src/hooks.server.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/frontend/src/hooks.server.ts b/frontend/src/hooks.server.ts index 917ed953..39460750 100644 --- a/frontend/src/hooks.server.ts +++ b/frontend/src/hooks.server.ts @@ -5,7 +5,14 @@ import { env } from 'process'; import { cookieName, cookieMaxAge } from '$lib/paraglide/runtime'; import { detectLocale } from '$lib/shared/server/locale'; -const PUBLIC_PATHS = ['/login', '/logout', '/forgot-password', '/reset-password', '/register']; +const PUBLIC_PATHS = [ + '/login', + '/logout', + '/forgot-password', + '/reset-password', + '/register', + '/hilfe/transkription' // prerendered help page — must be reachable without an auth cookie +]; const handleLocaleDetection: Handle = ({ event, resolve }) => { if (!event.cookies.get(cookieName)) { -- 2.49.1 From 8b109349c2b7e964835f2569332ac8a8580e40fb Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:51:32 +0200 Subject: [PATCH 03/39] feat(frontend): add production stage to Dockerfile Multi-stage Dockerfile with three targets: - development (dev server on :5173, used by docker-compose.yml) - build (runs npm run build, produces SvelteKit Node-adapter output) - production (self-contained node build server on :3000) Node base pinned to node:20.19.0-alpine3.21 for reproducible CI builds (Renovate will keep it current). docker-compose.yml now specifies target: development for the frontend so dev continues to use the dev-server stage. Without this, Docker would default to the last stage (production). Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.yml | 1 + frontend/Dockerfile | 35 +++++++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 5593a105..ee850922 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -163,6 +163,7 @@ services: build: context: ./frontend dockerfile: Dockerfile + target: development # Dockerfile is multi-stage; default would be the production stage container_name: archive-frontend restart: unless-stopped depends_on: diff --git a/frontend/Dockerfile b/frontend/Dockerfile index ca88f974..afbdb79f 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,15 +1,34 @@ -FROM node:20-alpine +# syntax=docker/dockerfile:1.7 +# ── Development ────────────────────────────────────────────────────────────── +# Used by docker-compose.yml (target: development). Source is bind-mounted in +# dev so the COPY . below is effectively replaced at runtime; the layer still +# exists so the image is self-contained for cold starts (e.g. devcontainer). +FROM node:20.19.0-alpine3.21 AS development WORKDIR /app - -# Install dependencies as a separate layer so they are cached when only source changes COPY package.json package-lock.json ./ RUN npm ci - -# Source is mounted at runtime via docker-compose volume -# This COPY is only used when building without a volume (e.g. production image) COPY . . 
- EXPOSE 5173 - CMD ["npm", "run", "dev"] + +# ── Build ──────────────────────────────────────────────────────────────────── +# Compiles the SvelteKit Node-adapter output to /app/build. +FROM node:20.19.0-alpine3.21 AS build +WORKDIR /app +COPY package.json package-lock.json ./ +RUN npm ci +COPY . . +RUN npm run build + +# ── Production ─────────────────────────────────────────────────────────────── +# Self-contained Node server. `node build` is the adapter-node entrypoint. +FROM node:20.19.0-alpine3.21 AS production +WORKDIR /app +ENV NODE_ENV=production +COPY --from=build /app/build ./build +COPY --from=build /app/package.json ./package.json +COPY --from=build /app/package-lock.json ./package-lock.json +RUN npm ci --omit=dev +EXPOSE 3000 +CMD ["node", "build"] -- 2.49.1 From ecb930e5f91b8a1a6f6d065587eae1a0c8206989 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:53:19 +0200 Subject: [PATCH 04/39] feat(infra): add docker-compose.prod.yml for production/staging Standalone production compose file (not an overlay) that runs the full stack on a single host. Environment isolation is achieved via the docker compose project name (-p archiv-production / -p archiv-staging) so the two environments cohabit cleanly. Key choices, resolved in #497 review: - Named volumes for persistent data (no host bind mounts) - MinIO pinned to a specific RELEASE tag (no :latest) - Backend uses MinIO service account (S3_ACCESS_KEY=archiv-app), not root credentials; create-buckets bootstraps the account - Mailpit lives under profiles: [staging] so no real SMTP secret is ever wired into the staging deploy - OCR mem_limit 12g + healthcheck (start_period 120s) copied from the dev compose so docker compose up -d --wait works in CI - Backend admin credentials wired through APP_ADMIN_USERNAME / APP_ADMIN_PASSWORD; first deploy locks the password in permanently because UserDataInitializer is idempotent on email - All host ports bound to 127.0.0.1; Caddy fronts external traffic Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.prod.yml | 211 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 docker-compose.prod.yml diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 00000000..25c7856b --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,211 @@ +# Production / staging Docker Compose for Familienarchiv. +# +# This is a self-contained file (not an overlay over docker-compose.yml). +# All services for the prod stack live here. Environment isolation is +# achieved via the docker compose project name: +# +# production: docker compose -f docker-compose.prod.yml -p archiv-production ... +# staging: docker compose -f docker-compose.prod.yml -p archiv-staging --profile staging ... +# +# Volumes, networks and containers are namespaced by the project name, +# so the two environments cohabit cleanly on the same host. +# +# Required env vars (provided by .env.production / .env.staging in CI): +# TAG image tag (release tag or "nightly") +# PORT_BACKEND, PORT_FRONTEND host-side ports (bound to 127.0.0.1 only) +# APP_DOMAIN e.g. archiv.raddatz.cloud / staging.raddatz.cloud +# POSTGRES_PASSWORD Postgres password +# MINIO_PASSWORD MinIO root password (admin operations only) +# MINIO_APP_PASSWORD MinIO application service-account password +# (least-privilege scope: archive bucket only) +# OCR_TRAINING_TOKEN token guarding ocr-service /train endpoint +# APP_ADMIN_USERNAME seeded admin email (e.g. 
admin@archiv.raddatz.cloud) +# APP_ADMIN_PASSWORD seeded admin password — CRITICAL: locked in on +# first deploy because UserDataInitializer only +# creates the account if the email does not exist +# MAIL_HOST, MAIL_PORT, SMTP relay (production only; staging uses mailpit) +# MAIL_USERNAME, MAIL_PASSWORD +# APP_MAIL_FROM sender address (e.g. noreply@raddatz.cloud) + +networks: + archive-net: + driver: bridge + +volumes: + postgres-data: + minio-data: + ocr-models: + ocr-cache: + +services: + db: + image: postgres:16-alpine + restart: unless-stopped + environment: + POSTGRES_USER: archiv + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_DB: archiv + volumes: + - postgres-data:/var/lib/postgresql/data + networks: + - archive-net + healthcheck: + test: ["CMD-SHELL", "pg_isready -U archiv -d archiv"] + interval: 10s + timeout: 5s + retries: 5 + + minio: + # Pinned MinIO release for reproducible deploys; Renovate keeps it current. + image: minio/minio:RELEASE.2025-02-28T09-55-16Z + restart: unless-stopped + command: server /data --console-address ":9001" + environment: + MINIO_ROOT_USER: archiv + MINIO_ROOT_PASSWORD: ${MINIO_PASSWORD} + volumes: + - minio-data:/data + networks: + - archive-net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + # Idempotent bucket bootstrap + service-account creation. + # Runs once per `docker compose up` and exits 0; `--ignore-existing` and + # the user-add fallback are safe on re-deploy. + create-buckets: + image: minio/mc + depends_on: + minio: + condition: service_healthy + networks: + - archive-net + environment: + MINIO_PASSWORD: ${MINIO_PASSWORD} + MINIO_APP_PASSWORD: ${MINIO_APP_PASSWORD} + entrypoint: > + /bin/sh -c " + set -e; + /usr/bin/mc alias set myminio http://minio:9000 archiv $$MINIO_PASSWORD; + /usr/bin/mc mb myminio/familienarchiv --ignore-existing; + /usr/bin/mc anonymous set private myminio/familienarchiv; + /usr/bin/mc admin user add myminio archiv-app $$MINIO_APP_PASSWORD || /usr/bin/mc admin user enable myminio archiv-app; + /usr/bin/mc admin policy attach myminio readwrite --user archiv-app || true; + exit 0; + " + + # Dev-only mail catcher; gated behind the staging profile so production + # never starts it. Staging workflow runs with `--profile staging`. + mailpit: + image: axllent/mailpit:latest + restart: unless-stopped + profiles: ["staging"] + networks: + - archive-net + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:8025/api/v1/info >/dev/null 2>&1 || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + + ocr-service: + build: + context: ./ocr-service + restart: unless-stopped + expose: + - "8000" + # Surya OCR loads ~5GB of transformer models at startup; first request + # triggers a further ~1GB Kraken model download into ocr-cache. 
+ mem_limit: 12g + memswap_limit: 12g + volumes: + - ocr-models:/app/models + - ocr-cache:/root/.cache + environment: + KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel + TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} + OCR_CONFIDENCE_THRESHOLD: "0.3" + OCR_CONFIDENCE_THRESHOLD_KURRENT: "0.5" + networks: + - archive-net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 10s + timeout: 5s + retries: 12 + start_period: 120s + + backend: + image: familienarchiv/backend:${TAG:-nightly} + build: + context: ./backend + restart: unless-stopped + depends_on: + db: + condition: service_healthy + minio: + condition: service_healthy + ocr-service: + condition: service_healthy + # Bound to localhost only — Caddy fronts external traffic. + ports: + - "127.0.0.1:${PORT_BACKEND}:8080" + environment: + SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/archiv + SPRING_DATASOURCE_USERNAME: archiv + SPRING_DATASOURCE_PASSWORD: ${POSTGRES_PASSWORD} + # Application uses the bucket-scoped service account, not MinIO root. + S3_ENDPOINT: http://minio:9000 + S3_ACCESS_KEY: archiv-app + S3_SECRET_KEY: ${MINIO_APP_PASSWORD} + S3_BUCKET_NAME: familienarchiv + S3_REGION: us-east-1 + # No SPRING_PROFILES_ACTIVE — base application.yaml is production-ready + # (Swagger disabled, show-sql off, open-in-view false). + APP_BASE_URL: https://${APP_DOMAIN} + APP_ADMIN_USERNAME: ${APP_ADMIN_USERNAME} + APP_ADMIN_PASSWORD: ${APP_ADMIN_PASSWORD} + APP_OCR_BASE_URL: http://ocr-service:8000 + APP_OCR_TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} + MAIL_HOST: ${MAIL_HOST} + MAIL_PORT: ${MAIL_PORT:-587} + MAIL_USERNAME: ${MAIL_USERNAME:-} + MAIL_PASSWORD: ${MAIL_PASSWORD:-} + APP_MAIL_FROM: ${APP_MAIL_FROM:-noreply@raddatz.cloud} + SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-true} + SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-true} + networks: + - archive-net + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:8080/actuator/health | grep -q UP || exit 1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 30s + + frontend: + image: familienarchiv/frontend:${TAG:-nightly} + build: + context: ./frontend + target: production + restart: unless-stopped + depends_on: + backend: + condition: service_healthy + ports: + - "127.0.0.1:${PORT_FRONTEND}:3000" + environment: + # SSR fetches go inside the docker network; clients hit https://${APP_DOMAIN} + API_INTERNAL_URL: http://backend:8080 + ORIGIN: https://${APP_DOMAIN} + networks: + - archive-net + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:3000/login >/dev/null 2>&1 || exit 1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s -- 2.49.1 From 56e55ff488e2c9295d868a5a6fe7ee5b8cdfeb73 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:54:38 +0200 Subject: [PATCH 05/39] feat(infra): add production Caddyfile Reverse proxy for the Familienarchiv host, validated against Caddy 2. Includes both vhosts (production and staging), the Gitea vhost, and: - HSTS, X-Content-Type-Options, Referrer-Policy headers on every site - "-Server" header strip to hide the Caddy version - /actuator/* responds 404 on both archive vhosts (defense in depth for Spring Boot's management endpoints) X-Frame-Options is intentionally not set in Caddy: Spring Security configures frame-options SAMEORIGIN for the in-app PDF preview iframe; a DENY header here would conflict. Refs #497. 
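The "validated against Caddy 2" check can be re-run locally before editing
the vhosts — a sketch, assuming only that Docker is available on the
workstation (the invocation itself is illustrative, the file path is the
committed one):

    docker run --rm \
      -v "$PWD/infra/caddy/Caddyfile:/etc/caddy/Caddyfile:ro" \
      caddy:2 caddy validate --config /etc/caddy/Caddyfile --adapter caddyfile

A clean exit means the file parses and the snippet imports resolve; it does
not exercise DNS, TLS issuance, or the upstreams.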
Co-Authored-By: Claude Sonnet 4.6 --- infra/caddy/Caddyfile | 63 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 infra/caddy/Caddyfile diff --git a/infra/caddy/Caddyfile b/infra/caddy/Caddyfile new file mode 100644 index 00000000..f32b1f2f --- /dev/null +++ b/infra/caddy/Caddyfile @@ -0,0 +1,63 @@ +# Caddyfile for the Familienarchiv host. +# +# Caddy runs on the host (not in a container) and reverse-proxies into +# the docker compose stacks bound to 127.0.0.1. +# +# Naming convention for ports (also documented in docker-compose.prod.yml): +# production: backend 8080, frontend 3000 +# staging: backend 8081, frontend 3001 +# gitea: 3005 +# +# Security headers and the /actuator block apply to both archive vhosts. +# X-Frame-Options is deliberately NOT set here: Spring Security configures +# frame-options SAMEORIGIN (for the in-app PDF preview iframe). Setting +# DENY in Caddy would conflict. + +(security_headers) { + header { + Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" + X-Content-Type-Options "nosniff" + Referrer-Policy "strict-origin-when-cross-origin" + -Server + } +} + +(block_actuator) { + # Defense in depth: even if management.endpoints.web.exposure.include grows + # in application.yaml, /actuator/* is unreachable externally. The internal + # Prometheus scrape (future) talks to the backend directly on the docker + # network, not via Caddy. + @actuator path /actuator/* + respond @actuator 404 +} + +archiv.raddatz.cloud { + import security_headers + import block_actuator + + handle /api/* { + reverse_proxy 127.0.0.1:8080 + } + + handle { + reverse_proxy 127.0.0.1:3000 + } +} + +staging.raddatz.cloud { + import security_headers + import block_actuator + + handle /api/* { + reverse_proxy 127.0.0.1:8081 + } + + handle { + reverse_proxy 127.0.0.1:3001 + } +} + +git.raddatz.cloud { + import security_headers + reverse_proxy 127.0.0.1:3005 +} -- 2.49.1 From 59349dfe93c4bca11c125c107d7dad9756f1e7b6 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:55:41 +0200 Subject: [PATCH 06/39] feat(ci): add nightly staging deploy workflow Runs daily at 02:00 (and on workflow_dispatch). Builds the prod compose stack with BuildKit, writes a transient .env.staging from Gitea secrets, then `docker compose up -d --wait` so the job fails loudly if any service's healthcheck never reports healthy. The --profile staging flag starts the mailpit catcher in place of a real SMTP relay; no production SMTP credentials touch the staging environment. The .env.staging file is cleaned up in `if: always()` to avoid leaving secrets in the runner workspace between runs. Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 80 ++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 .gitea/workflows/nightly.yml diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml new file mode 100644 index 00000000..82fc0c5c --- /dev/null +++ b/.gitea/workflows/nightly.yml @@ -0,0 +1,80 @@ +name: nightly + +# Builds and deploys the staging environment from main every night. +# Runs on the self-hosted runner using Docker-out-of-Docker (the docker +# socket is mounted in), so `docker compose build` produces images on +# the host daemon and `docker compose up` consumes them directly — no +# registry hop. 
+# +# Staging environment isolation: +# - project name: archiv-staging +# - host ports: backend 8081, frontend 3001 +# - profile: staging (starts mailpit instead of a real SMTP relay) +# +# Required Gitea secrets: +# STAGING_POSTGRES_PASSWORD +# STAGING_MINIO_PASSWORD +# STAGING_MINIO_APP_PASSWORD +# STAGING_OCR_TRAINING_TOKEN +# STAGING_APP_ADMIN_USERNAME +# STAGING_APP_ADMIN_PASSWORD + +on: + schedule: + - cron: "0 2 * * *" + workflow_dispatch: + +env: + # Ensures the backend Dockerfile's `RUN --mount=type=cache` lines are + # honoured (Maven cache survives between runs). + DOCKER_BUILDKIT: "1" + +jobs: + deploy-staging: + runs-on: self-hosted + steps: + - uses: actions/checkout@v4 + + - name: Write staging env file + run: | + cat > .env.staging < Date: Sun, 10 May 2026 21:56:37 +0200 Subject: [PATCH 07/39] feat(ci): add release production deploy workflow Fires on `v*` tag push. Tags the built images with the git tag so rollbacks are a one-liner (TAG= docker compose ... up -d). `up -d --wait` blocks until every service healthcheck reports healthy; a bad release fails the workflow rather than crash-looping silently. The .env.production file containing all Gitea secrets is removed in `if: always()` after the deploy step. Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/release.yml | 79 ++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .gitea/workflows/release.yml diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml new file mode 100644 index 00000000..dbf7a9a8 --- /dev/null +++ b/.gitea/workflows/release.yml @@ -0,0 +1,79 @@ +name: release + +# Builds and deploys the production environment on `v*` tag push. +# Runs on the self-hosted runner via Docker-out-of-Docker; images are +# tagged with the actual git tag (e.g. 
v1.0.0) so rollback is +# `TAG= docker compose -f docker-compose.prod.yml -p archiv-production up -d --wait` +# +# Production environment: +# - project name: archiv-production +# - host ports: backend 8080, frontend 3000 +# - profile: (none) — mailpit is excluded; real SMTP relay is used +# +# Required Gitea secrets: +# PROD_POSTGRES_PASSWORD +# PROD_MINIO_PASSWORD +# PROD_MINIO_APP_PASSWORD +# PROD_OCR_TRAINING_TOKEN +# PROD_APP_ADMIN_USERNAME (CRITICAL: see docs/DEPLOYMENT.md) +# PROD_APP_ADMIN_PASSWORD (CRITICAL: locked in on first deploy) +# MAIL_HOST +# MAIL_PORT +# MAIL_USERNAME +# MAIL_PASSWORD + +on: + push: + tags: + - "v*" + +env: + DOCKER_BUILDKIT: "1" + +jobs: + deploy-production: + runs-on: self-hosted + steps: + - uses: actions/checkout@v4 + + - name: Write production env file + run: | + cat > .env.production < Date: Sun, 10 May 2026 21:58:51 +0200 Subject: [PATCH 08/39] docs(deployment): rewrite for Gitea Actions / Caddy / prod compose Brings DEPLOYMENT.md in line with the production deployment landed in #497: - Topology diagram: frontend port 3000 (Node adapter), 127.0.0.1 binding, project-name isolation between prod and staging - Caddyfile now lives in-tree at infra/caddy/Caddyfile (symlinked onto the server) - Dev vs prod table: documents the new deploy method (workflows + --wait) and the prod-compose specific differences - Env vars: adds MINIO_APP_PASSWORD; notes that prod compose hardcodes the MinIO root user and the bucket name - Bootstrap section: server hardening, fail2ban, Tailscale, the 16 Gitea secrets, and the workflow_dispatch first-deploy step - Admin password warning: first deploy locks the password, secret rotation after that point has no effect - Rollback: TAG= override + docker compose up -d --wait Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- docs/DEPLOYMENT.md | 152 +++++++++++++++++++++++++++++++-------------- 1 file changed, 104 insertions(+), 48 deletions(-) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 6e697c55..674bc15f 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -27,20 +27,22 @@ This doc is the Day-1 checklist and operational reference. It links to the canon ```mermaid graph TD Browser -->|HTTPS| Caddy["Caddy (TLS termination)"] - Caddy -->|HTTP :5173| Frontend["Web Frontend\nSvelteKit / Node.js"] + Caddy -->|HTTP :3000| Frontend["Web Frontend\nSvelteKit Node adapter"] Caddy -->|HTTP :8080| Backend["API Backend\nSpring Boot / Jetty :8080"] Backend -->|JDBC :5432| DB[(PostgreSQL 16)] - Backend -->|S3 API :9000| MinIO[(MinIO / Hetzner OBS)] + Backend -->|S3 API :9000| MinIO[(MinIO)] Backend -->|HTTP :8000 internal| OCR["OCR Service\nPython FastAPI"] OCR -->|presigned URL| MinIO Browser -->|SSE direct| Backend ``` **Key facts:** -- Caddy terminates TLS and reverse-proxies to frontend and backend. See the Caddyfile in [`docs/infrastructure/production-compose.md`](infrastructure/production-compose.md). -- The OCR service has **no external port** — reachable only on the internal Docker network from the backend. +- Caddy terminates TLS and reverse-proxies to frontend (`:3000`) and backend (`:8080`). The Caddyfile is committed at [`infra/caddy/Caddyfile`](../infra/caddy/Caddyfile) and is installed on the host as `/etc/caddy/Caddyfile` (symlink). +- The host binds all docker-published ports to `127.0.0.1` only; Caddy is the sole external entry point. +- The OCR service has **no published port** — reachable only on the internal Docker network from the backend. 
- SSE notifications go directly backend → browser (not via the SvelteKit SSR layer). -- Management port 8081 (Spring Actuator / Prometheus scrape) is internal only — the Caddy config blocks `/actuator/*` externally. +- The Caddyfile responds `404` on `/actuator/*` (defense in depth). Internal monitoring scrapes the backend on the docker network, not through Caddy. +- Production and staging cohabit on the same host via docker compose project names: `archiv-production` (ports 8080/3000) and `archiv-staging` (ports 8081/3001). ### OCR memory requirements @@ -56,15 +58,19 @@ A CX32 cannot honour a `mem_limit: 12g` — set it to `6g` in the prod overlay o ### Dev vs production differences -| Concern | Dev compose | Prod overlay | +| Concern | Dev (`docker-compose.yml`) | Prod (`docker-compose.prod.yml`) | |---|---|---| -| MinIO image tag | `minio/minio:latest` (unpinned) | Pinned in prod overlay | -| Data persistence | Bind mounts `./data/postgres`, `./data/minio` | Named Docker volumes | -| Bucket creation | `create-buckets` helper container | Pre-created in Hetzner console | -| Spring profile | `dev,e2e` (enables OpenAPI + Swagger UI) | `prod` | -| Mail | Mailpit (local catcher) | Real SMTP | +| MinIO image tag | `minio/minio:latest` | Pinned `minio/minio:RELEASE.…` | +| Data persistence | Bind mounts `./data/postgres`, `./data/minio` | Named Docker volumes (`postgres-data`, `minio-data`) | +| MinIO credentials for backend | Root user/password | Service account `archiv-app` with bucket-scoped rights | +| Bucket creation | `create-buckets` helper | Same helper, plus service-account bootstrap on every up | +| Spring profile | `dev,e2e` (Swagger + e2e overrides) | unset — base `application.yaml` is production-ready | +| Mail | Mailpit (local catcher) | Real SMTP (production) / Mailpit via `profiles: [staging]` (staging) | +| Frontend image | Dev server, `target: development`, port 5173 | Node adapter, `target: production`, port 3000 | +| Host port binding | All published | Bound to `127.0.0.1` only; Caddy is the front door | +| Deploy method | `docker compose up -d` (manual) | Gitea Actions: `nightly.yml` (staging, cron) and `release.yml` (production, on `v*` tag) — both use `up -d --wait` | -Full prod overlay: [`docs/infrastructure/production-compose.md`](infrastructure/production-compose.md). +Full prod compose: [`docker-compose.prod.yml`](../docker-compose.prod.yml). Workflow files: [`.gitea/workflows/nightly.yml`](../.gitea/workflows/nightly.yml), [`.gitea/workflows/release.yml`](../.gitea/workflows/release.yml). --- @@ -112,9 +118,10 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back | Variable | Purpose | Default | Required? | Sensitive? | |---|---|---|---|---| -| `MINIO_ROOT_USER` | MinIO root username | `minio_admin` | YES | — | -| `MINIO_ROOT_PASSWORD` | MinIO root password | `change-me` | YES | YES | -| `MINIO_DEFAULT_BUCKETS` | Bucket name | `archive-documents` | YES | — | +| `MINIO_ROOT_USER` | MinIO root username (dev compose only — prod compose hardcodes `archiv`) | `minio_admin` | YES (dev) | — | +| `MINIO_ROOT_PASSWORD` / `MINIO_PASSWORD` | MinIO root password. **Used only by the `mc admin` bootstrap in prod, never by the backend.** | `change-me` | YES | YES | +| `MINIO_APP_PASSWORD` | Password for the `archiv-app` service account that the backend uses. Bucket-scoped via `readwrite` policy on `familienarchiv`. Bootstrapped by `create-buckets`. 
| — | YES (prod) | YES | +| `MINIO_DEFAULT_BUCKETS` | Bucket name (dev compose only — prod compose hardcodes `familienarchiv`) | `archive-documents` | YES (dev) | — | ### OCR service @@ -129,48 +136,81 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back ## 3. Bootstrap from scratch -> Full VPS provisioning steps are in [`docs/infrastructure/production-compose.md`](infrastructure/production-compose.md). This section covers the sequence and the security-critical steps. +Production and staging deploy via Gitea Actions (`release.yml` on `v*` tag, `nightly.yml` on cron). The server itself only needs to host Caddy, Docker, and the runner — the workflows handle the rest. -### Security checklist — complete before first boot - -> ⚠️ **These defaults ship in `.env.example` and `application.yaml`. Change them or you will have an insecure installation.** - -- [ ] Set `APP_ADMIN_PASSWORD` (default: `admin123` — change before starting the backend) -- [ ] Set `APP_ADMIN_USERNAME` if you want a non-default admin login name (add to `.env` — not in `.env.example`) -- [ ] Rotate `POSTGRES_PASSWORD` from `change-me` -- [ ] Rotate `MINIO_ROOT_PASSWORD` from `change-me` -- [ ] Set a strong `APP_OCR_TRAINING_TOKEN` (backend) and the matching `TRAINING_TOKEN` (OCR service) — both must be the same value (`python3 -c "import secrets; print(secrets.token_hex(32))"`) -- [ ] Confirm `ALLOWED_PDF_HOSTS` is locked to your MinIO/S3 hostname — widening to `*` opens SSRF -- [ ] Set `SPRING_PROFILES_ACTIVE=prod` in the prod overlay (not `dev,e2e` — that exposes Swagger UI and `/v3/api-docs`) -- [ ] Use a dedicated MinIO service account for `S3_ACCESS_KEY` / `S3_SECRET_KEY`, not the root credentials - -### Bootstrap sequence +### 3.1 Server one-time setup ```bash -# 1. Copy and fill the env file -cp .env.example .env -# edit .env — complete the security checklist above first +# Base hardening +ufw default deny incoming && ufw allow 22/tcp && ufw allow 80/tcp && ufw allow 443/tcp && ufw enable +# /etc/ssh/sshd_config: PasswordAuthentication no, PermitRootLogin no -# 2. (Production only) Create the MinIO / Hetzner OBS bucket in the console -# The dev compose has a create-buckets helper; production does not. -# Create the bucket named $MINIO_DEFAULT_BUCKETS with private access. +# Install Caddy 2 (https://caddyserver.com/docs/install#debian-ubuntu-raspbian) +apt install caddy -# 3. Start the stack (prod overlay — see docs/infrastructure/production-compose.md) -# docker-compose.prod.yml is NOT committed — create it from the guide above -docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d +# Use the Caddyfile from the repo (replace path with the runner's clone target) +ln -sf /opt/familienarchiv/infra/caddy/Caddyfile /etc/caddy/Caddyfile +systemctl reload caddy -# 4. Flyway migrations run automatically on backend start. -# Watch the backend log to confirm: -docker compose logs --follow --tail=100 backend +# fail2ban — protect /api/auth/login from credential stuffing +# Jail watches Caddy access log for 401 responses on /api/auth/login. +# maxretry=10 findtime=10m bantime=30m +apt install fail2ban +# Drop the jail definition under /etc/fail2ban/jail.d/familienarchiv.conf -# 5. Verify the stack is healthy -curl http://localhost:8080/actuator/health -# Expected: {"status":"UP"} +# Tailscale — used by the backup pipeline to reach heim-nas (follow-up issue) +curl -fsSL https://tailscale.com/install.sh | sh && tailscale up -# 6. 
Open the app and log in with the admin credentials from .env +# Self-hosted Gitea runner — register against the repo with a runner token +# (see https://docs.gitea.com/usage/actions/quickstart for the register step) ``` -> **Do not use `docker-compose.ci.yml` locally** — it disables bind mounts that the dev workflow depends on. +### 3.2 DNS records + +``` +archiv.raddatz.cloud A +staging.raddatz.cloud A +git.raddatz.cloud A +``` + +### 3.3 Gitea secrets (Repo → Settings → Actions → Secrets) + +| Secret | Used by | Notes | +|---|---|---| +| `PROD_POSTGRES_PASSWORD` | release.yml | strong unique password | +| `PROD_MINIO_PASSWORD` | release.yml | MinIO root password; used only at bootstrap | +| `PROD_MINIO_APP_PASSWORD` | release.yml | application service-account password | +| `PROD_OCR_TRAINING_TOKEN` | release.yml | `python3 -c "import secrets; print(secrets.token_hex(32))"` | +| `PROD_APP_ADMIN_USERNAME` | release.yml | e.g. `admin@archiv.raddatz.cloud` | +| `PROD_APP_ADMIN_PASSWORD` | release.yml | **⚠ locked permanently on first deploy** — see §3.5 | +| `STAGING_POSTGRES_PASSWORD` | nightly.yml | different from prod | +| `STAGING_MINIO_PASSWORD` | nightly.yml | different from prod | +| `STAGING_MINIO_APP_PASSWORD` | nightly.yml | different from prod | +| `STAGING_OCR_TRAINING_TOKEN` | nightly.yml | different from prod | +| `STAGING_APP_ADMIN_USERNAME` | nightly.yml | e.g. `admin@staging.raddatz.cloud` | +| `STAGING_APP_ADMIN_PASSWORD` | nightly.yml | locked on first staging deploy | +| `MAIL_HOST` | release.yml | SMTP relay hostname (prod only) | +| `MAIL_PORT` | release.yml | typically `587` | +| `MAIL_USERNAME` | release.yml | SMTP user | +| `MAIL_PASSWORD` | release.yml | SMTP password | + +### 3.4 First deploy + +```bash +# 1. Trigger nightly.yml manually (Repo → Actions → nightly → "Run workflow") +# Expected: docker compose up -d --wait succeeds for archiv-staging +# 2. Verify TLS + reverse proxy +curl -I https://staging.raddatz.cloud/ +# Expected: 200 (login page) with HSTS + X-Content-Type-Options headers +# 3. When staging looks healthy, push a v* tag to trigger release.yml +git tag v1.0.0 && git push origin v1.0.0 +``` + +### 3.5 ⚠ Admin password is locked on first deploy + +`UserDataInitializer` creates the admin user **only if the email does not exist**. The first successful deploy persists the admin password to the database. Changing `PROD_APP_ADMIN_PASSWORD` in Gitea secrets after that point has **no effect** — the secret is only consulted when the row is missing. + +Before the first deploy: rotate `PROD_APP_ADMIN_PASSWORD` to a strong value. After the first deploy: change the admin password via the in-app account settings, not via the Gitea secret. --- @@ -224,7 +264,23 @@ docker exec -i archive-db psql -U ${POSTGRES_USER} ${POSTGRES_DB} < backup-YYYYM ### Planned — phase 5 of Production v1 milestone -Automated backup (PostgreSQL WAL archiving + MinIO bucket replication) is planned in the Production v1 milestone phase 5. Until that ships: **manual backups are the only recovery option.** +Automated backup (nightly `pg_dump` + MinIO `mc mirror` over Tailscale to `heim-nas`) is a follow-up issue. Until that ships: **manual backups are the only recovery option.** + +### Rollback + +Each release tag corresponds to a docker image tag on the host daemon (built via DooD; no registry). 
Rolling back to a previous tag is one command: + +```bash +TAG=v1.0.0 docker compose \ + -f docker-compose.prod.yml \ + -p archiv-production \ + --env-file /opt/familienarchiv/.env.production \ + up -d --wait --remove-orphans +``` + +If the rollback target image is no longer present on the host (host disk pruned, etc.), re-trigger `release.yml` for that tag from Gitea Actions UI — it rebuilds and redeploys. + +**Flyway migrations are not auto-rolled-back.** If a release contained a destructive migration (drop column, rename table), a tag rollback brings the schema back to a previous app version but the data shape has already changed. For breaking schema changes, prefer a forward-only fix. --- -- 2.49.1 From e4df17f3080d983ee8a6c749d255e0119bc9f495 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 22:00:21 +0200 Subject: [PATCH 09/39] docs: retire overlay narrative; add Caddy to C4 L2 diagram - docs/infrastructure/production-compose.md: trimmed to VPS sizing, cost breakdown, and Hetzner ecosystem rationale. The inline compose spec (overlay + Hetzner OBS in prod) is retired; the live file is now docker-compose.prod.yml at the repo root and the Caddyfile lives at infra/caddy/Caddyfile. Observability stack is called out as a not-yet-deployed gap (issue #498). - docs/architecture/c4/l2-containers.puml: adds Caddy as a named reverse-proxy container with the two port paths and notes the archiv-app service-account split on MinIO access. Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- docs/architecture/c4/l2-containers.puml | 20 +- docs/infrastructure/production-compose.md | 270 +++------------------- 2 files changed, 44 insertions(+), 246 deletions(-) diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml index bd187bca..367b7d93 100644 --- a/docs/architecture/c4/l2-containers.puml +++ b/docs/architecture/c4/l2-containers.puml @@ -6,23 +6,27 @@ title Container Diagram: Familienarchiv Person(user, "User", "Admin or family member") System_Ext(mail, "Email Service", "SMTP server. Delivers notification and password-reset emails.") +Container(caddy, "Reverse Proxy", "Caddy 2 (host-installed)", "TLS termination (auto Let's Encrypt). Routes /api/* to backend:8080, everything else to frontend:3000. Responds 404 on /actuator/* and adds HSTS, X-Content-Type-Options, Referrer-Policy headers.") + System_Boundary(archiv, "Familienarchiv (Docker Compose)") { - Container(frontend, "Web Frontend", "SvelteKit / Node.js", "Server-side rendered UI. Handles auth session cookies, document search and viewer, transcription editor, annotation layer, family tree (Stammbaum), stories (Geschichten), activity feed (Chronik), enrichment workflow, and admin panel.") - Container(backend, "API Backend", "Spring Boot 4 / Java 21 / Jetty", "REST API. Implements document management, search, user auth, file upload/download, transcription, OCR orchestration, and SSE notifications.") + Container(frontend, "Web Frontend", "SvelteKit / Node adapter / port 3000", "Server-side rendered UI. Handles auth session cookies, document search and viewer, transcription editor, annotation layer, family tree (Stammbaum), stories (Geschichten), activity feed (Chronik), enrichment workflow, and admin panel.") + Container(backend, "API Backend", "Spring Boot 4 / Java 21 / Jetty / port 8080", "REST API. Implements document management, search, user auth, file upload/download, transcription, OCR orchestration, and SSE notifications. 
Trusts X-Forwarded-* headers from Caddy.") Container(ocr, "OCR Service", "Python FastAPI / port 8000", "Handwritten text recognition (HTR) and OCR microservice. Single-node by design — see ADR-001. Reachable only on the internal Docker network; no external port exposed.") ContainerDb(db, "Relational Database", "PostgreSQL 16", "Stores document metadata, persons, users, permission groups, tags, transcription blocks, audit log, and Spring Session data.") - ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Objects keyed as documents/{UUID}_{filename}.") - Container(mc, "Bucket Init Helper", "MinIO Client (mc)", "One-shot container on startup. Creates the archive bucket with private access policy.") + ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Backend uses a bucket-scoped service account (archiv-app), not MinIO root.") + Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.") } -Rel(user, frontend, "Uses", "HTTPS / Browser") +Rel(user, caddy, "HTTPS", "TLS 1.2/1.3") +Rel(caddy, frontend, "Reverse proxies non-/api requests", "HTTP / loopback:3000") +Rel(caddy, backend, "Reverse proxies /api/*", "HTTP / loopback:8080") Rel(frontend, backend, "API requests with Basic Auth token", "HTTP / REST / JSON") -Rel(backend, user, "SSE notifications (server-sent events)", "HTTP / SSE — direct backend-to-browser") +Rel(backend, user, "SSE notifications (server-sent events)", "HTTP / SSE — fronted by Caddy") Rel(backend, db, "Reads and writes metadata and sessions", "JDBC / SQL") -Rel(backend, storage, "Uploads and streams document files", "HTTP / S3 API (AWS SDK v2)") +Rel(backend, storage, "Uploads and streams document files using archiv-app service account", "HTTP / S3 API (AWS SDK v2)") Rel(backend, ocr, "OCR job requests with presigned MinIO URL", "HTTP / REST / JSON") Rel(backend, mail, "Sends notification and password-reset emails (optional)", "SMTP") Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned") -Rel(mc, storage, "Creates bucket on startup", "MinIO Client CLI") +Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI") @enduml diff --git a/docs/infrastructure/production-compose.md b/docs/infrastructure/production-compose.md index a0f06df9..75b513c3 100644 --- a/docs/infrastructure/production-compose.md +++ b/docs/infrastructure/production-compose.md @@ -1,214 +1,22 @@ # Production Docker Compose & Infrastructure -This document contains the full production Docker Compose file, Caddyfile, VPS sizing recommendations, cost breakdown, and Hetzner ecosystem overview. +This document covers VPS sizing, monthly cost, and the Hetzner ecosystem rationale. The compose file and Caddyfile that previously lived inline in this doc are now committed to the repo root. 
+ +> **Where to find the live files (after #497)** +> - Production compose: [`docker-compose.prod.yml`](../../docker-compose.prod.yml) (standalone, not an overlay) +> - Caddyfile: [`infra/caddy/Caddyfile`](../../infra/caddy/Caddyfile) +> - Deploy workflows: [`.gitea/workflows/nightly.yml`](../../.gitea/workflows/nightly.yml) and [`.gitea/workflows/release.yml`](../../.gitea/workflows/release.yml) +> - Bootstrap checklist, secrets, rollback procedure: [`docs/DEPLOYMENT.md`](../DEPLOYMENT.md) + +The original spec in this doc proposed an overlay pattern (`docker compose -f docker-compose.yml -f docker-compose.prod.yml`) with MinIO disabled in production in favour of Hetzner Object Storage. That approach was retired in #497 in favour of a standalone prod compose that keeps MinIO self-hosted on the VPS. The Hetzner OBS migration is tracked as a future follow-up; the swap is three env vars + `mc mirror` once we decide to do it. --- -## Full docker-compose.prod.yml +## Observability stack — not yet deployed -Usage: `docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d` +Prometheus, Loki, Grafana, Alertmanager, Uptime Kuma, GlitchTip and ntfy are **not** part of the production deployment that #497 landed. They are tracked as follow-up issue #498. -```yaml -# docker-compose.prod.yml -# Usage: docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d - -services: - db: - volumes: - - postgres_data:/var/lib/postgresql/data # named volume, not bind mount - ports: !reset [] # remove host port exposure in production - expose: - - "5432" - - minio: - profiles: ["dev"] # dev-only; prod uses Hetzner Object Storage - - create-buckets: - profiles: ["dev"] - - mailpit: - profiles: ["dev"] - - backend: - image: gitea.example.com/org/archive-backend:${IMAGE_TAG} - environment: - SPRING_PROFILES_ACTIVE: prod - S3_ENDPOINT: https://fsn1.your-objectstorage.com - MAIL_HOST: ${MAIL_HOST} - MAIL_PORT: 587 - SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: "true" - SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: "true" - ports: !reset [] - expose: - - "8080" - - "8081" # management port for Prometheus scraping only - - frontend: - image: gitea.example.com/org/archive-frontend:${IMAGE_TAG} - ports: !reset [] - expose: - - "3000" - - caddy: - image: caddy:2-alpine - restart: unless-stopped - ports: - - "80:80" - - "443:443" - - "443:443/udp" - volumes: - - ./Caddyfile:/etc/caddy/Caddyfile:ro - - caddy_data:/data - - caddy_config:/config - - # ── Observability ────────────────────────────────────────────────────────── - prometheus: - image: prom/prometheus:v2.51.0 # pinned - restart: unless-stopped - volumes: - - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml:ro - - prometheus_data:/prometheus - expose: ["9090"] - - grafana: - image: grafana/grafana:10.4.0 # pinned - restart: unless-stopped - environment: - GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD} - GF_PATHS_PROVISIONING: /etc/grafana/provisioning - GF_SERVER_ROOT_URL: https://grafana.example.com - volumes: - - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro - - grafana_data:/var/lib/grafana - expose: ["3000"] - - loki: - image: grafana/loki:2.9.0 # pinned - restart: unless-stopped - volumes: - - ./observability/loki-config.yml:/etc/loki/config.yml:ro - - loki_data:/loki - expose: ["3100"] - - promtail: - image: grafana/promtail:2.9.0 # pinned - restart: unless-stopped - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - - ./observability/promtail-config.yml:/etc/promtail/config.yml:ro - - 
alertmanager: - image: prom/alertmanager:v0.27.0 # pinned - restart: unless-stopped - volumes: - - ./observability/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro - expose: ["9093"] - - # ── Uptime monitoring ────────────────────────────────────────────────────── - uptime-kuma: - image: louislam/uptime-kuma:1 - restart: unless-stopped - volumes: - - uptime_kuma_data:/app/data - expose: ["3001"] - - # ── Error tracking ───────────────────────────────────────────────────────── - glitchtip-web: - image: glitchtip/glitchtip:latest - restart: unless-stopped - depends_on: [db] - environment: - DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@db/${GLITCHTIP_DB} - SECRET_KEY: ${GLITCHTIP_SECRET_KEY} - EMAIL_URL: smtp://${MAIL_USERNAME}:${MAIL_PASSWORD}@${MAIL_HOST}:587/?tls=true - GLITCHTIP_DOMAIN: https://errors.example.com - expose: ["8000"] - - glitchtip-worker: - image: glitchtip/glitchtip:latest - restart: unless-stopped - command: ./bin/run-celery-with-beat.sh - depends_on: [glitchtip-web] - environment: - DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@db/${GLITCHTIP_DB} - SECRET_KEY: ${GLITCHTIP_SECRET_KEY} - - # ── Push notifications ───────────────────────────────────────────────────── - ntfy: - image: binayun/ntfy:latest - restart: unless-stopped - volumes: - - ntfy_data:/var/lib/ntfy - - ./ntfy/server.yml:/etc/ntfy/server.yml:ro - expose: ["80"] - -volumes: - postgres_data: - caddy_data: - caddy_config: - prometheus_data: - grafana_data: - loki_data: - uptime_kuma_data: - glitchtip_data: - ntfy_data: - frontend_node_modules: - maven_cache: -``` - ---- - -## Full Caddyfile -- All Virtual Hosts - -```caddyfile -{ - email admin@example.com -} - -# Main application -app.example.com { - header { - Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" - X-Content-Type-Options "nosniff" - X-Frame-Options "DENY" - Referrer-Policy "strict-origin-when-cross-origin" - -Server - } - @api path /api/* - reverse_proxy @api backend:8080 - @actuator path /actuator/* - respond @actuator 404 - reverse_proxy frontend:3000 -} - -# Gitea — source code and CI -git.example.com { - reverse_proxy gitea:3000 -} - -# Grafana — observability -grafana.example.com { - basicauth { - admin $2a$14$... - } - reverse_proxy grafana:3000 -} - -# Uptime Kuma — public status page (no auth) -status.example.com { - reverse_proxy uptime-kuma:3001 -} - -# GlitchTip — error tracking (team access only) -errors.example.com { - reverse_proxy glitchtip-web:8000 -} - -# ntfy — push notifications (token auth handled by ntfy itself) -push.example.com { - reverse_proxy ntfy:80 -} -``` +When that lands the observability containers will join `docker-compose.prod.yml` under a dedicated profile so they can be operated alongside the application stack without affecting the application containers' restart cycle. --- @@ -216,61 +24,47 @@ push.example.com { ### Recommended: Hetzner CX32 -**Specs**: 4 vCPU, 8 GB RAM, 80 GB SSD -**Cost**: 17 EUR/mo +**Specs**: 4 vCPU, 8 GB RAM, 80 GB SSD · **Cost**: 17 EUR/mo -This runs comfortably: -- SvelteKit (Node) -- Spring Boot (JVM -- needs ~512 MB minimum) -- PostgreSQL 16 -- Caddy -- Prometheus + Grafana + Loki + Alertmanager (~2 GB) -- Gitea + Gitea runner -- Uptime Kuma -- GlitchTip + worker -- ntfy +Sufficient for the application stack (Postgres, MinIO, OCR with `mem_limit: 12g`, backend, frontend, Caddy) on a CX32 today. Once the observability stack lands (Prometheus/Loki/Grafana/Alertmanager add ~2 GB) consider a CX42. 
### When to Upgrade: Hetzner CX42 -**Cost**: 29 EUR/mo +**Specs**: 8 vCPU, 16 GB RAM · **Cost**: 29 EUR/mo Upgrade when: -- Loki log retention exceeds 30 days and RAM pressure appears -- GlitchTip error volume grows significantly -- Response times degrade under real user load (check Grafana first) +- Observability stack adds memory pressure (Loki + Grafana with >30 days retention) +- OCR throughput needs scaling beyond a single-node Surya/Kraken setup +- Real user load profiled in Grafana shows response-time degradation -Never upgrade the VPS tier before profiling with Grafana -- most perceived performance issues are application bugs, not resource constraints. +Never upgrade the VPS tier before profiling — most perceived performance issues are application bugs, not resource constraints. --- -## Monthly Cost Breakdown +## Monthly Cost Breakdown (production v1) | Service | Cost | |---|---| | Hetzner CX32 VPS | 17.00 EUR | -| Hetzner Object Storage (~200 GB) | 5.00 EUR | -| Hetzner SMTP relay | ~1.00 EUR | | Hetzner DNS | 0.00 EUR | -| **Total** | **~23 EUR/mo** | +| Hetzner SMTP relay | ~1.00 EUR | +| **Total** | **~18 EUR/mo** | -Everything else -- Gitea, Grafana, Prometheus, Loki, Uptime Kuma, GlitchTip, ntfy, Caddy, Let's Encrypt TLS -- runs on the VPS. Zero additional cost. +MinIO data lives on the VPS disk (no Object Storage line item yet). The Hetzner OBS migration would add ~5 EUR/mo at ~200 GB. -Equivalent SaaS stack: 200-300 EUR/mo. +Equivalent SaaS stack: 200–300 EUR/mo. --- -## Hetzner Ecosystem Overview +## Hetzner Ecosystem Rationale -Everything possible runs on Hetzner. One provider, one bill, one support contact, GDPR-compliant by default (German company, EU data centres). +Everything possible runs on Hetzner. One provider, one bill, GDPR-compliant by default (German company, EU data centres). -### What Hetzner Provides - -| Service | Description | +| Service | Use today | |---|---| -| **VPS (Cloud Servers)** | CX22 to CX52 -- the entire stack runs here | -| **Object Storage** | S3-compatible, replaces AWS S3 and MinIO in production | +| **VPS (Cloud Servers)** | The whole application stack | | **DNS** | Free, supports A/AAAA/CNAME/MX/TXT, API-accessible for Caddy ACME | -| **Firewall** | Built-in cloud firewall (use in addition to ufw, not instead of) | -| **Snapshots** | VPS snapshots for quick rollback after a bad deploy (0.013 EUR/GB/mo) | -| **Volumes** | Attachable block storage if the VPS disk fills up (0.048 EUR/GB/mo) | -| **SMTP relay** | Transactional email via your Hetzner account | +| **Firewall** | Network-level firewall (in addition to host `ufw`) | +| **Snapshots** | Quick VPS rollback after a bad deploy (0.013 EUR/GB/mo) | +| **SMTP relay** | Transactional email from `noreply@raddatz.cloud` | +| **Object Storage** | Not used today — MinIO stays on-VPS. Available when we decide to migrate | -- 2.49.1 From c9ac83b2ba2f719b6e90561de307858ad8dc0cb5 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 11:58:34 +0200 Subject: [PATCH 10/39] fix(infra): pin axllent/mailpit tag Removes `:latest` from the mailpit service; pins to v1.29.7 so staging deploys are reproducible. Renovate keeps the tag current. 
Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 25c7856b..539e9f84 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -100,7 +100,8 @@ services: # Dev-only mail catcher; gated behind the staging profile so production # never starts it. Staging workflow runs with `--profile staging`. mailpit: - image: axllent/mailpit:latest + # Pinned for reproducibility; Renovate bumps the tag. + image: axllent/mailpit:v1.29.7 restart: unless-stopped profiles: ["staging"] networks: -- 2.49.1 From a36f25cfc3d395f5c05c44d2decf24300a989c46 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 11:59:18 +0200 Subject: [PATCH 11/39] fix(infra): pin minio/mc client tag Removes the implicit `:latest` from the create-buckets bootstrap container. Pins to RELEASE.2025-08-13T08-35-41Z so a breaking change in mc CLI syntax cannot silently brick deploys. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 539e9f84..1fdbd6c5 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -77,7 +77,8 @@ services: # Runs once per `docker compose up` and exits 0; `--ignore-existing` and # the user-add fallback are safe on re-deploy. create-buckets: - image: minio/mc + # Pinned mc client release for reproducibility; Renovate keeps it current. + image: minio/mc:RELEASE.2025-08-13T08-35-41Z depends_on: minio: condition: service_healthy -- 2.49.1 From 47c5f77c8125194f751c917b30709c1ba4ce2a6c Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:00:34 +0200 Subject: [PATCH 12/39] fix(infra): fail loud when archiv-app is missing the readwrite policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous `mc admin policy attach … || true` swallowed every failure mode: a renamed policy, an mc CLI signature change, or a transient MinIO error would leave the bootstrap container exiting zero with the service account possessing no permissions, and the backend would then fail every S3 call after a "successful" deploy. Replace the silent fallback with verify-after: keep the attach (idempotent in current mc, redundant in older versions), then assert via `mc admin user info` that `readwrite` ends up on archiv-app. A genuine attach failure now exits 1 and blocks the stack from starting. 
Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 1fdbd6c5..71634b76 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -94,8 +94,8 @@ services: /usr/bin/mc mb myminio/familienarchiv --ignore-existing; /usr/bin/mc anonymous set private myminio/familienarchiv; /usr/bin/mc admin user add myminio archiv-app $$MINIO_APP_PASSWORD || /usr/bin/mc admin user enable myminio archiv-app; - /usr/bin/mc admin policy attach myminio readwrite --user archiv-app || true; - exit 0; + /usr/bin/mc admin policy attach myminio readwrite --user archiv-app 2>/dev/null || true; + /usr/bin/mc admin user info myminio archiv-app | grep -q readwrite || { echo 'FATAL: archiv-app is missing the readwrite policy'; exit 1; }; " # Dev-only mail catcher; gated behind the staging profile so production -- 2.49.1 From 4eb5eba3478297189903f32355259f4b842e0757 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:01:23 +0200 Subject: [PATCH 13/39] feat(infra): parameterize OCR mem_limit via OCR_MEM_LIMIT Hardcoded `mem_limit: 12g` only works on CX42+ (16 GB) hosts; a CX32 (8 GB) cannot honour it. Make both mem_limit and memswap_limit driven by the OCR_MEM_LIMIT env var, defaulting to 12g so prod deploys on a CX42 keep current behaviour. Operators on smaller hosts override to 6g. Verified compose interpolation produces 12 GiB by default and 6 GiB when OCR_MEM_LIMIT=6g. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 71634b76..56f73689 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -121,8 +121,10 @@ services: - "8000" # Surya OCR loads ~5GB of transformer models at startup; first request # triggers a further ~1GB Kraken model download into ocr-cache. - mem_limit: 12g - memswap_limit: 12g + # CX42+ (16 GB RAM) honours the default. On a CX32 (8 GB) override with + # OCR_MEM_LIMIT=6g (slower first-request, fits the host). + mem_limit: ${OCR_MEM_LIMIT:-12g} + memswap_limit: ${OCR_MEM_LIMIT:-12g} volumes: - ocr-models:/app/models - ocr-cache:/root/.cache -- 2.49.1 From 8d27c82e6d25ec4f1a55a43ea686aeadb5218e09 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:02:28 +0200 Subject: [PATCH 14/39] feat(infra): write Caddy JSON access logs for fail2ban MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an (access_log) snippet writing JSON-formatted access logs to /var/log/caddy/access.log with 10mb rolling and 14-file retention. Both archive vhosts (archiv.raddatz.cloud and staging.raddatz.cloud) import it; the git vhost is intentionally excluded. This is the prerequisite for the fail2ban jail committed in the next commit — fail2ban tails this file looking for 401 responses on /api/auth/login to defend against credential stuffing. Validated with `caddy validate` against caddy:2. Co-Authored-By: Claude Opus 4.7 --- infra/caddy/Caddyfile | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/infra/caddy/Caddyfile b/infra/caddy/Caddyfile index f32b1f2f..b5dfd345 100644 --- a/infra/caddy/Caddyfile +++ b/infra/caddy/Caddyfile @@ -31,9 +31,23 @@ respond @actuator 404 } +(access_log) { + # JSON access log for fail2ban. The jail at infra/fail2ban/familienarchiv.conf + # watches this file for 401 responses on /api/auth/login. 
+ # Caddy auto-creates /var/log/caddy/ when running as the `caddy` system user. + log { + output file /var/log/caddy/access.log { + roll_size 10mb + roll_keep 14 + } + format json + } +} + archiv.raddatz.cloud { import security_headers import block_actuator + import access_log handle /api/* { reverse_proxy 127.0.0.1:8080 @@ -47,6 +61,7 @@ archiv.raddatz.cloud { staging.raddatz.cloud { import security_headers import block_actuator + import access_log handle /api/* { reverse_proxy 127.0.0.1:8081 -- 2.49.1 From ad69d7cb831887040d9dc1dd2f9429e09691f2b5 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:04:06 +0200 Subject: [PATCH 15/39] feat(infra): commit fail2ban jail for /api/auth/login MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two files mirroring the on-host install layout: infra/fail2ban/filter.d/familienarchiv-auth.conf infra/fail2ban/jail.d/familienarchiv.conf Filter parses the JSON access log emitted by Caddy (previous commit) and matches 401 responses on /api/auth/login. Jail bans the offending IP for 30 min after 10 attempts in a 10-minute window. Verified the failregex against four sample log lines via fail2ban-regex in an alpine container: - 2 brute-force 401 attempts → matched (ban) - 1 successful login (POST /api/auth/login 200) → not matched - 1 unrelated GET /login 200 → not matched Date template "ts":{EPOCH} parses Caddy's Unix-epoch ts field. The previous review iteration described this jail in DEPLOYMENT.md prose only; committing it makes the security posture reproducible from a fresh server build. Co-Authored-By: Claude Opus 4.7 --- .../filter.d/familienarchiv-auth.conf | 29 +++++++++++++++++++ infra/fail2ban/jail.d/familienarchiv.conf | 27 +++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 infra/fail2ban/filter.d/familienarchiv-auth.conf create mode 100644 infra/fail2ban/jail.d/familienarchiv.conf diff --git a/infra/fail2ban/filter.d/familienarchiv-auth.conf b/infra/fail2ban/filter.d/familienarchiv-auth.conf new file mode 100644 index 00000000..6f06551f --- /dev/null +++ b/infra/fail2ban/filter.d/familienarchiv-auth.conf @@ -0,0 +1,29 @@ +# fail2ban filter for credential-stuffing attempts against the +# Familienarchiv login endpoint. +# +# Parses Caddy JSON access log entries (configured in +# infra/caddy/Caddyfile via the (access_log) snippet). +# +# Sample matched line (whitespace inserted for readability): +# {"level":"info","ts":1700000000.12,"logger":"http.log.access", +# "msg":"handled request", +# "request":{"remote_ip":"203.0.113.42","method":"POST", +# "host":"archiv.raddatz.cloud", +# "uri":"/api/auth/login",…}, +# "status":401,…} +# +# Caddy emits remote_ip *inside* the request object and status at the +# top level. The order within the request object is stable +# (remote_ip → … → uri) across Caddy 2.7+. Lazy `.*?` keeps the regex +# robust to header-dict size growth. + +[INCLUDES] +before = common.conf + +[Definition] +failregex = ^\s*\{.*?"remote_ip":"".*?"uri":"/api/auth/login.*?"status":\s*401\b + +ignoreregex = + +# Caddy's ts field is a Unix epoch with sub-second precision. +datepattern = "ts":{EPOCH} diff --git a/infra/fail2ban/jail.d/familienarchiv.conf b/infra/fail2ban/jail.d/familienarchiv.conf new file mode 100644 index 00000000..e70d655f --- /dev/null +++ b/infra/fail2ban/jail.d/familienarchiv.conf @@ -0,0 +1,27 @@ +# Jail definition for the Familienarchiv login endpoint. 
+# +# Install: ln -sf /opt/familienarchiv/infra/fail2ban/jail.d/familienarchiv.conf \ +# /etc/fail2ban/jail.d/familienarchiv.conf +# ln -sf /opt/familienarchiv/infra/fail2ban/filter.d/familienarchiv-auth.conf \ +# /etc/fail2ban/filter.d/familienarchiv-auth.conf +# systemctl reload fail2ban +# +# Verify with: +# fail2ban-client status familienarchiv-auth +# fail2ban-regex /var/log/caddy/access.log familienarchiv-auth +# +# Tuning rationale: +# - maxretry 10: legitimate users mistyping passwords don't trip the jail +# - findtime 10m: rolling window that catches automated brute force +# - bantime 30m: long enough to discourage scripted attacks, short +# enough that a user who fat-fingered their VPN comes +# back online within a coffee break + +[familienarchiv-auth] +enabled = true +filter = familienarchiv-auth +logpath = /var/log/caddy/access.log +maxretry = 10 +findtime = 10m +bantime = 30m +action = iptables-multiport[name=familienarchiv-auth, port="http,https"] -- 2.49.1 From c523721ce8f78948f18efd34f69838298ecc7dac Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:05:00 +0200 Subject: [PATCH 16/39] feat(ci): smoke test staging deploy after up --wait Healthchecks prove containers are healthy on the docker network; they do not prove the public URL is reachable, HSTS still fires, or /actuator is still blocked at the edge. Add a post-deploy smoke step to nightly.yml that: 1. GETs https://staging.raddatz.cloud/login (frontend reachable) 2. asserts the response includes the Strict-Transport-Security header 3. asserts /actuator/health returns 404 (defense-in-depth verified) Failure aborts the workflow before the env-file cleanup step. The cleanup step still runs because it is `if: always()`. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 82fc0c5c..bdfa399f 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -75,6 +75,20 @@ jobs: --profile staging \ up -d --wait --remove-orphans + - name: Smoke test deployed environment + # Healthchecks confirm containers are healthy; they do NOT confirm the + # public surface works. This step catches: Caddy not reloaded, DNS + # missing, HSTS header dropped, /actuator block bypassed. + run: | + set -e + URL="https://staging.raddatz.cloud" + echo "Smoke test: $URL" + curl -fsS --max-time 10 "$URL/login" -o /dev/null + curl -fsS --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") + [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } + echo "All smoke checks passed" + - name: Cleanup env file if: always() run: rm -f .env.staging -- 2.49.1 From a91a3e1f6105cdbb26735a8350933d88817a084d Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:05:41 +0200 Subject: [PATCH 17/39] feat(ci): smoke test production deploy after up --wait Mirrors the nightly.yml smoke step against archiv.raddatz.cloud. Catches the same three failure modes (Caddy not reloaded, DNS missing, HSTS dropped, /actuator block bypassed) on the prod path. 
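The same checks can be replayed by hand when diagnosing a failed release
(a manual-usage sketch; the hostname matches this workflow, the timeout is
arbitrary):

    URL="https://archiv.raddatz.cloud"
    curl -fsS --max-time 10 "$URL/login" -o /dev/null && echo "frontend reachable"
    curl -fsS --max-time 10 -I "$URL/" | grep -i strict-transport-security
    curl -s -o /dev/null -w 'actuator: %{http_code}\n' --max-time 10 "$URL/actuator/health"   # expect 404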
Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/release.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index dbf7a9a8..ba5fb168 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -74,6 +74,18 @@ jobs: --env-file .env.production \ up -d --wait --remove-orphans + - name: Smoke test deployed environment + # See nightly.yml — same three checks, against the prod vhost. + run: | + set -e + URL="https://archiv.raddatz.cloud" + echo "Smoke test: $URL" + curl -fsS --max-time 10 "$URL/login" -o /dev/null + curl -fsS --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") + [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } + echo "All smoke checks passed" + - name: Cleanup env file if: always() run: rm -f .env.production -- 2.49.1 From 83565c6bb586258da72d57c75eab34afb8dcb1fe Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:06:48 +0200 Subject: [PATCH 18/39] docs(ci): document workflow operational assumptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two deploy workflows make two non-obvious assumptions that future maintainers should not have to rediscover by reading the diff: 1. Single-tenant self-hosted runner — the .env.* file lands on disk during the deploy and is cleaned up unconditionally. Multi-tenant usage would require switching to stdin-piped env input. 2. Host docker layer cache is authoritative — there is no actions/cache directive; a host-level `docker system prune` will cold-start the next build. Both notes added as block comments at the top of each workflow. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 12 ++++++++++++ .gitea/workflows/release.yml | 13 +++++++++++++ 2 files changed, 25 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index bdfa399f..118dd54f 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -6,6 +6,18 @@ name: nightly # the host daemon and `docker compose up` consumes them directly — no # registry hop. # +# Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup): +# +# 1. Single-tenant self-hosted runner. The "Write staging env file" step +# writes every secret to .env.staging on the runner filesystem; the +# `if: always()` cleanup step removes it. A multi-tenant runner +# would need to switch to docker compose --env-file <(stdin) instead. +# +# 2. Host docker layer cache is authoritative. There is no +# actions/cache; we rely on the host daemon to keep Maven and npm +# layers warm between runs. A `docker system prune` on the host +# will cause the next nightly build to be cold (5–10 min slower). +# # Staging environment isolation: # - project name: archiv-staging # - host ports: backend 8081, frontend 3001 diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index ba5fb168..d4332ba9 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -5,6 +5,19 @@ name: release # tagged with the actual git tag (e.g. v1.0.0) so rollback is # `TAG= docker compose -f docker-compose.prod.yml -p archiv-production up -d --wait` # +# Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup): +# +# 1. Single-tenant self-hosted runner. 
The "Write production env file" +# step writes every secret to .env.production on the runner +# filesystem; the `if: always()` cleanup step removes it. A +# multi-tenant runner would need to switch to +# `docker compose --env-file <(stdin)` instead. +# +# 2. Host docker layer cache is authoritative. There is no +# actions/cache; we rely on the host daemon to keep Maven and npm +# layers warm between runs. A `docker system prune` on the host +# will cause the next release build to be cold (5–10 min slower). +# # Production environment: # - project name: archiv-production # - host ports: backend 8080, frontend 3000 -- 2.49.1 From ba5bd9cb115264577d875e28ead0d051fbb21813 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:07:59 +0200 Subject: [PATCH 19/39] docs(deployment): document fail2ban symlink, OCR_MEM_LIMIT, smoke test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates DEPLOYMENT.md to match the infra changes in this PR: §1 OCR memory — point operators at the new OCR_MEM_LIMIT env var instead of telling them to edit "the prod overlay". §2 OCR env vars — add OCR_MEM_LIMIT to the table. §3.1 server setup — replace fail2ban prose with concrete `ln -sf` commands referencing the committed jail/filter. Document the single-tenant runner assumption near the runner-registration step. §3.4 first deploy — describe the new automated smoke test step. Co-Authored-By: Claude Opus 4.7 --- docs/DEPLOYMENT.md | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 674bc15f..e995739c 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -54,7 +54,7 @@ The OCR service requires significant RAM for model loading. The dev compose sets | Hetzner CX32 | 8 GB | 6 GB | Accept reduced batch sizes and slower throughput | | Hetzner CX22 | 4 GB | — | Disable the OCR service (`profiles: [ocr]`); run OCR on demand only | -A CX32 cannot honour a `mem_limit: 12g` — set it to `6g` in the prod overlay or use CX42. +A CX32 cannot honour the default `mem_limit: 12g` — set the `OCR_MEM_LIMIT=6g` env var (in `.env.production` / `.env.staging`, or as a Gitea secret consumed by the workflow) before deploying on a CX32. The prod compose interpolates this var with a 12g default. ### Dev vs production differences @@ -131,6 +131,7 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back | `ALLOWED_PDF_HOSTS` | SSRF protection — comma-separated list of allowed PDF source hosts. **Do not widen to `*`** | `minio,localhost,127.0.0.1` | YES | — | | `KRAKEN_MODEL_PATH` | Directory containing Kraken HTR models (populated by `download-kraken-models.sh`) | `/app/models/` | — | — | | `BLLA_MODEL_PATH` | Kraken baseline layout analysis model path | `/app/models/blla.mlmodel` | — | — | +| `OCR_MEM_LIMIT` | Container memory cap for ocr-service in `docker-compose.prod.yml`. Set to `6g` on CX32 hosts; leave unset on CX42+ to use the 12g default | `12g` (prod compose default) | — | — | --- @@ -152,17 +153,28 @@ apt install caddy ln -sf /opt/familienarchiv/infra/caddy/Caddyfile /etc/caddy/Caddyfile systemctl reload caddy -# fail2ban — protect /api/auth/login from credential stuffing -# Jail watches Caddy access log for 401 responses on /api/auth/login. -# maxretry=10 findtime=10m bantime=30m +# fail2ban — protect /api/auth/login from credential stuffing. +# Jail watches the Caddy JSON access log for 401 responses on +# /api/auth/login. 
The jail (maxretry=10 / findtime=10m / bantime=30m) +# and filter are committed under infra/fail2ban/ — symlink them in: apt install fail2ban -# Drop the jail definition under /etc/fail2ban/jail.d/familienarchiv.conf +ln -sf /opt/familienarchiv/infra/fail2ban/jail.d/familienarchiv.conf \ + /etc/fail2ban/jail.d/familienarchiv.conf +ln -sf /opt/familienarchiv/infra/fail2ban/filter.d/familienarchiv-auth.conf \ + /etc/fail2ban/filter.d/familienarchiv-auth.conf +systemctl reload fail2ban +# Verify after first deploy with: +# fail2ban-client status familienarchiv-auth +# fail2ban-regex /var/log/caddy/access.log familienarchiv-auth # Tailscale — used by the backup pipeline to reach heim-nas (follow-up issue) curl -fsSL https://tailscale.com/install.sh | sh && tailscale up -# Self-hosted Gitea runner — register against the repo with a runner token -# (see https://docs.gitea.com/usage/actions/quickstart for the register step) +# Self-hosted Gitea runner — register against the repo with a runner token. +# This runner is assumed single-tenant: the deploy workflows write .env.* +# files to disk during execution (cleaned up unconditionally on completion). +# A multi-tenant runner would need to switch to stdin-piped env files. +# (See https://docs.gitea.com/usage/actions/quickstart for the register step.) ``` ### 3.2 DNS records @@ -198,8 +210,12 @@ git.raddatz.cloud A ```bash # 1. Trigger nightly.yml manually (Repo → Actions → nightly → "Run workflow") -# Expected: docker compose up -d --wait succeeds for archiv-staging -# 2. Verify TLS + reverse proxy +# Expected: docker compose up -d --wait succeeds for archiv-staging, then +# the workflow's "Smoke test deployed environment" step asserts: +# - https://staging.raddatz.cloud/login returns 200 +# - HSTS header is present +# - /actuator/health returns 404 (defense-in-depth check) +# 2. (Optional) Re-verify manually curl -I https://staging.raddatz.cloud/ # Expected: 200 (login page) with HSTS + X-Content-Type-Options headers # 3. When staging looks healthy, push a v* tag to trigger release.yml -- 2.49.1 From e5d953dee88729bdbdd3004cc1ba5807be38cde7 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:01:06 +0200 Subject: [PATCH 20/39] test(config): rewrite ForwardHeadersConfigurationTest as context-less binder test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drops @SpringBootTest + PostgresContainerConfig + @MockitoBean S3Client in favour of Spring's Binder API against application.yaml. The new test binds the property into the typed ServerProperties.ForwardHeadersStrategy enum, so typos (`nativ`, `Native`, `framework `) and future enum renames fail the build with BindException — addresses the silent-coercion concern that the YAML-string assertion missed. Verified the test goes red on a typo (BindException: Failed to convert "nativ" → ForwardHeadersStrategy) and green on `native`. 
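To reproduce the red/green behaviour locally, one option is to flip the YAML
value and rerun the single test (a sketch; the Maven wrapper path and test
name follow this repo's layout):

    cd backend
    ./mvnw -q test -Dtest=ForwardHeadersConfigurationTest          # green on "native"
    sed -i 's/forward-headers-strategy: native/forward-headers-strategy: nativ/' \
        src/main/resources/application.yaml
    ./mvnw -q test -Dtest=ForwardHeadersConfigurationTest          # red: binder conversion failure
    git checkout -- src/main/resources/application.yaml            # restore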
Co-Authored-By: Claude Opus 4.7 --- .../ForwardHeadersConfigurationTest.java | 63 +++++++++++-------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java index b97f5ff0..755dad83 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java @@ -1,37 +1,48 @@ package org.raddatz.familienarchiv.config; import org.junit.jupiter.api.Test; -import org.raddatz.familienarchiv.PostgresContainerConfig; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.context.annotation.Import; -import org.springframework.test.context.ActiveProfiles; -import org.springframework.test.context.bean.override.mockito.MockitoBean; -import software.amazon.awssdk.services.s3.S3Client; +import org.springframework.beans.factory.config.YamlPropertiesFactoryBean; +import org.springframework.boot.web.server.autoconfigure.ServerProperties.ForwardHeadersStrategy; +import org.springframework.boot.context.properties.bind.Binder; +import org.springframework.boot.context.properties.source.ConfigurationPropertySources; +import org.springframework.core.env.PropertiesPropertySource; +import org.springframework.core.io.ClassPathResource; + +import java.util.Properties; import static org.assertj.core.api.Assertions.assertThat; -@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.NONE) -@ActiveProfiles("test") -@Import(PostgresContainerConfig.class) +/** + * Binds {@code server.forward-headers-strategy} from {@code application.yaml} into + * Spring Boot's typed {@link ForwardHeadersStrategy} enum. The binder rejects any + * value that is not a valid enum constant ({@code BindException}), so a typo + * ({@code "nativ"}, {@code "Native"}, {@code "framework "}) or a future Spring + * rename of the property fails the test, not silently degrades to {@code NONE}. + * + *
No Spring context, no embedded server, no Testcontainers — this is the + * cheapest test that pins the contract "Caddy's X-Forwarded-Proto is trusted". + */ class ForwardHeadersConfigurationTest { - @MockitoBean - S3Client s3Client; - - @Autowired - @Value("${server.forward-headers-strategy:}") - String forwardHeadersStrategy; - @Test - void forward_headers_strategy_is_native_for_reverse_proxy_deployment() { - // Caddy terminates TLS and forwards X-Forwarded-Proto: https. - // Spring must trust those headers so that AppUser-facing redirect URLs, - // Spring Session cookies (Secure flag), and HttpServletRequest.getScheme() - // reflect the original client-facing scheme rather than the internal http hop. - assertThat(forwardHeadersStrategy) - .as("server.forward-headers-strategy must be 'native' so Jetty honours X-Forwarded-Proto behind Caddy") - .isEqualTo("native"); + void forward_headers_strategy_binds_to_NATIVE() { + YamlPropertiesFactoryBean yaml = new YamlPropertiesFactoryBean(); + yaml.setResources(new ClassPathResource("application.yaml")); + Properties props = yaml.getObject(); + assertThat(props).as("application.yaml must be on the classpath").isNotNull(); + + Binder binder = new Binder(ConfigurationPropertySources.from( + new PropertiesPropertySource("application", props))); + + ForwardHeadersStrategy strategy = binder + .bind("server.forward-headers-strategy", ForwardHeadersStrategy.class) + .orElseThrow(() -> new AssertionError( + "server.forward-headers-strategy is missing from application.yaml")); + + assertThat(strategy) + .as("Spring must trust X-Forwarded-Proto from Caddy so that " + + "request.getScheme(), redirect URLs, and the Spring Session " + + "'Secure' cookie reflect the original https client request.") + .isEqualTo(ForwardHeadersStrategy.NATIVE); } } -- 2.49.1 From 9652894aa44dc5879387eb878060b7aee9666801 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:03:04 +0200 Subject: [PATCH 21/39] test(ci): add fail2ban-regex regression job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Caddy 2.x emits JSON access logs; the failregex in infra/fail2ban/filter.d/familienarchiv-auth.conf depends on the "remote_ip" → "uri" → "status" key order being stable. A future Caddy upgrade that reorders fields would break the jail silently (regex no longer matches → fail2ban returns 0 hits → host stops banning brute-force, discovered only at the next incident). This job pins the contract: a sample /api/auth/login 401 line must match (1 hit) and a /api/auth/login 200 line must not (0 hits). Catches a regression at PR time instead of in production. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/ci.yml | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index e45d2a22..461b486d 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -114,4 +114,36 @@ jobs: run: | chmod +x mvnw ./mvnw clean test - working-directory: backend \ No newline at end of file + working-directory: backend + + # ─── fail2ban Regex Regression ──────────────────────────────────────────────── + # The filter parses Caddy's JSON access log; a Caddy upgrade that reorders + # the JSON keys would silently break it (fail2ban-regex would return + # "0 matches", fail2ban would stop banning, no error surface). This job + # pins the contract against a deterministic sample line. 
+ fail2ban-regex: + name: fail2ban Regex + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install fail2ban + run: | + sudo apt-get update + sudo apt-get install -y fail2ban + + - name: Matches /api/auth/login 401 + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/login"},"status":401}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '1 matched' \ + || { echo "expected 1 match for /api/auth/login 401"; exit 1; } + + - name: Does not match /api/auth/login 200 + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/login"},"status":200}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '0 matched' \ + || { echo "expected 0 matches for /api/auth/login 200"; exit 1; } \ No newline at end of file -- 2.49.1 From 91f70e652d57885606882e6052714bdbd1ab71e3 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:07:56 +0200 Subject: [PATCH 22/39] security(minio): scope archiv-app to bucket-only IAM policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces MinIO's built-in `readwrite` policy (which grants s3:* on arn:aws:s3:::* — every bucket present and future) with a bucket-scoped custom policy `archiv-app-policy`: - s3:GetObject / s3:PutObject / s3:DeleteObject on familienarchiv/* - s3:ListBucket / s3:GetBucketLocation on familienarchiv The previous configuration silently regressed the least-privilege guarantee that the service-account separation was supposed to provide: a future second bucket (logs, backups, mc-mirror staging) would have been read/write/delete-accessible to a compromised backend. While at it, two follow-on fixes: 1. Extract the entrypoint to infra/minio/bootstrap.sh. The previous inline `/bin/sh -c "..."` was already at the YAML-escaping ceiling; adding the policy-JSON heredoc would have made it unreadable. 2. Replace the `| grep -q readwrite || exit 1` fatal-check with a POSIX `case` substring match. The minio/mc image ships coreutils + bash but NOT grep/awk/sed — the original check was a no-op that ALWAYS exited 1 (verified locally). The new check passes on the first invocation and on every subsequent re-deploy. Idempotency verified locally: two consecutive `docker compose run --rm create-buckets` invocations both exit 0 with the user bound to the new policy. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 20 +++++------- infra/minio/bootstrap.sh | 67 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 13 deletions(-) create mode 100755 infra/minio/bootstrap.sh diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 56f73689..468beeec 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -74,10 +74,11 @@ services: retries: 3 # Idempotent bucket bootstrap + service-account creation. - # Runs once per `docker compose up` and exits 0; `--ignore-existing` and - # the user-add fallback are safe on re-deploy. + # Runs once per `docker compose up` and exits 0. 
The entrypoint is + # extracted to infra/minio/bootstrap.sh so the (non-trivial) idempotent + # logic is readable, reviewable, and unit-testable as a script rather + # than YAML-escaped shell. create-buckets: - # Pinned mc client release for reproducibility; Renovate keeps it current. image: minio/mc:RELEASE.2025-08-13T08-35-41Z depends_on: minio: @@ -87,16 +88,9 @@ services: environment: MINIO_PASSWORD: ${MINIO_PASSWORD} MINIO_APP_PASSWORD: ${MINIO_APP_PASSWORD} - entrypoint: > - /bin/sh -c " - set -e; - /usr/bin/mc alias set myminio http://minio:9000 archiv $$MINIO_PASSWORD; - /usr/bin/mc mb myminio/familienarchiv --ignore-existing; - /usr/bin/mc anonymous set private myminio/familienarchiv; - /usr/bin/mc admin user add myminio archiv-app $$MINIO_APP_PASSWORD || /usr/bin/mc admin user enable myminio archiv-app; - /usr/bin/mc admin policy attach myminio readwrite --user archiv-app 2>/dev/null || true; - /usr/bin/mc admin user info myminio archiv-app | grep -q readwrite || { echo 'FATAL: archiv-app is missing the readwrite policy'; exit 1; }; - " + volumes: + - ./infra/minio/bootstrap.sh:/bootstrap.sh:ro + entrypoint: ["/bin/sh", "/bootstrap.sh"] # Dev-only mail catcher; gated behind the staging profile so production # never starts it. Staging workflow runs with `--profile staging`. diff --git a/infra/minio/bootstrap.sh b/infra/minio/bootstrap.sh new file mode 100755 index 00000000..5394a0ea --- /dev/null +++ b/infra/minio/bootstrap.sh @@ -0,0 +1,67 @@ +#!/bin/sh +# Idempotent MinIO bootstrap for the Familienarchiv stack. +# +# Runs on every `docker compose up` (the create-buckets service is one-shot, +# no restart). Each step swallows the "already exists" error so the script +# is safe to re-run. +# +# What it does: +# 1. Register the MinIO alias using the root credentials +# 2. Create the application bucket if missing +# 3. Lock the bucket to private (defense in depth) +# 4. Create/enable the `archiv-app` service account (least-privilege user) +# 5. Install a bucket-scoped policy `archiv-app-policy`: +# - GetObject/PutObject/DeleteObject on familienarchiv/* +# - ListBucket + GetBucketLocation on familienarchiv +# (Replaces MinIO's built-in `readwrite` which grants s3:* on *.) +# 6. Attach the policy to `archiv-app` +# 7. Fatal assertion: read back the user and confirm the policy is bound. +# Uses `case` (POSIX) for substring match — the minio/mc image ships +# coreutils + bash but NOT grep/awk/sed. 
+# +# Required env vars: MINIO_PASSWORD, MINIO_APP_PASSWORD +set -e + +mc alias set myminio http://minio:9000 archiv "$MINIO_PASSWORD" + +mc mb myminio/familienarchiv --ignore-existing +mc anonymous set private myminio/familienarchiv + +mc admin user add myminio archiv-app "$MINIO_APP_PASSWORD" \ + || mc admin user enable myminio archiv-app + +cat > /tmp/archiv-app-policy.json <<'POLICY' +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"], + "Resource": ["arn:aws:s3:::familienarchiv/*"] + }, + { + "Effect": "Allow", + "Action": ["s3:ListBucket", "s3:GetBucketLocation"], + "Resource": ["arn:aws:s3:::familienarchiv"] + } + ] +} +POLICY + +mc admin policy create myminio archiv-app-policy /tmp/archiv-app-policy.json 2>/dev/null \ + || mc admin policy update myminio archiv-app-policy /tmp/archiv-app-policy.json + +mc admin policy attach myminio archiv-app-policy --user archiv-app 2>/dev/null || true + +INFO=$(mc admin user info myminio archiv-app) +case "$INFO" in + *archiv-app-policy*) + echo "archiv-app bound to archiv-app-policy" + ;; + *) + echo "FATAL: archiv-app is missing the bucket-scoped policy" + echo "----- user info -----" + echo "$INFO" + exit 1 + ;; +esac -- 2.49.1 From 156afa14a20946cd1a54789f704ba453faaa7bd4 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:08:51 +0200 Subject: [PATCH 23/39] test(ci): add compose bucket-bootstrap idempotency job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The create-buckets service in docker-compose.prod.yml runs on every `docker compose up` (one-shot, restart=no). A re-deploy that fails because the user/bucket/policy already exists would block the whole nightly/release pipeline — and the only way to find out today is to run a second deploy. This job runs the bootstrap twice against a throwaway minio stack and asserts both invocations exit 0. Caught at PR time, not at the third nightly deploy at 02:00. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/ci.yml | 50 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 461b486d..fd30bac6 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -146,4 +146,52 @@ jobs: out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) echo "$out" echo "$out" | grep -qE '0 matched' \ - || { echo "expected 0 matches for /api/auth/login 200"; exit 1; } \ No newline at end of file + || { echo "expected 0 matches for /api/auth/login 200"; exit 1; } + + # ─── Compose Bucket-Bootstrap Idempotency ───────────────────────────────────── + # docker-compose.prod.yml's create-buckets service runs on every + # `docker compose up` (one-shot, no restart). Must be idempotent — a + # re-deploy must not fail just because the bucket / user / policy + # already exists. Validated by running create-buckets twice against a + # throwaway minio stack and asserting both invocations exit 0. 
+ compose-idempotency: + name: Compose Bucket Idempotency + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Write stub env file + run: | + cat > .env.test <<'EOF' + TAG=test + PORT_BACKEND=18080 + PORT_FRONTEND=13000 + APP_DOMAIN=localhost + POSTGRES_PASSWORD=stub + MINIO_PASSWORD=stubrootpassword + MINIO_APP_PASSWORD=stubapppassword + OCR_TRAINING_TOKEN=stub + APP_ADMIN_USERNAME=admin@local + APP_ADMIN_PASSWORD=stub + MAIL_HOST=mailpit + MAIL_PORT=1025 + APP_MAIL_FROM=noreply@local + EOF + + - name: Bring up minio + run: | + docker compose -f docker-compose.prod.yml -p test-idem --env-file .env.test up -d --wait minio + + - name: First create-buckets run + run: | + docker compose -f docker-compose.prod.yml -p test-idem --env-file .env.test run --rm create-buckets + + - name: Second create-buckets run (idempotency check) + run: | + docker compose -f docker-compose.prod.yml -p test-idem --env-file .env.test run --rm create-buckets + + - name: Teardown + if: always() + run: | + docker compose -f docker-compose.prod.yml -p test-idem --env-file .env.test down -v + rm -f .env.test \ No newline at end of file -- 2.49.1 From 7e430998b8aed39484585467899c7d38b23e8d36 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:10:08 +0200 Subject: [PATCH 24/39] security(fail2ban): widen jail to /forgot-password and rate-limit 429 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The filter only watched /api/auth/login 401 — leaving the forgot-password endpoint open to: - email enumeration (slow brute-force probing which addresses exist) - password-reset brute-force against accounts whose addresses leak Widens the failregex to /api/auth/(login|forgot-password) and adds 429 to the status alternation so a future in-app rate-limiter response is also caught by the jail (defense in depth). CI assertions extended to cover both new dimensions plus a negative case on an unrelated 401 endpoint (/api/documents) — pins that the widening did not over-match. 
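With fail2ban installed locally, the widened filter can be exercised before
pushing (a sketch; the sample line mirrors the shape the CI job uses):

    printf '%s\n' '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/forgot-password"},"status":429}' > /tmp/sample.log
    fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf
    # expect "1 matched"; switching status to 200 or the uri to /api/documents should yield 0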
Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/ci.yml | 24 +++++++++++++++++++ .../filter.d/familienarchiv-auth.conf | 14 +++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index fd30bac6..6f6aa0e0 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -140,6 +140,22 @@ jobs: echo "$out" | grep -qE '1 matched' \ || { echo "expected 1 match for /api/auth/login 401"; exit 1; } + - name: Matches /api/auth/login 429 + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/login"},"status":429}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '1 matched' \ + || { echo "expected 1 match for /api/auth/login 429"; exit 1; } + + - name: Matches /api/auth/forgot-password 401 + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/forgot-password"},"status":401}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '1 matched' \ + || { echo "expected 1 match for /api/auth/forgot-password 401"; exit 1; } + - name: Does not match /api/auth/login 200 run: | echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/login"},"status":200}' > /tmp/sample.log @@ -148,6 +164,14 @@ jobs: echo "$out" | grep -qE '0 matched' \ || { echo "expected 0 matches for /api/auth/login 200"; exit 1; } + - name: Does not match /api/documents (unrelated 401) + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"GET","host":"archiv.raddatz.cloud","uri":"/api/documents"},"status":401}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '0 matched' \ + || { echo "expected 0 matches for /api/documents 401"; exit 1; } + # ─── Compose Bucket-Bootstrap Idempotency ───────────────────────────────────── # docker-compose.prod.yml's create-buckets service runs on every # `docker compose up` (one-shot, no restart). Must be idempotent — a diff --git a/infra/fail2ban/filter.d/familienarchiv-auth.conf b/infra/fail2ban/filter.d/familienarchiv-auth.conf index 6f06551f..0f85a798 100644 --- a/infra/fail2ban/filter.d/familienarchiv-auth.conf +++ b/infra/fail2ban/filter.d/familienarchiv-auth.conf @@ -1,5 +1,5 @@ # fail2ban filter for credential-stuffing attempts against the -# Familienarchiv login endpoint. +# Familienarchiv authentication endpoints. # # Parses Caddy JSON access log entries (configured in # infra/caddy/Caddyfile via the (access_log) snippet). 
@@ -12,6 +12,16 @@ # "uri":"/api/auth/login",…}, # "status":401,…} # +# Watched endpoints: +# - /api/auth/login — credential stuffing +# - /api/auth/forgot-password — email enumeration + slow brute-force +# against accounts whose addresses leak +# +# Watched statuses: +# - 401 — bad credentials +# - 429 — server-side rate limit (in case a future in-app limiter +# returns 429 before fail2ban catches the volume) +# # Caddy emits remote_ip *inside* the request object and status at the # top level. The order within the request object is stable # (remote_ip → … → uri) across Caddy 2.7+. Lazy `.*?` keeps the regex @@ -21,7 +31,7 @@ before = common.conf [Definition] -failregex = ^\s*\{.*?"remote_ip":"".*?"uri":"/api/auth/login.*?"status":\s*401\b +failregex = ^\s*\{.*?"remote_ip":"".*?"uri":"/api/auth/(login|forgot-password).*?"status":\s*4(01|29)\b ignoreregex = -- 2.49.1 From f2ec81547ba54eb0a1f859514d89e6666a82ae9e Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:10:59 +0200 Subject: [PATCH 25/39] ci(deploy): add --pull to docker compose build for CVE pickup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without --pull, the host's Docker layer cache wins: if a CVE drops in node:20.19.0-alpine3.21 / postgres:16-alpine and the vendor re-publishes the same tag, the runner keeps serving the cached layer until the cache is manually cleared — a silent supply-chain blind spot. Adding --pull to both `compose build` invocations costs a single re-pull per run and lifts the base-image patch lag from "next host prune" to "next nightly". Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 6 +++++- .gitea/workflows/release.yml | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 118dd54f..cbf10d39 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -70,13 +70,17 @@ jobs: EOF - name: Build images + # `--pull` forces re-fetching pinned base images so a CVE + # re-publication of the same tag (e.g. node:20.19.0-alpine3.21, + # postgres:16-alpine) is picked up instead of being served + # from the host's stale Docker layer cache. run: | docker compose \ -f docker-compose.prod.yml \ -p archiv-staging \ --env-file .env.staging \ --profile staging \ - build + build --pull - name: Deploy staging run: | diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index d4332ba9..9ae74ad6 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -72,12 +72,15 @@ jobs: EOF - name: Build images + # `--pull` forces re-fetching pinned base images so a CVE + # re-publication of the same tag is picked up rather than served + # from the host's stale Docker layer cache. run: | docker compose \ -f docker-compose.prod.yml \ -p archiv-production \ --env-file .env.production \ - build + build --pull - name: Deploy production run: | -- 2.49.1 From fe1451f570d7bedf5899134bad53552a92d0dc31 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:12:05 +0200 Subject: [PATCH 26/39] ci(smoke): pin curl to 127.0.0.1 via --resolve MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The smoke step previously curled the public hostname unconditionally, which routes the runner's request via DNS → router → back into the same host. Many SOHO routers do not implement hairpin NAT (or do so only after a firmware update), so the deploy may pass on day one and silently fail on day 90. 
--resolve ":443:127.0.0.1" pins the hostname to the runner's loopback while keeping SNI on the public name (so the cert validates correctly and the Caddy vhost block matches). The smoke test now verifies that the Caddy-on-the-same-host is serving the right hostname end-to-end, with no router dependency. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 21 ++++++++++++++------- .gitea/workflows/release.yml | 14 +++++++++----- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index cbf10d39..fa343eb4 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -93,15 +93,22 @@ jobs: - name: Smoke test deployed environment # Healthchecks confirm containers are healthy; they do NOT confirm the - # public surface works. This step catches: Caddy not reloaded, DNS - # missing, HSTS header dropped, /actuator block bypassed. + # public surface works. This step catches: Caddy not reloaded, HSTS + # header dropped, /actuator block bypassed. + # + # --resolve pins staging.raddatz.cloud to the runner's loopback so we + # do NOT depend on the host router doing hairpin NAT (many SOHO + # routers do not, or do so only after a firmware update). SNI still + # uses the public hostname so the cert validates correctly. run: | set -e - URL="https://staging.raddatz.cloud" - echo "Smoke test: $URL" - curl -fsS --max-time 10 "$URL/login" -o /dev/null - curl -fsS --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' - status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") + HOST="staging.raddatz.cloud" + URL="https://$HOST" + RESOLVE="--resolve $HOST:443:127.0.0.1" + echo "Smoke test: $URL (pinned to 127.0.0.1)" + curl -fsS $RESOLVE --max-time 10 "$URL/login" -o /dev/null + curl -fsS $RESOLVE --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 9ae74ad6..e1eeca2c 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -92,13 +92,17 @@ jobs: - name: Smoke test deployed environment # See nightly.yml — same three checks, against the prod vhost. + # --resolve pins archiv.raddatz.cloud to the runner's loopback so + # the smoke test does NOT depend on hairpin NAT on the host router. 
run: | set -e - URL="https://archiv.raddatz.cloud" - echo "Smoke test: $URL" - curl -fsS --max-time 10 "$URL/login" -o /dev/null - curl -fsS --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' - status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") + HOST="archiv.raddatz.cloud" + URL="https://$HOST" + RESOLVE="--resolve $HOST:443:127.0.0.1" + echo "Smoke test: $URL (pinned to 127.0.0.1)" + curl -fsS $RESOLVE --max-time 10 "$URL/login" -o /dev/null + curl -fsS $RESOLVE --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" -- 2.49.1 From 33300e4ad9c8ae356c4be562520a36d998cf0f03 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:12:55 +0200 Subject: [PATCH 27/39] chore(infra): drop aspirational Renovate comments from compose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The repo's renovate.json only configures TipTap grouping; Renovate is not currently active against MinIO / mc / mailpit / Postgres / Node / Caddy. The "Renovate keeps it current" comments were aspirational — those tags will rot until Renovate is bootstrapped (tracked in a follow-up issue). The "Pinned mc release; Renovate keeps it current" comment is gone already since the create-buckets entrypoint was extracted to a script in the preceding MinIO-policy commit. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 468beeec..626d44a5 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -56,7 +56,8 @@ services: retries: 5 minio: - # Pinned MinIO release for reproducible deploys; Renovate keeps it current. + # Pinned MinIO release for reproducible deploys. Bumped manually until + # Renovate is bootstrapped for these production images (see follow-up issue). image: minio/minio:RELEASE.2025-02-28T09-55-16Z restart: unless-stopped command: server /data --console-address ":9001" @@ -95,7 +96,7 @@ services: # Dev-only mail catcher; gated behind the staging profile so production # never starts it. Staging workflow runs with `--profile staging`. mailpit: - # Pinned for reproducibility; Renovate bumps the tag. + # Pinned for reproducibility; bumped manually until Renovate is bootstrapped. image: axllent/mailpit:v1.29.7 restart: unless-stopped profiles: ["staging"] -- 2.49.1 From 59bc81d353d5ca1a1645fe33e42b2fb678b7b16f Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:14:58 +0200 Subject: [PATCH 28/39] docs(adr): ADR-009 standalone docker-compose.prod.yml, not overlay Records the decision to make docker-compose.prod.yml a fully self-contained file rather than an overlay over docker-compose.yml. Captures the cost (env-var duplication across dev and prod files) and the benefit (single file the reviewer can hold in their head, no Compose merge-rule surprises, automatic project-name namespacing for cohabiting staging + production on one host). Surfaces the retirement of the earlier overlay narrative in docs/infrastructure/production-compose.md so a future maintainer does not reverse the choice out of ignorance. 
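A quick way to see the project-name namespacing in action (a sketch; the
volume names are the Compose defaults derived from -p plus the volume keys in
docker-compose.prod.yml):

    docker compose -f docker-compose.prod.yml -p archiv-staging --env-file .env.staging --profile staging up -d --wait
    docker compose -f docker-compose.prod.yml -p archiv-production --env-file .env.production up -d --wait
    docker volume ls --format '{{.Name}}' | grep '^archiv-'
    # e.g. archiv-production_postgres-data vs archiv-staging_postgres-data: the two
    # environments never share a volume, network, or container name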
Co-Authored-By: Claude Opus 4.7 --- .../adr/009-standalone-compose-not-overlay.md | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 docs/adr/009-standalone-compose-not-overlay.md diff --git a/docs/adr/009-standalone-compose-not-overlay.md b/docs/adr/009-standalone-compose-not-overlay.md new file mode 100644 index 00000000..e861fe47 --- /dev/null +++ b/docs/adr/009-standalone-compose-not-overlay.md @@ -0,0 +1,50 @@ +# ADR-009: Standalone `docker-compose.prod.yml`, not an overlay + +## Status + +Accepted + +## Context + +The repository's `docker-compose.yml` is a development stack: every service is built locally, ports are exposed on `0.0.0.0` for dev tooling, the frontend runs `npm run dev` with hot-reload, the backend is `spring-boot:run` with the dev profile, and there is no Caddy, no `archiv-app` service account, no admin-credential lock-in, no healthcheck-gated startup sequence. The dev stack reflects "single developer on a laptop", not "production on a single VPS". + +The pre-merge design (issue #497, comment #8331) sketched two ways to add a production stack: + +1. **Overlay** — keep `docker-compose.yml` as the base, add `docker-compose.prod.yml` as a `-f` overlay (`docker compose -f docker-compose.yml -f docker-compose.prod.yml up`). Compose merges the two files at runtime. +2. **Standalone** — make `docker-compose.prod.yml` a fully self-contained file that does not reference or merge with `docker-compose.yml` at all. Project-name namespacing (`-p archiv-production`, `-p archiv-staging`) keeps multi-environment deploys clean on a single host. + +The earlier `docs/infrastructure/production-compose.md` notes assumed overlay because the original plan was to **remove** MinIO in production (replace with Hetzner Object Storage), so the prod file would only need to remove one service and add a few. With MinIO retained (see ADR-010), the prod stack diverges from dev in essentially every service: build vs pre-built image, target stage, port binding, env vars, healthcheck, restart policy, mem_limit, profile gating, service account, depends_on chain. Overlay would mostly be `override:` blocks that nullify the dev defaults — a fragile inversion. + +## Decision + +`docker-compose.prod.yml` is standalone. Production and staging both run it directly: + +``` +production: docker compose -f docker-compose.prod.yml -p archiv-production --env-file .env.production ... +staging: docker compose -f docker-compose.prod.yml -p archiv-staging --env-file .env.staging --profile staging ... +``` + +Environment isolation is achieved via the Docker Compose project name (`-p`). Volumes, networks, and containers are namespaced by the project name, so production and staging cohabit cleanly on the same host without interfering. + +The dev `docker-compose.yml` is unchanged — `docker compose up` still works for developers, and its `frontend` service now specifies `target: development` explicitly so the new multi-stage Dockerfile builds the right stage. + +## Alternatives Considered + +| Alternative | Why rejected | +|---|---| +| Overlay (`-f base.yml -f prod.yml`) | With MinIO retained and most services differing across nearly every field, the overlay would consist mostly of `override:` blocks that null out dev defaults. Compose's merge semantics for nested keys (env, ports, healthcheck) are sharp — silent merges of port mappings, env-var entries, and depends_on edges cost reviewer hours. Standalone is one file the reader can hold in their head. 
| +| Two fully separate files (dev + prod) but with shared YAML anchors via `extends:` | `extends:` works across files but is a niche feature and is increasingly discouraged in compose v2. Reviewer load is higher than reading two flat files. | +| Generate prod compose from a template at deploy time (e.g. ytt, kustomize) | Adds a build-time step and a new tool to the operator toolchain. Justified for a fleet of 10+ environments; overkill for production + staging on one host. | +| Single compose file with environment-specific profiles | Compose profiles select which *services* run, not which *configuration* a service runs with. Using profiles to swap "build locally" vs "pull image" would smear dev and prod across one file. | + +## Consequences + +- The prod file can be read top-to-bottom without cross-referencing `docker-compose.yml`. Onboarding and review cost drops. +- Volume namespacing is automatic (`archiv-production_postgres-data`, `archiv-staging_postgres-data`) — no manual `volumes:` aliasing. +- Dev compose churn (e.g. swapping a dev port) cannot accidentally affect production. The two files are independent. +- The cost is duplication: identical environment variables (e.g. `POSTGRES_DB: archiv`) appear in both files. This duplication is bounded — there is no incentive to add more services that exist in both — and the alternative (overlay) carries its own duplication via `override:` boilerplate. +- The retired `docs/infrastructure/production-compose.md` narrative is trimmed to a pointer at the live files. The cost/sizing rationale is preserved there. + +## Future Direction + +If the deployment fleet ever grows beyond two environments on one host (e.g. add a `demo` environment, or shard staging across two VPS for load testing), revisit the templating decision. At three+ environments the duplication starts to bite and a template engine (kustomize or ytt) becomes attractive. -- 2.49.1 From b57afb9ad234162f8d216ce0532f6d0de2ecbbe2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:15:38 +0200 Subject: [PATCH 29/39] docs(adr): ADR-010 MinIO stays self-hosted, Hetzner OBS deferred Records the reversal of the earlier "migrate to Hetzner Object Storage" direction in docs/infrastructure/production-compose.md. Documents the cost/benefit (current 13 GB fits trivially on the VPS; OBS billing is dominated by base fee at this size; migration is a three-env-var swap plus `mc mirror`, no application rewrite cost). Captures the four triggers that should re-open the decision (50 GB threshold, healthcheck latency, VPS upgrade cost, backup runtime) so the deferral does not become an indefinite punt. Co-Authored-By: Claude Opus 4.7 --- .../010-minio-self-hosted-not-hetzner-obs.md | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 docs/adr/010-minio-self-hosted-not-hetzner-obs.md diff --git a/docs/adr/010-minio-self-hosted-not-hetzner-obs.md b/docs/adr/010-minio-self-hosted-not-hetzner-obs.md new file mode 100644 index 00000000..4f84e30d --- /dev/null +++ b/docs/adr/010-minio-self-hosted-not-hetzner-obs.md @@ -0,0 +1,53 @@ +# ADR-010: MinIO stays self-hosted on the production VPS + +## Status + +Accepted + +## Context + +`docs/infrastructure/production-compose.md` (pre-this-PR) sketched a production topology in which the application bucket migrates from in-cluster MinIO to Hetzner Object Storage (OBS, S3-compatible). The motivation was operational: one less service to back up, no MinIO RAM/disk pressure on the VPS, hand off durability to the hyperscaler. 
+ +Two facts revisited at pre-merge review (issue #497, comment #8331) changed the answer: + +1. **Current data size is small.** The archive is ~13 GB of file uploads (Kurrent letters, scanned ODS files, attachment PDFs). Hetzner OBS billing on this size is dominated by the per-month base fee (~5 EUR/mo for the smallest unit), not capacity or egress. The break-even point against the VPS's existing disk is far above the current footprint. +2. **MinIO is already production-grade.** The dev stack uses MinIO; the backend already drives it via the AWS SDK v2 with a generic `S3_ENDPOINT`. Switching providers is a runtime env-var change (`S3_ENDPOINT`, `S3_ACCESS_KEY`, `S3_SECRET_KEY`) plus an `mc mirror` to copy objects. There is no application-level rewrite cost waiting. + +If Hetzner OBS were a one-way-door (provider-specific SDK, complex IAM integration, multi-month migration), the decision would deserve a serious weighing. As reversible as the migration is, deferring it costs nothing. + +## Decision + +MinIO stays on the production VPS for the first launch. The application bucket is created and managed inside the docker-compose stack (`infra/minio/bootstrap.sh`). The backend uses a least-privilege service account (`archiv-app`) with a bucket-scoped IAM policy, not the MinIO root credentials. + +Hetzner Object Storage is **explicitly deferred**, not rejected. The migration path is documented as a runbook in `docs/DEPLOYMENT.md` (when the trigger fires): provision an OBS bucket, run `mc mirror local-minio:/familienarchiv obs:/familienarchiv`, rotate the three env vars, restart the backend, decommission the MinIO service from `docker-compose.prod.yml`. + +## Triggers to re-evaluate + +Revisit the decision when **any** of the following holds: + +- The `minio-data` volume exceeds 50 GB and is growing > 5 GB/month. +- MinIO healthcheck latency exceeds 200 ms p95 (signal of disk pressure on the host). +- The VPS upgrade required to keep MinIO healthy costs more per month than the equivalent OBS bucket + traffic. +- Backup of the MinIO volume to `heim-nas` over Tailscale (deferred follow-up) is implemented and consistently runs > 30 min nightly. At that point durability-as-a-service starts paying for itself. + +The migration runbook in `docs/DEPLOYMENT.md` is the script for executing the swap when one of the triggers fires. + +## Alternatives Considered + +| Alternative | Why rejected (for now) | +|---|---| +| Migrate to Hetzner Object Storage in this PR | Premature. Adds an external dependency, locks the operator into the Hetzner ecosystem before the data has demonstrated it needs hyperscaler durability, blocks the PR on a migration that buys ~5 GB of headroom. | +| Migrate to S3 (AWS) for HA across regions | Way over-spec for a family archive. Egress cost would dwarf any benefit; durability concerns at this size are addressed by nightly off-site backup, not by multi-region replication. | +| Drop S3 abstraction entirely; store files directly on the VPS disk | Possible, but loses the bucket-policy IAM surface (least-privilege service account), loses presigned-URL flow (OCR service downloads files via short-lived URLs, not via shared filesystem), loses the migration path to OBS. The S3 indirection is cheap insurance. | +| Self-hosted on-VPS plus periodic `mc mirror` to Hetzner OBS for off-site backup | This is the **target** for the backup pipeline follow-up. Treated as backup, not primary — primary stays MinIO. 
| + +## Consequences + +- The production VPS sizing (Hetzner CX42, 16 GB RAM, 80 GB disk) must accommodate MinIO's working set. Current footprint leaves ample headroom. +- Backup of MinIO data is the operator's responsibility until the off-site `mc mirror` pipeline is implemented (deferred follow-up). The DEPLOYMENT.md rollback procedure explicitly flags this — manual backup is the only recovery option until the pipeline ships. +- The backend never sees the MinIO root password; it uses the `archiv-app` service account with a bucket-scoped IAM policy (see `infra/minio/bootstrap.sh`). A backend RCE/SSRF cannot escalate beyond the `familienarchiv` bucket. +- The migration to Hetzner OBS remains a small, well-understood runbook step rather than a major refactor. No application code, no SDK swap. + +## Future Direction + +When one of the triggers above fires, the migration is: provision OBS bucket → `mc mirror` → rotate three env vars → restart backend → remove MinIO service from compose. The bucket-scoped policy translates 1:1 to an OBS user policy (S3-compatible). -- 2.49.1 From 6a6a1c4353572d471f0961f2bdefbcb436da5f0e Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:16:20 +0200 Subject: [PATCH 30/39] docs(adr): ADR-011 single-tenant Gitea runner with on-disk env-files Records the operational assumption that nightly.yml and release.yml bake in: the self-hosted runner is single-tenant, so writing secrets to .env.staging / .env.production on disk and removing them via an `if: always()` cleanup step is acceptable for v1. Documents the three migration triggers (second repo on the runner, untrusted PR execution, move to shared infrastructure) and the one-step migration path (--env-file <(printf '%s' "$SECRET_BLOB")) so the next operator does not silently break the trust assumption. The in-comment notes at the top of both workflow files already point at this ADR's content; this commit records the decision in the durable location the doc-currency table demands. Co-Authored-By: Claude Opus 4.7 --- docs/adr/011-single-tenant-gitea-runner.md | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 docs/adr/011-single-tenant-gitea-runner.md diff --git a/docs/adr/011-single-tenant-gitea-runner.md b/docs/adr/011-single-tenant-gitea-runner.md new file mode 100644 index 00000000..d5250592 --- /dev/null +++ b/docs/adr/011-single-tenant-gitea-runner.md @@ -0,0 +1,58 @@ +# ADR-011: Single-tenant Gitea runner with secrets-on-disk env-files + +## Status + +Accepted + +## Context + +The deploy workflows (`.gitea/workflows/nightly.yml`, `release.yml`) execute on a self-hosted Gitea Actions runner. The runner has Docker-out-of-Docker access (the host's Docker socket is mounted into the runner), so `docker compose build` produces images on the host daemon and `docker compose up` consumes them directly — no registry hop. + +Two workflow steps shape the security model: + +1. **"Write env file"** — the workflow writes every required secret to `.env.staging` or `.env.production` on the runner's filesystem so that `docker compose --env-file` can consume them. The file lives on disk for the duration of the workflow. +2. **"Cleanup env file"** — the matching `if: always()` step deletes the env file after the workflow ends, regardless of success. + +This shape only works under one operational assumption: **the runner is single-tenant**. 
The runner is owned by the same operator who owns the secrets, no other repositories run jobs on the same runner, and no untrusted code is executed (no public fork PRs trigger workflows). If any of those assumptions stopped holding, the env-file-on-disk approach would become a credential exposure path — a sibling job could read `.env.production`, or a malicious PR could exfiltrate the secrets via a step.
+
+The alternative — `docker compose --env-file <(printf "..." )` (bash process substitution) — is technically supported and would keep secrets out of the on-disk filesystem. It is more secure under a multi-tenant runner but requires bash rather than POSIX sh and is brittle inside YAML (the `printf` step would need to escape every secret value containing newlines, equals signs, or quotes).
+
+## Decision
+
+The runner is treated as single-tenant for the lifetime of the v1 deployment. The workflows write env-files to disk under that assumption and rely on the `if: always()` cleanup step to remove them. The operational assumption is documented in-comment at the top of both workflow files (`nightly.yml`, `release.yml`) so the next operator who considers adding a second repo or accepting public PRs has the trigger surfaced in front of them.
+
+Concretely:
+
+- The Gitea runner only runs jobs for `marcel/familienarchiv`.
+- No public fork PRs trigger the workflows (Gitea defaults to requiring an explicit approval on first-time contributor PRs for the actions to run).
+- Secrets are stored in Gitea repository secrets and injected via `${{ secrets.* }}`. They land in the env-file at workflow start and are removed at workflow end.
+
+## Migration triggers
+
+Switch to the multi-tenant-safe pattern when **any** of the following becomes true:
+
+- A second repository starts using the same runner.
+- A workflow accepts contributions that can run untrusted code (public PRs without manual approval).
+- The runner is moved off the operator's controlled host onto shared infrastructure.
+
+The migration path is one step per workflow: replace the "Write env file" step with `--env-file <(printf '%s' "${{ secrets.STAGING_ENV_BLOB }}")` and store the full env-file as a single Gitea secret. The cleanup step is then unnecessary because the env-file never touches disk.
+
+## Alternatives Considered
+
+| Alternative | Why rejected (for now) |
+|---|---|
+| `--env-file <(printf "...")` via bash process substitution | More secure under multi-tenant. Brittle for multi-line / quoted secret values; harder to debug ("env file not found" with no diff to inspect). Justified once the trigger above fires. |
+| Docker secrets (`docker secret create` + `compose secrets:`) | Designed for Swarm; outside of Swarm, compose secrets read from files anyway, so the on-disk surface is the same. Adds complexity without changing the threat model. |
+| External secret manager (Vault, AWS Secrets Manager) | Adds a third-party dependency to the deploy path. For a family-archive deployment with one operator and one VPS, the cost outweighs the benefit at this scale. |
+| GitHub-hosted ephemeral runners | Would require uploading the prod-deploy artifacts to a registry first, then a deploy step on the VPS connecting back. Inverts the current Docker-out-of-Docker simplicity for marginal security gain. The single-tenant self-hosted runner *is* ephemeral in practice — the secrets are written to a directory the runner controls, then deleted. |
+
+## Consequences
+
+- The runner host's filesystem is in the secret-trust boundary. 
The host is hardened per `docs/DEPLOYMENT.md` (ufw, fail2ban, Tailscale-only SSH). +- An operator who later adds a second repo to the runner without revisiting the workflows would silently break the trust assumption. The in-file comments at the top of `nightly.yml` and `release.yml` are the breadcrumb that surfaces the assumption at change time. +- The `if: always()` cleanup step is load-bearing: removing it (e.g. during a future workflow refactor) leaves credentials on disk between runs. Treat it as a permanent invariant. +- Workflow debuggability stays high: an operator who needs to know what env-file the deploy ran with can SSH onto the host while a workflow is in flight and `cat .env.staging` — useful for first-deploy diagnostics. + +## Future Direction + +When the trigger fires, migrate both workflows in a single PR: replace the "Write env file" step with a single `--env-file <(printf '%s' …)` invocation, drop the cleanup step, and consolidate the per-secret Gitea entries into a single multi-line `STAGING_ENV_BLOB` / `PROD_ENV_BLOB` secret. Single commit, both workflows, no application change. -- 2.49.1 From 03d478840ba7cb3bb6ae513d3ff8ce43ef90a2ee Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:17:12 +0200 Subject: [PATCH 31/39] docs(arch): show Caddy + X-Forwarded-Proto in auth-flow diagram MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the Caddy hop to seq-auth-flow.puml and surfaces the two production-relevant header behaviours: - Caddy terminates TLS and forwards X-Forwarded-Proto: https - Spring Boot trusts this header (server.forward-headers-strategy: native, ForwardedRequestCustomizer at the Jetty layer), so request.getScheme() returns "https" - The Set-Cookie response carries the Secure flag because the observed scheme is https — without forward-headers-strategy this would silently drop to plain http and the cookie would lose Secure Closes the doc-currency gap flagged in the Markus review on PR #499: "Auth flow change → docs/architecture/c4/seq-auth-flow.puml". Co-Authored-By: Claude Opus 4.7 --- docs/architecture/c4/seq-auth-flow.puml | 35 ++++++++++++++++++++----- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/docs/architecture/c4/seq-auth-flow.puml b/docs/architecture/c4/seq-auth-flow.puml index bae4a831..63b9038d 100644 --- a/docs/architecture/c4/seq-auth-flow.puml +++ b/docs/architecture/c4/seq-auth-flow.puml @@ -1,26 +1,49 @@ @startuml -title Authentication Flow +title Authentication Flow (behind Caddy reverse proxy) actor User participant Browser +participant "Caddy (TLS termination)" as Caddy participant "Frontend (SvelteKit)" as Frontend participant "Backend (Spring Boot)" as Backend participant PostgreSQL as DB User -> Browser: Enter email + password -Browser -> Frontend: POST /login (form action) +Browser -> Caddy: HTTPS POST /login (form action) +note right of Caddy + Caddy terminates TLS and forwards + to Frontend over HTTP with: + X-Forwarded-Proto: https + X-Forwarded-For: + X-Forwarded-Host: archiv.raddatz.cloud +end note +Caddy -> Frontend: HTTP POST /login\n+ X-Forwarded-Proto: https Frontend -> Frontend: Base64 encode "email:password" -Frontend -> Backend: GET /api/users/me\nAuthorization: Basic +Frontend -> Backend: GET /api/users/me\nAuthorization: Basic \n+ X-Forwarded-Proto: https +note right of Backend + server.forward-headers-strategy: native + Jetty's ForwardedRequestCustomizer + reads X-Forwarded-Proto so + request.getScheme() returns "https". 
+end note Backend -> Backend: Spring Security parses Basic Auth Backend -> DB: SELECT user WHERE email=? DB --> Backend: AppUser + groups + permissions Backend -> Backend: BCrypt.matches(password, hash) Backend --> Frontend: 200 OK — UserDTO -Frontend -> Browser: Set-Cookie: auth_token=\n(httpOnly, SameSite=strict, maxAge=86400) -Browser -> Frontend: GET / (next request) +Frontend -> Caddy: Set-Cookie: auth_token=\n(httpOnly, **Secure**, SameSite=strict, maxAge=86400) +note right of Frontend + Secure flag is set because the + request scheme observed by the + app is https (forwarded by Caddy). +end note +Caddy -> Browser: HTTPS 200 + Set-Cookie +Browser -> Caddy: HTTPS GET / (next request) +Caddy -> Frontend: HTTP GET / + X-Forwarded-Proto: https Frontend -> Frontend: hooks.server.ts reads auth_token cookie Frontend -> Backend: GET /api/users/me\nAuthorization: Basic Backend --> Frontend: 200 OK — user in event.locals -Frontend --> Browser: Render page with user context +Frontend --> Caddy: rendered page +Caddy --> Browser: HTTPS 200 @enduml -- 2.49.1 From a7a80f8c1616efe229d8fa800b50d4134f17c3fe Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:18:11 +0200 Subject: [PATCH 32/39] docs(deployment): route SSE through Caddy in topology mermaid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The top-level deployment diagram lagged the C4 L2 diagram, which correctly notes that SSE notifications are fronted by Caddy. The mermaid showed Browser → Backend direct, which would only be true if the backend port were exposed publicly (it is not — all docker ports bind to 127.0.0.1). Fixes the inconsistency Markus flagged on PR #499: the public surface is Caddy and Caddy only. Co-Authored-By: Claude Opus 4.7 --- docs/DEPLOYMENT.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index e995739c..bd7b7a1a 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -33,14 +33,14 @@ graph TD Backend -->|S3 API :9000| MinIO[(MinIO)] Backend -->|HTTP :8000 internal| OCR["OCR Service\nPython FastAPI"] OCR -->|presigned URL| MinIO - Browser -->|SSE direct| Backend + Caddy -->|SSE proxy_pass| Backend ``` **Key facts:** - Caddy terminates TLS and reverse-proxies to frontend (`:3000`) and backend (`:8080`). The Caddyfile is committed at [`infra/caddy/Caddyfile`](../infra/caddy/Caddyfile) and is installed on the host as `/etc/caddy/Caddyfile` (symlink). - The host binds all docker-published ports to `127.0.0.1` only; Caddy is the sole external entry point. - The OCR service has **no published port** — reachable only on the internal Docker network from the backend. -- SSE notifications go directly backend → browser (not via the SvelteKit SSR layer). +- SSE notifications transit Caddy (browser → Caddy → backend); the backend is never reachable directly from the public internet. The SvelteKit SSR layer is bypassed for SSE, but Caddy is not. - The Caddyfile responds `404` on `/actuator/*` (defense in depth). Internal monitoring scrapes the backend on the docker network, not through Caddy. - Production and staging cohabit on the same host via docker compose project names: `archiv-production` (ports 8080/3000) and `archiv-staging` (ports 8081/3001). 
-- 2.49.1 From 8fcf653cb02d43b4dd2a217f6ef859e93eb05536 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:05:02 +0200 Subject: [PATCH 33/39] ci(smoke): pin HSTS to preload-list-eligible value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the presence-only `grep -qi strict-transport-security` smoke assertion in both nightly.yml and release.yml with a value-pinning regex that requires `max-age=31536000`, `includeSubDomains`, and `preload`. A future Caddyfile edit that drops any of those three parts now fails the deploy smoke step instead of passing silently. Verified locally that the new pattern matches the preload-eligible value and rejects three degraded forms (short max-age, missing includeSubDomains, missing preload). Addresses @sara's round-2 note on PR #499 — "presence check, not value check". Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 6 +++++- .gitea/workflows/release.yml | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index fa343eb4..3353e6f7 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -107,7 +107,11 @@ jobs: RESOLVE="--resolve $HOST:443:127.0.0.1" echo "Smoke test: $URL (pinned to 127.0.0.1)" curl -fsS $RESOLVE --max-time 10 "$URL/login" -o /dev/null - curl -fsS $RESOLVE --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + # Pin the preload-list-eligible HSTS value, not just header presence: + # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must + # fail this check rather than pass it silently. + curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ + | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index e1eeca2c..69e59dd6 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -101,7 +101,11 @@ jobs: RESOLVE="--resolve $HOST:443:127.0.0.1" echo "Smoke test: $URL (pinned to 127.0.0.1)" curl -fsS $RESOLVE --max-time 10 "$URL/login" -o /dev/null - curl -fsS $RESOLVE --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + # Pin the preload-list-eligible HSTS value, not just header presence: + # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must + # fail this check rather than pass it silently. + curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ + | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" -- 2.49.1 From 09680557ef68337fd9d1a8daee8c479ce5e40e35 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:06:13 +0200 Subject: [PATCH 34/39] security(caddy): add Permissions-Policy header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `Permissions-Policy: camera=(), microphone=(), geolocation=()` to the shared (security_headers) snippet, so both archiv vhosts and the git vhost deny browser APIs the app does not use. 
Reduces blast radius of an XSS landing in a privileged origin. The deploy smoke steps in nightly.yml and release.yml gain a matching assertion against the canonical header value, so a future Caddyfile edit that drops or loosens the header (e.g. `camera=(self)`) fails the deploy instead of regressing silently. `caddy validate` against caddy:2 passes; both workflow YAMLs parse. Addresses @nora's round-2 suggestion on PR #499 — "lower-impact than CSP but nearly free". Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 5 +++++ .gitea/workflows/release.yml | 5 +++++ infra/caddy/Caddyfile | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 3353e6f7..870a5a99 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -112,6 +112,11 @@ jobs: # fail this check rather than pass it silently. curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' + # Permissions-Policy denies APIs the app does not use (camera, + # microphone, geolocation). A regression that loosens or drops the + # header now fails the smoke step. + curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ + | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)' status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 69e59dd6..3d5c99d1 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -106,6 +106,11 @@ jobs: # fail this check rather than pass it silently. curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' + # Permissions-Policy denies APIs the app does not use (camera, + # microphone, geolocation). A regression that loosens or drops the + # header now fails the smoke step. + curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ + | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)' status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" diff --git a/infra/caddy/Caddyfile b/infra/caddy/Caddyfile index b5dfd345..fc6b02bb 100644 --- a/infra/caddy/Caddyfile +++ b/infra/caddy/Caddyfile @@ -18,6 +18,10 @@ Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" X-Content-Type-Options "nosniff" Referrer-Policy "strict-origin-when-cross-origin" + # Deny browser APIs the app does not use. Reduces blast radius of an + # XSS landing in a privileged origin: a payload cannot silently turn + # on the microphone or read geolocation. 
+ Permissions-Policy "camera=(), microphone=(), geolocation=()" -Server } } -- 2.49.1 From a4f2047bccb607172f65b52c9ad4796e88fda6ea Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:07:16 +0200 Subject: [PATCH 35/39] security(ocr): pin ALLOWED_PDF_HOSTS=minio in prod ocr-service env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production never sources PDFs from localhost or 127.0.0.1 — the OCR service only reads from MinIO over the internal docker network. The Python default (`minio,localhost,127.0.0.1`) was permissive on purpose for local dev, but in production a future change to that default — or a host-env override — would silently broaden the SSRF surface. Pinning the env var explicitly here freezes the allowlist to the one hostname production actually needs. `docker compose config --quiet` and `--profile staging config --quiet` both still pass. Verified the resolved config emits `ALLOWED_PDF_HOSTS: minio`. Addresses @nora's round-2 suggestion on PR #499 — "five characters of YAML, lifetime guarantee". Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 626d44a5..b821ec33 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -128,6 +128,11 @@ services: TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} OCR_CONFIDENCE_THRESHOLD: "0.3" OCR_CONFIDENCE_THRESHOLD_KURRENT: "0.5" + # SSRF allowlist pinned explicitly to the internal MinIO hostname. + # In prod the OCR service only fetches PDFs from MinIO over the + # docker network; localhost/127.0.0.1 are dev-only sources and + # must NOT be reachable here. Do not widen to `*`. + ALLOWED_PDF_HOSTS: "minio" networks: - archive-net healthcheck: -- 2.49.1 From 1873f50f7f85d1f8d60c9ffbc19b464650028581 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:08:23 +0200 Subject: [PATCH 36/39] infra(mailpit): use nc -z healthcheck instead of wget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mailpit service healthcheck previously assumed `wget` ships in the axllent/mailpit image. That's true for v1.29.7 but is not part of the image's contract — a future Alpine slim-down could drop wget and silently disable the healthcheck. Switched to BusyBox `nc -z localhost 8025`, which is a TCP-port open check with no dependency beyond BusyBox itself. Verified inside axllent/mailpit:v1.29.7 that `nc` is present (/usr/bin/nc, BusyBox v1.37.0) and that the proposed command returns 0 against an open port and non-zero against a closed one. Compose still parses with `--profile staging`. Addresses @tobi's round-2 suggestion on PR #499. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index b821ec33..4cd7d9c4 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -103,7 +103,11 @@ services: networks: - archive-net healthcheck: - test: ["CMD-SHELL", "wget -qO- http://localhost:8025/api/v1/info >/dev/null 2>&1 || exit 1"] + # TCP-port open check via BusyBox `nc`. The previous wget-based probe + # introduced a non-obvious binary dependency on the mailpit image; a + # future tag that ships without wget would silently disable the + # healthcheck. `nc` is part of BusyBox in the upstream image. 
+ test: ["CMD-SHELL", "nc -z localhost 8025 || exit 1"] interval: 10s timeout: 5s retries: 5 -- 2.49.1 From 440a191138b6baeeed68cbec1d1ae9ffd345f44e Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:09:12 +0200 Subject: [PATCH 37/39] infra(workflows): annotate env-file cleanup as load-bearing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `if: always()` conditional on the env-file cleanup step in both deploy workflows is what makes the ADR-011 single-tenant runner trust model safe: secrets land on disk before each deploy and are wiped unconditionally afterwards. A future workflow refactor that drops `if: always()` would silently leave plaintext secrets on the runner on any failed deploy. The ADR documents this; the workflow file did not. Adds a prominent inline comment so the next reader of the YAML sees the constraint without having to cross-reference ADR-011. No behaviour change — both workflows still parse. Addresses @nora's round-2 suggestion on PR #499 — "linchpin of the ADR-011 trust model". Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 6 ++++++ .gitea/workflows/release.yml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 870a5a99..1bae7b04 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -122,5 +122,11 @@ jobs: echo "All smoke checks passed" - name: Cleanup env file + # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011 + # single-tenant runner trust model. Every secret in .env.staging + # is plain text on the runner filesystem until this step runs. + # If a future refactor drops `if: always()`, a failed deploy + # leaves the env-file behind. Do not remove this conditional + # without first re-evaluating ADR-011. if: always() run: rm -f .env.staging diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 3d5c99d1..96894fed 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -116,5 +116,11 @@ jobs: echo "All smoke checks passed" - name: Cleanup env file + # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011 + # single-tenant runner trust model. Every secret in + # .env.production is plain text on the runner filesystem until + # this step runs. If a future refactor drops `if: always()`, a + # failed deploy leaves the env-file behind. Do not remove this + # conditional without first re-evaluating ADR-011. if: always() run: rm -f .env.production -- 2.49.1 From 9adde3cd890f683ef9f670f3c980cfb00bf89eec Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:10:39 +0200 Subject: [PATCH 38/39] refactor(compose): rename docker network archive-net to archiv-net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docker network was the only `archive-*` identifier in either compose file; everything else (user, db, bucket, service account, project name) uses the `archiv-*` spelling. Reviewers' eyes stuttered on it on the prod compose review (round 2 of PR #499 — Markus and Tobi). Renamed in both prod and dev compose for consistency and updated the single doc reference to the dev-project-prefixed network name. Operational note: applying this change to a running stack will recreate the network on the next `docker compose up`; containers restart, named volumes are unaffected. `docker compose config --quiet` passes for both compose files and for the staging profile. 
Sweep confirms zero `archive-net` references remain in the tree. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 16 ++++++++-------- docker-compose.yml | 16 ++++++++-------- docs/infrastructure/ci-gitea.md | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 4cd7d9c4..b66ace54 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -28,7 +28,7 @@ # APP_MAIL_FROM sender address (e.g. noreply@raddatz.cloud) networks: - archive-net: + archiv-net: driver: bridge volumes: @@ -48,7 +48,7 @@ services: volumes: - postgres-data:/var/lib/postgresql/data networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "pg_isready -U archiv -d archiv"] interval: 10s @@ -67,7 +67,7 @@ services: volumes: - minio-data:/data networks: - - archive-net + - archiv-net healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] interval: 30s @@ -85,7 +85,7 @@ services: minio: condition: service_healthy networks: - - archive-net + - archiv-net environment: MINIO_PASSWORD: ${MINIO_PASSWORD} MINIO_APP_PASSWORD: ${MINIO_APP_PASSWORD} @@ -101,7 +101,7 @@ services: restart: unless-stopped profiles: ["staging"] networks: - - archive-net + - archiv-net healthcheck: # TCP-port open check via BusyBox `nc`. The previous wget-based probe # introduced a non-obvious binary dependency on the mailpit image; a @@ -138,7 +138,7 @@ services: # must NOT be reachable here. Do not widen to `*`. ALLOWED_PDF_HOSTS: "minio" networks: - - archive-net + - archiv-net healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 10s @@ -186,7 +186,7 @@ services: SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-true} SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-true} networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "wget -qO- http://localhost:8080/actuator/health | grep -q UP || exit 1"] interval: 15s @@ -210,7 +210,7 @@ services: API_INTERNAL_URL: http://backend:8080 ORIGIN: https://${APP_DOMAIN} networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "wget -qO- http://localhost:3000/login >/dev/null 2>&1 || exit 1"] interval: 15s diff --git a/docker-compose.yml b/docker-compose.yml index ee850922..952e3074 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,7 +13,7 @@ services: ports: - "${PORT_DB}:5432" networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"] interval: 5s @@ -35,7 +35,7 @@ services: - "${PORT_MINIO_API}:9000" # API Port - "${PORT_MINIO_CONSOLE}:9001" # Web-Oberfläche networks: - - archive-net + - archiv-net healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] interval: 30s @@ -56,7 +56,7 @@ services: exit 0; " networks: - - archive-net + - archiv-net # --- Mail catcher: Mailpit (dev only) --- # Catches all outgoing emails and displays them in a web UI. @@ -69,7 +69,7 @@ services: - "${PORT_MAILPIT_UI:-8025}:8025" # Web UI - "${PORT_MAILPIT_SMTP:-1025}:1025" # SMTP networks: - - archive-net + - archiv-net # --- OCR: Python microservice (Surya + Kraken) --- # Single-node only: OCR training reloads the model in-process after each run. 
@@ -99,7 +99,7 @@ services: OCR_CLAHE_TILE_SIZE: "8" # CLAHE tile grid size (NxN tiles per page) OCR_MAX_CACHED_MODELS: "2" # LRU cache; each model ~500 MB, so 2 = ~1 GB resident networks: - - archive-net + - archiv-net healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 10s @@ -150,7 +150,7 @@ services: ports: - "${PORT_BACKEND}:8080" networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "wget -qO- http://localhost:8080/actuator/health | grep -q UP || exit 1"] interval: 15s @@ -185,10 +185,10 @@ services: ports: - "${PORT_FRONTEND}:5173" networks: - - archive-net + - archiv-net networks: - archive-net: + archiv-net: driver: bridge volumes: diff --git a/docs/infrastructure/ci-gitea.md b/docs/infrastructure/ci-gitea.md index c6180c5e..3f96583e 100644 --- a/docs/infrastructure/ci-gitea.md +++ b/docs/infrastructure/ci-gitea.md @@ -166,7 +166,7 @@ jobs: timeout 30 bash -c \ 'until docker compose -f docker-compose.yml -f docker-compose.ci.yml exec -T db pg_isready -U archive_user; do sleep 2; done' - name: Connect job container to compose network - run: docker network connect familienarchiv_archive-net $(cat /etc/hostname) + run: docker network connect familienarchiv_archiv-net $(cat /etc/hostname) - uses: actions/setup-java@v4 with: java-version: '21' -- 2.49.1 From 4d4d5793bbaeb4805a6d76de3ac8361130aebdf1 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:11:46 +0200 Subject: [PATCH 39/39] docs(glossary): add archiv-app service account entry `archiv-app` is the bucket-scoped MinIO service account introduced in PR #499 alongside the production deploy pipeline. Until now the term only appeared in `infra/minio/bootstrap.sh` and the prod compose file; a reader encountering `S3_ACCESS_KEY: archiv-app` had no single-page reference distinguishing it from the MinIO root account. Adds a new "Infrastructure Terms" section to docs/GLOSSARY.md so the distinction (root account vs. application service account) and the attached `archiv-app-policy` scope live in the canonical glossary location. Cross-links to ADR-010 for the MinIO-stays-self-hosted rationale. Addresses @elicit's round-2 recommendation on PR #499. Co-Authored-By: Claude Opus 4.7 --- docs/GLOSSARY.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/GLOSSARY.md b/docs/GLOSSARY.md index f1c75053..55ffca93 100644 --- a/docs/GLOSSARY.md +++ b/docs/GLOSSARY.md @@ -107,6 +107,13 @@ _See also [Briefwechsel](#briefwechsel-user-facing)._ --- +## Infrastructure Terms + +**archiv-app** — the bucket-scoped MinIO service account the backend uses to read and write the `familienarchiv` bucket. Distinct from the MinIO root account (`archiv`, used only by the bootstrap container for admin operations). Defined and provisioned in [`infra/minio/bootstrap.sh`](../infra/minio/bootstrap.sh) and consumed by the backend as `S3_ACCESS_KEY` in [`docker-compose.prod.yml`](../docker-compose.prod.yml). The attached `archiv-app-policy` grants `s3:GetObject/PutObject/DeleteObject` on `familienarchiv/*` and `s3:ListBucket/GetBucketLocation` on the bucket only — not the built-in `readwrite` policy which would grant `s3:*` on all buckets. +_See also [ADR-010 — MinIO stays self-hosted, not Hetzner OBS](./adr/010-minio-self-hosted-not-hetzner-obs.md)._ + +--- + ## Pending Terms _Terms flagged as potentially ambiguous that have not yet been formally defined here. Add an entry above and remove it from this list when resolved._ -- 2.49.1
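For orientation, a policy document carrying the grants the glossary entry describes might look like the sketch below. This is an assumption-level illustration, not a copy of `infra/minio/bootstrap.sh`: the statement layout and ARN spellings are the standard S3/IAM policy JSON that MinIO accepts, but the canonical policy is whatever the bootstrap script actually provisions.

```json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"],
      "Resource": ["arn:aws:s3:::familienarchiv/*"]
    },
    {
      "Effect": "Allow",
      "Action": ["s3:ListBucket", "s3:GetBucketLocation"],
      "Resource": ["arn:aws:s3:::familienarchiv"]
    }
  ]
}
```

Scoping the `Resource` ARNs to the one bucket, instead of attaching MinIO's built-in `readwrite` policy, is what keeps a compromised backend credential from listing or touching any other bucket on the instance.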