From 83f022ff4bbfd2c6f8fb0991ed47e2309557b313 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:33:39 +0200 Subject: [PATCH 01/39] feat(security): trust X-Forwarded-Proto behind reverse proxy Adds server.forward-headers-strategy: native so that Jetty honours X-Forwarded-{Proto,For,Host} from Caddy. Without this, getScheme(), redirect URLs, and Spring Session "Secure" cookies reflect the internal http hop instead of the original https client request. Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- backend/src/main/resources/application.yaml | 6 +++ .../ForwardHeadersConfigurationTest.java | 37 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml index 1cdd7673..6e12b9f6 100644 --- a/backend/src/main/resources/application.yaml +++ b/backend/src/main/resources/application.yaml @@ -38,6 +38,12 @@ spring: starttls: enable: true +server: + # Behind Caddy/reverse proxy: trust X-Forwarded-{Proto,For,Host} so that + # request.getScheme(), redirect URLs, and Spring Session "Secure" cookies + # reflect the original https client request, not the http hop from Caddy. + forward-headers-strategy: native + management: health: mail: diff --git a/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java new file mode 100644 index 00000000..b97f5ff0 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java @@ -0,0 +1,37 @@ +package org.raddatz.familienarchiv.config; + +import org.junit.jupiter.api.Test; +import org.raddatz.familienarchiv.PostgresContainerConfig; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.ActiveProfiles; +import org.springframework.test.context.bean.override.mockito.MockitoBean; +import software.amazon.awssdk.services.s3.S3Client; + +import static org.assertj.core.api.Assertions.assertThat; + +@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.NONE) +@ActiveProfiles("test") +@Import(PostgresContainerConfig.class) +class ForwardHeadersConfigurationTest { + + @MockitoBean + S3Client s3Client; + + @Autowired + @Value("${server.forward-headers-strategy:}") + String forwardHeadersStrategy; + + @Test + void forward_headers_strategy_is_native_for_reverse_proxy_deployment() { + // Caddy terminates TLS and forwards X-Forwarded-Proto: https. + // Spring must trust those headers so that AppUser-facing redirect URLs, + // Spring Session cookies (Secure flag), and HttpServletRequest.getScheme() + // reflect the original client-facing scheme rather than the internal http hop. + assertThat(forwardHeadersStrategy) + .as("server.forward-headers-strategy must be 'native' so Jetty honours X-Forwarded-Proto behind Caddy") + .isEqualTo("native"); + } +} -- 2.49.1 From ebd0f671f9bfdf610fe9505eeb1386bd3c21daba Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:50:53 +0200 Subject: [PATCH 02/39] fix(auth): mark /hilfe/transkription as public for prerender The route exports prerender = true and is listed in svelte.config.js's prerender.entries. 
Until now the auth hook redirected unauthenticated requests to /login, so the prerender crawler hit a 302 and the build failed with "marked as prerenderable, but were not prerendered". Adding the path to PUBLIC_PATHS lets the crawler render the static HTML; consistent with the route's intent as a public help page. Surfaced by #497 (the production Docker build is the first place npm run build runs in CI). Co-Authored-By: Claude Sonnet 4.6 --- frontend/src/hooks.server.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/frontend/src/hooks.server.ts b/frontend/src/hooks.server.ts index 917ed953..39460750 100644 --- a/frontend/src/hooks.server.ts +++ b/frontend/src/hooks.server.ts @@ -5,7 +5,14 @@ import { env } from 'process'; import { cookieName, cookieMaxAge } from '$lib/paraglide/runtime'; import { detectLocale } from '$lib/shared/server/locale'; -const PUBLIC_PATHS = ['/login', '/logout', '/forgot-password', '/reset-password', '/register']; +const PUBLIC_PATHS = [ + '/login', + '/logout', + '/forgot-password', + '/reset-password', + '/register', + '/hilfe/transkription' // prerendered help page — must be reachable without an auth cookie +]; const handleLocaleDetection: Handle = ({ event, resolve }) => { if (!event.cookies.get(cookieName)) { -- 2.49.1 From 8b109349c2b7e964835f2569332ac8a8580e40fb Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:51:32 +0200 Subject: [PATCH 03/39] feat(frontend): add production stage to Dockerfile Multi-stage Dockerfile with three targets: - development (dev server on :5173, used by docker-compose.yml) - build (runs npm run build, produces SvelteKit Node-adapter output) - production (self-contained node build server on :3000) Node base pinned to node:20.19.0-alpine3.21 for reproducible CI builds (Renovate will keep it current). docker-compose.yml now specifies target: development for the frontend so dev continues to use the dev-server stage. Without this, Docker would default to the last stage (production). Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.yml | 1 + frontend/Dockerfile | 35 +++++++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 5593a105..ee850922 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -163,6 +163,7 @@ services: build: context: ./frontend dockerfile: Dockerfile + target: development # Dockerfile is multi-stage; default would be the production stage container_name: archive-frontend restart: unless-stopped depends_on: diff --git a/frontend/Dockerfile b/frontend/Dockerfile index ca88f974..afbdb79f 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,15 +1,34 @@ -FROM node:20-alpine +# syntax=docker/dockerfile:1.7 +# ── Development ────────────────────────────────────────────────────────────── +# Used by docker-compose.yml (target: development). Source is bind-mounted in +# dev so the COPY . below is effectively replaced at runtime; the layer still +# exists so the image is self-contained for cold starts (e.g. devcontainer). +FROM node:20.19.0-alpine3.21 AS development WORKDIR /app - -# Install dependencies as a separate layer so they are cached when only source changes COPY package.json package-lock.json ./ RUN npm ci - -# Source is mounted at runtime via docker-compose volume -# This COPY is only used when building without a volume (e.g. production image) COPY . . 
- EXPOSE 5173 - CMD ["npm", "run", "dev"] + +# ── Build ──────────────────────────────────────────────────────────────────── +# Compiles the SvelteKit Node-adapter output to /app/build. +FROM node:20.19.0-alpine3.21 AS build +WORKDIR /app +COPY package.json package-lock.json ./ +RUN npm ci +COPY . . +RUN npm run build + +# ── Production ─────────────────────────────────────────────────────────────── +# Self-contained Node server. `node build` is the adapter-node entrypoint. +FROM node:20.19.0-alpine3.21 AS production +WORKDIR /app +ENV NODE_ENV=production +COPY --from=build /app/build ./build +COPY --from=build /app/package.json ./package.json +COPY --from=build /app/package-lock.json ./package-lock.json +RUN npm ci --omit=dev +EXPOSE 3000 +CMD ["node", "build"] -- 2.49.1 From ecb930e5f91b8a1a6f6d065587eae1a0c8206989 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:53:19 +0200 Subject: [PATCH 04/39] feat(infra): add docker-compose.prod.yml for production/staging Standalone production compose file (not an overlay) that runs the full stack on a single host. Environment isolation is achieved via the docker compose project name (-p archiv-production / -p archiv-staging) so the two environments cohabit cleanly. Key choices, resolved in #497 review: - Named volumes for persistent data (no host bind mounts) - MinIO pinned to a specific RELEASE tag (no :latest) - Backend uses MinIO service account (S3_ACCESS_KEY=archiv-app), not root credentials; create-buckets bootstraps the account - Mailpit lives under profiles: [staging] so no real SMTP secret is ever wired into the staging deploy - OCR mem_limit 12g + healthcheck (start_period 120s) copied from the dev compose so docker compose up -d --wait works in CI - Backend admin credentials wired through APP_ADMIN_USERNAME / APP_ADMIN_PASSWORD; first deploy locks the password in permanently because UserDataInitializer is idempotent on email - All host ports bound to 127.0.0.1; Caddy fronts external traffic Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.prod.yml | 211 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 docker-compose.prod.yml diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 00000000..25c7856b --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,211 @@ +# Production / staging Docker Compose for Familienarchiv. +# +# This is a self-contained file (not an overlay over docker-compose.yml). +# All services for the prod stack live here. Environment isolation is +# achieved via the docker compose project name: +# +# production: docker compose -f docker-compose.prod.yml -p archiv-production ... +# staging: docker compose -f docker-compose.prod.yml -p archiv-staging --profile staging ... +# +# Volumes, networks and containers are namespaced by the project name, +# so the two environments cohabit cleanly on the same host. +# +# Required env vars (provided by .env.production / .env.staging in CI): +# TAG image tag (release tag or "nightly") +# PORT_BACKEND, PORT_FRONTEND host-side ports (bound to 127.0.0.1 only) +# APP_DOMAIN e.g. archiv.raddatz.cloud / staging.raddatz.cloud +# POSTGRES_PASSWORD Postgres password +# MINIO_PASSWORD MinIO root password (admin operations only) +# MINIO_APP_PASSWORD MinIO application service-account password +# (least-privilege scope: archive bucket only) +# OCR_TRAINING_TOKEN token guarding ocr-service /train endpoint +# APP_ADMIN_USERNAME seeded admin email (e.g. 
admin@archiv.raddatz.cloud) +# APP_ADMIN_PASSWORD seeded admin password — CRITICAL: locked in on +# first deploy because UserDataInitializer only +# creates the account if the email does not exist +# MAIL_HOST, MAIL_PORT, SMTP relay (production only; staging uses mailpit) +# MAIL_USERNAME, MAIL_PASSWORD +# APP_MAIL_FROM sender address (e.g. noreply@raddatz.cloud) + +networks: + archive-net: + driver: bridge + +volumes: + postgres-data: + minio-data: + ocr-models: + ocr-cache: + +services: + db: + image: postgres:16-alpine + restart: unless-stopped + environment: + POSTGRES_USER: archiv + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_DB: archiv + volumes: + - postgres-data:/var/lib/postgresql/data + networks: + - archive-net + healthcheck: + test: ["CMD-SHELL", "pg_isready -U archiv -d archiv"] + interval: 10s + timeout: 5s + retries: 5 + + minio: + # Pinned MinIO release for reproducible deploys; Renovate keeps it current. + image: minio/minio:RELEASE.2025-02-28T09-55-16Z + restart: unless-stopped + command: server /data --console-address ":9001" + environment: + MINIO_ROOT_USER: archiv + MINIO_ROOT_PASSWORD: ${MINIO_PASSWORD} + volumes: + - minio-data:/data + networks: + - archive-net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + # Idempotent bucket bootstrap + service-account creation. + # Runs once per `docker compose up` and exits 0; `--ignore-existing` and + # the user-add fallback are safe on re-deploy. + create-buckets: + image: minio/mc + depends_on: + minio: + condition: service_healthy + networks: + - archive-net + environment: + MINIO_PASSWORD: ${MINIO_PASSWORD} + MINIO_APP_PASSWORD: ${MINIO_APP_PASSWORD} + entrypoint: > + /bin/sh -c " + set -e; + /usr/bin/mc alias set myminio http://minio:9000 archiv $$MINIO_PASSWORD; + /usr/bin/mc mb myminio/familienarchiv --ignore-existing; + /usr/bin/mc anonymous set private myminio/familienarchiv; + /usr/bin/mc admin user add myminio archiv-app $$MINIO_APP_PASSWORD || /usr/bin/mc admin user enable myminio archiv-app; + /usr/bin/mc admin policy attach myminio readwrite --user archiv-app || true; + exit 0; + " + + # Dev-only mail catcher; gated behind the staging profile so production + # never starts it. Staging workflow runs with `--profile staging`. + mailpit: + image: axllent/mailpit:latest + restart: unless-stopped + profiles: ["staging"] + networks: + - archive-net + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:8025/api/v1/info >/dev/null 2>&1 || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + + ocr-service: + build: + context: ./ocr-service + restart: unless-stopped + expose: + - "8000" + # Surya OCR loads ~5GB of transformer models at startup; first request + # triggers a further ~1GB Kraken model download into ocr-cache. 
+ mem_limit: 12g + memswap_limit: 12g + volumes: + - ocr-models:/app/models + - ocr-cache:/root/.cache + environment: + KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel + TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} + OCR_CONFIDENCE_THRESHOLD: "0.3" + OCR_CONFIDENCE_THRESHOLD_KURRENT: "0.5" + networks: + - archive-net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 10s + timeout: 5s + retries: 12 + start_period: 120s + + backend: + image: familienarchiv/backend:${TAG:-nightly} + build: + context: ./backend + restart: unless-stopped + depends_on: + db: + condition: service_healthy + minio: + condition: service_healthy + ocr-service: + condition: service_healthy + # Bound to localhost only — Caddy fronts external traffic. + ports: + - "127.0.0.1:${PORT_BACKEND}:8080" + environment: + SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/archiv + SPRING_DATASOURCE_USERNAME: archiv + SPRING_DATASOURCE_PASSWORD: ${POSTGRES_PASSWORD} + # Application uses the bucket-scoped service account, not MinIO root. + S3_ENDPOINT: http://minio:9000 + S3_ACCESS_KEY: archiv-app + S3_SECRET_KEY: ${MINIO_APP_PASSWORD} + S3_BUCKET_NAME: familienarchiv + S3_REGION: us-east-1 + # No SPRING_PROFILES_ACTIVE — base application.yaml is production-ready + # (Swagger disabled, show-sql off, open-in-view false). + APP_BASE_URL: https://${APP_DOMAIN} + APP_ADMIN_USERNAME: ${APP_ADMIN_USERNAME} + APP_ADMIN_PASSWORD: ${APP_ADMIN_PASSWORD} + APP_OCR_BASE_URL: http://ocr-service:8000 + APP_OCR_TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} + MAIL_HOST: ${MAIL_HOST} + MAIL_PORT: ${MAIL_PORT:-587} + MAIL_USERNAME: ${MAIL_USERNAME:-} + MAIL_PASSWORD: ${MAIL_PASSWORD:-} + APP_MAIL_FROM: ${APP_MAIL_FROM:-noreply@raddatz.cloud} + SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-true} + SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-true} + networks: + - archive-net + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:8080/actuator/health | grep -q UP || exit 1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 30s + + frontend: + image: familienarchiv/frontend:${TAG:-nightly} + build: + context: ./frontend + target: production + restart: unless-stopped + depends_on: + backend: + condition: service_healthy + ports: + - "127.0.0.1:${PORT_FRONTEND}:3000" + environment: + # SSR fetches go inside the docker network; clients hit https://${APP_DOMAIN} + API_INTERNAL_URL: http://backend:8080 + ORIGIN: https://${APP_DOMAIN} + networks: + - archive-net + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:3000/login >/dev/null 2>&1 || exit 1"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s -- 2.49.1 From 56e55ff488e2c9295d868a5a6fe7ee5b8cdfeb73 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:54:38 +0200 Subject: [PATCH 05/39] feat(infra): add production Caddyfile Reverse proxy for the Familienarchiv host, validated against Caddy 2. Includes both vhosts (production and staging), the Gitea vhost, and: - HSTS, X-Content-Type-Options, Referrer-Policy headers on every site - "-Server" header strip to hide the Caddy version - /actuator/* responds 404 on both archive vhosts (defense in depth for Spring Boot's management endpoints) X-Frame-Options is intentionally not set in Caddy: Spring Security configures frame-options SAMEORIGIN for the in-app PDF preview iframe; a DENY header here would conflict. Refs #497. 
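The "validated against Caddy 2" check can be re-run locally before editing
the vhosts — a sketch, assuming only that Docker is available on the
workstation (the invocation itself is illustrative, the file path is the
committed one):

    docker run --rm \
      -v "$PWD/infra/caddy/Caddyfile:/etc/caddy/Caddyfile:ro" \
      caddy:2 caddy validate --config /etc/caddy/Caddyfile --adapter caddyfile

A clean exit means the file parses and the snippet imports resolve; it does
not exercise DNS, TLS issuance, or the upstreams.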
Co-Authored-By: Claude Sonnet 4.6 --- infra/caddy/Caddyfile | 63 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 infra/caddy/Caddyfile diff --git a/infra/caddy/Caddyfile b/infra/caddy/Caddyfile new file mode 100644 index 00000000..f32b1f2f --- /dev/null +++ b/infra/caddy/Caddyfile @@ -0,0 +1,63 @@ +# Caddyfile for the Familienarchiv host. +# +# Caddy runs on the host (not in a container) and reverse-proxies into +# the docker compose stacks bound to 127.0.0.1. +# +# Naming convention for ports (also documented in docker-compose.prod.yml): +# production: backend 8080, frontend 3000 +# staging: backend 8081, frontend 3001 +# gitea: 3005 +# +# Security headers and the /actuator block apply to both archive vhosts. +# X-Frame-Options is deliberately NOT set here: Spring Security configures +# frame-options SAMEORIGIN (for the in-app PDF preview iframe). Setting +# DENY in Caddy would conflict. + +(security_headers) { + header { + Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" + X-Content-Type-Options "nosniff" + Referrer-Policy "strict-origin-when-cross-origin" + -Server + } +} + +(block_actuator) { + # Defense in depth: even if management.endpoints.web.exposure.include grows + # in application.yaml, /actuator/* is unreachable externally. The internal + # Prometheus scrape (future) talks to the backend directly on the docker + # network, not via Caddy. + @actuator path /actuator/* + respond @actuator 404 +} + +archiv.raddatz.cloud { + import security_headers + import block_actuator + + handle /api/* { + reverse_proxy 127.0.0.1:8080 + } + + handle { + reverse_proxy 127.0.0.1:3000 + } +} + +staging.raddatz.cloud { + import security_headers + import block_actuator + + handle /api/* { + reverse_proxy 127.0.0.1:8081 + } + + handle { + reverse_proxy 127.0.0.1:3001 + } +} + +git.raddatz.cloud { + import security_headers + reverse_proxy 127.0.0.1:3005 +} -- 2.49.1 From 59349dfe93c4bca11c125c107d7dad9756f1e7b6 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 21:55:41 +0200 Subject: [PATCH 06/39] feat(ci): add nightly staging deploy workflow Runs daily at 02:00 (and on workflow_dispatch). Builds the prod compose stack with BuildKit, writes a transient .env.staging from Gitea secrets, then `docker compose up -d --wait` so the job fails loudly if any service's healthcheck never reports healthy. The --profile staging flag starts the mailpit catcher in place of a real SMTP relay; no production SMTP credentials touch the staging environment. The .env.staging file is cleaned up in `if: always()` to avoid leaving secrets in the runner workspace between runs. Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 80 ++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 .gitea/workflows/nightly.yml diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml new file mode 100644 index 00000000..82fc0c5c --- /dev/null +++ b/.gitea/workflows/nightly.yml @@ -0,0 +1,80 @@ +name: nightly + +# Builds and deploys the staging environment from main every night. +# Runs on the self-hosted runner using Docker-out-of-Docker (the docker +# socket is mounted in), so `docker compose build` produces images on +# the host daemon and `docker compose up` consumes them directly — no +# registry hop. 
+# +# Staging environment isolation: +# - project name: archiv-staging +# - host ports: backend 8081, frontend 3001 +# - profile: staging (starts mailpit instead of a real SMTP relay) +# +# Required Gitea secrets: +# STAGING_POSTGRES_PASSWORD +# STAGING_MINIO_PASSWORD +# STAGING_MINIO_APP_PASSWORD +# STAGING_OCR_TRAINING_TOKEN +# STAGING_APP_ADMIN_USERNAME +# STAGING_APP_ADMIN_PASSWORD + +on: + schedule: + - cron: "0 2 * * *" + workflow_dispatch: + +env: + # Ensures the backend Dockerfile's `RUN --mount=type=cache` lines are + # honoured (Maven cache survives between runs). + DOCKER_BUILDKIT: "1" + +jobs: + deploy-staging: + runs-on: self-hosted + steps: + - uses: actions/checkout@v4 + + - name: Write staging env file + run: | + cat > .env.staging < Date: Sun, 10 May 2026 21:56:37 +0200 Subject: [PATCH 07/39] feat(ci): add release production deploy workflow Fires on `v*` tag push. Tags the built images with the git tag so rollbacks are a one-liner (TAG= docker compose ... up -d). `up -d --wait` blocks until every service healthcheck reports healthy; a bad release fails the workflow rather than crash-looping silently. The .env.production file containing all Gitea secrets is removed in `if: always()` after the deploy step. Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/release.yml | 79 ++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .gitea/workflows/release.yml diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml new file mode 100644 index 00000000..dbf7a9a8 --- /dev/null +++ b/.gitea/workflows/release.yml @@ -0,0 +1,79 @@ +name: release + +# Builds and deploys the production environment on `v*` tag push. +# Runs on the self-hosted runner via Docker-out-of-Docker; images are +# tagged with the actual git tag (e.g. 
v1.0.0) so rollback is +# `TAG= docker compose -f docker-compose.prod.yml -p archiv-production up -d --wait` +# +# Production environment: +# - project name: archiv-production +# - host ports: backend 8080, frontend 3000 +# - profile: (none) — mailpit is excluded; real SMTP relay is used +# +# Required Gitea secrets: +# PROD_POSTGRES_PASSWORD +# PROD_MINIO_PASSWORD +# PROD_MINIO_APP_PASSWORD +# PROD_OCR_TRAINING_TOKEN +# PROD_APP_ADMIN_USERNAME (CRITICAL: see docs/DEPLOYMENT.md) +# PROD_APP_ADMIN_PASSWORD (CRITICAL: locked in on first deploy) +# MAIL_HOST +# MAIL_PORT +# MAIL_USERNAME +# MAIL_PASSWORD + +on: + push: + tags: + - "v*" + +env: + DOCKER_BUILDKIT: "1" + +jobs: + deploy-production: + runs-on: self-hosted + steps: + - uses: actions/checkout@v4 + + - name: Write production env file + run: | + cat > .env.production < Date: Sun, 10 May 2026 21:58:51 +0200 Subject: [PATCH 08/39] docs(deployment): rewrite for Gitea Actions / Caddy / prod compose Brings DEPLOYMENT.md in line with the production deployment landed in #497: - Topology diagram: frontend port 3000 (Node adapter), 127.0.0.1 binding, project-name isolation between prod and staging - Caddyfile now lives in-tree at infra/caddy/Caddyfile (symlinked onto the server) - Dev vs prod table: documents the new deploy method (workflows + --wait) and the prod-compose specific differences - Env vars: adds MINIO_APP_PASSWORD; notes that prod compose hardcodes the MinIO root user and the bucket name - Bootstrap section: server hardening, fail2ban, Tailscale, the 16 Gitea secrets, and the workflow_dispatch first-deploy step - Admin password warning: first deploy locks the password, secret rotation after that point has no effect - Rollback: TAG= override + docker compose up -d --wait Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- docs/DEPLOYMENT.md | 152 +++++++++++++++++++++++++++++++-------------- 1 file changed, 104 insertions(+), 48 deletions(-) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 6e697c55..674bc15f 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -27,20 +27,22 @@ This doc is the Day-1 checklist and operational reference. It links to the canon ```mermaid graph TD Browser -->|HTTPS| Caddy["Caddy (TLS termination)"] - Caddy -->|HTTP :5173| Frontend["Web Frontend\nSvelteKit / Node.js"] + Caddy -->|HTTP :3000| Frontend["Web Frontend\nSvelteKit Node adapter"] Caddy -->|HTTP :8080| Backend["API Backend\nSpring Boot / Jetty :8080"] Backend -->|JDBC :5432| DB[(PostgreSQL 16)] - Backend -->|S3 API :9000| MinIO[(MinIO / Hetzner OBS)] + Backend -->|S3 API :9000| MinIO[(MinIO)] Backend -->|HTTP :8000 internal| OCR["OCR Service\nPython FastAPI"] OCR -->|presigned URL| MinIO Browser -->|SSE direct| Backend ``` **Key facts:** -- Caddy terminates TLS and reverse-proxies to frontend and backend. See the Caddyfile in [`docs/infrastructure/production-compose.md`](infrastructure/production-compose.md). -- The OCR service has **no external port** — reachable only on the internal Docker network from the backend. +- Caddy terminates TLS and reverse-proxies to frontend (`:3000`) and backend (`:8080`). The Caddyfile is committed at [`infra/caddy/Caddyfile`](../infra/caddy/Caddyfile) and is installed on the host as `/etc/caddy/Caddyfile` (symlink). +- The host binds all docker-published ports to `127.0.0.1` only; Caddy is the sole external entry point. +- The OCR service has **no published port** — reachable only on the internal Docker network from the backend. 
- SSE notifications go directly backend → browser (not via the SvelteKit SSR layer). -- Management port 8081 (Spring Actuator / Prometheus scrape) is internal only — the Caddy config blocks `/actuator/*` externally. +- The Caddyfile responds `404` on `/actuator/*` (defense in depth). Internal monitoring scrapes the backend on the docker network, not through Caddy. +- Production and staging cohabit on the same host via docker compose project names: `archiv-production` (ports 8080/3000) and `archiv-staging` (ports 8081/3001). ### OCR memory requirements @@ -56,15 +58,19 @@ A CX32 cannot honour a `mem_limit: 12g` — set it to `6g` in the prod overlay o ### Dev vs production differences -| Concern | Dev compose | Prod overlay | +| Concern | Dev (`docker-compose.yml`) | Prod (`docker-compose.prod.yml`) | |---|---|---| -| MinIO image tag | `minio/minio:latest` (unpinned) | Pinned in prod overlay | -| Data persistence | Bind mounts `./data/postgres`, `./data/minio` | Named Docker volumes | -| Bucket creation | `create-buckets` helper container | Pre-created in Hetzner console | -| Spring profile | `dev,e2e` (enables OpenAPI + Swagger UI) | `prod` | -| Mail | Mailpit (local catcher) | Real SMTP | +| MinIO image tag | `minio/minio:latest` | Pinned `minio/minio:RELEASE.…` | +| Data persistence | Bind mounts `./data/postgres`, `./data/minio` | Named Docker volumes (`postgres-data`, `minio-data`) | +| MinIO credentials for backend | Root user/password | Service account `archiv-app` with bucket-scoped rights | +| Bucket creation | `create-buckets` helper | Same helper, plus service-account bootstrap on every up | +| Spring profile | `dev,e2e` (Swagger + e2e overrides) | unset — base `application.yaml` is production-ready | +| Mail | Mailpit (local catcher) | Real SMTP (production) / Mailpit via `profiles: [staging]` (staging) | +| Frontend image | Dev server, `target: development`, port 5173 | Node adapter, `target: production`, port 3000 | +| Host port binding | All published | Bound to `127.0.0.1` only; Caddy is the front door | +| Deploy method | `docker compose up -d` (manual) | Gitea Actions: `nightly.yml` (staging, cron) and `release.yml` (production, on `v*` tag) — both use `up -d --wait` | -Full prod overlay: [`docs/infrastructure/production-compose.md`](infrastructure/production-compose.md). +Full prod compose: [`docker-compose.prod.yml`](../docker-compose.prod.yml). Workflow files: [`.gitea/workflows/nightly.yml`](../.gitea/workflows/nightly.yml), [`.gitea/workflows/release.yml`](../.gitea/workflows/release.yml). --- @@ -112,9 +118,10 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back | Variable | Purpose | Default | Required? | Sensitive? | |---|---|---|---|---| -| `MINIO_ROOT_USER` | MinIO root username | `minio_admin` | YES | — | -| `MINIO_ROOT_PASSWORD` | MinIO root password | `change-me` | YES | YES | -| `MINIO_DEFAULT_BUCKETS` | Bucket name | `archive-documents` | YES | — | +| `MINIO_ROOT_USER` | MinIO root username (dev compose only — prod compose hardcodes `archiv`) | `minio_admin` | YES (dev) | — | +| `MINIO_ROOT_PASSWORD` / `MINIO_PASSWORD` | MinIO root password. **Used only by the `mc admin` bootstrap in prod, never by the backend.** | `change-me` | YES | YES | +| `MINIO_APP_PASSWORD` | Password for the `archiv-app` service account that the backend uses. Bucket-scoped via `readwrite` policy on `familienarchiv`. Bootstrapped by `create-buckets`. 
| — | YES (prod) | YES | +| `MINIO_DEFAULT_BUCKETS` | Bucket name (dev compose only — prod compose hardcodes `familienarchiv`) | `archive-documents` | YES (dev) | — | ### OCR service @@ -129,48 +136,81 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back ## 3. Bootstrap from scratch -> Full VPS provisioning steps are in [`docs/infrastructure/production-compose.md`](infrastructure/production-compose.md). This section covers the sequence and the security-critical steps. +Production and staging deploy via Gitea Actions (`release.yml` on `v*` tag, `nightly.yml` on cron). The server itself only needs to host Caddy, Docker, and the runner — the workflows handle the rest. -### Security checklist — complete before first boot - -> ⚠️ **These defaults ship in `.env.example` and `application.yaml`. Change them or you will have an insecure installation.** - -- [ ] Set `APP_ADMIN_PASSWORD` (default: `admin123` — change before starting the backend) -- [ ] Set `APP_ADMIN_USERNAME` if you want a non-default admin login name (add to `.env` — not in `.env.example`) -- [ ] Rotate `POSTGRES_PASSWORD` from `change-me` -- [ ] Rotate `MINIO_ROOT_PASSWORD` from `change-me` -- [ ] Set a strong `APP_OCR_TRAINING_TOKEN` (backend) and the matching `TRAINING_TOKEN` (OCR service) — both must be the same value (`python3 -c "import secrets; print(secrets.token_hex(32))"`) -- [ ] Confirm `ALLOWED_PDF_HOSTS` is locked to your MinIO/S3 hostname — widening to `*` opens SSRF -- [ ] Set `SPRING_PROFILES_ACTIVE=prod` in the prod overlay (not `dev,e2e` — that exposes Swagger UI and `/v3/api-docs`) -- [ ] Use a dedicated MinIO service account for `S3_ACCESS_KEY` / `S3_SECRET_KEY`, not the root credentials - -### Bootstrap sequence +### 3.1 Server one-time setup ```bash -# 1. Copy and fill the env file -cp .env.example .env -# edit .env — complete the security checklist above first +# Base hardening +ufw default deny incoming && ufw allow 22/tcp && ufw allow 80/tcp && ufw allow 443/tcp && ufw enable +# /etc/ssh/sshd_config: PasswordAuthentication no, PermitRootLogin no -# 2. (Production only) Create the MinIO / Hetzner OBS bucket in the console -# The dev compose has a create-buckets helper; production does not. -# Create the bucket named $MINIO_DEFAULT_BUCKETS with private access. +# Install Caddy 2 (https://caddyserver.com/docs/install#debian-ubuntu-raspbian) +apt install caddy -# 3. Start the stack (prod overlay — see docs/infrastructure/production-compose.md) -# docker-compose.prod.yml is NOT committed — create it from the guide above -docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d +# Use the Caddyfile from the repo (replace path with the runner's clone target) +ln -sf /opt/familienarchiv/infra/caddy/Caddyfile /etc/caddy/Caddyfile +systemctl reload caddy -# 4. Flyway migrations run automatically on backend start. -# Watch the backend log to confirm: -docker compose logs --follow --tail=100 backend +# fail2ban — protect /api/auth/login from credential stuffing +# Jail watches Caddy access log for 401 responses on /api/auth/login. +# maxretry=10 findtime=10m bantime=30m +apt install fail2ban +# Drop the jail definition under /etc/fail2ban/jail.d/familienarchiv.conf -# 5. Verify the stack is healthy -curl http://localhost:8080/actuator/health -# Expected: {"status":"UP"} +# Tailscale — used by the backup pipeline to reach heim-nas (follow-up issue) +curl -fsSL https://tailscale.com/install.sh | sh && tailscale up -# 6. 
Open the app and log in with the admin credentials from .env +# Self-hosted Gitea runner — register against the repo with a runner token +# (see https://docs.gitea.com/usage/actions/quickstart for the register step) ``` -> **Do not use `docker-compose.ci.yml` locally** — it disables bind mounts that the dev workflow depends on. +### 3.2 DNS records + +``` +archiv.raddatz.cloud A +staging.raddatz.cloud A +git.raddatz.cloud A +``` + +### 3.3 Gitea secrets (Repo → Settings → Actions → Secrets) + +| Secret | Used by | Notes | +|---|---|---| +| `PROD_POSTGRES_PASSWORD` | release.yml | strong unique password | +| `PROD_MINIO_PASSWORD` | release.yml | MinIO root password; used only at bootstrap | +| `PROD_MINIO_APP_PASSWORD` | release.yml | application service-account password | +| `PROD_OCR_TRAINING_TOKEN` | release.yml | `python3 -c "import secrets; print(secrets.token_hex(32))"` | +| `PROD_APP_ADMIN_USERNAME` | release.yml | e.g. `admin@archiv.raddatz.cloud` | +| `PROD_APP_ADMIN_PASSWORD` | release.yml | **⚠ locked permanently on first deploy** — see §3.5 | +| `STAGING_POSTGRES_PASSWORD` | nightly.yml | different from prod | +| `STAGING_MINIO_PASSWORD` | nightly.yml | different from prod | +| `STAGING_MINIO_APP_PASSWORD` | nightly.yml | different from prod | +| `STAGING_OCR_TRAINING_TOKEN` | nightly.yml | different from prod | +| `STAGING_APP_ADMIN_USERNAME` | nightly.yml | e.g. `admin@staging.raddatz.cloud` | +| `STAGING_APP_ADMIN_PASSWORD` | nightly.yml | locked on first staging deploy | +| `MAIL_HOST` | release.yml | SMTP relay hostname (prod only) | +| `MAIL_PORT` | release.yml | typically `587` | +| `MAIL_USERNAME` | release.yml | SMTP user | +| `MAIL_PASSWORD` | release.yml | SMTP password | + +### 3.4 First deploy + +```bash +# 1. Trigger nightly.yml manually (Repo → Actions → nightly → "Run workflow") +# Expected: docker compose up -d --wait succeeds for archiv-staging +# 2. Verify TLS + reverse proxy +curl -I https://staging.raddatz.cloud/ +# Expected: 200 (login page) with HSTS + X-Content-Type-Options headers +# 3. When staging looks healthy, push a v* tag to trigger release.yml +git tag v1.0.0 && git push origin v1.0.0 +``` + +### 3.5 ⚠ Admin password is locked on first deploy + +`UserDataInitializer` creates the admin user **only if the email does not exist**. The first successful deploy persists the admin password to the database. Changing `PROD_APP_ADMIN_PASSWORD` in Gitea secrets after that point has **no effect** — the secret is only consulted when the row is missing. + +Before the first deploy: rotate `PROD_APP_ADMIN_PASSWORD` to a strong value. After the first deploy: change the admin password via the in-app account settings, not via the Gitea secret. --- @@ -224,7 +264,23 @@ docker exec -i archive-db psql -U ${POSTGRES_USER} ${POSTGRES_DB} < backup-YYYYM ### Planned — phase 5 of Production v1 milestone -Automated backup (PostgreSQL WAL archiving + MinIO bucket replication) is planned in the Production v1 milestone phase 5. Until that ships: **manual backups are the only recovery option.** +Automated backup (nightly `pg_dump` + MinIO `mc mirror` over Tailscale to `heim-nas`) is a follow-up issue. Until that ships: **manual backups are the only recovery option.** + +### Rollback + +Each release tag corresponds to a docker image tag on the host daemon (built via DooD; no registry). 
Rolling back to a previous tag is one command: + +```bash +TAG=v1.0.0 docker compose \ + -f docker-compose.prod.yml \ + -p archiv-production \ + --env-file /opt/familienarchiv/.env.production \ + up -d --wait --remove-orphans +``` + +If the rollback target image is no longer present on the host (host disk pruned, etc.), re-trigger `release.yml` for that tag from Gitea Actions UI — it rebuilds and redeploys. + +**Flyway migrations are not auto-rolled-back.** If a release contained a destructive migration (drop column, rename table), a tag rollback brings the schema back to a previous app version but the data shape has already changed. For breaking schema changes, prefer a forward-only fix. --- -- 2.49.1 From e4df17f3080d983ee8a6c749d255e0119bc9f495 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 22:00:21 +0200 Subject: [PATCH 09/39] docs: retire overlay narrative; add Caddy to C4 L2 diagram - docs/infrastructure/production-compose.md: trimmed to VPS sizing, cost breakdown, and Hetzner ecosystem rationale. The inline compose spec (overlay + Hetzner OBS in prod) is retired; the live file is now docker-compose.prod.yml at the repo root and the Caddyfile lives at infra/caddy/Caddyfile. Observability stack is called out as a not-yet-deployed gap (issue #498). - docs/architecture/c4/l2-containers.puml: adds Caddy as a named reverse-proxy container with the two port paths and notes the archiv-app service-account split on MinIO access. Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- docs/architecture/c4/l2-containers.puml | 20 +- docs/infrastructure/production-compose.md | 270 +++------------------- 2 files changed, 44 insertions(+), 246 deletions(-) diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml index bd187bca..367b7d93 100644 --- a/docs/architecture/c4/l2-containers.puml +++ b/docs/architecture/c4/l2-containers.puml @@ -6,23 +6,27 @@ title Container Diagram: Familienarchiv Person(user, "User", "Admin or family member") System_Ext(mail, "Email Service", "SMTP server. Delivers notification and password-reset emails.") +Container(caddy, "Reverse Proxy", "Caddy 2 (host-installed)", "TLS termination (auto Let's Encrypt). Routes /api/* to backend:8080, everything else to frontend:3000. Responds 404 on /actuator/* and adds HSTS, X-Content-Type-Options, Referrer-Policy headers.") + System_Boundary(archiv, "Familienarchiv (Docker Compose)") { - Container(frontend, "Web Frontend", "SvelteKit / Node.js", "Server-side rendered UI. Handles auth session cookies, document search and viewer, transcription editor, annotation layer, family tree (Stammbaum), stories (Geschichten), activity feed (Chronik), enrichment workflow, and admin panel.") - Container(backend, "API Backend", "Spring Boot 4 / Java 21 / Jetty", "REST API. Implements document management, search, user auth, file upload/download, transcription, OCR orchestration, and SSE notifications.") + Container(frontend, "Web Frontend", "SvelteKit / Node adapter / port 3000", "Server-side rendered UI. Handles auth session cookies, document search and viewer, transcription editor, annotation layer, family tree (Stammbaum), stories (Geschichten), activity feed (Chronik), enrichment workflow, and admin panel.") + Container(backend, "API Backend", "Spring Boot 4 / Java 21 / Jetty / port 8080", "REST API. Implements document management, search, user auth, file upload/download, transcription, OCR orchestration, and SSE notifications. 
Trusts X-Forwarded-* headers from Caddy.") Container(ocr, "OCR Service", "Python FastAPI / port 8000", "Handwritten text recognition (HTR) and OCR microservice. Single-node by design — see ADR-001. Reachable only on the internal Docker network; no external port exposed.") ContainerDb(db, "Relational Database", "PostgreSQL 16", "Stores document metadata, persons, users, permission groups, tags, transcription blocks, audit log, and Spring Session data.") - ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Objects keyed as documents/{UUID}_{filename}.") - Container(mc, "Bucket Init Helper", "MinIO Client (mc)", "One-shot container on startup. Creates the archive bucket with private access policy.") + ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Backend uses a bucket-scoped service account (archiv-app), not MinIO root.") + Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.") } -Rel(user, frontend, "Uses", "HTTPS / Browser") +Rel(user, caddy, "HTTPS", "TLS 1.2/1.3") +Rel(caddy, frontend, "Reverse proxies non-/api requests", "HTTP / loopback:3000") +Rel(caddy, backend, "Reverse proxies /api/*", "HTTP / loopback:8080") Rel(frontend, backend, "API requests with Basic Auth token", "HTTP / REST / JSON") -Rel(backend, user, "SSE notifications (server-sent events)", "HTTP / SSE — direct backend-to-browser") +Rel(backend, user, "SSE notifications (server-sent events)", "HTTP / SSE — fronted by Caddy") Rel(backend, db, "Reads and writes metadata and sessions", "JDBC / SQL") -Rel(backend, storage, "Uploads and streams document files", "HTTP / S3 API (AWS SDK v2)") +Rel(backend, storage, "Uploads and streams document files using archiv-app service account", "HTTP / S3 API (AWS SDK v2)") Rel(backend, ocr, "OCR job requests with presigned MinIO URL", "HTTP / REST / JSON") Rel(backend, mail, "Sends notification and password-reset emails (optional)", "SMTP") Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned") -Rel(mc, storage, "Creates bucket on startup", "MinIO Client CLI") +Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI") @enduml diff --git a/docs/infrastructure/production-compose.md b/docs/infrastructure/production-compose.md index a0f06df9..75b513c3 100644 --- a/docs/infrastructure/production-compose.md +++ b/docs/infrastructure/production-compose.md @@ -1,214 +1,22 @@ # Production Docker Compose & Infrastructure -This document contains the full production Docker Compose file, Caddyfile, VPS sizing recommendations, cost breakdown, and Hetzner ecosystem overview. +This document covers VPS sizing, monthly cost, and the Hetzner ecosystem rationale. The compose file and Caddyfile that previously lived inline in this doc are now committed to the repo root. 
+ +> **Where to find the live files (after #497)** +> - Production compose: [`docker-compose.prod.yml`](../../docker-compose.prod.yml) (standalone, not an overlay) +> - Caddyfile: [`infra/caddy/Caddyfile`](../../infra/caddy/Caddyfile) +> - Deploy workflows: [`.gitea/workflows/nightly.yml`](../../.gitea/workflows/nightly.yml) and [`.gitea/workflows/release.yml`](../../.gitea/workflows/release.yml) +> - Bootstrap checklist, secrets, rollback procedure: [`docs/DEPLOYMENT.md`](../DEPLOYMENT.md) + +The original spec in this doc proposed an overlay pattern (`docker compose -f docker-compose.yml -f docker-compose.prod.yml`) with MinIO disabled in production in favour of Hetzner Object Storage. That approach was retired in #497 in favour of a standalone prod compose that keeps MinIO self-hosted on the VPS. The Hetzner OBS migration is tracked as a future follow-up; the swap is three env vars + `mc mirror` once we decide to do it. --- -## Full docker-compose.prod.yml +## Observability stack — not yet deployed -Usage: `docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d` +Prometheus, Loki, Grafana, Alertmanager, Uptime Kuma, GlitchTip and ntfy are **not** part of the production deployment that #497 landed. They are tracked as follow-up issue #498. -```yaml -# docker-compose.prod.yml -# Usage: docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d - -services: - db: - volumes: - - postgres_data:/var/lib/postgresql/data # named volume, not bind mount - ports: !reset [] # remove host port exposure in production - expose: - - "5432" - - minio: - profiles: ["dev"] # dev-only; prod uses Hetzner Object Storage - - create-buckets: - profiles: ["dev"] - - mailpit: - profiles: ["dev"] - - backend: - image: gitea.example.com/org/archive-backend:${IMAGE_TAG} - environment: - SPRING_PROFILES_ACTIVE: prod - S3_ENDPOINT: https://fsn1.your-objectstorage.com - MAIL_HOST: ${MAIL_HOST} - MAIL_PORT: 587 - SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: "true" - SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: "true" - ports: !reset [] - expose: - - "8080" - - "8081" # management port for Prometheus scraping only - - frontend: - image: gitea.example.com/org/archive-frontend:${IMAGE_TAG} - ports: !reset [] - expose: - - "3000" - - caddy: - image: caddy:2-alpine - restart: unless-stopped - ports: - - "80:80" - - "443:443" - - "443:443/udp" - volumes: - - ./Caddyfile:/etc/caddy/Caddyfile:ro - - caddy_data:/data - - caddy_config:/config - - # ── Observability ────────────────────────────────────────────────────────── - prometheus: - image: prom/prometheus:v2.51.0 # pinned - restart: unless-stopped - volumes: - - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml:ro - - prometheus_data:/prometheus - expose: ["9090"] - - grafana: - image: grafana/grafana:10.4.0 # pinned - restart: unless-stopped - environment: - GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD} - GF_PATHS_PROVISIONING: /etc/grafana/provisioning - GF_SERVER_ROOT_URL: https://grafana.example.com - volumes: - - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro - - grafana_data:/var/lib/grafana - expose: ["3000"] - - loki: - image: grafana/loki:2.9.0 # pinned - restart: unless-stopped - volumes: - - ./observability/loki-config.yml:/etc/loki/config.yml:ro - - loki_data:/loki - expose: ["3100"] - - promtail: - image: grafana/promtail:2.9.0 # pinned - restart: unless-stopped - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - - ./observability/promtail-config.yml:/etc/promtail/config.yml:ro - - 
alertmanager: - image: prom/alertmanager:v0.27.0 # pinned - restart: unless-stopped - volumes: - - ./observability/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro - expose: ["9093"] - - # ── Uptime monitoring ────────────────────────────────────────────────────── - uptime-kuma: - image: louislam/uptime-kuma:1 - restart: unless-stopped - volumes: - - uptime_kuma_data:/app/data - expose: ["3001"] - - # ── Error tracking ───────────────────────────────────────────────────────── - glitchtip-web: - image: glitchtip/glitchtip:latest - restart: unless-stopped - depends_on: [db] - environment: - DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@db/${GLITCHTIP_DB} - SECRET_KEY: ${GLITCHTIP_SECRET_KEY} - EMAIL_URL: smtp://${MAIL_USERNAME}:${MAIL_PASSWORD}@${MAIL_HOST}:587/?tls=true - GLITCHTIP_DOMAIN: https://errors.example.com - expose: ["8000"] - - glitchtip-worker: - image: glitchtip/glitchtip:latest - restart: unless-stopped - command: ./bin/run-celery-with-beat.sh - depends_on: [glitchtip-web] - environment: - DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@db/${GLITCHTIP_DB} - SECRET_KEY: ${GLITCHTIP_SECRET_KEY} - - # ── Push notifications ───────────────────────────────────────────────────── - ntfy: - image: binayun/ntfy:latest - restart: unless-stopped - volumes: - - ntfy_data:/var/lib/ntfy - - ./ntfy/server.yml:/etc/ntfy/server.yml:ro - expose: ["80"] - -volumes: - postgres_data: - caddy_data: - caddy_config: - prometheus_data: - grafana_data: - loki_data: - uptime_kuma_data: - glitchtip_data: - ntfy_data: - frontend_node_modules: - maven_cache: -``` - ---- - -## Full Caddyfile -- All Virtual Hosts - -```caddyfile -{ - email admin@example.com -} - -# Main application -app.example.com { - header { - Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" - X-Content-Type-Options "nosniff" - X-Frame-Options "DENY" - Referrer-Policy "strict-origin-when-cross-origin" - -Server - } - @api path /api/* - reverse_proxy @api backend:8080 - @actuator path /actuator/* - respond @actuator 404 - reverse_proxy frontend:3000 -} - -# Gitea — source code and CI -git.example.com { - reverse_proxy gitea:3000 -} - -# Grafana — observability -grafana.example.com { - basicauth { - admin $2a$14$... - } - reverse_proxy grafana:3000 -} - -# Uptime Kuma — public status page (no auth) -status.example.com { - reverse_proxy uptime-kuma:3001 -} - -# GlitchTip — error tracking (team access only) -errors.example.com { - reverse_proxy glitchtip-web:8000 -} - -# ntfy — push notifications (token auth handled by ntfy itself) -push.example.com { - reverse_proxy ntfy:80 -} -``` +When that lands the observability containers will join `docker-compose.prod.yml` under a dedicated profile so they can be operated alongside the application stack without affecting the application containers' restart cycle. --- @@ -216,61 +24,47 @@ push.example.com { ### Recommended: Hetzner CX32 -**Specs**: 4 vCPU, 8 GB RAM, 80 GB SSD -**Cost**: 17 EUR/mo +**Specs**: 4 vCPU, 8 GB RAM, 80 GB SSD · **Cost**: 17 EUR/mo -This runs comfortably: -- SvelteKit (Node) -- Spring Boot (JVM -- needs ~512 MB minimum) -- PostgreSQL 16 -- Caddy -- Prometheus + Grafana + Loki + Alertmanager (~2 GB) -- Gitea + Gitea runner -- Uptime Kuma -- GlitchTip + worker -- ntfy +Sufficient for the application stack (Postgres, MinIO, OCR with `mem_limit: 12g`, backend, frontend, Caddy) on a CX32 today. Once the observability stack lands (Prometheus/Loki/Grafana/Alertmanager add ~2 GB) consider a CX42. 
### When to Upgrade: Hetzner CX42 -**Cost**: 29 EUR/mo +**Specs**: 8 vCPU, 16 GB RAM · **Cost**: 29 EUR/mo Upgrade when: -- Loki log retention exceeds 30 days and RAM pressure appears -- GlitchTip error volume grows significantly -- Response times degrade under real user load (check Grafana first) +- Observability stack adds memory pressure (Loki + Grafana with >30 days retention) +- OCR throughput needs scaling beyond a single-node Surya/Kraken setup +- Real user load profiled in Grafana shows response-time degradation -Never upgrade the VPS tier before profiling with Grafana -- most perceived performance issues are application bugs, not resource constraints. +Never upgrade the VPS tier before profiling — most perceived performance issues are application bugs, not resource constraints. --- -## Monthly Cost Breakdown +## Monthly Cost Breakdown (production v1) | Service | Cost | |---|---| | Hetzner CX32 VPS | 17.00 EUR | -| Hetzner Object Storage (~200 GB) | 5.00 EUR | -| Hetzner SMTP relay | ~1.00 EUR | | Hetzner DNS | 0.00 EUR | -| **Total** | **~23 EUR/mo** | +| Hetzner SMTP relay | ~1.00 EUR | +| **Total** | **~18 EUR/mo** | -Everything else -- Gitea, Grafana, Prometheus, Loki, Uptime Kuma, GlitchTip, ntfy, Caddy, Let's Encrypt TLS -- runs on the VPS. Zero additional cost. +MinIO data lives on the VPS disk (no Object Storage line item yet). The Hetzner OBS migration would add ~5 EUR/mo at ~200 GB. -Equivalent SaaS stack: 200-300 EUR/mo. +Equivalent SaaS stack: 200–300 EUR/mo. --- -## Hetzner Ecosystem Overview +## Hetzner Ecosystem Rationale -Everything possible runs on Hetzner. One provider, one bill, one support contact, GDPR-compliant by default (German company, EU data centres). +Everything possible runs on Hetzner. One provider, one bill, GDPR-compliant by default (German company, EU data centres). -### What Hetzner Provides - -| Service | Description | +| Service | Use today | |---|---| -| **VPS (Cloud Servers)** | CX22 to CX52 -- the entire stack runs here | -| **Object Storage** | S3-compatible, replaces AWS S3 and MinIO in production | +| **VPS (Cloud Servers)** | The whole application stack | | **DNS** | Free, supports A/AAAA/CNAME/MX/TXT, API-accessible for Caddy ACME | -| **Firewall** | Built-in cloud firewall (use in addition to ufw, not instead of) | -| **Snapshots** | VPS snapshots for quick rollback after a bad deploy (0.013 EUR/GB/mo) | -| **Volumes** | Attachable block storage if the VPS disk fills up (0.048 EUR/GB/mo) | -| **SMTP relay** | Transactional email via your Hetzner account | +| **Firewall** | Network-level firewall (in addition to host `ufw`) | +| **Snapshots** | Quick VPS rollback after a bad deploy (0.013 EUR/GB/mo) | +| **SMTP relay** | Transactional email from `noreply@raddatz.cloud` | +| **Object Storage** | Not used today — MinIO stays on-VPS. Available when we decide to migrate | -- 2.49.1 From c9ac83b2ba2f719b6e90561de307858ad8dc0cb5 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 11:58:34 +0200 Subject: [PATCH 10/39] fix(infra): pin axllent/mailpit tag Removes `:latest` from the mailpit service; pins to v1.29.7 so staging deploys are reproducible. Renovate keeps the tag current. 
Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 25c7856b..539e9f84 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -100,7 +100,8 @@ services: # Dev-only mail catcher; gated behind the staging profile so production # never starts it. Staging workflow runs with `--profile staging`. mailpit: - image: axllent/mailpit:latest + # Pinned for reproducibility; Renovate bumps the tag. + image: axllent/mailpit:v1.29.7 restart: unless-stopped profiles: ["staging"] networks: -- 2.49.1 From a36f25cfc3d395f5c05c44d2decf24300a989c46 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 11:59:18 +0200 Subject: [PATCH 11/39] fix(infra): pin minio/mc client tag Removes the implicit `:latest` from the create-buckets bootstrap container. Pins to RELEASE.2025-08-13T08-35-41Z so a breaking change in mc CLI syntax cannot silently brick deploys. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 539e9f84..1fdbd6c5 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -77,7 +77,8 @@ services: # Runs once per `docker compose up` and exits 0; `--ignore-existing` and # the user-add fallback are safe on re-deploy. create-buckets: - image: minio/mc + # Pinned mc client release for reproducibility; Renovate keeps it current. + image: minio/mc:RELEASE.2025-08-13T08-35-41Z depends_on: minio: condition: service_healthy -- 2.49.1 From 47c5f77c8125194f751c917b30709c1ba4ce2a6c Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:00:34 +0200 Subject: [PATCH 12/39] fix(infra): fail loud when archiv-app is missing the readwrite policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous `mc admin policy attach … || true` swallowed every failure mode: a renamed policy, an mc CLI signature change, or a transient MinIO error would leave the bootstrap container exiting zero with the service account possessing no permissions, and the backend would then fail every S3 call after a "successful" deploy. Replace the silent fallback with verify-after: keep the attach (idempotent in current mc, redundant in older versions), then assert via `mc admin user info` that `readwrite` ends up on archiv-app. A genuine attach failure now exits 1 and blocks the stack from starting. 
Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 1fdbd6c5..71634b76 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -94,8 +94,8 @@ services: /usr/bin/mc mb myminio/familienarchiv --ignore-existing; /usr/bin/mc anonymous set private myminio/familienarchiv; /usr/bin/mc admin user add myminio archiv-app $$MINIO_APP_PASSWORD || /usr/bin/mc admin user enable myminio archiv-app; - /usr/bin/mc admin policy attach myminio readwrite --user archiv-app || true; - exit 0; + /usr/bin/mc admin policy attach myminio readwrite --user archiv-app 2>/dev/null || true; + /usr/bin/mc admin user info myminio archiv-app | grep -q readwrite || { echo 'FATAL: archiv-app is missing the readwrite policy'; exit 1; }; " # Dev-only mail catcher; gated behind the staging profile so production -- 2.49.1 From 4eb5eba3478297189903f32355259f4b842e0757 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:01:23 +0200 Subject: [PATCH 13/39] feat(infra): parameterize OCR mem_limit via OCR_MEM_LIMIT Hardcoded `mem_limit: 12g` only works on CX42+ (16 GB) hosts; a CX32 (8 GB) cannot honour it. Make both mem_limit and memswap_limit driven by the OCR_MEM_LIMIT env var, defaulting to 12g so prod deploys on a CX42 keep current behaviour. Operators on smaller hosts override to 6g. Verified compose interpolation produces 12 GiB by default and 6 GiB when OCR_MEM_LIMIT=6g. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 71634b76..56f73689 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -121,8 +121,10 @@ services: - "8000" # Surya OCR loads ~5GB of transformer models at startup; first request # triggers a further ~1GB Kraken model download into ocr-cache. - mem_limit: 12g - memswap_limit: 12g + # CX42+ (16 GB RAM) honours the default. On a CX32 (8 GB) override with + # OCR_MEM_LIMIT=6g (slower first-request, fits the host). + mem_limit: ${OCR_MEM_LIMIT:-12g} + memswap_limit: ${OCR_MEM_LIMIT:-12g} volumes: - ocr-models:/app/models - ocr-cache:/root/.cache -- 2.49.1 From 8d27c82e6d25ec4f1a55a43ea686aeadb5218e09 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:02:28 +0200 Subject: [PATCH 14/39] feat(infra): write Caddy JSON access logs for fail2ban MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an (access_log) snippet writing JSON-formatted access logs to /var/log/caddy/access.log with 10mb rolling and 14-file retention. Both archive vhosts (archiv.raddatz.cloud and staging.raddatz.cloud) import it; the git vhost is intentionally excluded. This is the prerequisite for the fail2ban jail committed in the next commit — fail2ban tails this file looking for 401 responses on /api/auth/login to defend against credential stuffing. Validated with `caddy validate` against caddy:2. Co-Authored-By: Claude Opus 4.7 --- infra/caddy/Caddyfile | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/infra/caddy/Caddyfile b/infra/caddy/Caddyfile index f32b1f2f..b5dfd345 100644 --- a/infra/caddy/Caddyfile +++ b/infra/caddy/Caddyfile @@ -31,9 +31,23 @@ respond @actuator 404 } +(access_log) { + # JSON access log for fail2ban. The jail at infra/fail2ban/familienarchiv.conf + # watches this file for 401 responses on /api/auth/login. 
+ # Caddy auto-creates /var/log/caddy/ when running as the `caddy` system user. + log { + output file /var/log/caddy/access.log { + roll_size 10mb + roll_keep 14 + } + format json + } +} + archiv.raddatz.cloud { import security_headers import block_actuator + import access_log handle /api/* { reverse_proxy 127.0.0.1:8080 @@ -47,6 +61,7 @@ archiv.raddatz.cloud { staging.raddatz.cloud { import security_headers import block_actuator + import access_log handle /api/* { reverse_proxy 127.0.0.1:8081 -- 2.49.1 From ad69d7cb831887040d9dc1dd2f9429e09691f2b5 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:04:06 +0200 Subject: [PATCH 15/39] feat(infra): commit fail2ban jail for /api/auth/login MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two files mirroring the on-host install layout: infra/fail2ban/filter.d/familienarchiv-auth.conf infra/fail2ban/jail.d/familienarchiv.conf Filter parses the JSON access log emitted by Caddy (previous commit) and matches 401 responses on /api/auth/login. Jail bans the offending IP for 30 min after 10 attempts in a 10-minute window. Verified the failregex against four sample log lines via fail2ban-regex in an alpine container: - 2 brute-force 401 attempts → matched (ban) - 1 successful login (POST /api/auth/login 200) → not matched - 1 unrelated GET /login 200 → not matched Date template "ts":{EPOCH} parses Caddy's Unix-epoch ts field. The previous review iteration described this jail in DEPLOYMENT.md prose only; committing it makes the security posture reproducible from a fresh server build. Co-Authored-By: Claude Opus 4.7 --- .../filter.d/familienarchiv-auth.conf | 29 +++++++++++++++++++ infra/fail2ban/jail.d/familienarchiv.conf | 27 +++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 infra/fail2ban/filter.d/familienarchiv-auth.conf create mode 100644 infra/fail2ban/jail.d/familienarchiv.conf diff --git a/infra/fail2ban/filter.d/familienarchiv-auth.conf b/infra/fail2ban/filter.d/familienarchiv-auth.conf new file mode 100644 index 00000000..6f06551f --- /dev/null +++ b/infra/fail2ban/filter.d/familienarchiv-auth.conf @@ -0,0 +1,29 @@ +# fail2ban filter for credential-stuffing attempts against the +# Familienarchiv login endpoint. +# +# Parses Caddy JSON access log entries (configured in +# infra/caddy/Caddyfile via the (access_log) snippet). +# +# Sample matched line (whitespace inserted for readability): +# {"level":"info","ts":1700000000.12,"logger":"http.log.access", +# "msg":"handled request", +# "request":{"remote_ip":"203.0.113.42","method":"POST", +# "host":"archiv.raddatz.cloud", +# "uri":"/api/auth/login",…}, +# "status":401,…} +# +# Caddy emits remote_ip *inside* the request object and status at the +# top level. The order within the request object is stable +# (remote_ip → … → uri) across Caddy 2.7+. Lazy `.*?` keeps the regex +# robust to header-dict size growth. + +[INCLUDES] +before = common.conf + +[Definition] +failregex = ^\s*\{.*?"remote_ip":"".*?"uri":"/api/auth/login.*?"status":\s*401\b + +ignoreregex = + +# Caddy's ts field is a Unix epoch with sub-second precision. +datepattern = "ts":{EPOCH} diff --git a/infra/fail2ban/jail.d/familienarchiv.conf b/infra/fail2ban/jail.d/familienarchiv.conf new file mode 100644 index 00000000..e70d655f --- /dev/null +++ b/infra/fail2ban/jail.d/familienarchiv.conf @@ -0,0 +1,27 @@ +# Jail definition for the Familienarchiv login endpoint. 
+# +# Install: ln -sf /opt/familienarchiv/infra/fail2ban/jail.d/familienarchiv.conf \ +# /etc/fail2ban/jail.d/familienarchiv.conf +# ln -sf /opt/familienarchiv/infra/fail2ban/filter.d/familienarchiv-auth.conf \ +# /etc/fail2ban/filter.d/familienarchiv-auth.conf +# systemctl reload fail2ban +# +# Verify with: +# fail2ban-client status familienarchiv-auth +# fail2ban-regex /var/log/caddy/access.log familienarchiv-auth +# +# Tuning rationale: +# - maxretry 10: legitimate users mistyping passwords don't trip the jail +# - findtime 10m: rolling window that catches automated brute force +# - bantime 30m: long enough to discourage scripted attacks, short +# enough that a user who fat-fingered their VPN comes +# back online within a coffee break + +[familienarchiv-auth] +enabled = true +filter = familienarchiv-auth +logpath = /var/log/caddy/access.log +maxretry = 10 +findtime = 10m +bantime = 30m +action = iptables-multiport[name=familienarchiv-auth, port="http,https"] -- 2.49.1 From c523721ce8f78948f18efd34f69838298ecc7dac Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:05:00 +0200 Subject: [PATCH 16/39] feat(ci): smoke test staging deploy after up --wait Healthchecks prove containers are healthy on the docker network; they do not prove the public URL is reachable, HSTS still fires, or /actuator is still blocked at the edge. Add a post-deploy smoke step to nightly.yml that: 1. GETs https://staging.raddatz.cloud/login (frontend reachable) 2. asserts the response includes the Strict-Transport-Security header 3. asserts /actuator/health returns 404 (defense-in-depth verified) Failure aborts the workflow before the env-file cleanup step. The cleanup step still runs because it is `if: always()`. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 82fc0c5c..bdfa399f 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -75,6 +75,20 @@ jobs: --profile staging \ up -d --wait --remove-orphans + - name: Smoke test deployed environment + # Healthchecks confirm containers are healthy; they do NOT confirm the + # public surface works. This step catches: Caddy not reloaded, DNS + # missing, HSTS header dropped, /actuator block bypassed. + run: | + set -e + URL="https://staging.raddatz.cloud" + echo "Smoke test: $URL" + curl -fsS --max-time 10 "$URL/login" -o /dev/null + curl -fsS --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") + [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } + echo "All smoke checks passed" + - name: Cleanup env file if: always() run: rm -f .env.staging -- 2.49.1 From a91a3e1f6105cdbb26735a8350933d88817a084d Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:05:41 +0200 Subject: [PATCH 17/39] feat(ci): smoke test production deploy after up --wait Mirrors the nightly.yml smoke step against archiv.raddatz.cloud. Catches the same three failure modes (Caddy not reloaded, DNS missing, HSTS dropped, /actuator block bypassed) on the prod path. 
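The same checks can be replayed by hand when diagnosing a failed release
(a manual-usage sketch; the hostname matches this workflow, the timeout is
arbitrary):

    URL="https://archiv.raddatz.cloud"
    curl -fsS --max-time 10 "$URL/login" -o /dev/null && echo "frontend reachable"
    curl -fsS --max-time 10 -I "$URL/" | grep -i strict-transport-security
    curl -s -o /dev/null -w 'actuator: %{http_code}\n' --max-time 10 "$URL/actuator/health"   # expect 404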
Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/release.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index dbf7a9a8..ba5fb168 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -74,6 +74,18 @@ jobs: --env-file .env.production \ up -d --wait --remove-orphans + - name: Smoke test deployed environment + # See nightly.yml — same three checks, against the prod vhost. + run: | + set -e + URL="https://archiv.raddatz.cloud" + echo "Smoke test: $URL" + curl -fsS --max-time 10 "$URL/login" -o /dev/null + curl -fsS --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") + [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } + echo "All smoke checks passed" + - name: Cleanup env file if: always() run: rm -f .env.production -- 2.49.1 From 83565c6bb586258da72d57c75eab34afb8dcb1fe Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:06:48 +0200 Subject: [PATCH 18/39] docs(ci): document workflow operational assumptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two deploy workflows make two non-obvious assumptions that future maintainers should not have to rediscover by reading the diff: 1. Single-tenant self-hosted runner — the .env.* file lands on disk during the deploy and is cleaned up unconditionally. Multi-tenant usage would require switching to stdin-piped env input. 2. Host docker layer cache is authoritative — there is no actions/cache directive; a host-level `docker system prune` will cold-start the next build. Both notes added as block comments at the top of each workflow. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 12 ++++++++++++ .gitea/workflows/release.yml | 13 +++++++++++++ 2 files changed, 25 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index bdfa399f..118dd54f 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -6,6 +6,18 @@ name: nightly # the host daemon and `docker compose up` consumes them directly — no # registry hop. # +# Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup): +# +# 1. Single-tenant self-hosted runner. The "Write staging env file" step +# writes every secret to .env.staging on the runner filesystem; the +# `if: always()` cleanup step removes it. A multi-tenant runner +# would need to switch to docker compose --env-file <(stdin) instead. +# +# 2. Host docker layer cache is authoritative. There is no +# actions/cache; we rely on the host daemon to keep Maven and npm +# layers warm between runs. A `docker system prune` on the host +# will cause the next nightly build to be cold (5–10 min slower). +# # Staging environment isolation: # - project name: archiv-staging # - host ports: backend 8081, frontend 3001 diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index ba5fb168..d4332ba9 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -5,6 +5,19 @@ name: release # tagged with the actual git tag (e.g. v1.0.0) so rollback is # `TAG= docker compose -f docker-compose.prod.yml -p archiv-production up -d --wait` # +# Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup): +# +# 1. Single-tenant self-hosted runner. 
The "Write production env file" +# step writes every secret to .env.production on the runner +# filesystem; the `if: always()` cleanup step removes it. A +# multi-tenant runner would need to switch to +# `docker compose --env-file <(stdin)` instead. +# +# 2. Host docker layer cache is authoritative. There is no +# actions/cache; we rely on the host daemon to keep Maven and npm +# layers warm between runs. A `docker system prune` on the host +# will cause the next release build to be cold (5–10 min slower). +# # Production environment: # - project name: archiv-production # - host ports: backend 8080, frontend 3000 -- 2.49.1 From ba5bd9cb115264577d875e28ead0d051fbb21813 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 12:07:59 +0200 Subject: [PATCH 19/39] docs(deployment): document fail2ban symlink, OCR_MEM_LIMIT, smoke test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates DEPLOYMENT.md to match the infra changes in this PR: §1 OCR memory — point operators at the new OCR_MEM_LIMIT env var instead of telling them to edit "the prod overlay". §2 OCR env vars — add OCR_MEM_LIMIT to the table. §3.1 server setup — replace fail2ban prose with concrete `ln -sf` commands referencing the committed jail/filter. Document the single-tenant runner assumption near the runner-registration step. §3.4 first deploy — describe the new automated smoke test step. Co-Authored-By: Claude Opus 4.7 --- docs/DEPLOYMENT.md | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 674bc15f..e995739c 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -54,7 +54,7 @@ The OCR service requires significant RAM for model loading. The dev compose sets | Hetzner CX32 | 8 GB | 6 GB | Accept reduced batch sizes and slower throughput | | Hetzner CX22 | 4 GB | — | Disable the OCR service (`profiles: [ocr]`); run OCR on demand only | -A CX32 cannot honour a `mem_limit: 12g` — set it to `6g` in the prod overlay or use CX42. +A CX32 cannot honour the default `mem_limit: 12g` — set the `OCR_MEM_LIMIT=6g` env var (in `.env.production` / `.env.staging`, or as a Gitea secret consumed by the workflow) before deploying on a CX32. The prod compose interpolates this var with a 12g default. ### Dev vs production differences @@ -131,6 +131,7 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back | `ALLOWED_PDF_HOSTS` | SSRF protection — comma-separated list of allowed PDF source hosts. **Do not widen to `*`** | `minio,localhost,127.0.0.1` | YES | — | | `KRAKEN_MODEL_PATH` | Directory containing Kraken HTR models (populated by `download-kraken-models.sh`) | `/app/models/` | — | — | | `BLLA_MODEL_PATH` | Kraken baseline layout analysis model path | `/app/models/blla.mlmodel` | — | — | +| `OCR_MEM_LIMIT` | Container memory cap for ocr-service in `docker-compose.prod.yml`. Set to `6g` on CX32 hosts; leave unset on CX42+ to use the 12g default | `12g` (prod compose default) | — | — | --- @@ -152,17 +153,28 @@ apt install caddy ln -sf /opt/familienarchiv/infra/caddy/Caddyfile /etc/caddy/Caddyfile systemctl reload caddy -# fail2ban — protect /api/auth/login from credential stuffing -# Jail watches Caddy access log for 401 responses on /api/auth/login. -# maxretry=10 findtime=10m bantime=30m +# fail2ban — protect /api/auth/login from credential stuffing. +# Jail watches the Caddy JSON access log for 401 responses on +# /api/auth/login. 
The jail (maxretry=10 / findtime=10m / bantime=30m) +# and filter are committed under infra/fail2ban/ — symlink them in: apt install fail2ban -# Drop the jail definition under /etc/fail2ban/jail.d/familienarchiv.conf +ln -sf /opt/familienarchiv/infra/fail2ban/jail.d/familienarchiv.conf \ + /etc/fail2ban/jail.d/familienarchiv.conf +ln -sf /opt/familienarchiv/infra/fail2ban/filter.d/familienarchiv-auth.conf \ + /etc/fail2ban/filter.d/familienarchiv-auth.conf +systemctl reload fail2ban +# Verify after first deploy with: +# fail2ban-client status familienarchiv-auth +# fail2ban-regex /var/log/caddy/access.log familienarchiv-auth # Tailscale — used by the backup pipeline to reach heim-nas (follow-up issue) curl -fsSL https://tailscale.com/install.sh | sh && tailscale up -# Self-hosted Gitea runner — register against the repo with a runner token -# (see https://docs.gitea.com/usage/actions/quickstart for the register step) +# Self-hosted Gitea runner — register against the repo with a runner token. +# This runner is assumed single-tenant: the deploy workflows write .env.* +# files to disk during execution (cleaned up unconditionally on completion). +# A multi-tenant runner would need to switch to stdin-piped env files. +# (See https://docs.gitea.com/usage/actions/quickstart for the register step.) ``` ### 3.2 DNS records @@ -198,8 +210,12 @@ git.raddatz.cloud A ```bash # 1. Trigger nightly.yml manually (Repo → Actions → nightly → "Run workflow") -# Expected: docker compose up -d --wait succeeds for archiv-staging -# 2. Verify TLS + reverse proxy +# Expected: docker compose up -d --wait succeeds for archiv-staging, then +# the workflow's "Smoke test deployed environment" step asserts: +# - https://staging.raddatz.cloud/login returns 200 +# - HSTS header is present +# - /actuator/health returns 404 (defense-in-depth check) +# 2. (Optional) Re-verify manually curl -I https://staging.raddatz.cloud/ # Expected: 200 (login page) with HSTS + X-Content-Type-Options headers # 3. When staging looks healthy, push a v* tag to trigger release.yml -- 2.49.1 From e5d953dee88729bdbdd3004cc1ba5807be38cde7 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:01:06 +0200 Subject: [PATCH 20/39] test(config): rewrite ForwardHeadersConfigurationTest as context-less binder test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drops @SpringBootTest + PostgresContainerConfig + @MockitoBean S3Client in favour of Spring's Binder API against application.yaml. The new test binds the property into the typed ServerProperties.ForwardHeadersStrategy enum, so typos (`nativ`, `Native`, `framework `) and future enum renames fail the build with BindException — addresses the silent-coercion concern that the YAML-string assertion missed. Verified the test goes red on a typo (BindException: Failed to convert "nativ" → ForwardHeadersStrategy) and green on `native`. 
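To reproduce the red/green behaviour locally, one option is to flip the YAML
value and rerun the single test (a sketch; the Maven wrapper path and test
name follow this repo's layout):

    cd backend
    ./mvnw -q test -Dtest=ForwardHeadersConfigurationTest          # green on "native"
    sed -i 's/forward-headers-strategy: native/forward-headers-strategy: nativ/' \
        src/main/resources/application.yaml
    ./mvnw -q test -Dtest=ForwardHeadersConfigurationTest          # red: binder conversion failure
    git checkout -- src/main/resources/application.yaml            # restore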
Co-Authored-By: Claude Opus 4.7 --- .../ForwardHeadersConfigurationTest.java | 63 +++++++++++-------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java index b97f5ff0..755dad83 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/config/ForwardHeadersConfigurationTest.java @@ -1,37 +1,48 @@ package org.raddatz.familienarchiv.config; import org.junit.jupiter.api.Test; -import org.raddatz.familienarchiv.PostgresContainerConfig; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.context.annotation.Import; -import org.springframework.test.context.ActiveProfiles; -import org.springframework.test.context.bean.override.mockito.MockitoBean; -import software.amazon.awssdk.services.s3.S3Client; +import org.springframework.beans.factory.config.YamlPropertiesFactoryBean; +import org.springframework.boot.web.server.autoconfigure.ServerProperties.ForwardHeadersStrategy; +import org.springframework.boot.context.properties.bind.Binder; +import org.springframework.boot.context.properties.source.ConfigurationPropertySources; +import org.springframework.core.env.PropertiesPropertySource; +import org.springframework.core.io.ClassPathResource; + +import java.util.Properties; import static org.assertj.core.api.Assertions.assertThat; -@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.NONE) -@ActiveProfiles("test") -@Import(PostgresContainerConfig.class) +/** + * Binds {@code server.forward-headers-strategy} from {@code application.yaml} into + * Spring Boot's typed {@link ForwardHeadersStrategy} enum. The binder rejects any + * value that is not a valid enum constant ({@code BindException}), so a typo + * ({@code "nativ"}, {@code "Native"}, {@code "framework "}) or a future Spring + * rename of the property fails the test, not silently degrades to {@code NONE}. + * + *
No Spring context, no embedded server, no Testcontainers — this is the + * cheapest test that pins the contract "Caddy's X-Forwarded-Proto is trusted". + */ class ForwardHeadersConfigurationTest { - @MockitoBean - S3Client s3Client; - - @Autowired - @Value("${server.forward-headers-strategy:}") - String forwardHeadersStrategy; - @Test - void forward_headers_strategy_is_native_for_reverse_proxy_deployment() { - // Caddy terminates TLS and forwards X-Forwarded-Proto: https. - // Spring must trust those headers so that AppUser-facing redirect URLs, - // Spring Session cookies (Secure flag), and HttpServletRequest.getScheme() - // reflect the original client-facing scheme rather than the internal http hop. - assertThat(forwardHeadersStrategy) - .as("server.forward-headers-strategy must be 'native' so Jetty honours X-Forwarded-Proto behind Caddy") - .isEqualTo("native"); + void forward_headers_strategy_binds_to_NATIVE() { + YamlPropertiesFactoryBean yaml = new YamlPropertiesFactoryBean(); + yaml.setResources(new ClassPathResource("application.yaml")); + Properties props = yaml.getObject(); + assertThat(props).as("application.yaml must be on the classpath").isNotNull(); + + Binder binder = new Binder(ConfigurationPropertySources.from( + new PropertiesPropertySource("application", props))); + + ForwardHeadersStrategy strategy = binder + .bind("server.forward-headers-strategy", ForwardHeadersStrategy.class) + .orElseThrow(() -> new AssertionError( + "server.forward-headers-strategy is missing from application.yaml")); + + assertThat(strategy) + .as("Spring must trust X-Forwarded-Proto from Caddy so that " + + "request.getScheme(), redirect URLs, and the Spring Session " + + "'Secure' cookie reflect the original https client request.") + .isEqualTo(ForwardHeadersStrategy.NATIVE); } } -- 2.49.1 From 9652894aa44dc5879387eb878060b7aee9666801 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:03:04 +0200 Subject: [PATCH 21/39] test(ci): add fail2ban-regex regression job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Caddy 2.x emits JSON access logs; the failregex in infra/fail2ban/filter.d/familienarchiv-auth.conf depends on the "remote_ip" → "uri" → "status" key order being stable. A future Caddy upgrade that reorders fields would break the jail silently (regex no longer matches → fail2ban returns 0 hits → host stops banning brute-force, discovered only at the next incident). This job pins the contract: a sample /api/auth/login 401 line must match (1 hit) and a /api/auth/login 200 line must not (0 hits). Catches a regression at PR time instead of in production. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/ci.yml | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index e45d2a22..461b486d 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -114,4 +114,36 @@ jobs: run: | chmod +x mvnw ./mvnw clean test - working-directory: backend \ No newline at end of file + working-directory: backend + + # ─── fail2ban Regex Regression ──────────────────────────────────────────────── + # The filter parses Caddy's JSON access log; a Caddy upgrade that reorders + # the JSON keys would silently break it (fail2ban-regex would return + # "0 matches", fail2ban would stop banning, no error surface). This job + # pins the contract against a deterministic sample line. 
+ fail2ban-regex: + name: fail2ban Regex + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install fail2ban + run: | + sudo apt-get update + sudo apt-get install -y fail2ban + + - name: Matches /api/auth/login 401 + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/login"},"status":401}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '1 matched' \ + || { echo "expected 1 match for /api/auth/login 401"; exit 1; } + + - name: Does not match /api/auth/login 200 + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/login"},"status":200}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '0 matched' \ + || { echo "expected 0 matches for /api/auth/login 200"; exit 1; } \ No newline at end of file -- 2.49.1 From 91f70e652d57885606882e6052714bdbd1ab71e3 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:07:56 +0200 Subject: [PATCH 22/39] security(minio): scope archiv-app to bucket-only IAM policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces MinIO's built-in `readwrite` policy (which grants s3:* on arn:aws:s3:::* — every bucket present and future) with a bucket-scoped custom policy `archiv-app-policy`: - s3:GetObject / s3:PutObject / s3:DeleteObject on familienarchiv/* - s3:ListBucket / s3:GetBucketLocation on familienarchiv The previous configuration silently regressed the least-privilege guarantee that the service-account separation was supposed to provide: a future second bucket (logs, backups, mc-mirror staging) would have been read/write/delete-accessible to a compromised backend. While at it, two follow-on fixes: 1. Extract the entrypoint to infra/minio/bootstrap.sh. The previous inline `/bin/sh -c "..."` was already at the YAML-escaping ceiling; adding the policy-JSON heredoc would have made it unreadable. 2. Replace the `| grep -q readwrite || exit 1` fatal-check with a POSIX `case` substring match. The minio/mc image ships coreutils + bash but NOT grep/awk/sed — the original check was a no-op that ALWAYS exited 1 (verified locally). The new check passes on the first invocation and on every subsequent re-deploy. Idempotency verified locally: two consecutive `docker compose run --rm create-buckets` invocations both exit 0 with the user bound to the new policy. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 20 +++++------- infra/minio/bootstrap.sh | 67 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 13 deletions(-) create mode 100755 infra/minio/bootstrap.sh diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 56f73689..468beeec 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -74,10 +74,11 @@ services: retries: 3 # Idempotent bucket bootstrap + service-account creation. - # Runs once per `docker compose up` and exits 0; `--ignore-existing` and - # the user-add fallback are safe on re-deploy. + # Runs once per `docker compose up` and exits 0. 
The entrypoint is + # extracted to infra/minio/bootstrap.sh so the (non-trivial) idempotent + # logic is readable, reviewable, and unit-testable as a script rather + # than YAML-escaped shell. create-buckets: - # Pinned mc client release for reproducibility; Renovate keeps it current. image: minio/mc:RELEASE.2025-08-13T08-35-41Z depends_on: minio: @@ -87,16 +88,9 @@ services: environment: MINIO_PASSWORD: ${MINIO_PASSWORD} MINIO_APP_PASSWORD: ${MINIO_APP_PASSWORD} - entrypoint: > - /bin/sh -c " - set -e; - /usr/bin/mc alias set myminio http://minio:9000 archiv $$MINIO_PASSWORD; - /usr/bin/mc mb myminio/familienarchiv --ignore-existing; - /usr/bin/mc anonymous set private myminio/familienarchiv; - /usr/bin/mc admin user add myminio archiv-app $$MINIO_APP_PASSWORD || /usr/bin/mc admin user enable myminio archiv-app; - /usr/bin/mc admin policy attach myminio readwrite --user archiv-app 2>/dev/null || true; - /usr/bin/mc admin user info myminio archiv-app | grep -q readwrite || { echo 'FATAL: archiv-app is missing the readwrite policy'; exit 1; }; - " + volumes: + - ./infra/minio/bootstrap.sh:/bootstrap.sh:ro + entrypoint: ["/bin/sh", "/bootstrap.sh"] # Dev-only mail catcher; gated behind the staging profile so production # never starts it. Staging workflow runs with `--profile staging`. diff --git a/infra/minio/bootstrap.sh b/infra/minio/bootstrap.sh new file mode 100755 index 00000000..5394a0ea --- /dev/null +++ b/infra/minio/bootstrap.sh @@ -0,0 +1,67 @@ +#!/bin/sh +# Idempotent MinIO bootstrap for the Familienarchiv stack. +# +# Runs on every `docker compose up` (the create-buckets service is one-shot, +# no restart). Each step swallows the "already exists" error so the script +# is safe to re-run. +# +# What it does: +# 1. Register the MinIO alias using the root credentials +# 2. Create the application bucket if missing +# 3. Lock the bucket to private (defense in depth) +# 4. Create/enable the `archiv-app` service account (least-privilege user) +# 5. Install a bucket-scoped policy `archiv-app-policy`: +# - GetObject/PutObject/DeleteObject on familienarchiv/* +# - ListBucket + GetBucketLocation on familienarchiv +# (Replaces MinIO's built-in `readwrite` which grants s3:* on *.) +# 6. Attach the policy to `archiv-app` +# 7. Fatal assertion: read back the user and confirm the policy is bound. +# Uses `case` (POSIX) for substring match — the minio/mc image ships +# coreutils + bash but NOT grep/awk/sed. 
+# +# Required env vars: MINIO_PASSWORD, MINIO_APP_PASSWORD +set -e + +mc alias set myminio http://minio:9000 archiv "$MINIO_PASSWORD" + +mc mb myminio/familienarchiv --ignore-existing +mc anonymous set private myminio/familienarchiv + +mc admin user add myminio archiv-app "$MINIO_APP_PASSWORD" \ + || mc admin user enable myminio archiv-app + +cat > /tmp/archiv-app-policy.json <<'POLICY' +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"], + "Resource": ["arn:aws:s3:::familienarchiv/*"] + }, + { + "Effect": "Allow", + "Action": ["s3:ListBucket", "s3:GetBucketLocation"], + "Resource": ["arn:aws:s3:::familienarchiv"] + } + ] +} +POLICY + +mc admin policy create myminio archiv-app-policy /tmp/archiv-app-policy.json 2>/dev/null \ + || mc admin policy update myminio archiv-app-policy /tmp/archiv-app-policy.json + +mc admin policy attach myminio archiv-app-policy --user archiv-app 2>/dev/null || true + +INFO=$(mc admin user info myminio archiv-app) +case "$INFO" in + *archiv-app-policy*) + echo "archiv-app bound to archiv-app-policy" + ;; + *) + echo "FATAL: archiv-app is missing the bucket-scoped policy" + echo "----- user info -----" + echo "$INFO" + exit 1 + ;; +esac -- 2.49.1 From 156afa14a20946cd1a54789f704ba453faaa7bd4 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:08:51 +0200 Subject: [PATCH 23/39] test(ci): add compose bucket-bootstrap idempotency job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The create-buckets service in docker-compose.prod.yml runs on every `docker compose up` (one-shot, restart=no). A re-deploy that fails because the user/bucket/policy already exists would block the whole nightly/release pipeline — and the only way to find out today is to run a second deploy. This job runs the bootstrap twice against a throwaway minio stack and asserts both invocations exit 0. Caught at PR time, not at the third nightly deploy at 02:00. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/ci.yml | 50 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 461b486d..fd30bac6 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -146,4 +146,52 @@ jobs: out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) echo "$out" echo "$out" | grep -qE '0 matched' \ - || { echo "expected 0 matches for /api/auth/login 200"; exit 1; } \ No newline at end of file + || { echo "expected 0 matches for /api/auth/login 200"; exit 1; } + + # ─── Compose Bucket-Bootstrap Idempotency ───────────────────────────────────── + # docker-compose.prod.yml's create-buckets service runs on every + # `docker compose up` (one-shot, no restart). Must be idempotent — a + # re-deploy must not fail just because the bucket / user / policy + # already exists. Validated by running create-buckets twice against a + # throwaway minio stack and asserting both invocations exit 0. 
+ compose-idempotency: + name: Compose Bucket Idempotency + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Write stub env file + run: | + cat > .env.test <<'EOF' + TAG=test + PORT_BACKEND=18080 + PORT_FRONTEND=13000 + APP_DOMAIN=localhost + POSTGRES_PASSWORD=stub + MINIO_PASSWORD=stubrootpassword + MINIO_APP_PASSWORD=stubapppassword + OCR_TRAINING_TOKEN=stub + APP_ADMIN_USERNAME=admin@local + APP_ADMIN_PASSWORD=stub + MAIL_HOST=mailpit + MAIL_PORT=1025 + APP_MAIL_FROM=noreply@local + EOF + + - name: Bring up minio + run: | + docker compose -f docker-compose.prod.yml -p test-idem --env-file .env.test up -d --wait minio + + - name: First create-buckets run + run: | + docker compose -f docker-compose.prod.yml -p test-idem --env-file .env.test run --rm create-buckets + + - name: Second create-buckets run (idempotency check) + run: | + docker compose -f docker-compose.prod.yml -p test-idem --env-file .env.test run --rm create-buckets + + - name: Teardown + if: always() + run: | + docker compose -f docker-compose.prod.yml -p test-idem --env-file .env.test down -v + rm -f .env.test \ No newline at end of file -- 2.49.1 From 7e430998b8aed39484585467899c7d38b23e8d36 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:10:08 +0200 Subject: [PATCH 24/39] security(fail2ban): widen jail to /forgot-password and rate-limit 429 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The filter only watched /api/auth/login 401 — leaving the forgot-password endpoint open to: - email enumeration (slow brute-force probing which addresses exist) - password-reset brute-force against accounts whose addresses leak Widens the failregex to /api/auth/(login|forgot-password) and adds 429 to the status alternation so a future in-app rate-limiter response is also caught by the jail (defense in depth). CI assertions extended to cover both new dimensions plus a negative case on an unrelated 401 endpoint (/api/documents) — pins that the widening did not over-match. 
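With fail2ban installed locally, the widened filter can be exercised before
pushing (a sketch; the sample line mirrors the shape the CI job uses):

    printf '%s\n' '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/forgot-password"},"status":429}' > /tmp/sample.log
    fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf
    # expect "1 matched"; switching status to 200 or the uri to /api/documents should yield 0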
Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/ci.yml | 24 +++++++++++++++++++ .../filter.d/familienarchiv-auth.conf | 14 +++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index fd30bac6..6f6aa0e0 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -140,6 +140,22 @@ jobs: echo "$out" | grep -qE '1 matched' \ || { echo "expected 1 match for /api/auth/login 401"; exit 1; } + - name: Matches /api/auth/login 429 + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/login"},"status":429}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '1 matched' \ + || { echo "expected 1 match for /api/auth/login 429"; exit 1; } + + - name: Matches /api/auth/forgot-password 401 + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/forgot-password"},"status":401}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '1 matched' \ + || { echo "expected 1 match for /api/auth/forgot-password 401"; exit 1; } + - name: Does not match /api/auth/login 200 run: | echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"POST","host":"archiv.raddatz.cloud","uri":"/api/auth/login"},"status":200}' > /tmp/sample.log @@ -148,6 +164,14 @@ jobs: echo "$out" | grep -qE '0 matched' \ || { echo "expected 0 matches for /api/auth/login 200"; exit 1; } + - name: Does not match /api/documents (unrelated 401) + run: | + echo '{"level":"info","ts":1700000000.12,"logger":"http.log.access","msg":"handled request","request":{"remote_ip":"203.0.113.42","method":"GET","host":"archiv.raddatz.cloud","uri":"/api/documents"},"status":401}' > /tmp/sample.log + out=$(fail2ban-regex /tmp/sample.log infra/fail2ban/filter.d/familienarchiv-auth.conf) + echo "$out" + echo "$out" | grep -qE '0 matched' \ + || { echo "expected 0 matches for /api/documents 401"; exit 1; } + # ─── Compose Bucket-Bootstrap Idempotency ───────────────────────────────────── # docker-compose.prod.yml's create-buckets service runs on every # `docker compose up` (one-shot, no restart). Must be idempotent — a diff --git a/infra/fail2ban/filter.d/familienarchiv-auth.conf b/infra/fail2ban/filter.d/familienarchiv-auth.conf index 6f06551f..0f85a798 100644 --- a/infra/fail2ban/filter.d/familienarchiv-auth.conf +++ b/infra/fail2ban/filter.d/familienarchiv-auth.conf @@ -1,5 +1,5 @@ # fail2ban filter for credential-stuffing attempts against the -# Familienarchiv login endpoint. +# Familienarchiv authentication endpoints. # # Parses Caddy JSON access log entries (configured in # infra/caddy/Caddyfile via the (access_log) snippet). 
@@ -12,6 +12,16 @@ # "uri":"/api/auth/login",…}, # "status":401,…} # +# Watched endpoints: +# - /api/auth/login — credential stuffing +# - /api/auth/forgot-password — email enumeration + slow brute-force +# against accounts whose addresses leak +# +# Watched statuses: +# - 401 — bad credentials +# - 429 — server-side rate limit (in case a future in-app limiter +# returns 429 before fail2ban catches the volume) +# # Caddy emits remote_ip *inside* the request object and status at the # top level. The order within the request object is stable # (remote_ip → … → uri) across Caddy 2.7+. Lazy `.*?` keeps the regex @@ -21,7 +31,7 @@ before = common.conf [Definition] -failregex = ^\s*\{.*?"remote_ip":"".*?"uri":"/api/auth/login.*?"status":\s*401\b +failregex = ^\s*\{.*?"remote_ip":"".*?"uri":"/api/auth/(login|forgot-password).*?"status":\s*4(01|29)\b ignoreregex = -- 2.49.1 From f2ec81547ba54eb0a1f859514d89e6666a82ae9e Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:10:59 +0200 Subject: [PATCH 25/39] ci(deploy): add --pull to docker compose build for CVE pickup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without --pull, the host's Docker layer cache wins: if a CVE drops in node:20.19.0-alpine3.21 / postgres:16-alpine and the vendor re-publishes the same tag, the runner keeps serving the cached layer until the cache is manually cleared — a silent supply-chain blind spot. Adding --pull to both `compose build` invocations costs a single re-pull per run and lifts the base-image patch lag from "next host prune" to "next nightly". Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 6 +++++- .gitea/workflows/release.yml | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 118dd54f..cbf10d39 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -70,13 +70,17 @@ jobs: EOF - name: Build images + # `--pull` forces re-fetching pinned base images so a CVE + # re-publication of the same tag (e.g. node:20.19.0-alpine3.21, + # postgres:16-alpine) is picked up instead of being served + # from the host's stale Docker layer cache. run: | docker compose \ -f docker-compose.prod.yml \ -p archiv-staging \ --env-file .env.staging \ --profile staging \ - build + build --pull - name: Deploy staging run: | diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index d4332ba9..9ae74ad6 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -72,12 +72,15 @@ jobs: EOF - name: Build images + # `--pull` forces re-fetching pinned base images so a CVE + # re-publication of the same tag is picked up rather than served + # from the host's stale Docker layer cache. run: | docker compose \ -f docker-compose.prod.yml \ -p archiv-production \ --env-file .env.production \ - build + build --pull - name: Deploy production run: | -- 2.49.1 From fe1451f570d7bedf5899134bad53552a92d0dc31 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:12:05 +0200 Subject: [PATCH 26/39] ci(smoke): pin curl to 127.0.0.1 via --resolve MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The smoke step previously curled the public hostname unconditionally, which routes the runner's request via DNS → router → back into the same host. Many SOHO routers do not implement hairpin NAT (or do so only after a firmware update), so the deploy may pass on day one and silently fail on day 90. 
--resolve ":443:127.0.0.1" pins the hostname to the runner's loopback while keeping SNI on the public name (so the cert validates correctly and the Caddy vhost block matches). The smoke test now verifies that the Caddy-on-the-same-host is serving the right hostname end-to-end, with no router dependency. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 21 ++++++++++++++------- .gitea/workflows/release.yml | 14 +++++++++----- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index cbf10d39..fa343eb4 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -93,15 +93,22 @@ jobs: - name: Smoke test deployed environment # Healthchecks confirm containers are healthy; they do NOT confirm the - # public surface works. This step catches: Caddy not reloaded, DNS - # missing, HSTS header dropped, /actuator block bypassed. + # public surface works. This step catches: Caddy not reloaded, HSTS + # header dropped, /actuator block bypassed. + # + # --resolve pins staging.raddatz.cloud to the runner's loopback so we + # do NOT depend on the host router doing hairpin NAT (many SOHO + # routers do not, or do so only after a firmware update). SNI still + # uses the public hostname so the cert validates correctly. run: | set -e - URL="https://staging.raddatz.cloud" - echo "Smoke test: $URL" - curl -fsS --max-time 10 "$URL/login" -o /dev/null - curl -fsS --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' - status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") + HOST="staging.raddatz.cloud" + URL="https://$HOST" + RESOLVE="--resolve $HOST:443:127.0.0.1" + echo "Smoke test: $URL (pinned to 127.0.0.1)" + curl -fsS $RESOLVE --max-time 10 "$URL/login" -o /dev/null + curl -fsS $RESOLVE --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 9ae74ad6..e1eeca2c 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -92,13 +92,17 @@ jobs: - name: Smoke test deployed environment # See nightly.yml — same three checks, against the prod vhost. + # --resolve pins archiv.raddatz.cloud to the runner's loopback so + # the smoke test does NOT depend on hairpin NAT on the host router. 
run: | set -e - URL="https://archiv.raddatz.cloud" - echo "Smoke test: $URL" - curl -fsS --max-time 10 "$URL/login" -o /dev/null - curl -fsS --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' - status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") + HOST="archiv.raddatz.cloud" + URL="https://$HOST" + RESOLVE="--resolve $HOST:443:127.0.0.1" + echo "Smoke test: $URL (pinned to 127.0.0.1)" + curl -fsS $RESOLVE --max-time 10 "$URL/login" -o /dev/null + curl -fsS $RESOLVE --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" -- 2.49.1 From 33300e4ad9c8ae356c4be562520a36d998cf0f03 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:12:55 +0200 Subject: [PATCH 27/39] chore(infra): drop aspirational Renovate comments from compose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The repo's renovate.json only configures TipTap grouping; Renovate is not currently active against MinIO / mc / mailpit / Postgres / Node / Caddy. The "Renovate keeps it current" comments were aspirational — those tags will rot until Renovate is bootstrapped (tracked in a follow-up issue). The "Pinned mc release; Renovate keeps it current" comment is gone already since the create-buckets entrypoint was extracted to a script in the preceding MinIO-policy commit. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 468beeec..626d44a5 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -56,7 +56,8 @@ services: retries: 5 minio: - # Pinned MinIO release for reproducible deploys; Renovate keeps it current. + # Pinned MinIO release for reproducible deploys. Bumped manually until + # Renovate is bootstrapped for these production images (see follow-up issue). image: minio/minio:RELEASE.2025-02-28T09-55-16Z restart: unless-stopped command: server /data --console-address ":9001" @@ -95,7 +96,7 @@ services: # Dev-only mail catcher; gated behind the staging profile so production # never starts it. Staging workflow runs with `--profile staging`. mailpit: - # Pinned for reproducibility; Renovate bumps the tag. + # Pinned for reproducibility; bumped manually until Renovate is bootstrapped. image: axllent/mailpit:v1.29.7 restart: unless-stopped profiles: ["staging"] -- 2.49.1 From 59bc81d353d5ca1a1645fe33e42b2fb678b7b16f Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:14:58 +0200 Subject: [PATCH 28/39] docs(adr): ADR-009 standalone docker-compose.prod.yml, not overlay Records the decision to make docker-compose.prod.yml a fully self-contained file rather than an overlay over docker-compose.yml. Captures the cost (env-var duplication across dev and prod files) and the benefit (single file the reviewer can hold in their head, no Compose merge-rule surprises, automatic project-name namespacing for cohabiting staging + production on one host). Surfaces the retirement of the earlier overlay narrative in docs/infrastructure/production-compose.md so a future maintainer does not reverse the choice out of ignorance. 
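A quick way to see the project-name namespacing in action (a sketch; the
volume names are the Compose defaults derived from -p plus the volume keys in
docker-compose.prod.yml):

    docker compose -f docker-compose.prod.yml -p archiv-staging --env-file .env.staging --profile staging up -d --wait
    docker compose -f docker-compose.prod.yml -p archiv-production --env-file .env.production up -d --wait
    docker volume ls --format '{{.Name}}' | grep '^archiv-'
    # e.g. archiv-production_postgres-data vs archiv-staging_postgres-data: the two
    # environments never share a volume, network, or container name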
Co-Authored-By: Claude Opus 4.7 --- .../adr/009-standalone-compose-not-overlay.md | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 docs/adr/009-standalone-compose-not-overlay.md diff --git a/docs/adr/009-standalone-compose-not-overlay.md b/docs/adr/009-standalone-compose-not-overlay.md new file mode 100644 index 00000000..e861fe47 --- /dev/null +++ b/docs/adr/009-standalone-compose-not-overlay.md @@ -0,0 +1,50 @@ +# ADR-009: Standalone `docker-compose.prod.yml`, not an overlay + +## Status + +Accepted + +## Context + +The repository's `docker-compose.yml` is a development stack: every service is built locally, ports are exposed on `0.0.0.0` for dev tooling, the frontend runs `npm run dev` with hot-reload, the backend is `spring-boot:run` with the dev profile, and there is no Caddy, no `archiv-app` service account, no admin-credential lock-in, no healthcheck-gated startup sequence. The dev stack reflects "single developer on a laptop", not "production on a single VPS". + +The pre-merge design (issue #497, comment #8331) sketched two ways to add a production stack: + +1. **Overlay** — keep `docker-compose.yml` as the base, add `docker-compose.prod.yml` as a `-f` overlay (`docker compose -f docker-compose.yml -f docker-compose.prod.yml up`). Compose merges the two files at runtime. +2. **Standalone** — make `docker-compose.prod.yml` a fully self-contained file that does not reference or merge with `docker-compose.yml` at all. Project-name namespacing (`-p archiv-production`, `-p archiv-staging`) keeps multi-environment deploys clean on a single host. + +The earlier `docs/infrastructure/production-compose.md` notes assumed overlay because the original plan was to **remove** MinIO in production (replace with Hetzner Object Storage), so the prod file would only need to remove one service and add a few. With MinIO retained (see ADR-010), the prod stack diverges from dev in essentially every service: build vs pre-built image, target stage, port binding, env vars, healthcheck, restart policy, mem_limit, profile gating, service account, depends_on chain. Overlay would mostly be `override:` blocks that nullify the dev defaults — a fragile inversion. + +## Decision + +`docker-compose.prod.yml` is standalone. Production and staging both run it directly: + +``` +production: docker compose -f docker-compose.prod.yml -p archiv-production --env-file .env.production ... +staging: docker compose -f docker-compose.prod.yml -p archiv-staging --env-file .env.staging --profile staging ... +``` + +Environment isolation is achieved via the Docker Compose project name (`-p`). Volumes, networks, and containers are namespaced by the project name, so production and staging cohabit cleanly on the same host without interfering. + +The dev `docker-compose.yml` is unchanged — `docker compose up` still works for developers, and its `frontend` service now specifies `target: development` explicitly so the new multi-stage Dockerfile builds the right stage. + +## Alternatives Considered + +| Alternative | Why rejected | +|---|---| +| Overlay (`-f base.yml -f prod.yml`) | With MinIO retained and most services differing across nearly every field, the overlay would consist mostly of `override:` blocks that null out dev defaults. Compose's merge semantics for nested keys (env, ports, healthcheck) are sharp — silent merges of port mappings, env-var entries, and depends_on edges cost reviewer hours. Standalone is one file the reader can hold in their head. 
| +| Two fully separate files (dev + prod) but with shared YAML anchors via `extends:` | `extends:` works across files but is a niche feature and is increasingly discouraged in compose v2. Reviewer load is higher than reading two flat files. | +| Generate prod compose from a template at deploy time (e.g. ytt, kustomize) | Adds a build-time step and a new tool to the operator toolchain. Justified for a fleet of 10+ environments; overkill for production + staging on one host. | +| Single compose file with environment-specific profiles | Compose profiles select which *services* run, not which *configuration* a service runs with. Using profiles to swap "build locally" vs "pull image" would smear dev and prod across one file. | + +## Consequences + +- The prod file can be read top-to-bottom without cross-referencing `docker-compose.yml`. Onboarding and review cost drops. +- Volume namespacing is automatic (`archiv-production_postgres-data`, `archiv-staging_postgres-data`) — no manual `volumes:` aliasing. +- Dev compose churn (e.g. swapping a dev port) cannot accidentally affect production. The two files are independent. +- The cost is duplication: identical environment variables (e.g. `POSTGRES_DB: archiv`) appear in both files. This duplication is bounded — there is no incentive to add more services that exist in both — and the alternative (overlay) carries its own duplication via `override:` boilerplate. +- The retired `docs/infrastructure/production-compose.md` narrative is trimmed to a pointer at the live files. The cost/sizing rationale is preserved there. + +## Future Direction + +If the deployment fleet ever grows beyond two environments on one host (e.g. add a `demo` environment, or shard staging across two VPS for load testing), revisit the templating decision. At three+ environments the duplication starts to bite and a template engine (kustomize or ytt) becomes attractive. -- 2.49.1 From b57afb9ad234162f8d216ce0532f6d0de2ecbbe2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:15:38 +0200 Subject: [PATCH 29/39] docs(adr): ADR-010 MinIO stays self-hosted, Hetzner OBS deferred Records the reversal of the earlier "migrate to Hetzner Object Storage" direction in docs/infrastructure/production-compose.md. Documents the cost/benefit (current 13 GB fits trivially on the VPS; OBS billing is dominated by base fee at this size; migration is a three-env-var swap plus `mc mirror`, no application rewrite cost). Captures the four triggers that should re-open the decision (50 GB threshold, healthcheck latency, VPS upgrade cost, backup runtime) so the deferral does not become an indefinite punt. Co-Authored-By: Claude Opus 4.7 --- .../010-minio-self-hosted-not-hetzner-obs.md | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 docs/adr/010-minio-self-hosted-not-hetzner-obs.md diff --git a/docs/adr/010-minio-self-hosted-not-hetzner-obs.md b/docs/adr/010-minio-self-hosted-not-hetzner-obs.md new file mode 100644 index 00000000..4f84e30d --- /dev/null +++ b/docs/adr/010-minio-self-hosted-not-hetzner-obs.md @@ -0,0 +1,53 @@ +# ADR-010: MinIO stays self-hosted on the production VPS + +## Status + +Accepted + +## Context + +`docs/infrastructure/production-compose.md` (pre-this-PR) sketched a production topology in which the application bucket migrates from in-cluster MinIO to Hetzner Object Storage (OBS, S3-compatible). The motivation was operational: one less service to back up, no MinIO RAM/disk pressure on the VPS, hand off durability to the hyperscaler. 
+ +Two facts revisited at pre-merge review (issue #497, comment #8331) changed the answer: + +1. **Current data size is small.** The archive is ~13 GB of file uploads (Kurrent letters, scanned ODS files, attachment PDFs). Hetzner OBS billing on this size is dominated by the per-month base fee (~5 EUR/mo for the smallest unit), not capacity or egress. The break-even point against the VPS's existing disk is far above the current footprint. +2. **MinIO is already production-grade.** The dev stack uses MinIO; the backend already drives it via the AWS SDK v2 with a generic `S3_ENDPOINT`. Switching providers is a runtime env-var change (`S3_ENDPOINT`, `S3_ACCESS_KEY`, `S3_SECRET_KEY`) plus an `mc mirror` to copy objects. There is no application-level rewrite cost waiting. + +If Hetzner OBS were a one-way-door (provider-specific SDK, complex IAM integration, multi-month migration), the decision would deserve a serious weighing. As reversible as the migration is, deferring it costs nothing. + +## Decision + +MinIO stays on the production VPS for the first launch. The application bucket is created and managed inside the docker-compose stack (`infra/minio/bootstrap.sh`). The backend uses a least-privilege service account (`archiv-app`) with a bucket-scoped IAM policy, not the MinIO root credentials. + +Hetzner Object Storage is **explicitly deferred**, not rejected. The migration path is documented as a runbook in `docs/DEPLOYMENT.md` (when the trigger fires): provision an OBS bucket, run `mc mirror local-minio:/familienarchiv obs:/familienarchiv`, rotate the three env vars, restart the backend, decommission the MinIO service from `docker-compose.prod.yml`. + +## Triggers to re-evaluate + +Revisit the decision when **any** of the following holds: + +- The `minio-data` volume exceeds 50 GB and is growing > 5 GB/month. +- MinIO healthcheck latency exceeds 200 ms p95 (signal of disk pressure on the host). +- The VPS upgrade required to keep MinIO healthy costs more per month than the equivalent OBS bucket + traffic. +- Backup of the MinIO volume to `heim-nas` over Tailscale (deferred follow-up) is implemented and consistently runs > 30 min nightly. At that point durability-as-a-service starts paying for itself. + +The migration runbook in `docs/DEPLOYMENT.md` is the script for executing the swap when one of the triggers fires. + +## Alternatives Considered + +| Alternative | Why rejected (for now) | +|---|---| +| Migrate to Hetzner Object Storage in this PR | Premature. Adds an external dependency, locks the operator into the Hetzner ecosystem before the data has demonstrated it needs hyperscaler durability, blocks the PR on a migration that buys ~5 GB of headroom. | +| Migrate to S3 (AWS) for HA across regions | Way over-spec for a family archive. Egress cost would dwarf any benefit; durability concerns at this size are addressed by nightly off-site backup, not by multi-region replication. | +| Drop S3 abstraction entirely; store files directly on the VPS disk | Possible, but loses the bucket-policy IAM surface (least-privilege service account), loses presigned-URL flow (OCR service downloads files via short-lived URLs, not via shared filesystem), loses the migration path to OBS. The S3 indirection is cheap insurance. | +| Self-hosted on-VPS plus periodic `mc mirror` to Hetzner OBS for off-site backup | This is the **target** for the backup pipeline follow-up. Treated as backup, not primary — primary stays MinIO. 
| + +## Consequences + +- The production VPS sizing (Hetzner CX42, 16 GB RAM, 80 GB disk) must accommodate MinIO's working set. Current footprint leaves ample headroom. +- Backup of MinIO data is the operator's responsibility until the off-site `mc mirror` pipeline is implemented (deferred follow-up). The DEPLOYMENT.md rollback procedure explicitly flags this — manual backup is the only recovery option until the pipeline ships. +- The backend never sees the MinIO root password; it uses the `archiv-app` service account with a bucket-scoped IAM policy (see `infra/minio/bootstrap.sh`). A backend RCE/SSRF cannot escalate beyond the `familienarchiv` bucket. +- The migration to Hetzner OBS remains a small, well-understood runbook step rather than a major refactor. No application code, no SDK swap. + +## Future Direction + +When one of the triggers above fires, the migration is: provision OBS bucket → `mc mirror` → rotate three env vars → restart backend → remove MinIO service from compose. The bucket-scoped policy translates 1:1 to an OBS user policy (S3-compatible). -- 2.49.1 From 6a6a1c4353572d471f0961f2bdefbcb436da5f0e Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:16:20 +0200 Subject: [PATCH 30/39] docs(adr): ADR-011 single-tenant Gitea runner with on-disk env-files Records the operational assumption that nightly.yml and release.yml bake in: the self-hosted runner is single-tenant, so writing secrets to .env.staging / .env.production on disk and removing them via an `if: always()` cleanup step is acceptable for v1. Documents the three migration triggers (second repo on the runner, untrusted PR execution, move to shared infrastructure) and the one-step migration path (--env-file <(printf '%s' "$SECRET_BLOB")) so the next operator does not silently break the trust assumption. The in-comment notes at the top of both workflow files already point at this ADR's content; this commit records the decision in the durable location the doc-currency table demands. Co-Authored-By: Claude Opus 4.7 --- docs/adr/011-single-tenant-gitea-runner.md | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 docs/adr/011-single-tenant-gitea-runner.md diff --git a/docs/adr/011-single-tenant-gitea-runner.md b/docs/adr/011-single-tenant-gitea-runner.md new file mode 100644 index 00000000..d5250592 --- /dev/null +++ b/docs/adr/011-single-tenant-gitea-runner.md @@ -0,0 +1,58 @@ +# ADR-011: Single-tenant Gitea runner with secrets-on-disk env-files + +## Status + +Accepted + +## Context + +The deploy workflows (`.gitea/workflows/nightly.yml`, `release.yml`) execute on a self-hosted Gitea Actions runner. The runner has Docker-out-of-Docker access (the host's Docker socket is mounted into the runner), so `docker compose build` produces images on the host daemon and `docker compose up` consumes them directly — no registry hop. + +Two workflow steps shape the security model: + +1. **"Write env file"** — the workflow writes every required secret to `.env.staging` or `.env.production` on the runner's filesystem so that `docker compose --env-file` can consume them. The file lives on disk for the duration of the workflow. +2. **"Cleanup env file"** — the matching `if: always()` step deletes the env file after the workflow ends, regardless of success. + +This shape only works under one operational assumption: **the runner is single-tenant**. 
The runner is owned by the same operator who owns the secrets, no other repositories run jobs on the same runner, and no untrusted code is executed (no public fork PRs trigger workflows). If any of those assumptions stopped holding, the env-file-on-disk approach would become a credential exposure path — a sibling job could read `.env.production`, or a malicious PR could exfiltrate the secrets via a step.
+
+The alternative — `docker compose --env-file <(printf "..." )` (bash process substitution) — is technically supported and would keep secrets out of the on-disk filesystem. It is more secure under a multi-tenant runner but requires bash rather than POSIX sh and is brittle inside YAML (the `printf` step would need to escape every secret value containing newlines, equals signs, or quotes).
+
+## Decision
+
+The runner is treated as single-tenant for the lifetime of the v1 deployment. The workflows write env-files to disk under that assumption and rely on the `if: always()` cleanup step to remove them. The operational assumption is documented in-comment at the top of both workflow files (`nightly.yml`, `release.yml`) so the next operator who considers adding a second repo or accepting public PRs has the trigger surfaced in front of them.
+
+Concretely:
+
+- The Gitea runner only runs jobs for `marcel/familienarchiv`.
+- No public fork PRs trigger the workflows (Gitea defaults to requiring an explicit approval on first-time contributor PRs for the actions to run).
+- Secrets are stored in Gitea repository secrets and injected via `${{ secrets.* }}`. They land in the env-file at workflow start and are removed at workflow end.
+
+## Migration triggers
+
+Switch to the multi-tenant-safe pattern when **any** of the following becomes true:
+
+- A second repository starts using the same runner.
+- A workflow accepts contributions that can run untrusted code (public PRs without manual approval).
+- The runner is moved off the operator's controlled host onto shared infrastructure.
+
+The migration path is one step per workflow: replace the "Write env file" step with `--env-file <(printf '%s' "${{ secrets.STAGING_ENV_BLOB }}")` and store the full env-file as a single Gitea secret. The cleanup step is then unnecessary because the env-file never touches disk.
+
+## Alternatives Considered
+
+| Alternative | Why rejected (for now) |
+|---|---|
+| `--env-file <(printf "...")` via bash process substitution | More secure under multi-tenant. Brittle for multi-line / quoted secret values; harder to debug ("env file not found" with no diff to inspect). Justified once the trigger above fires. |
+| Docker secrets (`docker secret create` + `compose secrets:`) | Designed for Swarm; outside of Swarm, compose secrets read from files anyway, so the on-disk surface is the same. Adds complexity without changing the threat model. |
+| External secret manager (Vault, AWS Secrets Manager) | Adds a third-party dependency to the deploy path. For a family-archive deployment with one operator and one VPS, the cost outweighs the benefit at this scale. |
+| GitHub-hosted ephemeral runners | Would require uploading the prod-deploy artifacts to a registry first, then a deploy step on the VPS connecting back. Inverts the current Docker-out-of-Docker simplicity for marginal security gain. The single-tenant self-hosted runner *is* ephemeral in practice — the secrets are written to a directory the runner controls, then deleted. |
+
+## Consequences
+
+- The runner host's filesystem is in the secret-trust boundary. 
The host is hardened per `docs/DEPLOYMENT.md` (ufw, fail2ban, Tailscale-only SSH). +- An operator who later adds a second repo to the runner without revisiting the workflows would silently break the trust assumption. The in-file comments at the top of `nightly.yml` and `release.yml` are the breadcrumb that surfaces the assumption at change time. +- The `if: always()` cleanup step is load-bearing: removing it (e.g. during a future workflow refactor) leaves credentials on disk between runs. Treat it as a permanent invariant. +- Workflow debuggability stays high: an operator who needs to know what env-file the deploy ran with can SSH onto the host while a workflow is in flight and `cat .env.staging` — useful for first-deploy diagnostics. + +## Future Direction + +When the trigger fires, migrate both workflows in a single PR: replace the "Write env file" step with a single `--env-file <(printf '%s' …)` invocation, drop the cleanup step, and consolidate the per-secret Gitea entries into a single multi-line `STAGING_ENV_BLOB` / `PROD_ENV_BLOB` secret. Single commit, both workflows, no application change. -- 2.49.1 From 03d478840ba7cb3bb6ae513d3ff8ce43ef90a2ee Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:17:12 +0200 Subject: [PATCH 31/39] docs(arch): show Caddy + X-Forwarded-Proto in auth-flow diagram MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the Caddy hop to seq-auth-flow.puml and surfaces the two production-relevant header behaviours: - Caddy terminates TLS and forwards X-Forwarded-Proto: https - Spring Boot trusts this header (server.forward-headers-strategy: native, ForwardedRequestCustomizer at the Jetty layer), so request.getScheme() returns "https" - The Set-Cookie response carries the Secure flag because the observed scheme is https — without forward-headers-strategy this would silently drop to plain http and the cookie would lose Secure Closes the doc-currency gap flagged in the Markus review on PR #499: "Auth flow change → docs/architecture/c4/seq-auth-flow.puml". Co-Authored-By: Claude Opus 4.7 --- docs/architecture/c4/seq-auth-flow.puml | 35 ++++++++++++++++++++----- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/docs/architecture/c4/seq-auth-flow.puml b/docs/architecture/c4/seq-auth-flow.puml index bae4a831..63b9038d 100644 --- a/docs/architecture/c4/seq-auth-flow.puml +++ b/docs/architecture/c4/seq-auth-flow.puml @@ -1,26 +1,49 @@ @startuml -title Authentication Flow +title Authentication Flow (behind Caddy reverse proxy) actor User participant Browser +participant "Caddy (TLS termination)" as Caddy participant "Frontend (SvelteKit)" as Frontend participant "Backend (Spring Boot)" as Backend participant PostgreSQL as DB User -> Browser: Enter email + password -Browser -> Frontend: POST /login (form action) +Browser -> Caddy: HTTPS POST /login (form action) +note right of Caddy + Caddy terminates TLS and forwards + to Frontend over HTTP with: + X-Forwarded-Proto: https + X-Forwarded-For: + X-Forwarded-Host: archiv.raddatz.cloud +end note +Caddy -> Frontend: HTTP POST /login\n+ X-Forwarded-Proto: https Frontend -> Frontend: Base64 encode "email:password" -Frontend -> Backend: GET /api/users/me\nAuthorization: Basic +Frontend -> Backend: GET /api/users/me\nAuthorization: Basic \n+ X-Forwarded-Proto: https +note right of Backend + server.forward-headers-strategy: native + Jetty's ForwardedRequestCustomizer + reads X-Forwarded-Proto so + request.getScheme() returns "https". 
+end note Backend -> Backend: Spring Security parses Basic Auth Backend -> DB: SELECT user WHERE email=? DB --> Backend: AppUser + groups + permissions Backend -> Backend: BCrypt.matches(password, hash) Backend --> Frontend: 200 OK — UserDTO -Frontend -> Browser: Set-Cookie: auth_token=\n(httpOnly, SameSite=strict, maxAge=86400) -Browser -> Frontend: GET / (next request) +Frontend -> Caddy: Set-Cookie: auth_token=\n(httpOnly, **Secure**, SameSite=strict, maxAge=86400) +note right of Frontend + Secure flag is set because the + request scheme observed by the + app is https (forwarded by Caddy). +end note +Caddy -> Browser: HTTPS 200 + Set-Cookie +Browser -> Caddy: HTTPS GET / (next request) +Caddy -> Frontend: HTTP GET / + X-Forwarded-Proto: https Frontend -> Frontend: hooks.server.ts reads auth_token cookie Frontend -> Backend: GET /api/users/me\nAuthorization: Basic Backend --> Frontend: 200 OK — user in event.locals -Frontend --> Browser: Render page with user context +Frontend --> Caddy: rendered page +Caddy --> Browser: HTTPS 200 @enduml -- 2.49.1 From a7a80f8c1616efe229d8fa800b50d4134f17c3fe Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 13:18:11 +0200 Subject: [PATCH 32/39] docs(deployment): route SSE through Caddy in topology mermaid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The top-level deployment diagram lagged the C4 L2 diagram, which correctly notes that SSE notifications are fronted by Caddy. The mermaid showed Browser → Backend direct, which would only be true if the backend port were exposed publicly (it is not — all docker ports bind to 127.0.0.1). Fixes the inconsistency Markus flagged on PR #499: the public surface is Caddy and Caddy only. Co-Authored-By: Claude Opus 4.7 --- docs/DEPLOYMENT.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index e995739c..bd7b7a1a 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -33,14 +33,14 @@ graph TD Backend -->|S3 API :9000| MinIO[(MinIO)] Backend -->|HTTP :8000 internal| OCR["OCR Service\nPython FastAPI"] OCR -->|presigned URL| MinIO - Browser -->|SSE direct| Backend + Caddy -->|SSE proxy_pass| Backend ``` **Key facts:** - Caddy terminates TLS and reverse-proxies to frontend (`:3000`) and backend (`:8080`). The Caddyfile is committed at [`infra/caddy/Caddyfile`](../infra/caddy/Caddyfile) and is installed on the host as `/etc/caddy/Caddyfile` (symlink). - The host binds all docker-published ports to `127.0.0.1` only; Caddy is the sole external entry point. - The OCR service has **no published port** — reachable only on the internal Docker network from the backend. -- SSE notifications go directly backend → browser (not via the SvelteKit SSR layer). +- SSE notifications transit Caddy (browser → Caddy → backend); the backend is never reachable directly from the public internet. The SvelteKit SSR layer is bypassed for SSE, but Caddy is not. - The Caddyfile responds `404` on `/actuator/*` (defense in depth). Internal monitoring scrapes the backend on the docker network, not through Caddy. - Production and staging cohabit on the same host via docker compose project names: `archiv-production` (ports 8080/3000) and `archiv-staging` (ports 8081/3001). 
-- 2.49.1 From 8fcf653cb02d43b4dd2a217f6ef859e93eb05536 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:05:02 +0200 Subject: [PATCH 33/39] ci(smoke): pin HSTS to preload-list-eligible value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the presence-only `grep -qi strict-transport-security` smoke assertion in both nightly.yml and release.yml with a value-pinning regex that requires `max-age=31536000`, `includeSubDomains`, and `preload`. A future Caddyfile edit that drops any of those three parts now fails the deploy smoke step instead of passing silently. Verified locally that the new pattern matches the preload-eligible value and rejects three degraded forms (short max-age, missing includeSubDomains, missing preload). Addresses @sara's round-2 note on PR #499 — "presence check, not value check". Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 6 +++++- .gitea/workflows/release.yml | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index fa343eb4..3353e6f7 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -107,7 +107,11 @@ jobs: RESOLVE="--resolve $HOST:443:127.0.0.1" echo "Smoke test: $URL (pinned to 127.0.0.1)" curl -fsS $RESOLVE --max-time 10 "$URL/login" -o /dev/null - curl -fsS $RESOLVE --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + # Pin the preload-list-eligible HSTS value, not just header presence: + # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must + # fail this check rather than pass it silently. + curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ + | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index e1eeca2c..69e59dd6 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -101,7 +101,11 @@ jobs: RESOLVE="--resolve $HOST:443:127.0.0.1" echo "Smoke test: $URL (pinned to 127.0.0.1)" curl -fsS $RESOLVE --max-time 10 "$URL/login" -o /dev/null - curl -fsS $RESOLVE --max-time 10 -I "$URL/" | grep -qi 'strict-transport-security' + # Pin the preload-list-eligible HSTS value, not just header presence: + # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must + # fail this check rather than pass it silently. + curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ + | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" -- 2.49.1 From 09680557ef68337fd9d1a8daee8c479ce5e40e35 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:06:13 +0200 Subject: [PATCH 34/39] security(caddy): add Permissions-Policy header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `Permissions-Policy: camera=(), microphone=(), geolocation=()` to the shared (security_headers) snippet, so both archiv vhosts and the git vhost deny browser APIs the app does not use. 
Reduces blast radius of an XSS landing in a privileged origin. The deploy smoke steps in nightly.yml and release.yml gain a matching assertion against the canonical header value, so a future Caddyfile edit that drops or loosens the header (e.g. `camera=(self)`) fails the deploy instead of regressing silently. `caddy validate` against caddy:2 passes; both workflow YAMLs parse. Addresses @nora's round-2 suggestion on PR #499 — "lower-impact than CSP but nearly free". Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 5 +++++ .gitea/workflows/release.yml | 5 +++++ infra/caddy/Caddyfile | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 3353e6f7..870a5a99 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -112,6 +112,11 @@ jobs: # fail this check rather than pass it silently. curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' + # Permissions-Policy denies APIs the app does not use (camera, + # microphone, geolocation). A regression that loosens or drops the + # header now fails the smoke step. + curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ + | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)' status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 69e59dd6..3d5c99d1 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -106,6 +106,11 @@ jobs: # fail this check rather than pass it silently. curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload' + # Permissions-Policy denies APIs the app does not use (camera, + # microphone, geolocation). A regression that loosens or drops the + # header now fails the smoke step. + curl -fsS $RESOLVE --max-time 10 -I "$URL/" \ + | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)' status=$(curl -s $RESOLVE -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health") [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; } echo "All smoke checks passed" diff --git a/infra/caddy/Caddyfile b/infra/caddy/Caddyfile index b5dfd345..fc6b02bb 100644 --- a/infra/caddy/Caddyfile +++ b/infra/caddy/Caddyfile @@ -18,6 +18,10 @@ Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" X-Content-Type-Options "nosniff" Referrer-Policy "strict-origin-when-cross-origin" + # Deny browser APIs the app does not use. Reduces blast radius of an + # XSS landing in a privileged origin: a payload cannot silently turn + # on the microphone or read geolocation. 
+ Permissions-Policy "camera=(), microphone=(), geolocation=()" -Server } } -- 2.49.1 From a4f2047bccb607172f65b52c9ad4796e88fda6ea Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:07:16 +0200 Subject: [PATCH 35/39] security(ocr): pin ALLOWED_PDF_HOSTS=minio in prod ocr-service env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production never sources PDFs from localhost or 127.0.0.1 — the OCR service only reads from MinIO over the internal docker network. The Python default (`minio,localhost,127.0.0.1`) was permissive on purpose for local dev, but in production a future change to that default — or a host-env override — would silently broaden the SSRF surface. Pinning the env var explicitly here freezes the allowlist to the one hostname production actually needs. `docker compose config --quiet` and `--profile staging config --quiet` both still pass. Verified the resolved config emits `ALLOWED_PDF_HOSTS: minio`. Addresses @nora's round-2 suggestion on PR #499 — "five characters of YAML, lifetime guarantee". Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 626d44a5..b821ec33 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -128,6 +128,11 @@ services: TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} OCR_CONFIDENCE_THRESHOLD: "0.3" OCR_CONFIDENCE_THRESHOLD_KURRENT: "0.5" + # SSRF allowlist pinned explicitly to the internal MinIO hostname. + # In prod the OCR service only fetches PDFs from MinIO over the + # docker network; localhost/127.0.0.1 are dev-only sources and + # must NOT be reachable here. Do not widen to `*`. + ALLOWED_PDF_HOSTS: "minio" networks: - archive-net healthcheck: -- 2.49.1 From 1873f50f7f85d1f8d60c9ffbc19b464650028581 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:08:23 +0200 Subject: [PATCH 36/39] infra(mailpit): use nc -z healthcheck instead of wget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mailpit service healthcheck previously assumed `wget` ships in the axllent/mailpit image. That's true for v1.29.7 but is not part of the image's contract — a future Alpine slim-down could drop wget and silently disable the healthcheck. Switched to BusyBox `nc -z localhost 8025`, which is a TCP-port open check with no dependency beyond BusyBox itself. Verified inside axllent/mailpit:v1.29.7 that `nc` is present (/usr/bin/nc, BusyBox v1.37.0) and that the proposed command returns 0 against an open port and non-zero against a closed one. Compose still parses with `--profile staging`. Addresses @tobi's round-2 suggestion on PR #499. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index b821ec33..4cd7d9c4 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -103,7 +103,11 @@ services: networks: - archive-net healthcheck: - test: ["CMD-SHELL", "wget -qO- http://localhost:8025/api/v1/info >/dev/null 2>&1 || exit 1"] + # TCP-port open check via BusyBox `nc`. The previous wget-based probe + # introduced a non-obvious binary dependency on the mailpit image; a + # future tag that ships without wget would silently disable the + # healthcheck. `nc` is part of BusyBox in the upstream image. 
+ test: ["CMD-SHELL", "nc -z localhost 8025 || exit 1"] interval: 10s timeout: 5s retries: 5 -- 2.49.1 From 440a191138b6baeeed68cbec1d1ae9ffd345f44e Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:09:12 +0200 Subject: [PATCH 37/39] infra(workflows): annotate env-file cleanup as load-bearing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `if: always()` conditional on the env-file cleanup step in both deploy workflows is what makes the ADR-011 single-tenant runner trust model safe: secrets land on disk before each deploy and are wiped unconditionally afterwards. A future workflow refactor that drops `if: always()` would silently leave plaintext secrets on the runner on any failed deploy. The ADR documents this; the workflow file did not. Adds a prominent inline comment so the next reader of the YAML sees the constraint without having to cross-reference ADR-011. No behaviour change — both workflows still parse. Addresses @nora's round-2 suggestion on PR #499 — "linchpin of the ADR-011 trust model". Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/nightly.yml | 6 ++++++ .gitea/workflows/release.yml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 870a5a99..1bae7b04 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -122,5 +122,11 @@ jobs: echo "All smoke checks passed" - name: Cleanup env file + # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011 + # single-tenant runner trust model. Every secret in .env.staging + # is plain text on the runner filesystem until this step runs. + # If a future refactor drops `if: always()`, a failed deploy + # leaves the env-file behind. Do not remove this conditional + # without first re-evaluating ADR-011. if: always() run: rm -f .env.staging diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 3d5c99d1..96894fed 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -116,5 +116,11 @@ jobs: echo "All smoke checks passed" - name: Cleanup env file + # LOAD-BEARING: `if: always()` is the linchpin of the ADR-011 + # single-tenant runner trust model. Every secret in + # .env.production is plain text on the runner filesystem until + # this step runs. If a future refactor drops `if: always()`, a + # failed deploy leaves the env-file behind. Do not remove this + # conditional without first re-evaluating ADR-011. if: always() run: rm -f .env.production -- 2.49.1 From 9adde3cd890f683ef9f670f3c980cfb00bf89eec Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:10:39 +0200 Subject: [PATCH 38/39] refactor(compose): rename docker network archive-net to archiv-net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docker network was the only `archive-*` identifier in either compose file; everything else (user, db, bucket, service account, project name) uses the `archiv-*` spelling. Reviewers' eyes stuttered on it on the prod compose review (round 2 of PR #499 — Markus and Tobi). Renamed in both prod and dev compose for consistency and updated the single doc reference to the dev-project-prefixed network name. Operational note: applying this change to a running stack will recreate the network on the next `docker compose up`; containers restart, named volumes are unaffected. `docker compose config --quiet` passes for both compose files and for the staging profile. 
Sweep confirms zero `archive-net` references remain in the tree. Co-Authored-By: Claude Opus 4.7 --- docker-compose.prod.yml | 16 ++++++++-------- docker-compose.yml | 16 ++++++++-------- docs/infrastructure/ci-gitea.md | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 4cd7d9c4..b66ace54 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -28,7 +28,7 @@ # APP_MAIL_FROM sender address (e.g. noreply@raddatz.cloud) networks: - archive-net: + archiv-net: driver: bridge volumes: @@ -48,7 +48,7 @@ services: volumes: - postgres-data:/var/lib/postgresql/data networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "pg_isready -U archiv -d archiv"] interval: 10s @@ -67,7 +67,7 @@ services: volumes: - minio-data:/data networks: - - archive-net + - archiv-net healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] interval: 30s @@ -85,7 +85,7 @@ services: minio: condition: service_healthy networks: - - archive-net + - archiv-net environment: MINIO_PASSWORD: ${MINIO_PASSWORD} MINIO_APP_PASSWORD: ${MINIO_APP_PASSWORD} @@ -101,7 +101,7 @@ services: restart: unless-stopped profiles: ["staging"] networks: - - archive-net + - archiv-net healthcheck: # TCP-port open check via BusyBox `nc`. The previous wget-based probe # introduced a non-obvious binary dependency on the mailpit image; a @@ -138,7 +138,7 @@ services: # must NOT be reachable here. Do not widen to `*`. ALLOWED_PDF_HOSTS: "minio" networks: - - archive-net + - archiv-net healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 10s @@ -186,7 +186,7 @@ services: SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-true} SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-true} networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "wget -qO- http://localhost:8080/actuator/health | grep -q UP || exit 1"] interval: 15s @@ -210,7 +210,7 @@ services: API_INTERNAL_URL: http://backend:8080 ORIGIN: https://${APP_DOMAIN} networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "wget -qO- http://localhost:3000/login >/dev/null 2>&1 || exit 1"] interval: 15s diff --git a/docker-compose.yml b/docker-compose.yml index ee850922..952e3074 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,7 +13,7 @@ services: ports: - "${PORT_DB}:5432" networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"] interval: 5s @@ -35,7 +35,7 @@ services: - "${PORT_MINIO_API}:9000" # API Port - "${PORT_MINIO_CONSOLE}:9001" # Web-Oberfläche networks: - - archive-net + - archiv-net healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] interval: 30s @@ -56,7 +56,7 @@ services: exit 0; " networks: - - archive-net + - archiv-net # --- Mail catcher: Mailpit (dev only) --- # Catches all outgoing emails and displays them in a web UI. @@ -69,7 +69,7 @@ services: - "${PORT_MAILPIT_UI:-8025}:8025" # Web UI - "${PORT_MAILPIT_SMTP:-1025}:1025" # SMTP networks: - - archive-net + - archiv-net # --- OCR: Python microservice (Surya + Kraken) --- # Single-node only: OCR training reloads the model in-process after each run. 
@@ -99,7 +99,7 @@ services: OCR_CLAHE_TILE_SIZE: "8" # CLAHE tile grid size (NxN tiles per page) OCR_MAX_CACHED_MODELS: "2" # LRU cache; each model ~500 MB, so 2 = ~1 GB resident networks: - - archive-net + - archiv-net healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 10s @@ -150,7 +150,7 @@ services: ports: - "${PORT_BACKEND}:8080" networks: - - archive-net + - archiv-net healthcheck: test: ["CMD-SHELL", "wget -qO- http://localhost:8080/actuator/health | grep -q UP || exit 1"] interval: 15s @@ -185,10 +185,10 @@ services: ports: - "${PORT_FRONTEND}:5173" networks: - - archive-net + - archiv-net networks: - archive-net: + archiv-net: driver: bridge volumes: diff --git a/docs/infrastructure/ci-gitea.md b/docs/infrastructure/ci-gitea.md index c6180c5e..3f96583e 100644 --- a/docs/infrastructure/ci-gitea.md +++ b/docs/infrastructure/ci-gitea.md @@ -166,7 +166,7 @@ jobs: timeout 30 bash -c \ 'until docker compose -f docker-compose.yml -f docker-compose.ci.yml exec -T db pg_isready -U archive_user; do sleep 2; done' - name: Connect job container to compose network - run: docker network connect familienarchiv_archive-net $(cat /etc/hostname) + run: docker network connect familienarchiv_archiv-net $(cat /etc/hostname) - uses: actions/setup-java@v4 with: java-version: '21' -- 2.49.1 From 4d4d5793bbaeb4805a6d76de3ac8361130aebdf1 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 11 May 2026 14:11:46 +0200 Subject: [PATCH 39/39] docs(glossary): add archiv-app service account entry `archiv-app` is the bucket-scoped MinIO service account introduced in PR #499 alongside the production deploy pipeline. Until now the term only appeared in `infra/minio/bootstrap.sh` and the prod compose file; a reader encountering `S3_ACCESS_KEY: archiv-app` had no single-page reference distinguishing it from the MinIO root account. Adds a new "Infrastructure Terms" section to docs/GLOSSARY.md so the distinction (root account vs. application service account) and the attached `archiv-app-policy` scope live in the canonical glossary location. Cross-links to ADR-010 for the MinIO-stays-self-hosted rationale. Addresses @elicit's round-2 recommendation on PR #499. Co-Authored-By: Claude Opus 4.7 --- docs/GLOSSARY.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/GLOSSARY.md b/docs/GLOSSARY.md index f1c75053..55ffca93 100644 --- a/docs/GLOSSARY.md +++ b/docs/GLOSSARY.md @@ -107,6 +107,13 @@ _See also [Briefwechsel](#briefwechsel-user-facing)._ --- +## Infrastructure Terms + +**archiv-app** — the bucket-scoped MinIO service account the backend uses to read and write the `familienarchiv` bucket. Distinct from the MinIO root account (`archiv`, used only by the bootstrap container for admin operations). Defined and provisioned in [`infra/minio/bootstrap.sh`](../infra/minio/bootstrap.sh) and consumed by the backend as `S3_ACCESS_KEY` in [`docker-compose.prod.yml`](../docker-compose.prod.yml). The attached `archiv-app-policy` grants `s3:GetObject/PutObject/DeleteObject` on `familienarchiv/*` and `s3:ListBucket/GetBucketLocation` on the bucket only — not the built-in `readwrite` policy which would grant `s3:*` on all buckets. +_See also [ADR-010 — MinIO stays self-hosted, not Hetzner OBS](./adr/010-minio-self-hosted-not-hetzner-obs.md)._ + +--- + ## Pending Terms _Terms flagged as potentially ambiguous that have not yet been formally defined here. Add an entry above and remove it from this list when resolved._ -- 2.49.1
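For orientation, a policy document carrying the grants the glossary entry describes might look like the sketch below. This is an assumption-level illustration, not a copy of `infra/minio/bootstrap.sh`: the statement layout and ARN spellings are the standard S3/IAM policy JSON that MinIO accepts, but the canonical policy is whatever the bootstrap script actually provisions.

```json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"],
      "Resource": ["arn:aws:s3:::familienarchiv/*"]
    },
    {
      "Effect": "Allow",
      "Action": ["s3:ListBucket", "s3:GetBucketLocation"],
      "Resource": ["arn:aws:s3:::familienarchiv"]
    }
  ]
}
```

Scoping the `Resource` ARNs to the one bucket, instead of attaching MinIO's built-in `readwrite` policy, is what keeps a compromised backend credential from listing or touching any other bucket on the instance.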