From e4df17f3080d983ee8a6c749d255e0119bc9f495 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 10 May 2026 22:00:21 +0200 Subject: [PATCH] docs: retire overlay narrative; add Caddy to C4 L2 diagram - docs/infrastructure/production-compose.md: trimmed to VPS sizing, cost breakdown, and Hetzner ecosystem rationale. The inline compose spec (overlay + Hetzner OBS in prod) is retired; the live file is now docker-compose.prod.yml at the repo root and the Caddyfile lives at infra/caddy/Caddyfile. Observability stack is called out as a not-yet-deployed gap (issue #498). - docs/architecture/c4/l2-containers.puml: adds Caddy as a named reverse-proxy container with the two port paths and notes the archiv-app service-account split on MinIO access. Refs #497. Co-Authored-By: Claude Sonnet 4.6 --- docs/architecture/c4/l2-containers.puml | 20 +- docs/infrastructure/production-compose.md | 270 +++------------------- 2 files changed, 44 insertions(+), 246 deletions(-) diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml index bd187bca..367b7d93 100644 --- a/docs/architecture/c4/l2-containers.puml +++ b/docs/architecture/c4/l2-containers.puml @@ -6,23 +6,27 @@ title Container Diagram: Familienarchiv Person(user, "User", "Admin or family member") System_Ext(mail, "Email Service", "SMTP server. Delivers notification and password-reset emails.") +Container(caddy, "Reverse Proxy", "Caddy 2 (host-installed)", "TLS termination (auto Let's Encrypt). Routes /api/* to backend:8080, everything else to frontend:3000. Responds 404 on /actuator/* and adds HSTS, X-Content-Type-Options, Referrer-Policy headers.") + System_Boundary(archiv, "Familienarchiv (Docker Compose)") { - Container(frontend, "Web Frontend", "SvelteKit / Node.js", "Server-side rendered UI. 
Handles auth session cookies, document search and viewer, transcription editor, annotation layer, family tree (Stammbaum), stories (Geschichten), activity feed (Chronik), enrichment workflow, and admin panel.") - Container(backend, "API Backend", "Spring Boot 4 / Java 21 / Jetty", "REST API. Implements document management, search, user auth, file upload/download, transcription, OCR orchestration, and SSE notifications.") + Container(frontend, "Web Frontend", "SvelteKit / Node adapter / port 3000", "Server-side rendered UI. Handles auth session cookies, document search and viewer, transcription editor, annotation layer, family tree (Stammbaum), stories (Geschichten), activity feed (Chronik), enrichment workflow, and admin panel.") + Container(backend, "API Backend", "Spring Boot 4 / Java 21 / Jetty / port 8080", "REST API. Implements document management, search, user auth, file upload/download, transcription, OCR orchestration, and SSE notifications. Trusts X-Forwarded-* headers from Caddy.") Container(ocr, "OCR Service", "Python FastAPI / port 8000", "Handwritten text recognition (HTR) and OCR microservice. Single-node by design — see ADR-001. Reachable only on the internal Docker network; no external port exposed.") ContainerDb(db, "Relational Database", "PostgreSQL 16", "Stores document metadata, persons, users, permission groups, tags, transcription blocks, audit log, and Spring Session data.") - ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Objects keyed as documents/{UUID}_{filename}.") - Container(mc, "Bucket Init Helper", "MinIO Client (mc)", "One-shot container on startup. Creates the archive bucket with private access policy.") + ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). 
Backend uses a bucket-scoped service account (archiv-app), not MinIO root.") + Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.") } -Rel(user, frontend, "Uses", "HTTPS / Browser") +Rel(user, caddy, "HTTPS", "TLS 1.2/1.3") +Rel(caddy, frontend, "Reverse proxies non-/api requests", "HTTP / loopback:3000") +Rel(caddy, backend, "Reverse proxies /api/*", "HTTP / loopback:8080") Rel(frontend, backend, "API requests with Basic Auth token", "HTTP / REST / JSON") -Rel(backend, user, "SSE notifications (server-sent events)", "HTTP / SSE — direct backend-to-browser") +Rel(backend, user, "SSE notifications (server-sent events)", "HTTP / SSE — fronted by Caddy") Rel(backend, db, "Reads and writes metadata and sessions", "JDBC / SQL") -Rel(backend, storage, "Uploads and streams document files", "HTTP / S3 API (AWS SDK v2)") +Rel(backend, storage, "Uploads and streams document files using archiv-app service account", "HTTP / S3 API (AWS SDK v2)") Rel(backend, ocr, "OCR job requests with presigned MinIO URL", "HTTP / REST / JSON") Rel(backend, mail, "Sends notification and password-reset emails (optional)", "SMTP") Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned") -Rel(mc, storage, "Creates bucket on startup", "MinIO Client CLI") +Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI") @enduml diff --git a/docs/infrastructure/production-compose.md b/docs/infrastructure/production-compose.md index a0f06df9..75b513c3 100644 --- a/docs/infrastructure/production-compose.md +++ b/docs/infrastructure/production-compose.md @@ -1,214 +1,22 @@ # Production Docker Compose & Infrastructure -This document contains the full production Docker Compose file, Caddyfile, VPS sizing recommendations, cost breakdown, and Hetzner ecosystem overview. 
+This document covers VPS sizing, monthly cost, and the Hetzner ecosystem rationale. The compose file and Caddyfile that previously lived inline in this doc are now committed to the repository as standalone files (paths below). + +> **Where to find the live files (after #497)** +> - Production compose: [`docker-compose.prod.yml`](../../docker-compose.prod.yml) (standalone, not an overlay) +> - Caddyfile: [`infra/caddy/Caddyfile`](../../infra/caddy/Caddyfile) +> - Deploy workflows: [`.gitea/workflows/nightly.yml`](../../.gitea/workflows/nightly.yml) and [`.gitea/workflows/release.yml`](../../.gitea/workflows/release.yml) +> - Bootstrap checklist, secrets, rollback procedure: [`docs/DEPLOYMENT.md`](../DEPLOYMENT.md) + +The original spec in this doc proposed an overlay pattern (`docker compose -f docker-compose.yml -f docker-compose.prod.yml`) with MinIO disabled in production in favour of Hetzner Object Storage. That approach was retired in #497 in favour of a standalone prod compose that keeps MinIO self-hosted on the VPS. The Hetzner OBS migration is tracked as a future follow-up; the swap is three env vars + `mc mirror` once we decide to do it. --- -## Full docker-compose.prod.yml +## Observability stack — not yet deployed -Usage: `docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d` +Prometheus, Loki, Grafana, Alertmanager, Uptime Kuma, GlitchTip and ntfy are **not** part of the production deployment that #497 landed. They are tracked as follow-up issue #498. 
-```yaml -# docker-compose.prod.yml -# Usage: docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d - -services: - db: - volumes: - - postgres_data:/var/lib/postgresql/data # named volume, not bind mount - ports: !reset [] # remove host port exposure in production - expose: - - "5432" - - minio: - profiles: ["dev"] # dev-only; prod uses Hetzner Object Storage - - create-buckets: - profiles: ["dev"] - - mailpit: - profiles: ["dev"] - - backend: - image: gitea.example.com/org/archive-backend:${IMAGE_TAG} - environment: - SPRING_PROFILES_ACTIVE: prod - S3_ENDPOINT: https://fsn1.your-objectstorage.com - MAIL_HOST: ${MAIL_HOST} - MAIL_PORT: 587 - SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: "true" - SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: "true" - ports: !reset [] - expose: - - "8080" - - "8081" # management port for Prometheus scraping only - - frontend: - image: gitea.example.com/org/archive-frontend:${IMAGE_TAG} - ports: !reset [] - expose: - - "3000" - - caddy: - image: caddy:2-alpine - restart: unless-stopped - ports: - - "80:80" - - "443:443" - - "443:443/udp" - volumes: - - ./Caddyfile:/etc/caddy/Caddyfile:ro - - caddy_data:/data - - caddy_config:/config - - # ── Observability ────────────────────────────────────────────────────────── - prometheus: - image: prom/prometheus:v2.51.0 # pinned - restart: unless-stopped - volumes: - - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml:ro - - prometheus_data:/prometheus - expose: ["9090"] - - grafana: - image: grafana/grafana:10.4.0 # pinned - restart: unless-stopped - environment: - GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD} - GF_PATHS_PROVISIONING: /etc/grafana/provisioning - GF_SERVER_ROOT_URL: https://grafana.example.com - volumes: - - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro - - grafana_data:/var/lib/grafana - expose: ["3000"] - - loki: - image: grafana/loki:2.9.0 # pinned - restart: unless-stopped - volumes: - - 
./observability/loki-config.yml:/etc/loki/config.yml:ro - - loki_data:/loki - expose: ["3100"] - - promtail: - image: grafana/promtail:2.9.0 # pinned - restart: unless-stopped - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - - ./observability/promtail-config.yml:/etc/promtail/config.yml:ro - - alertmanager: - image: prom/alertmanager:v0.27.0 # pinned - restart: unless-stopped - volumes: - - ./observability/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro - expose: ["9093"] - - # ── Uptime monitoring ────────────────────────────────────────────────────── - uptime-kuma: - image: louislam/uptime-kuma:1 - restart: unless-stopped - volumes: - - uptime_kuma_data:/app/data - expose: ["3001"] - - # ── Error tracking ───────────────────────────────────────────────────────── - glitchtip-web: - image: glitchtip/glitchtip:latest - restart: unless-stopped - depends_on: [db] - environment: - DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@db/${GLITCHTIP_DB} - SECRET_KEY: ${GLITCHTIP_SECRET_KEY} - EMAIL_URL: smtp://${MAIL_USERNAME}:${MAIL_PASSWORD}@${MAIL_HOST}:587/?tls=true - GLITCHTIP_DOMAIN: https://errors.example.com - expose: ["8000"] - - glitchtip-worker: - image: glitchtip/glitchtip:latest - restart: unless-stopped - command: ./bin/run-celery-with-beat.sh - depends_on: [glitchtip-web] - environment: - DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@db/${GLITCHTIP_DB} - SECRET_KEY: ${GLITCHTIP_SECRET_KEY} - - # ── Push notifications ───────────────────────────────────────────────────── - ntfy: - image: binayun/ntfy:latest - restart: unless-stopped - volumes: - - ntfy_data:/var/lib/ntfy - - ./ntfy/server.yml:/etc/ntfy/server.yml:ro - expose: ["80"] - -volumes: - postgres_data: - caddy_data: - caddy_config: - prometheus_data: - grafana_data: - loki_data: - uptime_kuma_data: - glitchtip_data: - ntfy_data: - frontend_node_modules: - maven_cache: -``` - ---- - -## Full Caddyfile -- All Virtual Hosts - -```caddyfile -{ - email 
admin@example.com -} - -# Main application -app.example.com { - header { - Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" - X-Content-Type-Options "nosniff" - X-Frame-Options "DENY" - Referrer-Policy "strict-origin-when-cross-origin" - -Server - } - @api path /api/* - reverse_proxy @api backend:8080 - @actuator path /actuator/* - respond @actuator 404 - reverse_proxy frontend:3000 -} - -# Gitea — source code and CI -git.example.com { - reverse_proxy gitea:3000 -} - -# Grafana — observability -grafana.example.com { - basicauth { - admin $2a$14$... - } - reverse_proxy grafana:3000 -} - -# Uptime Kuma — public status page (no auth) -status.example.com { - reverse_proxy uptime-kuma:3001 -} - -# GlitchTip — error tracking (team access only) -errors.example.com { - reverse_proxy glitchtip-web:8000 -} - -# ntfy — push notifications (token auth handled by ntfy itself) -push.example.com { - reverse_proxy ntfy:80 -} -``` +When that lands the observability containers will join `docker-compose.prod.yml` under a dedicated profile so they can be operated alongside the application stack without affecting the application containers' restart cycle. --- @@ -216,61 +24,47 @@ push.example.com { ### Recommended: Hetzner CX32 -**Specs**: 4 vCPU, 8 GB RAM, 80 GB SSD -**Cost**: 17 EUR/mo +**Specs**: 4 vCPU, 8 GB RAM, 80 GB SSD · **Cost**: 17 EUR/mo -This runs comfortably: -- SvelteKit (Node) -- Spring Boot (JVM -- needs ~512 MB minimum) -- PostgreSQL 16 -- Caddy -- Prometheus + Grafana + Loki + Alertmanager (~2 GB) -- Gitea + Gitea runner -- Uptime Kuma -- GlitchTip + worker -- ntfy +Sufficient today for the application stack (Postgres, MinIO, OCR with `mem_limit: 12g`, backend, frontend, Caddy). Note: that OCR limit exceeds the CX32's 8 GB of RAM, so it cannot act as an effective cap on this tier — TODO confirm the intended value. Once the observability stack lands (Prometheus/Loki/Grafana/Alertmanager add ~2 GB) consider a CX42. 
### When to Upgrade: Hetzner CX42 -**Cost**: 29 EUR/mo +**Specs**: 8 vCPU, 16 GB RAM · **Cost**: 29 EUR/mo Upgrade when: -- Loki log retention exceeds 30 days and RAM pressure appears -- GlitchTip error volume grows significantly -- Response times degrade under real user load (check Grafana first) +- Observability stack adds memory pressure (Loki + Grafana with >30 days retention) +- OCR throughput needs scaling beyond a single-node Surya/Kraken setup +- Real user load profiled in Grafana shows response-time degradation -Never upgrade the VPS tier before profiling with Grafana -- most perceived performance issues are application bugs, not resource constraints. +Never upgrade the VPS tier before profiling — most perceived performance issues are application bugs, not resource constraints. --- -## Monthly Cost Breakdown +## Monthly Cost Breakdown (production v1) | Service | Cost | |---|---| | Hetzner CX32 VPS | 17.00 EUR | -| Hetzner Object Storage (~200 GB) | 5.00 EUR | -| Hetzner SMTP relay | ~1.00 EUR | | Hetzner DNS | 0.00 EUR | -| **Total** | **~23 EUR/mo** | +| Hetzner SMTP relay | ~1.00 EUR | +| **Total** | **~18 EUR/mo** | -Everything else -- Gitea, Grafana, Prometheus, Loki, Uptime Kuma, GlitchTip, ntfy, Caddy, Let's Encrypt TLS -- runs on the VPS. Zero additional cost. +MinIO data lives on the VPS disk (no Object Storage line item yet). The Hetzner OBS migration would add ~5 EUR/mo at ~200 GB. -Equivalent SaaS stack: 200-300 EUR/mo. +Equivalent SaaS stack: 200–300 EUR/mo. --- -## Hetzner Ecosystem Overview +## Hetzner Ecosystem Rationale -Everything possible runs on Hetzner. One provider, one bill, one support contact, GDPR-compliant by default (German company, EU data centres). +Everything possible runs on Hetzner. One provider, one bill, GDPR-compliant by default (German company, EU data centres). 
-### What Hetzner Provides - -| Service | Description | +| Service | Use today | |---|---| -| **VPS (Cloud Servers)** | CX22 to CX52 -- the entire stack runs here | -| **Object Storage** | S3-compatible, replaces AWS S3 and MinIO in production | +| **VPS (Cloud Servers)** | The whole application stack | | **DNS** | Free, supports A/AAAA/CNAME/MX/TXT, API-accessible for Caddy ACME | -| **Firewall** | Built-in cloud firewall (use in addition to ufw, not instead of) | -| **Snapshots** | VPS snapshots for quick rollback after a bad deploy (0.013 EUR/GB/mo) | -| **Volumes** | Attachable block storage if the VPS disk fills up (0.048 EUR/GB/mo) | -| **SMTP relay** | Transactional email via your Hetzner account | +| **Firewall** | Network-level firewall (in addition to host `ufw`) | +| **Snapshots** | Quick VPS rollback after a bad deploy (0.013 EUR/GB/mo) | +| **SMTP relay** | Transactional email from `noreply@raddatz.cloud` | +| **Object Storage** | Not used today — MinIO stays on-VPS. Available when we decide to migrate |