2026-05-17 19:06:47 +02:00
9 changed files with 226 additions and 3 deletions
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -142,8 +142,11 @@ services:
    memswap_limit: ${OCR_MEM_LIMIT:-12g}
    volumes:
      - ocr-models:/app/models
-      - ocr-cache:/root/.cache
+      - ocr-cache:/app/cache  # HuggingFace / ketos cache — prevents re-downloads on recreate (HF_HOME)
    environment:
+      HF_HOME: /app/cache
+      XDG_CACHE_HOME: /app/cache
+      TORCH_HOME: /app/models/torch
      KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
      TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
      OCR_CONFIDENCE_THRESHOLD: "0.3"
@@ -161,6 +164,13 @@ services:
      timeout: 5s
      retries: 12
      start_period: 120s
+    read_only: true
+    tmpfs:
+      - /tmp:size=512m   # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
+    cap_drop:
+      - ALL
+    security_opt:
+      - no-new-privileges:true

  backend:
    image: familienarchiv/backend:${TAG:-nightly}
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -87,8 +87,11 @@ services:
    memswap_limit: 12g
    volumes:
      - ocr_models:/app/models
-      - ocr_cache:/root/.cache  # Hugging Face / ketos model download cache — prevents re-downloads on container recreate
+      - ocr_cache:/app/cache  # HuggingFace / ketos cache — prevents re-downloads on recreate (HF_HOME)
    environment:
+      HF_HOME: /app/cache
+      XDG_CACHE_HOME: /app/cache
+      TORCH_HOME: /app/models/torch
      KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
      TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
      OCR_CONFIDENCE_THRESHOLD: "0.3"
@@ -106,6 +109,13 @@ services:
      timeout: 5s
      retries: 12
      start_period: 120s
+    read_only: true
+    tmpfs:
+      - /tmp:size=512m   # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
+    cap_drop:
+      - ALL
+    security_opt:
+      - no-new-privileges:true

  # --- Backend: Spring Boot ---
  backend:
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -19,6 +19,7 @@ This doc is the Day-1 checklist and operational reference. It links to the canon
 5. [Backup + recovery](#5-backup--recovery)
 6. [Common operational tasks](#6-common-operational-tasks)
 7. [Known limitations](#7-known-limitations)
+8. [Upgrade notes](#8-upgrade-notes)

 ---

@@ -140,6 +141,8 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back
 | `KRAKEN_MODEL_PATH` | Directory containing Kraken HTR models (populated by `download-kraken-models.sh`) | `/app/models/` | — | — |
 | `BLLA_MODEL_PATH` | Kraken baseline layout analysis model path | `/app/models/blla.mlmodel` | — | — |
 | `OCR_MEM_LIMIT` | Container memory cap for ocr-service in `docker-compose.prod.yml`. Set to `6g` on CX32 hosts; leave unset on CX42+ to use the 12g default | `12g` (prod compose default) | — | — |
+| `XDG_CACHE_HOME` | XDG cache base dir — redirects Matplotlib and other XDG-aware libraries away from the read-only `HOME` (`/home/ocr`) to the writable cache volume | `/app/cache` | — | — |
+| `TORCH_HOME` | PyTorch model cache — redirects `~/.cache/torch` to the writable models volume | `/app/models/torch` | — | — |

 ### Observability stack (`docker-compose.observability.yml`)

@@ -554,3 +557,28 @@ bash scripts/download-kraken-models.sh
 | **No multi-region** | Single PostgreSQL + MinIO instance; no replication or failover | Deliberate scope decision |
 | **Max upload size** | 50 MB per file (500 MB per request for multi-file) | Configurable in `application.yaml` (`spring.servlet.multipart`) |
 | **No automated backup** | Phase 5 of Production v1 milestone is not yet implemented | See §5 above |
+
+---
+
+## 8. Upgrade notes
+
+Version-specific one-time steps that must be run before or after upgrading to a given release. Each subsection is safe to skip on a fresh install.
+
+### Upgrading to PR #611 — non-root OCR container
+
+The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). The existing volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup.
+
+**Before starting the updated container stack**, drop the old root-owned volume. The volume name depends on the compose project name:
+
+```bash
+# Dev (docker-compose.yml — project name: familienarchiv)
+docker volume rm familienarchiv_ocr_cache
+
+# Production (docker-compose.prod.yml -p archiv-production)
+docker volume rm archiv-production_ocr-cache
+
+# Staging (docker-compose.prod.yml -p archiv-staging)
+docker volume rm archiv-staging_ocr-cache
+```
+
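+The volume is recreated automatically on `docker compose up`. The OCR service will re-download its model cache on first startup (approximately 1–2 GB, one-time cost). The models volume mounted at `/app/models` is not recreated by this step: if it was also populated as root, change its ownership to UID 1000 before starting the stack, otherwise writes to `TORCH_HOME` (`/app/models/torch`) fail with the same `PermissionError`.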
--- a/docs/adr/019-container-hardening-baseline.md
+++ b/docs/adr/019-container-hardening-baseline.md
@@ -0,0 +1,94 @@
+# ADR-019 — Container hardening baseline: non-root user + read-only filesystem
+
+**Status:** Accepted  
+**Date:** 2026-05-17  
+**PR:** #611
+
+---
+
+## Context
+
+The OCR service ran as `root` inside its container by default. This violated CIS Docker Benchmark §4.1 and CIS §4.6, and meant that any exploit in the OCR pipeline (untrusted PDF content, model deserialization, ZIP handling) could write to or execute anything inside the container without restriction.
+
+The following risks were present before this baseline:
+
+- A path-traversal in the ZIP-based training endpoint could overwrite arbitrary paths on the container filesystem (including Python source files and model files).
+- A compromised dependency running at startup could persist itself to the image layers or model volumes.
+- Misconfigured model downloads could overwrite `/etc/passwd` or similar via path-traversal — possible because root can write everywhere.
+
+---
+
+## Decision
+
+All containers in this project that have no operational need for elevated privileges **must** apply the following hardening baseline:
+
+### 1. Non-root user
+
+Create a dedicated user with a fixed UID and no login shell:
+
+```dockerfile
+RUN useradd --no-create-home --shell /usr/sbin/nologin --uid 1000 <service>
+```
+
+Set `HOME` explicitly to a path owned by this user. Do not rely on `~` expansion for any path resolution in application code.
+
+### 2. Read-only container filesystem
+
+```yaml
+read_only: true
+```
+
+All paths the application writes to at runtime must be explicitly declared as either a named volume or a `tmpfs` mount. This turns any unexpected write attempt into an immediate, visible `OSError` (typically `EROFS`, "Read-only file system") rather than a silent success.
+
+### 3. Per-path write carve-outs
+
+Declare only the paths that are actually written at runtime:
+
+```yaml
+volumes:
+  - <service>_models:/app/models   # persistent model storage
+  - <service>_cache:/app/cache     # HuggingFace / ketos download cache
+tmpfs:
+  - /tmp:size=512m                 # transient scratch space (ZIP extraction etc.)
+```
+
+Do not mount the home directory as a volume unless necessary — use the `HF_HOME`, `XDG_CACHE_HOME` and `TORCH_HOME` env vars to redirect library cache writes to the declared writable paths instead.
+
+### 4. Dropped capabilities and privilege escalation prevention
+
+```yaml
+cap_drop: [ALL]
+security_opt:
+  - no-new-privileges:true
+```
+
+A Python/FastAPI service listening on an unprivileged port (≥ 1024; here 8000) requires no Linux capabilities. Dropping all capabilities and setting `no-new-privileges` prevents the process from regaining privileges through setuid/setgid binaries or file capabilities, even if a dependency ships one.
+
+### 5. Startup root canary
+
+Log a warning during startup if the process is running as root. This catches misconfiguration (e.g., `USER` directive accidentally removed in a future Dockerfile edit) before it becomes a silent vulnerability:
+
+```python
+if os.getuid() == 0:
+    logger.warning("Running as root — CIS Docker §4.1 violation")
+```
+
+---
+
+## Consequences
+
+**Positive:**
+- Any exploit that achieves code execution inside the container is confined: it cannot write outside the declared volumes, cannot acquire new capabilities, and cannot persist to the image filesystem.
+- `PermissionError` on startup is an explicit, diagnosable failure rather than a silent privilege misuse.
+- The startup canary catches accidental regressions in the non-root setup.
+
+**Negative / operational cost:**
+- Every new feature that writes to a new path (e.g., a new model cache directory, a new scratch path) must add a volume or tmpfs mount. The `read_only: true` flag makes this a hard constraint, not a suggestion.
+- Library dependencies that write to `HOME` without respecting `XDG_CACHE_HOME` must be identified and redirected explicitly (see `TORCH_HOME`, `XDG_CACHE_HOME`, `HF_HOME` in `docker-compose.yml`).
+- Existing named volumes written by root (pre-baseline) must be dropped and recreated before upgrading. See [DEPLOYMENT.md §8](../DEPLOYMENT.md#8-upgrade-notes).
+
+---
+
+## Applicability
+
+This baseline applies to the OCR service (PR #611). It should be applied to any new container added to the project unless there is a documented, specific operational reason a capability or writable filesystem is required.
--- a/ocr-service/Dockerfile
+++ b/ocr-service/Dockerfile
@@ -23,8 +23,18 @@ RUN pip install --no-cache-dir -r requirements.txt

 COPY . .

+RUN useradd --no-create-home --shell /usr/sbin/nologin --uid 1000 ocr \
+    && mkdir -p /home/ocr /app/models /app/cache \
+    && chown -R ocr:ocr /app /home/ocr
 RUN chmod +x /app/entrypoint.sh

+ENV HOME=/home/ocr
+ENV HF_HOME=/app/cache
+ENV XDG_CACHE_HOME=/app/cache
+ENV TORCH_HOME=/app/models/torch
+
+USER ocr
+
 EXPOSE 8000

 CMD ["/app/entrypoint.sh"]
--- a/ocr-service/ensure_blla_model.py
+++ b/ocr-service/ensure_blla_model.py
@@ -24,7 +24,7 @@ log = logging.getLogger(__name__)
 BLLA_MODEL_PATH = os.environ.get("BLLA_MODEL_PATH", "/app/models/blla.mlmodel")
 # DOI for "General segmentation model for print and handwriting" — ketos 7 compatible.
 BLLA_MODEL_DOI = "10.5281/zenodo.14602569"
-HTRMOPO_DIR = os.path.expanduser("~/.local/share/htrmopo")
+HTRMOPO_DIR = os.environ.get("HTRMOPO_DIR") or "/app/models/.htrmopo"


 def _model_is_loadable(path: str) -> bool:
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -56,6 +56,8 @@ async def lifespan(app: FastAPI):
    """Load lightweight models at startup. Surya loads lazily on first request."""
    global _models_ready

+    if os.getuid() == 0:
+        logger.warning("Running as root — CIS Docker §4.1 violation")
    logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...")
    kraken_engine.load_models()
    load_spell_checker()
--- a/ocr-service/test_ensure_blla_model.py
+++ b/ocr-service/test_ensure_blla_model.py
@@ -1,10 +1,43 @@
 """Unit tests for ensure_blla_model.main()."""

+import importlib
+import os
 from unittest.mock import MagicMock, call, patch

 import ensure_blla_model


+# ─── HTRMOPO_DIR env var resolution ──────────────────────────────────────────
+
+
+def test_htrmopo_dir_reads_from_env_var():
+    """HTRMOPO_DIR uses the HTRMOPO_DIR env var when set, not ~ expansion."""
+    with patch.dict(os.environ, {"HTRMOPO_DIR": "/custom/htrmopo"}):
+        importlib.reload(ensure_blla_model)  # re-evaluate the module-level constant under the patched env
+        result = ensure_blla_model.HTRMOPO_DIR
+    importlib.reload(ensure_blla_model)  # env is restored here; reload so later tests see the unpatched value
+    assert result == "/custom/htrmopo"
+
+
+def test_htrmopo_dir_default_is_fixed_path():
+    """Default HTRMOPO_DIR is a fixed path not derived from ~ (no-create-home safe)."""
+    clean_env = {k: v for k, v in os.environ.items() if k != "HTRMOPO_DIR"}  # current env minus the override
+    with patch.dict(os.environ, clean_env, clear=True):  # clear=True so an ambient HTRMOPO_DIR cannot leak in
+        importlib.reload(ensure_blla_model)
+        result = ensure_blla_model.HTRMOPO_DIR
+    importlib.reload(ensure_blla_model)  # recompute the constant from the restored environment
+    assert result == "/app/models/.htrmopo"
+
+
+def test_htrmopo_dir_falls_back_to_default_when_set_to_empty_string():
+    """HTRMOPO_DIR='' must fall back to the default path — os.environ.get() returns '' (not None) for blank vars."""
+    with patch.dict(os.environ, {"HTRMOPO_DIR": ""}):
+        importlib.reload(ensure_blla_model)  # re-evaluate the module-level constant with the blank override
+        result = ensure_blla_model.HTRMOPO_DIR
+    importlib.reload(ensure_blla_model)  # recompute the constant from the restored environment
+    assert result == "/app/models/.htrmopo"
+
+
 # ─── Model already loadable ───────────────────────────────────────────────────


--- a/ocr-service/test_main.py
+++ b/ocr-service/test_main.py
@@ -0,0 +1,36 @@
+"""Tests for main.py — startup behavior."""
+
+import logging
+from unittest.mock import patch
+
+import pytest
+from httpx import ASGITransport, AsyncClient
+
+from main import app
+
+
+# ─── Root canary ──────────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_startup_logs_warning_when_running_as_root(caplog):
+    """Lifespan emits a WARNING when the process uid is 0 (running as root)."""
+    with patch("main.os.getuid", return_value=0), \
+         patch("main.kraken_engine.load_models"), \
+         patch("main.load_spell_checker"), \
+         caplog.at_level(logging.WARNING, logger="main"):
+        async with app.router.lifespan_context(app):  # httpx ASGITransport never dispatches lifespan events
+            pass
+    assert "Running as root" in caplog.text
+
+
+@pytest.mark.asyncio
+async def test_startup_does_not_warn_when_running_as_non_root(caplog):
+    """Lifespan does not emit a root warning when running as a non-root user."""
+    with patch("main.os.getuid", return_value=1000), \
+         patch("main.kraken_engine.load_models"), \
+         patch("main.load_spell_checker"), \
+         caplog.at_level(logging.WARNING, logger="main"):
+        async with app.router.lifespan_context(app):  # run startup for real; ASGITransport would skip it
+            pass
+    assert "Running as root" not in caplog.text