2026-05-18 11:32:37 +02:00
12 changed files with 319 additions and 13 deletions
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -148,7 +148,10 @@ jobs:
          path: frontend/test-results/screenshots/

  # ─── OCR Service Unit Tests ───────────────────────────────────────────────────
-  # Only spell_check.py, test_confidence.py, test_sender_registry.py — no ML stack required.
+  # Only stdlib/lightweight tests — no ML stack (PyTorch/Surya/Kraken) required.
+  # test_tmpdir.py covers the TMPDIR env var and entrypoint mkdir behaviour (ADR-021).
+  # test_tmpdir_is_inside_persistent_cache_volume is skipped in CI (TMPDIR not
+  # set to /app/cache here); it runs inside the deployed Docker container.
  ocr-tests:
    name: OCR Service Tests
    runs-on: ubuntu-latest
@@ -160,11 +163,11 @@ jobs:
          python-version: '3.11'

      - name: Install test dependencies
-        run: pip install "pyspellchecker==0.9.0" pytest pytest-asyncio
+        run: pip install "pyspellchecker==0.9.0" "fastapi==0.115.6" pytest pytest-asyncio
        working-directory: ocr-service

      - name: Run OCR unit tests (no ML stack required)
-        run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py -v
+        run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py test_tmpdir.py -v
        working-directory: ocr-service

  # ─── Backend Unit & Slice Tests ───────────────────────────────────────────────
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -128,6 +128,23 @@ services:
      timeout: 5s
      retries: 5

+  # --- OCR: Volume bootstrap ---
+  # Ensures correct ownership and directory structure on ocr-cache / ocr-models
+  # before ocr-service starts. Handles pre-existing volumes (including those
+  # created before the non-root ocr user was introduced in commit 1aca4c4a)
+  # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
+  ocr-volume-init:
+    image: alpine:3.21
+    command:
+      - sh
+      - -c
+      - "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
+    volumes:
+      - ocr-models:/app/models
+      - ocr-cache:/app/cache
+    networks: []
+    restart: "no"
+
  ocr-service:
    build:
      context: ./ocr-service
@@ -147,6 +164,9 @@ services:
      HF_HOME: /app/cache
      XDG_CACHE_HOME: /app/cache
      TORCH_HOME: /app/models/torch
+      TMPDIR: /app/cache/.tmp       # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
+                                    # /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
+                                    # but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
      KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
      TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
      OCR_CONFIDENCE_THRESHOLD: "0.3"
@@ -164,9 +184,13 @@ services:
      timeout: 5s
      retries: 12
      start_period: 120s
+    depends_on:
+      ocr-volume-init:
+        condition: service_completed_successfully
    read_only: true
    tmpfs:
-      - /tmp:size=512m   # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
+      - /tmp:size=512m   # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
+                         # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
    cap_drop:
      - ALL
    security_opt:
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -71,6 +71,23 @@ services:
    networks:
      - archiv-net

+  # --- OCR: Volume bootstrap ---
+  # Ensures correct ownership and directory structure on ocr_cache / ocr_models
+  # before ocr-service starts. Handles pre-existing volumes (including those
+  # created before the non-root ocr user was introduced in commit 1aca4c4a)
+  # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
+  ocr-volume-init:
+    image: alpine:3.21
+    command:
+      - sh
+      - -c
+      - "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
+    volumes:
+      - ocr_models:/app/models
+      - ocr_cache:/app/cache
+    networks: []
+    restart: "no"
+
  # --- OCR: Python microservice (Surya + Kraken) ---
  # Single-node only: OCR training reloads the model in-process after each run.
  # Running multiple replicas would cause training conflicts and model-state divergence.
@@ -92,6 +109,9 @@ services:
      HF_HOME: /app/cache
      XDG_CACHE_HOME: /app/cache
      TORCH_HOME: /app/models/torch
+      TMPDIR: /app/cache/.tmp       # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
+                                    # /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
+                                    # but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
      KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
      TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
      OCR_CONFIDENCE_THRESHOLD: "0.3"
@@ -109,9 +129,13 @@ services:
      timeout: 5s
      retries: 12
      start_period: 120s
+    depends_on:
+      ocr-volume-init:
+        condition: service_completed_successfully
    read_only: true
    tmpfs:
-      - /tmp:size=512m   # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
+      - /tmp:size=512m   # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
+                         # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
    cap_drop:
      - ALL
    security_opt:
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -564,6 +564,22 @@ bash scripts/download-kraken-models.sh

 Version-specific one-time steps that must be run before or after upgrading to a given release. Each subsection is safe to skip on a fresh install.

+### Upgrading to PR #615 — TMPDIR redirect + ocr-volume-init
+
+`ocr-volume-init` is a new one-shot service in both compose files that runs before `ocr-service` on every `docker compose up`. It:
+
+1. `chown -R 1000:1000 /app/cache /app/models` — corrects volume ownership so the non-root `ocr` user (uid 1000) can write to volumes that may have been created as root (including volumes from before PR #611).
+2. `mkdir -p /app/cache/.tmp` — creates the TMPDIR staging directory that Surya uses for GB-scale model downloads. Without this directory, the first model download falls back to the 512 MB `/tmp` tmpfs and fails with ENOSPC. See ADR-021.
+
+**Verify it succeeded:**
+```bash
+docker logs archiv-ocr-volume-init   # dev
+docker logs archiv-production-ocr-volume-init-1   # prod
+```
+Expected output: no error lines; exit code 0.
+
+**Failure mode:** if `chown` is denied (e.g. the volume is mounted read-only), the container exits non-zero and `ocr-service` will not start (`depends_on: condition: service_completed_successfully`). Check `docker logs` for the `chown` error and verify the volume is writable.
+
 ### Upgrading to PR #611 — non-root OCR container

 The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). The existing volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup.
--- a/docs/adr/021-tmpdir-persistent-volume-staging.md
+++ b/docs/adr/021-tmpdir-persistent-volume-staging.md
@@ -0,0 +1,68 @@
+# ADR-021 — Route Surya model-download staging to the persistent cache volume via TMPDIR
+
+**Status:** Accepted  
+**Date:** 2026-05-18  
+**Issue:** #614
+
+---
+
+## Context
+
+After the container hardening baseline (ADR-019), the OCR service runs with `read_only: true` and a 512 MB `/tmp` tmpfs. The tmpfs was sized for training-ZIP extraction (typically 20–50 images, well under 100 MB).
+
+Surya's `download_directory()` (surya ≥ 0.6, `surya/common/s3.py`) stages every model file through `tempfile.TemporaryDirectory()` before moving it to the final cache location. `TemporaryDirectory()` honours `$TMPDIR` and falls back to `/tmp`. The `text_recognition` model is 1.34 GB; future Surya models will be in the same range. This blows the 512 MB budget at ~510 MB with `OSError: [Errno 28] No space left on device`.
+
+The host has 1.8 TB free on the disk that backs `/app/cache`. The failure is a routing problem, not a capacity problem.
+
+---
+
+## Decision
+
+Set `TMPDIR=/app/cache/.tmp` in the OCR container so all `tempfile` staging goes to the persistent SSD-backed cache volume.
+
+```yaml
+# docker-compose.yml / docker-compose.prod.yml — ocr-service.environment
+TMPDIR: /app/cache/.tmp
+```
+
+```dockerfile
+# ocr-service/Dockerfile — default for bare docker-run usage
+ENV TMPDIR=/app/cache/.tmp
+```
+
+```bash
+# ocr-service/entrypoint.sh — idempotent directory bootstrap
+mkdir -p "${TMPDIR:-/tmp}"
+find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true
+```
+
+A one-shot `ocr-volume-init` service in both compose files runs before `ocr-service` to `chown -R 1000:1000` the volumes and `mkdir -p /app/cache/.tmp`. This replaces the manual `docker run --rm alpine chown` step performed on 2026-05-18 and makes fresh-volume correctness a permanent infrastructure-as-code guarantee.
+
+The `/tmp` tmpfs remains at 512 MB and continues to serve training-ZIP extraction and transient PDF buffers — its original purpose.
+
+---
+
+## Consequences
+
+**Positive**
+
+- Surya model downloads complete: 1.34 GB fits on the SSD, not in 512 MB of RAM.
+- `shutil.move()` from staging → cache becomes a same-filesystem `rename(2)` — atomic and near-free.
+- Volume ownership is now automated; no manual `docker run --rm alpine chown` on redeploy.
+- `/tmp` retains its small 512 MB DoS cap for attacker-influenceable training endpoints (post-auth only, behind `X-Training-Token`).
+- ZIP Slip protection in `_validate_zip_entry()` is unaffected — it uses `os.path.realpath()` anchored to the extraction directory regardless of where that directory lives.
+
+**Negative / Trade-offs**
+
+- If the container is `docker kill`ed mid-download, partial files persist in `/app/cache/.tmp` across container restarts. Mitigated by the `find -mtime +1 -delete` in `entrypoint.sh` — orphans older than one day are removed on startup.
+- `TMPDIR` pointing inside a volume mount is non-obvious. Any future move of `/app/cache` to a different storage tier must revisit this setting. This ADR is the load-bearing reference.
+
+---
+
+## Alternatives considered
+
+**Approach B — Enlarge `/tmp` to 4 GB**  
+One-line change. Discarded because: (1) 4 GB tmpfs counts against the cgroup `mem_limit`; on CX32 hosts with `OCR_MEM_LIMIT=6g` the combined Surya resident set + tmpfs would trigger OOMKill on cold start; (2) staging GB-scale model files through RAM is using the wrong storage tier; (3) any future model larger than 4 GB requires another bump.
+
+**Approach C — Both TMPDIR redirect and enlarged /tmp**  
+Belt-and-suspenders: Approach A + 1 GB tmpfs. Discarded in favour of the cleaner Approach A. The defence-in-depth benefit does not outweigh the extra compose churn; the 512 MB cap on `/tmp` is intentional.
--- a/ocr-service/CLAUDE.md
+++ b/ocr-service/CLAUDE.md
@@ -5,3 +5,5 @@
 **LLM reminder:** the OCR service is a **single-node container** — training reloads the model in-process, so multiple replicas cause model-state divergence (see ADR-001). All job tracking and business logic stay in Spring Boot; the Python service is stateless OCR only.

 **LLM reminder:** `ALLOWED_PDF_HOSTS` must never be set to `*` — that opens SSRF. The default (`minio,localhost,127.0.0.1`) is correct for dev.
+
+**LLM reminder:** `TMPDIR` points to `/app/cache/.tmp` (persistent SSD volume). Never redirect it back to `/tmp` or any RAM-backed path — `/tmp` is 512 MB and cannot stage GB-scale Surya model downloads (causes ENOSPC). The `ocr-volume-init` container creates the directory on fresh volumes; `entrypoint.sh` ensures it exists as a fallback. See ADR-021.
--- a/ocr-service/Dockerfile
+++ b/ocr-service/Dockerfile
@@ -32,6 +32,7 @@ ENV HOME=/home/ocr
 ENV HF_HOME=/app/cache
 ENV XDG_CACHE_HOME=/app/cache
 ENV TORCH_HOME=/app/models/torch
+ENV TMPDIR=/app/cache/.tmp

 USER ocr

--- a/ocr-service/README.md
+++ b/ocr-service/README.md
@@ -32,6 +32,10 @@ Python FastAPI microservice that performs the actual handwritten text recognitio
 | `ALLOWED_PDF_HOSTS` | `minio,localhost,127.0.0.1` | YES | — | SSRF protection — comma-separated allowed PDF source hosts. Never set to `*`. |
 | `KRAKEN_MODEL_PATH` | `/app/models/` | — | — | Directory where Kraken HTR models are stored (populated by `download-kraken-models.sh`) |
 | `BLLA_MODEL_PATH` | `/app/models/blla.mlmodel` | — | — | Kraken baseline layout analysis model. Auto-downloaded via `ensure_blla_model.py` on startup if missing. |
+| `HF_HOME` | `/app/cache` | — | — | HuggingFace model cache root. Keeps model downloads on the persistent cache volume. |
+| `XDG_CACHE_HOME` | `/app/cache` | — | — | XDG cache root (used by some Surya components alongside `HF_HOME`). |
+| `TORCH_HOME` | `/app/models/torch` | — | — | PyTorch model cache. Kept on the persistent models volume. |
+| `TMPDIR` | `/app/cache/.tmp` | — | — | Download-staging directory for GB-scale Surya model files. Must point to a disk-backed path, not the 512 MB `/tmp` tmpfs — see ADR-021. |

 ## Key files

--- a/ocr-service/entrypoint.sh
+++ b/ocr-service/entrypoint.sh
@@ -1,6 +1,12 @@
 #!/bin/bash
 set -euo pipefail

+# Ensure TMPDIR exists on the persistent cache volume (created by the volume-init
+# container, but guaranteed here for fresh volumes and bare docker-run usage).
+# Remove stale partial downloads left by a previous docker-kill.
+mkdir -p "${TMPDIR:-/tmp}"
+find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true
+
 # Validate the blla segmentation base model and download it if missing or
 # incompatible. ketos 7 dropped support for legacy PyTorch ZIP archives —
 # this ensures the volume always holds a loadable CoreML protobuf model.
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -27,6 +27,7 @@ from engines import kraken as kraken_engine
 from engines import surya as surya_engine
 from models import OcrBlock, OcrRequest
 from preprocessing import preprocess_page
+from utils import _validate_zip_entry

 TRAINING_TOKEN = os.environ.get("TRAINING_TOKEN", "")
 KRAKEN_MODEL_PATH = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
@@ -291,14 +292,6 @@ def _check_training_token(x_training_token: str | None) -> None:
        raise HTTPException(status_code=403, detail="Invalid or missing X-Training-Token")


-def _validate_zip_entry(name: str, extract_dir: str) -> None:
-    """Reject ZIP Slip attacks: path traversal and absolute paths."""
-    if os.path.isabs(name) or name.startswith(".."):
-        raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}")
-    resolved = os.path.realpath(os.path.join(extract_dir, name))
-    if not resolved.startswith(os.path.realpath(extract_dir)):
-        raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}")
-

 def _rotate_backups(model_path: str, keep: int = 3) -> None:
    """Keep only the last `keep` timestamped backups of the model."""
--- a/ocr-service/test_tmpdir.py
+++ b/ocr-service/test_tmpdir.py
@@ -0,0 +1,151 @@
+"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021."""
+
+import os
+import subprocess
+import tempfile
+import time
+
+import pytest
+
+from fastapi import HTTPException
+from utils import _validate_zip_entry
+
+_ENTRYPOINT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "entrypoint.sh")
+
+
+def _run_entrypoint(tmpdir, tmp_path):
+    """Run entrypoint.sh with TMPDIR set to tmpdir; python3/uvicorn are stubbed out."""
+    stub_bin = tmp_path / "stub_bin"
+    stub_bin.mkdir(exist_ok=True)
+    for name in ("python3", "uvicorn"):
+        stub = stub_bin / name
+        stub.write_text("#!/bin/sh\nexit 0\n")
+        stub.chmod(0o755)
+    env = {
+        **os.environ,
+        "TMPDIR": str(tmpdir),
+        "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}",
+    }
+    return subprocess.run(["bash", _ENTRYPOINT], env=env, capture_output=True, text=True)
+
+
+def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path):
+    """Python honours the TMPDIR env var when creating temporary directories.
+
+    Documents the mechanism that routes Surya model staging to the persistent
+    cache volume instead of the 512 MB RAM tmpfs. See ADR-021.
+    """
+    custom_tmp = tmp_path / "model_staging"
+    custom_tmp.mkdir()
+    monkeypatch.setenv("TMPDIR", str(custom_tmp))
+    monkeypatch.setattr(tempfile, "tempdir", None)
+    with tempfile.TemporaryDirectory() as td:
+        assert td.startswith(str(custom_tmp))
+
+
+def test_entrypoint_creates_tmpdir(tmp_path):
+    """entrypoint.sh creates the TMPDIR directory when it does not exist.
+
+    On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint
+    must create it before uvicorn starts so the first Surya model download
+    does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021.
+    """
+    custom_tmp = tmp_path / "model-staging"
+    assert not custom_tmp.exists(), "pre-condition: directory must not exist yet"
+
+    stub_bin = tmp_path / "stub_bin"
+    stub_bin.mkdir()
+    for name in ("python3", "uvicorn"):
+        stub = stub_bin / name
+        stub.write_text("#!/bin/sh\nexit 0\n")
+        stub.chmod(0o755)
+
+    env = {
+        **os.environ,
+        "TMPDIR": str(custom_tmp),
+        "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}",
+    }
+    result = subprocess.run(
+        ["bash", _ENTRYPOINT],
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+    assert result.returncode == 0, (
+        f"entrypoint.sh exited {result.returncode}\n"
+        f"stdout: {result.stdout}\nstderr: {result.stderr}"
+    )
+    assert custom_tmp.exists(), (
+        f"entrypoint.sh did not create TMPDIR={custom_tmp}\n"
+        f"stdout: {result.stdout}\nstderr: {result.stderr}"
+    )
+
+
+@pytest.mark.skipif(
+    not os.environ.get("TMPDIR", "").startswith("/app/cache"),
+    reason="TMPDIR contract only enforced inside the OCR Docker container",
+)
+def test_tmpdir_is_inside_persistent_cache_volume():
+    """TMPDIR must point to the persistent cache volume, not a RAM tmpfs.
+
+    Catches accidental reversion to /tmp or any tmpfs-backed path.
+    Runs only inside the OCR Docker container where TMPDIR=/app/cache/.tmp.
+    To run manually: docker exec archiv-ocr python -m pytest test_tmpdir.py::test_tmpdir_is_inside_persistent_cache_volume -v
+    See ADR-021.
+    """
+    tmpdir = os.environ["TMPDIR"]
+    assert tmpdir.startswith("/app/cache"), (
+        f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads "
+        "to the SSD-backed cache volume — see ADR-021"
+    )
+
+
+def test_entrypoint_removes_day_old_orphans(tmp_path):
+    """entrypoint.sh deletes partial downloads older than 1 day from TMPDIR.
+
+    Simulates a file left behind by a docker-kill mid-download: backdate its
+    mtime by 2 days using os.utime(), run the entrypoint, assert it is gone.
+    See ADR-021.
+    """
+    staging = tmp_path / "staging"
+    staging.mkdir()
+    stale_file = staging / "model.safetensors.partial"
+    stale_file.write_bytes(b"partial download")
+    two_days_ago = time.time() - 2 * 24 * 3600
+    os.utime(stale_file, (two_days_ago, two_days_ago))
+
+    result = _run_entrypoint(staging, tmp_path)
+    assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}"
+    assert not stale_file.exists(), "day-old orphan should have been deleted by entrypoint.sh"
+
+
+def test_entrypoint_preserves_fresh_files(tmp_path):
+    """entrypoint.sh does not delete files newer than 1 day from TMPDIR.
+
+    An in-progress download whose mtime is recent must survive the orphan
+    cleanup so a concurrent or just-started model fetch is not interrupted.
+    See ADR-021.
+    """
+    staging = tmp_path / "staging"
+    staging.mkdir()
+    fresh_file = staging / "model.safetensors.part"
+    fresh_file.write_bytes(b"in progress")
+    # mtime is now — no os.utime() call needed
+
+    result = _run_entrypoint(staging, tmp_path)
+    assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}"
+    assert fresh_file.exists(), "recent file should not have been deleted by entrypoint.sh"
+
+
+def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path):
+    """_validate_zip_entry rejects path-traversal when extract_dir is under a custom TMPDIR.
+
+    When TMPDIR=/app/cache/.tmp, extraction dirs live under that path.
+    Verifies os.path.realpath() still anchors correctly against the non-default base.
+    """
+    extract_dir = tmp_path / "model-staging" / "tmpXXX"
+    extract_dir.mkdir(parents=True)
+
+    with pytest.raises(HTTPException) as exc_info:
+        _validate_zip_entry("../evil.py", str(extract_dir))
+    assert exc_info.value.status_code == 400
--- a/ocr-service/utils.py
+++ b/ocr-service/utils.py
@@ -0,0 +1,14 @@
+"""Utility functions shared across the OCR service with no ML-stack imports."""
+
+import os
+
+from fastapi import HTTPException
+
+
+def _validate_zip_entry(name: str, extract_dir: str) -> None:
+    """Reject ZIP Slip attacks: path traversal and absolute paths."""
+    if os.path.isabs(name) or name.startswith(".."):
+        raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}")
+    resolved = os.path.realpath(os.path.join(extract_dir, name))
+    if not resolved.startswith(os.path.realpath(extract_dir)):
+        raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}")