diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index a086f7c8..ae6228e2 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -148,7 +148,10 @@ jobs: path: frontend/test-results/screenshots/ # ─── OCR Service Unit Tests ─────────────────────────────────────────────────── - # Only spell_check.py, test_confidence.py, test_sender_registry.py — no ML stack required. + # Only stdlib/lightweight tests — no ML stack (PyTorch/Surya/Kraken) required. + # test_tmpdir.py covers the TMPDIR env var and entrypoint mkdir behaviour (ADR-021). + # test_tmpdir_is_inside_persistent_cache_volume is skipped in CI (TMPDIR not + # set to /app/cache here); it runs inside the deployed Docker container. ocr-tests: name: OCR Service Tests runs-on: ubuntu-latest @@ -160,11 +163,11 @@ jobs: python-version: '3.11' - name: Install test dependencies - run: pip install "pyspellchecker==0.9.0" pytest pytest-asyncio + run: pip install "pyspellchecker==0.9.0" "fastapi==0.115.6" pytest pytest-asyncio working-directory: ocr-service - name: Run OCR unit tests (no ML stack required) - run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py -v + run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py test_tmpdir.py -v working-directory: ocr-service # ─── Backend Unit & Slice Tests ─────────────────────────────────────────────── diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index dbae6e9a..31d85e42 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -128,6 +128,23 @@ services: timeout: 5s retries: 5 + # --- OCR: Volume bootstrap --- + # Ensures correct ownership and directory structure on ocr-cache / ocr-models + # before ocr-service starts. Handles pre-existing volumes (including those + # created before the non-root ocr user was introduced in commit 1aca4c4a) + # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021. + ocr-volume-init: + image: alpine:3.21 + command: + - sh + - -c + - "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp" + volumes: + - ocr-models:/app/models + - ocr-cache:/app/cache + networks: [] + restart: "no" + ocr-service: build: context: ./ocr-service @@ -147,6 +164,9 @@ services: HF_HOME: /app/cache XDG_CACHE_HOME: /app/cache TORCH_HOME: /app/models/torch + TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs. + # /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp + # but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021. KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} OCR_CONFIDENCE_THRESHOLD: "0.3" @@ -164,9 +184,13 @@ services: timeout: 5s retries: 12 start_period: 120s + depends_on: + ocr-volume-init: + condition: service_completed_successfully read_only: true tmpfs: - - /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images) + - /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly). + # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021. cap_drop: - ALL security_opt: diff --git a/docker-compose.yml b/docker-compose.yml index 91f8bbda..842f94e1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -71,6 +71,23 @@ services: networks: - archiv-net + # --- OCR: Volume bootstrap --- + # Ensures correct ownership and directory structure on ocr_cache / ocr_models + # before ocr-service starts. Handles pre-existing volumes (including those + # created before the non-root ocr user was introduced in commit 1aca4c4a) + # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021. + ocr-volume-init: + image: alpine:3.21 + command: + - sh + - -c + - "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp" + volumes: + - ocr_models:/app/models + - ocr_cache:/app/cache + networks: [] + restart: "no" + # --- OCR: Python microservice (Surya + Kraken) --- # Single-node only: OCR training reloads the model in-process after each run. # Running multiple replicas would cause training conflicts and model-state divergence. @@ -92,6 +109,9 @@ services: HF_HOME: /app/cache XDG_CACHE_HOME: /app/cache TORCH_HOME: /app/models/torch + TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs. + # /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp + # but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021. KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}" OCR_CONFIDENCE_THRESHOLD: "0.3" @@ -109,9 +129,13 @@ services: timeout: 5s retries: 12 start_period: 120s + depends_on: + ocr-volume-init: + condition: service_completed_successfully read_only: true tmpfs: - - /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images) + - /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly). + # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021. cap_drop: - ALL security_opt: diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index aaba04e2..945346ae 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -564,6 +564,22 @@ bash scripts/download-kraken-models.sh Version-specific one-time steps that must be run before or after upgrading to a given release. Each subsection is safe to skip on a fresh install. +### Upgrading to PR #615 — TMPDIR redirect + ocr-volume-init + +`ocr-volume-init` is a new one-shot service in both compose files that runs before `ocr-service` on every `docker compose up`. It: + +1. `chown -R 1000:1000 /app/cache /app/models` — corrects volume ownership so the non-root `ocr` user (uid 1000) can write to volumes that may have been created as root (including volumes from before PR #611). +2. `mkdir -p /app/cache/.tmp` — creates the TMPDIR staging directory that Surya uses for GB-scale model downloads. Without this directory, the first model download falls back to the 512 MB `/tmp` tmpfs and fails with ENOSPC. See ADR-021. + +**Verify it succeeded:** +```bash +docker logs archiv-ocr-volume-init # dev +docker logs archiv-production-ocr-volume-init-1 # prod +``` +Expected output: no error lines; exit code 0. + +**Failure mode:** if `chown` is denied (e.g. the volume is mounted read-only), the container exits non-zero and `ocr-service` will not start (`depends_on: condition: service_completed_successfully`). Check `docker logs` for the `chown` error and verify the volume is writable. + ### Upgrading to PR #611 — non-root OCR container The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). The existing volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup. diff --git a/docs/adr/021-tmpdir-persistent-volume-staging.md b/docs/adr/021-tmpdir-persistent-volume-staging.md new file mode 100644 index 00000000..aaa7a203 --- /dev/null +++ b/docs/adr/021-tmpdir-persistent-volume-staging.md @@ -0,0 +1,68 @@ +# ADR-021 — Route Surya model-download staging to the persistent cache volume via TMPDIR + +**Status:** Accepted +**Date:** 2026-05-18 +**Issue:** #614 + +--- + +## Context + +After the container hardening baseline (ADR-019), the OCR service runs with `read_only: true` and a 512 MB `/tmp` tmpfs. The tmpfs was sized for training-ZIP extraction (typically 20–50 images, well under 100 MB). + +Surya's `download_directory()` (surya ≥ 0.6, `surya/common/s3.py`) stages every model file through `tempfile.TemporaryDirectory()` before moving it to the final cache location. `TemporaryDirectory()` honours `$TMPDIR` and falls back to `/tmp`. The `text_recognition` model is 1.34 GB; future Surya models will be in the same range. This blows the 512 MB budget at ~510 MB with `OSError: [Errno 28] No space left on device`. + +The host has 1.8 TB free on the disk that backs `/app/cache`. The failure is a routing problem, not a capacity problem. + +--- + +## Decision + +Set `TMPDIR=/app/cache/.tmp` in the OCR container so all `tempfile` staging goes to the persistent SSD-backed cache volume. + +```yaml +# docker-compose.yml / docker-compose.prod.yml — ocr-service.environment +TMPDIR: /app/cache/.tmp +``` + +```dockerfile +# ocr-service/Dockerfile — default for bare docker-run usage +ENV TMPDIR=/app/cache/.tmp +``` + +```bash +# ocr-service/entrypoint.sh — idempotent directory bootstrap +mkdir -p "${TMPDIR:-/tmp}" +find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true +``` + +A one-shot `ocr-volume-init` service in both compose files runs before `ocr-service` to `chown -R 1000:1000` the volumes and `mkdir -p /app/cache/.tmp`. This replaces the manual `docker run --rm alpine chown` step performed on 2026-05-18 and makes fresh-volume correctness a permanent infrastructure-as-code guarantee. + +The `/tmp` tmpfs remains at 512 MB and continues to serve training-ZIP extraction and transient PDF buffers — its original purpose. + +--- + +## Consequences + +**Positive** + +- Surya model downloads complete: 1.34 GB fits on the SSD, not in 512 MB of RAM. +- `shutil.move()` from staging → cache becomes a same-filesystem `rename(2)` — atomic and near-free. +- Volume ownership is now automated; no manual `docker run --rm alpine chown` on redeploy. +- `/tmp` retains its small 512 MB DoS cap for attacker-influenceable training endpoints (post-auth only, behind `X-Training-Token`). +- ZIP Slip protection in `_validate_zip_entry()` is unaffected — it uses `os.path.realpath()` anchored to the extraction directory regardless of where that directory lives. + +**Negative / Trade-offs** + +- If the container is `docker kill`ed mid-download, partial files persist in `/app/cache/.tmp` across container restarts. Mitigated by the `find -mtime +1 -delete` in `entrypoint.sh` — orphans older than one day are removed on startup. +- `TMPDIR` pointing inside a volume mount is non-obvious. Any future move of `/app/cache` to a different storage tier must revisit this setting. This ADR is the load-bearing reference. + +--- + +## Alternatives considered + +**Approach B — Enlarge `/tmp` to 4 GB** +One-line change. Discarded because: (1) 4 GB tmpfs counts against the cgroup `mem_limit`; on CX32 hosts with `OCR_MEM_LIMIT=6g` the combined Surya resident set + tmpfs would trigger OOMKill on cold start; (2) staging GB-scale model files through RAM is using the wrong storage tier; (3) any future model larger than 4 GB requires another bump. + +**Approach C — Both TMPDIR redirect and enlarged /tmp** +Belt-and-suspenders: Approach A + 1 GB tmpfs. Discarded in favour of the cleaner Approach A. The defence-in-depth benefit does not outweigh the extra compose churn; the 512 MB cap on `/tmp` is intentional. diff --git a/ocr-service/CLAUDE.md b/ocr-service/CLAUDE.md index f628c60b..09d4a895 100644 --- a/ocr-service/CLAUDE.md +++ b/ocr-service/CLAUDE.md @@ -5,3 +5,5 @@ **LLM reminder:** the OCR service is a **single-node container** — training reloads the model in-process, so multiple replicas cause model-state divergence (see ADR-001). All job tracking and business logic stay in Spring Boot; the Python service is stateless OCR only. **LLM reminder:** `ALLOWED_PDF_HOSTS` must never be set to `*` — that opens SSRF. The default (`minio,localhost,127.0.0.1`) is correct for dev. + +**LLM reminder:** `TMPDIR` points to `/app/cache/.tmp` (persistent SSD volume). Never redirect it back to `/tmp` or any RAM-backed path — `/tmp` is 512 MB and cannot stage GB-scale Surya model downloads (causes ENOSPC). The `ocr-volume-init` container creates the directory on fresh volumes; `entrypoint.sh` ensures it exists as a fallback. See ADR-021. diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile index 9ad75f5c..777d0ae3 100644 --- a/ocr-service/Dockerfile +++ b/ocr-service/Dockerfile @@ -32,6 +32,7 @@ ENV HOME=/home/ocr ENV HF_HOME=/app/cache ENV XDG_CACHE_HOME=/app/cache ENV TORCH_HOME=/app/models/torch +ENV TMPDIR=/app/cache/.tmp USER ocr diff --git a/ocr-service/README.md b/ocr-service/README.md index 976db06b..f8600cb9 100644 --- a/ocr-service/README.md +++ b/ocr-service/README.md @@ -32,6 +32,10 @@ Python FastAPI microservice that performs the actual handwritten text recognitio | `ALLOWED_PDF_HOSTS` | `minio,localhost,127.0.0.1` | YES | — | SSRF protection — comma-separated allowed PDF source hosts. Never set to `*`. | | `KRAKEN_MODEL_PATH` | `/app/models/` | — | — | Directory where Kraken HTR models are stored (populated by `download-kraken-models.sh`) | | `BLLA_MODEL_PATH` | `/app/models/blla.mlmodel` | — | — | Kraken baseline layout analysis model. Auto-downloaded via `ensure_blla_model.py` on startup if missing. | +| `HF_HOME` | `/app/cache` | — | — | HuggingFace model cache root. Keeps model downloads on the persistent cache volume. | +| `XDG_CACHE_HOME` | `/app/cache` | — | — | XDG cache root (used by some Surya components alongside `HF_HOME`). | +| `TORCH_HOME` | `/app/models/torch` | — | — | PyTorch model cache. Kept on the persistent models volume. | +| `TMPDIR` | `/app/cache/.tmp` | — | — | Download-staging directory for GB-scale Surya model files. Must point to a disk-backed path, not the 512 MB `/tmp` tmpfs — see ADR-021. | ## Key files diff --git a/ocr-service/entrypoint.sh b/ocr-service/entrypoint.sh index ec6892a8..f67daee1 100644 --- a/ocr-service/entrypoint.sh +++ b/ocr-service/entrypoint.sh @@ -1,6 +1,12 @@ #!/bin/bash set -euo pipefail +# Ensure TMPDIR exists on the persistent cache volume (created by the volume-init +# container, but guaranteed here for fresh volumes and bare docker-run usage). +# Remove stale partial downloads left by a previous docker-kill. +mkdir -p "${TMPDIR:-/tmp}" +find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true + # Validate the blla segmentation base model and download it if missing or # incompatible. ketos 7 dropped support for legacy PyTorch ZIP archives — # this ensures the volume always holds a loadable CoreML protobuf model. diff --git a/ocr-service/main.py b/ocr-service/main.py index 783bf224..409cc78f 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -27,6 +27,7 @@ from engines import kraken as kraken_engine from engines import surya as surya_engine from models import OcrBlock, OcrRequest from preprocessing import preprocess_page +from utils import _validate_zip_entry TRAINING_TOKEN = os.environ.get("TRAINING_TOKEN", "") KRAKEN_MODEL_PATH = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel") @@ -291,14 +292,6 @@ def _check_training_token(x_training_token: str | None) -> None: raise HTTPException(status_code=403, detail="Invalid or missing X-Training-Token") -def _validate_zip_entry(name: str, extract_dir: str) -> None: - """Reject ZIP Slip attacks: path traversal and absolute paths.""" - if os.path.isabs(name) or name.startswith(".."): - raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}") - resolved = os.path.realpath(os.path.join(extract_dir, name)) - if not resolved.startswith(os.path.realpath(extract_dir)): - raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}") - def _rotate_backups(model_path: str, keep: int = 3) -> None: """Keep only the last `keep` timestamped backups of the model.""" diff --git a/ocr-service/test_tmpdir.py b/ocr-service/test_tmpdir.py new file mode 100644 index 00000000..aa79781b --- /dev/null +++ b/ocr-service/test_tmpdir.py @@ -0,0 +1,151 @@ +"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021.""" + +import os +import subprocess +import tempfile +import time + +import pytest + +from fastapi import HTTPException +from utils import _validate_zip_entry + +_ENTRYPOINT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "entrypoint.sh") + + +def _run_entrypoint(tmpdir, tmp_path): + """Run entrypoint.sh with TMPDIR set to tmpdir; python3/uvicorn are stubbed out.""" + stub_bin = tmp_path / "stub_bin" + stub_bin.mkdir(exist_ok=True) + for name in ("python3", "uvicorn"): + stub = stub_bin / name + stub.write_text("#!/bin/sh\nexit 0\n") + stub.chmod(0o755) + env = { + **os.environ, + "TMPDIR": str(tmpdir), + "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}", + } + return subprocess.run(["bash", _ENTRYPOINT], env=env, capture_output=True, text=True) + + +def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path): + """Python honours the TMPDIR env var when creating temporary directories. + + Documents the mechanism that routes Surya model staging to the persistent + cache volume instead of the 512 MB RAM tmpfs. See ADR-021. + """ + custom_tmp = tmp_path / "model_staging" + custom_tmp.mkdir() + monkeypatch.setenv("TMPDIR", str(custom_tmp)) + monkeypatch.setattr(tempfile, "tempdir", None) + with tempfile.TemporaryDirectory() as td: + assert td.startswith(str(custom_tmp)) + + +def test_entrypoint_creates_tmpdir(tmp_path): + """entrypoint.sh creates the TMPDIR directory when it does not exist. + + On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint + must create it before uvicorn starts so the first Surya model download + does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021. + """ + custom_tmp = tmp_path / "model-staging" + assert not custom_tmp.exists(), "pre-condition: directory must not exist yet" + + stub_bin = tmp_path / "stub_bin" + stub_bin.mkdir() + for name in ("python3", "uvicorn"): + stub = stub_bin / name + stub.write_text("#!/bin/sh\nexit 0\n") + stub.chmod(0o755) + + env = { + **os.environ, + "TMPDIR": str(custom_tmp), + "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}", + } + result = subprocess.run( + ["bash", _ENTRYPOINT], + env=env, + capture_output=True, + text=True, + ) + assert result.returncode == 0, ( + f"entrypoint.sh exited {result.returncode}\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) + assert custom_tmp.exists(), ( + f"entrypoint.sh did not create TMPDIR={custom_tmp}\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) + + +@pytest.mark.skipif( + not os.environ.get("TMPDIR", "").startswith("/app/cache"), + reason="TMPDIR contract only enforced inside the OCR Docker container", +) +def test_tmpdir_is_inside_persistent_cache_volume(): + """TMPDIR must point to the persistent cache volume, not a RAM tmpfs. + + Catches accidental reversion to /tmp or any tmpfs-backed path. + Runs only inside the OCR Docker container where TMPDIR=/app/cache/.tmp. + To run manually: docker exec archiv-ocr python -m pytest test_tmpdir.py::test_tmpdir_is_inside_persistent_cache_volume -v + See ADR-021. + """ + tmpdir = os.environ["TMPDIR"] + assert tmpdir.startswith("/app/cache"), ( + f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads " + "to the SSD-backed cache volume — see ADR-021" + ) + + +def test_entrypoint_removes_day_old_orphans(tmp_path): + """entrypoint.sh deletes partial downloads older than 1 day from TMPDIR. + + Simulates a file left behind by a docker-kill mid-download: backdate its + mtime by 2 days using os.utime(), run the entrypoint, assert it is gone. + See ADR-021. + """ + staging = tmp_path / "staging" + staging.mkdir() + stale_file = staging / "model.safetensors.partial" + stale_file.write_bytes(b"partial download") + two_days_ago = time.time() - 2 * 24 * 3600 + os.utime(stale_file, (two_days_ago, two_days_ago)) + + result = _run_entrypoint(staging, tmp_path) + assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}" + assert not stale_file.exists(), "day-old orphan should have been deleted by entrypoint.sh" + + +def test_entrypoint_preserves_fresh_files(tmp_path): + """entrypoint.sh does not delete files newer than 1 day from TMPDIR. + + An in-progress download whose mtime is recent must survive the orphan + cleanup so a concurrent or just-started model fetch is not interrupted. + See ADR-021. + """ + staging = tmp_path / "staging" + staging.mkdir() + fresh_file = staging / "model.safetensors.part" + fresh_file.write_bytes(b"in progress") + # mtime is now — no os.utime() call needed + + result = _run_entrypoint(staging, tmp_path) + assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}" + assert fresh_file.exists(), "recent file should not have been deleted by entrypoint.sh" + + +def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path): + """_validate_zip_entry rejects path-traversal when extract_dir is under a custom TMPDIR. + + When TMPDIR=/app/cache/.tmp, extraction dirs live under that path. + Verifies os.path.realpath() still anchors correctly against the non-default base. + """ + extract_dir = tmp_path / "model-staging" / "tmpXXX" + extract_dir.mkdir(parents=True) + + with pytest.raises(HTTPException) as exc_info: + _validate_zip_entry("../evil.py", str(extract_dir)) + assert exc_info.value.status_code == 400 diff --git a/ocr-service/utils.py b/ocr-service/utils.py new file mode 100644 index 00000000..18d04832 --- /dev/null +++ b/ocr-service/utils.py @@ -0,0 +1,14 @@ +"""Utility functions shared across the OCR service with no ML-stack imports.""" + +import os + +from fastapi import HTTPException + + +def _validate_zip_entry(name: str, extract_dir: str) -> None: + """Reject ZIP Slip attacks: path traversal and absolute paths.""" + if os.path.isabs(name) or name.startswith(".."): + raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}") + resolved = os.path.realpath(os.path.join(extract_dir, name)) + if not resolved.startswith(os.path.realpath(extract_dir)): + raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}")