From 09a043431e536546abafb625459e4376d6c41b03 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 10:53:15 +0200 Subject: [PATCH 01/10] build(ocr): set ENV TMPDIR=/app/cache/.tmp so docker run uses SSD staging Without this, running the image outside compose loses the TMPDIR redirect and Surya model downloads fall back to the 512 MB /tmp tmpfs (ENOSPC). See issue #614, ADR-021. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile index 9ad75f5c..777d0ae3 100644 --- a/ocr-service/Dockerfile +++ b/ocr-service/Dockerfile @@ -32,6 +32,7 @@ ENV HOME=/home/ocr ENV HF_HOME=/app/cache ENV XDG_CACHE_HOME=/app/cache ENV TORCH_HOME=/app/models/torch +ENV TMPDIR=/app/cache/.tmp USER ocr -- 2.49.1 From 240b373f68f8dc64ea7d65827ca5a2c1d4c0208f Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 10:54:17 +0200 Subject: [PATCH 02/10] fix(ocr): create TMPDIR on startup and clear day-old orphans On a fresh ocr_cache volume /app/cache/.tmp does not exist yet. The mkdir ensures the first Surya model download can proceed without ENOSPC on the 512 MB /tmp tmpfs. The find cleanup removes fragments left by docker-kill mid-download, preventing cross-job ground-truth leakage. Fixes #614. See ADR-021. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/entrypoint.sh | 7 +++ ocr-service/test_tmpdir.py | 97 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 ocr-service/test_tmpdir.py diff --git a/ocr-service/entrypoint.sh b/ocr-service/entrypoint.sh index ec6892a8..104e2fbb 100644 --- a/ocr-service/entrypoint.sh +++ b/ocr-service/entrypoint.sh @@ -1,6 +1,13 @@ #!/bin/bash set -euo pipefail +# Ensure TMPDIR exists on the persistent cache volume (created by the volume-init +# container, but guaranteed here for fresh volumes and bare docker-run usage). 
+# Orphaned fragments from prior docker-kill during model downloads are cleared +# on startup to prevent cross-job ground-truth leakage (Surya staging files). +mkdir -p "${TMPDIR:-/tmp}" +find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true + # Validate the blla segmentation base model and download it if missing or # incompatible. ketos 7 dropped support for legacy PyTorch ZIP archives — # this ensures the volume always holds a loadable CoreML protobuf model. diff --git a/ocr-service/test_tmpdir.py b/ocr-service/test_tmpdir.py new file mode 100644 index 00000000..fb55ccb4 --- /dev/null +++ b/ocr-service/test_tmpdir.py @@ -0,0 +1,97 @@ +"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021.""" + +import os +import subprocess +import tempfile + +import pytest + +try: + from fastapi import HTTPException + from main import _validate_zip_entry + HAS_MAIN = True +except ImportError: + HAS_MAIN = False + +_ENTRYPOINT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "entrypoint.sh") + + +def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path): + """Python honours the TMPDIR env var when creating temporary directories. + + Documents the mechanism that routes Surya model staging to the persistent + cache volume instead of the 512 MB RAM tmpfs. See ADR-021. + """ + custom_tmp = tmp_path / "model_staging" + custom_tmp.mkdir() + monkeypatch.setenv("TMPDIR", str(custom_tmp)) + monkeypatch.setattr(tempfile, "tempdir", None) + with tempfile.TemporaryDirectory() as td: + assert td.startswith(str(custom_tmp)) + + +def test_entrypoint_creates_tmpdir(tmp_path): + """entrypoint.sh creates the TMPDIR directory when it does not exist. + + On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint + must create it before uvicorn starts so the first Surya model download + does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021. 
+ """ + custom_tmp = tmp_path / "model-staging" + assert not custom_tmp.exists(), "pre-condition: directory must not exist yet" + + stub_bin = tmp_path / "stub_bin" + stub_bin.mkdir() + for name in ("python3", "uvicorn"): + stub = stub_bin / name + stub.write_text("#!/bin/sh\nexit 0\n") + stub.chmod(0o755) + + env = { + **os.environ, + "TMPDIR": str(custom_tmp), + "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}", + } + result = subprocess.run( + ["bash", _ENTRYPOINT], + env=env, + capture_output=True, + text=True, + ) + assert custom_tmp.exists(), ( + f"entrypoint.sh did not create TMPDIR={custom_tmp}\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) + + +@pytest.mark.skipif( + not os.environ.get("TMPDIR", "").startswith("/app/cache"), + reason="TMPDIR contract only enforced inside the OCR Docker container", +) +def test_tmpdir_is_inside_persistent_cache_volume(): + """TMPDIR must point to the persistent cache volume, not a RAM tmpfs. + + Catches accidental reversion to /tmp or any tmpfs-backed path. + Runs only inside the OCR Docker container where TMPDIR=/app/cache/.tmp. + See ADR-021. + """ + tmpdir = os.environ["TMPDIR"] + assert tmpdir.startswith("/app/cache"), ( + f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads " + "to the SSD-backed cache volume — see ADR-021" + ) + + +@pytest.mark.skipif(not HAS_MAIN, reason="requires full ML stack (not available in CI)") +def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path): + """_validate_zip_entry rejects path-traversal when extract_dir is under a custom TMPDIR. + + When TMPDIR=/app/cache/.tmp, extraction dirs live under that path. + Verifies os.path.realpath() still anchors correctly against the non-default base. 
+ """ + extract_dir = tmp_path / "model-staging" / "tmpXXX" + extract_dir.mkdir(parents=True) + + with pytest.raises(HTTPException) as exc_info: + _validate_zip_entry("../evil.py", str(extract_dir)) + assert exc_info.value.status_code == 400 -- 2.49.1 From 1f7b08b74f6180d315b1af89211dd82586f2bd61 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 10:56:10 +0200 Subject: [PATCH 03/10] fix(ocr): add TMPDIR env var and ocr-volume-init service to compose files TMPDIR=/app/cache/.tmp routes Surya model staging to the SSD-backed cache volume instead of the 512 MB /tmp tmpfs. The ocr-volume-init one-shot service runs first to ensure correct ownership (uid 1000) and creates /app/cache/.tmp on fresh volumes, making AC #6 ("fresh volume still works") a permanent infrastructure-as-code guarantee rather than a manual chown step. Both docker-compose.yml and docker-compose.prod.yml are updated in the same commit to prevent the silent drift that occurred with the 512 MB tmpfs comment. Fixes #614. See ADR-021. Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.prod.yml | 25 ++++++++++++++++++++++++- docker-compose.yml | 25 ++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index dbae6e9a..517d7a98 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -128,6 +128,22 @@ services: timeout: 5s retries: 5 + # --- OCR: Volume bootstrap --- + # Ensures correct ownership and directory structure on ocr-cache / ocr-models + # before ocr-service starts. Handles pre-existing volumes (including those + # created before the non-root ocr user was introduced in commit 1aca4c4a) + # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021. 
+ ocr-volume-init: + image: alpine:3 + command: + - sh + - -c + - "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp" + volumes: + - ocr-models:/app/models + - ocr-cache:/app/cache + restart: "no" + ocr-service: build: context: ./ocr-service @@ -147,6 +163,9 @@ services: HF_HOME: /app/cache XDG_CACHE_HOME: /app/cache TORCH_HOME: /app/models/torch + TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs. + # /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp + # but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021. KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} OCR_CONFIDENCE_THRESHOLD: "0.3" @@ -164,9 +183,13 @@ services: timeout: 5s retries: 12 start_period: 120s + depends_on: + ocr-volume-init: + condition: service_completed_successfully read_only: true tmpfs: - - /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images) + - /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly). + # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021. cap_drop: - ALL security_opt: diff --git a/docker-compose.yml b/docker-compose.yml index 91f8bbda..7ebf907a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -71,6 +71,22 @@ services: networks: - archiv-net + # --- OCR: Volume bootstrap --- + # Ensures correct ownership and directory structure on ocr_cache / ocr_models + # before ocr-service starts. Handles pre-existing volumes (including those + # created before the non-root ocr user was introduced in commit 1aca4c4a) + # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021. 
+ ocr-volume-init: + image: alpine:3 + command: + - sh + - -c + - "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp" + volumes: + - ocr_models:/app/models + - ocr_cache:/app/cache + restart: "no" + # --- OCR: Python microservice (Surya + Kraken) --- # Single-node only: OCR training reloads the model in-process after each run. # Running multiple replicas would cause training conflicts and model-state divergence. @@ -92,6 +108,9 @@ services: HF_HOME: /app/cache XDG_CACHE_HOME: /app/cache TORCH_HOME: /app/models/torch + TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs. + # /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp + # but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021. KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}" OCR_CONFIDENCE_THRESHOLD: "0.3" @@ -109,9 +128,13 @@ services: timeout: 5s retries: 12 start_period: 120s + depends_on: + ocr-volume-init: + condition: service_completed_successfully read_only: true tmpfs: - - /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images) + - /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly). + # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021. 
cap_drop: - ALL security_opt: -- 2.49.1 From cfd49ff69ece15c73c582bec45f77d3316f080e5 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 10:58:10 +0200 Subject: [PATCH 04/10] docs(ocr): document TMPDIR convention and add ADR-021 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ocr-service/README.md: add HF_HOME, XDG_CACHE_HOME, TORCH_HOME, TMPDIR rows to the environment variables table - ocr-service/CLAUDE.md: LLM reminder — TMPDIR must stay on the cache volume - docs/adr/021-tmpdir-persistent-volume-staging.md: records the decision, trade-offs, and rejected alternatives (Approach B / C) for issue #614 - ci.yml: add test_tmpdir.py to the OCR CI run (stdlib-only tests, no ML stack) Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/ci.yml | 7 +- .../021-tmpdir-persistent-volume-staging.md | 68 +++++++++++++++++++ ocr-service/CLAUDE.md | 2 + ocr-service/README.md | 4 ++ 4 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 docs/adr/021-tmpdir-persistent-volume-staging.md diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index a086f7c8..20882024 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -148,7 +148,10 @@ jobs: path: frontend/test-results/screenshots/ # ─── OCR Service Unit Tests ─────────────────────────────────────────────────── - # Only spell_check.py, test_confidence.py, test_sender_registry.py — no ML stack required. + # Only stdlib/lightweight tests — no ML stack (PyTorch/Surya/Kraken) required. + # test_tmpdir.py covers the TMPDIR env var and entrypoint mkdir behaviour (ADR-021). + # test_tmpdir_is_inside_persistent_cache_volume is skipped in CI (TMPDIR not + # set to /app/cache here); it runs inside the deployed Docker container. 
ocr-tests: name: OCR Service Tests runs-on: ubuntu-latest @@ -164,7 +167,7 @@ jobs: working-directory: ocr-service - name: Run OCR unit tests (no ML stack required) - run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py -v + run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py test_tmpdir.py -v working-directory: ocr-service # ─── Backend Unit & Slice Tests ─────────────────────────────────────────────── diff --git a/docs/adr/021-tmpdir-persistent-volume-staging.md b/docs/adr/021-tmpdir-persistent-volume-staging.md new file mode 100644 index 00000000..aaa7a203 --- /dev/null +++ b/docs/adr/021-tmpdir-persistent-volume-staging.md @@ -0,0 +1,68 @@ +# ADR-021 — Route Surya model-download staging to the persistent cache volume via TMPDIR + +**Status:** Accepted +**Date:** 2026-05-18 +**Issue:** #614 + +--- + +## Context + +After the container hardening baseline (ADR-019), the OCR service runs with `read_only: true` and a 512 MB `/tmp` tmpfs. The tmpfs was sized for training-ZIP extraction (typically 20–50 images, well under 100 MB). + +Surya's `download_directory()` (surya ≥ 0.6, `surya/common/s3.py`) stages every model file through `tempfile.TemporaryDirectory()` before moving it to the final cache location. `TemporaryDirectory()` honours `$TMPDIR` and falls back to `/tmp`. The `text_recognition` model is 1.34 GB; future Surya models will be in the same range. This blows the 512 MB budget at ~510 MB with `OSError: [Errno 28] No space left on device`. + +The host has 1.8 TB free on the disk that backs `/app/cache`. The failure is a routing problem, not a capacity problem. + +--- + +## Decision + +Set `TMPDIR=/app/cache/.tmp` in the OCR container so all `tempfile` staging goes to the persistent SSD-backed cache volume. 
+ +```yaml +# docker-compose.yml / docker-compose.prod.yml — ocr-service.environment +TMPDIR: /app/cache/.tmp +``` + +```dockerfile +# ocr-service/Dockerfile — default for bare docker-run usage +ENV TMPDIR=/app/cache/.tmp +``` + +```bash +# ocr-service/entrypoint.sh — idempotent directory bootstrap +mkdir -p "${TMPDIR:-/tmp}" +find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true +``` + +A one-shot `ocr-volume-init` service in both compose files runs before `ocr-service` to `chown -R 1000:1000` the volumes and `mkdir -p /app/cache/.tmp`. This replaces the manual `docker run --rm alpine chown` step performed on 2026-05-18 and makes fresh-volume correctness a permanent infrastructure-as-code guarantee. + +The `/tmp` tmpfs remains at 512 MB and continues to serve training-ZIP extraction and transient PDF buffers — its original purpose. + +--- + +## Consequences + +**Positive** + +- Surya model downloads complete: 1.34 GB fits on the SSD, not in 512 MB of RAM. +- `shutil.move()` from staging → cache becomes a same-filesystem `rename(2)` — atomic and near-free. +- Volume ownership is now automated; no manual `docker run --rm alpine chown` on redeploy. +- `/tmp` retains its small 512 MB DoS cap for attacker-influenceable training endpoints (post-auth only, behind `X-Training-Token`). +- ZIP Slip protection in `_validate_zip_entry()` is unaffected — it uses `os.path.realpath()` anchored to the extraction directory regardless of where that directory lives. + +**Negative / Trade-offs** + +- If the container is `docker kill`ed mid-download, partial files persist in `/app/cache/.tmp` across container restarts. Mitigated by the `find -mtime +1 -delete` in `entrypoint.sh` — orphans at least two days old are removed on startup (`-mtime +1` discards the fractional day, so it matches files whose age is ≥ 48 h, not ≥ 24 h). +- `TMPDIR` pointing inside a volume mount is non-obvious. Any future move of `/app/cache` to a different storage tier must revisit this setting. This ADR is the load-bearing reference. 
+ +--- + +## Alternatives considered + +**Approach B — Enlarge `/tmp` to 4 GB** +One-line change. Discarded because: (1) 4 GB tmpfs counts against the cgroup `mem_limit`; on CX32 hosts with `OCR_MEM_LIMIT=6g` the combined Surya resident set + tmpfs would trigger OOMKill on cold start; (2) staging GB-scale model files through RAM is using the wrong storage tier; (3) any future model larger than 4 GB requires another bump. + +**Approach C — Both TMPDIR redirect and enlarged /tmp** +Belt-and-suspenders: Approach A + 1 GB tmpfs. Discarded in favour of the cleaner Approach A. The defence-in-depth benefit does not outweigh the extra compose churn; the 512 MB cap on `/tmp` is intentional. diff --git a/ocr-service/CLAUDE.md b/ocr-service/CLAUDE.md index f628c60b..09d4a895 100644 --- a/ocr-service/CLAUDE.md +++ b/ocr-service/CLAUDE.md @@ -5,3 +5,5 @@ **LLM reminder:** the OCR service is a **single-node container** — training reloads the model in-process, so multiple replicas cause model-state divergence (see ADR-001). All job tracking and business logic stay in Spring Boot; the Python service is stateless OCR only. **LLM reminder:** `ALLOWED_PDF_HOSTS` must never be set to `*` — that opens SSRF. The default (`minio,localhost,127.0.0.1`) is correct for dev. + +**LLM reminder:** `TMPDIR` points to `/app/cache/.tmp` (persistent SSD volume). Never redirect it back to `/tmp` or any RAM-backed path — `/tmp` is 512 MB and cannot stage GB-scale Surya model downloads (causes ENOSPC). The `ocr-volume-init` container creates the directory on fresh volumes; `entrypoint.sh` ensures it exists as a fallback. See ADR-021. diff --git a/ocr-service/README.md b/ocr-service/README.md index 976db06b..f8600cb9 100644 --- a/ocr-service/README.md +++ b/ocr-service/README.md @@ -32,6 +32,10 @@ Python FastAPI microservice that performs the actual handwritten text recognitio | `ALLOWED_PDF_HOSTS` | `minio,localhost,127.0.0.1` | YES | — | SSRF protection — comma-separated allowed PDF source hosts. 
Never set to `*`. | | `KRAKEN_MODEL_PATH` | `/app/models/` | — | — | Directory where Kraken HTR models are stored (populated by `download-kraken-models.sh`) | | `BLLA_MODEL_PATH` | `/app/models/blla.mlmodel` | — | — | Kraken baseline layout analysis model. Auto-downloaded via `ensure_blla_model.py` on startup if missing. | +| `HF_HOME` | `/app/cache` | — | — | HuggingFace model cache root. Keeps model downloads on the persistent cache volume. | +| `XDG_CACHE_HOME` | `/app/cache` | — | — | XDG cache root (used by some Surya components alongside `HF_HOME`). | +| `TORCH_HOME` | `/app/models/torch` | — | — | PyTorch model cache. Kept on the persistent models volume. | +| `TMPDIR` | `/app/cache/.tmp` | — | — | Download-staging directory for GB-scale Surya model files. Must point to a disk-backed path, not the 512 MB `/tmp` tmpfs — see ADR-021. | ## Key files -- 2.49.1 From c2bd1b34f054ff978330875cd28018d91b24984b Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 11:17:15 +0200 Subject: [PATCH 05/10] refactor(ocr): extract _validate_zip_entry to utils.py so ZIP Slip test runs in CI _validate_zip_entry has no ML-stack dependency; importing it via main.py pulled in surya/torch and caused the test to be skipped in CI. Moving it to utils.py (fastapi only) and adding fastapi to the CI lightweight install lets test_zipslip_still_anchors_under_custom_tmpdir run on every push. 
Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/ci.yml | 2 +- ocr-service/main.py | 9 +-------- ocr-service/test_tmpdir.py | 9 ++------- ocr-service/utils.py | 14 ++++++++++++++ 4 files changed, 18 insertions(+), 16 deletions(-) create mode 100644 ocr-service/utils.py diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 20882024..ae6228e2 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -163,7 +163,7 @@ jobs: python-version: '3.11' - name: Install test dependencies - run: pip install "pyspellchecker==0.9.0" pytest pytest-asyncio + run: pip install "pyspellchecker==0.9.0" "fastapi==0.115.6" pytest pytest-asyncio working-directory: ocr-service - name: Run OCR unit tests (no ML stack required) diff --git a/ocr-service/main.py b/ocr-service/main.py index 783bf224..409cc78f 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -27,6 +27,7 @@ from engines import kraken as kraken_engine from engines import surya as surya_engine from models import OcrBlock, OcrRequest from preprocessing import preprocess_page +from utils import _validate_zip_entry TRAINING_TOKEN = os.environ.get("TRAINING_TOKEN", "") KRAKEN_MODEL_PATH = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel") @@ -291,14 +292,6 @@ def _check_training_token(x_training_token: str | None) -> None: raise HTTPException(status_code=403, detail="Invalid or missing X-Training-Token") -def _validate_zip_entry(name: str, extract_dir: str) -> None: - """Reject ZIP Slip attacks: path traversal and absolute paths.""" - if os.path.isabs(name) or name.startswith(".."): - raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}") - resolved = os.path.realpath(os.path.join(extract_dir, name)) - if not resolved.startswith(os.path.realpath(extract_dir)): - raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}") - def _rotate_backups(model_path: str, keep: int = 3) -> None: """Keep only the last `keep` timestamped backups of 
the model.""" diff --git a/ocr-service/test_tmpdir.py b/ocr-service/test_tmpdir.py index fb55ccb4..1ea985b1 100644 --- a/ocr-service/test_tmpdir.py +++ b/ocr-service/test_tmpdir.py @@ -6,12 +6,8 @@ import tempfile import pytest -try: - from fastapi import HTTPException - from main import _validate_zip_entry - HAS_MAIN = True -except ImportError: - HAS_MAIN = False +from fastapi import HTTPException +from utils import _validate_zip_entry _ENTRYPOINT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "entrypoint.sh") @@ -82,7 +78,6 @@ def test_tmpdir_is_inside_persistent_cache_volume(): ) -@pytest.mark.skipif(not HAS_MAIN, reason="requires full ML stack (not available in CI)") def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path): """_validate_zip_entry rejects path-traversal when extract_dir is under a custom TMPDIR. diff --git a/ocr-service/utils.py b/ocr-service/utils.py new file mode 100644 index 00000000..18d04832 --- /dev/null +++ b/ocr-service/utils.py @@ -0,0 +1,14 @@ +"""Utility functions shared across the OCR service with no ML-stack imports.""" + +import os + +from fastapi import HTTPException + + +def _validate_zip_entry(name: str, extract_dir: str) -> None: + """Reject ZIP Slip attacks: path traversal and absolute paths.""" + if os.path.isabs(name) or name.startswith(".."): + raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}") + resolved = os.path.realpath(os.path.join(extract_dir, name)) + if not resolved.startswith(os.path.realpath(extract_dir)): + raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}") -- 2.49.1 From e31dac5c9ca274566d52568da01b56773ffcafa8 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 11:18:14 +0200 Subject: [PATCH 06/10] test(ocr): assert entrypoint.sh exit code in test_entrypoint_creates_tmpdir A silent non-zero exit would previously cause the test to pass incorrectly because only directory creation was checked. 
Exit code is now the first assertion, catching regressions before the filesystem check runs. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_tmpdir.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ocr-service/test_tmpdir.py b/ocr-service/test_tmpdir.py index 1ea985b1..1d40433d 100644 --- a/ocr-service/test_tmpdir.py +++ b/ocr-service/test_tmpdir.py @@ -54,6 +54,10 @@ def test_entrypoint_creates_tmpdir(tmp_path): capture_output=True, text=True, ) + assert result.returncode == 0, ( + f"entrypoint.sh exited {result.returncode}\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) assert custom_tmp.exists(), ( f"entrypoint.sh did not create TMPDIR={custom_tmp}\n" f"stdout: {result.stdout}\nstderr: {result.stderr}" -- 2.49.1 From 775b5c062e3175751884073250bdf36332cb618f Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 11:19:33 +0200 Subject: [PATCH 07/10] test(ocr): add orphan cleanup behavior tests for entrypoint.sh find -mtime test_entrypoint_removes_day_old_orphans and test_entrypoint_preserves_fresh_files verify the find -mtime +1 -delete logic using os.utime() to fabricate old mtimes without mocking system time. Also extracts _run_entrypoint helper to remove subprocess setup duplication. 
Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_tmpdir.py | 54 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/ocr-service/test_tmpdir.py b/ocr-service/test_tmpdir.py index 1d40433d..835e8c07 100644 --- a/ocr-service/test_tmpdir.py +++ b/ocr-service/test_tmpdir.py @@ -3,6 +3,7 @@ import os import subprocess import tempfile +import time import pytest @@ -12,6 +13,22 @@ from utils import _validate_zip_entry _ENTRYPOINT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "entrypoint.sh") +def _run_entrypoint(tmpdir, tmp_path): + """Run entrypoint.sh with TMPDIR set to tmpdir; python3/uvicorn are stubbed out.""" + stub_bin = tmp_path / "stub_bin" + stub_bin.mkdir(exist_ok=True) + for name in ("python3", "uvicorn"): + stub = stub_bin / name + stub.write_text("#!/bin/sh\nexit 0\n") + stub.chmod(0o755) + env = { + **os.environ, + "TMPDIR": str(tmpdir), + "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}", + } + return subprocess.run(["bash", _ENTRYPOINT], env=env, capture_output=True, text=True) + + def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path): """Python honours the TMPDIR env var when creating temporary directories. @@ -82,6 +99,43 @@ def test_tmpdir_is_inside_persistent_cache_volume(): ) +def test_entrypoint_removes_day_old_orphans(tmp_path): + """entrypoint.sh deletes partial downloads older than 1 day from TMPDIR. + + Simulates a file left behind by a docker-kill mid-download: backdate its + mtime by 2 days using os.utime(), run the entrypoint, assert it is gone. + See ADR-021. 
+ """ + staging = tmp_path / "staging" + staging.mkdir() + stale_file = staging / "model.safetensors.partial" + stale_file.write_bytes(b"partial download") + two_days_ago = time.time() - 2 * 24 * 3600 + os.utime(stale_file, (two_days_ago, two_days_ago)) + + result = _run_entrypoint(staging, tmp_path) + assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}" + assert not stale_file.exists(), "day-old orphan should have been deleted by entrypoint.sh" + + +def test_entrypoint_preserves_fresh_files(tmp_path): + """entrypoint.sh does not delete files newer than 1 day from TMPDIR. + + An in-progress download whose mtime is recent must survive the orphan + cleanup so a concurrent or just-started model fetch is not interrupted. + See ADR-021. + """ + staging = tmp_path / "staging" + staging.mkdir() + fresh_file = staging / "model.safetensors.part" + fresh_file.write_bytes(b"in progress") + # mtime is now — no os.utime() call needed + + result = _run_entrypoint(staging, tmp_path) + assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}" + assert fresh_file.exists(), "recent file should not have been deleted by entrypoint.sh" + + def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path): """_validate_zip_entry rejects path-traversal when extract_dir is under a custom TMPDIR. 
-- 2.49.1 From 6839cf2a3321d077ce0bb1a6fd04aa541ccea4a5 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 11:20:45 +0200 Subject: [PATCH 08/10] docs(ocr): clarify entrypoint comment and add manual run hint for skipped test - entrypoint.sh: replace "cross-job ground-truth leakage" with plain "Remove stale partial downloads left by a previous docker-kill" - test_tmpdir_is_inside_persistent_cache_volume: add docker exec command so future developers know how to run this deployment-contract test Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/entrypoint.sh | 3 +-- ocr-service/test_tmpdir.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocr-service/entrypoint.sh b/ocr-service/entrypoint.sh index 104e2fbb..f67daee1 100644 --- a/ocr-service/entrypoint.sh +++ b/ocr-service/entrypoint.sh @@ -3,8 +3,7 @@ set -euo pipefail # Ensure TMPDIR exists on the persistent cache volume (created by the volume-init # container, but guaranteed here for fresh volumes and bare docker-run usage). -# Orphaned fragments from prior docker-kill during model downloads are cleared -# on startup to prevent cross-job ground-truth leakage (Surya staging files). +# Remove stale partial downloads left by a previous docker-kill. mkdir -p "${TMPDIR:-/tmp}" find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true diff --git a/ocr-service/test_tmpdir.py b/ocr-service/test_tmpdir.py index 835e8c07..aa79781b 100644 --- a/ocr-service/test_tmpdir.py +++ b/ocr-service/test_tmpdir.py @@ -90,6 +90,7 @@ def test_tmpdir_is_inside_persistent_cache_volume(): Catches accidental reversion to /tmp or any tmpfs-backed path. Runs only inside the OCR Docker container where TMPDIR=/app/cache/.tmp. + To run manually: docker exec archiv-ocr python -m pytest test_tmpdir.py::test_tmpdir_is_inside_persistent_cache_volume -v See ADR-021. 
""" tmpdir = os.environ["TMPDIR"] -- 2.49.1 From 3182da8d92d078c0129bb1dc7d520a8e7fcd6e76 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 11:21:55 +0200 Subject: [PATCH 09/10] fix(infra): pin ocr-volume-init to alpine:3.21 and drop project network MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit alpine:3 is a moving tag — pinning to 3.21 makes builds reproducible and rollbacks possible. networks: [] removes the init container from the project network since it only needs volume access, not network access (least privilege). Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.prod.yml | 3 ++- docker-compose.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 517d7a98..31d85e42 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -134,7 +134,7 @@ services: # created before the non-root ocr user was introduced in commit 1aca4c4a) # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021. ocr-volume-init: - image: alpine:3 + image: alpine:3.21 command: - sh - -c @@ -142,6 +142,7 @@ services: volumes: - ocr-models:/app/models - ocr-cache:/app/cache + networks: [] restart: "no" ocr-service: diff --git a/docker-compose.yml b/docker-compose.yml index 7ebf907a..842f94e1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -77,7 +77,7 @@ services: # created before the non-root ocr user was introduced in commit 1aca4c4a) # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021. 
ocr-volume-init: - image: alpine:3 + image: alpine:3.21 command: - sh - -c @@ -85,6 +85,7 @@ services: volumes: - ocr_models:/app/models - ocr_cache:/app/cache + networks: [] restart: "no" # --- OCR: Python microservice (Surya + Kraken) --- -- 2.49.1 From 193a4d6ee625ab5e5a7978457e47a3cb98e8a504 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 11:23:04 +0200 Subject: [PATCH 10/10] =?UTF-8?q?docs(deployment):=20document=20ocr-volume?= =?UTF-8?q?-init=20bootstrap=20service=20in=20=C2=A78=20upgrade=20notes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explains what ocr-volume-init does (chown volumes + create TMPDIR), how to verify it succeeded (docker logs), and what failure looks like. Addresses reviewer concerns from @mkeller and @tobiwendt on PR #615. Co-Authored-By: Claude Sonnet 4.6 --- docs/DEPLOYMENT.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index aaba04e2..945346ae 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -564,6 +564,22 @@ bash scripts/download-kraken-models.sh Version-specific one-time steps that must be run before or after upgrading to a given release. Each subsection is safe to skip on a fresh install. +### Upgrading to PR #615 — TMPDIR redirect + ocr-volume-init + +`ocr-volume-init` is a new one-shot service in both compose files that runs before `ocr-service` on every `docker compose up`. It: + +1. `chown -R 1000:1000 /app/cache /app/models` — corrects volume ownership so the non-root `ocr` user (uid 1000) can write to volumes that may have been created as root (including volumes from before PR #611). +2. `mkdir -p /app/cache/.tmp` — creates the TMPDIR staging directory that Surya uses for GB-scale model downloads. Without this directory, the first model download falls back to the 512 MB `/tmp` tmpfs and fails with ENOSPC. See ADR-021. 
+ +**Verify it succeeded:** +```bash +docker compose logs ocr-volume-init # dev (run from the repo root) +docker logs archiv-production-ocr-volume-init-1 # prod +``` +Expected output: no error lines. `docker logs` does not print the exit code — confirm it with `docker inspect --format '{{.State.ExitCode}}' <container>` (expect `0`). + +**Failure mode:** if `chown` is denied (e.g. the volume is mounted read-only), the container exits non-zero and `ocr-service` will not start (`depends_on: condition: service_completed_successfully`). Check `docker logs` for the `chown` error and verify the volume is writable. + ### Upgrading to PR #611 — non-root OCR container The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). The existing volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup. -- 2.49.1