From 1f7b08b74f6180d315b1af89211dd82586f2bd61 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 18 May 2026 10:56:10 +0200 Subject: [PATCH] fix(ocr): add TMPDIR env var and ocr-volume-init service to compose files TMPDIR=/app/cache/.tmp routes Surya model staging to the SSD-backed cache volume instead of the 512 MB /tmp tmpfs. The ocr-volume-init one-shot service runs first to ensure correct ownership (uid 1000) and creates /app/cache/.tmp on fresh volumes, making AC #6 ("fresh volume still works") a permanent infrastructure-as-code guarantee rather than a manual chown step. Both docker-compose.yml and docker-compose.prod.yml are updated in the same commit to prevent the silent drift that occurred with the 512 MB tmpfs comment. Fixes #614. See ADR-021. Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.prod.yml | 25 ++++++++++++++++++++++++- docker-compose.yml | 25 ++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index dbae6e9a..517d7a98 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -128,6 +128,22 @@ services: timeout: 5s retries: 5 + # --- OCR: Volume bootstrap --- + # Ensures correct ownership and directory structure on ocr-cache / ocr-models + # before ocr-service starts. Handles pre-existing volumes (including those + # created before the non-root ocr user was introduced in commit 1aca4c4a) + # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021. 
+ ocr-volume-init: + image: alpine:3 + command: + - sh + - -c + - "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp" + volumes: + - ocr-models:/app/models + - ocr-cache:/app/cache + restart: "no" + ocr-service: build: context: ./ocr-service @@ -147,6 +163,9 @@ services: HF_HOME: /app/cache XDG_CACHE_HOME: /app/cache TORCH_HOME: /app/models/torch + TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on the SSD-backed cache volume, not the 512 MB RAM tmpfs. + # /tmp keeps its small DoS cap; training ZIPs are intended to still unpack under /tmp (NOTE(review): TMPDIR also changes Python's default tempfile directory, so confirm the training code targets /tmp explicitly), + # and ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021. KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} OCR_CONFIDENCE_THRESHOLD: "0.3" @@ -164,9 +183,13 @@ services: timeout: 5s retries: 12 start_period: 120s + depends_on: + ocr-volume-init: + condition: service_completed_successfully read_only: true tmpfs: - - /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images) + - /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly). + # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021. cap_drop: - ALL security_opt: diff --git a/docker-compose.yml b/docker-compose.yml index 91f8bbda..7ebf907a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -71,6 +71,22 @@ services: networks: - archiv-net + # --- OCR: Volume bootstrap --- + # Ensures correct ownership and directory structure on ocr_cache / ocr_models + # before ocr-service starts. Handles pre-existing volumes (including those + # created before the non-root ocr user was introduced in commit 1aca4c4a) + # and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021. 
+ ocr-volume-init: + image: alpine:3 + command: + - sh + - -c + - "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp" + volumes: + - ocr_models:/app/models + - ocr_cache:/app/cache + restart: "no" + # --- OCR: Python microservice (Surya + Kraken) --- # Single-node only: OCR training reloads the model in-process after each run. # Running multiple replicas would cause training conflicts and model-state divergence. @@ -92,6 +108,9 @@ services: HF_HOME: /app/cache XDG_CACHE_HOME: /app/cache TORCH_HOME: /app/models/torch + TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on the SSD-backed cache volume, not the 512 MB RAM tmpfs. + # /tmp keeps its small DoS cap; training ZIPs are intended to still unpack under /tmp (NOTE(review): TMPDIR also changes Python's default tempfile directory, so confirm the training code targets /tmp explicitly), + # and ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021. KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}" OCR_CONFIDENCE_THRESHOLD: "0.3" @@ -109,9 +128,13 @@ services: timeout: 5s retries: 12 start_period: 120s + depends_on: + ocr-volume-init: + condition: service_completed_successfully read_only: true tmpfs: - - /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images) + - /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly). + # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021. cap_drop: - ALL security_opt: