From 1f7b08b74f6180d315b1af89211dd82586f2bd61 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Mon, 18 May 2026 10:56:10 +0200
Subject: [PATCH] fix(ocr): add TMPDIR env var and ocr-volume-init service to
 compose files

TMPDIR=/app/cache/.tmp routes Surya model staging to the SSD-backed cache
volume instead of the 512 MB /tmp tmpfs. The ocr-volume-init one-shot service
runs first to ensure correct ownership (uid 1000) and creates /app/cache/.tmp
on fresh volumes, making AC #6 ("fresh volume still works") a permanent
infrastructure-as-code guarantee rather than a manual chown step.

Both docker-compose.yml and docker-compose.prod.yml are updated in the same
commit to prevent the silent drift that occurred with the 512 MB tmpfs comment.

Fixes #614. See ADR-021.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docker-compose.prod.yml | 25 ++++++++++++++++++++++++-
 docker-compose.yml      | 25 ++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index dbae6e9a..517d7a98 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -128,6 +128,22 @@ services:
       timeout: 5s
       retries: 5
 
+  # --- OCR: Volume bootstrap ---
+  # One-shot job that must finish before ocr-service starts. It creates
+  # /app/cache/.tmp (TMPDIR staging) and hands ocr-cache / ocr-models to uid 1000,
+  # repairing volumes that predate the non-root ocr user (commit 1aca4c4a).
+  # chown -h re-owns links themselves, so a dangling link cannot fail the job.
+  ocr-volume-init:
+    image: alpine:3
+    command:
+      - sh
+      - -c
+      - "mkdir -p /app/cache/.tmp && chown -hR 1000:1000 /app/cache /app/models"
+    volumes:
+      - ocr-models:/app/models
+      - ocr-cache:/app/cache
+    restart: "no"  # one-shot: do not restart after it exits. See ADR-021.
+
   ocr-service:
     build:
       context: ./ocr-service
@@ -147,6 +163,9 @@ services:
       HF_HOME: /app/cache
       XDG_CACHE_HOME: /app/cache
       TORCH_HOME: /app/models/torch
+      TMPDIR: /app/cache/.tmp       # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
+                                    # /tmp keeps its small DoS cap. Training ZIPs still unpack under /tmp,
+                                    # and ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
       KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
       TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
       OCR_CONFIDENCE_THRESHOLD: "0.3"
@@ -164,9 +183,13 @@ services:
       timeout: 5s
       retries: 12
       start_period: 120s
+    depends_on:
+      ocr-volume-init:
+        condition: service_completed_successfully
     read_only: true
     tmpfs:
-      - /tmp:size=512m   # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
+      - /tmp:size=512m   # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
+                         # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
     cap_drop:
       - ALL
     security_opt:
diff --git a/docker-compose.yml b/docker-compose.yml
index 91f8bbda..7ebf907a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -71,6 +71,22 @@ services:
     networks:
       - archiv-net
 
+  # --- OCR: Volume bootstrap ---
+  # One-shot job that must finish before ocr-service starts. It creates
+  # /app/cache/.tmp (TMPDIR staging) and hands ocr_cache / ocr_models to uid 1000,
+  # repairing volumes that predate the non-root ocr user (commit 1aca4c4a).
+  # chown -h re-owns links themselves, so a dangling link cannot fail the job.
+  ocr-volume-init:
+    image: alpine:3
+    command:
+      - sh
+      - -c
+      - "mkdir -p /app/cache/.tmp && chown -hR 1000:1000 /app/cache /app/models"
+    volumes:
+      - ocr_models:/app/models
+      - ocr_cache:/app/cache
+    restart: "no"  # one-shot: do not restart after it exits. See ADR-021.
+
   # --- OCR: Python microservice (Surya + Kraken) ---
   # Single-node only: OCR training reloads the model in-process after each run.
   # Running multiple replicas would cause training conflicts and model-state divergence.
@@ -92,6 +108,9 @@ services:
       HF_HOME: /app/cache
       XDG_CACHE_HOME: /app/cache
       TORCH_HOME: /app/models/torch
+      TMPDIR: /app/cache/.tmp       # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
+                                    # /tmp keeps its small DoS cap. Training ZIPs still unpack under /tmp,
+                                    # and ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
       KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
       TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
       OCR_CONFIDENCE_THRESHOLD: "0.3"
@@ -109,9 +128,13 @@ services:
       timeout: 5s
       retries: 12
       start_period: 120s
+    depends_on:
+      ocr-volume-init:
+        condition: service_completed_successfully
     read_only: true
     tmpfs:
-      - /tmp:size=512m   # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
+      - /tmp:size=512m   # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
+                         # GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
     cap_drop:
       - ALL
     security_opt: