fix(ocr): route Surya model staging to SSD via TMPDIR + add volume-init service #615

Merged
marcel merged 10 commits from feat/issue-614-tmpdir-persistent-volume into main 2026-05-18 11:32:37 +02:00
2 changed files with 48 additions and 2 deletions
Showing only changes of commit 1f7b08b74f - Show all commits

View File

@@ -128,6 +128,22 @@ services:
timeout: 5s
retries: 5
# --- OCR: Volume bootstrap ---
# Ensures correct ownership and directory structure on ocr-cache / ocr-models
# before ocr-service starts. Handles pre-existing volumes (including those
# created before the non-root ocr user was introduced in commit 1aca4c4a)
# and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
ocr-volume-init:
image: alpine:3
command:
- sh
- -c
- "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
volumes:
- ocr-models:/app/models
- ocr-cache:/app/cache
restart: "no"
ocr-service:
build:
context: ./ocr-service
@@ -147,6 +163,9 @@ services:
HF_HOME: /app/cache
XDG_CACHE_HOME: /app/cache
TORCH_HOME: /app/models/torch
TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
# /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
# but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
OCR_CONFIDENCE_THRESHOLD: "0.3"
@@ -164,9 +183,13 @@ services:
timeout: 5s
retries: 12
start_period: 120s
depends_on:
ocr-volume-init:
condition: service_completed_successfully
read_only: true
tmpfs:
- /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (2050 images)
- /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
# GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
cap_drop:
- ALL
security_opt:

View File

@@ -71,6 +71,22 @@ services:
networks:
- archiv-net
# --- OCR: Volume bootstrap ---
# Ensures correct ownership and directory structure on ocr_cache / ocr_models
# before ocr-service starts. Handles pre-existing volumes (including those
# created before the non-root ocr user was introduced in commit 1aca4c4a)
# and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
ocr-volume-init:
image: alpine:3
command:
- sh
- -c
- "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
volumes:
- ocr_models:/app/models
- ocr_cache:/app/cache
restart: "no"
# --- OCR: Python microservice (Surya + Kraken) ---
# Single-node only: OCR training reloads the model in-process after each run.
# Running multiple replicas would cause training conflicts and model-state divergence.
@@ -92,6 +108,9 @@ services:
HF_HOME: /app/cache
XDG_CACHE_HOME: /app/cache
TORCH_HOME: /app/models/torch
TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
# /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
# but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
OCR_CONFIDENCE_THRESHOLD: "0.3"
@@ -109,9 +128,13 @@ services:
timeout: 5s
retries: 12
start_period: 120s
depends_on:
ocr-volume-init:
condition: service_completed_successfully
read_only: true
tmpfs:
- /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (2050 images)
- /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
# GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
cap_drop:
- ALL
security_opt: