fix(ocr): add TMPDIR env var and ocr-volume-init service to compose files
Setting TMPDIR=/app/cache/.tmp routes Surya model staging to the SSD-backed cache volume instead of the 512 MB /tmp tmpfs. The one-shot ocr-volume-init service runs before ocr-service, fixes ownership of the OCR volumes (uid 1000), and creates /app/cache/.tmp on fresh volumes. This makes AC #6 ("fresh volume still works") a permanent infrastructure-as-code guarantee rather than a manual chown step. Both docker-compose.yml and docker-compose.prod.yml are updated in the same commit to prevent the silent drift that occurred with the 512 MB tmpfs comment.

Fixes #614. See ADR-021.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -128,6 +128,22 @@ services:
|
|||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
|
# --- OCR: Volume bootstrap ---
|
||||||
|
# Ensures correct ownership and directory structure on ocr-cache / ocr-models
|
||||||
|
# before ocr-service starts. Handles pre-existing volumes (including those
|
||||||
|
# created before the non-root ocr user was introduced in commit 1aca4c4a)
|
||||||
|
# and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
|
||||||
|
ocr-volume-init:
|
||||||
|
image: alpine:3
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- -c
|
||||||
|
- "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
|
||||||
|
volumes:
|
||||||
|
- ocr-models:/app/models
|
||||||
|
- ocr-cache:/app/cache
|
||||||
|
restart: "no"
|
||||||
|
|
||||||
ocr-service:
|
ocr-service:
|
||||||
build:
|
build:
|
||||||
context: ./ocr-service
|
context: ./ocr-service
|
||||||
@@ -147,6 +163,9 @@ services:
|
|||||||
HF_HOME: /app/cache
|
HF_HOME: /app/cache
|
||||||
XDG_CACHE_HOME: /app/cache
|
XDG_CACHE_HOME: /app/cache
|
||||||
TORCH_HOME: /app/models/torch
|
TORCH_HOME: /app/models/torch
|
||||||
|
TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
|
||||||
|
# /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
|
||||||
|
# and ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
|
||||||
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
||||||
TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
|
TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
|
||||||
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
||||||
@@ -164,9 +183,13 @@ services:
|
|||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 12
|
retries: 12
|
||||||
start_period: 120s
|
start_period: 120s
|
||||||
|
depends_on:
|
||||||
|
ocr-volume-init:
|
||||||
|
condition: service_completed_successfully
|
||||||
read_only: true
|
read_only: true
|
||||||
tmpfs:
|
tmpfs:
|
||||||
- /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
|
- /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
|
||||||
|
# GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
|
||||||
cap_drop:
|
cap_drop:
|
||||||
- ALL
|
- ALL
|
||||||
security_opt:
|
security_opt:
|
||||||
|
|||||||
@@ -71,6 +71,22 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- archiv-net
|
- archiv-net
|
||||||
|
|
||||||
|
# --- OCR: Volume bootstrap ---
|
||||||
|
# Ensures correct ownership and directory structure on ocr_cache / ocr_models
|
||||||
|
# before ocr-service starts. Handles pre-existing volumes (including those
|
||||||
|
# created before the non-root ocr user was introduced in commit 1aca4c4a)
|
||||||
|
# and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
|
||||||
|
ocr-volume-init:
|
||||||
|
image: alpine:3
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- -c
|
||||||
|
- "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
|
||||||
|
volumes:
|
||||||
|
- ocr_models:/app/models
|
||||||
|
- ocr_cache:/app/cache
|
||||||
|
restart: "no"
|
||||||
|
|
||||||
# --- OCR: Python microservice (Surya + Kraken) ---
|
# --- OCR: Python microservice (Surya + Kraken) ---
|
||||||
# Single-node only: OCR training reloads the model in-process after each run.
|
# Single-node only: OCR training reloads the model in-process after each run.
|
||||||
# Running multiple replicas would cause training conflicts and model-state divergence.
|
# Running multiple replicas would cause training conflicts and model-state divergence.
|
||||||
@@ -92,6 +108,9 @@ services:
|
|||||||
HF_HOME: /app/cache
|
HF_HOME: /app/cache
|
||||||
XDG_CACHE_HOME: /app/cache
|
XDG_CACHE_HOME: /app/cache
|
||||||
TORCH_HOME: /app/models/torch
|
TORCH_HOME: /app/models/torch
|
||||||
|
TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
|
||||||
|
# /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
|
||||||
|
# and ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
|
||||||
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
||||||
TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
|
TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
|
||||||
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
||||||
@@ -109,9 +128,13 @@ services:
|
|||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 12
|
retries: 12
|
||||||
start_period: 120s
|
start_period: 120s
|
||||||
|
depends_on:
|
||||||
|
ocr-volume-init:
|
||||||
|
condition: service_completed_successfully
|
||||||
read_only: true
|
read_only: true
|
||||||
tmpfs:
|
tmpfs:
|
||||||
- /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
|
- /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
|
||||||
|
# GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
|
||||||
cap_drop:
|
cap_drop:
|
||||||
- ALL
|
- ALL
|
||||||
security_opt:
|
security_opt:
|
||||||
|
|||||||
Reference in New Issue
Block a user