fix(ocr): route Surya model staging to SSD via TMPDIR + add volume-init service #615
@@ -148,7 +148,10 @@ jobs:
|
|||||||
path: frontend/test-results/screenshots/
|
path: frontend/test-results/screenshots/
|
||||||
|
|
||||||
# ─── OCR Service Unit Tests ───────────────────────────────────────────────────
|
# ─── OCR Service Unit Tests ───────────────────────────────────────────────────
|
||||||
# Only spell_check.py, test_confidence.py, test_sender_registry.py — no ML stack required.
|
# Only stdlib/lightweight tests — no ML stack (PyTorch/Surya/Kraken) required.
|
||||||
|
# test_tmpdir.py covers the TMPDIR env var and entrypoint mkdir behaviour (ADR-021).
|
||||||
|
# test_tmpdir_is_inside_persistent_cache_volume is skipped in CI (TMPDIR not
|
||||||
|
# set to /app/cache here); it runs inside the deployed Docker container.
|
||||||
ocr-tests:
|
ocr-tests:
|
||||||
name: OCR Service Tests
|
name: OCR Service Tests
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@@ -160,11 +163,11 @@ jobs:
|
|||||||
python-version: '3.11'
|
python-version: '3.11'
|
||||||
|
|
||||||
- name: Install test dependencies
|
- name: Install test dependencies
|
||||||
run: pip install "pyspellchecker==0.9.0" pytest pytest-asyncio
|
run: pip install "pyspellchecker==0.9.0" "fastapi==0.115.6" pytest pytest-asyncio
|
||||||
working-directory: ocr-service
|
working-directory: ocr-service
|
||||||
|
|
||||||
- name: Run OCR unit tests (no ML stack required)
|
- name: Run OCR unit tests (no ML stack required)
|
||||||
run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py -v
|
run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py test_tmpdir.py -v
|
||||||
working-directory: ocr-service
|
working-directory: ocr-service
|
||||||
|
|
||||||
# ─── Backend Unit & Slice Tests ───────────────────────────────────────────────
|
# ─── Backend Unit & Slice Tests ───────────────────────────────────────────────
|
||||||
|
|||||||
@@ -128,6 +128,23 @@ services:
|
|||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
|
# --- OCR: Volume bootstrap ---
|
||||||
|
# Ensures correct ownership and directory structure on ocr-cache / ocr-models
|
||||||
|
# before ocr-service starts. Handles pre-existing volumes (including those
|
||||||
|
# created before the non-root ocr user was introduced in commit 1aca4c4a)
|
||||||
|
# and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
|
||||||
|
ocr-volume-init:
|
||||||
|
image: alpine:3.21
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- -c
|
||||||
|
- "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
|
||||||
|
volumes:
|
||||||
|
- ocr-models:/app/models
|
||||||
|
- ocr-cache:/app/cache
|
||||||
|
networks: []
|
||||||
|
restart: "no"
|
||||||
|
|
||||||
ocr-service:
|
ocr-service:
|
||||||
build:
|
build:
|
||||||
context: ./ocr-service
|
context: ./ocr-service
|
||||||
@@ -147,6 +164,9 @@ services:
|
|||||||
HF_HOME: /app/cache
|
HF_HOME: /app/cache
|
||||||
XDG_CACHE_HOME: /app/cache
|
XDG_CACHE_HOME: /app/cache
|
||||||
TORCH_HOME: /app/models/torch
|
TORCH_HOME: /app/models/torch
|
||||||
|
TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
|
||||||
|
# /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
|
||||||
|
# but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
|
||||||
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
||||||
TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
|
TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
|
||||||
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
||||||
@@ -164,9 +184,13 @@ services:
|
|||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 12
|
retries: 12
|
||||||
start_period: 120s
|
start_period: 120s
|
||||||
|
depends_on:
|
||||||
|
ocr-volume-init:
|
||||||
|
condition: service_completed_successfully
|
||||||
read_only: true
|
read_only: true
|
||||||
tmpfs:
|
tmpfs:
|
||||||
- /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
|
- /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
|
||||||
|
# GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
|
||||||
cap_drop:
|
cap_drop:
|
||||||
- ALL
|
- ALL
|
||||||
security_opt:
|
security_opt:
|
||||||
|
|||||||
@@ -71,6 +71,23 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- archiv-net
|
- archiv-net
|
||||||
|
|
||||||
|
# --- OCR: Volume bootstrap ---
|
||||||
|
# Ensures correct ownership and directory structure on ocr_cache / ocr_models
|
||||||
|
# before ocr-service starts. Handles pre-existing volumes (including those
|
||||||
|
# created before the non-root ocr user was introduced in commit 1aca4c4a)
|
||||||
|
# and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
|
||||||
|
ocr-volume-init:
|
||||||
|
image: alpine:3.21
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- -c
|
||||||
|
- "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
|
||||||
|
volumes:
|
||||||
|
- ocr_models:/app/models
|
||||||
|
- ocr_cache:/app/cache
|
||||||
|
networks: []
|
||||||
|
restart: "no"
|
||||||
|
|
||||||
# --- OCR: Python microservice (Surya + Kraken) ---
|
# --- OCR: Python microservice (Surya + Kraken) ---
|
||||||
# Single-node only: OCR training reloads the model in-process after each run.
|
# Single-node only: OCR training reloads the model in-process after each run.
|
||||||
# Running multiple replicas would cause training conflicts and model-state divergence.
|
# Running multiple replicas would cause training conflicts and model-state divergence.
|
||||||
@@ -92,6 +109,9 @@ services:
|
|||||||
HF_HOME: /app/cache
|
HF_HOME: /app/cache
|
||||||
XDG_CACHE_HOME: /app/cache
|
XDG_CACHE_HOME: /app/cache
|
||||||
TORCH_HOME: /app/models/torch
|
TORCH_HOME: /app/models/torch
|
||||||
|
TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
|
||||||
|
# /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
|
||||||
|
# but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
|
||||||
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
||||||
TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
|
TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
|
||||||
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
||||||
@@ -109,9 +129,13 @@ services:
|
|||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 12
|
retries: 12
|
||||||
start_period: 120s
|
start_period: 120s
|
||||||
|
depends_on:
|
||||||
|
ocr-volume-init:
|
||||||
|
condition: service_completed_successfully
|
||||||
read_only: true
|
read_only: true
|
||||||
tmpfs:
|
tmpfs:
|
||||||
- /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
|
- /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
|
||||||
|
# GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
|
||||||
cap_drop:
|
cap_drop:
|
||||||
- ALL
|
- ALL
|
||||||
security_opt:
|
security_opt:
|
||||||
|
|||||||
@@ -564,6 +564,22 @@ bash scripts/download-kraken-models.sh
|
|||||||
|
|
||||||
Version-specific one-time steps that must be run before or after upgrading to a given release. Each subsection is safe to skip on a fresh install.
|
Version-specific one-time steps that must be run before or after upgrading to a given release. Each subsection is safe to skip on a fresh install.
|
||||||
|
|
||||||
|
### Upgrading to PR #615 — TMPDIR redirect + ocr-volume-init
|
||||||
|
|
||||||
|
`ocr-volume-init` is a new one-shot service in both compose files that runs before `ocr-service` on every `docker compose up`. It:
|
||||||
|
|
||||||
|
1. `chown -R 1000:1000 /app/cache /app/models` — corrects volume ownership so the non-root `ocr` user (uid 1000) can write to volumes that may have been created as root (including volumes from before PR #611).
|
||||||
|
2. `mkdir -p /app/cache/.tmp` — creates the TMPDIR staging directory that Surya uses for GB-scale model downloads. Without this directory, the first model download falls back to the 512 MB `/tmp` tmpfs and fails with ENOSPC. See ADR-021.
|
||||||
|
|
||||||
|
**Verify it succeeded:**
|
||||||
|
```bash
|
||||||
|
docker compose logs ocr-volume-init                # dev (the service sets no container_name, so address it by service name)
|
||||||
|
docker logs archiv-production-ocr-volume-init-1 # prod
|
||||||
|
```
|
||||||
|
Expected output: no error lines; exit code 0.
|
||||||
|
|
||||||
|
**Failure mode:** if `chown` is denied (e.g. the volume is mounted read-only), the container exits non-zero and `ocr-service` will not start (`depends_on: condition: service_completed_successfully`). Check `docker logs` for the `chown` error and verify the volume is writable.
|
||||||
|
|
||||||
### Upgrading to PR #611 — non-root OCR container
|
### Upgrading to PR #611 — non-root OCR container
|
||||||
|
|
||||||
The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). The existing volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup.
|
The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). The existing volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup.
|
||||||
|
|||||||
68
docs/adr/021-tmpdir-persistent-volume-staging.md
Normal file
68
docs/adr/021-tmpdir-persistent-volume-staging.md
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
# ADR-021 — Route Surya model-download staging to the persistent cache volume via TMPDIR
|
||||||
|
|
||||||
|
**Status:** Accepted
|
||||||
|
**Date:** 2026-05-18
|
||||||
|
**Issue:** #614
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
After the container hardening baseline (ADR-019), the OCR service runs with `read_only: true` and a 512 MB `/tmp` tmpfs. The tmpfs was sized for training-ZIP extraction (typically 20–50 images, well under 100 MB).
|
||||||
|
|
||||||
|
Surya's `download_directory()` (surya ≥ 0.6, `surya/common/s3.py`) stages every model file through `tempfile.TemporaryDirectory()` before moving it to the final cache location. `TemporaryDirectory()` honours `$TMPDIR` and falls back to `/tmp`. The `text_recognition` model is 1.34 GB; future Surya models will be in the same range. This blows the 512 MB budget at ~510 MB with `OSError: [Errno 28] No space left on device`.
|
||||||
|
|
||||||
|
The host has 1.8 TB free on the disk that backs `/app/cache`. The failure is a routing problem, not a capacity problem.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
Set `TMPDIR=/app/cache/.tmp` in the OCR container so all `tempfile` staging goes to the persistent SSD-backed cache volume.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# docker-compose.yml / docker-compose.prod.yml — ocr-service.environment
|
||||||
|
TMPDIR: /app/cache/.tmp
|
||||||
|
```
|
||||||
|
|
||||||
|
```dockerfile
|
||||||
|
# ocr-service/Dockerfile — default for bare docker-run usage
|
||||||
|
ENV TMPDIR=/app/cache/.tmp
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# ocr-service/entrypoint.sh — idempotent directory bootstrap
|
||||||
|
mkdir -p "${TMPDIR:-/tmp}"
|
||||||
|
find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true
|
||||||
|
```
|
||||||
|
|
||||||
|
A one-shot `ocr-volume-init` service in both compose files runs before `ocr-service` to `chown -R 1000:1000` the volumes and `mkdir -p /app/cache/.tmp`. This replaces the manual `docker run --rm alpine chown` step performed on 2026-05-18 and makes fresh-volume correctness a permanent infrastructure-as-code guarantee.
|
||||||
|
|
||||||
|
The `/tmp` tmpfs remains at 512 MB and continues to serve training-ZIP extraction and transient PDF buffers — its original purpose.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
**Positive**
|
||||||
|
|
||||||
|
- Surya model downloads complete: 1.34 GB fits on the SSD, not in 512 MB of RAM.
|
||||||
|
- `shutil.move()` from staging → cache becomes a same-filesystem `rename(2)` — atomic and near-free.
|
||||||
|
- Volume ownership is now automated; no manual `docker run --rm alpine chown` on redeploy.
|
||||||
|
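- `/tmp` retains its small 512 MB DoS cap for attacker-influenceable training endpoints (post-auth only, behind `X-Training-Token`) — provided those endpoints create their scratch directories with an explicit `dir="/tmp"`. A bare `tempfile` call now follows `$TMPDIR` onto the cache volume, where the 512 MB cap does not apply.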
|
||||||
|
- ZIP Slip protection in `_validate_zip_entry()` is unaffected — it uses `os.path.realpath()` anchored to the extraction directory regardless of where that directory lives.
|
||||||
|
|
||||||
|
**Negative / Trade-offs**
|
||||||
|
|
||||||
|
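- If the container is `docker kill`ed mid-download, partial files persist in `/app/cache/.tmp` across container restarts. Mitigated by the `find -mtime +1 -delete` in `entrypoint.sh`, which removes orphans on startup. Note that `find` measures age in whole 24-hour periods and discards the fraction, so `-mtime +1` matches files that are at least two days (48 h) old, not merely older than 24 h.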
|
||||||
|
- `TMPDIR` pointing inside a volume mount is non-obvious. Any future move of `/app/cache` to a different storage tier must revisit this setting. This ADR is the load-bearing reference.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Alternatives considered
|
||||||
|
|
||||||
|
**Approach B — Enlarge `/tmp` to 4 GB**
|
||||||
|
One-line change. Discarded because: (1) 4 GB tmpfs counts against the cgroup `mem_limit`; on CX32 hosts with `OCR_MEM_LIMIT=6g` the combined Surya resident set + tmpfs would trigger OOMKill on cold start; (2) staging GB-scale model files through RAM is using the wrong storage tier; (3) any future model larger than 4 GB requires another bump.
|
||||||
|
|
||||||
|
**Approach C — Both TMPDIR redirect and enlarged /tmp**
|
||||||
|
Belt-and-suspenders: Approach A + 1 GB tmpfs. Discarded in favour of the cleaner Approach A. The defence-in-depth benefit does not outweigh the extra compose churn; the 512 MB cap on `/tmp` is intentional.
|
||||||
@@ -5,3 +5,5 @@
|
|||||||
**LLM reminder:** the OCR service is a **single-node container** — training reloads the model in-process, so multiple replicas cause model-state divergence (see ADR-001). All job tracking and business logic stay in Spring Boot; the Python service is stateless OCR only.
|
**LLM reminder:** the OCR service is a **single-node container** — training reloads the model in-process, so multiple replicas cause model-state divergence (see ADR-001). All job tracking and business logic stay in Spring Boot; the Python service is stateless OCR only.
|
||||||
|
|
||||||
**LLM reminder:** `ALLOWED_PDF_HOSTS` must never be set to `*` — that opens SSRF. The default (`minio,localhost,127.0.0.1`) is correct for dev.
|
**LLM reminder:** `ALLOWED_PDF_HOSTS` must never be set to `*` — that opens SSRF. The default (`minio,localhost,127.0.0.1`) is correct for dev.
|
||||||
|
|
||||||
|
**LLM reminder:** `TMPDIR` points to `/app/cache/.tmp` (persistent SSD volume). Never redirect it back to `/tmp` or any RAM-backed path — `/tmp` is 512 MB and cannot stage GB-scale Surya model downloads (causes ENOSPC). The `ocr-volume-init` container creates the directory on fresh volumes; `entrypoint.sh` ensures it exists as a fallback. See ADR-021.
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ ENV HOME=/home/ocr
|
|||||||
ENV HF_HOME=/app/cache
|
ENV HF_HOME=/app/cache
|
||||||
ENV XDG_CACHE_HOME=/app/cache
|
ENV XDG_CACHE_HOME=/app/cache
|
||||||
ENV TORCH_HOME=/app/models/torch
|
ENV TORCH_HOME=/app/models/torch
|
||||||
|
ENV TMPDIR=/app/cache/.tmp
|
||||||
|
|
||||||
USER ocr
|
USER ocr
|
||||||
|
|
||||||
|
|||||||
@@ -32,6 +32,10 @@ Python FastAPI microservice that performs the actual handwritten text recognitio
|
|||||||
| `ALLOWED_PDF_HOSTS` | `minio,localhost,127.0.0.1` | YES | — | SSRF protection — comma-separated allowed PDF source hosts. Never set to `*`. |
|
| `ALLOWED_PDF_HOSTS` | `minio,localhost,127.0.0.1` | YES | — | SSRF protection — comma-separated allowed PDF source hosts. Never set to `*`. |
|
||||||
| `KRAKEN_MODEL_PATH` | `/app/models/` | — | — | Directory where Kraken HTR models are stored (populated by `download-kraken-models.sh`) |
|
| `KRAKEN_MODEL_PATH` | `/app/models/` | — | — | Directory where Kraken HTR models are stored (populated by `download-kraken-models.sh`) |
|
||||||
| `BLLA_MODEL_PATH` | `/app/models/blla.mlmodel` | — | — | Kraken baseline layout analysis model. Auto-downloaded via `ensure_blla_model.py` on startup if missing. |
|
| `BLLA_MODEL_PATH` | `/app/models/blla.mlmodel` | — | — | Kraken baseline layout analysis model. Auto-downloaded via `ensure_blla_model.py` on startup if missing. |
|
||||||
|
| `HF_HOME` | `/app/cache` | — | — | HuggingFace model cache root. Keeps model downloads on the persistent cache volume. |
|
||||||
|
| `XDG_CACHE_HOME` | `/app/cache` | — | — | XDG cache root (used by some Surya components alongside `HF_HOME`). |
|
||||||
|
| `TORCH_HOME` | `/app/models/torch` | — | — | PyTorch model cache. Kept on the persistent models volume. |
|
||||||
|
| `TMPDIR` | `/app/cache/.tmp` | — | — | Download-staging directory for GB-scale Surya model files. Must point to a disk-backed path, not the 512 MB `/tmp` tmpfs — see ADR-021. |
|
||||||
|
|
||||||
## Key files
|
## Key files
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,12 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Ensure TMPDIR exists on the persistent cache volume (created by the volume-init
|
||||||
|
# container, but guaranteed here for fresh volumes and bare docker-run usage).
|
||||||
|
# Remove stale partial downloads left by a previous docker-kill.
|
||||||
|
mkdir -p "${TMPDIR:-/tmp}"
|
||||||
|
find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true
|
||||||
|
|
||||||
# Validate the blla segmentation base model and download it if missing or
|
# Validate the blla segmentation base model and download it if missing or
|
||||||
# incompatible. ketos 7 dropped support for legacy PyTorch ZIP archives —
|
# incompatible. ketos 7 dropped support for legacy PyTorch ZIP archives —
|
||||||
# this ensures the volume always holds a loadable CoreML protobuf model.
|
# this ensures the volume always holds a loadable CoreML protobuf model.
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ from engines import kraken as kraken_engine
|
|||||||
from engines import surya as surya_engine
|
from engines import surya as surya_engine
|
||||||
from models import OcrBlock, OcrRequest
|
from models import OcrBlock, OcrRequest
|
||||||
from preprocessing import preprocess_page
|
from preprocessing import preprocess_page
|
||||||
|
from utils import _validate_zip_entry
|
||||||
|
|
||||||
TRAINING_TOKEN = os.environ.get("TRAINING_TOKEN", "")
|
TRAINING_TOKEN = os.environ.get("TRAINING_TOKEN", "")
|
||||||
KRAKEN_MODEL_PATH = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
|
KRAKEN_MODEL_PATH = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
|
||||||
@@ -291,14 +292,6 @@ def _check_training_token(x_training_token: str | None) -> None:
|
|||||||
raise HTTPException(status_code=403, detail="Invalid or missing X-Training-Token")
|
raise HTTPException(status_code=403, detail="Invalid or missing X-Training-Token")
|
||||||
|
|
||||||
|
|
||||||
def _validate_zip_entry(name: str, extract_dir: str) -> None:
|
|
||||||
"""Reject ZIP Slip attacks: path traversal and absolute paths."""
|
|
||||||
if os.path.isabs(name) or name.startswith(".."):
|
|
||||||
raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}")
|
|
||||||
resolved = os.path.realpath(os.path.join(extract_dir, name))
|
|
||||||
if not resolved.startswith(os.path.realpath(extract_dir)):
|
|
||||||
raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}")
|
|
||||||
|
|
||||||
|
|
||||||
def _rotate_backups(model_path: str, keep: int = 3) -> None:
|
def _rotate_backups(model_path: str, keep: int = 3) -> None:
|
||||||
"""Keep only the last `keep` timestamped backups of the model."""
|
"""Keep only the last `keep` timestamped backups of the model."""
|
||||||
|
|||||||
151
ocr-service/test_tmpdir.py
Normal file
151
ocr-service/test_tmpdir.py
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from fastapi import HTTPException
|
||||||
|
from utils import _validate_zip_entry
|
||||||
|
|
||||||
|
_ENTRYPOINT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "entrypoint.sh")
|
||||||
|
|
||||||
|
|
||||||
|
def _run_entrypoint(tmpdir, tmp_path):
    """Execute entrypoint.sh in a subprocess with TMPDIR pointing at ``tmpdir``.

    No-op ``python3`` and ``uvicorn`` stubs are written to ``tmp_path/stub_bin``
    and that directory is placed first on PATH, so the script resolves the
    stubs instead of the real binaries.

    Returns the ``subprocess.CompletedProcess`` with stdout/stderr captured as text.
    """
    stub_dir = tmp_path / "stub_bin"
    stub_dir.mkdir(exist_ok=True)
    for tool in ("python3", "uvicorn"):
        script = stub_dir / tool
        script.write_text("#!/bin/sh\nexit 0\n")
        script.chmod(0o755)

    # Start from the caller's environment and override only what the test controls.
    inherited_path = os.environ.get("PATH", "/usr/bin:/bin")
    child_env = dict(os.environ)
    child_env["TMPDIR"] = str(tmpdir)
    child_env["PATH"] = f"{stub_dir}:{inherited_path}"

    return subprocess.run(
        ["bash", _ENTRYPOINT],
        env=child_env,
        capture_output=True,
        text=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path):
    """tempfile creates temporary directories under $TMPDIR when it is set.

    This is the mechanism ADR-021 relies on to route Surya model staging to
    the persistent cache volume rather than the 512 MB RAM tmpfs.
    """
    staging_root = tmp_path / "model_staging"
    staging_root.mkdir()

    # Reset tempfile's module-level default so the patched TMPDIR is re-read
    # instead of a directory resolved earlier in the test session.
    monkeypatch.setattr(tempfile, "tempdir", None)
    monkeypatch.setenv("TMPDIR", str(staging_root))

    with tempfile.TemporaryDirectory() as created:
        assert created.startswith(str(staging_root))
|
||||||
|
|
||||||
|
|
||||||
|
def test_entrypoint_creates_tmpdir(tmp_path):
    """entrypoint.sh creates the TMPDIR directory when it does not exist.

    On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint
    must create it before uvicorn starts so the first Surya model download
    does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021.
    """
    custom_tmp = tmp_path / "model-staging"
    assert not custom_tmp.exists(), "pre-condition: directory must not exist yet"

    # Use the shared helper (stubbed python3/uvicorn, TMPDIR override) rather
    # than duplicating its setup, so all entrypoint tests run the script the
    # same way.
    result = _run_entrypoint(custom_tmp, tmp_path)

    assert result.returncode == 0, (
        f"entrypoint.sh exited {result.returncode}\n"
        f"stdout: {result.stdout}\nstderr: {result.stderr}"
    )
    # is_dir() rather than exists(): TMPDIR must be a directory, not just any path.
    assert custom_tmp.is_dir(), (
        f"entrypoint.sh did not create TMPDIR={custom_tmp}\n"
        f"stdout: {result.stdout}\nstderr: {result.stderr}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# The skip decision must not depend on TMPDIR itself: if it did, a TMPDIR that
# was reverted to /tmp would cause the test to be skipped rather than fail.
# Presence of the /app/cache mount point is used as the "inside the OCR
# container" signal instead.
# NOTE(review): assumes /app/cache exists only inside the OCR image — confirm.
@pytest.mark.skipif(
    not os.path.isdir("/app/cache"),
    reason="TMPDIR contract only enforced inside the OCR Docker container",
)
def test_tmpdir_is_inside_persistent_cache_volume():
    """TMPDIR must point to the persistent cache volume, not a RAM tmpfs.

    Catches accidental reversion to /tmp or any tmpfs-backed path, including
    TMPDIR being unset altogether.
    Runs only where /app/cache exists, i.e. inside the OCR Docker container.
    To run manually: docker exec archiv-ocr python -m pytest test_tmpdir.py::test_tmpdir_is_inside_persistent_cache_volume -v
    See ADR-021.
    """
    # .get() so an unset TMPDIR fails the assertion below with the explanatory
    # message instead of raising KeyError.
    tmpdir = os.environ.get("TMPDIR", "")
    # Match on a path-component boundary so "/app/cache-old/.tmp" is rejected.
    assert tmpdir == "/app/cache" or tmpdir.startswith("/app/cache/"), (
        f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads "
        "to the SSD-backed cache volume — see ADR-021"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_entrypoint_removes_day_old_orphans(tmp_path):
    """entrypoint.sh purges stale partial downloads from TMPDIR on startup.

    Reproduces a file abandoned by a docker-kill mid-download: its mtime is
    pushed two days into the past with os.utime(), the entrypoint is run, and
    the file must no longer exist afterwards. See ADR-021.
    """
    staging_dir = tmp_path / "staging"
    staging_dir.mkdir()

    orphan = staging_dir / "model.safetensors.partial"
    orphan.write_bytes(b"partial download")
    backdated = time.time() - 2 * 24 * 3600
    os.utime(orphan, (backdated, backdated))

    outcome = _run_entrypoint(staging_dir, tmp_path)

    assert outcome.returncode == 0, f"entrypoint.sh exited {outcome.returncode}\nstderr: {outcome.stderr}"
    assert not orphan.exists(), "day-old orphan should have been deleted by entrypoint.sh"
|
||||||
|
|
||||||
|
|
||||||
|
def test_entrypoint_preserves_fresh_files(tmp_path):
    """entrypoint.sh leaves recently modified files in TMPDIR untouched.

    A file written just now (mtime left at its creation time) stands in for an
    in-progress download; the orphan cleanup must not remove it. See ADR-021.
    """
    staging_dir = tmp_path / "staging"
    staging_dir.mkdir()

    in_flight = staging_dir / "model.safetensors.part"
    in_flight.write_bytes(b"in progress")  # mtime stays "now" on purpose

    outcome = _run_entrypoint(staging_dir, tmp_path)

    assert outcome.returncode == 0, f"entrypoint.sh exited {outcome.returncode}\nstderr: {outcome.stderr}"
    assert in_flight.exists(), "recent file should not have been deleted by entrypoint.sh"
|
||||||
|
|
||||||
|
|
||||||
|
def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path):
    """_validate_zip_entry rejects path traversal when extract_dir is under a custom TMPDIR.

    When TMPDIR=/app/cache/.tmp, extraction dirs live under that path.
    Verifies os.path.realpath() still anchors correctly against the non-default base.
    """
    extract_dir = tmp_path / "model-staging" / "tmpXXX"
    extract_dir.mkdir(parents=True)

    # "../evil.py" is stopped by the leading-".." guard before any path is
    # resolved. "subdir/../../evil.py" passes that guard and is rejected only
    # by the realpath anchoring, which is what this test is meant to cover.
    for entry in ("../evil.py", "subdir/../../evil.py"):
        with pytest.raises(HTTPException) as exc_info:
            _validate_zip_entry(entry, str(extract_dir))
        assert exc_info.value.status_code == 400
|
||||||
14
ocr-service/utils.py
Normal file
14
ocr-service/utils.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
"""Utility functions shared across the OCR service with no ML-stack imports."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from fastapi import HTTPException
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_zip_entry(name: str, extract_dir: str) -> None:
    """Reject ZIP Slip attacks: path traversal and absolute paths.

    Args:
        name: Member name as stored in the ZIP archive.
        extract_dir: Directory the archive is being extracted into.

    Raises:
        HTTPException: 400 if ``name`` is absolute, starts with ``..``, or
            resolves to a location outside ``extract_dir``.
    """
    if os.path.isabs(name) or name.startswith(".."):
        raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}")

    root = os.path.realpath(extract_dir)
    resolved = os.path.realpath(os.path.join(extract_dir, name))
    # Compare on a path-component boundary, not a raw string prefix: with a
    # plain startswith(root) check, "a/../../tmpXXX_evil/f" would be accepted
    # for root ".../tmpXXX" because the sibling directory shares the prefix.
    # os.path.join(root, "") yields root with exactly one trailing separator.
    if resolved != root and not resolved.startswith(os.path.join(root, "")):
        raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}")
|
||||||
Reference in New Issue
Block a user