fix(ocr): route Surya model staging to SSD via TMPDIR + add volume-init service #615
@@ -148,7 +148,10 @@ jobs:
|
||||
path: frontend/test-results/screenshots/
|
||||
|
||||
# ─── OCR Service Unit Tests ───────────────────────────────────────────────────
|
||||
# Only spell_check.py, test_confidence.py, test_sender_registry.py — no ML stack required.
|
||||
# Only stdlib/lightweight tests — no ML stack (PyTorch/Surya/Kraken) required.
|
||||
# test_tmpdir.py covers the TMPDIR env var and entrypoint mkdir behaviour (ADR-021).
|
||||
# test_tmpdir_is_inside_persistent_cache_volume is skipped in CI (TMPDIR not
|
||||
# set to /app/cache here); it runs inside the deployed Docker container.
|
||||
ocr-tests:
|
||||
name: OCR Service Tests
|
||||
runs-on: ubuntu-latest
|
||||
@@ -160,11 +163,11 @@ jobs:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install test dependencies
|
||||
run: pip install "pyspellchecker==0.9.0" pytest pytest-asyncio
|
||||
run: pip install "pyspellchecker==0.9.0" "fastapi==0.115.6" pytest pytest-asyncio
|
||||
working-directory: ocr-service
|
||||
|
||||
- name: Run OCR unit tests (no ML stack required)
|
||||
run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py -v
|
||||
run: python -m pytest test_spell_check.py test_confidence.py test_sender_registry.py test_tmpdir.py -v
|
||||
working-directory: ocr-service
|
||||
|
||||
# ─── Backend Unit & Slice Tests ───────────────────────────────────────────────
|
||||
|
||||
@@ -128,6 +128,23 @@ services:
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
# --- OCR: Volume bootstrap ---
|
||||
# Ensures correct ownership and directory structure on ocr-cache / ocr-models
|
||||
# before ocr-service starts. Handles pre-existing volumes (including those
|
||||
# created before the non-root ocr user was introduced in commit 1aca4c4a)
|
||||
# and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
|
||||
ocr-volume-init:
|
||||
image: alpine:3.21
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
|
||||
volumes:
|
||||
- ocr-models:/app/models
|
||||
- ocr-cache:/app/cache
|
||||
networks: []
|
||||
restart: "no"
|
||||
|
||||
ocr-service:
|
||||
build:
|
||||
context: ./ocr-service
|
||||
@@ -147,6 +164,9 @@ services:
|
||||
HF_HOME: /app/cache
|
||||
XDG_CACHE_HOME: /app/cache
|
||||
TORCH_HOME: /app/models/torch
|
||||
TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
|
||||
# /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
|
||||
# but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
|
||||
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
||||
TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
|
||||
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
||||
@@ -164,9 +184,13 @@ services:
|
||||
timeout: 5s
|
||||
retries: 12
|
||||
start_period: 120s
|
||||
depends_on:
|
||||
ocr-volume-init:
|
||||
condition: service_completed_successfully
|
||||
read_only: true
|
||||
tmpfs:
|
||||
- /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
|
||||
- /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
|
||||
# GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
|
||||
cap_drop:
|
||||
- ALL
|
||||
security_opt:
|
||||
|
||||
@@ -71,6 +71,23 @@ services:
|
||||
networks:
|
||||
- archiv-net
|
||||
|
||||
# --- OCR: Volume bootstrap ---
|
||||
# Ensures correct ownership and directory structure on ocr_cache / ocr_models
|
||||
# before ocr-service starts. Handles pre-existing volumes (including those
|
||||
# created before the non-root ocr user was introduced in commit 1aca4c4a)
|
||||
# and guarantees /app/cache/.tmp exists for TMPDIR staging. See ADR-021.
|
||||
ocr-volume-init:
|
||||
image: alpine:3.21
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "chown -R 1000:1000 /app/cache /app/models && mkdir -p /app/cache/.tmp && chown 1000:1000 /app/cache/.tmp"
|
||||
volumes:
|
||||
- ocr_models:/app/models
|
||||
- ocr_cache:/app/cache
|
||||
networks: []
|
||||
restart: "no"
|
||||
|
||||
# --- OCR: Python microservice (Surya + Kraken) ---
|
||||
# Single-node only: OCR training reloads the model in-process after each run.
|
||||
# Running multiple replicas would cause training conflicts and model-state divergence.
|
||||
@@ -92,6 +109,9 @@ services:
|
||||
HF_HOME: /app/cache
|
||||
XDG_CACHE_HOME: /app/cache
|
||||
TORCH_HOME: /app/models/torch
|
||||
TMPDIR: /app/cache/.tmp # Stage GB-scale Surya model downloads on SSD, not the 512 MB RAM tmpfs.
|
||||
# /tmp keeps its small DoS cap; training ZIPs still unpack under /tmp
|
||||
# but ZIP Slip protection (_validate_zip_entry) is unchanged. See ADR-021.
|
||||
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
||||
TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
|
||||
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
||||
@@ -109,9 +129,13 @@ services:
|
||||
timeout: 5s
|
||||
retries: 12
|
||||
start_period: 120s
|
||||
depends_on:
|
||||
ocr-volume-init:
|
||||
condition: service_completed_successfully
|
||||
read_only: true
|
||||
tmpfs:
|
||||
- /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
|
||||
- /tmp:size=512m # training-ZIP unzip + transient PDF buffers only (small, RAM-friendly).
|
||||
# GB-scale model downloads go to TMPDIR=/app/cache/.tmp instead. See ADR-021.
|
||||
cap_drop:
|
||||
- ALL
|
||||
security_opt:
|
||||
|
||||
@@ -564,6 +564,22 @@ bash scripts/download-kraken-models.sh
|
||||
|
||||
Version-specific one-time steps that must be run before or after upgrading to a given release. Each subsection is safe to skip on a fresh install.
|
||||
|
||||
### Upgrading to PR #615 — TMPDIR redirect + ocr-volume-init
|
||||
|
||||
`ocr-volume-init` is a new one-shot service in both compose files that runs before `ocr-service` on every `docker compose up`. It:
|
||||
|
||||
1. `chown -R 1000:1000 /app/cache /app/models` — corrects volume ownership so the non-root `ocr` user (uid 1000) can write to volumes that may have been created as root (including volumes from before PR #611).
|
||||
2. `mkdir -p /app/cache/.tmp` — creates the TMPDIR staging directory that Surya uses for GB-scale model downloads. Without this directory, the first model download falls back to the 512 MB `/tmp` tmpfs and fails with ENOSPC. See ADR-021.
|
||||
|
||||
**Verify it succeeded:**
|
||||
```bash
|
||||
docker logs archiv-ocr-volume-init # dev
|
||||
docker logs archiv-production-ocr-volume-init-1 # prod
|
||||
```
|
||||
Expected output: no error lines; exit code 0.
|
||||
|
||||
**Failure mode:** if `chown` is denied (e.g. the volume is mounted read-only), the container exits non-zero and `ocr-service` will not start (`depends_on: condition: service_completed_successfully`). Check `docker logs` for the `chown` error and verify the volume is writable.
|
||||
|
||||
### Upgrading to PR #611 — non-root OCR container
|
||||
|
||||
The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). The existing volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup.
|
||||
|
||||
68
docs/adr/021-tmpdir-persistent-volume-staging.md
Normal file
68
docs/adr/021-tmpdir-persistent-volume-staging.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# ADR-021 — Route Surya model-download staging to the persistent cache volume via TMPDIR
|
||||
|
||||
**Status:** Accepted
|
||||
**Date:** 2026-05-18
|
||||
**Issue:** #614
|
||||
|
||||
---
|
||||
|
||||
## Context
|
||||
|
||||
After the container hardening baseline (ADR-019), the OCR service runs with `read_only: true` and a 512 MB `/tmp` tmpfs. The tmpfs was sized for training-ZIP extraction (typically 20–50 images, well under 100 MB).
|
||||
|
||||
Surya's `download_directory()` (surya ≥ 0.6, `surya/common/s3.py`) stages every model file through `tempfile.TemporaryDirectory()` before moving it to the final cache location. `TemporaryDirectory()` honours `$TMPDIR` and falls back to `/tmp`. The `text_recognition` model is 1.34 GB; future Surya models will be in the same range. This blows the 512 MB budget at ~510 MB with `OSError: [Errno 28] No space left on device`.
|
||||
|
||||
The host has 1.8 TB free on the disk that backs `/app/cache`. The failure is a routing problem, not a capacity problem.
|
||||
|
||||
---
|
||||
|
||||
## Decision
|
||||
|
||||
Set `TMPDIR=/app/cache/.tmp` in the OCR container so all `tempfile` staging goes to the persistent SSD-backed cache volume.
|
||||
|
||||
```yaml
|
||||
# docker-compose.yml / docker-compose.prod.yml — ocr-service.environment
|
||||
TMPDIR: /app/cache/.tmp
|
||||
```
|
||||
|
||||
```dockerfile
|
||||
# ocr-service/Dockerfile — default for bare docker-run usage
|
||||
ENV TMPDIR=/app/cache/.tmp
|
||||
```
|
||||
|
||||
```bash
|
||||
# ocr-service/entrypoint.sh — idempotent directory bootstrap
|
||||
mkdir -p "${TMPDIR:-/tmp}"
|
||||
find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true
|
||||
```
|
||||
|
||||
A one-shot `ocr-volume-init` service in both compose files runs before `ocr-service` to `chown -R 1000:1000` the volumes and `mkdir -p /app/cache/.tmp`. This replaces the manual `docker run --rm alpine chown` step performed on 2026-05-18 and makes fresh-volume correctness a permanent infrastructure-as-code guarantee.
|
||||
|
||||
The `/tmp` tmpfs remains at 512 MB and continues to serve training-ZIP extraction and transient PDF buffers — its original purpose.
|
||||
|
||||
---
|
||||
|
||||
## Consequences
|
||||
|
||||
**Positive**
|
||||
|
||||
- Surya model downloads complete: 1.34 GB fits on the SSD, not in 512 MB of RAM.
|
||||
- `shutil.move()` from staging → cache becomes a same-filesystem `rename(2)` — atomic and near-free.
|
||||
- Volume ownership is now automated; no manual `docker run --rm alpine chown` on redeploy.
|
||||
- `/tmp` retains its small 512 MB DoS cap for attacker-influenceable training endpoints (post-auth only, behind `X-Training-Token`).
|
||||
- ZIP Slip protection in `_validate_zip_entry()` is unaffected — it uses `os.path.realpath()` anchored to the extraction directory regardless of where that directory lives.
|
||||
|
||||
**Negative / Trade-offs**
|
||||
|
||||
- If the container is `docker kill`ed mid-download, partial files persist in `/app/cache/.tmp` across container restarts. Mitigated by the `find -mtime +1 -delete` in `entrypoint.sh` — orphans older than one day are removed on startup.
|
||||
- `TMPDIR` pointing inside a volume mount is non-obvious. Any future move of `/app/cache` to a different storage tier must revisit this setting. This ADR is the load-bearing reference.
|
||||
|
||||
---
|
||||
|
||||
## Alternatives considered
|
||||
|
||||
**Approach B — Enlarge `/tmp` to 4 GB**
|
||||
One-line change. Discarded because: (1) 4 GB tmpfs counts against the cgroup `mem_limit`; on CX32 hosts with `OCR_MEM_LIMIT=6g` the combined Surya resident set + tmpfs would trigger OOMKill on cold start; (2) staging GB-scale model files through RAM is using the wrong storage tier; (3) any future model larger than 4 GB requires another bump.
|
||||
|
||||
**Approach C — Both TMPDIR redirect and enlarged /tmp**
|
||||
Belt-and-suspenders: Approach A + 1 GB tmpfs. Discarded in favour of the cleaner Approach A. The defence-in-depth benefit does not outweigh the extra compose churn; the 512 MB cap on `/tmp` is intentional.
|
||||
@@ -5,3 +5,5 @@
|
||||
**LLM reminder:** the OCR service is a **single-node container** — training reloads the model in-process, so multiple replicas cause model-state divergence (see ADR-001). All job tracking and business logic stay in Spring Boot; the Python service is stateless OCR only.
|
||||
|
||||
**LLM reminder:** `ALLOWED_PDF_HOSTS` must never be set to `*` — that opens SSRF. The default (`minio,localhost,127.0.0.1`) is correct for dev.
|
||||
|
||||
**LLM reminder:** `TMPDIR` points to `/app/cache/.tmp` (persistent SSD volume). Never redirect it back to `/tmp` or any RAM-backed path — `/tmp` is 512 MB and cannot stage GB-scale Surya model downloads (causes ENOSPC). The `ocr-volume-init` container creates the directory on fresh volumes; `entrypoint.sh` ensures it exists as a fallback. See ADR-021.
|
||||
|
||||
@@ -32,6 +32,7 @@ ENV HOME=/home/ocr
|
||||
ENV HF_HOME=/app/cache
|
||||
ENV XDG_CACHE_HOME=/app/cache
|
||||
ENV TORCH_HOME=/app/models/torch
|
||||
ENV TMPDIR=/app/cache/.tmp
|
||||
|
||||
USER ocr
|
||||
|
||||
|
||||
@@ -32,6 +32,10 @@ Python FastAPI microservice that performs the actual handwritten text recognitio
|
||||
| `ALLOWED_PDF_HOSTS` | `minio,localhost,127.0.0.1` | YES | — | SSRF protection — comma-separated allowed PDF source hosts. Never set to `*`. |
|
||||
| `KRAKEN_MODEL_PATH` | `/app/models/` | — | — | Directory where Kraken HTR models are stored (populated by `download-kraken-models.sh`) |
|
||||
| `BLLA_MODEL_PATH` | `/app/models/blla.mlmodel` | — | — | Kraken baseline layout analysis model. Auto-downloaded via `ensure_blla_model.py` on startup if missing. |
|
||||
| `HF_HOME` | `/app/cache` | — | — | HuggingFace model cache root. Keeps model downloads on the persistent cache volume. |
|
||||
| `XDG_CACHE_HOME` | `/app/cache` | — | — | XDG cache root (used by some Surya components alongside `HF_HOME`). |
|
||||
| `TORCH_HOME` | `/app/models/torch` | — | — | PyTorch model cache. Kept on the persistent models volume. |
|
||||
| `TMPDIR` | `/app/cache/.tmp` | — | — | Download-staging directory for GB-scale Surya model files. Must point to a disk-backed path, not the 512 MB `/tmp` tmpfs — see ADR-021. |
|
||||
|
||||
## Key files
|
||||
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# Ensure TMPDIR exists on the persistent cache volume (created by the volume-init
|
||||
# container, but guaranteed here for fresh volumes and bare docker-run usage).
|
||||
# Remove stale partial downloads left by a previous docker-kill.
|
||||
mkdir -p "${TMPDIR:-/tmp}"
|
||||
find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true
|
||||
|
||||
# Validate the blla segmentation base model and download it if missing or
|
||||
# incompatible. ketos 7 dropped support for legacy PyTorch ZIP archives —
|
||||
# this ensures the volume always holds a loadable CoreML protobuf model.
|
||||
|
||||
@@ -27,6 +27,7 @@ from engines import kraken as kraken_engine
|
||||
from engines import surya as surya_engine
|
||||
from models import OcrBlock, OcrRequest
|
||||
from preprocessing import preprocess_page
|
||||
from utils import _validate_zip_entry
|
||||
|
||||
TRAINING_TOKEN = os.environ.get("TRAINING_TOKEN", "")
|
||||
KRAKEN_MODEL_PATH = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
|
||||
@@ -291,14 +292,6 @@ def _check_training_token(x_training_token: str | None) -> None:
|
||||
raise HTTPException(status_code=403, detail="Invalid or missing X-Training-Token")
|
||||
|
||||
|
||||
def _validate_zip_entry(name: str, extract_dir: str) -> None:
|
||||
"""Reject ZIP Slip attacks: path traversal and absolute paths."""
|
||||
if os.path.isabs(name) or name.startswith(".."):
|
||||
raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}")
|
||||
resolved = os.path.realpath(os.path.join(extract_dir, name))
|
||||
if not resolved.startswith(os.path.realpath(extract_dir)):
|
||||
raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}")
|
||||
|
||||
|
||||
def _rotate_backups(model_path: str, keep: int = 3) -> None:
|
||||
"""Keep only the last `keep` timestamped backups of the model."""
|
||||
|
||||
151
ocr-service/test_tmpdir.py
Normal file
151
ocr-service/test_tmpdir.py
Normal file
@@ -0,0 +1,151 @@
|
||||
"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from fastapi import HTTPException
|
||||
from utils import _validate_zip_entry
|
||||
|
||||
_ENTRYPOINT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "entrypoint.sh")
|
||||
|
||||
|
||||
def _run_entrypoint(tmpdir, tmp_path):
|
||||
"""Run entrypoint.sh with TMPDIR set to tmpdir; python3/uvicorn are stubbed out."""
|
||||
stub_bin = tmp_path / "stub_bin"
|
||||
stub_bin.mkdir(exist_ok=True)
|
||||
for name in ("python3", "uvicorn"):
|
||||
stub = stub_bin / name
|
||||
stub.write_text("#!/bin/sh\nexit 0\n")
|
||||
stub.chmod(0o755)
|
||||
env = {
|
||||
**os.environ,
|
||||
"TMPDIR": str(tmpdir),
|
||||
"PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}",
|
||||
}
|
||||
return subprocess.run(["bash", _ENTRYPOINT], env=env, capture_output=True, text=True)
|
||||
|
||||
|
||||
def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path):
|
||||
"""Python honours the TMPDIR env var when creating temporary directories.
|
||||
|
||||
Documents the mechanism that routes Surya model staging to the persistent
|
||||
cache volume instead of the 512 MB RAM tmpfs. See ADR-021.
|
||||
"""
|
||||
custom_tmp = tmp_path / "model_staging"
|
||||
custom_tmp.mkdir()
|
||||
monkeypatch.setenv("TMPDIR", str(custom_tmp))
|
||||
monkeypatch.setattr(tempfile, "tempdir", None)
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
assert td.startswith(str(custom_tmp))
|
||||
|
||||
|
||||
def test_entrypoint_creates_tmpdir(tmp_path):
|
||||
"""entrypoint.sh creates the TMPDIR directory when it does not exist.
|
||||
|
||||
On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint
|
||||
must create it before uvicorn starts so the first Surya model download
|
||||
does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021.
|
||||
"""
|
||||
custom_tmp = tmp_path / "model-staging"
|
||||
assert not custom_tmp.exists(), "pre-condition: directory must not exist yet"
|
||||
|
||||
stub_bin = tmp_path / "stub_bin"
|
||||
stub_bin.mkdir()
|
||||
for name in ("python3", "uvicorn"):
|
||||
stub = stub_bin / name
|
||||
stub.write_text("#!/bin/sh\nexit 0\n")
|
||||
stub.chmod(0o755)
|
||||
|
||||
env = {
|
||||
**os.environ,
|
||||
"TMPDIR": str(custom_tmp),
|
||||
"PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}",
|
||||
}
|
||||
result = subprocess.run(
|
||||
["bash", _ENTRYPOINT],
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
assert result.returncode == 0, (
|
||||
f"entrypoint.sh exited {result.returncode}\n"
|
||||
f"stdout: {result.stdout}\nstderr: {result.stderr}"
|
||||
)
|
||||
assert custom_tmp.exists(), (
|
||||
f"entrypoint.sh did not create TMPDIR={custom_tmp}\n"
|
||||
f"stdout: {result.stdout}\nstderr: {result.stderr}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not os.environ.get("TMPDIR", "").startswith("/app/cache"),
|
||||
reason="TMPDIR contract only enforced inside the OCR Docker container",
|
||||
)
|
||||
def test_tmpdir_is_inside_persistent_cache_volume():
|
||||
"""TMPDIR must point to the persistent cache volume, not a RAM tmpfs.
|
||||
|
||||
Catches accidental reversion to /tmp or any tmpfs-backed path.
|
||||
Runs only inside the OCR Docker container where TMPDIR=/app/cache/.tmp.
|
||||
To run manually: docker exec archiv-ocr python -m pytest test_tmpdir.py::test_tmpdir_is_inside_persistent_cache_volume -v
|
||||
See ADR-021.
|
||||
"""
|
||||
tmpdir = os.environ["TMPDIR"]
|
||||
assert tmpdir.startswith("/app/cache"), (
|
||||
f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads "
|
||||
"to the SSD-backed cache volume — see ADR-021"
|
||||
)
|
||||
|
||||
|
||||
def test_entrypoint_removes_day_old_orphans(tmp_path):
|
||||
"""entrypoint.sh deletes partial downloads older than 1 day from TMPDIR.
|
||||
|
||||
Simulates a file left behind by a docker-kill mid-download: backdate its
|
||||
mtime by 2 days using os.utime(), run the entrypoint, assert it is gone.
|
||||
See ADR-021.
|
||||
"""
|
||||
staging = tmp_path / "staging"
|
||||
staging.mkdir()
|
||||
stale_file = staging / "model.safetensors.partial"
|
||||
stale_file.write_bytes(b"partial download")
|
||||
two_days_ago = time.time() - 2 * 24 * 3600
|
||||
os.utime(stale_file, (two_days_ago, two_days_ago))
|
||||
|
||||
result = _run_entrypoint(staging, tmp_path)
|
||||
assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}"
|
||||
assert not stale_file.exists(), "day-old orphan should have been deleted by entrypoint.sh"
|
||||
|
||||
|
||||
def test_entrypoint_preserves_fresh_files(tmp_path):
|
||||
"""entrypoint.sh does not delete files newer than 1 day from TMPDIR.
|
||||
|
||||
An in-progress download whose mtime is recent must survive the orphan
|
||||
cleanup so a concurrent or just-started model fetch is not interrupted.
|
||||
See ADR-021.
|
||||
"""
|
||||
staging = tmp_path / "staging"
|
||||
staging.mkdir()
|
||||
fresh_file = staging / "model.safetensors.part"
|
||||
fresh_file.write_bytes(b"in progress")
|
||||
# mtime is now — no os.utime() call needed
|
||||
|
||||
result = _run_entrypoint(staging, tmp_path)
|
||||
assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}"
|
||||
assert fresh_file.exists(), "recent file should not have been deleted by entrypoint.sh"
|
||||
|
||||
|
||||
def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path):
|
||||
"""_validate_zip_entry rejects path-traversal when extract_dir is under a custom TMPDIR.
|
||||
|
||||
When TMPDIR=/app/cache/.tmp, extraction dirs live under that path.
|
||||
Verifies os.path.realpath() still anchors correctly against the non-default base.
|
||||
"""
|
||||
extract_dir = tmp_path / "model-staging" / "tmpXXX"
|
||||
extract_dir.mkdir(parents=True)
|
||||
|
||||
with pytest.raises(HTTPException) as exc_info:
|
||||
_validate_zip_entry("../evil.py", str(extract_dir))
|
||||
assert exc_info.value.status_code == 400
|
||||
14
ocr-service/utils.py
Normal file
14
ocr-service/utils.py
Normal file
@@ -0,0 +1,14 @@
|
||||
"""Utility functions shared across the OCR service with no ML-stack imports."""
|
||||
|
||||
import os
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
|
||||
def _validate_zip_entry(name: str, extract_dir: str) -> None:
|
||||
"""Reject ZIP Slip attacks: path traversal and absolute paths."""
|
||||
if os.path.isabs(name) or name.startswith(".."):
|
||||
raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}")
|
||||
resolved = os.path.realpath(os.path.join(extract_dir, name))
|
||||
if not resolved.startswith(os.path.realpath(extract_dir)):
|
||||
raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}")
|
||||
Reference in New Issue
Block a user