fix(ocr): create TMPDIR on startup and clear day-old orphans

On a fresh ocr_cache volume, /app/cache/.tmp does not exist yet. The mkdir
creates it so that the first Surya model download can proceed without hitting
ENOSPC on the 512 MB /tmp tmpfs. The find cleanup removes fragments left
behind when a docker kill interrupts a download, preventing cross-job
ground-truth leakage.

Fixes #614. See ADR-021.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-18 10:54:17 +02:00
parent 09a043431e
commit 240b373f68
2 changed files with 104 additions and 0 deletions

View File

@@ -0,0 +1,97 @@
"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021."""
import os
import subprocess
import tempfile
import pytest
# The application module and FastAPI are optional here: when either one
# cannot be imported, the tests that need them are skipped via HAS_MAIN.
try:
    from fastapi import HTTPException
    from main import _validate_zip_entry
except ImportError:
    HAS_MAIN = False
else:
    HAS_MAIN = True

# Absolute path of the entrypoint.sh that sits next to this test module.
_ENTRYPOINT = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "entrypoint.sh",
)
def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path):
    """tempfile creates temporary directories under the path named by TMPDIR.

    This is the mechanism ADR-021 relies on to route Surya model staging to
    the persistent cache volume instead of the 512 MB RAM tmpfs.
    """
    staging_root = tmp_path / "model_staging"
    staging_root.mkdir()

    # Clear tempfile's cached default so the next lookup re-reads the
    # environment instead of reusing a directory chosen earlier.
    monkeypatch.setattr(tempfile, "tempdir", None)
    monkeypatch.setenv("TMPDIR", str(staging_root))

    expected_prefix = str(staging_root)
    with tempfile.TemporaryDirectory() as created:
        assert created.startswith(expected_prefix)
def test_entrypoint_creates_tmpdir(tmp_path):
    """entrypoint.sh creates the TMPDIR directory when it does not exist.

    On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint
    must create it before uvicorn starts so the first Surya model download
    does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021.

    The script is executed with bash while ``python3`` and ``uvicorn`` are
    shadowed by no-op stubs placed first on PATH, so the test does not start
    a real server. The run is bounded by a timeout so that a command the
    stubs do not cover cannot hang the test suite.
    """
    custom_tmp = tmp_path / "model-staging"
    assert not custom_tmp.exists(), "pre-condition: directory must not exist yet"

    # Stub executables that exit immediately with status 0.
    stub_bin = tmp_path / "stub_bin"
    stub_bin.mkdir()
    for name in ("python3", "uvicorn"):
        stub = stub_bin / name
        stub.write_text("#!/bin/sh\nexit 0\n")
        stub.chmod(0o755)

    # Inherit the caller's environment, but point TMPDIR at the missing
    # directory and put the stub directory ahead of everything else on PATH.
    env = {
        **os.environ,
        "TMPDIR": str(custom_tmp),
        "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}",
    }
    result = subprocess.run(
        ["bash", _ENTRYPOINT],
        env=env,
        capture_output=True,
        text=True,
        check=False,  # the exit status is not part of this test's contract
        timeout=60,  # fail instead of blocking forever if the script hangs
    )

    # is_dir() rather than exists(): a regular file at this path would not
    # be usable as a temporary directory.
    assert custom_tmp.is_dir(), (
        f"entrypoint.sh did not create TMPDIR={custom_tmp}\n"
        f"stdout: {result.stdout}\nstderr: {result.stderr}"
    )
@pytest.mark.skipif(
    not os.environ.get("TMPDIR", "").startswith("/app/cache"),
    reason="TMPDIR contract only enforced inside the OCR Docker container",
)
def test_tmpdir_is_inside_persistent_cache_volume():
    """TMPDIR must point inside the persistent cache volume, not a RAM tmpfs.

    Runs only when TMPDIR already starts with ``/app/cache`` (the value used
    inside the OCR Docker container is /app/cache/.tmp). See ADR-021.

    The check is made on whole path components after lexical normalisation,
    so values that merely share the prefix (``/app/cache-old``) or escape it
    with ``..`` (``/app/cache/../tmp``) are rejected.

    NOTE(review): because the skip guard keys off the same prefix, a full
    reversion such as TMPDIR=/tmp causes this test to be skipped rather than
    to fail; detecting that would need an independent "inside the container"
    signal.
    """
    cache_root = "/app/cache"
    tmpdir = os.environ["TMPDIR"]

    # normpath collapses "." / ".." segments and duplicate separators so the
    # comparison below sees the path the value actually denotes.
    normalized = os.path.normpath(tmpdir)
    inside_cache = normalized == cache_root or normalized.startswith(cache_root + "/")

    assert inside_cache, (
        f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads "
        "to the SSD-backed cache volume — see ADR-021"
    )
@pytest.mark.skipif(not HAS_MAIN, reason="requires full ML stack (not available in CI)")
def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path):
    """A path-traversal entry is rejected when extract_dir is under a custom TMPDIR.

    With TMPDIR=/app/cache/.tmp, extraction directories live below that path.
    This passes such a non-default base directory to _validate_zip_entry
    together with a ``../`` entry name and expects an HTTP 400.

    NOTE(review): the original docstring says this verifies that
    os.path.realpath() anchoring still works; the helper's implementation is
    not visible from this module, so confirm that against main.py.
    """
    staging_dir = tmp_path / "model-staging" / "tmpXXX"
    staging_dir.mkdir(parents=True)

    traversal_entry = "../evil.py"
    with pytest.raises(HTTPException) as rejected:
        _validate_zip_entry(traversal_entry, str(staging_dir))

    # Checked after the with-block: statements following the raising call
    # inside the block would never run.
    assert rejected.value.status_code == 400