diff --git a/ocr-service/entrypoint.sh b/ocr-service/entrypoint.sh
index ec6892a8..104e2fbb 100644
--- a/ocr-service/entrypoint.sh
+++ b/ocr-service/entrypoint.sh
@@ -1,6 +1,13 @@
 #!/bin/bash
 set -euo pipefail
 
+# Ensure TMPDIR exists on the persistent cache volume (created by the volume-init
+# container, but guaranteed here for fresh volumes and bare docker-run usage).
+# Best-effort: Surya staging fragments orphaned by docker-kill mid-download are
+# purged once >= 2 days old (find -mtime +1) to limit cross-job leakage.
+mkdir -p "${TMPDIR:-/tmp}"
+find "${TMPDIR:-/tmp}" -mindepth 1 -mtime +1 -delete 2>/dev/null || true
+
 # Validate the blla segmentation base model and download it if missing or
 # incompatible. ketos 7 dropped support for legacy PyTorch ZIP archives —
 # this ensures the volume always holds a loadable CoreML protobuf model.
diff --git a/ocr-service/test_tmpdir.py b/ocr-service/test_tmpdir.py
new file mode 100644
index 00000000..fb55ccb4
--- /dev/null
+++ b/ocr-service/test_tmpdir.py
@@ -0,0 +1,97 @@
+"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021."""
+
+import os
+import subprocess
+import tempfile
+
+import pytest
+
+try:
+    from fastapi import HTTPException
+    from main import _validate_zip_entry
+    HAS_MAIN = True
+except ImportError:
+    HAS_MAIN = False
+
+_ENTRYPOINT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "entrypoint.sh")
+
+
+def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path):
+    """Python honours the TMPDIR env var when creating temporary directories.
+
+    Documents the mechanism that routes Surya model staging to the persistent
+    cache volume instead of the 512 MB RAM tmpfs. See ADR-021.
+    """
+    custom_tmp = tmp_path / "model_staging"
+    custom_tmp.mkdir()
+    monkeypatch.setenv("TMPDIR", str(custom_tmp))
+    monkeypatch.setattr(tempfile, "tempdir", None)  # drop cached dir so TMPDIR is re-read
+    with tempfile.TemporaryDirectory() as td:
+        assert td.startswith(str(custom_tmp))
+
+
+def test_entrypoint_creates_tmpdir(tmp_path):
+    """entrypoint.sh creates the TMPDIR directory when it does not exist.
+
+    On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint
+    must create it before uvicorn starts so the first Surya model download
+    does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021.
+    """
+    custom_tmp = tmp_path / "model-staging"
+    assert not custom_tmp.exists(), "pre-condition: directory must not exist yet"
+
+    stub_bin = tmp_path / "stub_bin"
+    stub_bin.mkdir()
+    for name in ("python3", "uvicorn"):  # no-op stubs shadow the real binaries on PATH
+        stub = stub_bin / name
+        stub.write_text("#!/bin/sh\nexit 0\n")
+        stub.chmod(0o755)
+
+    env = {
+        **os.environ,
+        "TMPDIR": str(custom_tmp),
+        "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}",
+    }
+    result = subprocess.run(
+        ["bash", _ENTRYPOINT],
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+    assert custom_tmp.exists(), (
+        f"entrypoint.sh did not create TMPDIR={custom_tmp}\n"
+        f"stdout: {result.stdout}\nstderr: {result.stderr}"
+    )
+
+
+@pytest.mark.skipif(
+    not os.path.isdir("/app/cache"),
+    reason="TMPDIR contract only enforced inside the OCR Docker container",
+)
+def test_tmpdir_is_inside_persistent_cache_volume():
+    """TMPDIR must point to the persistent cache volume, not a RAM tmpfs.
+
+    Catches accidental reversion to /tmp or any tmpfs-backed path.
+    Gated on the /app/cache mount rather than TMPDIR, so a bad TMPDIR fails.
+    See ADR-021.
+    """
+    tmpdir = os.environ.get("TMPDIR", "")
+    assert tmpdir == "/app/cache" or tmpdir.startswith("/app/cache/"), (
+        f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads "
+        "to the SSD-backed cache volume — see ADR-021"
+    )
+
+
+@pytest.mark.skipif(not HAS_MAIN, reason="requires full ML stack (not available in CI)")
+def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path):
+    """_validate_zip_entry rejects path-traversal when extract_dir is under a custom TMPDIR.
+
+    When TMPDIR=/app/cache/.tmp, extraction dirs live under that path.
+    Verifies os.path.realpath() still anchors correctly against the non-default base.
+    """
+    extract_dir = tmp_path / "model-staging" / "tmpXXX"
+    extract_dir.mkdir(parents=True)
+
+    with pytest.raises(HTTPException) as exc_info:
+        _validate_zip_entry("../evil.py", str(extract_dir))
+    assert exc_info.value.status_code == 400