"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021."""

import os
import subprocess
import tempfile
import time

import pytest
from fastapi import HTTPException

from utils import _validate_zip_entry

_ENTRYPOINT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "entrypoint.sh")


def _run_entrypoint(tmpdir, tmp_path):
    """Run entrypoint.sh with TMPDIR set to tmpdir; python3/uvicorn are stubbed out.

    Args:
        tmpdir: Directory path exported to the script as ``TMPDIR``.
        tmp_path: Scratch directory (typically pytest's ``tmp_path``) under
            which a ``stub_bin`` directory holding the stub executables is
            created.

    Returns:
        The ``subprocess.CompletedProcess`` of the ``bash entrypoint.sh`` run,
        with stdout and stderr captured as text.
    """
    stub_bin = tmp_path / "stub_bin"
    stub_bin.mkdir(exist_ok=True)
    for name in ("python3", "uvicorn"):
        stub = stub_bin / name
        stub.write_text("#!/bin/sh\nexit 0\n")
        stub.chmod(0o755)
    env = {
        **os.environ,
        "TMPDIR": str(tmpdir),
        # The stub directory is listed first so the no-op stubs shadow any
        # real python3/uvicorn found later on PATH.
        "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}",
    }
    return subprocess.run(["bash", _ENTRYPOINT], env=env, capture_output=True, text=True)


def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path):
    """Python honours the TMPDIR env var when creating temporary directories.

    Documents the mechanism that routes Surya model staging to the persistent
    cache volume instead of the 512 MB RAM tmpfs. See ADR-021.
    """
    custom_tmp = tmp_path / "model_staging"
    custom_tmp.mkdir()
    monkeypatch.setenv("TMPDIR", str(custom_tmp))
    # tempfile caches its chosen directory in ``tempfile.tempdir``; resetting
    # it to None forces the environment to be consulted again.
    monkeypatch.setattr(tempfile, "tempdir", None)
    with tempfile.TemporaryDirectory() as td:
        assert td.startswith(str(custom_tmp))


def test_entrypoint_creates_tmpdir(tmp_path):
    """entrypoint.sh creates the TMPDIR directory when it does not exist.

    On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint
    must create it before uvicorn starts so the first Surya model download
    does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021.
    """
    custom_tmp = tmp_path / "model-staging"
    assert not custom_tmp.exists(), "pre-condition: directory must not exist yet"

    # Same stubbed invocation as the other entrypoint tests in this module.
    result = _run_entrypoint(custom_tmp, tmp_path)

    assert result.returncode == 0, (
        f"entrypoint.sh exited {result.returncode}\n"
        f"stdout: {result.stdout}\nstderr: {result.stderr}"
    )
    assert custom_tmp.exists(), (
        f"entrypoint.sh did not create TMPDIR={custom_tmp}\n"
        f"stdout: {result.stdout}\nstderr: {result.stderr}"
    )


@pytest.mark.skipif(
    not os.environ.get("TMPDIR", "").startswith("/app/cache"),
    reason="TMPDIR contract only enforced inside the OCR Docker container",
)
def test_tmpdir_is_inside_persistent_cache_volume():
    """TMPDIR must point to the persistent cache volume, not a RAM tmpfs.

    Runs only when TMPDIR already starts with ``/app/cache`` (the OCR Docker
    container sets TMPDIR=/app/cache/.tmp). Within that guard it rejects
    look-alike paths that merely share the prefix, such as ``/app/cache-old``
    or ``/app/cache/../tmp``.

    NOTE(review): because of the skip guard, a TMPDIR reverted to ``/tmp``
    makes this test skip rather than fail; detecting that reversion would
    need a container marker other than TMPDIR itself.

    To run manually:
        docker exec archiv-ocr python -m pytest test_tmpdir.py::test_tmpdir_is_inside_persistent_cache_volume -v

    See ADR-021.
    """
    tmpdir = os.environ["TMPDIR"]
    cache_root = "/app/cache"
    # Normalise first so ".." segments cannot escape the volume, then compare
    # on a path-component boundary instead of a raw string prefix.
    normalized = os.path.normpath(tmpdir)
    inside_cache = normalized == cache_root or normalized.startswith(cache_root + "/")
    assert inside_cache, (
        f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads "
        "to the SSD-backed cache volume — see ADR-021"
    )


def test_entrypoint_removes_day_old_orphans(tmp_path):
    """entrypoint.sh deletes partial downloads older than 1 day from TMPDIR.

    Simulates a file left behind by a docker-kill mid-download: backdate its
    mtime by 2 days using os.utime(), run the entrypoint, assert it is gone.
    See ADR-021.
    """
    staging = tmp_path / "staging"
    staging.mkdir()
    stale_file = staging / "model.safetensors.partial"
    stale_file.write_bytes(b"partial download")
    two_days_ago = time.time() - 2 * 24 * 3600
    os.utime(stale_file, (two_days_ago, two_days_ago))

    result = _run_entrypoint(staging, tmp_path)

    assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}"
    assert not stale_file.exists(), "day-old orphan should have been deleted by entrypoint.sh"


def test_entrypoint_preserves_fresh_files(tmp_path):
    """entrypoint.sh does not delete files newer than 1 day from TMPDIR.

    An in-progress download whose mtime is recent must survive the orphan
    cleanup so a concurrent or just-started model fetch is not interrupted.
    See ADR-021.
    """
    staging = tmp_path / "staging"
    staging.mkdir()
    fresh_file = staging / "model.safetensors.part"
    fresh_file.write_bytes(b"in progress")
    # mtime is now — no os.utime() call needed

    result = _run_entrypoint(staging, tmp_path)

    assert result.returncode == 0, f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}"
    assert fresh_file.exists(), "recent file should not have been deleted by entrypoint.sh"


def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path):
    """_validate_zip_entry rejects path-traversal when extract_dir is under a custom TMPDIR.

    When TMPDIR=/app/cache/.tmp, extraction dirs live under that path.
    Verifies os.path.realpath() still anchors correctly against the
    non-default base.
    """
    extract_dir = tmp_path / "model-staging" / "tmpXXX"
    extract_dir.mkdir(parents=True)
    with pytest.raises(HTTPException) as exc_info:
        _validate_zip_entry("../evil.py", str(extract_dir))
    # Checked after the context manager exits: statements following the
    # raising call inside the ``with`` body would never run.
    assert exc_info.value.status_code == 400