Files
familienarchiv/ocr-service/test_tmpdir.py
Marcel 775b5c062e test(ocr): add orphan cleanup behavior tests for entrypoint.sh find -mtime
test_entrypoint_removes_day_old_orphans and test_entrypoint_preserves_fresh_files
verify the find -mtime +1 -delete logic using os.utime() to fabricate old mtimes
without mocking system time. Also extracts _run_entrypoint helper to remove
subprocess setup duplication.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 11:19:33 +02:00

151 lines
5.4 KiB
Python

"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021."""
import os
import subprocess
import tempfile
import time
import pytest
from fastapi import HTTPException
from utils import _validate_zip_entry
# Absolute path of the entrypoint.sh that sits in the same directory as this
# test module; resolved from __file__ so it does not depend on the working
# directory pytest is started from.
_ENTRYPOINT = os.path.join(
    os.path.abspath(os.path.dirname(__file__)),
    "entrypoint.sh",
)
def _run_entrypoint(tmpdir, tmp_path, *, timeout=30.0):
    """Run entrypoint.sh under bash with TMPDIR set to *tmpdir*.

    No-op executable stubs named ``python3`` and ``uvicorn`` are written to
    ``tmp_path / "stub_bin"`` and that directory is placed first on PATH, so
    PATH lookups of those two commands resolve to the stubs.

    Args:
        tmpdir: Path exported to the script as TMPDIR (converted with str()).
        tmp_path: Existing scratch directory (pathlib-style) that receives
            the ``stub_bin`` directory.
        timeout: Seconds to wait for the script. Without a limit, a script
            that ends up in a real long-running process would hang the whole
            test session instead of failing.

    Returns:
        The subprocess.CompletedProcess of the run, with stdout and stderr
        captured as text.

    Raises:
        subprocess.TimeoutExpired: If the script does not exit within
            *timeout* seconds.
    """
    stub_bin = tmp_path / "stub_bin"
    # exist_ok: the helper may be called more than once with the same tmp_path.
    stub_bin.mkdir(exist_ok=True)
    for name in ("python3", "uvicorn"):
        stub = stub_bin / name
        stub.write_text("#!/bin/sh\nexit 0\n")
        stub.chmod(0o755)

    env = {
        **os.environ,
        "TMPDIR": str(tmpdir),
        # Stub directory goes first so it shadows any real interpreter/server.
        "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}",
    }
    return subprocess.run(
        ["bash", _ENTRYPOINT],
        env=env,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path):
    """tempfile places new temporary directories under $TMPDIR when it is set.

    Documents the mechanism that routes Surya model staging to the persistent
    cache volume instead of the 512 MB RAM tmpfs. See ADR-021.
    """
    staging_root = tmp_path / "model_staging"
    staging_root.mkdir()
    expected_prefix = str(staging_root)

    monkeypatch.setenv("TMPDIR", expected_prefix)
    # tempfile remembers its chosen directory in the module-level ``tempdir``
    # variable; resetting it to None makes the next call consult TMPDIR again.
    monkeypatch.setattr(tempfile, "tempdir", None)

    with tempfile.TemporaryDirectory() as created:
        assert created.startswith(expected_prefix)
def test_entrypoint_creates_tmpdir(tmp_path):
    """entrypoint.sh creates the TMPDIR directory when it does not exist.

    On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint
    must create it before uvicorn starts so the first Surya model download
    does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021.
    """
    custom_tmp = tmp_path / "model-staging"
    assert not custom_tmp.exists(), "pre-condition: directory must not exist yet"

    # The stub binaries, environment and bash invocation live in the shared
    # helper so every entrypoint test runs the script the same way.
    result = _run_entrypoint(custom_tmp, tmp_path)

    assert result.returncode == 0, (
        f"entrypoint.sh exited {result.returncode}\n"
        f"stdout: {result.stdout}\nstderr: {result.stderr}"
    )
    # is_dir() rather than exists(): TMPDIR has to be a directory, not merely
    # some filesystem entry with that name.
    assert custom_tmp.is_dir(), (
        f"entrypoint.sh did not create TMPDIR={custom_tmp}\n"
        f"stdout: {result.stdout}\nstderr: {result.stderr}"
    )
@pytest.mark.skipif(
    not os.environ.get("TMPDIR", "").startswith("/app/cache"),
    reason="TMPDIR contract only enforced inside the OCR Docker container",
)
def test_tmpdir_is_inside_persistent_cache_volume():
    """TMPDIR must point to the persistent cache volume, not a RAM tmpfs.

    Runs only when TMPDIR already begins with "/app/cache" (the OCR Docker
    container sets TMPDIR=/app/cache/.tmp). See ADR-021.

    The check compares normalized path components instead of a raw string
    prefix, so look-alike values that pass the skip guard are still rejected:
    a sibling such as "/app/cache-old/.tmp", or a path that escapes the
    volume such as "/app/cache/../../tmp".

    NOTE(review): because of the skip guard this test does not run at all
    when TMPDIR is, for example, "/tmp", so it cannot catch that reversion by
    itself; confirm the container environment is verified elsewhere.
    """
    cache_root = "/app/cache"
    tmpdir = os.environ["TMPDIR"]
    # normpath collapses ".." segments lexically before the comparison.
    normalized = os.path.normpath(tmpdir)
    assert os.path.commonpath([cache_root, normalized]) == cache_root, (
        f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads "
        "to the SSD-backed cache volume — see ADR-021"
    )
def test_entrypoint_removes_day_old_orphans(tmp_path):
    """entrypoint.sh deletes stale partial downloads from TMPDIR.

    Simulates a file left behind by a docker-kill mid-download: its mtime is
    backdated with os.utime() (no system-clock mocking), the entrypoint is
    run, and the file must be gone afterwards. See ADR-021.

    The file is aged three days rather than exactly two. The cleanup is
    expected to be ``find -mtime +1 -delete`` (entrypoint.sh itself is not
    part of this module); ``-mtime +1`` compares whole 24-hour periods with
    the remainder discarded, so it only matches files that are at least
    48 hours old. Backdating by exactly 48 hours would sit on that boundary
    and depend on timestamp rounding and clock adjustments.
    """
    staging = tmp_path / "staging"
    staging.mkdir()
    stale_file = staging / "model.safetensors.partial"
    stale_file.write_bytes(b"partial download")

    # Comfortably past the 48-hour threshold described above.
    three_days_ago = time.time() - 3 * 24 * 3600
    os.utime(stale_file, (three_days_ago, three_days_ago))

    result = _run_entrypoint(staging, tmp_path)

    assert result.returncode == 0, (
        f"entrypoint.sh exited {result.returncode}\nstderr: {result.stderr}"
    )
    assert not stale_file.exists(), (
        "day-old orphan should have been deleted by entrypoint.sh"
    )
def test_entrypoint_preserves_fresh_files(tmp_path):
    """entrypoint.sh leaves recently modified files in TMPDIR untouched.

    An in-progress download whose mtime is recent must survive the orphan
    cleanup so a concurrent or just-started model fetch is not interrupted.
    See ADR-021.
    """
    tmpdir_under_test = tmp_path / "staging"
    tmpdir_under_test.mkdir()

    in_flight = tmpdir_under_test / "model.safetensors.part"
    # Written just now, so its mtime is already current; no backdating needed.
    in_flight.write_bytes(b"in progress")

    completed = _run_entrypoint(tmpdir_under_test, tmp_path)
    exit_code = completed.returncode

    assert exit_code == 0, (
        f"entrypoint.sh exited {exit_code}\nstderr: {completed.stderr}"
    )
    assert in_flight.exists(), (
        "recent file should not have been deleted by entrypoint.sh"
    )
def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path):
    """_validate_zip_entry rejects path traversal below a custom TMPDIR.

    When TMPDIR=/app/cache/.tmp, extraction dirs live under that path. The
    validator must still refuse an entry that climbs out of the extraction
    directory when that directory sits under a non-default base (the
    original note attributes the anchoring to os.path.realpath(); the
    validator's implementation is not visible from this module).
    """
    target_dir = tmp_path / "model-staging" / "tmpXXX"
    target_dir.mkdir(parents=True)
    traversal_entry = "../evil.py"

    with pytest.raises(HTTPException) as raised:
        _validate_zip_entry(traversal_entry, str(target_dir))

    # Checked after the with-block: a statement placed after the raising call
    # inside the block would never execute.
    assert raised.value.status_code == 400