Files
familienarchiv/ocr-service/test_tmpdir.py
Marcel c2bd1b34f0 refactor(ocr): extract _validate_zip_entry to utils.py so ZIP Slip test runs in CI
_validate_zip_entry has no ML-stack dependency; importing it via main.py
pulled in surya/torch and caused the test to be skipped in CI. Moving it
to utils.py (fastapi only) and adding fastapi to the CI lightweight install
lets test_zipslip_still_anchors_under_custom_tmpdir run on every push.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 11:17:15 +02:00

93 lines
3.2 KiB
Python

"""Tests for TMPDIR configuration and entrypoint mkdir behavior — ADR-021."""
import os
import subprocess
import tempfile
import pytest
from fastapi import HTTPException
from utils import _validate_zip_entry
# Absolute path of the entrypoint.sh script that sits in the same directory
# as this test module; the entrypoint test below runs it through bash.
_ENTRYPOINT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "entrypoint.sh")
)
def test_tempfile_uses_tmpdir_when_set(monkeypatch, tmp_path):
    """Check that ``tempfile`` places new directories under ``$TMPDIR``.

    This documents the mechanism that routes Surya model staging to the
    persistent cache volume instead of the 512 MB RAM tmpfs. See ADR-021.
    """
    staging_root = tmp_path / "model_staging"
    staging_root.mkdir()

    # tempfile caches the directory it resolved in ``tempfile.tempdir``;
    # clearing it makes the next call consult the environment again.
    monkeypatch.setattr(tempfile, "tempdir", None)
    monkeypatch.setenv("TMPDIR", str(staging_root))

    with tempfile.TemporaryDirectory() as created:
        lives_under_staging = created.startswith(str(staging_root))
        assert lives_under_staging
def test_entrypoint_creates_tmpdir(tmp_path):
    """entrypoint.sh creates the TMPDIR directory when it does not exist.

    On a fresh ocr_cache volume, /app/cache/.tmp is absent. The entrypoint
    must create it before uvicorn starts so the first Surya model download
    does not exhaust the 512 MB /tmp tmpfs (ENOSPC). See ADR-021.

    The script is run through bash with TMPDIR pointing at a not-yet-existing
    path; the test passes when that path is a directory afterwards.
    """
    custom_tmp = tmp_path / "model-staging"
    assert not custom_tmp.exists(), "pre-condition: directory must not exist yet"

    # No-op stand-ins placed first on PATH, so any python3/uvicorn invocation
    # made by the script resolves to a stub that exits immediately.
    stub_bin = tmp_path / "stub_bin"
    stub_bin.mkdir()
    for name in ("python3", "uvicorn"):
        stub = stub_bin / name
        stub.write_text("#!/bin/sh\nexit 0\n")
        stub.chmod(0o755)

    env = {
        **os.environ,
        "TMPDIR": str(custom_tmp),
        "PATH": f"{stub_bin}:{os.environ.get('PATH', '/usr/bin:/bin')}",
    }
    result = subprocess.run(
        ["bash", _ENTRYPOINT],
        env=env,
        capture_output=True,
        text=True,
        # Bound the run: if the script blocks (e.g. a command is not covered
        # by the stubs), fail with TimeoutExpired instead of hanging the
        # whole test session.
        timeout=30,
        # A non-zero exit is not raised; stdout/stderr are surfaced through
        # the assertion message below.
        check=False,
    )

    # is_dir() rather than exists(): a regular file at this path would not
    # satisfy the "creates the TMPDIR directory" contract.
    assert custom_tmp.is_dir(), (
        f"entrypoint.sh did not create TMPDIR={custom_tmp}\n"
        f"stdout: {result.stdout}\nstderr: {result.stderr}"
    )
@pytest.mark.skipif(
    # Gate on the cache directory being present, NOT on the value of TMPDIR:
    # skipping whenever TMPDIR is outside /app/cache would skip the test in
    # exactly the cases where the assertion below fails, so it could never
    # report a regression.
    # NOTE(review): the existence of /app/cache is used as the signal for
    # "running inside the OCR container" — confirm this matches the image.
    not os.path.isdir("/app/cache"),
    reason="TMPDIR contract only enforced inside the OCR Docker container",
)
def test_tmpdir_is_inside_persistent_cache_volume():
    """TMPDIR must point to the persistent cache volume, not a RAM tmpfs.

    Catches accidental reversion to /tmp or any tmpfs-backed path, including
    TMPDIR being unset. Runs only where the /app/cache directory exists,
    which is expected inside the OCR Docker container where
    TMPDIR=/app/cache/.tmp. See ADR-021.
    """
    # .get() so an unset TMPDIR produces the descriptive failure below
    # instead of a KeyError.
    tmpdir = os.environ.get("TMPDIR", "")
    # Match on a path-component boundary so a sibling such as
    # "/app/cache-old" is not accepted.
    under_cache = tmpdir == "/app/cache" or tmpdir.startswith("/app/cache/")
    assert under_cache, (
        f"TMPDIR={tmpdir!r} must be under /app/cache to route model downloads "
        "to the SSD-backed cache volume — see ADR-021"
    )
def test_zipslip_still_anchors_under_custom_tmpdir(tmp_path):
    """A path-traversal ZIP entry is rejected when extract_dir is under a custom TMPDIR.

    With TMPDIR=/app/cache/.tmp, extraction directories live under that path
    rather than the default temp location. This test builds such a nested
    extraction directory and expects _validate_zip_entry to raise an
    HTTPException with status 400 for an entry that climbs out of it.
    """
    staging_root = tmp_path / "model-staging"
    extract_dir = staging_root / "tmpXXX"
    extract_dir.mkdir(parents=True)

    # Entry name that would resolve to the parent of extract_dir.
    traversal_entry = "../evil.py"

    with pytest.raises(HTTPException) as exc_info:
        _validate_zip_entry(traversal_entry, str(extract_dir))

    rejection = exc_info.value
    assert rejection.status_code == 400