Files
familienarchiv/ocr-service/test_sender_registry.py
Marcel a146a2ec3c feat(ocr): per-sender model registry and /train-sender endpoint
engines/kraken.py:
- Add _SenderModelRegistry with LRU eviction (max configurable via
  OCR_MAX_CACHED_MODELS env var), double-checked locking, invalidate(),
  and path whitelist (/app/models/ only)
- Add _load_sender_model() helper for testability
- extract_page_blocks() and extract_region_text() accept optional
  sender_model_path; route to sender registry when provided

models.py:
- OcrRequest gains senderModelPath: str | None = None field

main.py:
- /ocr and /ocr/stream pass request.senderModelPath to Kraken engine
- New /train-sender endpoint: validates output_model_path, runs ketos
  train with base model as starting point, invalidates sender cache

docker-compose.yml:
- Add OCR_MAX_CACHED_MODELS: "5" to ocr-service environment

test_sender_registry.py:
- 4 tests: cache hit, LRU eviction, invalidate, path traversal guard

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-18 12:30:54 +02:00

66 lines
2.5 KiB
Python

"""Tests for the per-sender model LRU registry in engines/kraken.py."""
from unittest.mock import MagicMock, call, patch
import pytest
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_registry(max_size=5):
    """Return a new, empty ``_SenderModelRegistry`` capped at *max_size* entries.

    The import lives inside the function, so ``engines.kraken`` is only
    imported when a test actually builds a registry.
    """
    from engines.kraken import _SenderModelRegistry as registry_cls

    fresh_registry = registry_cls(max_size=max_size)
    return fresh_registry
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def test_cache_hit_returns_same_object():
    """Asking for the same model path twice loads once and returns one object."""
    model_path = "/app/models/sender_a.mlmodel"
    cache = _make_registry()
    fake_model = MagicMock(name="model_a")

    with patch(
        "engines.kraken._load_sender_model", return_value=fake_model
    ) as load_spy:
        first = cache.get_model(model_path)
        second = cache.get_model(model_path)

    # Identity (not equality): the second call must hand back the cached object.
    assert second is first
    # The loader ran exactly once even though get_model was called twice.
    assert load_spy.call_count == 1
def test_lru_eviction_removes_least_recently_used():
    """When the cache exceeds max_size, the oldest-accessed entry is evicted.

    The test runs in two phases:

    1. Fill a size-2 registry with 'a' and 'b', then load 'c'. 'a' must go.
       On its own this cannot tell LRU from plain FIFO, because 'a' is both
       the first inserted and the least recently accessed entry.
    2. Touch 'b' again (a cache hit) and load 'd'. Under LRU the untouched
       'c' must be evicted and 'b' must survive; a FIFO cache would drop 'b'
       instead, so this phase pins the access-order behaviour.
    """
    path_a = "/app/models/sender_a.mlmodel"
    path_b = "/app/models/sender_b.mlmodel"
    path_c = "/app/models/sender_c.mlmodel"
    path_d = "/app/models/sender_d.mlmodel"

    registry = _make_registry(max_size=2)

    def _side_effect(path):
        # A distinct mock per path so entries are distinguishable objects.
        return MagicMock(name=path)

    with patch("engines.kraken._load_sender_model", side_effect=_side_effect):
        # Phase 1: overflow the cache by one entry.
        registry.get_model(path_a)
        registry.get_model(path_b)
        registry.get_model(path_c)  # should evict 'a'

        assert registry.size() == 2
        # 'a' was the least-recently-used and should be gone
        assert not registry._contains(path_a)
        assert registry._contains(path_b)
        assert registry._contains(path_c)

        # Phase 2: a hit on 'b' makes 'c' the least recently used entry.
        registry.get_model(path_b)
        registry.get_model(path_d)  # should evict 'c', not 'b'

    assert registry.size() == 2
    assert registry._contains(path_b)
    assert not registry._contains(path_c)
    assert registry._contains(path_d)
def test_invalidate_removes_entry_from_cache():
    """invalidate() must evict the entry so the next get reloads from disk.

    Besides checking that the cache shrinks, the test requests the same path
    again after invalidation and verifies that the (patched) loader is called
    a second time, i.e. the stale entry is not served from the cache.
    """
    model_path = "/app/models/sender_x.mlmodel"
    registry = _make_registry()
    mock_model = MagicMock(name="model_x")

    with patch(
        "engines.kraken._load_sender_model", return_value=mock_model
    ) as loader:
        registry.get_model(model_path)
        assert registry.size() == 1
        assert loader.call_count == 1

        registry.invalidate(model_path)
        assert registry.size() == 0

        # The follow-up get must go back to the loader, not to the cache.
        registry.get_model(model_path)

    assert loader.call_count == 2
    assert registry.size() == 1
def test_path_outside_models_dir_raises():
    """A model path outside /app/models/ is refused with a ValueError.

    This is the path traversal guard: the error message is expected to
    contain the text "not allowed".
    """
    forbidden_path = "/etc/passwd"
    cache = _make_registry()

    expect_rejection = pytest.raises(ValueError, match="not allowed")
    with expect_rejection:
        cache.get_model(forbidden_path)