engines/kraken.py:
- Add _SenderModelRegistry with LRU eviction (max size configurable via the OCR_MAX_CACHED_MODELS env var), double-checked locking, invalidate(), and a path whitelist (/app/models/ only)
- Add _load_sender_model() helper for testability
- extract_page_blocks() and extract_region_text() accept an optional sender_model_path and route to the sender registry when it is provided

models.py:
- OcrRequest gains a senderModelPath: str | None = None field

main.py:
- /ocr and /ocr/stream pass request.senderModelPath to the Kraken engine
- New /train-sender endpoint: validates output_model_path, runs ketos train with the base model as the starting point, and invalidates the sender cache

docker-compose.yml:
- Add OCR_MAX_CACHED_MODELS: "5" to the ocr-service environment

test_sender_registry.py:
- 4 tests: cache hit, LRU eviction, invalidate, path traversal guard

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
66 lines
2.5 KiB
Python
66 lines
2.5 KiB
Python
"""Tests for the per-sender model LRU registry in engines/kraken.py."""
|
|
from unittest.mock import MagicMock, call, patch
|
|
|
|
import pytest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _make_registry(max_size=5):
    """Build a fresh ``_SenderModelRegistry`` for a single test.

    Args:
        max_size: Forwarded unchanged as the registry's ``max_size``
            keyword argument.

    Returns:
        A newly constructed ``_SenderModelRegistry`` instance.
    """
    # The import lives inside the function, so ``engines.kraken`` is only
    # imported when a test actually asks for a registry.
    from engines.kraken import _SenderModelRegistry as registry_cls

    fresh_registry = registry_cls(max_size=max_size)
    return fresh_registry
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_cache_hit_returns_same_object():
    """Looking up one path twice yields the same object from a single load."""
    sender_path = "/app/models/sender_a.mlmodel"
    cache = _make_registry()
    fake_model = MagicMock(name="model_a")

    # Replace the loader so nothing is read from disk and calls are counted.
    loader_patch = patch(
        "engines.kraken._load_sender_model", return_value=fake_model
    )
    with loader_patch as load_spy:
        first_lookup = cache.get_model(sender_path)
        second_lookup = cache.get_model(sender_path)

    # Identity, not equality: both lookups must hand back the very same object.
    assert first_lookup is second_lookup
    # Two lookups, one load: the second call was answered by the cache.
    load_spy.assert_called_once()
|
|
|
|
|
|
def test_lru_eviction_removes_least_recently_used():
    """When the cache exceeds max_size, the least-recently-used entry is evicted.

    The test runs in two phases against a registry capped at two entries:

    1. Load a, b, c in order. The third load overflows the cache and 'a'
       (never touched again) must be evicted.
    2. Re-read 'b' and then load 'd'. Insertion order alone would evict 'b'
       here; an LRU cache must evict 'c' instead, because the re-read made
       'b' the more recently used entry.

    Phase 2 is what separates LRU from plain FIFO eviction. It assumes that
    a cache hit in ``get_model`` refreshes the entry's recency, which is what
    the "LRU" contract of the registry implies.
    """
    registry = _make_registry(max_size=2)

    path_a = "/app/models/sender_a.mlmodel"
    path_b = "/app/models/sender_b.mlmodel"
    path_c = "/app/models/sender_c.mlmodel"
    path_d = "/app/models/sender_d.mlmodel"

    def _side_effect(path):
        # A distinct mock per path so every load yields a different object.
        return MagicMock(name=path)

    with patch("engines.kraken._load_sender_model", side_effect=_side_effect):
        # Phase 1: overflow by pure insertion.
        registry.get_model(path_a)
        registry.get_model(path_b)
        registry.get_model(path_c)  # should evict 'a'

        assert registry.size() == 2
        # 'a' was the least-recently-used and should be gone
        assert not registry._contains(path_a)

        # Phase 2: a cache hit must protect the entry from the next eviction.
        registry.get_model(path_b)  # hit: 'b' becomes most recently used
        registry.get_model(path_d)  # should evict 'c', not 'b'

        assert registry.size() == 2
        assert registry._contains(path_b)
        assert not registry._contains(path_c)
        assert registry._contains(path_d)
|
|
|
|
|
|
def test_invalidate_removes_entry_from_cache():
    """invalidate() must evict the entry so the next get reloads from disk.

    Both halves of that contract are checked: the entry disappears from the
    cache (size drops to 0), and a following ``get_model`` for the same path
    calls the loader a second time instead of serving a stale object.
    """
    registry = _make_registry()
    model_path = "/app/models/sender_x.mlmodel"
    mock_model = MagicMock(name="model_x")

    with patch(
        "engines.kraken._load_sender_model", return_value=mock_model
    ) as loader:
        registry.get_model(model_path)
        assert registry.size() == 1

        registry.invalidate(model_path)
        assert registry.size() == 0

        # The cache is empty again, so this lookup has to go through the
        # loader; a second loader call proves the entry was truly dropped.
        registry.get_model(model_path)
        assert loader.call_count == 2
        assert registry.size() == 1
|
|
|
|
|
|
def test_path_outside_models_dir_raises():
    """A model path outside /app/models/ is refused with a ValueError."""
    outside_path = "/etc/passwd"
    cache = _make_registry()

    # The error text has to contain "not allowed" for the match to succeed.
    expect_rejection = pytest.raises(ValueError, match="not allowed")
    with expect_rejection:
        cache.get_model(outside_path)
|