feat(ocr): per-sender model registry and /train-sender endpoint
engines/kraken.py:
- Add _SenderModelRegistry with LRU eviction (max configurable via OCR_MAX_CACHED_MODELS env var), double-checked locking, invalidate(), and path whitelist (/app/models/ only)
- Add _load_sender_model() helper for testability
- extract_page_blocks() and extract_region_text() accept optional sender_model_path; route to sender registry when provided

models.py:
- OcrRequest gains senderModelPath: str | None = None field

main.py:
- /ocr and /ocr/stream pass request.senderModelPath to Kraken engine
- New /train-sender endpoint: validates output_model_path, runs ketos train with base model as starting point, invalidates sender cache

docker-compose.yml:
- Add OCR_MAX_CACHED_MODELS: "5" to ocr-service environment

test_sender_registry.py:
- 4 tests: cache hit, LRU eviction, invalidate, path traversal guard

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
65
ocr-service/test_sender_registry.py
Normal file
65
ocr-service/test_sender_registry.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""Tests for the per-sender model LRU registry in engines/kraken.py."""
|
||||
from unittest.mock import MagicMock, call, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_registry(max_size=5):
    """Return a new, empty ``_SenderModelRegistry`` limited to *max_size* entries.

    The ``engines.kraken`` import happens inside the function, so the module
    is only imported when a test actually builds a registry.
    """
    import engines.kraken as kraken_engine

    registry_cls = kraken_engine._SenderModelRegistry
    return registry_cls(max_size=max_size)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_cache_hit_returns_same_object():
    """Two lookups of one path must yield the identical object from one load."""
    sender_path = "/app/models/sender_a.mlmodel"
    registry = _make_registry()
    fake_model = MagicMock(name="model_a")

    with patch(
        "engines.kraken._load_sender_model", return_value=fake_model
    ) as loader:
        first = registry.get_model(sender_path)
        second = registry.get_model(sender_path)

    # Identity, not equality: the second lookup must hand back the cached object.
    assert first is second
    # The patched loader ran exactly once even though get_model ran twice.
    loader.assert_called_once()
|
||||
|
||||
|
||||
def test_lru_eviction_removes_least_recently_used():
    """When the cache exceeds max_size, the oldest-accessed entry is evicted.

    Phase 1 fills a two-slot registry with 'a' and 'b', then loads 'c' and
    expects 'a' to be dropped. On its own that cannot tell LRU from plain
    first-in-first-out eviction, so phase 2 re-reads 'b' (a cache hit) before
    loading 'd' and expects 'c' -- now the least recently used -- to go
    instead of 'b'.
    """
    path_a = "/app/models/sender_a.mlmodel"
    path_b = "/app/models/sender_b.mlmodel"
    path_c = "/app/models/sender_c.mlmodel"
    path_d = "/app/models/sender_d.mlmodel"
    registry = _make_registry(max_size=2)

    def _side_effect(path):
        # A distinct mock per path so each cache entry is its own object.
        return MagicMock(name=path)

    with patch("engines.kraken._load_sender_model", side_effect=_side_effect):
        # Phase 1: insertion order a, b, c with room for only two entries.
        registry.get_model(path_a)
        registry.get_model(path_b)
        registry.get_model(path_c)  # should evict 'a'

        assert registry.size() == 2
        # 'a' was the least-recently-used and should be gone
        assert not registry._contains(path_a)

        # Phase 2: a cache hit on 'b' must refresh its recency, so the next
        # insertion evicts 'c' rather than 'b'.
        registry.get_model(path_b)
        registry.get_model(path_d)  # should evict 'c'

    assert registry.size() == 2
    assert not registry._contains(path_c)
    assert registry._contains(path_b)
    assert registry._contains(path_d)
|
||||
|
||||
|
||||
def test_invalidate_removes_entry_from_cache():
    """invalidate() must evict the entry so the next get reloads from disk.

    Checks both halves of that contract: the registry is empty right after
    invalidate(), and a following get_model() calls the (patched) loader a
    second time instead of serving a stale cached object.
    """
    sender_path = "/app/models/sender_x.mlmodel"
    registry = _make_registry()
    mock_model = MagicMock(name="model_x")

    with patch(
        "engines.kraken._load_sender_model", return_value=mock_model
    ) as loader:
        registry.get_model(sender_path)
        assert registry.size() == 1

        registry.invalidate(sender_path)
        assert registry.size() == 0

        # The "next get" promised by the docstring: it must be a cache miss
        # that goes back through the loader and repopulates the registry.
        registry.get_model(sender_path)
        assert loader.call_count == 2
        assert registry.size() == 1
|
||||
|
||||
|
||||
def test_path_outside_models_dir_raises():
    """A path that is not under /app/models/ must be refused by get_model."""
    outside_path = "/etc/passwd"
    registry = _make_registry()

    # NOTE(review): only an absolute path outside the directory is exercised
    # here; a "/app/models/../..." style input is not covered by this test.
    expected_rejection = pytest.raises(ValueError, match="not allowed")
    with expected_rejection:
        registry.get_model(outside_path)
|
||||
Reference in New Issue
Block a user