feat(nlp-service): spaCy model loading with get_nlp/load_all_models

This commit is contained in:
Marcel
2026-06-07 10:17:07 +02:00
committed by marcel
parent 8e63867ad8
commit 482a1c2863
2 changed files with 74 additions and 0 deletions

33
nlp-service/extractor.py Normal file
View File

@@ -0,0 +1,33 @@
from __future__ import annotations
import re
from datetime import date
import dateparser
import spacy
from spacy.language import Language
from models import ParseResponse
# ── Language model registry ──────────────────────────────────────────────────
# Supported language codes mapped to the spaCy model name that get_nlp()
# passes to spacy.load() for that language. A code missing from this table is
# rejected by get_nlp() with a ValueError.
_MODEL_NAMES: dict[str, str] = {
    "de": "de_core_news_sm",
    "en": "en_core_web_sm",
    "es": "es_core_news_sm",
}
# Already-loaded pipelines keyed by language code. get_nlp() fills this on the
# first request for a language so later calls reuse the same Language object.
_nlp_cache: dict[str, Language] = {}
def get_nlp(lang: str) -> Language:
    """Return the spaCy pipeline for *lang*, loading it on first use.

    The model name is looked up in ``_MODEL_NAMES``. The loaded pipeline is
    stored in ``_nlp_cache`` so subsequent calls for the same language return
    the same ``Language`` object without calling ``spacy.load`` again.

    Args:
        lang: Language code; must be a key of ``_MODEL_NAMES``.

    Returns:
        The loaded (and now cached) spaCy ``Language`` pipeline.

    Raises:
        ValueError: If *lang* is not a key of ``_MODEL_NAMES``.
    """
    model_name = _MODEL_NAMES.get(lang)
    if model_name is None:
        raise ValueError(f"Unsupported language: {lang!r}. Valid: {list(_MODEL_NAMES)}")

    # Cache hit: hand back the pipeline loaded by an earlier call.
    if lang in _nlp_cache:
        return _nlp_cache[lang]

    pipeline = spacy.load(model_name)
    _nlp_cache[lang] = pipeline
    return pipeline
def load_all_models() -> None:
    """Load every model listed in ``_MODEL_NAMES`` through ``get_nlp``.

    Each language code is passed to ``get_nlp`` in registry order, which loads
    the model if it is not cached yet. Any exception raised by ``get_nlp``
    propagates and stops the loop at that language.
    """
    for language_code in _MODEL_NAMES:
        get_nlp(language_code)

View File

@@ -31,3 +31,44 @@ def test_parse_response_serializes_nulls():
assert data["dateFrom"] is None
assert data["dateTo"] == "1920-12-31"
assert data["personRole"] == "sender"
# ── Model loading ────────────────────────────────────────────────────────────
@pytest.fixture(scope="session")
def nlp_de():
    """Provide the pipeline returned by ``get_nlp("de")``, once per session."""
    # Imported inside the fixture, so `extractor` is only imported when a test
    # actually requests this fixture.
    import extractor

    return extractor.get_nlp("de")
@pytest.fixture(scope="session")
def nlp_en():
    """Provide the pipeline returned by ``get_nlp("en")``, once per session."""
    # Imported inside the fixture, so `extractor` is only imported when a test
    # actually requests this fixture.
    import extractor

    return extractor.get_nlp("en")
@pytest.fixture(scope="session")
def nlp_es():
    """Provide the pipeline returned by ``get_nlp("es")``, once per session."""
    # Imported inside the fixture, so `extractor` is only imported when a test
    # actually requests this fixture.
    import extractor

    return extractor.get_nlp("es")
def test_get_nlp_de_loads(nlp_de):
    """``get_nlp("de")`` yields a German pipeline that can process text."""
    doc = nlp_de("Test")
    assert doc is not None
    # Calling a pipeline always returns a Doc, so the check above cannot fail
    # on its own. Also verify that the registry mapped "de" to a German model
    # and that the input text survived processing.
    assert nlp_de.lang == "de"
    assert doc.text == "Test"
def test_get_nlp_en_loads(nlp_en):
    """``get_nlp("en")`` yields an English pipeline that can process text."""
    doc = nlp_en("Test")
    assert doc is not None
    # Calling a pipeline always returns a Doc, so the check above cannot fail
    # on its own. Also verify that the registry mapped "en" to an English
    # model and that the input text survived processing.
    assert nlp_en.lang == "en"
    assert doc.text == "Test"
def test_get_nlp_es_loads(nlp_es):
    """``get_nlp("es")`` yields a Spanish pipeline that can process text."""
    doc = nlp_es("Prueba")
    assert doc is not None
    # Calling a pipeline always returns a Doc, so the check above cannot fail
    # on its own. Also verify that the registry mapped "es" to a Spanish model
    # and that the input text survived processing.
    assert nlp_es.lang == "es"
    assert doc.text == "Prueba"
def test_get_nlp_unknown_lang_raises():
    """A language code missing from the registry is rejected with ValueError."""
    import extractor

    # "fr" is not one of the registered codes (de, en, es).
    expected_error = pytest.raises(ValueError, match="Unsupported language")
    with expected_error:
        extractor.get_nlp("fr")