feat(nlp-service): spaCy model loading with get_nlp/load_all_models
This commit is contained in:
33
nlp-service/extractor.py
Normal file
33
nlp-service/extractor.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
import dateparser
|
||||||
|
import spacy
|
||||||
|
from spacy.language import Language
|
||||||
|
|
||||||
|
from models import ParseResponse
|
||||||
|
|
||||||
|
# ── Language model registry ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Supported language codes mapped to the spaCy pipeline package name that
# get_nlp() hands to spacy.load() for that language.
_MODEL_NAMES: dict[str, str] = {
    "de": "de_core_news_sm",
    "en": "en_core_web_sm",
    "es": "es_core_news_sm",
}
|
||||||
|
|
||||||
|
# Already-loaded pipelines keyed by language code; populated on demand by get_nlp().
_nlp_cache: dict[str, Language] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_nlp(lang: str) -> Language:
    """Return the spaCy pipeline for ``lang``, loading it on first request.

    The loaded pipeline is stored in ``_nlp_cache`` so later calls with the
    same language code return the same object without calling
    ``spacy.load`` again.

    Args:
        lang: Language code; must be one of the keys of ``_MODEL_NAMES``.

    Returns:
        The pipeline loaded from the package registered for ``lang``.

    Raises:
        ValueError: If ``lang`` is not a key of ``_MODEL_NAMES``.
    """
    model_name = _MODEL_NAMES.get(lang)
    if model_name is None:
        raise ValueError(f"Unsupported language: {lang!r}. Valid: {list(_MODEL_NAMES)}")

    pipeline = _nlp_cache.get(lang)
    if pipeline is None:
        # First request for this language: load once and remember the result.
        pipeline = spacy.load(model_name)
        _nlp_cache[lang] = pipeline
    return pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def load_all_models() -> None:
    """Load the pipeline for every language listed in ``_MODEL_NAMES``.

    Each language is passed through ``get_nlp``, so pipelines end up in the
    cache and languages that are already cached are not loaded again.
    """
    for language_code in _MODEL_NAMES:
        get_nlp(language_code)
|
||||||
@@ -31,3 +31,44 @@ def test_parse_response_serializes_nulls():
|
|||||||
assert data["dateFrom"] is None
|
assert data["dateFrom"] is None
|
||||||
assert data["dateTo"] == "1920-12-31"
|
assert data["dateTo"] == "1920-12-31"
|
||||||
assert data["personRole"] == "sender"
|
assert data["personRole"] == "sender"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Model loading ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def nlp_de():
    """Session-scoped fixture providing the pipeline returned by ``get_nlp("de")``."""
    # Imported inside the fixture so the extractor module is only imported
    # when a test actually requests this fixture.
    import extractor

    return extractor.get_nlp("de")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def nlp_en():
    """Session-scoped fixture providing the pipeline returned by ``get_nlp("en")``."""
    # Imported inside the fixture so the extractor module is only imported
    # when a test actually requests this fixture.
    import extractor

    return extractor.get_nlp("en")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def nlp_es():
    """Session-scoped fixture providing the pipeline returned by ``get_nlp("es")``."""
    # Imported inside the fixture so the extractor module is only imported
    # when a test actually requests this fixture.
    import extractor

    return extractor.get_nlp("es")
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_nlp_de_loads(nlp_de):
    """The German pipeline loads, reports language "de", and processes text."""
    doc = nlp_de("Test")
    # Calling a pipeline always returns a Doc, so `doc is not None` could never
    # fail; assert observable properties of the pipeline and its output instead.
    assert nlp_de.lang == "de"
    assert doc.text == "Test"
    assert len(doc) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_nlp_en_loads(nlp_en):
    """The English pipeline loads, reports language "en", and processes text."""
    doc = nlp_en("Test")
    # Calling a pipeline always returns a Doc, so `doc is not None` could never
    # fail; assert observable properties of the pipeline and its output instead.
    assert nlp_en.lang == "en"
    assert doc.text == "Test"
    assert len(doc) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_nlp_es_loads(nlp_es):
    """The Spanish pipeline loads, reports language "es", and processes text."""
    doc = nlp_es("Prueba")
    # Calling a pipeline always returns a Doc, so `doc is not None` could never
    # fail; assert observable properties of the pipeline and its output instead.
    assert nlp_es.lang == "es"
    assert doc.text == "Prueba"
    assert len(doc) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_nlp_unknown_lang_raises():
    """Requesting a language code outside the registry raises ``ValueError``."""
    import extractor

    # "fr" is not one of the registered codes, so no pipeline load is attempted.
    with pytest.raises(ValueError, match="Unsupported language"):
        extractor.get_nlp("fr")
|
||||||
|
|||||||
Reference in New Issue
Block a user