From 482a1c28634cc716cdca01b0cab7f2975386c089 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Sun, 7 Jun 2026 10:17:07 +0200
Subject: [PATCH] feat(nlp-service): spaCy model loading with get_nlp/load_all_models

---
Reviewer notes (placed after the "---" marker so they stay out of the
commit message):

* get_nlp(lang) raises ValueError for any code that is not a key of
  _MODEL_NAMES; otherwise it returns the entry from _nlp_cache, calling
  spacy.load() only when that language has no cache entry yet.
* load_all_models() calls get_nlp() for every key of _MODEL_NAMES, which
  fills the cache for "de", "en" and "es".
* extractor.py imports re, date, dateparser and ParseResponse, but nothing
  added by this patch references them. Presumably they are meant for
  follow-up commits; please confirm.
* The session-scoped fixtures go through spacy.load(), so the new tests
  presumably need de_core_news_sm, en_core_web_sm and es_core_news_sm to be
  installed in the test environment; please confirm.
* The hunks below are unchanged apart from restored line breaks and
  indentation, so the line counts and blob hashes still apply.

 nlp-service/extractor.py      | 33 ++++++++++++++++++++++++++++
 nlp-service/test_extractor.py | 41 +++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 nlp-service/extractor.py

diff --git a/nlp-service/extractor.py b/nlp-service/extractor.py
new file mode 100644
index 00000000..ce752bef
--- /dev/null
+++ b/nlp-service/extractor.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+import re
+from datetime import date
+
+import dateparser
+import spacy
+from spacy.language import Language
+
+from models import ParseResponse
+
+# ── Language model registry ──────────────────────────────────────────────────
+
+_MODEL_NAMES: dict[str, str] = {
+    "de": "de_core_news_sm",
+    "en": "en_core_web_sm",
+    "es": "es_core_news_sm",
+}
+
+_nlp_cache: dict[str, Language] = {}
+
+
+def get_nlp(lang: str) -> Language:
+    if lang not in _MODEL_NAMES:
+        raise ValueError(f"Unsupported language: {lang!r}. Valid: {list(_MODEL_NAMES)}")
+    if lang not in _nlp_cache:
+        _nlp_cache[lang] = spacy.load(_MODEL_NAMES[lang])
+    return _nlp_cache[lang]
+
+
+def load_all_models() -> None:
+    for lang in _MODEL_NAMES:
+        get_nlp(lang)
diff --git a/nlp-service/test_extractor.py b/nlp-service/test_extractor.py
index 0b80d0b4..2ef7c0a6 100644
--- a/nlp-service/test_extractor.py
+++ b/nlp-service/test_extractor.py
@@ -31,3 +31,44 @@ def test_parse_response_serializes_nulls():
     assert data["dateFrom"] is None
     assert data["dateTo"] == "1920-12-31"
     assert data["personRole"] == "sender"
+
+
+# ── Model loading ────────────────────────────────────────────────────────────
+
+@pytest.fixture(scope="session")
+def nlp_de():
+    from extractor import get_nlp
+    return get_nlp("de")
+
+
+@pytest.fixture(scope="session")
+def nlp_en():
+    from extractor import get_nlp
+    return get_nlp("en")
+
+
+@pytest.fixture(scope="session")
+def nlp_es():
+    from extractor import get_nlp
+    return get_nlp("es")
+
+
+def test_get_nlp_de_loads(nlp_de):
+    doc = nlp_de("Test")
+    assert doc is not None
+
+
+def test_get_nlp_en_loads(nlp_en):
+    doc = nlp_en("Test")
+    assert doc is not None
+
+
+def test_get_nlp_es_loads(nlp_es):
+    doc = nlp_es("Prueba")
+    assert doc is not None
+
+
+def test_get_nlp_unknown_lang_raises():
+    from extractor import get_nlp
+    with pytest.raises(ValueError, match="Unsupported language"):
+        get_nlp("fr")