Files
familienarchiv/nlp-service/test_extractor.py
2026-06-08 10:56:32 +02:00

120 lines
3.7 KiB
Python

import pytest
from pydantic import ValidationError
# ── Models ──────────────────────────────────────────────────────────────────
def test_parse_request_valid():
    """A ParseRequest built with lang="de" exposes the query and lang it was given."""
    from models import ParseRequest

    request = ParseRequest(query="Briefe von Opa", lang="de")

    # Both fields must round-trip unchanged through the model.
    assert (request.query, request.lang) == ("Briefe von Opa", "de")
def test_parse_request_rejects_unknown_lang():
    """Building a ParseRequest with lang="fr" must raise a ValidationError."""
    from models import ParseRequest

    rejected_fields = {"query": "Letters from grandpa", "lang": "fr"}
    with pytest.raises(ValidationError):
        ParseRequest(**rejected_fields)
def test_parse_response_serializes_nulls():
    """model_dump() keeps a None dateFrom next to the populated fields."""
    from models import ParseResponse

    fields = dict(
        personNames=["Opa"],
        personRole="sender",
        dateFrom=None,
        dateTo="1920-12-31",
        keywords=["brief"],
        rawQuery="Briefe von Opa",
    )
    dumped = ParseResponse(**fields).model_dump()

    # The unset lower bound must survive as an explicit None entry.
    assert dumped["dateFrom"] is None
    assert dumped["dateTo"] == "1920-12-31"
    assert dumped["personRole"] == "sender"
# ── Model loading ────────────────────────────────────────────────────────────
@pytest.fixture(scope="session")
def nlp_de():
    """Session-scoped fixture providing the result of get_nlp("de")."""
    from extractor import get_nlp

    pipeline = get_nlp("de")
    return pipeline
@pytest.fixture(scope="session")
def nlp_en():
    """Session-scoped fixture providing the result of get_nlp("en")."""
    from extractor import get_nlp

    pipeline = get_nlp("en")
    return pipeline
@pytest.fixture(scope="session")
def nlp_es():
    """Session-scoped fixture providing the result of get_nlp("es")."""
    from extractor import get_nlp

    pipeline = get_nlp("es")
    return pipeline
def test_get_nlp_de_loads(nlp_de):
    """Calling the "de" pipeline fixture on a short text yields a non-None result."""
    processed = nlp_de("Test")
    assert processed is not None
def test_get_nlp_en_loads(nlp_en):
    """Calling the "en" pipeline fixture on a short text yields a non-None result."""
    processed = nlp_en("Test")
    assert processed is not None
def test_get_nlp_es_loads(nlp_es):
    """Calling the "es" pipeline fixture on a short text yields a non-None result."""
    processed = nlp_es("Prueba")
    assert processed is not None
def test_get_nlp_unknown_lang_raises():
    """get_nlp("fr") must raise a ValueError whose message matches "Unsupported language"."""
    from extractor import get_nlp

    expected_failure = pytest.raises(ValueError, match="Unsupported language")
    with expected_failure:
        get_nlp("fr")
# ── Person name extraction ───────────────────────────────────────────────────
def _make_doc_with_ents(nlp, text: str, char_ents: list[tuple[int, int, str]]):
    """Create a Doc with manually injected entity spans (no NER model needed).

    Args:
        nlp: Pipeline object; only its ``make_doc`` method is used, so the
            text is tokenized without running any further components.
        text: Raw text to tokenize.
        char_ents: ``(start_char, end_char, label)`` triples giving each
            entity as character offsets into ``text`` plus its label.

    Returns:
        The Doc produced by ``nlp.make_doc(text)`` with ``doc.ents`` set to
        the requested spans, in the order given.

    Raises:
        ValueError: If ``doc.char_span`` returns ``None`` for any triple,
            i.e. the offsets could not be turned into a span.
    """
    doc = nlp.make_doc(text)
    spans = []
    for start, end, label in char_ents:
        span = doc.char_span(start, end, label=label)
        if span is None:
            # Fail loudly: silently dropping the span would let a test with
            # wrong offsets pass vacuously (e.g. one asserting that no
            # person names are extracted).
            raise ValueError(
                f"Entity offsets ({start}, {end}) with label {label!r} do not "
                f"map to a span in {text!r}"
            )
        spans.append(span)
    doc.ents = spans
    return doc
def test_extract_person_names_two_persons(nlp_de):
    """Two PER entities in one sentence are both returned as person names."""
    from extractor import extract_person_names

    sentence = "Briefe von Opa Hermann an Marie"
    # Character offsets: "Opa Hermann" covers 11..22, "Marie" covers 26..31.
    person_spans = [(11, 22, "PER"), (26, 31, "PER")]
    doc = _make_doc_with_ents(nlp_de, sentence, person_spans)

    names = extract_person_names(doc)
    assert names == ["Opa Hermann", "Marie"]
def test_extract_person_names_preserves_order(nlp_de):
    """Person names come back in the order they occur in the text."""
    from extractor import extract_person_names

    sentence = "Marie von Opa"
    # "Marie" (0..5) precedes "Opa" (10..13) in the sentence.
    person_spans = [(0, 5, "PER"), (10, 13, "PER")]
    doc = _make_doc_with_ents(nlp_de, sentence, person_spans)

    names = extract_person_names(doc)
    assert names == ["Marie", "Opa"]
def test_extract_person_names_empty(nlp_de):
    """A Doc without any entities yields an empty list of person names."""
    from extractor import extract_person_names

    no_entities = []
    doc = _make_doc_with_ents(nlp_de, "Briefe aus dem Krieg", no_entities)

    names = extract_person_names(doc)
    assert names == []
def test_extract_person_names_ignores_non_per(nlp_de):
    """Entities with a label other than PER are left out of the person names."""
    from extractor import extract_person_names

    # "1920" (chars 7..11) is tagged DATE and must not be reported as a person.
    date_only = [(7, 11, "DATE")]
    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", date_only)

    names = extract_person_names(doc)
    assert names == []