"""Tests for the request/response models and the extractor helpers."""

import pytest
from pydantic import ValidationError


# ── Models ──────────────────────────────────────────────────────────────────

def test_parse_request_valid():
    """A ParseRequest exposes the query and lang it was constructed with."""
    from models import ParseRequest

    parsed = ParseRequest(query="Briefe von Opa", lang="de")

    assert parsed.query == "Briefe von Opa"
    assert parsed.lang == "de"


def test_parse_request_rejects_unknown_lang():
    """Constructing a ParseRequest with lang="fr" raises ValidationError."""
    from models import ParseRequest

    with pytest.raises(ValidationError):
        ParseRequest(query="Letters from grandpa", lang="fr")


def test_parse_response_serializes_nulls():
    """model_dump() keeps a None dateFrom as None alongside the set fields."""
    from models import ParseResponse

    response = ParseResponse(
        personNames=["Opa"],
        personRole="sender",
        dateFrom=None,
        dateTo="1920-12-31",
        keywords=["brief"],
        rawQuery="Briefe von Opa",
    )
    dumped = response.model_dump()

    assert dumped["dateFrom"] is None
    assert dumped["dateTo"] == "1920-12-31"
    assert dumped["personRole"] == "sender"


# ── Model loading ────────────────────────────────────────────────────────────

@pytest.fixture(scope="session")
def nlp_de():
    """Session-scoped result of get_nlp("de"), shared by all tests."""
    from extractor import get_nlp

    return get_nlp("de")


@pytest.fixture(scope="session")
def nlp_en():
    """Session-scoped result of get_nlp("en"), shared by all tests."""
    from extractor import get_nlp

    return get_nlp("en")


@pytest.fixture(scope="session")
def nlp_es():
    """Session-scoped result of get_nlp("es"), shared by all tests."""
    from extractor import get_nlp

    return get_nlp("es")


def test_get_nlp_de_loads(nlp_de):
    """Calling the "de" pipeline on a short text yields a non-None result."""
    assert nlp_de("Test") is not None


def test_get_nlp_en_loads(nlp_en):
    """Calling the "en" pipeline on a short text yields a non-None result."""
    assert nlp_en("Test") is not None


def test_get_nlp_es_loads(nlp_es):
    """Calling the "es" pipeline on a short text yields a non-None result."""
    assert nlp_es("Prueba") is not None


def test_get_nlp_unknown_lang_raises():
    """get_nlp("fr") raises ValueError mentioning "Unsupported language"."""
    from extractor import get_nlp

    with pytest.raises(ValueError, match="Unsupported language"):
        get_nlp("fr")


# ── Person name extraction ───────────────────────────────────────────────────

def _make_doc_with_ents(nlp, text: str, char_ents: list[tuple[int, int, str]]):
    """Create a Doc with manually injected entity spans (no NER model needed).

    Args:
        nlp: Pipeline object providing ``make_doc`` (presumably a spaCy
            ``Language``; confirm against ``extractor.get_nlp``).
        text: Text to turn into a Doc via ``nlp.make_doc``.
        char_ents: ``(start_char, end_char, label)`` triples, each passed to
            ``doc.char_span``.

    Returns:
        The Doc, with ``doc.ents`` set to the spans that ``doc.char_span``
        produced. Triples for which ``char_span`` returns None are dropped.
    """
    doc = nlp.make_doc(text)

    resolved = []
    for start, end, label in char_ents:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            resolved.append(span)

    doc.ents = resolved
    return doc


def test_extract_person_names_two_persons(nlp_de):
    """Two PER entities are both returned as person names."""
    from extractor import extract_person_names

    # Character offsets: "Opa Hermann" spans 11..22, "Marie" spans 26..31.
    text = "Briefe von Opa Hermann an Marie"
    entities = [
        (11, 22, "PER"),
        (26, 31, "PER"),
    ]
    doc = _make_doc_with_ents(nlp_de, text, entities)

    names = extract_person_names(doc)

    assert names == ["Opa Hermann", "Marie"]


def test_extract_person_names_preserves_order(nlp_de):
    """Names come back in the order they occur in the text."""
    from extractor import extract_person_names

    # "Marie" (0..5) precedes "Opa" (10..13) in the text.
    text = "Marie von Opa"
    entities = [
        (0, 5, "PER"),
        (10, 13, "PER"),
    ]
    doc = _make_doc_with_ents(nlp_de, text, entities)

    names = extract_person_names(doc)

    assert names == ["Marie", "Opa"]


def test_extract_person_names_empty(nlp_de):
    """A Doc without any entities yields an empty list of names."""
    from extractor import extract_person_names

    doc = _make_doc_with_ents(nlp_de, "Briefe aus dem Krieg", [])

    names = extract_person_names(doc)

    assert names == []


def test_extract_person_names_ignores_non_per(nlp_de):
    """A DATE entity does not appear among the extracted person names."""
    from extractor import extract_person_names

    # "1920" spans 7..11 and carries a DATE label, not PER.
    entities = [(7, 11, "DATE")]
    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", entities)

    names = extract_person_names(doc)

    assert names == []