import pytest
from pydantic import ValidationError


# ── Models ──────────────────────────────────────────────────────────────────

def test_parse_request_valid():
    """A request with a supported language keeps its fields unchanged."""
    from models import ParseRequest

    req = ParseRequest(query="Briefe von Opa", lang="de")
    assert req.query == "Briefe von Opa"
    assert req.lang == "de"


def test_parse_request_rejects_unknown_lang():
    """Constructing a request with lang="fr" must raise ValidationError."""
    from models import ParseRequest

    with pytest.raises(ValidationError):
        ParseRequest(query="Letters from grandpa", lang="fr")


def test_parse_response_serializes_nulls():
    """model_dump() keeps a None dateFrom as None instead of dropping it."""
    from models import ParseResponse

    resp = ParseResponse(
        personNames=["Opa"],
        personRole="sender",
        dateFrom=None,
        dateTo="1920-12-31",
        keywords=["brief"],
        rawQuery="Briefe von Opa",
    )
    data = resp.model_dump()
    assert data["dateFrom"] is None
    assert data["dateTo"] == "1920-12-31"
    assert data["personRole"] == "sender"


# ── Model loading ────────────────────────────────────────────────────────────

@pytest.fixture(scope="session")
def nlp_de():
    """Return get_nlp("de"); session scope means it is built once per run."""
    from extractor import get_nlp

    return get_nlp("de")


@pytest.fixture(scope="session")
def nlp_en():
    """Return get_nlp("en"); session scope means it is built once per run."""
    from extractor import get_nlp

    return get_nlp("en")


@pytest.fixture(scope="session")
def nlp_es():
    """Return get_nlp("es"); session scope means it is built once per run."""
    from extractor import get_nlp

    return get_nlp("es")


def test_get_nlp_de_loads(nlp_de):
    """The German pipeline can process a trivial text."""
    doc = nlp_de("Test")
    assert doc is not None


def test_get_nlp_en_loads(nlp_en):
    """The English pipeline can process a trivial text."""
    doc = nlp_en("Test")
    assert doc is not None


def test_get_nlp_es_loads(nlp_es):
    """The Spanish pipeline can process a trivial text."""
    doc = nlp_es("Prueba")
    assert doc is not None


def test_get_nlp_unknown_lang_raises():
    """get_nlp("fr") must raise ValueError mentioning "Unsupported language"."""
    from extractor import get_nlp

    with pytest.raises(ValueError, match="Unsupported language"):
        get_nlp("fr")


# ── Person name extraction ───────────────────────────────────────────────────

def _make_doc_with_ents(nlp, text: str, char_ents: list[tuple[int, int, str]]):
    """Create a Doc with manually injected entity spans (no NER model needed).

    Args:
        nlp: Pipeline object; only its ``make_doc`` method is called here.
        text: Raw text to turn into a Doc.
        char_ents: ``(start_char, end_char, label)`` triples, one per entity.

    Returns:
        The Doc with ``doc.ents`` set to the requested spans, in input order.

    Raises:
        ValueError: If ``doc.char_span`` returns ``None`` for a triple. The
            offsets are then wrong for this text; silently dropping the span
            would let tests pass against a Doc with fewer entities than
            intended.
    """
    doc = nlp.make_doc(text)
    spans = []
    for start, end, label in char_ents:
        span = doc.char_span(start, end, label=label)
        if span is None:
            raise ValueError(
                f"Cannot build {label!r} span for chars {start}..{end} "
                f"({text[start:end]!r}) in {text!r}"
            )
        spans.append(span)
    doc.ents = spans
    return doc


def test_extract_person_names_two_persons(nlp_de):
    """Both PER spans are returned with their full surface text."""
    from extractor import extract_person_names

    # "Briefe von Opa Hermann an Marie"
    # "Opa Hermann" = chars 11..22, "Marie" = chars 26..31
    doc = _make_doc_with_ents(nlp_de, "Briefe von Opa Hermann an Marie", [
        (11, 22, "PER"),
        (26, 31, "PER"),
    ])
    assert extract_person_names(doc) == ["Opa Hermann", "Marie"]


def test_extract_person_names_preserves_order(nlp_de):
    """Names come back in the order they occur in the text."""
    from extractor import extract_person_names

    # "Marie von Opa" — Marie comes first in text
    # "Marie" = 0..5, "Opa" = 10..13
    doc = _make_doc_with_ents(nlp_de, "Marie von Opa", [
        (0, 5, "PER"),
        (10, 13, "PER"),
    ])
    assert extract_person_names(doc) == ["Marie", "Opa"]


def test_extract_person_names_empty(nlp_de):
    """A Doc without entities yields an empty list."""
    from extractor import extract_person_names

    doc = _make_doc_with_ents(nlp_de, "Briefe aus dem Krieg", [])
    assert extract_person_names(doc) == []


def test_extract_person_names_ignores_non_per(nlp_de):
    """Entities with a label other than PER are not reported as names."""
    from extractor import extract_person_names

    # DATE entity should not appear in personNames
    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", [(7, 11, "DATE")])
    assert extract_person_names(doc) == []


# ── Role detection ───────────────────────────────────────────────────────────

def test_role_sender_von(nlp_de):
    """German "von" directly before the person maps to "sender"."""
    from extractor import detect_person_role

    # "Briefe von Marie" — "von" immediately before "Marie"
    # "Marie" = chars 11..16
    doc = _make_doc_with_ents(nlp_de, "Briefe von Marie", [(11, 16, "PER")])
    per_spans = list(doc.ents)
    assert detect_person_role(doc, per_spans, "de") == "sender"


def test_role_receiver_an(nlp_de):
    """German "an" directly before the person maps to "receiver"."""
    from extractor import detect_person_role

    # "Briefe an Marie" — "an" immediately before "Marie"
    # "Marie" = chars 10..15
    doc = _make_doc_with_ents(nlp_de, "Briefe an Marie", [(10, 15, "PER")])
    per_spans = list(doc.ents)
    assert detect_person_role(doc, per_spans, "de") == "receiver"


def test_role_two_persons_returns_any(nlp_de):
    """With two PER spans the role is "any", whatever the prepositions."""
    from extractor import detect_person_role

    # "von Opa an Marie" — two PER spans → always "any"
    # "Opa" = chars 4..7, "Marie" = chars 11..16
    doc = _make_doc_with_ents(nlp_de, "von Opa an Marie", [
        (4, 7, "PER"),
        (11, 16, "PER"),
    ])
    per_spans = list(doc.ents)
    assert detect_person_role(doc, per_spans, "de") == "any"


def test_role_no_prep_returns_any(nlp_de):
    """A single person without a preceding preposition gives "any"."""
    from extractor import detect_person_role

    # "Briefe Marie" — no preposition
    # "Marie" = chars 7..12
    doc = _make_doc_with_ents(nlp_de, "Briefe Marie", [(7, 12, "PER")])
    per_spans = list(doc.ents)
    assert detect_person_role(doc, per_spans, "de") == "any"


def test_role_empty_returns_any(nlp_de):
    """No person spans at all gives "any"."""
    from extractor import detect_person_role

    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", [])
    assert detect_person_role(doc, [], "de") == "any"


def test_role_sender_from_english(nlp_en):
    """English "from" directly before the person maps to "sender"."""
    from extractor import detect_person_role

    # "letters from Marie" — "from" before "Marie"
    # "Marie" = chars 13..18
    doc = _make_doc_with_ents(nlp_en, "letters from Marie", [(13, 18, "PER")])
    per_spans = list(doc.ents)
    assert detect_person_role(doc, per_spans, "en") == "sender"


def test_role_receiver_to_english(nlp_en):
    """English "to" directly before the person maps to "receiver"."""
    from extractor import detect_person_role

    # "letters to Marie" — "to" before "Marie"
    # "letters" = 0..7, " " = 7, "to" = 8..10, " " = 10, "Marie" = 11..16
    doc = _make_doc_with_ents(nlp_en, "letters to Marie", [(11, 16, "PER")])
    per_spans = list(doc.ents)
    assert detect_person_role(doc, per_spans, "en") == "receiver"


# ── Date parsing ─────────────────────────────────────────────────────────────

def test_date_vor_1920(nlp_de):
    """German "vor <year>" sets only the upper bound, at the year's end."""
    from extractor import extract_dates

    # "Briefe vor 1920" — "1920" at chars 11..15
    doc = _make_doc_with_ents(nlp_de, "Briefe vor 1920", [(11, 15, "DATE")])
    date_from, date_to = extract_dates(doc, "de")
    assert date_from is None
    assert date_to == "1920-12-31"


def test_date_nach_1900(nlp_de):
    """German "nach <year>" sets only the lower bound, at the year's start."""
    from extractor import extract_dates

    # "Briefe nach 1900" — "1900" at chars 12..16
    doc = _make_doc_with_ents(nlp_de, "Briefe nach 1900", [(12, 16, "DATE")])
    date_from, date_to = extract_dates(doc, "de")
    assert date_from == "1900-01-01"
    assert date_to is None


def test_date_zwischen_1900_und_1920(nlp_de):
    """Two DATE spans ("zwischen X und Y") produce both bounds."""
    from extractor import extract_dates

    # "zwischen 1900 und 1920"
    # "1900" = chars 9..13, "1920" = chars 18..22
    doc = _make_doc_with_ents(nlp_de, "zwischen 1900 und 1920", [
        (9, 13, "DATE"),
        (18, 22, "DATE"),
    ])
    date_from, date_to = extract_dates(doc, "de")
    assert date_from == "1900-01-01"
    assert date_to == "1920-12-31"


def test_date_bare_year_makes_range(nlp_de):
    """A year without a direction word expands to that whole year."""
    from extractor import extract_dates

    # "Briefe 1920" — no direction token → year-range
    # "1920" = chars 7..11
    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", [(7, 11, "DATE")])
    date_from, date_to = extract_dates(doc, "de")
    assert date_from == "1920-01-01"
    assert date_to == "1920-12-31"


def test_date_no_date_entity(nlp_de):
    """Without any DATE entity both bounds are None."""
    from extractor import extract_dates

    doc = _make_doc_with_ents(nlp_de, "Briefe von Opa", [])
    date_from, date_to = extract_dates(doc, "de")
    assert date_from is None
    assert date_to is None


def test_date_before_english(nlp_en):
    """English "before <year>" sets only the upper bound."""
    from extractor import extract_dates

    # "letters before 1920" — "1920" at chars 15..19
    doc = _make_doc_with_ents(nlp_en, "letters before 1920", [(15, 19, "DATE")])
    date_from, date_to = extract_dates(doc, "en")
    assert date_from is None
    assert date_to == "1920-12-31"


def test_date_after_english(nlp_en):
    """English "after <year>" sets only the lower bound."""
    from extractor import extract_dates

    # "letters after 1900" — "1900" at chars 14..18
    doc = _make_doc_with_ents(nlp_en, "letters after 1900", [(14, 18, "DATE")])
    date_from, date_to = extract_dates(doc, "en")
    assert date_from == "1900-01-01"
    assert date_to is None


# ── Keyword extraction ───────────────────────────────────────────────────────

def test_keywords_extracts_nouns(nlp_de):
    """Nouns show up in the keyword list as lowercase lemmas."""
    from extractor import extract_keywords

    # Use real NLP for POS tags; disable NER to avoid interference
    doc = nlp_de("Briefe aus dem Krieg", disable=["ner"])
    keywords = extract_keywords(doc, [])
    # "Brief" (NOUN) and "Krieg" (NOUN) should appear as lemmas
    assert "brief" in keywords
    assert "krieg" in keywords


def test_keywords_excludes_stopwords(nlp_de):
    """Stopwords never become keywords."""
    from extractor import extract_keywords

    doc = nlp_de("Briefe aus dem Krieg", disable=["ner"])
    keywords = extract_keywords(doc, [])
    # "dem" is a stopword article — must not appear
    assert "dem" not in keywords


def test_keywords_excludes_per_ner_spans(nlp_de):
    """Tokens covered by a PER span passed to extract_keywords are skipped."""
    from extractor import extract_keywords

    # Run full NLP for POS tags, then inject a PER span over "Hermann"
    # "Briefe von Hermann": B=0..6, ' '=6, v=7..10, ' '=10, H=11..18
    doc = nlp_de("Briefe von Hermann")
    per_span = doc.char_span(11, 18, label="PER")
    # Fail loudly if the span cannot be built: skipping the injection would
    # leave the test checking whatever the NER model happened to predict.
    assert per_span is not None, "chars 11..18 do not map to a span over 'Hermann'"
    doc.ents = [per_span]
    keywords = extract_keywords(doc, list(doc.ents))
    assert "hermann" not in keywords


def test_keywords_excludes_short_lemmas(nlp_de):
    """The two-character token "an" is not kept as a keyword."""
    from extractor import extract_keywords

    doc = nlp_de("Briefe an ihn", disable=["ner"])
    keywords = extract_keywords(doc, [])
    # "ihn" is 3 chars but is a stopword pronoun; "an" is 2 chars
    assert "an" not in keywords


def test_keywords_deduplicates(nlp_de):
    """A repeated word contributes its lemma only once."""
    from extractor import extract_keywords

    doc = nlp_de("Brief Brief Krieg", disable=["ner"])
    keywords = extract_keywords(doc, [])
    assert keywords.count("brief") == 1


# ── Full extract() pipeline ──────────────────────────────────────────────────

def test_extract_dates_de():
    """End to end: German "vor 1920" gives an upper bound and no persons."""
    from extractor import extract

    result = extract("Briefe vor 1920", "de")
    assert result.dateFrom is None
    assert result.dateTo == "1920-12-31"
    assert result.rawQuery == "Briefe vor 1920"
    assert result.personNames == []
    assert result.personRole == "any"


def test_extract_keywords_from_topic_de():
    """End to end: a topic-only query yields keywords and no dates."""
    from extractor import extract

    result = extract("Briefe aus dem Krieg", "de")
    assert "krieg" in result.keywords
    assert result.dateFrom is None
    assert result.dateTo is None


def test_extract_dates_en():
    """End to end: English "before 1920" gives only an upper bound."""
    from extractor import extract

    result = extract("letters before 1920", "en")
    assert result.dateTo == "1920-12-31"
    assert result.dateFrom is None


def test_extract_dates_es():
    """End to end: Spanish "antes de 1920" gives only an upper bound."""
    from extractor import extract

    result = extract("cartas antes de 1920", "es")
    assert result.dateTo == "1920-12-31"
    assert result.dateFrom is None


def test_extract_rawquery_echoed():
    """The original query string is returned unchanged in rawQuery."""
    from extractor import extract

    q = "Texte über Weihnachten"
    result = extract(q, "de")
    assert result.rawQuery == q


def test_extract_response_fields_are_complete():
    """The result exposes well-typed personNames, personRole, keywords and rawQuery."""
    from extractor import extract

    result = extract("Briefe 1900", "de")
    assert isinstance(result.personNames, list)
    assert result.personRole in ("sender", "receiver", "any")
    assert isinstance(result.keywords, list)
    assert result.rawQuery == "Briefe 1900"