301 lines
10 KiB
Python
301 lines
10 KiB
Python
import pytest
|
|
from pydantic import ValidationError
|
|
|
|
|
|
# ── Models ──────────────────────────────────────────────────────────────────
|
|
|
|
def test_parse_request_valid():
    """A German query with ``lang="de"`` is accepted and both fields round-trip."""
    from models import ParseRequest

    query_text, language = "Briefe von Opa", "de"
    parsed = ParseRequest(query=query_text, lang=language)

    assert parsed.query == query_text
    assert parsed.lang == language
|
|
|
|
|
|
def test_parse_request_rejects_unknown_lang():
    """Building a ParseRequest with ``lang="fr"`` raises a ValidationError."""
    from models import ParseRequest

    rejected_payload = {"query": "Letters from grandpa", "lang": "fr"}
    with pytest.raises(ValidationError):
        ParseRequest(**rejected_payload)
|
|
|
|
|
|
def test_parse_response_serializes_nulls():
    """``model_dump()`` keeps a ``None`` dateFrom alongside the populated fields."""
    from models import ParseResponse

    fields = {
        "personNames": ["Opa"],
        "personRole": "sender",
        "dateFrom": None,
        "dateTo": "1920-12-31",
        "keywords": ["brief"],
        "rawQuery": "Briefe von Opa",
    }
    dumped = ParseResponse(**fields).model_dump()

    assert dumped["dateFrom"] is None
    assert dumped["dateTo"] == "1920-12-31"
    assert dumped["personRole"] == "sender"
|
|
|
|
|
|
# ── Model loading ────────────────────────────────────────────────────────────
|
|
|
|
@pytest.fixture(scope="session")
def nlp_de():
    """Session-scoped fixture: the object returned by ``get_nlp("de")``."""
    from extractor import get_nlp as load_pipeline

    german_pipeline = load_pipeline("de")
    return german_pipeline
|
|
|
|
|
|
@pytest.fixture(scope="session")
def nlp_en():
    """Session-scoped fixture: the object returned by ``get_nlp("en")``."""
    from extractor import get_nlp as load_pipeline

    english_pipeline = load_pipeline("en")
    return english_pipeline
|
|
|
|
|
|
@pytest.fixture(scope="session")
def nlp_es():
    """Session-scoped fixture: the object returned by ``get_nlp("es")``."""
    from extractor import get_nlp as load_pipeline

    spanish_pipeline = load_pipeline("es")
    return spanish_pipeline
|
|
|
|
|
|
def test_get_nlp_de_loads(nlp_de):
    """The German pipeline loads and can process a trivial input.

    Calling a pipeline never returns ``None``, so an ``is not None`` check
    cannot fail; verifying that the processed document reproduces the input
    text proves the pipeline actually ran.
    """
    text = "Test"
    doc = nlp_de(text)
    assert doc.text == text
|
|
|
|
|
|
def test_get_nlp_en_loads(nlp_en):
    """The English pipeline loads and can process a trivial input.

    Calling a pipeline never returns ``None``, so an ``is not None`` check
    cannot fail; verifying that the processed document reproduces the input
    text proves the pipeline actually ran.
    """
    text = "Test"
    doc = nlp_en(text)
    assert doc.text == text
|
|
|
|
|
|
def test_get_nlp_es_loads(nlp_es):
    """The Spanish pipeline loads and can process a trivial input.

    Calling a pipeline never returns ``None``, so an ``is not None`` check
    cannot fail; verifying that the processed document reproduces the input
    text proves the pipeline actually ran.
    """
    text = "Prueba"
    doc = nlp_es(text)
    assert doc.text == text
|
|
|
|
|
|
def test_get_nlp_unknown_lang_raises():
    """``get_nlp("fr")`` raises ValueError matching "Unsupported language"."""
    from extractor import get_nlp as load_pipeline

    expected_failure = pytest.raises(ValueError, match="Unsupported language")
    with expected_failure:
        load_pipeline("fr")
|
|
|
|
|
|
# ── Person name extraction ───────────────────────────────────────────────────
|
|
|
|
def _make_doc_with_ents(nlp, text: str, char_ents: list[tuple[int, int, str]]):
    """Create a Doc with manually injected entity spans (no NER model needed).

    Args:
        nlp: Pipeline object; only its ``make_doc`` method is called here.
        text: Text to turn into a Doc.
        char_ents: ``(start_char, end_char, label)`` triples describing the
            entity spans to inject.

    Returns:
        The Doc produced by ``nlp.make_doc(text)`` with ``doc.ents`` set to
        the requested spans, in the order given.

    Raises:
        ValueError: If ``doc.char_span`` returns ``None`` for any triple.
            Silently dropping such a span would let a test with a wrong
            offset run against fewer entities than intended.
    """
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in char_ents:
        span = doc.char_span(start, end, label=label)
        if span is None:
            raise ValueError(
                f"Entity offsets ({start}, {end}) with label {label!r} "
                f"do not map to a span in {text!r}"
            )
        ents.append(span)
    doc.ents = ents
    return doc
|
|
|
|
|
|
def test_extract_person_names_two_persons(nlp_de):
    """Two injected PER spans yield both names."""
    from extractor import extract_person_names

    text = "Briefe von Opa Hermann an Marie"
    # "Opa Hermann" covers chars 11..22, "Marie" covers chars 26..31.
    person_offsets = [(11, 22, "PER"), (26, 31, "PER")]
    doc = _make_doc_with_ents(nlp_de, text, person_offsets)

    names = extract_person_names(doc)
    assert names == ["Opa Hermann", "Marie"]
|
|
|
|
|
|
def test_extract_person_names_preserves_order(nlp_de):
    """Names come back in text order: "Marie" precedes "Opa"."""
    from extractor import extract_person_names

    text = "Marie von Opa"
    # "Marie" covers chars 0..5, "Opa" covers chars 10..13.
    person_offsets = [(0, 5, "PER"), (10, 13, "PER")]
    doc = _make_doc_with_ents(nlp_de, text, person_offsets)

    names = extract_person_names(doc)
    assert names == ["Marie", "Opa"]
|
|
|
|
|
|
def test_extract_person_names_empty(nlp_de):
    """A Doc without any entities produces an empty name list."""
    from extractor import extract_person_names

    no_entities = []
    doc = _make_doc_with_ents(nlp_de, "Briefe aus dem Krieg", no_entities)

    names = extract_person_names(doc)
    assert names == []
|
|
|
|
|
|
def test_extract_person_names_ignores_non_per(nlp_de):
    """A DATE entity must not be reported as a person name."""
    from extractor import extract_person_names

    # "1920" covers chars 7..11 and is labelled DATE, not PER.
    date_only = [(7, 11, "DATE")]
    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", date_only)

    names = extract_person_names(doc)
    assert names == []
|
|
|
|
|
|
# ── Role detection ───────────────────────────────────────────────────────────
|
|
|
|
def test_role_sender_von(nlp_de):
    """German "von" directly before the single person gives role "sender"."""
    from extractor import detect_person_role

    # "Marie" covers chars 11..16, immediately after "von".
    marie = (11, 16, "PER")
    doc = _make_doc_with_ents(nlp_de, "Briefe von Marie", [marie])
    persons = [*doc.ents]

    role = detect_person_role(doc, persons, "de")
    assert role == "sender"
|
|
|
|
|
|
def test_role_receiver_an(nlp_de):
    """German "an" directly before the single person gives role "receiver"."""
    from extractor import detect_person_role

    # "Marie" covers chars 10..15, immediately after "an".
    marie = (10, 15, "PER")
    doc = _make_doc_with_ents(nlp_de, "Briefe an Marie", [marie])
    persons = [*doc.ents]

    role = detect_person_role(doc, persons, "de")
    assert role == "receiver"
|
|
|
|
|
|
def test_role_two_persons_returns_any(nlp_de):
    """With two PER spans the detected role is "any"."""
    from extractor import detect_person_role

    text = "von Opa an Marie"
    # "Opa" covers chars 4..7, "Marie" covers chars 11..16.
    person_offsets = [(4, 7, "PER"), (11, 16, "PER")]
    doc = _make_doc_with_ents(nlp_de, text, person_offsets)
    persons = [*doc.ents]

    role = detect_person_role(doc, persons, "de")
    assert role == "any"
|
|
|
|
|
|
def test_role_no_prep_returns_any(nlp_de):
    """Without a preposition before the person the role is "any"."""
    from extractor import detect_person_role

    # "Marie" covers chars 7..12; nothing but "Briefe" precedes it.
    marie = (7, 12, "PER")
    doc = _make_doc_with_ents(nlp_de, "Briefe Marie", [marie])
    persons = [*doc.ents]

    role = detect_person_role(doc, persons, "de")
    assert role == "any"
|
|
|
|
|
|
def test_role_empty_returns_any(nlp_de):
    """An empty list of person spans gives role "any"."""
    from extractor import detect_person_role

    no_entities = []
    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", no_entities)

    role = detect_person_role(doc, [], "de")
    assert role == "any"
|
|
|
|
|
|
def test_role_sender_from_english(nlp_en):
    """English "from" directly before the single person gives role "sender"."""
    from extractor import detect_person_role

    # "Marie" covers chars 13..18, immediately after "from".
    marie = (13, 18, "PER")
    doc = _make_doc_with_ents(nlp_en, "letters from Marie", [marie])
    persons = [*doc.ents]

    role = detect_person_role(doc, persons, "en")
    assert role == "sender"
|
|
|
|
|
|
def test_role_receiver_to_english(nlp_en):
    """English "to" directly before the single person gives role "receiver"."""
    from extractor import detect_person_role

    # Layout: "letters" 0..7, "to" 8..10, "Marie" 11..16.
    marie = (11, 16, "PER")
    doc = _make_doc_with_ents(nlp_en, "letters to Marie", [marie])
    persons = [*doc.ents]

    role = detect_person_role(doc, persons, "en")
    assert role == "receiver"
|
|
|
|
|
|
# ── Date parsing ─────────────────────────────────────────────────────────────
|
|
|
|
def test_date_vor_1920(nlp_de):
    """German "vor 1920" gives only an upper bound at the end of 1920."""
    from extractor import extract_dates

    # "1920" covers chars 11..15.
    year_span = (11, 15, "DATE")
    doc = _make_doc_with_ents(nlp_de, "Briefe vor 1920", [year_span])

    lower, upper = extract_dates(doc, "de")
    assert lower is None
    assert upper == "1920-12-31"
|
|
|
|
|
|
def test_date_nach_1900(nlp_de):
    """German "nach 1900" gives only a lower bound at the start of 1900."""
    from extractor import extract_dates

    # "1900" covers chars 12..16.
    year_span = (12, 16, "DATE")
    doc = _make_doc_with_ents(nlp_de, "Briefe nach 1900", [year_span])

    lower, upper = extract_dates(doc, "de")
    assert lower == "1900-01-01"
    assert upper is None
|
|
|
|
|
|
def test_date_zwischen_1900_und_1920(nlp_de):
    """German "zwischen 1900 und 1920" gives both bounds."""
    from extractor import extract_dates

    text = "zwischen 1900 und 1920"
    # "1900" covers chars 9..13, "1920" covers chars 18..22.
    year_spans = [(9, 13, "DATE"), (18, 22, "DATE")]
    doc = _make_doc_with_ents(nlp_de, text, year_spans)

    lower, upper = extract_dates(doc, "de")
    assert lower == "1900-01-01"
    assert upper == "1920-12-31"
|
|
|
|
|
|
def test_date_bare_year_makes_range(nlp_de):
    """A year with no direction word expands to that whole calendar year."""
    from extractor import extract_dates

    # "1920" covers chars 7..11; no before/after token precedes it.
    year_span = (7, 11, "DATE")
    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", [year_span])

    lower, upper = extract_dates(doc, "de")
    assert lower == "1920-01-01"
    assert upper == "1920-12-31"
|
|
|
|
|
|
def test_date_no_date_entity(nlp_de):
    """Without any DATE entity both bounds are ``None``."""
    from extractor import extract_dates

    no_entities = []
    doc = _make_doc_with_ents(nlp_de, "Briefe von Opa", no_entities)

    lower, upper = extract_dates(doc, "de")
    assert lower is None
    assert upper is None
|
|
|
|
|
|
def test_date_before_english(nlp_en):
    """English "before 1920" gives only an upper bound at the end of 1920."""
    from extractor import extract_dates

    # "1920" covers chars 15..19.
    year_span = (15, 19, "DATE")
    doc = _make_doc_with_ents(nlp_en, "letters before 1920", [year_span])

    lower, upper = extract_dates(doc, "en")
    assert lower is None
    assert upper == "1920-12-31"
|
|
|
|
|
|
def test_date_after_english(nlp_en):
    """English "after 1900" gives only a lower bound at the start of 1900."""
    from extractor import extract_dates

    # "1900" covers chars 14..18.
    year_span = (14, 18, "DATE")
    doc = _make_doc_with_ents(nlp_en, "letters after 1900", [year_span])

    lower, upper = extract_dates(doc, "en")
    assert lower == "1900-01-01"
    assert upper is None
|
|
|
|
|
|
# ── Keyword extraction ───────────────────────────────────────────────────────
|
|
|
|
def test_keywords_extracts_nouns(nlp_de):
    """The lemmas "brief" and "krieg" are expected among the keywords."""
    from extractor import extract_keywords

    # The real pipeline supplies POS tags; NER is switched off for this call.
    sentence = "Briefe aus dem Krieg"
    doc = nlp_de(sentence, disable=["ner"])
    found = extract_keywords(doc, [])

    assert "brief" in found
    assert "krieg" in found
|
|
|
|
|
|
def test_keywords_excludes_stopwords(nlp_de):
    """The article "dem" must not show up among the keywords."""
    from extractor import extract_keywords

    sentence = "Briefe aus dem Krieg"
    doc = nlp_de(sentence, disable=["ner"])
    found = extract_keywords(doc, [])

    assert "dem" not in found
|
|
|
|
|
|
def test_keywords_excludes_per_ner_spans(nlp_de):
    """A token covered by a PER span must not be returned as a keyword."""
    from extractor import extract_keywords

    # Run the full pipeline for POS tags, then replace the entities with a
    # hand-placed PER span over "Hermann".
    # Layout of "Briefe von Hermann": "Briefe" 0..6, "von" 7..10, "Hermann" 11..18.
    doc = nlp_de("Briefe von Hermann")
    per_span = doc.char_span(11, 18, label="PER")

    # Fail loudly if the offsets do not produce a span. Skipping the injection
    # would leave whatever entities the model predicted, so the test would no
    # longer exercise the PER exclusion it is named for.
    assert per_span is not None, "offsets 11..18 did not produce a span"
    doc.ents = [per_span]

    keywords = extract_keywords(doc, list(doc.ents))
    assert "hermann" not in keywords
|
|
|
|
|
|
def test_keywords_excludes_short_lemmas(nlp_de):
    """The two-character token "an" must not show up among the keywords."""
    from extractor import extract_keywords

    sentence = "Briefe an ihn"
    doc = nlp_de(sentence, disable=["ner"])
    found = extract_keywords(doc, [])

    # Only "an" (2 chars) is checked here; "ihn" is 3 chars and a pronoun.
    assert "an" not in found
|
|
|
|
|
|
def test_keywords_deduplicates(nlp_de):
    """A word repeated in the input appears exactly once among the keywords."""
    from extractor import extract_keywords

    sentence = "Brief Brief Krieg"
    doc = nlp_de(sentence, disable=["ner"])
    found = extract_keywords(doc, [])

    occurrences = found.count("brief")
    assert occurrences == 1
|