def extract_person_names(doc) -> list[str]:
    """Return PER entity texts in left-to-right span order.

    Args:
        doc: Any object exposing ``ents``, an iterable of entities that each
            carry ``text`` and ``label_`` attributes.

    Returns:
        The ``text`` of every entity labelled ``"PER"``, in the order the
        entities appear in ``doc.ents``.
    """
    return [ent.text for ent in doc.ents if ent.label_ == "PER"]


# ── Step 2: Role detection ───────────────────────────────────────────────────

# Lower-cased prepositions that mark the following person as the sender,
# keyed by language code.
_SENDER_PREPS: dict[str, frozenset[str]] = {
    "de": frozenset({"von", "vom"}),
    "en": frozenset({"from", "by"}),
    "es": frozenset({"de", "por"}),
}

# Lower-cased prepositions that mark the following person as the receiver,
# keyed by language code.
_RECEIVER_PREPS: dict[str, frozenset[str]] = {
    "de": frozenset({"an", "nach", "für"}),
    "en": frozenset({"to", "for"}),
    "es": frozenset({"para", "a"}),
}

# Dependency labels under which a preposition may hang off the person token.
# NOTE(review): presumably these cover the label schemes of the loaded
# models (e.g. UD "case", English "prep", German "mo") — confirm against them.
_PREP_DEPS: frozenset[str] = frozenset({"case", "prep", "mo"})

# Shared empty set used when a language has no preposition table.
_NO_PREPS: frozenset[str] = frozenset()


def _role_for_word(
    word: str, sender: frozenset[str], receiver: frozenset[str]
) -> str:
    """Map a lower-cased word to 'sender', 'receiver', or 'any' (no match)."""
    if word in sender:
        return "sender"
    if word in receiver:
        return "receiver"
    return "any"


def detect_person_role(doc, per_spans: list, lang: str) -> str:
    """Return 'sender', 'receiver', or 'any'.

    Only meaningful for single-PER queries — two-person queries always return
    'any' because Java derives direction from list position. More generally,
    any ``per_spans`` whose length is not exactly one yields 'any'.

    Args:
        doc: Token container indexable by integer position; used to look up
            the token immediately before the span.
        per_spans: The PER spans found in the query. Each span must expose
            ``root`` (a token with ``children``) and ``start`` (token index).
        lang: Language code selecting the preposition tables ('de', 'en',
            'es'). A language without a table yields 'any' instead of raising
            ``KeyError``, matching the result already returned for
            multi-person and empty queries in that language.

    Returns:
        'sender' or 'receiver' when a known preposition is attached to, or
        directly precedes, the single person span; otherwise 'any'.
    """
    if len(per_spans) != 1:
        return "any"

    sender = _SENDER_PREPS.get(lang, _NO_PREPS)
    receiver = _RECEIVER_PREPS.get(lang, _NO_PREPS)
    if not sender and not receiver:
        # Unsupported language: nothing could match, so skip the scan.
        return "any"

    span = per_spans[0]

    # Primary: dependency-tree children of the PER root. The first child
    # that is both a preposition-like dependent and a known word decides.
    for child in span.root.children:
        if child.dep_ in _PREP_DEPS:
            role = _role_for_word(child.lower_, sender, receiver)
            if role != "any":
                return role

    # Fallback: token immediately before the span start. This covers docs
    # where the dependency children do not expose the preposition.
    if span.start > 0:
        return _role_for_word(doc[span.start - 1].lower_, sender, receiver)

    return "any"