From deea34c797ae62efc0f13f05aa6adb9a09417067 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Sun, 7 Jun 2026 10:21:16 +0200
Subject: [PATCH] feat(nlp-service): NER person name extraction

---
 nlp-service/extractor.py      | 22 ++++++++++++++
 nlp-service/test_extractor.py | 70 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/nlp-service/extractor.py b/nlp-service/extractor.py
index ce752bef..ca9c197e 100644
--- a/nlp-service/extractor.py
+++ b/nlp-service/extractor.py
@@ -31,3 +31,25 @@ def get_nlp(lang: str) -> Language:
 def load_all_models() -> None:
     for lang in _MODEL_NAMES:
         get_nlp(lang)
+
+
+# ── Step 1: Person name extraction ──────────────────────────────────────────
+
+# spaCy's German pipelines label people "PER", whereas the OntoNotes-trained
+# ones (e.g. English) use "PERSON". Accept both so the extractor does not
+# silently return nothing for those pipelines.
+_PERSON_LABELS = frozenset({"PER", "PERSON"})
+
+
+def extract_person_names(doc) -> list[str]:
+    """Return the text of every person entity in ``doc``.
+
+    Args:
+        doc: A spaCy ``Doc``, or any object whose ``ents`` items expose
+            ``label_`` and ``text``.
+
+    Returns:
+        Entity texts in the order ``doc.ents`` yields them (left-to-right
+        span order for a spaCy ``Doc``); an empty list if none match.
+    """
+    return [ent.text for ent in doc.ents if ent.label_ in _PERSON_LABELS]
diff --git a/nlp-service/test_extractor.py b/nlp-service/test_extractor.py
index 2ef7c0a6..60890936 100644
--- a/nlp-service/test_extractor.py
+++ b/nlp-service/test_extractor.py
@@ -72,3 +72,73 @@ def test_get_nlp_unknown_lang_raises():
     from extractor import get_nlp
     with pytest.raises(ValueError, match="Unsupported language"):
         get_nlp("fr")
+
+
+# ── Person name extraction ───────────────────────────────────────────────────
+
+def _make_doc_with_ents(nlp, text: str, char_ents: list[tuple[int, int, str]]):
+    """Create a Doc with manually injected entity spans (no NER model needed).
+
+    Args:
+        nlp: spaCy pipeline; only ``make_doc`` is called on it.
+        text: Raw text to tokenize.
+        char_ents: ``(start_char, end_char, label)`` triples, end exclusive.
+
+    Raises:
+        ValueError: If a triple does not line up with token boundaries.
+    """
+    doc = nlp.make_doc(text)
+    spans = []
+    for start, end, label in char_ents:
+        span = doc.char_span(start, end, label=label)
+        if span is None:
+            # char_span() returns None for offsets that cut through a token;
+            # dropping such a span would let a mistyped offset pass unnoticed.
+            raise ValueError(
+                f"chars {start}..{end} ({label}) are not token-aligned in {text!r}"
+            )
+        spans.append(span)
+    doc.ents = spans
+    return doc
+
+
+def test_extract_person_names_two_persons(nlp_de):
+    from extractor import extract_person_names
+    # "Briefe von Opa Hermann an Marie"
+    # "Opa Hermann" = chars 11..22, "Marie" = chars 26..31
+    doc = _make_doc_with_ents(nlp_de, "Briefe von Opa Hermann an Marie", [
+        (11, 22, "PER"),
+        (26, 31, "PER"),
+    ])
+    assert extract_person_names(doc) == ["Opa Hermann", "Marie"]
+
+
+def test_extract_person_names_accepts_person_label(nlp_de):
+    from extractor import extract_person_names
+    # OntoNotes-style pipelines label people "PERSON" instead of "PER"
+    doc = _make_doc_with_ents(nlp_de, "Marie von Opa", [(0, 5, "PERSON")])
+    assert extract_person_names(doc) == ["Marie"]
+
+
+def test_extract_person_names_preserves_order(nlp_de):
+    """Names are returned in the order their spans occur in the text."""
+    from extractor import extract_person_names
+    # Character offsets: "Marie" -> 0..5, "Opa" -> 10..13
+    per_spans = [(0, 5, "PER"), (10, 13, "PER")]
+    doc = _make_doc_with_ents(nlp_de, "Marie von Opa", per_spans)
+    assert extract_person_names(doc) == ["Marie", "Opa"]
+
+
+def test_extract_person_names_empty(nlp_de):
+    """A Doc without any entities yields an empty list."""
+    from extractor import extract_person_names
+    no_ents = _make_doc_with_ents(nlp_de, "Briefe aus dem Krieg", [])
+    assert extract_person_names(no_ents) == []
+
+
+def test_extract_person_names_ignores_non_per(nlp_de):
+    """An entity labelled DATE must not show up among the person names."""
+    from extractor import extract_person_names
+    # "1920" occupies chars 7..11 and is tagged DATE, not PER
+    date_only = _make_doc_with_ents(nlp_de, "Briefe 1920", [(7, 11, "DATE")])
+    assert extract_person_names(date_only) == []