2026-06-08 10:57:01 +02:00
2 changed files with 52 additions and 0 deletions
--- a/nlp-service/extractor.py
+++ b/nlp-service/extractor.py
@@ -31,3 +31,10 @@ def get_nlp(lang: str) -> Language:
 def load_all_models() -> None:
    for lang in _MODEL_NAMES:
        get_nlp(lang)
+
+
+# ── Step 1: Person name extraction ──────────────────────────────────────────
+
+def extract_person_names(doc) -> list[str]:
+    """Return person-entity texts in ``doc.ents`` order; accepts "PER" (WikiNER-style) and "PERSON" (OntoNotes-style) labels."""
+    return [ent.text for ent in doc.ents if ent.label_ in ("PER", "PERSON")]
--- a/nlp-service/test_extractor.py
+++ b/nlp-service/test_extractor.py
@@ -72,3 +72,48 @@ def test_get_nlp_unknown_lang_raises():
    from extractor import get_nlp
    with pytest.raises(ValueError, match="Unsupported language"):
        get_nlp("fr")
+
+
+# ── Person name extraction ───────────────────────────────────────────────────
+
+def _make_doc_with_ents(nlp, text: str, char_ents: list[tuple[int, int, str]]):
+    """Build a Doc via ``nlp.make_doc`` and inject entity spans by hand (no NER model needed); spans for which ``char_span`` returns None are dropped."""
+    doc = nlp.make_doc(text)
+    candidates = (doc.char_span(start, end, label=label) for start, end, label in char_ents)
+    doc.ents = [span for span in candidates if span is not None]
+    return doc
+
+
+def test_extract_person_names_two_persons(nlp_de):
+    """Two PER spans in one sentence are both returned, in text order."""
+    from extractor import extract_person_names
+
+    sentence = "Briefe von Opa Hermann an Marie"
+    # Character offsets: "Opa Hermann" = [11, 22), "Marie" = [26, 31).
+    person_spans = [(11, 22, "PER"), (26, 31, "PER")]
+    doc = _make_doc_with_ents(nlp_de, sentence, person_spans)
+    assert extract_person_names(doc) == ["Opa Hermann", "Marie"]
+
+
+def test_extract_person_names_preserves_order(nlp_de):
+    """Names come back in the order in which they occur in the text."""
+    from extractor import extract_person_names
+
+    sentence = "Marie von Opa"
+    # "Marie" = [0, 5) precedes "Opa" = [10, 13) in the text.
+    person_spans = [(0, 5, "PER"), (10, 13, "PER")]
+    doc = _make_doc_with_ents(nlp_de, sentence, person_spans)
+    assert extract_person_names(doc) == ["Marie", "Opa"]
+
+
+def test_extract_person_names_empty(nlp_de):
+    """A Doc without any entities yields an empty list."""
+    from extractor import extract_person_names
+    assert extract_person_names(_make_doc_with_ents(nlp_de, "Briefe aus dem Krieg", [])) == []
+
+
+def test_extract_person_names_ignores_non_per(nlp_de):
+    """An entity with a non-person label (here DATE) is left out of the result."""
+    from extractor import extract_person_names
+    date_only = _make_doc_with_ents(nlp_de, "Briefe 1920", [(7, 11, "DATE")])  # "1920" = [7, 11)
+    assert extract_person_names(date_only) == []