feat(nlp-service): keyword extraction (POS-filtered, deduped lemmas)

2026-06-07 10:24:35 +02:00
parent 3f74deda8c
commit 702a72d575
2 changed files with 74 additions and 0 deletions
--- a/nlp-service/extractor.py
+++ b/nlp-service/extractor.py
@@ -168,3 +168,30 @@ def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
        return d.isoformat(), None
    # Bare year/date — closed year-range
    return d.isoformat(), _year_end(d).isoformat()
+
+
+# ── Step 4: Keyword extraction ───────────────────────────────────────────────
+
+def extract_keywords(doc, excluded_spans: list) -> list[str]:
+    """Return lowercased lemmas of content words not inside any NER span."""
+    excluded_indices: set[int] = set()
+    for span in excluded_spans:
+        excluded_indices.update(range(span.start, span.end))
+
+    seen: set[str] = set()
+    keywords: list[str] = []
+    for token in doc:
+        if token.i in excluded_indices:
+            continue
+        if token.pos_ not in ("NOUN", "PROPN"):
+            continue
+        if token.is_stop:
+            continue
+        lemma = token.lemma_.lower()
+        if len(lemma) < 3:
+            continue
+        if lemma not in seen:
+            seen.add(lemma)
+            keywords.append(lemma)
+
+    return keywords