feat(nlp-service): keyword extraction (POS-filtered, deduped lemmas)
This commit is contained in:
@@ -168,3 +168,30 @@ def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
|
||||
return d.isoformat(), None
|
||||
# Bare year/date — closed year-range
|
||||
return d.isoformat(), _year_end(d).isoformat()
|
||||
|
||||
|
||||
# ── Step 4: Keyword extraction ───────────────────────────────────────────────
|
||||
|
||||
def extract_keywords(doc, excluded_spans: list) -> list[str]:
|
||||
"""Return lowercased lemmas of content words not inside any NER span."""
|
||||
excluded_indices: set[int] = set()
|
||||
for span in excluded_spans:
|
||||
excluded_indices.update(range(span.start, span.end))
|
||||
|
||||
seen: set[str] = set()
|
||||
keywords: list[str] = []
|
||||
for token in doc:
|
||||
if token.i in excluded_indices:
|
||||
continue
|
||||
if token.pos_ not in ("NOUN", "PROPN"):
|
||||
continue
|
||||
if token.is_stop:
|
||||
continue
|
||||
lemma = token.lemma_.lower()
|
||||
if len(lemma) < 3:
|
||||
continue
|
||||
if lemma not in seen:
|
||||
seen.add(lemma)
|
||||
keywords.append(lemma)
|
||||
|
||||
return keywords
|
||||
|
||||
Reference in New Issue
Block a user