feat(nlp-service): keyword extraction (POS-filtered, deduped lemmas)
This commit is contained in:
@@ -251,3 +251,50 @@ def test_date_after_english(nlp_en):
|
||||
date_from, date_to = extract_dates(doc, "en")
|
||||
assert date_from == "1900-01-01"
|
||||
assert date_to is None
|
||||
|
||||
|
||||
# ── Keyword extraction ───────────────────────────────────────────────────────
|
||||
|
||||
def test_keywords_extracts_nouns(nlp_de):
    """Check that the nouns of a German phrase come back as lowercase lemmas."""
    from extractor import extract_keywords

    # The pipeline supplies the POS tags; NER is disabled for this call and
    # an empty entity list is handed to the extractor.
    parsed = nlp_de("Briefe aus dem Krieg", disable=["ner"])
    found = extract_keywords(parsed, [])

    # "Briefe" and "Krieg" are expected in lemmatised, lowercased form.
    for lemma in ("brief", "krieg"):
        assert lemma in found
|
||||
|
||||
|
||||
def test_keywords_excludes_stopwords(nlp_de):
    """Check that the article "dem" is not reported as a keyword."""
    from extractor import extract_keywords

    parsed = nlp_de("Briefe aus dem Krieg", disable=["ner"])
    found = extract_keywords(parsed, [])

    # "dem" is a stopword article and has to be filtered out.
    assert "dem" not in found
|
||||
|
||||
|
||||
def test_keywords_excludes_per_ner_spans(nlp_de):
    """Check that a token covered by a PER entity span is not a keyword.

    The full pipeline is run for POS tags, then the document's entities are
    replaced by a single hand-built PER span over "Hermann" so the test does
    not depend on what the NER model happens to predict.
    """
    from extractor import extract_keywords

    text = "Briefe von Hermann"
    name = "Hermann"
    doc = nlp_de(text)

    # Derive the character offsets from the text instead of hard-coding them
    # (for this sentence they are 11 and 18).
    start = text.index(name)
    per_span = doc.char_span(start, start + len(name), label="PER")

    # char_span() yields None when the offsets do not line up with token
    # boundaries. Fail loudly in that case: silently skipping the injection
    # would let the test pass without exercising the PER exclusion at all.
    assert per_span is not None, "could not build a PER span over 'Hermann'"
    doc.ents = [per_span]

    keywords = extract_keywords(doc, list(doc.ents))
    assert "hermann" not in keywords
|
||||
|
||||
|
||||
def test_keywords_excludes_short_lemmas(nlp_de):
    """Check that the two-character word "an" is not reported as a keyword."""
    from extractor import extract_keywords

    parsed = nlp_de("Briefe an ihn", disable=["ner"])
    found = extract_keywords(parsed, [])

    # "an" has only two characters. "ihn" (three characters, a pronoun) is
    # part of the input but is not asserted on here.
    # NOTE(review): "an" may also be dropped as a stopword/preposition, in
    # which case this does not isolate the length rule -- confirm.
    assert "an" not in found
|
||||
|
||||
|
||||
def test_keywords_deduplicates(nlp_de):
    """Check that a word repeated in the text yields its lemma only once."""
    from extractor import extract_keywords

    parsed = nlp_de("Brief Brief Krieg", disable=["ner"])
    # "Brief" occurs twice in the input; the result must list it a single time.
    occurrences = extract_keywords(parsed, []).count("brief")
    assert occurrences == 1
|
||||
|
||||
Reference in New Issue
Block a user