# ── nlp-service/extractor.py (appended after extract_dates) ──────────────────


# ── Step 4: Keyword extraction ───────────────────────────────────────────────

def extract_keywords(doc, excluded_spans: list) -> list[str]:
    """Return lowercased lemmas of content words not inside any NER span.

    Args:
        doc: Iterable of tokens. Each token must expose ``i`` (its index),
            ``pos_``, ``is_stop`` and ``lemma_``. Presumably a spaCy ``Doc``;
            confirm against the callers.
        excluded_spans: Spans whose tokens are ignored. Each span must expose
            ``start`` and ``end``; every index in ``range(start, end)`` is
            excluded (``end`` itself is not).

    Returns:
        Unique lemmas in order of first appearance. A token contributes its
        lemma only if it is outside every excluded span, is tagged ``NOUN``
        or ``PROPN``, is not a stop word, and its lowercased lemma has at
        least three characters.
    """
    # Flatten all spans into one index set so the per-token check is O(1).
    excluded_indices: set[int] = set()
    for span in excluded_spans:
        excluded_indices.update(range(span.start, span.end))

    # `seen` gives O(1) duplicate detection; `keywords` preserves order.
    seen: set[str] = set()
    keywords: list[str] = []
    for token in doc:
        if token.i in excluded_indices:
            continue
        if token.pos_ not in ("NOUN", "PROPN"):
            continue
        if token.is_stop:
            continue
        lemma = token.lemma_.lower()
        if len(lemma) < 3:
            continue
        if lemma not in seen:
            seen.add(lemma)
            keywords.append(lemma)

    return keywords


# ── nlp-service/test_extractor.py (appended after test_date_after_english) ───


# ── Keyword extraction ───────────────────────────────────────────────────────

def test_keywords_extracts_nouns(nlp_de):
    from extractor import extract_keywords
    # Use real NLP for POS tags; disable NER to avoid interference
    doc = nlp_de("Briefe aus dem Krieg", disable=["ner"])
    keywords = extract_keywords(doc, [])
    # "Brief" (NOUN) and "Krieg" (NOUN) should appear as lemmas
    assert "brief" in keywords
    assert "krieg" in keywords


def test_keywords_excludes_stopwords(nlp_de):
    from extractor import extract_keywords
    doc = nlp_de("Briefe aus dem Krieg", disable=["ner"])
    keywords = extract_keywords(doc, [])
    # "dem" is a stopword article — must not appear
    # NOTE(review): extract_keywords checks the POS tag before is_stop, and
    # "dem" is presumably tagged DET, so this likely passes via the POS filter
    # and does not isolate the stop-word check — confirm and consider a
    # noun that is also a stop word.
    assert "dem" not in keywords


def test_keywords_excludes_per_ner_spans(nlp_de):
    from extractor import extract_keywords
    # Run full NLP for POS tags, then inject a PER span over "Hermann".
    # Character offsets in "Briefe von Hermann":
    #   "Briefe" = 0..6, "von" = 7..10, "Hermann" = 11..18
    doc = nlp_de("Briefe von Hermann")
    per_span = doc.char_span(11, 18, label="PER")
    # Fail loudly if the span could not be built; otherwise the test would
    # silently run against whatever entities the model happened to predict.
    assert per_span is not None
    doc.ents = [per_span]
    keywords = extract_keywords(doc, list(doc.ents))
    assert "hermann" not in keywords


def test_keywords_excludes_short_lemmas(nlp_de):
    from extractor import extract_keywords
    doc = nlp_de("Briefe an ihn", disable=["ner"])
    keywords = extract_keywords(doc, [])
    # "ihn" is 3 chars but is a stopword pronoun; "an" is 2 chars
    # NOTE(review): "an" is presumably tagged as a preposition, so the POS
    # filter likely removes it before the length check runs — confirm whether
    # this test actually exercises the minimum-length rule.
    assert "an" not in keywords


def test_keywords_deduplicates(nlp_de):
    from extractor import extract_keywords
    doc = nlp_de("Brief Brief Krieg", disable=["ner"])
    keywords = extract_keywords(doc, [])
    assert keywords.count("brief") == 1