From cc4c81e218f7ad52ba66b47b30911968f302b3bf Mon Sep 17 00:00:00 2001
From: Marcel
Date: Sun, 7 Jun 2026 10:28:40 +0200
Subject: [PATCH] =?UTF-8?q?feat(nlp-service):=20full=20extract()=20pipelin?=
 =?UTF-8?q?e=20=E2=80=94=20assembles=20all=20steps?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Also adds regex year-fallback in extract_dates() for de/es spaCy small
models that don't tag bare 4-digit years as DATE entities, and widens
the direction-token window to 2 tokens back to handle Spanish "antes de".

Co-Authored-By: Claude Sonnet 4.6
---
 nlp-service/extractor.py      | 54 +++++++++++++++++++++++++++++++++--
 nlp-service/test_extractor.py | 50 ++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/nlp-service/extractor.py b/nlp-service/extractor.py
index 5d9b230b..5d6bb629 100644
--- a/nlp-service/extractor.py
+++ b/nlp-service/extractor.py
@@ -132,9 +132,28 @@ def _year_end(d: date) -> date:
     return d
 
 
+def _find_year_spans(doc) -> list:
+    """Fallback: find tokens that look like 4-digit years (1000–2999) when NER
+    produces no DATE entities. Returns a list of single-token spaCy Span
+    objects; they are plain doc slices and carry no entity label."""
+    spans = []
+    for token in doc:
+        if _YEAR_RE.match(token.text):
+            year = int(token.text)
+            if 1000 <= year < 3000:
+                span = doc[token.i : token.i + 1]
+                spans.append(span)
+    return spans
+
+
 def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
     """Return (date_from, date_to) as ISO strings or None."""
     date_spans = [ent for ent in doc.ents if ent.label_ == "DATE"]
+
+    # Fallback: some spaCy small models (de, es) don't tag bare years as DATE
+    if not date_spans:
+        date_spans = _find_year_spans(doc)
+
     if not date_spans:
         return None, None
 
@@ -160,11 +179,16 @@ def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
     if not d:
         return None, None
 
-    prev_lower = doc[span.start - 1].lower_ if span.start > 0 else ""
+    # Check up to 2 tokens before the date span to handle multi-word prepositions
+    # like Spanish "antes de 1920" where the keyword is 2 tokens back.
+    prev_tokens = [
+        doc[span.start - i].lower_
+        for i in range(1, min(3, span.start + 1))
+    ]
 
-    if prev_lower in before_tokens:
+    if any(t in before_tokens for t in prev_tokens):
         return None, _year_end(d).isoformat()
-    if prev_lower in after_tokens:
+    if any(t in after_tokens for t in prev_tokens):
         return d.isoformat(), None
     # Bare year/date — closed year-range
     return d.isoformat(), _year_end(d).isoformat()
@@ -195,3 +219,27 @@ def extract_keywords(doc, excluded_spans: list) -> list[str]:
             keywords.append(lemma)
 
     return keywords
+
+
+# ── Step 5: Assembly ─────────────────────────────────────────────────────────
+
+def extract(query: str, lang: str) -> ParseResponse:
+    """Run the full NLP pipeline and return a ParseResponse."""
+    nlp = get_nlp(lang)
+    doc = nlp(query)
+
+    # spaCy's English pipelines tag people as PERSON; the de/es ones use PER.
+    per_spans = [ent for ent in doc.ents if ent.label_ in ("PER", "PERSON")]
+    person_names = extract_person_names(doc)
+    person_role = detect_person_role(doc, per_spans, lang)
+    date_from, date_to = extract_dates(doc, lang)
+    keywords = extract_keywords(doc, list(doc.ents))
+
+    return ParseResponse(
+        personNames=person_names,
+        personRole=person_role,
+        dateFrom=date_from,
+        dateTo=date_to,
+        keywords=keywords,
+        rawQuery=query,
+    )
diff --git a/nlp-service/test_extractor.py b/nlp-service/test_extractor.py
index 8d623178..fb117884 100644
--- a/nlp-service/test_extractor.py
+++ b/nlp-service/test_extractor.py
@@ -298,3 +298,53 @@ def test_keywords_deduplicates(nlp_de):
     doc = nlp_de("Brief Brief Krieg", disable=["ner"])
     keywords = extract_keywords(doc, [])
     assert keywords.count("brief") == 1
+
+
+# ── Full extract() pipeline ──────────────────────────────────────────────────
+
+def test_extract_dates_de():
+    from extractor import extract
+    result = extract("Briefe vor 1920", "de")
+    assert result.dateFrom is None
+    assert result.dateTo == "1920-12-31"
+    assert result.rawQuery == "Briefe vor 1920"
+    assert result.personNames == []
+    assert result.personRole == "any"
+
+
+def test_extract_keywords_from_topic_de():
+    from extractor import extract
+    result = extract("Briefe aus dem Krieg", "de")
+    assert "krieg" in result.keywords
+    assert result.dateFrom is None
+    assert result.dateTo is None
+
+
+def test_extract_dates_en():
+    from extractor import extract
+    result = extract("letters before 1920", "en")
+    assert result.dateTo == "1920-12-31"
+    assert result.dateFrom is None
+
+
+def test_extract_dates_es():
+    from extractor import extract
+    result = extract("cartas antes de 1920", "es")
+    assert result.dateTo == "1920-12-31"
+    assert result.dateFrom is None
+
+
+def test_extract_rawquery_echoed():
+    from extractor import extract
+    q = "Texte über Weihnachten"
+    result = extract(q, "de")
+    assert result.rawQuery == q
+
+
+def test_extract_response_fields_are_complete():
+    from extractor import extract
+    result = extract("Briefe 1900", "de")
+    assert isinstance(result.personNames, list)
+    assert result.personRole in ("sender", "receiver", "any")
+    assert isinstance(result.keywords, list)
+    assert result.rawQuery == "Briefe 1900"