feat(nlp-service): full extract() pipeline — assembles all steps

Also adds regex year-fallback in extract_dates() for de/es spaCy small
models that don't tag bare 4-digit years as DATE entities, and widens
the direction-token window to 2 tokens back to handle Spanish "antes de".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-06-07 10:28:40 +02:00
parent 55f419d20f
commit cc4c81e218
2 changed files with 101 additions and 3 deletions

View File

@@ -132,9 +132,28 @@ def _year_end(d: date) -> date:
return d
def _find_year_spans(doc) -> list:
"""Fallback: find tokens that look like 4-digit years (10002999) when NER
produces no DATE entities. Returns a list of single-token pseudo-spans
(spaCy Span objects) labelled 'DATE'."""
spans = []
for token in doc:
if _YEAR_RE.match(token.text):
year = int(token.text)
if 1000 < year < 3000:
span = doc[token.i : token.i + 1]
spans.append(span)
return spans
def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
"""Return (date_from, date_to) as ISO strings or None."""
date_spans = [ent for ent in doc.ents if ent.label_ == "DATE"]
# Fallback: some spaCy small models (de, es) don't tag bare years as DATE
if not date_spans:
date_spans = _find_year_spans(doc)
if not date_spans:
return None, None
@@ -160,11 +179,16 @@ def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
if not d:
return None, None
prev_lower = doc[span.start - 1].lower_ if span.start > 0 else ""
# Check up to 2 tokens before the date span to handle multi-word prepositions
# like Spanish "antes de 1920" where the keyword is 2 tokens back.
prev_tokens = [
doc[span.start - i].lower_
for i in range(1, min(3, span.start + 1))
]
if prev_lower in before_tokens:
if any(t in before_tokens for t in prev_tokens):
return None, _year_end(d).isoformat()
if prev_lower in after_tokens:
if any(t in after_tokens for t in prev_tokens):
return d.isoformat(), None
# Bare year/date — closed year-range
return d.isoformat(), _year_end(d).isoformat()
@@ -195,3 +219,27 @@ def extract_keywords(doc, excluded_spans: list) -> list[str]:
keywords.append(lemma)
return keywords
# ── Step 5: Assembly ─────────────────────────────────────────────────────────
def extract(query: str, lang: str) -> ParseResponse:
"""Run the full NLP pipeline and return a ParseResponse."""
nlp = get_nlp(lang)
doc = nlp(query)
per_spans = [ent for ent in doc.ents if ent.label_ == "PER"]
person_names = extract_person_names(doc)
person_role = detect_person_role(doc, per_spans, lang)
date_from, date_to = extract_dates(doc, lang)
keywords = extract_keywords(doc, list(doc.ents))
return ParseResponse(
personNames=person_names,
personRole=person_role,
dateFrom=date_from,
dateTo=date_to,
keywords=keywords,
rawQuery=query,
)