feat(nlp-service): full extract() pipeline — assembles all steps
Also adds a regex year fallback in extract_dates() for the de/es spaCy small models, which don't tag bare 4-digit years as DATE entities, and widens the direction-token window to the two tokens preceding the date span so that multi-word prepositions such as Spanish "antes de" are handled.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -132,9 +132,28 @@ def _year_end(d: date) -> date:
|
||||
return d
|
||||
|
||||
|
||||
def _find_year_spans(doc) -> list:
    """Fallback: collect tokens that look like years when NER finds no dates.

    Every token of ``doc`` whose text matches ``_YEAR_RE`` and whose integer
    value lies in the range 1000–2999 (both ends inclusive) is kept.

    Args:
        doc: A tokenised document that can be iterated token by token (each
            token exposing ``.text`` and ``.i``) and sliced by token index —
            presumably a spaCy ``Doc``.

    Returns:
        A list of single-token slices ``doc[i : i + 1]`` in document order.
        The slices are plain slices of ``doc`` and carry no entity label of
        their own; ``extract_dates`` uses them in place of DATE entities.
    """
    # NOTE(review): int() assumes _YEAR_RE only accepts all-digit tokens;
    # the pattern is defined elsewhere in this module — confirm.
    return [
        doc[token.i : token.i + 1]
        for token in doc
        if _YEAR_RE.match(token.text) and 1000 <= int(token.text) <= 2999
    ]
|
||||
|
||||
|
||||
def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
|
||||
"""Return (date_from, date_to) as ISO strings or None."""
|
||||
date_spans = [ent for ent in doc.ents if ent.label_ == "DATE"]
|
||||
|
||||
# Fallback: some spaCy small models (de, es) don't tag bare years as DATE
|
||||
if not date_spans:
|
||||
date_spans = _find_year_spans(doc)
|
||||
|
||||
if not date_spans:
|
||||
return None, None
|
||||
|
||||
@@ -160,11 +179,16 @@ def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
|
||||
if not d:
|
||||
return None, None
|
||||
|
||||
prev_lower = doc[span.start - 1].lower_ if span.start > 0 else ""
|
||||
# Check up to 2 tokens before the date span to handle multi-word prepositions
|
||||
# like Spanish "antes de 1920" where the keyword is 2 tokens back.
|
||||
prev_tokens = [
|
||||
doc[span.start - i].lower_
|
||||
for i in range(1, min(3, span.start + 1))
|
||||
]
|
||||
|
||||
if prev_lower in before_tokens:
|
||||
if any(t in before_tokens for t in prev_tokens):
|
||||
return None, _year_end(d).isoformat()
|
||||
if prev_lower in after_tokens:
|
||||
if any(t in after_tokens for t in prev_tokens):
|
||||
return d.isoformat(), None
|
||||
# Bare year/date — closed year-range
|
||||
return d.isoformat(), _year_end(d).isoformat()
|
||||
@@ -195,3 +219,27 @@ def extract_keywords(doc, excluded_spans: list) -> list[str]:
|
||||
keywords.append(lemma)
|
||||
|
||||
return keywords
|
||||
|
||||
|
||||
# ── Step 5: Assembly ─────────────────────────────────────────────────────────
|
||||
|
||||
def extract(query: str, lang: str) -> ParseResponse:
    """Run the full NLP pipeline on ``query`` and return a ParseResponse.

    The query is parsed once with the pipeline returned by ``get_nlp(lang)``;
    the resulting document is then handed to each extraction step and the
    individual results are assembled into the response.

    Args:
        query: The raw user query; echoed back unchanged as ``rawQuery``.
        lang: Language code passed to ``get_nlp`` and to the
            language-dependent steps (role detection, date extraction).

    Returns:
        A ``ParseResponse`` carrying person names, person role, the date
        range bounds, keywords and the original query.
    """
    nlp = get_nlp(lang)
    doc = nlp(query)

    # spaCy's de/es news pipelines label people "PER", whereas the English
    # pipelines use "PERSON"; accept both so role detection also receives
    # person spans for English queries.
    per_spans = [ent for ent in doc.ents if ent.label_ in ("PER", "PERSON")]

    person_names = extract_person_names(doc)
    person_role = detect_person_role(doc, per_spans, lang)
    date_from, date_to = extract_dates(doc, lang)
    # All recognised entities are passed as the spans to exclude from keywords.
    keywords = extract_keywords(doc, list(doc.ents))

    return ParseResponse(
        personNames=person_names,
        personRole=person_role,
        dateFrom=date_from,
        dateTo=date_to,
        keywords=keywords,
        rawQuery=query,
    )
|
||||
|
||||
@@ -298,3 +298,53 @@ def test_keywords_deduplicates(nlp_de):
|
||||
doc = nlp_de("Brief Brief Krieg", disable=["ner"])
|
||||
keywords = extract_keywords(doc, [])
|
||||
assert keywords.count("brief") == 1
|
||||
|
||||
|
||||
# ── Full extract() pipeline ──────────────────────────────────────────────────
|
||||
|
||||
def test_extract_dates_de():
    """German "vor <year>": no lower bound, upper bound is Dec 31 of the year."""
    from extractor import extract

    query = "Briefe vor 1920"
    parsed = extract(query, "de")

    # Date range: open start, closed at the end of 1920.
    assert parsed.dateFrom is None
    assert parsed.dateTo == "1920-12-31"
    # The remaining fields keep their neutral values for a person-free query.
    assert parsed.rawQuery == query
    assert parsed.personNames == []
    assert parsed.personRole == "any"
|
||||
|
||||
|
||||
def test_extract_keywords_from_topic_de():
    """A German topic query yields the topic keyword and no date bounds."""
    from extractor import extract

    parsed = extract("Briefe aus dem Krieg", "de")

    assert "krieg" in parsed.keywords
    # No year appears in the query, so both bounds stay unset.
    assert parsed.dateFrom is None
    assert parsed.dateTo is None
|
||||
|
||||
|
||||
def test_extract_dates_en():
    """English "before <year>": upper bound only, at the end of that year."""
    from extractor import extract

    parsed = extract("letters before 1920", "en")

    assert parsed.dateTo == "1920-12-31"
    assert parsed.dateFrom is None
|
||||
|
||||
|
||||
def test_extract_dates_es():
    """Spanish "antes de <year>": the keyword sits two tokens before the year."""
    from extractor import extract

    parsed = extract("cartas antes de 1920", "es")

    assert parsed.dateTo == "1920-12-31"
    assert parsed.dateFrom is None
|
||||
|
||||
|
||||
def test_extract_rawquery_echoed():
    """The response returns the original query string unchanged."""
    from extractor import extract

    original = "Texte über Weihnachten"
    parsed = extract(original, "de")

    assert parsed.rawQuery == original
|
||||
|
||||
|
||||
def test_extract_response_fields_are_complete():
    """Every response field is populated with a value of the expected kind."""
    from extractor import extract

    query = "Briefe 1900"
    parsed = extract(query, "de")

    assert isinstance(parsed.personNames, list)
    assert parsed.personRole in ("sender", "receiver", "any")
    assert isinstance(parsed.keywords, list)
    assert parsed.rawQuery == query
|
||||
|
||||
Reference in New Issue
Block a user