feat(nlp-service): full extract() pipeline — assembles all steps
Also adds a regex year fallback in extract_dates() for the de/es spaCy small models, which don't tag bare 4-digit years as DATE entities, and widens the direction-token window to two tokens back to handle Spanish "antes de".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -298,3 +298,53 @@ def test_keywords_deduplicates(nlp_de):
|
||||
doc = nlp_de("Brief Brief Krieg", disable=["ner"])
|
||||
keywords = extract_keywords(doc, [])
|
||||
assert keywords.count("brief") == 1
|
||||
|
||||
|
||||
# ── Full extract() pipeline ──────────────────────────────────────────────────
|
||||
|
||||
def test_extract_dates_de():
    """Check that a German "vor <year>" query yields only an upper date bound."""
    from extractor import extract

    query = "Briefe vor 1920"
    parsed = extract(query, "de")

    # Only the upper bound is expected, pinned to the last day of the year.
    assert parsed.dateFrom is None
    assert parsed.dateTo == "1920-12-31"
    # The original query text must come back unchanged.
    assert parsed.rawQuery == query
    # The query names no person, so the person fields keep these values.
    assert parsed.personNames == []
    assert parsed.personRole == "any"
|
||||
|
||||
|
||||
def test_extract_keywords_from_topic_de():
    """Check that a German topic query yields a lowercase keyword and no dates."""
    from extractor import extract

    parsed = extract("Briefe aus dem Krieg", "de")

    # The keyword is expected in lowercase form.
    assert "krieg" in parsed.keywords
    # The query contains no year, so neither date bound should be set.
    assert parsed.dateFrom is None
    assert parsed.dateTo is None
|
||||
|
||||
|
||||
def test_extract_dates_en():
    """Check that an English "before <year>" query yields only an upper date bound."""
    from extractor import extract

    parsed = extract("letters before 1920", "en")

    # The upper bound is the last day of the named year; the lower stays unset.
    assert parsed.dateTo == "1920-12-31"
    assert parsed.dateFrom is None
|
||||
|
||||
|
||||
def test_extract_dates_es():
    """Check that a Spanish "antes de <year>" query yields only an upper date bound."""
    from extractor import extract

    parsed = extract("cartas antes de 1920", "es")

    # "antes de" is a two-token direction phrase in front of the year.
    assert parsed.dateTo == "1920-12-31"
    assert parsed.dateFrom is None
|
||||
|
||||
|
||||
def test_extract_rawquery_echoed():
    """Check that extract() returns the query text verbatim in rawQuery."""
    from extractor import extract

    # The umlaut makes this a check on non-ASCII text as well.
    original_text = "Texte über Weihnachten"
    parsed = extract(original_text, "de")

    assert parsed.rawQuery == original_text
|
||||
|
||||
|
||||
def test_extract_response_fields_are_complete():
    """Check the types and allowed values of the extract() response fields."""
    from extractor import extract

    query = "Briefe 1900"
    allowed_roles = ("sender", "receiver", "any")

    parsed = extract(query, "de")

    # Collection fields must be real lists.
    assert isinstance(parsed.personNames, list)
    # The role must be one of the three known values.
    assert parsed.personRole in allowed_roles
    assert isinstance(parsed.keywords, list)
    assert parsed.rawQuery == query
|
||||
|
||||
Reference in New Issue
Block a user