feat(nlp-service): date range extraction with direction detection

This commit is contained in:
Marcel
2026-06-07 10:23:33 +02:00
parent 0ab2e2a743
commit 53f6dcbfed
2 changed files with 151 additions and 0 deletions

View File

@@ -182,3 +182,72 @@ def test_role_receiver_to_english(nlp_en):
doc = _make_doc_with_ents(nlp_en, "letters to Marie", [(11, 16, "PER")])
per_spans = list(doc.ents)
assert detect_person_role(doc, per_spans, "en") == "receiver"
# ── Date parsing ─────────────────────────────────────────────────────────────
def test_date_vor_1920(nlp_de):
    """Check the German query "Briefe vor 1920" ("letters before 1920").

    Expects no lower bound and an upper bound on the last day of 1920.
    """
    from extractor import extract_dates

    query = "Briefe vor 1920"
    # The year "1920" occupies chars 11..15 and is tagged as a DATE entity.
    year_span = (11, 15, "DATE")
    doc = _make_doc_with_ents(nlp_de, query, [year_span])

    lower, upper = extract_dates(doc, "de")

    assert lower is None
    assert upper == "1920-12-31"
def test_date_nach_1900(nlp_de):
    """Check the German query "Briefe nach 1900" ("letters after 1900").

    Expects a lower bound on the first day of 1900 and no upper bound.
    """
    from extractor import extract_dates

    query = "Briefe nach 1900"
    # The year "1900" occupies chars 12..16 and is tagged as a DATE entity.
    year_span = (12, 16, "DATE")
    doc = _make_doc_with_ents(nlp_de, query, [year_span])

    lower, upper = extract_dates(doc, "de")

    assert lower == "1900-01-01"
    assert upper is None
def test_date_zwischen_1900_und_1920(nlp_de):
    """Check the German query "zwischen 1900 und 1920" ("between 1900 and 1920").

    Expects a closed range from the first day of 1900 to the last day of 1920.
    """
    from extractor import extract_dates

    query = "zwischen 1900 und 1920"
    # Two DATE entities: "1900" at chars 9..13 and "1920" at chars 18..22.
    first_year = (9, 13, "DATE")
    second_year = (18, 22, "DATE")
    doc = _make_doc_with_ents(nlp_de, query, [first_year, second_year])

    lower, upper = extract_dates(doc, "de")

    assert lower == "1900-01-01"
    assert upper == "1920-12-31"
def test_date_bare_year_makes_range(nlp_de):
    """Check the German query "Briefe 1920", which has no direction word.

    Expects the bare year to expand to the full calendar year 1920.
    """
    from extractor import extract_dates

    query = "Briefe 1920"
    # The year "1920" occupies chars 7..11; nothing precedes it but the noun.
    year_span = (7, 11, "DATE")
    doc = _make_doc_with_ents(nlp_de, query, [year_span])

    lower, upper = extract_dates(doc, "de")

    assert lower == "1920-01-01"
    assert upper == "1920-12-31"
def test_date_no_date_entity(nlp_de):
    """Check a German query whose doc carries no entities at all.

    Expects both bounds to come back as None.
    """
    from extractor import extract_dates

    query = "Briefe von Opa"
    no_entities = []
    doc = _make_doc_with_ents(nlp_de, query, no_entities)

    lower, upper = extract_dates(doc, "de")

    assert lower is None
    assert upper is None
def test_date_before_english(nlp_en):
    """Check the English query "letters before 1920".

    Expects no lower bound and an upper bound on the last day of 1920.
    """
    from extractor import extract_dates

    query = "letters before 1920"
    # The year "1920" occupies chars 15..19 and is tagged as a DATE entity.
    year_span = (15, 19, "DATE")
    doc = _make_doc_with_ents(nlp_en, query, [year_span])

    lower, upper = extract_dates(doc, "en")

    assert lower is None
    assert upper == "1920-12-31"
def test_date_after_english(nlp_en):
    """Check the English query "letters after 1900".

    Expects a lower bound on the first day of 1900 and no upper bound.
    """
    from extractor import extract_dates

    query = "letters after 1900"
    # The year "1900" occupies chars 14..18 and is tagged as a DATE entity.
    year_span = (14, 18, "DATE")
    doc = _make_doc_with_ents(nlp_en, query, [year_span])

    lower, upper = extract_dates(doc, "en")

    assert lower == "1900-01-01"
    assert upper is None