feat(nlp-service): date range extraction with direction detection

This commit is contained in:
Marcel
2026-06-07 10:23:33 +02:00
parent 0ab2e2a743
commit 53f6dcbfed
2 changed files with 151 additions and 0 deletions

View File

@@ -86,3 +86,85 @@ def detect_person_role(doc, per_spans: list, lang: str) -> str:
return "receiver"
return "any"
# ── Step 3: Date parsing ─────────────────────────────────────────────────────
# A string made of exactly four digits, i.e. a bare year such as "1990".
_YEAR_RE = re.compile(r"^\d{4}$")

# Per-language trigger words, keyed by language code. They are compared
# against lower-cased tokens, so every entry is lower-case.

# Words meaning "before <date>".
_DATE_BEFORE: dict[str, frozenset[str]] = dict(
    de=frozenset(("vor",)),
    en=frozenset(("before",)),
    es=frozenset(("antes",)),
)

# Words meaning "after <date>" (Spanish with and without the accent).
_DATE_AFTER: dict[str, frozenset[str]] = dict(
    de=frozenset(("nach",)),
    en=frozenset(("after",)),
    es=frozenset(("después", "despues")),
)

# Words introducing a "between <date> and <date>" range.
_DATE_BETWEEN: dict[str, frozenset[str]] = dict(
    de=frozenset(("zwischen",)),
    en=frozenset(("between",)),
    es=frozenset(("entre",)),
)
def _parse_date_text(text: str, lang: str) -> date | None:
    """Turn the text of a date mention into a ``date``, or None if unparseable.

    A bare four-digit year strictly between 1000 and 3000 maps to 1 January
    of that year. Everything else (including four-digit strings outside that
    range) is handed to ``dateparser`` restricted to the language *lang*.
    """
    candidate = text.strip()

    # Fast path: plain years are resolved without calling dateparser.
    if _YEAR_RE.match(candidate):
        as_year = int(candidate)
        if 1000 < as_year < 3000:
            return date(as_year, 1, 1)

    # Settings request the first day of the month when no day is given and a
    # timezone-naive result.
    parser_settings = {
        "PREFER_DAY_OF_MONTH": "first",
        "RETURN_AS_TIMEZONE_AWARE": False,
    }
    result = dateparser.parse(candidate, languages=[lang], settings=parser_settings)
    if not result:
        return None
    return result.date()
def _year_end(d: date) -> date:
    """Map a 1 January date to 31 December of the same year; pass others through.

    A date falling on 1 January is treated as a year-only value and widened
    to the last day of that year. Only month and day are inspected, so an
    explicitly written 1 January is widened as well.
    """
    falls_on_jan_first = (d.month, d.day) == (1, 1)
    return date(d.year, 12, 31) if falls_on_jan_first else d
# Function words that may sit between a direction word and the DATE span,
# e.g. "antes *de* 1990", "vor *dem Jahr* 1990", "before *the year* 1990".
_DATE_CONNECTORS: dict[str, frozenset[str]] = {
    "de": frozenset({"dem", "den", "der", "jahr", "jahre"}),
    "en": frozenset({"the", "year"}),
    "es": frozenset({"de", "del", "el", "año"}),
}
# Upper bound on how many connector tokens are skipped when looking backwards.
_MAX_CONNECTOR_SKIP = 2


def _direction_token(doc, start: int, lang: str) -> str:
    """Return the lower-cased token governing the span that begins at *start*.

    Walks backwards from the token just before *start*, skipping at most
    ``_MAX_CONNECTOR_SKIP`` connector words for *lang*. Returns "" when the
    walk runs off the beginning of the document.
    """
    connectors = _DATE_CONNECTORS.get(lang, frozenset())
    idx = start - 1
    skipped = 0
    while idx >= 0 and skipped < _MAX_CONNECTOR_SKIP and doc[idx].lower_ in connectors:
        idx -= 1
        skipped += 1
    return doc[idx].lower_ if idx >= 0 else ""


def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
    """Return (date_from, date_to) as ISO strings or None.

    Args:
        doc: Parsed document exposing ``ents`` (spans with ``label_``,
            ``text`` and ``start``) and indexable tokens with ``lower_``
            (presumably a spaCy ``Doc`` — confirm against the caller).
        lang: Language code used to pick the trigger-word tables.

    Returns:
        ``(None, None)`` when there is no DATE entity or the first one cannot
        be parsed. Otherwise:

        * a "between" word anywhere in the document plus two parseable DATE
          spans -> ``(earlier, later)``, the later one widened to year end
          if it falls on 1 January;
        * "before <date>" -> ``(None, date_to)``;
        * "after <date>" -> ``(date_from, None)``;
        * a bare date -> ``(date, year_end(date))``.

    Raises:
        KeyError: if *lang* has no entry in the trigger-word tables.
    """
    date_spans = [ent for ent in doc.ents if ent.label_ == "DATE"]
    if not date_spans:
        return None, None

    between_tokens = _DATE_BETWEEN[lang]
    before_tokens = _DATE_BEFORE[lang]
    after_tokens = _DATE_AFTER[lang]

    # "zwischen X und Y" / "between X and Y" — two DATE spans form a range
    has_between = any(tok.lower_ in between_tokens for tok in doc)
    if has_between and len(date_spans) >= 2:
        parsed = [
            d
            for d in (_parse_date_text(span.text, lang) for span in date_spans[:2])
            if d
        ]
        # If either of the first two spans fails to parse, fall through and
        # treat the first span as a single date.
        if len(parsed) == 2:
            parsed.sort()
            return parsed[0].isoformat(), _year_end(parsed[1]).isoformat()

    # Single DATE span — use direction token
    span = date_spans[0]
    d = _parse_date_text(span.text, lang)
    if not d:
        return None, None

    # Look past connector words so that "antes de 1990" or "vor dem Jahr 1990"
    # are recognised; checking only the immediately preceding token would see
    # "de" / "Jahr" and miss the direction word.
    direction = _direction_token(doc, span.start, lang)
    if direction in before_tokens:
        return None, _year_end(d).isoformat()
    if direction in after_tokens:
        return d.isoformat(), None

    # Bare year/date — closed year-range
    return d.isoformat(), _year_end(d).isoformat()