feat(nlp-service): date range extraction with direction detection
This commit is contained in:
@@ -86,3 +86,85 @@ def detect_person_role(doc, per_spans: list, lang: str) -> str:
|
||||
return "receiver"
|
||||
|
||||
return "any"
|
||||
|
||||
|
||||
# ── Step 3: Date parsing ─────────────────────────────────────────────────────
|
||||
|
||||
_YEAR_RE = re.compile(r"^\d{4}$")
|
||||
|
||||
_DATE_BEFORE: dict[str, frozenset[str]] = {
|
||||
"de": frozenset({"vor"}),
|
||||
"en": frozenset({"before"}),
|
||||
"es": frozenset({"antes"}),
|
||||
}
|
||||
|
||||
_DATE_AFTER: dict[str, frozenset[str]] = {
|
||||
"de": frozenset({"nach"}),
|
||||
"en": frozenset({"after"}),
|
||||
"es": frozenset({"después", "despues"}),
|
||||
}
|
||||
|
||||
_DATE_BETWEEN: dict[str, frozenset[str]] = {
|
||||
"de": frozenset({"zwischen"}),
|
||||
"en": frozenset({"between"}),
|
||||
"es": frozenset({"entre"}),
|
||||
}
|
||||
|
||||
|
||||
def _parse_date_text(text: str, lang: str) -> date | None:
    """Turn the text of a date expression into a ``date``.

    A bare four-digit year strictly between 1000 and 3000 is resolved
    directly to January 1st of that year. Everything else is handed to
    ``dateparser`` restricted to *lang*.

    Args:
        text: Raw text of the date expression; surrounding whitespace is
            ignored.
        lang: Language code passed to ``dateparser`` as its only language.

    Returns:
        The parsed date, or ``None`` when ``dateparser`` yields no result.
    """
    candidate = text.strip()

    # Fast path: year-only input never reaches dateparser.
    if _YEAR_RE.match(candidate) is not None:
        as_year = int(candidate)
        if 1000 < as_year < 3000:
            return date(as_year, 1, 1)

    # Ask for the first day when the text names no day, and for a naive
    # datetime so only the calendar date is kept.
    parser_settings = {
        "PREFER_DAY_OF_MONTH": "first",
        "RETURN_AS_TIMEZONE_AWARE": False,
    }
    result = dateparser.parse(candidate, languages=[lang], settings=parser_settings)
    if not result:
        return None
    return result.date()
|
||||
|
||||
|
||||
def _year_end(d: date) -> date:
|
||||
"""If d is Jan 1, return Dec 31 of the same year (year-only boundary)."""
|
||||
if d.month == 1 and d.day == 1:
|
||||
return date(d.year, 12, 31)
|
||||
return d
|
||||
|
||||
|
||||
# Filler tokens that may stand between a direction keyword and the DATE span,
# e.g. Spanish "antes de 1985", German "vor dem 1. Mai 1985",
# English "before the 5th of May".
_DATE_CONNECTORS: dict[str, frozenset[str]] = {
    "de": frozenset({"dem"}),
    "en": frozenset({"the"}),
    "es": frozenset({"de", "del"}),
}


def _token_before_date(doc, span, lang: str) -> str:
    """Return the lower-cased token governing *span*, or "" if there is none.

    Looks at the token directly left of the span; if that token is a known
    connector for *lang*, steps one token further left. At most one
    connector is skipped so unrelated words further away are never picked up.
    """
    idx = span.start - 1
    if idx >= 0 and doc[idx].lower_ in _DATE_CONNECTORS.get(lang, frozenset()):
        idx -= 1
    return doc[idx].lower_ if idx >= 0 else ""


def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
    """Return ``(date_from, date_to)`` as ISO strings, or ``None`` per side.

    Only entities labelled ``"DATE"`` are considered.

    * No DATE entity, or the first one cannot be parsed: ``(None, None)``.
    * A "between" keyword anywhere in *doc* plus two parseable DATE spans:
      the earlier date and the later date (year-only values stretched to
      Dec 31).
    * A "before" keyword governing the first DATE span: ``(None, upper)``.
    * An "after" keyword governing the first DATE span: ``(lower, None)``.
    * Otherwise: a closed range from the date to its year end (which is the
      date itself unless it falls on Jan 1).

    Args:
        doc: Parsed document exposing ``ents`` and token access by index.
        lang: Language code selecting the keyword tables; a language without
            keyword tables simply gets no direction detection.
    """
    date_spans = [ent for ent in doc.ents if ent.label_ == "DATE"]
    if not date_spans:
        return None, None

    # Unknown languages fall back to "no keywords" instead of raising KeyError.
    no_tokens: frozenset[str] = frozenset()
    between_tokens = _DATE_BETWEEN.get(lang, no_tokens)
    before_tokens = _DATE_BEFORE.get(lang, no_tokens)
    after_tokens = _DATE_AFTER.get(lang, no_tokens)

    # "zwischen X und Y" / "between X and Y" — two DATE spans form a range.
    if len(date_spans) >= 2 and any(tok.lower_ in between_tokens for tok in doc):
        bounds = [_parse_date_text(span.text, lang) for span in date_spans[:2]]
        if all(bound is not None for bound in bounds):
            lower, upper = sorted(bounds)
            return lower.isoformat(), _year_end(upper).isoformat()
        # One side failed to parse: fall through to single-date handling.

    # Single DATE span — direction comes from the keyword in front of it.
    span = date_spans[0]
    d = _parse_date_text(span.text, lang)
    if d is None:
        return None, None

    governing = _token_before_date(doc, span, lang)

    if governing in before_tokens:
        # NOTE(review): the upper bound is inclusive, so "before 1985" still
        # covers all of 1985 — kept as in the original; confirm this is intended.
        return None, _year_end(d).isoformat()
    if governing in after_tokens:
        return d.isoformat(), None

    # Bare year/date — closed year-range.
    return d.isoformat(), _year_end(d).isoformat()
|
||||
|
||||
Reference in New Issue
Block a user