feat(nlp-service): date range extraction with direction detection
This commit is contained in:
@@ -86,3 +86,85 @@ def detect_person_role(doc, per_spans: list, lang: str) -> str:
|
||||
return "receiver"
|
||||
|
||||
return "any"
|
||||
|
||||
|
||||
# ── Step 3: Date parsing ─────────────────────────────────────────────────────
|
||||
|
||||
# Matches a string made up of exactly four digits, i.e. a bare year such as "1920".
_YEAR_RE = re.compile(r"^\d{4}$")
|
||||
|
||||
# Per-language words that, when they precede a date, mark it as an upper
# bound ("before <date>"). Keys are language codes.
_DATE_BEFORE: dict[str, frozenset[str]] = dict(
    de=frozenset(("vor",)),
    en=frozenset(("before",)),
    es=frozenset(("antes",)),
)
|
||||
|
||||
# Per-language words that, when they precede a date, mark it as a lower
# bound ("after <date>"). Spanish is listed with and without the accent.
_DATE_AFTER: dict[str, frozenset[str]] = dict(
    de=frozenset(("nach",)),
    en=frozenset(("after",)),
    es=frozenset(("después", "despues")),
)
|
||||
|
||||
# Per-language words announcing a two-sided range ("between X and Y").
# Keys are language codes.
_DATE_BETWEEN: dict[str, frozenset[str]] = dict(
    de=frozenset(("zwischen",)),
    en=frozenset(("between",)),
    es=frozenset(("entre",)),
)
|
||||
|
||||
|
||||
def _parse_date_text(text: str, lang: str) -> date | None:
    """Turn the text of a date expression into a ``date``.

    A bare four-digit number strictly between 1000 and 3000 is read as a
    year and mapped to 1 January of that year. Everything else (including
    four-digit numbers outside that window) is handed to
    ``dateparser.parse`` restricted to *lang*.

    Args:
        text: Raw text of the date expression; surrounding whitespace is
            ignored.
        lang: Language code passed to dateparser as the only language.

    Returns:
        The parsed date, or None when dateparser produces no result.
    """
    candidate = text.strip()

    # Fast path: a plain year never needs the (slower, fuzzier) parser.
    if _YEAR_RE.match(candidate):
        year = int(candidate)
        if 1000 < year < 3000:
            return date(year, 1, 1)

    parser_settings = {
        "PREFER_DAY_OF_MONTH": "first",
        "RETURN_AS_TIMEZONE_AWARE": False,
    }
    result = dateparser.parse(
        candidate,
        languages=[lang],
        settings=parser_settings,
    )
    if not result:
        return None
    return result.date()
|
||||
|
||||
|
||||
def _year_end(d: date) -> date:
    """Stretch a 1 January date to 31 December of the same year.

    1 January is what a year-only expression parses to, so it is taken as
    "the whole year" when used as an upper boundary. Any other date is
    returned unchanged.
    """
    falls_on_new_year = (d.month, d.day) == (1, 1)
    if not falls_on_new_year:
        return d
    return date(d.year, 12, 31)
|
||||
|
||||
|
||||
# Function words that may sit between a direction word and the date itself.
# Spanish always inserts one ("antes de 1920", "después del 1920"), so the
# token directly in front of the DATE span is "de"/"del" rather than the
# direction word.
_DATE_FILLER: dict[str, frozenset[str]] = {
    "es": frozenset({"de", "del"}),
}


def extract_dates(doc, lang: str) -> tuple[str | None, str | None]:
    """Return ``(date_from, date_to)`` as ISO strings, either of which may be None.

    Only entities labelled ``DATE`` are considered:

    * "between X and Y": if a between-word occurs anywhere in *doc* and the
      first two DATE spans both parse, they form a closed range (sorted).
    * Otherwise the first DATE span is used together with the word in front
      of it: a before-word gives ``(None, end)``, an after-word gives
      ``(start, None)``, anything else gives the closed range
      ``(start, end)``.

    One language-specific filler token (see ``_DATE_FILLER``) directly in
    front of the span is skipped before the direction word is looked up.

    Args:
        doc: Parsed document exposing ``ents``, token iteration and
            integer indexing (tokens need ``lower_``; entities need
            ``label_``, ``text`` and ``start``).
        lang: Language code; must be a key of the ``_DATE_*`` tables.

    Returns:
        ``(None, None)`` when there is no DATE entity or the date text
        cannot be parsed.

    Raises:
        KeyError: If DATE entities exist and *lang* has no entry in the
            direction-word tables.
    """
    date_spans = [ent for ent in doc.ents if ent.label_ == "DATE"]
    if not date_spans:
        return None, None

    between_tokens = _DATE_BETWEEN[lang]
    before_tokens = _DATE_BEFORE[lang]
    after_tokens = _DATE_AFTER[lang]

    # "zwischen X und Y" / "between X and Y" — two DATE spans form a range.
    has_between = any(tok.lower_ in between_tokens for tok in doc)
    if has_between and len(date_spans) >= 2:
        candidates = (_parse_date_text(span.text, lang) for span in date_spans[:2])
        parsed = [d for d in candidates if d is not None]
        if len(parsed) == 2:
            parsed.sort()
            # _year_end widens a 1 January upper bound to the whole year.
            return parsed[0].isoformat(), _year_end(parsed[1]).isoformat()
        # One of the two spans did not parse: fall back to single-date logic.

    # Single DATE span — use the direction word in front of it.
    span = date_spans[0]
    d = _parse_date_text(span.text, lang)
    if d is None:
        return None, None

    # Index of the token in front of the span; step over one filler token
    # so that "antes de 1920" is recognised via "antes", not "de".
    idx = span.start - 1
    if idx > 0 and doc[idx].lower_ in _DATE_FILLER.get(lang, frozenset()):
        idx -= 1
    prev_lower = doc[idx].lower_ if idx >= 0 else ""

    if prev_lower in before_tokens:
        return None, _year_end(d).isoformat()
    if prev_lower in after_tokens:
        return d.isoformat(), None

    # Bare year/date — closed range (a year-only date spans the whole year).
    return d.isoformat(), _year_end(d).isoformat()
|
||||
|
||||
@@ -182,3 +182,72 @@ def test_role_receiver_to_english(nlp_en):
|
||||
doc = _make_doc_with_ents(nlp_en, "letters to Marie", [(11, 16, "PER")])
|
||||
per_spans = list(doc.ents)
|
||||
assert detect_person_role(doc, per_spans, "en") == "receiver"
|
||||
|
||||
|
||||
# ── Date parsing ─────────────────────────────────────────────────────────────
|
||||
|
||||
def test_date_vor_1920(nlp_de):
    """German "vor <year>": open start, end is the last day of that year."""
    from extractor import extract_dates

    text = "Briefe vor 1920"
    ents = [(11, 15, "DATE")]  # "1920" occupies chars 11..15
    doc = _make_doc_with_ents(nlp_de, text, ents)

    start, end = extract_dates(doc, "de")

    assert (start, end) == (None, "1920-12-31")
|
||||
|
||||
|
||||
def test_date_nach_1900(nlp_de):
    """German "nach <year>": start is 1 January of that year, open end."""
    from extractor import extract_dates

    text = "Briefe nach 1900"
    ents = [(12, 16, "DATE")]  # "1900" occupies chars 12..16
    doc = _make_doc_with_ents(nlp_de, text, ents)

    start, end = extract_dates(doc, "de")

    assert (start, end) == ("1900-01-01", None)
|
||||
|
||||
|
||||
def test_date_zwischen_1900_und_1920(nlp_de):
    """German "zwischen X und Y": both years form one closed range."""
    from extractor import extract_dates

    text = "zwischen 1900 und 1920"
    first_year = (9, 13, "DATE")    # "1900" occupies chars 9..13
    second_year = (18, 22, "DATE")  # "1920" occupies chars 18..22
    doc = _make_doc_with_ents(nlp_de, text, [first_year, second_year])

    start, end = extract_dates(doc, "de")

    assert (start, end) == ("1900-01-01", "1920-12-31")
|
||||
|
||||
|
||||
def test_date_bare_year_makes_range(nlp_de):
    """A year with no direction word in front covers the whole year."""
    from extractor import extract_dates

    text = "Briefe 1920"
    ents = [(7, 11, "DATE")]  # "1920" occupies chars 7..11
    doc = _make_doc_with_ents(nlp_de, text, ents)

    start, end = extract_dates(doc, "de")

    assert (start, end) == ("1920-01-01", "1920-12-31")
|
||||
|
||||
|
||||
def test_date_no_date_entity(nlp_de):
    """Without any DATE entity both bounds stay unset."""
    from extractor import extract_dates

    no_entities = []
    doc = _make_doc_with_ents(nlp_de, "Briefe von Opa", no_entities)

    start, end = extract_dates(doc, "de")

    assert (start, end) == (None, None)
|
||||
|
||||
|
||||
def test_date_before_english(nlp_en):
    """English "before <year>": open start, end is the last day of that year."""
    from extractor import extract_dates

    text = "letters before 1920"
    ents = [(15, 19, "DATE")]  # "1920" occupies chars 15..19
    doc = _make_doc_with_ents(nlp_en, text, ents)

    start, end = extract_dates(doc, "en")

    assert (start, end) == (None, "1920-12-31")
|
||||
|
||||
|
||||
def test_date_after_english(nlp_en):
    """English "after <year>": start is 1 January of that year, open end."""
    from extractor import extract_dates

    text = "letters after 1900"
    ents = [(14, 18, "DATE")]  # "1900" occupies chars 14..18
    doc = _make_doc_with_ents(nlp_en, text, ents)

    start, end = extract_dates(doc, "en")

    assert (start, end) == ("1900-01-01", None)
|
||||
|
||||
Reference in New Issue
Block a user