diff --git a/nlp-service/extractor.py b/nlp-service/extractor.py index 7ca84f82..19543af5 100644 --- a/nlp-service/extractor.py +++ b/nlp-service/extractor.py @@ -86,3 +86,85 @@ def detect_person_role(doc, per_spans: list, lang: str) -> str: return "receiver" return "any" + + +# ── Step 3: Date parsing ───────────────────────────────────────────────────── + +_YEAR_RE = re.compile(r"^\d{4}$") + +_DATE_BEFORE: dict[str, frozenset[str]] = { + "de": frozenset({"vor"}), + "en": frozenset({"before"}), + "es": frozenset({"antes"}), +} + +_DATE_AFTER: dict[str, frozenset[str]] = { + "de": frozenset({"nach"}), + "en": frozenset({"after"}), + "es": frozenset({"después", "despues"}), +} + +_DATE_BETWEEN: dict[str, frozenset[str]] = { + "de": frozenset({"zwischen"}), + "en": frozenset({"between"}), + "es": frozenset({"entre"}), +} + + +def _parse_date_text(text: str, lang: str) -> date | None: + text = text.strip() + if _YEAR_RE.match(text): + year = int(text) + if 1000 < year < 3000: + return date(year, 1, 1) + parsed = dateparser.parse( + text, + languages=[lang], + settings={"PREFER_DAY_OF_MONTH": "first", "RETURN_AS_TIMEZONE_AWARE": False}, + ) + return parsed.date() if parsed else None + + +def _year_end(d: date) -> date: + """If d is Jan 1, return Dec 31 of the same year (year-only boundary).""" + if d.month == 1 and d.day == 1: + return date(d.year, 12, 31) + return d + + +def extract_dates(doc, lang: str) -> tuple[str | None, str | None]: + """Return (date_from, date_to) as ISO strings or None.""" + date_spans = [ent for ent in doc.ents if ent.label_ == "DATE"] + if not date_spans: + return None, None + + between_tokens = _DATE_BETWEEN[lang] + before_tokens = _DATE_BEFORE[lang] + after_tokens = _DATE_AFTER[lang] + + # "zwischen X und Y" / "between X and Y" — two DATE spans form a range + has_between = any(tok.lower_ in between_tokens for tok in doc) + if has_between and len(date_spans) >= 2: + parsed = [] + for span in date_spans[:2]: + d = _parse_date_text(span.text, 
lang) + if d: + parsed.append(d) + if len(parsed) == 2: + parsed.sort() + return parsed[0].isoformat(), _year_end(parsed[1]).isoformat() + + # Single DATE span — use direction token + span = date_spans[0] + d = _parse_date_text(span.text, lang) + if not d: + return None, None + + prev_lower = doc[span.start - 1].lower_ if span.start > 0 else "" + + if prev_lower in before_tokens: + return None, _year_end(d).isoformat() + if prev_lower in after_tokens: + return d.isoformat(), None + # Bare year/date — closed year-range + return d.isoformat(), _year_end(d).isoformat() diff --git a/nlp-service/test_extractor.py b/nlp-service/test_extractor.py index fe14d4b1..b79d188c 100644 --- a/nlp-service/test_extractor.py +++ b/nlp-service/test_extractor.py @@ -182,3 +182,72 @@ def test_role_receiver_to_english(nlp_en): doc = _make_doc_with_ents(nlp_en, "letters to Marie", [(11, 16, "PER")]) per_spans = list(doc.ents) assert detect_person_role(doc, per_spans, "en") == "receiver" + + +# ── Date parsing ───────────────────────────────────────────────────────────── + +def test_date_vor_1920(nlp_de): + from extractor import extract_dates + # "Briefe vor 1920" — "1920" at chars 11..15 + doc = _make_doc_with_ents(nlp_de, "Briefe vor 1920", [(11, 15, "DATE")]) + date_from, date_to = extract_dates(doc, "de") + assert date_from is None + assert date_to == "1920-12-31" + + +def test_date_nach_1900(nlp_de): + from extractor import extract_dates + # "Briefe nach 1900" — "1900" at chars 12..16 + doc = _make_doc_with_ents(nlp_de, "Briefe nach 1900", [(12, 16, "DATE")]) + date_from, date_to = extract_dates(doc, "de") + assert date_from == "1900-01-01" + assert date_to is None + + +def test_date_zwischen_1900_und_1920(nlp_de): + from extractor import extract_dates + # "zwischen 1900 und 1920" + # "1900" = chars 9..13, "1920" = chars 18..22 + doc = _make_doc_with_ents(nlp_de, "zwischen 1900 und 1920", [ + (9, 13, "DATE"), + (18, 22, "DATE"), + ]) + date_from, date_to = extract_dates(doc, "de") + 
# nlp-service/test_extractor.py — section added after the role tests

# ── Date parsing ─────────────────────────────────────────────────────────────

def test_date_vor_1920(nlp_de):
    """German "vor <year>" gives an open start and an inclusive year end."""
    from extractor import extract_dates
    # "1920" occupies chars 11..15 of "Briefe vor 1920"
    doc = _make_doc_with_ents(nlp_de, "Briefe vor 1920", [(11, 15, "DATE")])
    assert extract_dates(doc, "de") == (None, "1920-12-31")


def test_date_nach_1900(nlp_de):
    """German "nach <year>" gives a start on Jan 1 and an open end."""
    from extractor import extract_dates
    # "1900" occupies chars 12..16 of "Briefe nach 1900"
    doc = _make_doc_with_ents(nlp_de, "Briefe nach 1900", [(12, 16, "DATE")])
    assert extract_dates(doc, "de") == ("1900-01-01", None)


def test_date_zwischen_1900_und_1920(nlp_de):
    """German "zwischen X und Y" spans from Jan 1 of X to Dec 31 of Y."""
    from extractor import extract_dates
    # "1900" = chars 9..13, "1920" = chars 18..22
    entities = [(9, 13, "DATE"), (18, 22, "DATE")]
    doc = _make_doc_with_ents(nlp_de, "zwischen 1900 und 1920", entities)
    assert extract_dates(doc, "de") == ("1900-01-01", "1920-12-31")


def test_date_bare_year_makes_range(nlp_de):
    """A year with no direction word covers that whole year."""
    from extractor import extract_dates
    # "1920" = chars 7..11 of "Briefe 1920"
    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", [(7, 11, "DATE")])
    assert extract_dates(doc, "de") == ("1920-01-01", "1920-12-31")


def test_date_no_date_entity(nlp_de):
    """Without any DATE entity both bounds are None."""
    from extractor import extract_dates
    doc = _make_doc_with_ents(nlp_de, "Briefe von Opa", [])
    assert extract_dates(doc, "de") == (None, None)


def test_date_before_english(nlp_en):
    """English "before <year>" mirrors the German "vor" case."""
    from extractor import extract_dates
    # "1920" occupies chars 15..19 of "letters before 1920"
    doc = _make_doc_with_ents(nlp_en, "letters before 1920", [(15, 19, "DATE")])
    assert extract_dates(doc, "en") == (None, "1920-12-31")


def test_date_after_english(nlp_en):
    """English "after <year>" mirrors the German "nach" case."""
    from extractor import extract_dates
    # "1900" occupies chars 14..18 of "letters after 1900"
    doc = _make_doc_with_ents(nlp_en, "letters after 1900", [(14, 18, "DATE")])
    assert extract_dates(doc, "en") == ("1900-01-01", None)