From df14e6b1ee0b08d6b0a2ea84f279fed0449aa85f Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 25 May 2026 13:30:07 +0200
Subject: [PATCH] feat(normalizer): parse_date dispatch + iso/numeric matchers

Co-Authored-By: Claude Opus 4.7
---
 tools/import-normalizer/dates.py            | 117 ++++++++++++++++++++
 tools/import-normalizer/tests/test_dates.py |  28 +++++
 2 files changed, 145 insertions(+)

diff --git a/tools/import-normalizer/dates.py b/tools/import-normalizer/dates.py
index 464092c1..a34bd357 100644
--- a/tools/import-normalizer/dates.py
+++ b/tools/import-normalizer/dates.py
@@ -1,5 +1,7 @@
 """Tolerant historical date parsing for the family archive."""
 import datetime
+import re
+from dataclasses import dataclass
 from enum import StrEnum
 
 import config
@@ -58,6 +60,121 @@ def expand_year(token: str):
     return None
 
 
+@dataclass(frozen=True)
+class ParsedDate:
+    """Result of parsing one raw date string.
+
+    ``iso`` is the parsed date as an ISO string (None when nothing could be
+    parsed), ``precision`` states how exact it is, and ``raw`` is the caller's
+    input, unchanged.
+    """
+
+    iso: str | None
+    precision: Precision
+    raw: str
+
+
+# Qualifying words in front of a date. They are stripped so the matchers only
+# see the bare date; several may be stacked ("wohl um ...") and the dot after
+# "ca" may be followed directly by the date ("ca.1900").
+_LEADING_MARKERS = re.compile(
+    r"^(?:(?:um|ca|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\b\.?\s*)+",
+    re.I)
+
+# Words that make a date approximate. Matched as whole words so that "zum" or
+# "Luca" in an editorial note are not mistaken for "um" / "ca".
+_APPROX_MARKERS = re.compile(r"\b(?:um|ca|circa|etwa|wohl|vermutlich)\b", re.I)
+
+
+def _preprocess(raw: str):
+    """Strip uncertainty markers and editorial notes from a raw date string.
+
+    Returns ``(cleaned, approx)``: ``cleaned`` is the text left for the matchers
+    ("" when nothing remains) and ``approx`` is True when the input contained a
+    "?" or one of the approximation words.
+    """
+    s = (raw or "").strip()
+    if not s:
+        return "", False
+    approx = "?" in s or _APPROX_MARKERS.search(s) is not None
+    s = re.sub(r"\(\s*\?\s*\)", " ", s)  # remove "(?)"
+    s = s.replace("?", " ")
+    s = s.partition(",")[0]  # drop trailing editorial note (", 2. Brief")
+    # Removing "?" can leave blanks in front; collapse whitespace first so the
+    # anchored marker pattern still finds the marker at the start.
+    s = re.sub(r"\s+", " ", s).strip()
+    s = _LEADING_MARKERS.sub("", s)
+    return s.strip(" .,"), approx
+
+
+# Day-first numeric date with "." or "/" separators; month and year may also be
+# separated by whitespace alone ("15.2 1888"). A separator is required there so
+# that "1.1210" is not read as day 1, month 12, year 10.
+_NUM_RE = re.compile(r"(\d{1,2})[./]\s*(\d{1,2})(?:\s*[./]\s*|\s+)(\d{2,4})")
+
+
+def _match_iso(s):
+    """Match a full ``YYYY-MM-DD`` date; return ``(iso, Precision.DAY)`` or None."""
+    if not re.fullmatch(r"\d{4}-\d{2}-\d{2}", s):
+        return None
+    try:
+        datetime.date.fromisoformat(s)
+    except ValueError:  # right shape but not a real calendar date
+        return None
+    return s, Precision.DAY
+
+
+def _match_numeric(s):
+    """Match day-first numeric dates such as "15.2.1888" or "17/6. 1916".
+
+    Returns ``(iso, Precision.DAY)``, or None when the string has another shape,
+    ``expand_year`` rejects the year, or the date does not exist.
+    """
+    m = _NUM_RE.fullmatch(s)
+    if not m:
+        return None
+    day, month = int(m.group(1)), int(m.group(2))
+    year = expand_year(m.group(3))
+    if year is None or not (1 <= month <= 12):
+        return None
+    try:
+        return datetime.date(year, month, day).isoformat(), Precision.DAY
+    except ValueError:
+        return None
+
+
+# Matchers are tried in order. Later tasks append to this list.
+_MATCHERS = [_match_iso, _match_numeric]
+
+
+def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
+    """Parse a raw date string into a ``ParsedDate``.
+
+    ``date_overrides`` maps a stripped raw string to an ``(iso, precision)``
+    pair; a hit there is returned without any parsing. Otherwise the string is
+    cleaned and the ``_MATCHERS`` are tried in order. The first match wins and
+    is reported as ``Precision.APPROX`` if the input carried an uncertainty
+    marker. Input that nothing matches yields ``iso=None`` with
+    ``Precision.UNKNOWN``.
+    """
+    if date_overrides:
+        key = (raw or "").strip()
+        if key in date_overrides:
+            iso, prec = date_overrides[key]
+            return ParsedDate(iso or None, Precision(prec), raw)
+    cleaned, approx = _preprocess(raw)
+    if not cleaned:
+        return ParsedDate(None, Precision.UNKNOWN, raw)
+    for matcher in _MATCHERS:
+        result = matcher(cleaned)
+        if result:
+            iso, precision = result
+            if approx:
+                precision = Precision.APPROX
+            return ParsedDate(iso, precision, raw)
+    return ParsedDate(None, Precision.UNKNOWN, raw)
+
+
 def easter(year: int) -> datetime.date:
     """Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm."""
     a = year % 19
diff --git a/tools/import-normalizer/tests/test_dates.py b/tools/import-normalizer/tests/test_dates.py
index 62fb79fa..d8e06e22 100644
--- a/tools/import-normalizer/tests/test_dates.py
+++ b/tools/import-normalizer/tests/test_dates.py
@@ -36,3 +36,31 @@ def test_expand_year():
     assert dates.expand_year("99") == 1899
     assert dates.expand_year("65") is None  # 58..72 ambiguous
     assert dates.expand_year("x") is None
+
+def test_parse_iso_and_empty():
+    assert dates.parse_date("1910-04-23") == dates.ParsedDate("1910-04-23", Precision.DAY, "1910-04-23")
+    assert dates.parse_date("") == dates.ParsedDate(None, Precision.UNKNOWN, "")
+    assert dates.parse_date("?") == dates.ParsedDate(None, Precision.UNKNOWN, "?")
+
+def test_parse_numeric_forms():
+    assert dates.parse_date("15.2.1888").iso == "1888-02-15"
+    assert dates.parse_date("13.5.09").iso == "1909-05-13"
+    assert dates.parse_date("17/6. 1916").iso == "1916-06-17"
+    assert dates.parse_date("11.10.08").iso == "1908-10-11"
+    assert dates.parse_date("30.1.889").iso == "1889-01-30"
+    assert dates.parse_date("15. 2. 1888").iso == "1888-02-15"
+    assert dates.parse_date("17/6/1916").iso == "1916-06-17"
+    assert dates.parse_date("15.2.1888").precision == Precision.DAY
+
+def test_parse_numeric_unparseable():
+    assert dates.parse_date("8.9.").precision == Precision.UNKNOWN  # no year
+    assert dates.parse_date("13.5.65").precision == Precision.UNKNOWN  # ambiguous 2-digit year
+    assert dates.parse_date("1.1210").precision == Precision.UNKNOWN  # month and year run together
+
+def test_parse_approx_marker_upgrades_precision():
+    r = dates.parse_date("17.Nov (?) 1887")  # month-name handled in a later task; here just the marker path
+    # after the marker is detected, a parsed date becomes APPROX (verified fully in Task 8)
+    assert r.raw == "17.Nov (?) 1887"
+    assert dates.parse_date("um 15.2.1888") == dates.ParsedDate("1888-02-15", Precision.APPROX, "um 15.2.1888")
+    assert dates.parse_date("wohl um 15.2.1888").precision == Precision.APPROX
+    assert dates.parse_date("15.2.1888, zum Geburtstag").precision == Precision.DAY