From 4942c0ea075ec97d80b24e665f9a602eceb8d164 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 25 May 2026 13:42:36 +0200 Subject: [PATCH] feat(normalizer): day-first month-name matcher Co-Authored-By: Claude Opus 4.7 --- tools/import-normalizer/dates.py | 25 ++++++++++++++++++++- tools/import-normalizer/tests/test_dates.py | 13 +++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tools/import-normalizer/dates.py b/tools/import-normalizer/dates.py index 75605688..0e5934e9 100644 --- a/tools/import-normalizer/dates.py +++ b/tools/import-normalizer/dates.py @@ -134,8 +134,31 @@ def _match_roman(s): return None +_MONTH_A_RE = re.compile(r"(\d{1,2})[.\s]*([A-Za-zÄÖÜäöü]+)\.?\s*(\d{2,4})") + + +def _lookup_month(token: str): + return config.MONTHS.get(token.lower().strip(" .")) + + +def _build_day_month_year(day, month, year): + if not month or year is None or not (1 <= month <= 12): + return None + try: + return datetime.date(year, month, day).isoformat(), Precision.DAY + except ValueError: + return None + + +def _match_monthname_a(s): + m = _MONTH_A_RE.fullmatch(s) + if not m: + return None + return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3))) + + # Matchers are tried in order. Later tasks append to this list. -_MATCHERS = [_match_iso, _match_numeric, _match_roman] +_MATCHERS = [_match_iso, _match_numeric, _match_roman, _match_monthname_a] def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate: diff --git a/tools/import-normalizer/tests/test_dates.py b/tools/import-normalizer/tests/test_dates.py index c520ca36..7762d436 100644 --- a/tools/import-normalizer/tests/test_dates.py +++ b/tools/import-normalizer/tests/test_dates.py @@ -55,9 +55,9 @@ def test_parse_numeric_unparseable(): assert dates.parse_date("13.5.65").precision == Precision.UNKNOWN # ambiguous 2-digit year def test_parse_approx_marker_upgrades_precision(): - r = dates.parse_date("17.Nov (?) 1887") # month-name handled in a later task; here just the marker path + r = dates.parse_date("17.Nov (?) 1887") # month-name matcher now active; (?) marks approx assert r.raw == "17.Nov (?) 1887" - assert r.precision == Precision.UNKNOWN # no month-name matcher until Task 7; full APPROX check in Task 8 + assert r.precision == Precision.APPROX # month-name matcher parses date; (?) upgrades to APPROX def test_parse_leading_qualifier_is_approx(): r = dates.parse_date("nach 1.5.1900") # qualifier stripped, numeric date salvaged, precision APPROX @@ -69,3 +69,12 @@ def test_parse_roman_months(): assert dates.parse_date("19.XII.1954").iso == "1954-12-19" assert dates.parse_date("1.III.27").iso == "1927-03-01" assert dates.parse_date("22.III.18").precision == Precision.DAY + +def test_parse_monthname_day_first(): + assert dates.parse_date("6.März 1888").iso == "1888-03-06" + assert dates.parse_date("29.Sept.1891").iso == "1891-09-29" + assert dates.parse_date("10.Oct.95").iso == "1895-10-10" + assert dates.parse_date("9.December1889").iso == "1889-12-09" + assert dates.parse_date("18.Dez.1916").iso == "1916-12-18" + assert dates.parse_date("4Dezember 1936").iso == "1936-12-04" + assert dates.parse_date("25 August 1968").iso == "1968-08-25"