From 53a661adb6310e8d065c2750391af127c921e095 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 25 May 2026 13:47:26 +0200
Subject: [PATCH] feat(normalizer): month/year, feast/season, range matchers + overrides

Co-Authored-By: Claude Opus 4.7
---
 tools/import-normalizer/dates.py            | 106 ++++++++++++++++++++-
 tools/import-normalizer/tests/test_dates.py |  57 +++++++++++
 2 files changed, 161 insertions(+), 2 deletions(-)

diff --git a/tools/import-normalizer/dates.py b/tools/import-normalizer/dates.py
index 0e5934e9..b411b494 100644
--- a/tools/import-normalizer/dates.py
+++ b/tools/import-normalizer/dates.py
@@ -157,8 +157,110 @@ def _match_monthname_a(s):
     return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3)))
 
 
-# Matchers are tried in order. Later tasks append to this list.
-_MATCHERS = [_match_iso, _match_numeric, _match_roman, _match_monthname_a]
+# Month-name-first day date, e.g. "April 12. 1922". A dot and/or whitespace is required
+# between day and year so a bare year is never split into day + two-digit year
+# ("Sept.1923" must not be read as day 19, year "23").
+_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})(?:\.\s*|\s+)(\d{2,4})")
+
+
+def _match_monthname_b(s):
+    """Match "<month name> <day> <year>"; the result comes from _build_day_month_year."""
+    m = _MONTH_B_RE.fullmatch(s)
+    if not m:
+        return None
+    return _build_day_month_year(int(m.group(2)), _lookup_month(m.group(1)), expand_year(m.group(3)))
+
+
+# Whitespace between month and year is optional: "Mai 1895", "Sept. 1923", "Sept.1923".
+_MONTH_YEAR_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{2,4})")
+_TOKEN_YEAR_RE = re.compile(r"(.+?)\.?\s+(\d{2,4})")
+_YEAR_ONLY_RE = re.compile(r"\d{4}")
+_RANGE_YY_RE = re.compile(r"(\d{4})\s*/\s*\d{2}")
+_RANGE_HYPHEN_RE = re.compile(r"(.*\d)\s*[-–]\s*\d.*")
+# Intra-month day range, e.g. "7./8. Sept.1923" — require a dot before the slash so it
+# does NOT swallow slash-as-dot single dates like "17/6. 1916" (which has no dot before "/").
+_RANGE_DAY_RE = re.compile(r"(\d{1,2})\./(\d{1,2})\.\s*(.+)")
+
+
+def _iso_first_of(year, month=1):
+    """Return the ISO date of the 1st of *month* in *year*, or None if the date is invalid."""
+    try:
+        return datetime.date(year, month, 1).isoformat()
+    except ValueError:
+        # e.g. year 0 from "0000": treat as no match rather than crash the parser.
+        return None
+
+
+def _match_month_year(s):
+    """Match "<month name> <year>" and return (ISO of the 1st, Precision.MONTH) or None."""
+    m = _MONTH_YEAR_RE.fullmatch(s)
+    if not m:
+        return None
+    month = _lookup_month(m.group(1))
+    year = expand_year(m.group(2))
+    if not month or year is None:
+        return None
+    iso = _iso_first_of(year, month)
+    return (iso, Precision.MONTH) if iso else None
+
+
+def _match_feast_season(s):
+    """Match "<token> <year>" and return what resolve_feast_or_season makes of the token."""
+    m = _TOKEN_YEAR_RE.fullmatch(s)
+    if not m:
+        return None
+    year = expand_year(m.group(2))
+    if year is None:
+        return None
+    return resolve_feast_or_season(m.group(1), year)
+
+
+def _match_year_only(s):
+    """Match a bare four-digit year and return (ISO of 1 January, Precision.YEAR) or None."""
+    if not _YEAR_ONLY_RE.fullmatch(s):
+        return None
+    iso = _iso_first_of(int(s))
+    return (iso, Precision.YEAR) if iso else None
+
+
+def _match_range(s):
+    """Match a date range and return (ISO of the range start, Precision.RANGE) or None.
+
+    Three shapes are tried in order: "1881/82", "7./8. Sept.1923", and "<start> - <end>".
+    """
+    m = _RANGE_YY_RE.fullmatch(s)
+    if m:
+        iso = _iso_first_of(int(m.group(1)))
+        return (iso, Precision.RANGE) if iso else None
+    m = _RANGE_DAY_RE.fullmatch(s)
+    if m:
+        first = f"{m.group(1)}.{m.group(3)}"  # "7." + "Sept.1923" -> "7.Sept.1923"
+        for matcher in (_match_numeric, _match_monthname_a):
+            r = matcher(first)
+            if r:
+                return r[0], Precision.RANGE
+    m = _RANGE_HYPHEN_RE.fullmatch(s)
+    if m:
+        start = m.group(1).strip()
+        for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only):
+            r = matcher(start)
+            if r:
+                return r[0], Precision.RANGE
+    return None
+
+
+# Matchers are tried in order.
+_MATCHERS = [
+    _match_iso,
+    _match_range,
+    _match_numeric,
+    _match_roman,
+    _match_monthname_a,
+    _match_month_year,
+    _match_monthname_b,
+    _match_feast_season,
+    _match_year_only,
+]
 
 
 def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
diff --git a/tools/import-normalizer/tests/test_dates.py b/tools/import-normalizer/tests/test_dates.py
index 7762d436..b0953d24 100644
--- a/tools/import-normalizer/tests/test_dates.py
+++ b/tools/import-normalizer/tests/test_dates.py
@@ -78,3 +78,60 @@ def test_parse_monthname_day_first():
     assert dates.parse_date("18.Dez.1916").iso == "1916-12-18"
     assert dates.parse_date("4Dezember 1936").iso == "1936-12-04"
     assert dates.parse_date("25 August 1968").iso == "1968-08-25"
+
+def test_parse_month_year_year_only():
+    assert dates.parse_date("Mai 1895") == dates.ParsedDate("1895-05-01", Precision.MONTH, "Mai 1895")
+    assert dates.parse_date("October 1903").iso == "1903-10-01"
+    assert dates.parse_date("1905") == dates.ParsedDate("1905-01-01", Precision.YEAR, "1905")
+
+def test_parse_feast_and_season_via_parse_date():
+    assert dates.parse_date("Pfingsten 1922") == dates.ParsedDate("1922-06-04", Precision.DAY, "Pfingsten 1922")
+    assert dates.parse_date("Herbst 1913") == dates.ParsedDate("1913-10-01", Precision.SEASON, "Herbst 1913")
+    assert dates.parse_date("Pfingstsonntag 1915").precision == Precision.DAY
+
+def test_parse_ranges():
+    assert dates.parse_date("8.1.1916 - 15.3.1916") == dates.ParsedDate("1916-01-08", Precision.RANGE, "8.1.1916 - 15.3.1916")
+    assert dates.parse_date("1881/82") == dates.ParsedDate("1881-01-01", Precision.RANGE, "1881/82")
+    assert dates.parse_date("1945/46?").iso == "1945-01-01"  # '?' stripped -> RANGE, then APPROX
+    assert dates.parse_date("1945/46?").precision == Precision.APPROX
+
+def test_parse_approx_full():
+    r = dates.parse_date("17.Nov (?) 1887")
+    assert r.iso == "1887-11-17"
+    assert r.precision == Precision.APPROX
+
+def test_parse_english_month_first_now_works():
+    assert dates.parse_date("April 12. 1922").iso == "1922-04-12"
+    assert dates.parse_date("Mai 1895").iso == "1895-05-01"  # not shadowed by month-first matcher
+
+def test_parse_unparseable_examples():
+    assert dates.parse_date("Freitag 1919").precision == Precision.UNKNOWN
+
+def test_parse_invalid_calendar_date_is_unknown():
+    # try/except ValueError in the matchers must route impossible dates to UNKNOWN (-> review),
+    # never silently clamp. This is the most likely real-data bug class at 7,600 rows.
+    assert dates.parse_date("30.2.1888").precision == Precision.UNKNOWN
+    assert dates.parse_date("31.4.1916").precision == Precision.UNKNOWN
+
+def test_parse_intra_month_day_range():
+    # "7./8. Sept.1923" -> start day, RANGE. Must NOT be confused with slash-date "17/6. 1916".
+    assert dates.parse_date("7./8. Sept.1923") == dates.ParsedDate("1923-09-07", Precision.RANGE, "7./8. Sept.1923")
+    assert dates.parse_date("17/6. 1916") == dates.ParsedDate("1916-06-17", Precision.DAY, "17/6. 1916")
+
+def test_parse_trailing_note_stripped_but_raw_preserved():
+    r = dates.parse_date("17.Nov 1887, 2. Brief")  # REQ-DATE-04
+    assert r.iso == "1887-11-17"
+    assert "2. Brief" in r.raw  # original string preserved verbatim
+
+def test_parse_date_override_wins():
+    ovr = {"13.5.65": ("1965-05-13", "DAY")}
+    r = dates.parse_date("13.5.65", ovr)  # ambiguous without override
+    assert r == dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65")
+
+def test_parse_month_year_without_space_is_month_precision():
+    # "Sept.1923" is a month, not day 19 of year "23".
+    assert dates.parse_date("Sept.1923") == dates.ParsedDate("1923-09-01", Precision.MONTH, "Sept.1923")
+
+def test_parse_year_zero_is_unknown():
+    # datetime.date rejects year 0; the matcher must report no match instead of raising.
+    assert dates.parse_date("0000").precision == Precision.UNKNOWN