"""Tolerant historical date parsing for the family archive.""" import datetime import re from dataclasses import dataclass from enum import StrEnum import config class Precision(StrEnum): DAY = "DAY" MONTH = "MONTH" SEASON = "SEASON" YEAR = "YEAR" RANGE = "RANGE" APPROX = "APPROX" UNKNOWN = "UNKNOWN" def _advent_sunday(year: int, n: int) -> datetime.date: """n-th Advent (1..4). 4th Advent = last Sunday on/before Dec 24.""" dec24 = datetime.date(year, 12, 24) back_to_sunday = (dec24.weekday() - 6) % 7 # Mon=0..Sun=6 fourth = dec24 - datetime.timedelta(days=back_to_sunday) return fourth - datetime.timedelta(days=(4 - n) * 7) def resolve_feast_or_season(token: str, year: int): """Return (iso, Precision) for a known feast/season token, else None.""" key = " ".join(token.lower().split()).strip(" .") if key in config.MOVABLE_FEASTS: d = easter(year) + datetime.timedelta(days=config.MOVABLE_FEASTS[key]) return d.isoformat(), Precision.DAY if key in config.FIXED_FEASTS: month, day = config.FIXED_FEASTS[key] return datetime.date(year, month, day).isoformat(), Precision.DAY advent = {"1. advent": 1, "2. advent": 2, "3. advent": 3, "4. advent": 4, "advent": 1} if key in advent: return _advent_sunday(year, advent[key]).isoformat(), Precision.DAY if key in config.SEASON_MONTHS: return datetime.date(year, config.SEASON_MONTHS[key], 1).isoformat(), Precision.SEASON return None def expand_year(token: str): """Expand a 2/3/4-digit year string per the 1873–1957 century rule. None if ambiguous.""" token = token.strip() if not token.isdigit(): return None n, v = len(token), int(token) if n == 4: return v if n == 3: return 1000 + v if n == 2: if v <= config.TWO_DIGIT_19XX_MAX: return 1900 + v if v >= config.TWO_DIGIT_18XX_MIN: return 1800 + v return None return None @dataclass(frozen=True) class ParsedDate: iso: str | None precision: Precision raw: str _LEADING_MARKERS = re.compile( r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I) def _preprocess(raw: str): """Return (cleaned_string, approx_flag). Any uncertainty/qualifier marker -> approx.""" s = (raw or "").strip() if not s: return "", False low = s.lower() approx = ("?" in s) or any( m in low for m in ("um ", "ca.", "ca ", "circa", "etwa", "wohl", "vermutlich")) s = re.sub(r"\(\s*\?\s*\)", " ", s) # remove "(?)" s = s.replace("?", " ") s = re.sub(r",.*$", "", s) # drop trailing editorial note (", 2. Brief") stripped = _LEADING_MARKERS.sub("", s) if stripped != s: # a leading qualifier (um/ca/nach/vor/anfang/…) signals approximation approx = True s = re.sub(r"\s+", " ", stripped).strip(" .,") return s, approx _NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})") def _match_iso(s): if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s): try: datetime.date.fromisoformat(s) return s, Precision.DAY except ValueError: return None return None def _match_numeric(s): m = _NUM_RE.fullmatch(s) if not m: return None day, month = int(m.group(1)), int(m.group(2)) year = expand_year(m.group(3)) if year is None or not (1 <= month <= 12): return None try: return datetime.date(year, month, day).isoformat(), Precision.DAY except ValueError: return None _ROMAN_RE = re.compile(r"(\d{1,2})\.\s*([IVXLC]+)\.?\s*(\d{2,4})", re.I) def _match_roman(s): m = _ROMAN_RE.fullmatch(s) if not m: return None day = int(m.group(1)) month = config.ROMAN_MONTHS.get(m.group(2).lower()) year = expand_year(m.group(3)) if not month or year is None: return None try: return datetime.date(year, month, day).isoformat(), Precision.DAY except ValueError: return None _MONTH_A_RE = re.compile(r"(\d{1,2})[.\s]*([A-Za-zÄÖÜäöü]+)\.?\s*(\d{2,4})") def _lookup_month(token: str): return config.MONTHS.get(token.lower().strip(" .")) def _build_day_month_year(day, month, year): if not month or year is None or not (1 <= month <= 12): return None try: return datetime.date(year, month, day).isoformat(), Precision.DAY except ValueError: return None def _match_monthname_a(s): m = _MONTH_A_RE.fullmatch(s) if not m: return None return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3))) # dot after day is REQUIRED so this can't match "Mai 1895" (MONTH YYYY) as day=18 _MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\.\s*(\d{2,4})") def _match_monthname_b(s): m = _MONTH_B_RE.fullmatch(s) if not m: return None return _build_day_month_year(int(m.group(2)), _lookup_month(m.group(1)), expand_year(m.group(3))) _MONTH_YEAR_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s+(\d{2,4})") _TOKEN_YEAR_RE = re.compile(r"(.+?)\.?\s+(\d{2,4})") _YEAR_ONLY_RE = re.compile(r"\d{4}") _RANGE_YY_RE = re.compile(r"(\d{4})\s*/\s*\d{2}") _RANGE_HYPHEN_RE = re.compile(r"(.*\d)\s*[-–]\s*\d.*") # Intra-month day range, e.g. "7./8. Sept.1923" — require a dot before the slash so it # does NOT swallow slash-as-dot single dates like "17/6. 1916" (which has no dot before "/"). _RANGE_DAY_RE = re.compile(r"(\d{1,2})\./(\d{1,2})\.\s*(.+)") def _match_month_year(s): m = _MONTH_YEAR_RE.fullmatch(s) if not m: return None month = _lookup_month(m.group(1)) year = expand_year(m.group(2)) if not month or year is None: return None return datetime.date(year, month, 1).isoformat(), Precision.MONTH def _match_feast_season(s): m = _TOKEN_YEAR_RE.fullmatch(s) if not m: return None year = expand_year(m.group(2)) if year is None: return None return resolve_feast_or_season(m.group(1), year) def _match_year_only(s): if _YEAR_ONLY_RE.fullmatch(s): return datetime.date(int(s), 1, 1).isoformat(), Precision.YEAR return None def _match_range(s): m = _RANGE_YY_RE.fullmatch(s) if m: return datetime.date(int(m.group(1)), 1, 1).isoformat(), Precision.RANGE m = _RANGE_DAY_RE.fullmatch(s) if m: first = f"{m.group(1)}.{m.group(3)}" # "7." + "Sept.1923" -> "7.Sept.1923" for matcher in (_match_numeric, _match_monthname_a): r = matcher(first) if r: return r[0], Precision.RANGE m = _RANGE_HYPHEN_RE.fullmatch(s) if m: start = m.group(1).strip() for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only): r = matcher(start) if r: return r[0], Precision.RANGE return None _MATCHERS = [ _match_iso, _match_range, _match_numeric, _match_roman, _match_monthname_a, _match_month_year, _match_monthname_b, _match_feast_season, _match_year_only, ] def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate: if date_overrides: key = (raw or "").strip() if key in date_overrides: iso, prec = date_overrides[key] return ParsedDate(iso or None, Precision(prec), raw) cleaned, approx = _preprocess(raw) if not cleaned: return ParsedDate(None, Precision.UNKNOWN, raw) for matcher in _MATCHERS: result = matcher(cleaned) if result: iso, precision = result if approx: precision = Precision.APPROX return ParsedDate(iso, precision, raw) return ParsedDate(None, Precision.UNKNOWN, raw) def easter(year: int) -> datetime.date: """Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm.""" a = year % 19 b = year // 100 c = year % 100 d = b // 4 e = b % 4 f = (b + 8) // 25 g = (b - f + 1) // 3 h = (19 * a + b - d - g + 15) % 30 i = c // 4 k = c % 4 l = (32 + 2 * e + 2 * i - h - k) % 7 m = (a + 11 * h + 22 * l) // 451 month = (h + l - 7 * m + 114) // 31 day = ((h + l - 7 * m + 114) % 31) + 1 return datetime.date(year, month, day)