# familienarchiv/tools/import-normalizer/dates.py

"""Tolerant historical date parsing for the family archive."""
import datetime
import re
from dataclasses import dataclass
from enum import StrEnum
import config


class Precision(StrEnum):
    """Granularity at which an archive date could be resolved.

    String-valued enum: every member's value is identical to its name.
    """
    DAY = "DAY"          # a concrete calendar day was resolved
    MONTH = "MONTH"      # only month and year are known
    SEASON = "SEASON"    # a season token plus a year
    YEAR = "YEAR"        # only the year is known
    RANGE = "RANGE"      # the value denotes a span of dates
    APPROX = "APPROX"    # the value carried an uncertainty/qualifier marker
    UNKNOWN = "UNKNOWN"  # nothing could be parsed


def _advent_sunday(year: int, n: int) -> datetime.date:
    """Return the date of the n-th Sunday of Advent (n = 1..4) in `year`.

    The fourth Advent Sunday is the last Sunday on or before December 24;
    the earlier ones precede it in whole-week steps.
    """
    christmas_eve = datetime.date(year, 12, 24)
    # isoweekday() counts Mon=1..Sun=7, so "% 7" is the number of days elapsed
    # since the most recent Sunday (0 when Dec 24 itself is a Sunday).
    days_since_sunday = christmas_eve.isoweekday() % 7
    weeks_before_fourth = 4 - n
    return christmas_eve - datetime.timedelta(
        days=days_since_sunday, weeks=weeks_before_fourth)


def resolve_feast_or_season(token: str, year: int):
    """Resolve a feast or season name to a date within `year`.

    The token is lower-cased, internal whitespace is collapsed, and surrounding
    spaces/dots are removed before lookup. Lookup order: movable feasts (a day
    offset from Easter Sunday), fixed feasts ((month, day) pairs), Advent
    Sundays, then seasons (anchored on the first day of the configured month).

    Returns an (iso_date, Precision) tuple, or None for an unknown token.
    """
    key = " ".join(token.lower().split()).strip(" .")

    if key in config.MOVABLE_FEASTS:
        feast_day = easter(year) + datetime.timedelta(days=config.MOVABLE_FEASTS[key])
        return feast_day.isoformat(), Precision.DAY

    if key in config.FIXED_FEASTS:
        feast_month, feast_dom = config.FIXED_FEASTS[key]
        return datetime.date(year, feast_month, feast_dom).isoformat(), Precision.DAY

    # A bare "advent" is treated like the first Advent Sunday.
    advent_ordinals = {"advent": 1, **{f"{i}. advent": i for i in range(1, 5)}}
    ordinal = advent_ordinals.get(key)
    if ordinal is not None:
        return _advent_sunday(year, ordinal).isoformat(), Precision.DAY

    if key in config.SEASON_MONTHS:
        season_start = datetime.date(year, config.SEASON_MONTHS[key], 1)
        return season_start.isoformat(), Precision.SEASON

    return None


def expand_year(token: str):
    """Expand a 2/3/4-digit year string per the 1873–1957 century rule. None if ambiguous."""
    token = token.strip()
    if not token.isdigit():
        return None
    n, v = len(token), int(token)
    if n == 4:
        # reject gross typos (e.g. "9003") so they go to review instead of a bogus year
        return v if 1700 <= v <= 2100 else None
    if n == 3:
        return 1000 + v
    if n == 2:
        if v <= config.TWO_DIGIT_19XX_MAX:
            return 1900 + v
        if v >= config.TWO_DIGIT_18XX_MIN:
            return 1800 + v
        return None
    return None


@dataclass(frozen=True)
class ParsedDate:
    """Immutable outcome of parsing one raw date string."""
    iso: str | None          # ISO "YYYY-MM-DD" anchor date; None when nothing was resolved
    precision: Precision     # how precisely `iso` is known
    raw: str                 # the input exactly as it was handed to the parser
    end: str | None = None   # RANGE end day; None for every non-RANGE precision


@dataclass(frozen=True)
class MatchResult:
    """Uniform return shape for every _match_* matcher.

    A matcher returns None when it does not match, or a MatchResult when it does.
    `end` is the RANGE end day (None for every non-RANGE precision); `needs_review`
    is True only for a half-resolved RANGE whose start parsed but end did not.
    """
    iso: str                     # ISO "YYYY-MM-DD" date (the start day for a RANGE)
    precision: Precision         # precision the matcher itself established
    end: str | None = None       # ISO end day of a RANGE, when one was resolved
    needs_review: bool = False   # flag for a human to double-check the result


# One leading qualifier word followed by whitespace (case-insensitive).
_LEADING_MARKERS = re.compile(
    r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)


def _preprocess(raw: str) -> tuple[str, bool]:
    """Normalize a raw date cell and detect uncertainty markers.

    Steps: trim; flag as approximate when a "?" or one of the uncertainty words
    occurs anywhere; remove "(?)" and "?"; drop everything from the first comma
    on (trailing editorial note such as ", 2. Brief"); strip every leading
    qualifier (um/ca/nach/vor/anfang/…); collapse whitespace and trim
    surrounding spaces, dots and commas.

    Returns (cleaned_string, approx_flag); ("", False) for empty or None input.
    """
    s = (raw or "").strip()
    if not s:
        return "", False
    low = s.lower()
    approx = ("?" in s) or any(
        m in low for m in ("um ", "ca.", "ca ", "circa", "etwa", "wohl", "vermutlich"))
    s = re.sub(r"\(\s*\?\s*\)", " ", s)   # remove "(?)"
    s = s.replace("?", " ")
    s = re.sub(r",.*$", "", s)            # drop trailing editorial note (", 2. Brief")
    # Removing a leading "?" / "(?)" leaves whitespace behind; without this the
    # ^-anchored marker pattern would never see a qualifier that follows it.
    s = s.lstrip()
    # Qualifiers can be stacked ("wohl um 1900"), so keep stripping until stable.
    while True:
        stripped = _LEADING_MARKERS.sub("", s)
        if stripped == s:
            break
        approx = True                     # any leading qualifier signals approximation
        s = stripped
    s = re.sub(r"\s+", " ", s).strip(" .,")
    return s, approx


# Numeric day/month/year: 1–2 digit day, "." or "/" separator, 1–2 digit month,
# optional dot and optional whitespace, then a 2–4 digit year.
_NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})")


def _match_iso(s):
    """Match a strict "YYYY-MM-DD" string that denotes a real calendar date.

    Returns a DAY-precision MatchResult carrying `s` unchanged, or None when the
    shape is wrong or the date does not exist.
    """
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s) is None:
        return None
    try:
        # Called purely for validation; the parsed value itself is not needed.
        datetime.date.fromisoformat(s)
    except ValueError:
        return None
    return MatchResult(s, Precision.DAY)


def _match_numeric(s):
    """Match an all-numeric day/month/year date such as "12.5.1920" or "17/6. 1916".

    Returns a DAY-precision MatchResult, or None when the string has a different
    shape, the year cannot be expanded, or the date does not exist.
    """
    m = _NUM_RE.fullmatch(s)
    if m is None:
        return None
    day_txt, month_txt, year_txt = m.groups()
    day, month = int(day_txt), int(month_txt)
    year = expand_year(year_txt)
    if year is None or month < 1 or month > 12:
        return None
    try:
        iso = datetime.date(year, month, day).isoformat()
    except ValueError:  # e.g. day 31 in a 30-day month
        return None
    return MatchResult(iso, Precision.DAY)


# Day, dot, month written with the letters I/V/X/L/C (case-insensitive),
# optional dot and whitespace, then a 2–4 digit year.
_ROMAN_RE = re.compile(r"(\d{1,2})\.\s*([IVXLC]+)\.?\s*(\d{2,4})", re.I)


def _match_roman(s):
    """Match a date whose month is a Roman numeral, e.g. "12. V. 1920".

    The numeral is lower-cased and looked up in config.ROMAN_MONTHS. Returns a
    DAY-precision MatchResult, or None when the shape, numeral, year or resulting
    date is invalid.
    """
    m = _ROMAN_RE.fullmatch(s)
    if m is None:
        return None
    day_txt, numeral, year_txt = m.groups()
    day = int(day_txt)
    month = config.ROMAN_MONTHS.get(numeral.lower())
    year = expand_year(year_txt)
    if year is None or not month:
        return None
    try:
        iso = datetime.date(year, month, day).isoformat()
    except ValueError:
        return None
    return MatchResult(iso, Precision.DAY)


# Day first, then a month word (ASCII letters plus German umlauts), optional dot
# and whitespace, then a 2–4 digit year; dots/whitespace after the day are optional.
_MONTH_A_RE = re.compile(r"(\d{1,2})[.\s]*([A-Za-zÄÖÜäöü]+)\.?\s*(\d{2,4})")


def _lookup_month(token: str):
    """Look up a month word in config.MONTHS.

    The token is lower-cased and stripped of surrounding spaces and dots first.
    Returns whatever config.MONTHS holds for it, or None when it is not listed.
    """
    normalized = token.lower().strip(" .")
    return config.MONTHS.get(normalized)


def _build_day_month_year(day, month, year):
    """Assemble a DAY-precision MatchResult from already-resolved date parts.

    Returns None when the month is missing or outside 1..12, the year is None,
    or the combination is not a real calendar date.
    """
    if year is None or not month:
        return None
    if month < 1 or month > 12:
        return None
    try:
        iso = datetime.date(year, month, day).isoformat()
    except ValueError:
        return None
    return MatchResult(iso, Precision.DAY)


def _match_monthname_a(s):
    """Match "DAY MONTHNAME YEAR" dates such as "7. Sept. 1923".

    Returns a DAY-precision MatchResult, or None when the string does not fit or
    any part fails to resolve.
    """
    m = _MONTH_A_RE.fullmatch(s)
    if m is None:
        return None
    day_txt, month_word, year_txt = m.groups()
    return _build_day_month_year(
        int(day_txt), _lookup_month(month_word), expand_year(year_txt))


# Month word first, then day and year. A separator (dot OR hyphen/en-dash) after the
# day is REQUIRED so this can't match "Mai 1895" (MONTH YYYY) as day=18; the hyphen
# form also covers Spanish "Mayo 18-1929".
_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\s*[.\-–]\s*(\d{2,4})")


def _match_monthname_b(s):
    """Match "MONTHNAME DAY<sep>YEAR" dates such as "Mayo 18-1929".

    Returns a DAY-precision MatchResult, or None when the string does not fit or
    any part fails to resolve.
    """
    m = _MONTH_B_RE.fullmatch(s)
    if m is None:
        return None
    month_word, day_txt, year_txt = m.groups()
    return _build_day_month_year(
        int(day_txt), _lookup_month(month_word), expand_year(year_txt))


# Month word, optional dot, whitespace, 2–4 digit year ("Mai 1895").
_MONTH_YEAR_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s+(\d{2,4})")
# Arbitrary token (matched lazily), optional dot, whitespace, 2–4 digit year.
_TOKEN_YEAR_RE = re.compile(r"(.+?)\.?\s+(\d{2,4})")
# Exactly four digits when used with fullmatch().
_YEAR_ONLY_RE = re.compile(r"\d{4}")
# Four-digit year, slash, two digits ("1914/18"); only the first year is captured.
_RANGE_YY_RE = re.compile(r"(\d{4})\s*/\s*\d{2}")
# Something ending in a digit, a hyphen or en-dash, then something starting with a
# digit; only the part before the dash is captured.
_RANGE_HYPHEN_RE = re.compile(r"(.*\d)\s*[-–]\s*\d.*")
# Intra-month day range, e.g. "7./8. Sept.1923" — require a dot before the slash so it
# does NOT swallow slash-as-dot single dates like "17/6. 1916" (which has no dot before "/").
_RANGE_DAY_RE = re.compile(r"(\d{1,2})\./(\d{1,2})\.\s*(.+)")


def _match_month_year(s):
    """Match "MONTHNAME YEAR" (no day), e.g. "Mai 1895".

    Returns a MONTH-precision MatchResult anchored on the first of the month, or
    None when the month word or the year cannot be resolved.
    """
    m = _MONTH_YEAR_RE.fullmatch(s)
    if m is None:
        return None
    month_word, year_txt = m.groups()
    month = _lookup_month(month_word)
    year = expand_year(year_txt)
    if year is None or not month:
        return None
    first_of_month = datetime.date(year, month, 1)
    return MatchResult(first_of_month.isoformat(), Precision.MONTH)


def _match_feast_season(s):
    """Match "<feast or season> YEAR".

    The token in front of the year is handed to resolve_feast_or_season(), and the
    precision it reports is passed through. Returns None when the shape, the year
    or the token is not recognized.
    """
    m = _TOKEN_YEAR_RE.fullmatch(s)
    if m is None:
        return None
    name, year_txt = m.groups()
    year = expand_year(year_txt)
    if year is None:
        return None
    resolved = resolve_feast_or_season(name, year)
    # resolved is an (iso, precision) pair, matching MatchResult's first two fields
    return MatchResult(*resolved) if resolved is not None else None


def _match_year_only(s):
    """Match a bare four-digit year.

    The year goes through expand_year() so it is held to the same plausibility
    rule as every other matcher: a typo such as "9003" is left unparsed for
    review, and "0000" no longer makes datetime.date() raise ValueError.

    Returns a YEAR-precision MatchResult anchored on January 1, or None.
    """
    if not _YEAR_ONLY_RE.fullmatch(s):
        return None
    year = expand_year(s)
    if year is None:
        return None
    return MatchResult(datetime.date(year, 1, 1).isoformat(), Precision.YEAR)


def _match_range(s):
    """Match a date range and return a RANGE-precision MatchResult, else None.

    Three shapes are tried in order:

    1. "YYYY/YY" — anchored on January 1 of the first year; no end is resolved.
    2. "D./D. <month and year>" — an intra-month day range. Start and end day are
       each resolved against the shared remainder; when only the start resolves,
       the result carries end=None and needs_review=True.
    3. "<start> - <something starting with a digit>" — only the start is resolved.
    """
    m = _RANGE_YY_RE.fullmatch(s)
    if m:
        # Validate through expand_year() instead of a bare int(): an implausible
        # year is skipped and "0000/01" can no longer raise ValueError here.
        start_year = expand_year(m.group(1))
        if start_year is not None:
            return MatchResult(datetime.date(start_year, 1, 1).isoformat(), Precision.RANGE)
    m = _RANGE_DAY_RE.fullmatch(s)
    if m:
        day_start, day_end, rest = m.group(1), m.group(2), m.group(3)
        # "10." + "1.1917" -> "10.1.1917"; resolve start and end day against the shared month/year
        for matcher in (_match_numeric, _match_roman, _match_monthname_a):
            start = matcher(f"{day_start}.{rest}")
            if start:
                end = matcher(f"{day_end}.{rest}")
                if end is None:
                    # half-resolved range: flag it for review
                    return MatchResult(start.iso, Precision.RANGE, needs_review=True)
                return MatchResult(start.iso, Precision.RANGE, end.iso)
    m = _RANGE_HYPHEN_RE.fullmatch(s)
    if m:
        start = m.group(1).strip()
        for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only):
            r = matcher(start)
            if r:
                return MatchResult(r.iso, Precision.RANGE)
    return None


# Matchers in the order parse_date() tries them. Order is significant: the first
# matcher that returns a result wins.
_MATCHERS = [
    _match_iso,
    _match_range,
    _match_numeric,
    _match_roman,
    _match_monthname_a,
    _match_month_year,
    _match_monthname_b,
    _match_feast_season,
    _match_year_only,
]


def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse one raw date string into a ParsedDate.

    Args:
        raw: The date text as found in the source; None or empty yields UNKNOWN.
        date_overrides: Optional mapping from the stripped raw text to an
            (iso, precision) pair. A hit bypasses all parsing; an empty iso is
            stored as None.

    Returns:
        A ParsedDate. When no matcher recognizes the cleaned text, iso is None
        and precision is UNKNOWN. When the text carried an uncertainty marker,
        precision is APPROX regardless of what the matcher reported.

    Raises:
        ValueError: if an override names a precision that is not a Precision value.
    """
    if date_overrides:
        key = (raw or "").strip()
        if key in date_overrides:
            iso, prec = date_overrides[key]
            return ParsedDate(iso or None, Precision(prec), raw)
    cleaned, approx = _preprocess(raw)
    if not cleaned:
        return ParsedDate(None, Precision.UNKNOWN, raw)
    for matcher in _MATCHERS:
        result = matcher(cleaned)
        if result is None:
            continue
        precision = Precision.APPROX if approx else result.precision
        # ParsedDate.end is defined as None for every non-RANGE precision, so an
        # approximate range (reported as APPROX) must not carry an end day.
        end = result.end if precision is Precision.RANGE else None
        return ParsedDate(result.iso, precision, raw, end)
    return ParsedDate(None, Precision.UNKNOWN, raw)


def easter(year: int) -> datetime.date:
    """Return Easter Sunday (Gregorian calendar) for `year`.

    Implements the Anonymous Gregorian / Butcher algorithm using integer
    arithmetic only.
    """
    cycle_pos = year % 19
    century, year_in_century = divmod(year, 100)
    century_quads, century_rest = divmod(century, 4)
    corr_f = (century + 8) // 25
    corr_g = (century - corr_f + 1) // 3
    h_term = (19 * cycle_pos + century - century_quads - corr_g + 15) % 30
    yy_quads, yy_rest = divmod(year_in_century, 4)
    l_term = (32 + 2 * century_rest + 2 * yy_quads - h_term - yy_rest) % 7
    m_term = (cycle_pos + 11 * h_term + 22 * l_term) // 451
    # The combined term encodes the month as its quotient by 31 and the
    # zero-based day of month as the remainder.
    month, day_zero_based = divmod(h_term + l_term - 7 * m_term + 114, 31)
    return datetime.date(year, month, day_zero_based + 1)