Files
familienarchiv/tools/import-normalizer/dates.py
2026-05-25 13:30:07 +02:00

157 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tolerant historical date parsing for the family archive."""
import datetime
import re
from dataclasses import dataclass
from enum import StrEnum
import config
class Precision(StrEnum):
DAY = "DAY"
MONTH = "MONTH"
SEASON = "SEASON"
YEAR = "YEAR"
RANGE = "RANGE"
APPROX = "APPROX"
UNKNOWN = "UNKNOWN"
def _advent_sunday(year: int, n: int) -> datetime.date:
"""n-th Advent (1..4). 4th Advent = last Sunday on/before Dec 24."""
dec24 = datetime.date(year, 12, 24)
back_to_sunday = (dec24.weekday() - 6) % 7 # Mon=0..Sun=6
fourth = dec24 - datetime.timedelta(days=back_to_sunday)
return fourth - datetime.timedelta(days=(4 - n) * 7)
def resolve_feast_or_season(token: str, year: int):
    """Resolve a feast or season name to ``(iso_date, Precision)``.

    The token is lower-cased, its whitespace runs are collapsed to single
    spaces, and surrounding blanks and dots are removed before lookup.
    Lookups are tried in this order:

    1. ``config.MOVABLE_FEASTS`` -- value is a day offset from Easter Sunday.
    2. ``config.FIXED_FEASTS`` -- value is a ``(month, day)`` pair.
    3. Advent Sundays ("1. advent" .. "4. advent"; bare "advent" means the
       first one).
    4. ``config.SEASON_MONTHS`` -- value is a month number; the result is
       the first day of that month with ``Precision.SEASON``.

    Returns None when the token is not known to any of these tables.
    """
    name = " ".join(token.lower().split()).strip(" .")

    if name in config.MOVABLE_FEASTS:
        easter_sunday = easter(year)
        offset = datetime.timedelta(days=config.MOVABLE_FEASTS[name])
        return (easter_sunday + offset).isoformat(), Precision.DAY

    if name in config.FIXED_FEASTS:
        month, day = config.FIXED_FEASTS[name]
        fixed = datetime.date(year, month, day)
        return fixed.isoformat(), Precision.DAY

    advent_number = {
        "1. advent": 1,
        "2. advent": 2,
        "3. advent": 3,
        "4. advent": 4,
        "advent": 1,
    }
    if name in advent_number:
        sunday = _advent_sunday(year, advent_number[name])
        return sunday.isoformat(), Precision.DAY

    if name in config.SEASON_MONTHS:
        season_start = datetime.date(year, config.SEASON_MONTHS[name], 1)
        return season_start.isoformat(), Precision.SEASON

    return None
def expand_year(token: str):
"""Expand a 2/3/4-digit year string per the 18731957 century rule. None if ambiguous."""
token = token.strip()
if not token.isdigit():
return None
n, v = len(token), int(token)
if n == 4:
return v
if n == 3:
return 1000 + v
if n == 2:
if v <= config.TWO_DIGIT_19XX_MAX:
return 1900 + v
if v >= config.TWO_DIGIT_18XX_MIN:
return 1800 + v
return None
return None
@dataclass(frozen=True)
class ParsedDate:
    """Immutable result of parsing one raw date string."""

    # ISO-8601 calendar date (YYYY-MM-DD), or None when nothing was parsed.
    iso: str | None
    # How exactly the date is known.
    precision: Precision
    # The input exactly as it was handed to the parser.
    raw: str
_LEADING_MARKERS = re.compile(
r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)
def _preprocess(raw: str):
"""Return (cleaned_string, approx_flag)."""
s = (raw or "").strip()
if not s:
return "", False
low = s.lower()
approx = ("?" in s) or any(
m in low for m in ("um ", "ca.", "ca ", "circa", "etwa", "wohl", "vermutlich"))
s = re.sub(r"\(\s*\?\s*\)", " ", s) # remove "(?)"
s = s.replace("?", " ")
s = re.sub(r",.*$", "", s) # drop trailing editorial note (", 2. Brief")
s = _LEADING_MARKERS.sub("", s)
s = re.sub(r"\s+", " ", s).strip(" .,")
return s, approx
_NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})")
def _match_iso(s):
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s):
try:
datetime.date.fromisoformat(s)
return s, Precision.DAY
except ValueError:
return None
return None
def _match_numeric(s):
    """Match a numeric day/month/year string as described by ``_NUM_RE``.

    The year part is expanded with ``expand_year``. Returns
    ``(iso_date, Precision.DAY)``, or None when the string does not match,
    the year cannot be expanded, or the numbers do not form a real date.
    """
    found = _NUM_RE.fullmatch(s)
    if found is None:
        return None

    day_text, month_text, year_text = found.groups()
    day, month = int(day_text), int(month_text)
    year = expand_year(year_text)
    if year is None:
        return None
    if month < 1 or month > 12:
        return None

    try:
        # Rejects impossible days such as 31 April or day 0.
        parsed = datetime.date(year, month, day)
    except ValueError:
        return None
    return parsed.isoformat(), Precision.DAY
# Matchers are tried in order and the first one that returns a result wins.
# Each takes the cleaned string and returns (iso, precision) or None.
# Later tasks append to this list.
_MATCHERS = [
    _match_iso,
    _match_numeric,
]
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse a raw archive date string into a ``ParsedDate``.

    Args:
        raw: The date text as found in the source; may be empty or None.
        date_overrides: Optional mapping from a stripped raw string to an
            ``(iso, precision)`` pair. A hit bypasses all parsing; an empty
            ``iso`` is stored as None and ``precision`` is converted with
            ``Precision(...)``.

    Returns:
        A ``ParsedDate`` that always carries the original *raw* value.
        When no matcher accepts the cleaned input the result has
        ``iso=None`` and ``Precision.UNKNOWN``. When the input was flagged
        as approximate, the matcher's precision is replaced by
        ``Precision.APPROX``.
    """
    if date_overrides:
        lookup = (raw or "").strip()
        if lookup in date_overrides:
            iso, prec = date_overrides[lookup]
            return ParsedDate(iso or None, Precision(prec), raw)

    cleaned, approx = _preprocess(raw)
    if cleaned:
        for matcher in _MATCHERS:
            hit = matcher(cleaned)
            if not hit:
                continue
            iso, precision = hit
            # An uncertainty marker outranks whatever the matcher reported.
            final_precision = Precision.APPROX if approx else precision
            return ParsedDate(iso, final_precision, raw)

    return ParsedDate(None, Precision.UNKNOWN, raw)
def easter(year: int) -> datetime.date:
    """Return Easter Sunday of *year* in the Gregorian calendar.

    Implements the Anonymous Gregorian ("Meeus/Jones/Butcher") computus
    using integer arithmetic only.
    """
    cycle_pos = year % 19
    century, year_in_century = divmod(year, 100)
    century_quads, century_rest = divmod(century, 4)
    f_term = (century + 8) // 25
    g_term = (century - f_term + 1) // 3
    h_term = (19 * cycle_pos + century - century_quads - g_term + 15) % 30
    year_quads, year_rest = divmod(year_in_century, 4)
    l_term = (32 + 2 * century_rest + 2 * year_quads - h_term - year_rest) % 7
    m_term = (cycle_pos + 11 * h_term + 22 * l_term) // 451
    # The combined value encodes the month as quotient and the zero-based
    # day of the month as remainder of a division by 31.
    month, day_index = divmod(h_term + l_term - 7 * m_term + 114, 31)
    return datetime.date(year, month, day_index + 1)