Files
familienarchiv/tools/import-normalizer/dates.py
Marcel 5efe3b8a7c
All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m31s
CI / OCR Service Tests (pull_request) Successful in 22s
CI / Backend Unit Tests (pull_request) Successful in 3m42s
CI / fail2ban Regex (pull_request) Successful in 45s
CI / Semgrep Security Scan (pull_request) Successful in 20s
CI / Compose Bucket Idempotency (pull_request) Successful in 1m2s
feat(normalizer): parse Spanish month names + Month DD-YYYY hyphen form
Add Spanish month names (Mexican-branch letters) to config.MONTHS and let
the month-first matcher accept a hyphen (not just a dot) before the year, so
"Mayo 18-1929"/"Junio 7-904" parse without manual overrides. Also bound
4-digit years to 1700-2100 so gross typos ("23-9003") stay in review instead
of producing a bogus year. Cuts unknown-date rate 9.2% -> 7.9%.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 17:00:33 +02:00

280 lines
8.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tolerant historical date parsing for the family archive."""
import datetime
import re
from dataclasses import dataclass
from enum import StrEnum
import config
class Precision(StrEnum):
    """How exactly a parsed date is known; stored alongside the ISO date."""

    # Full calendar date (numeric, Roman-numeral, month-name and feast matchers).
    DAY = "DAY"
    # Only month and year are known; the ISO date is the first of that month.
    MONTH = "MONTH"
    # A season name plus a year; the ISO date is the first of the season's configured month.
    SEASON = "SEASON"
    # Only the year is known; the ISO date is 1 January of that year.
    YEAR = "YEAR"
    # The input described a span; the ISO date is the start of the span.
    RANGE = "RANGE"
    # The input carried an uncertainty marker ("?", "ca.", "um", ...); set by parse_date.
    APPROX = "APPROX"
    # Nothing could be parsed; parse_date pairs this with a None ISO date.
    UNKNOWN = "UNKNOWN"
def _advent_sunday(year: int, n: int) -> datetime.date:
"""n-th Advent (1..4). 4th Advent = last Sunday on/before Dec 24."""
dec24 = datetime.date(year, 12, 24)
back_to_sunday = (dec24.weekday() - 6) % 7 # Mon=0..Sun=6
fourth = dec24 - datetime.timedelta(days=back_to_sunday)
return fourth - datetime.timedelta(days=(4 - n) * 7)
def resolve_feast_or_season(token: str, year: int):
    """Resolve a feast or season name to a date within ``year``.

    The token is lower-cased, its whitespace runs are collapsed to single
    spaces, and surrounding spaces/dots are trimmed before lookup.

    Returns an ``(iso_date, Precision)`` tuple, or None when the token is
    not a known feast, Advent Sunday or season.
    """
    normalized = " ".join(token.lower().split()).strip(" .")

    # Movable feasts are configured as a day offset from Easter Sunday.
    if normalized in config.MOVABLE_FEASTS:
        offset_days = config.MOVABLE_FEASTS[normalized]
        feast_date = easter(year) + datetime.timedelta(days=offset_days)
        return feast_date.isoformat(), Precision.DAY

    # Fixed feasts are configured as a (month, day) pair.
    if normalized in config.FIXED_FEASTS:
        feast_month, feast_day = config.FIXED_FEASTS[normalized]
        return datetime.date(year, feast_month, feast_day).isoformat(), Precision.DAY

    # A bare "advent" is treated as the 1st Advent.
    advent_ordinals = {f"{k}. advent": k for k in range(1, 5)}
    advent_ordinals["advent"] = 1
    ordinal = advent_ordinals.get(normalized)
    if ordinal is not None:
        return _advent_sunday(year, ordinal).isoformat(), Precision.DAY

    # Seasons resolve to the first day of their configured month.
    if normalized in config.SEASON_MONTHS:
        season_start = datetime.date(year, config.SEASON_MONTHS[normalized], 1)
        return season_start.isoformat(), Precision.SEASON

    return None
def expand_year(token: str):
    """Expand a 2-, 3- or 4-digit year string to a full year.

    Follows the archive's 1873-1957 century rule:

    * 4 digits are taken literally.
    * 3 digits are read as a year with the leading "1" dropped ("904" -> 1904).
    * 2 digits map to 19xx when at most ``config.TWO_DIGIT_19XX_MAX`` and to
      18xx when at least ``config.TWO_DIGIT_18XX_MIN``; values in between are
      ambiguous.

    Returns the year as an int, or None when the token is not purely decimal
    digits, has an unsupported length, is ambiguous, or (for 3/4-digit input)
    falls outside the plausible range 1700-2100.
    """
    token = token.strip()
    # isdecimal, not isdigit: isdigit also accepts characters such as "²"
    # that int() rejects with ValueError.
    if not token.isdecimal():
        return None

    value = int(token)
    width = len(token)

    if width == 2:
        if value <= config.TWO_DIGIT_19XX_MAX:
            return 1900 + value
        if value >= config.TWO_DIGIT_18XX_MIN:
            return 1800 + value
        return None  # falls in the ambiguous gap between the two thresholds

    if width == 3:
        year = 1000 + value
    elif width == 4:
        year = value
    else:
        return None

    # Reject gross typos (e.g. "9003", or "123" -> 1123) so they go to review
    # instead of producing a bogus year.
    return year if 1700 <= year <= 2100 else None
@dataclass(frozen=True)
class ParsedDate:
    """Immutable result of parsing one raw date string."""

    # ISO date string, or None when nothing could be parsed
    # (parse_date pairs None with Precision.UNKNOWN in that case).
    iso: str | None
    # How exact ``iso`` is.
    precision: Precision
    # The input exactly as it was passed to parse_date.
    raw: str
_LEADING_MARKERS = re.compile(
r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)
def _preprocess(raw: str):
"""Return (cleaned_string, approx_flag). Any uncertainty/qualifier marker -> approx."""
s = (raw or "").strip()
if not s:
return "", False
low = s.lower()
approx = ("?" in s) or any(
m in low for m in ("um ", "ca.", "ca ", "circa", "etwa", "wohl", "vermutlich"))
s = re.sub(r"\(\s*\?\s*\)", " ", s) # remove "(?)"
s = s.replace("?", " ")
s = re.sub(r",.*$", "", s) # drop trailing editorial note (", 2. Brief")
stripped = _LEADING_MARKERS.sub("", s)
if stripped != s: # a leading qualifier (um/ca/nach/vor/anfang/…) signals approximation
approx = True
s = re.sub(r"\s+", " ", stripped).strip(" .,")
return s, approx
# All-numeric day/month/year: 1-2 digit day, "." or "/", 1-2 digit month, optional dot,
# optional whitespace, then a 2-4 digit year (e.g. "17/6. 1916").
_NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})")
def _match_iso(s):
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s):
try:
datetime.date.fromisoformat(s)
return s, Precision.DAY
except ValueError:
return None
return None
def _match_numeric(s):
    """Parse an all-numeric day/month/year date such as "17/6. 1916".

    Returns ``(iso, Precision.DAY)``, or None when the pattern does not match,
    the year cannot be expanded, the month is outside 1..12, or the day does
    not exist in that month.
    """
    hit = _NUM_RE.fullmatch(s)
    if hit is None:
        return None

    day_txt, month_txt, year_txt = hit.groups()
    month = int(month_txt)
    year = expand_year(year_txt)
    if year is None:
        return None
    if month < 1 or month > 12:
        return None

    try:
        parsed = datetime.date(year, month, int(day_txt))
    except ValueError:  # day not valid for this month/year
        return None
    return parsed.isoformat(), Precision.DAY
_ROMAN_RE = re.compile(r"(\d{1,2})\.\s*([IVXLC]+)\.?\s*(\d{2,4})", re.I)
def _match_roman(s):
m = _ROMAN_RE.fullmatch(s)
if not m:
return None
day = int(m.group(1))
month = config.ROMAN_MONTHS.get(m.group(2).lower())
year = expand_year(m.group(3))
if not month or year is None:
return None
try:
return datetime.date(year, month, day).isoformat(), Precision.DAY
except ValueError:
return None
# Day-first month-name form: 1-2 digit day, optional dots/whitespace, a month word
# (ASCII letters plus German umlauts), optional dot, then a 2-4 digit year,
# e.g. "7.Sept.1923".
_MONTH_A_RE = re.compile(r"(\d{1,2})[.\s]*([A-Za-zÄÖÜäöü]+)\.?\s*(\d{2,4})")
def _lookup_month(token: str):
    """Look up a month word in ``config.MONTHS``.

    The word is lower-cased and stripped of surrounding spaces and dots
    first. Returns whatever ``config.MONTHS.get`` yields for it (None for an
    unknown word, assuming a plain dict).
    """
    normalized = token.lower().strip(" .")
    return config.MONTHS.get(normalized)
def _build_day_month_year(day, month, year):
if not month or year is None or not (1 <= month <= 12):
return None
try:
return datetime.date(year, month, day).isoformat(), Precision.DAY
except ValueError:
return None
def _match_monthname_a(s):
    """Parse the day-first month-name form, e.g. "7.Sept.1923".

    Returns ``(iso, Precision.DAY)`` or None.
    """
    hit = _MONTH_A_RE.fullmatch(s)
    if hit is None:
        return None
    day_txt, month_word, year_txt = hit.groups()
    return _build_day_month_year(
        int(day_txt), _lookup_month(month_word), expand_year(year_txt))
# A separator (dot OR hyphen/en-dash) after the day is REQUIRED so this can't match
# "Mai 1895" (MONTH YYYY) as day=18; the hyphen form also covers Spanish "Mayo 18-1929".
_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\s*[.\-]\s*(\d{2,4})")
def _match_monthname_b(s):
m = _MONTH_B_RE.fullmatch(s)
if not m:
return None
return _build_day_month_year(int(m.group(2)), _lookup_month(m.group(1)), expand_year(m.group(3)))
_MONTH_YEAR_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s+(\d{2,4})")
_TOKEN_YEAR_RE = re.compile(r"(.+?)\.?\s+(\d{2,4})")
_YEAR_ONLY_RE = re.compile(r"\d{4}")
_RANGE_YY_RE = re.compile(r"(\d{4})\s*/\s*\d{2}")
_RANGE_HYPHEN_RE = re.compile(r"(.*\d)\s*[-]\s*\d.*")
# Intra-month day range, e.g. "7./8. Sept.1923" — require a dot before the slash so it
# does NOT swallow slash-as-dot single dates like "17/6. 1916" (which has no dot before "/").
_RANGE_DAY_RE = re.compile(r"(\d{1,2})\./(\d{1,2})\.\s*(.+)")
def _match_month_year(s):
    """Parse "Month YYYY" (e.g. "Mai 1895") to the first day of that month.

    Returns ``(iso, Precision.MONTH)``, or None when the pattern, the month
    word or the year is not recognised.
    """
    hit = _MONTH_YEAR_RE.fullmatch(s)
    if hit is None:
        return None
    month_word, year_txt = hit.groups()
    month = _lookup_month(month_word)
    year = expand_year(year_txt)
    if year is None or not month:
        return None
    first_of_month = datetime.date(year, month, 1)
    return first_of_month.isoformat(), Precision.MONTH
def _match_feast_season(s):
    """Parse "<feast or season name> <year>".

    The trailing 2-4 digit number is expanded to a year and the leading token
    is handed to ``resolve_feast_or_season``. Returns that function's result,
    or None when the pattern does not match or the year cannot be expanded.
    """
    hit = _TOKEN_YEAR_RE.fullmatch(s)
    year = expand_year(hit.group(2)) if hit else None
    if year is None:
        return None
    return resolve_feast_or_season(hit.group(1), year)
def _match_year_only(s):
    """Parse a bare 4-digit year to 1 January of that year.

    The year is validated through ``expand_year`` like in every other
    matcher, so implausible values (e.g. "9003") are rejected instead of
    producing a bogus date, and "0000" no longer raises ValueError from
    ``datetime.date``.

    Returns ``(iso, Precision.YEAR)`` or None.
    """
    if not _YEAR_ONLY_RE.fullmatch(s):
        return None
    year = expand_year(s)
    if year is None:
        return None
    return datetime.date(year, 1, 1).isoformat(), Precision.YEAR
def _match_range(s):
    """Parse a date span and return its START as ``(iso, Precision.RANGE)``.

    Three shapes are tried in order:

    1. "YYYY/YY" year span: 1 January of the start year. The start year is
       validated through ``expand_year`` so that "0000/12" cannot raise
       ValueError from ``datetime.date`` and implausible years are rejected.
    2. Intra-month day span such as "7./8. Sept.1923": the first day combined
       with the shared month/year remainder.
    3. "<start> - <end>": the start part parsed as a numeric, Roman-numeral,
       day-first month-name or year-only date.

    Returns None when no shape yields a parseable start.
    """
    year_span = _RANGE_YY_RE.fullmatch(s)
    if year_span:
        start_year = expand_year(year_span.group(1))
        if start_year is not None:
            return datetime.date(start_year, 1, 1).isoformat(), Precision.RANGE

    day_span = _RANGE_DAY_RE.fullmatch(s)
    if day_span:
        # Glue the first day onto the remainder: "7." + "Sept.1923" -> "7.Sept.1923"
        first = f"{day_span.group(1)}.{day_span.group(3)}"
        for matcher in (_match_numeric, _match_monthname_a):
            result = matcher(first)
            if result:
                return result[0], Precision.RANGE

    hyphen_span = _RANGE_HYPHEN_RE.fullmatch(s)
    if hyphen_span:
        start = hyphen_span.group(1).strip()
        for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only):
            result = matcher(start)
            if result:
                return result[0], Precision.RANGE

    return None
# Matchers in the order parse_date tries them; the first truthy result wins, so the
# order is significant. Each takes the cleaned string and returns (iso, Precision) or None.
_MATCHERS = [
    _match_iso,
    _match_range,
    _match_numeric,
    _match_roman,
    _match_monthname_a,
    _match_month_year,
    _match_monthname_b,
    _match_feast_season,
    _match_year_only,
]
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse a raw historical date string into a ``ParsedDate``.

    Args:
        raw: The date text as found in the source; may be empty or None.
        date_overrides: Optional mapping from the stripped raw string to an
            ``(iso, precision)`` pair that bypasses parsing entirely.

    Returns:
        A ``ParsedDate`` carrying the original ``raw``. When an uncertainty
        marker was found the precision is ``APPROX`` regardless of what the
        matcher reported; when nothing matches, ``iso`` is None and the
        precision is ``UNKNOWN``.
    """
    if date_overrides:
        lookup_key = (raw or "").strip()
        if lookup_key in date_overrides:
            override_iso, override_precision = date_overrides[lookup_key]
            # An empty override ISO string is normalised to None.
            return ParsedDate(override_iso or None, Precision(override_precision), raw)

    cleaned, approx = _preprocess(raw)
    if cleaned:
        for matcher in _MATCHERS:
            hit = matcher(cleaned)
            if not hit:
                continue
            iso, precision = hit
            return ParsedDate(iso, Precision.APPROX if approx else precision, raw)

    return ParsedDate(None, Precision.UNKNOWN, raw)
def easter(year: int) -> datetime.date:
    """Return Gregorian Easter Sunday for ``year``.

    Implements the Anonymous Gregorian (Butcher) algorithm; the single-letter
    names follow the published formulation, paired up with ``divmod`` where
    the algorithm takes both quotient and remainder of the same division.
    """
    a = year % 19
    b, c = divmod(year, 100)
    d, e = divmod(b, 4)
    f = (b + 8) // 25
    g = (b - f + 1) // 3
    h = (19 * a + b - d - g + 15) % 30
    i, k = divmod(c, 4)
    el = (32 + 2 * e + 2 * i - h - k) % 7
    m = (a + 11 * h + 22 * el) // 451
    month, day_index = divmod(h + el - 7 * m + 114, 31)
    return datetime.date(year, month, day_index + 1)