_preprocess now sets approx=True when a leading marker is stripped; add _match_year_only so bare years (e.g. "nach 1900" -> "1900") resolve to 1900-01-01/YEAR before being upgraded to APPROX. Strengthen test_parse_approx_marker_upgrades_precision and add test_parse_leading_qualifier_is_approx (11 tests, all pass). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
168 lines
5.1 KiB
Python
168 lines
5.1 KiB
Python
"""Tolerant historical date parsing for the family archive."""
|
||
import datetime
|
||
import re
|
||
from dataclasses import dataclass
|
||
from enum import StrEnum
|
||
import config
|
||
|
||
|
||
class Precision(StrEnum):
    """How precisely a parsed historical date is known."""

    # Exact calendar day (ISO / numeric dates and resolved feast days).
    DAY = "DAY"
    # NOTE(review): not produced by any matcher visible in this file —
    # presumably month-level precision; confirm against later matchers.
    MONTH = "MONTH"
    # A season token; the ISO date is the 1st of the month configured for that season.
    SEASON = "SEASON"
    # A bare four-digit year; the ISO date is January 1st of that year.
    YEAR = "YEAR"
    # NOTE(review): not produced by any matcher visible in this file —
    # presumably a from/to span; confirm against later matchers.
    RANGE = "RANGE"
    # Any matched date whose raw text carried an uncertainty/qualifier marker;
    # parse_date replaces the matcher's precision with this value.
    APPROX = "APPROX"
    # Empty or unparseable input; the ISO date is None.
    UNKNOWN = "UNKNOWN"
|
||
|
||
|
||
def _advent_sunday(year: int, n: int) -> datetime.date:
    """Return the date of the n-th Advent Sunday (n in 1..4) of *year*.

    The 4th Advent is the last Sunday on or before December 24; earlier
    Advent Sundays lie whole weeks before it.
    """
    christmas_eve = datetime.date(year, 12, 24)
    # isoweekday() is Mon=1..Sun=7, so "% 7" is the number of days elapsed
    # since the most recent Sunday (0 when Dec 24 itself is a Sunday).
    days_since_sunday = christmas_eve.isoweekday() % 7
    weeks_before_fourth = 4 - n
    return christmas_eve - datetime.timedelta(
        days=days_since_sunday, weeks=weeks_before_fourth)
|
||
|
||
|
||
def resolve_feast_or_season(token: str, year: int):
    """Resolve a feast or season name to ``(iso_date, Precision)`` for *year*.

    The token is lower-cased, internal whitespace is collapsed, and leading or
    trailing spaces/dots are removed before lookup. Lookup order: movable
    feasts (day offset from Easter), fixed feasts (month, day), Advent
    Sundays, seasons (1st of the configured month). Returns None for an
    unknown token.
    """
    normalized = " ".join(token.lower().split()).strip(" .")

    if normalized in config.MOVABLE_FEASTS:
        offset_days = config.MOVABLE_FEASTS[normalized]
        feast_day = easter(year) + datetime.timedelta(days=offset_days)
        return feast_day.isoformat(), Precision.DAY

    if normalized in config.FIXED_FEASTS:
        fixed_month, fixed_day = config.FIXED_FEASTS[normalized]
        feast_day = datetime.date(year, fixed_month, fixed_day)
        return feast_day.isoformat(), Precision.DAY

    # A bare "advent" is treated as the first Advent Sunday.
    advent_number = {
        "advent": 1,
        "1. advent": 1,
        "2. advent": 2,
        "3. advent": 3,
        "4. advent": 4,
    }
    if normalized in advent_number:
        sunday = _advent_sunday(year, advent_number[normalized])
        return sunday.isoformat(), Precision.DAY

    if normalized in config.SEASON_MONTHS:
        season_start = datetime.date(year, config.SEASON_MONTHS[normalized], 1)
        return season_start.isoformat(), Precision.SEASON

    return None
|
||
|
||
|
||
def expand_year(token: str):
|
||
"""Expand a 2/3/4-digit year string per the 1873–1957 century rule. None if ambiguous."""
|
||
token = token.strip()
|
||
if not token.isdigit():
|
||
return None
|
||
n, v = len(token), int(token)
|
||
if n == 4:
|
||
return v
|
||
if n == 3:
|
||
return 1000 + v
|
||
if n == 2:
|
||
if v <= config.TWO_DIGIT_19XX_MAX:
|
||
return 1900 + v
|
||
if v >= config.TWO_DIGIT_18XX_MIN:
|
||
return 1800 + v
|
||
return None
|
||
return None
|
||
|
||
|
||
@dataclass(frozen=True)
class ParsedDate:
    """Immutable result of parsing one raw date string."""

    # ISO date string produced by a matcher (or supplied by an override);
    # None when the input was empty or could not be parsed.
    iso: str | None
    # How precisely `iso` is known (UNKNOWN when nothing matched).
    precision: Precision
    # The original input exactly as passed to parse_date.
    raw: str
|
||
|
||
|
||
# Qualifier words (case-insensitive) that may precede a date and must be
# followed by whitespace; stripping one marks the date as approximate.
_LEADING_MARKERS = re.compile(
    r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)


def _preprocess(raw: str):
    """Return (cleaned_string, approx_flag). Any uncertainty/qualifier marker -> approx.

    Steps: drop "(?)" and bare "?", drop everything from the first comma on
    (trailing editorial note), collapse whitespace, strip all leading
    qualifier words, then trim surrounding spaces, dots and commas.
    Empty or None input yields ("", False).
    """
    s = (raw or "").strip()
    if not s:
        return "", False
    low = s.lower()
    # Substring check: a "?" or one of these uncertainty words anywhere in the text.
    approx = ("?" in s) or any(
        m in low for m in ("um ", "ca.", "ca ", "circa", "etwa", "wohl", "vermutlich"))
    s = re.sub(r"\(\s*\?\s*\)", " ", s)  # remove "(?)"
    s = s.replace("?", " ")
    s = re.sub(r",.*$", "", s)  # drop trailing editorial note (", 2. Brief")
    # Normalise whitespace BEFORE stripping markers: removing a leading "?" or
    # "(?)" leaves a leading space that would otherwise defeat the "^" anchor
    # (e.g. "(?) um 1900" used to come out as the unparseable "um 1900").
    s = re.sub(r"\s+", " ", s).strip()
    # Strip stacked qualifiers too ("wohl um 1900" -> "1900"); each match
    # consumes its trailing whitespace, so the loop always makes progress.
    while (marker := _LEADING_MARKERS.match(s)):
        approx = True  # a leading qualifier (um/ca/nach/vor/anfang/…) signals approximation
        s = s[marker.end():]
    return s.strip(" .,"), approx
|
||
|
||
|
||
# Numeric day/month/year: day and month (1–2 digits each) separated by "." or
# "/", then an optional "." and optional whitespace, then a 2–4 digit year —
# e.g. "12.3.1900" or "12/3 00". Used with fullmatch.
_NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})")
|
||
|
||
|
||
def _match_iso(s):
    """Match a full ISO date ("YYYY-MM-DD").

    Returns ``(s, Precision.DAY)`` when *s* has the ISO shape and names a real
    calendar date; returns None otherwise (wrong shape or impossible date).
    """
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s) is None:
        return None
    try:
        # Called only for validation: raises ValueError on e.g. "1900-02-30".
        datetime.date.fromisoformat(s)
    except ValueError:
        return None
    return s, Precision.DAY
|
||
|
||
|
||
def _match_numeric(s):
    """Match a numeric day/month/year date as described by ``_NUM_RE``.

    The year part is expanded with ``expand_year``. Returns
    ``(iso, Precision.DAY)``, or None when the shape does not match, the year
    is ambiguous, the month is outside 1..12, or the day does not exist.
    """
    found = _NUM_RE.fullmatch(s)
    if found is None:
        return None

    day_text, month_text, year_text = found.groups()
    day = int(day_text)
    month = int(month_text)
    year = expand_year(year_text)
    if year is None or month < 1 or month > 12:
        return None

    try:
        resolved = datetime.date(year, month, day)
    except ValueError:  # e.g. day 0 or February 30
        return None
    return resolved.isoformat(), Precision.DAY
|
||
|
||
|
||
# Exactly four digits; used with fullmatch, so nothing may surround the year.
_YEAR_ONLY_RE = re.compile(r"\d{4}")


def _match_year_only(s):
    """Match a bare four-digit year.

    Returns ``(iso, Precision.YEAR)`` where iso is January 1st of that year,
    or None when *s* is not exactly four digits or the year is outside the
    range ``datetime.date`` supports (e.g. "0000").
    """
    if not _YEAR_ONLY_RE.fullmatch(s):
        return None
    try:
        # datetime.date rejects year 0 with ValueError; treat it as "no match"
        # like the other matchers do, instead of crashing parse_date.
        iso = datetime.date(int(s), 1, 1).isoformat()
    except ValueError:
        return None
    return iso, Precision.YEAR
|
||
|
||
|
||
# Matchers are tried in order and the first truthy result wins (see parse_date).
# Each one takes the cleaned string and returns (iso, Precision) or None.
# Later tasks append to this list.
_MATCHERS = [_match_iso, _match_numeric, _match_year_only]
|
||
|
||
|
||
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse a raw historical date string into a ``ParsedDate``.

    If *date_overrides* contains the stripped raw string as a key, its
    ``(iso, precision)`` value is used directly (an empty iso becomes None).
    Otherwise the string is preprocessed and handed to each matcher in
    ``_MATCHERS`` in turn; the first truthy result wins. When preprocessing
    flagged the date as approximate, the matched precision is replaced by
    ``Precision.APPROX``. Empty or unmatched input yields iso None with
    ``Precision.UNKNOWN``. The original *raw* value is always preserved.
    """
    if date_overrides:
        lookup = (raw or "").strip()
        if lookup in date_overrides:
            override_iso, override_precision = date_overrides[lookup]
            return ParsedDate(override_iso or None, Precision(override_precision), raw)

    cleaned, approx = _preprocess(raw)
    unknown = ParsedDate(None, Precision.UNKNOWN, raw)
    if not cleaned:
        return unknown

    # Lazy generator: matchers after the first hit are never called.
    attempts = (matcher(cleaned) for matcher in _MATCHERS)
    hit = next((outcome for outcome in attempts if outcome), None)
    if hit is None:
        return unknown

    iso, precision = hit
    return ParsedDate(iso, Precision.APPROX if approx else precision, raw)
|
||
|
||
|
||
def easter(year: int) -> datetime.date:
    """Return Gregorian Easter Sunday for *year*.

    Implements the Anonymous Gregorian (Butcher) algorithm using integer
    arithmetic only.
    """
    cycle_pos = year % 19
    century, year_in_century = divmod(year, 100)
    century_quads, century_rem = divmod(century, 4)
    corr_f = (century + 8) // 25
    corr_g = (century - corr_f + 1) // 3
    h_term = (19 * cycle_pos + century - century_quads - corr_g + 15) % 30
    yic_quads, yic_rem = divmod(year_in_century, 4)
    l_term = (32 + 2 * century_rem + 2 * yic_quads - h_term - yic_rem) % 7
    m_term = (cycle_pos + 11 * h_term + 22 * l_term) // 451
    # The quotient is the month, the remainder the zero-based day of that month.
    month, day_index = divmod(h_term + l_term - 7 * m_term + 114, 31)
    return datetime.date(year, month, day_index + 1)
|