feat(normalizer): parse_date dispatch + iso/numeric matchers
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
"""Tolerant historical date parsing for the family archive."""
|
||||
import datetime
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import StrEnum
|
||||
import config
|
||||
|
||||
@@ -58,6 +60,83 @@ def expand_year(token: str):
|
||||
return None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ParsedDate:
    """Immutable result of parsing one raw date string."""

    # ISO-formatted date string, or None when no date could be determined.
    iso: str | None
    # Precision attached to ``iso`` (UNKNOWN accompanies a missing date).
    precision: Precision
    # The input exactly as handed to the parser, kept for traceability.
    raw: str
|
||||
|
||||
|
||||
# Qualifier words that may precede a date ("um 1900", "Ende Mai 1900").
# They are stripped from the front of the string before matching.
_LEADING_MARKERS = re.compile(
    r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)

# Words that mark a date as approximate. Anchored on word boundaries so that
# ordinary words containing the same letters ("Datum ", "Rebecca ",
# "wohlbehalten") do not flag the date as approximate.
_APPROX_MARKERS = re.compile(
    r"\b(?:um\s|ca[.\s]|(?:circa|etwa|wohl|vermutlich)\b)", re.I)


def _preprocess(raw: str):
    """Return (cleaned_string, approx_flag).

    ``cleaned_string`` is *raw* with question marks, a trailing editorial
    note (everything from the first comma), leading qualifier words and
    surplus whitespace/punctuation removed.  ``approx_flag`` is True when
    *raw* contains a "?" or one of the approximation words as a whole word.
    Empty or ``None`` input yields ``("", False)``.
    """
    s = (raw or "").strip()
    if not s:
        return "", False
    approx = "?" in s or _APPROX_MARKERS.search(s) is not None
    s = re.sub(r"\(\s*\?\s*\)", " ", s)  # remove "(?)"
    s = s.replace("?", " ")
    s = re.sub(r",.*$", "", s)  # drop trailing editorial note (", 2. Brief")
    # Removing "?" can leave leading blanks; the marker regex is anchored at
    # the start of the string, so strip them first.
    s = s.lstrip()
    # Qualifiers may be stacked ("wohl um 1900"): strip until none is left.
    while True:
        stripped = _LEADING_MARKERS.sub("", s)
        if stripped == s:
            break
        s = stripped
    s = re.sub(r"\s+", " ", s).strip(" .,")
    return s, approx
|
||||
|
||||
|
||||
# Numeric day/month/year: 1-2 digit day, "." or "/", 1-2 digit month, an
# optional ".", optional whitespace, then a 2-4 digit year token
# (e.g. "12.3.1900", "12.3. 1900", "12/3 08").
_NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})")
|
||||
|
||||
|
||||
def _match_iso(s):
    """Recognise a strict ``YYYY-MM-DD`` string.

    Returns ``(s, Precision.DAY)`` when *s* has exactly that shape and names
    a real calendar date; returns ``None`` otherwise.
    """
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s) is None:
        return None
    try:
        # Shape is right; let the stdlib reject impossible dates (e.g. Feb 30).
        datetime.date.fromisoformat(s)
    except ValueError:
        return None
    return s, Precision.DAY
|
||||
|
||||
|
||||
def _match_numeric(s):
    """Recognise a numeric day/month/year string as described by ``_NUM_RE``.

    The year token is handed to ``expand_year``; a ``None`` result from it,
    a month outside 1..12, or an impossible calendar date all yield ``None``.
    On success returns ``(iso_string, Precision.DAY)``.
    """
    hit = _NUM_RE.fullmatch(s)
    if hit is None:
        return None

    day_text, month_text, year_text = hit.groups()
    day = int(day_text)
    month = int(month_text)
    year = expand_year(year_text)

    if year is None:
        return None
    if month < 1 or month > 12:
        return None

    try:
        # datetime.date validates the day against the month/year.
        parsed = datetime.date(year, month, day)
    except ValueError:
        return None
    return parsed.isoformat(), Precision.DAY
|
||||
|
||||
|
||||
# Ordered matcher pipeline: parse_date calls each matcher with the cleaned
# string and uses the first truthy result. Later tasks append to this list.
_MATCHERS = [_match_iso, _match_numeric]
|
||||
|
||||
|
||||
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse a raw archive date string into a ``ParsedDate``.

    Args:
        raw: The date text as found in the source material.
        date_overrides: Optional mapping from the stripped raw string to an
            ``(iso, precision)`` pair. A matching entry bypasses parsing
            entirely.

    Returns:
        A ``ParsedDate`` carrying the original *raw*. When nothing matches,
        ``iso`` is ``None`` and the precision is ``Precision.UNKNOWN``. When
        preprocessing flags the input as approximate, a successful match is
        reported with ``Precision.APPROX`` instead of the matcher's own
        precision.
    """
    # Manual overrides win over any parsing.
    if date_overrides and (lookup := (raw or "").strip()) in date_overrides:
        override_iso, override_precision = date_overrides[lookup]
        return ParsedDate(override_iso or None, Precision(override_precision), raw)

    text, uncertain = _preprocess(raw)
    if text:
        for attempt in _MATCHERS:
            hit = attempt(text)
            if not hit:
                continue
            iso_value, granularity = hit
            return ParsedDate(
                iso_value,
                Precision.APPROX if uncertain else granularity,
                raw,
            )

    return ParsedDate(None, Precision.UNKNOWN, raw)
|
||||
|
||||
|
||||
def easter(year: int) -> datetime.date:
|
||||
"""Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm."""
|
||||
a = year % 19
|
||||
|
||||
Reference in New Issue
Block a user