feat(normalizer): parse_date dispatch + iso/numeric matchers

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 13:30:07 +02:00
parent 1908dde859
commit df14e6b1ee
2 changed files with 101 additions and 0 deletions

View File

@@ -1,5 +1,7 @@
"""Tolerant historical date parsing for the family archive.""" """Tolerant historical date parsing for the family archive."""
import datetime import datetime
import re
from dataclasses import dataclass
from enum import StrEnum from enum import StrEnum
import config import config
@@ -58,6 +60,83 @@ def expand_year(token: str):
return None return None
@dataclass(frozen=True)
class ParsedDate:
    """Immutable outcome of parsing one raw date string.

    ``iso`` is the resolved ISO date string, or ``None`` when nothing could
    be parsed; ``precision`` is the ``Precision`` level attached to the
    result; ``raw`` is the input exactly as it was handed to the parser.
    """

    iso: str | None
    precision: Precision
    raw: str
_LEADING_MARKERS = re.compile(
    r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)

# Qualifiers that flag a date as approximate. They are matched as whole words
# so that ordinary words which merely contain one (e.g. "Datum ") do not
# switch the approximation flag on by accident.
_APPROX_MARKERS = re.compile(
    r"\b(?:um|ca|circa|etwa|wohl|vermutlich)\b", re.I)


def _preprocess(raw: str) -> tuple[str, bool]:
    """Return (cleaned_string, approx_flag).

    The cleaned string is *raw* with question marks, a trailing
    comma-introduced editorial note, all leading qualifier words and
    surplus whitespace/punctuation removed. The flag is true when *raw*
    contains a question mark or one of the approximation qualifiers
    (um, ca, circa, etwa, wohl, vermutlich) as a whole word.

    Empty or ``None`` input yields ``("", False)``.
    """
    text = (raw or "").strip()
    if not text:
        return "", False

    # Decide on approximation before any marker is stripped away.
    approx = "?" in text or _APPROX_MARKERS.search(text) is not None

    text = re.sub(r"\(\s*\?\s*\)", " ", text)  # remove "(?)"
    text = text.replace("?", " ")
    text = re.sub(r",.*$", "", text)  # drop trailing editorial note (", 2. Brief")

    # Removing a leading "(?)" leaves whitespace that would stop the anchored
    # marker pattern from matching, so trim it first.
    text = text.lstrip()
    # Qualifiers can be stacked ("wohl um 1900"); peel them off one by one.
    while True:
        text, removed = _LEADING_MARKERS.subn("", text)
        if not removed:
            break

    text = re.sub(r"\s+", " ", text).strip(" .,")
    return text, approx
# Day, month and year of an all-numeric date: "15.2.1888", "17/6. 1916",
# "15. 2. 1888", "17/6/16" or "17/6 16". Day and month are separated by "."
# or "/" (optionally padded with spaces). The year must be set off by a
# separator or whitespace, so run-together digits such as "5.1209" are not
# mistaken for day 5, month 12, year "09".
_NUM_RE = re.compile(
    r"(\d{1,2})\s*[./]\s*(\d{1,2})(?:\s*[./]\s*|\s+)(\d{2,4})")
def _match_iso(s):
    """Recognize a complete ISO calendar date (``YYYY-MM-DD``).

    Returns ``(s, Precision.DAY)`` when *s* has exactly that shape and names
    a real calendar date; returns ``None`` otherwise.
    """
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s) is None:
        return None
    try:
        datetime.date.fromisoformat(s)
    except ValueError:
        # Right shape, but not a date that exists (e.g. month 13, 30 Feb).
        return None
    return s, Precision.DAY
def _match_numeric(s):
    """Recognize an all-numeric day/month/year date such as ``15.2.1888``.

    The year token is resolved through ``expand_year``. Returns
    ``(iso_string, Precision.DAY)`` on success and ``None`` when *s* does not
    match ``_NUM_RE``, the year cannot be resolved, the month is outside
    1..12, or the combination is not a real calendar date.
    """
    parts = _NUM_RE.fullmatch(s)
    if parts is None:
        return None

    day_text, month_text, year_text = parts.groups()
    day = int(day_text)
    month = int(month_text)
    year = expand_year(year_text)
    if year is None or month < 1 or month > 12:
        return None

    try:
        iso = datetime.date(year, month, day).isoformat()
    except ValueError:
        # e.g. day 31 in a 30-day month
        return None
    return iso, Precision.DAY
# Ordered matcher pipeline. Each entry receives the cleaned string and returns
# an (iso, precision) pair or None; parse_date takes the first truthy result.
# Later tasks append to this list.
_MATCHERS = [
    _match_iso,
    _match_numeric,
]
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse a raw historical date string into a ``ParsedDate``.

    Args:
        raw: The date text as written in the source material.
        date_overrides: Optional mapping from the stripped raw text to an
            ``(iso, precision)`` pair. A matching entry is returned as-is,
            bypassing all parsing; an empty ``iso`` becomes ``None``.

    Returns:
        A ``ParsedDate`` carrying the original *raw*. If no matcher in
        ``_MATCHERS`` recognizes the cleaned text, ``iso`` is ``None`` and the
        precision is ``Precision.UNKNOWN``. If the text carried an
        approximation marker, a successful parse is reported as
        ``Precision.APPROX`` regardless of what the matcher returned.
    """
    if date_overrides:
        lookup = (raw or "").strip()
        if lookup in date_overrides:
            forced_iso, forced_precision = date_overrides[lookup]
            return ParsedDate(forced_iso or None, Precision(forced_precision), raw)

    cleaned, approx = _preprocess(raw)

    hit = None
    if cleaned:
        # Lazily run the matchers in order and stop at the first truthy result.
        attempts = (matcher(cleaned) for matcher in _MATCHERS)
        hit = next(filter(None, attempts), None)

    if not hit:
        return ParsedDate(None, Precision.UNKNOWN, raw)

    iso, precision = hit
    return ParsedDate(iso, Precision.APPROX if approx else precision, raw)
def easter(year: int) -> datetime.date: def easter(year: int) -> datetime.date:
"""Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm.""" """Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm."""
a = year % 19 a = year % 19

View File

@@ -36,3 +36,25 @@ def test_expand_year():
assert dates.expand_year("99") == 1899 assert dates.expand_year("99") == 1899
assert dates.expand_year("65") is None # 58..72 ambiguous assert dates.expand_year("65") is None # 58..72 ambiguous
assert dates.expand_year("x") is None assert dates.expand_year("x") is None
def test_parse_iso_and_empty():
    """ISO input parses to day precision; blank or bare "?" input is unknown."""
    iso_text = "1910-04-23"
    assert dates.parse_date(iso_text) == dates.ParsedDate(iso_text, Precision.DAY, iso_text)

    for blank in ("", "?"):
        assert dates.parse_date(blank) == dates.ParsedDate(None, Precision.UNKNOWN, blank)
def test_parse_numeric_forms():
    """Numeric day/month/year spellings resolve to the expected ISO date."""
    expected_iso = {
        "15.2.1888": "1888-02-15",
        "13.5.09": "1909-05-13",
        "17/6. 1916": "1916-06-17",
        "11.10.08": "1908-10-11",
        "30.1.889": "1889-01-30",
    }
    for raw, iso in expected_iso.items():
        assert dates.parse_date(raw).iso == iso

    assert dates.parse_date("15.2.1888").precision == Precision.DAY
def test_parse_numeric_unparseable():
    """Numeric input that cannot be resolved is reported as UNKNOWN."""
    unresolvable = (
        "8.9.",     # no year
        "13.5.65",  # ambiguous 2-digit year
    )
    for raw in unresolvable:
        assert dates.parse_date(raw).precision == Precision.UNKNOWN
def test_parse_approx_marker_upgrades_precision():
    """An approximation marker turns an otherwise exact parse into APPROX."""
    # Exercise the marker path with numeric dates, which the matchers in this
    # commit already understand, so the precision upgrade is actually checked.
    for raw in ("ca. 15.2.1888", "um 15.2.1888", "15.2.1888 (?)"):
        parsed = dates.parse_date(raw)
        assert parsed.iso == "1888-02-15"
        assert parsed.precision == Precision.APPROX
        assert parsed.raw == raw

    # Month names are handled in a later task (verified fully in Task 8); for
    # now only check that the raw text is carried through untouched.
    r = dates.parse_date("17.Nov (?) 1887")
    assert r.raw == "17.Nov (?) 1887"