feat(normalizer): parse_date dispatch + iso/numeric matchers

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 13:30:07 +02:00
parent 1908dde859
commit df14e6b1ee
2 changed files with 101 additions and 0 deletions

View File

@@ -1,5 +1,7 @@
"""Tolerant historical date parsing for the family archive."""
import datetime
import re
from dataclasses import dataclass
from enum import StrEnum
import config
@@ -58,6 +60,83 @@ def expand_year(token: str):
return None
@dataclass(frozen=True)
class ParsedDate:
    """Immutable outcome of parsing one raw date string.

    ``iso`` is the parsed date as an ISO string, or ``None`` when nothing
    could be parsed.  ``precision`` is the ``Precision`` member describing
    how exact that date is.  ``raw`` is the caller's input, stored unchanged.
    """

    iso: str | None
    precision: Precision
    raw: str
# Qualifier words that may stand in front of a date ("um 1900", "Ende 1912").
# Each one must be followed by whitespace to be recognised.
_LEADING_MARKERS = re.compile(
    r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)

# Uncertainty words that flag a date as approximate.  They are matched as
# whole words (no letter directly before or after) so that "Datum", "sowohl"
# or "etwas" inside an editorial note do not count as markers.
_APPROX_WORDS = re.compile(
    r"(?<![^\W\d_])(?:um(?=\s)|ca(?=[.\s])|circa|etwa|wohl|vermutlich)(?![^\W\d_])",
    re.I)


def _preprocess(raw: str) -> tuple[str, bool]:
    """Return (cleaned_string, approx_flag).

    ``cleaned_string`` is *raw* with question marks, everything from the first
    comma onwards, and any leading qualifier words removed, whitespace
    collapsed, and surrounding spaces, dots and commas stripped.
    ``approx_flag`` is True when *raw* contains a "?" or one of the
    uncertainty words in ``_APPROX_WORDS``.  Empty or ``None`` input yields
    ``("", False)``.
    """
    text = (raw or "").strip()
    if not text:
        return "", False

    # Decide on uncertainty before any marker text is thrown away.
    approx = "?" in text or _APPROX_WORDS.search(text) is not None

    text = re.sub(r"\(\s*\?\s*\)", " ", text)  # remove "(?)"
    text = text.replace("?", " ")
    # Drop trailing editorial note (", 2. Brief"); partition also copes with
    # notes that continue over a line break.
    text = text.partition(",")[0]

    # Normalise first: the marker pattern is anchored at the start, so a space
    # left behind by a removed "?" must not hide the qualifier.
    text = re.sub(r"\s+", " ", text).strip(" .,")

    # Qualifiers can be stacked ("wohl um 1900"); peel them off one by one.
    while (shorter := _LEADING_MARKERS.sub("", text)) != text:
        text = shorter

    return text.strip(" .,"), approx
# Day / month / year written with digits, e.g. "15.2.1888", "17/6. 1916",
# "17. 6. 1916" or "17/6/1916".  Day and month are separated by "." or "/"
# (optionally followed by spaces); the year may follow a "." or "/", spaces,
# or nothing.  Groups: (day, month, year exactly as written, 2-4 digits).
_NUM_RE = re.compile(r"(\d{1,2})[./]\s*(\d{1,2})[./]?\s*(\d{2,4})")
def _match_iso(s):
    """Recognise a strict ``YYYY-MM-DD`` string.

    Returns ``(s, Precision.DAY)`` when *s* has exactly that shape and names a
    real calendar date; returns ``None`` otherwise.
    """
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s) is None:
        return None
    try:
        # Shape is right; this rejects impossible dates such as 1910-02-30.
        datetime.date.fromisoformat(s)
    except ValueError:
        return None
    return s, Precision.DAY
def _match_numeric(s):
    """Recognise a numeric day/month/year string such as ``15.2.1888``.

    The whole of *s* must match ``_NUM_RE``.  The year token is resolved with
    ``expand_year``; if that returns ``None``, or the month is outside 1..12,
    or the day does not exist in that month, the result is ``None``.
    Otherwise returns ``(iso_date, Precision.DAY)``.
    """
    hit = _NUM_RE.fullmatch(s)
    if hit is None:
        return None

    day_txt, month_txt, year_txt = hit.group(1, 2, 3)
    day = int(day_txt)
    month = int(month_txt)
    year = expand_year(year_txt)

    if year is None:
        return None
    if month < 1 or month > 12:
        return None

    try:
        # datetime.date validates the day against the month and year.
        parsed = datetime.date(year, month, day)
    except ValueError:
        return None
    return parsed.isoformat(), Precision.DAY
# Matchers are tried in order; parse_date uses the first truthy result.
# Each matcher receives the cleaned string and returns (iso, precision) or None.
# Later tasks append to this list.
_MATCHERS = [_match_iso, _match_numeric]
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse one raw date string into a ``ParsedDate``.

    ``date_overrides`` maps a stripped raw string to an ``(iso, precision)``
    pair and is consulted first; a hit bypasses all parsing.  Otherwise the
    input is cleaned by ``_preprocess`` and offered to each entry of
    ``_MATCHERS`` in order, and the first truthy result is used.  When the
    cleaning step reported an uncertainty marker, the matcher's precision is
    replaced by ``Precision.APPROX``.  Empty or unrecognised input yields
    ``ParsedDate(None, Precision.UNKNOWN, raw)``.  ``raw`` is always stored
    unchanged in the result.
    """
    if date_overrides and (lookup := (raw or "").strip()) in date_overrides:
        forced_iso, forced_precision = date_overrides[lookup]
        # A falsy override date (e.g. "") is stored as None.
        return ParsedDate(forced_iso or None, Precision(forced_precision), raw)

    cleaned, approx = _preprocess(raw)
    if cleaned:
        for matcher in _MATCHERS:
            outcome = matcher(cleaned)
            if not outcome:
                continue
            iso, precision = outcome
            return ParsedDate(iso, Precision.APPROX if approx else precision, raw)

    return ParsedDate(None, Precision.UNKNOWN, raw)
def easter(year: int) -> datetime.date:
"""Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm."""
a = year % 19

View File

@@ -36,3 +36,25 @@ def test_expand_year():
assert dates.expand_year("99") == 1899
assert dates.expand_year("65") is None # 58..72 ambiguous
assert dates.expand_year("x") is None
def test_parse_iso_and_empty():
    """ISO input parses at DAY precision; blank or "?"-only input is UNKNOWN."""
    iso_text = "1910-04-23"
    assert dates.parse_date(iso_text) == dates.ParsedDate(iso_text, Precision.DAY, iso_text)

    for blank in ("", "?"):
        assert dates.parse_date(blank) == dates.ParsedDate(None, Precision.UNKNOWN, blank)
def test_parse_numeric_forms():
    """Numeric day/month/year spellings resolve to the expected ISO date."""
    expected_iso = {
        "15.2.1888": "1888-02-15",
        "13.5.09": "1909-05-13",
        "17/6. 1916": "1916-06-17",
        "11.10.08": "1908-10-11",
        "30.1.889": "1889-01-30",
    }
    for text, iso in expected_iso.items():
        assert dates.parse_date(text).iso == iso

    assert dates.parse_date("15.2.1888").precision == Precision.DAY
def test_parse_numeric_unparseable():
    """Numeric-looking input that cannot be resolved stays UNKNOWN."""
    rejected = (
        "8.9.",     # no year
        "13.5.65",  # ambiguous 2-digit year
    )
    for text in rejected:
        assert dates.parse_date(text).precision == Precision.UNKNOWN
def test_parse_approx_marker_upgrades_precision():
    """An uncertainty marker turns an otherwise exact date into APPROX."""
    # Exercise the marker path with forms the numeric matcher already handles,
    # so the precision upgrade is actually asserted.
    for text in ("15.2.1888 (?)", "um 15.2.1888", "ca. 15.2.1888"):
        parsed = dates.parse_date(text)
        assert parsed.iso == "1888-02-15"
        assert parsed.precision == Precision.APPROX
        assert parsed.raw == text

    # Month names are handled in a later task; until then only check that the
    # raw input is carried through unchanged.
    r = dates.parse_date("17.Nov (?) 1887")
    assert r.raw == "17.Nov (?) 1887"