feat(normalizer): parse_date dispatch + iso/numeric matchers
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,7 @@
|
|||||||
"""Tolerant historical date parsing for the family archive."""
|
"""Tolerant historical date parsing for the family archive."""
|
||||||
import datetime
|
import datetime
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
from enum import StrEnum
|
from enum import StrEnum
|
||||||
import config
|
import config
|
||||||
|
|
||||||
@@ -58,6 +60,83 @@ def expand_year(token: str):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ParsedDate:
    """Immutable outcome of parsing one raw date string.

    ``iso`` is the ISO-formatted date string, or None when nothing could be
    parsed; ``precision`` classifies how exact that date is; ``raw`` is the
    input exactly as it was handed to the parser.
    """

    iso: str | None
    precision: Precision
    raw: str
|
||||||
|
|
||||||
|
|
||||||
|
_LEADING_MARKERS = re.compile(
    r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)

# Words that mark a date as approximate. The look-arounds keep them from firing
# inside longer words ("Rebecca ", "Datum ", "etwas", "obwohl"), which a plain
# substring search would wrongly treat as uncertainty markers.
_APPROX_MARKERS = re.compile(
    r"(?<![^\W\d_])(?:um\s|ca[.\s]|(?:circa|etwa|wohl)(?![^\W\d_])|vermutlich)",
    re.I,
)


def _preprocess(raw: str) -> tuple[str, bool]:
    """Return (cleaned_string, approx_flag).

    The flag is True when the input contains a question mark or one of the
    uncertainty words (um, ca., circa, etwa, wohl, vermutlich) as a word of
    its own. The cleaned string has "(?)" and "?" removed, everything from the
    first comma onwards dropped, one leading marker word stripped, whitespace
    collapsed and surrounding spaces, dots and commas trimmed.

    Empty or None input yields ("", False).
    """
    s = (raw or "").strip()
    if not s:
        return "", False
    # Decide on the full input, before any marker text is stripped away.
    approx = "?" in s or _APPROX_MARKERS.search(s) is not None
    s = re.sub(r"\(\s*\?\s*\)", " ", s)  # remove "(?)"
    s = s.replace("?", " ")
    # Drop a trailing editorial note (", 2. Brief"); partition also copes with
    # notes that contain a line break.
    s = s.partition(",")[0]
    s = _LEADING_MARKERS.sub("", s)
    s = re.sub(r"\s+", " ", s).strip(" .,")
    return s, approx
|
||||||
|
|
||||||
|
|
||||||
|
# Day/month/year written numerically, e.g. "15.2.1888", "13.5.09", "17/6. 1916".
# A separator ('.', '/' or whitespace) is required between month and year so a
# bare digit run such as "8.911" is not split into month 9 and year 11.
_NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})(?:[./]\s*|\s+)(\d{2,4})")
|
||||||
|
|
||||||
|
|
||||||
|
def _match_iso(s):
    """Recognise a complete ``YYYY-MM-DD`` string.

    Returns ``(s, Precision.DAY)`` when the text has that exact shape and
    names a real calendar date, otherwise None.
    """
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s) is None:
        return None
    try:
        # Parsed only for validation (rejects e.g. February 30th).
        datetime.date.fromisoformat(s)
    except ValueError:
        return None
    return s, Precision.DAY
|
||||||
|
|
||||||
|
|
||||||
|
def _match_numeric(s):
    """Recognise a numeric day/month/year date such as "15.2.1888".

    The whole string must match ``_NUM_RE``; the year token is resolved
    through ``expand_year``. Returns ``(iso_string, Precision.DAY)`` on
    success and None when the pattern does not match, the year cannot be
    resolved, the month is outside 1..12, or the day does not exist.
    """
    found = _NUM_RE.fullmatch(s)
    if found is None:
        return None
    day_text, month_text, year_text = found.group(1, 2, 3)
    day = int(day_text)
    month = int(month_text)
    year = expand_year(year_text)
    if year is None or month < 1 or month > 12:
        return None
    try:
        # date() rejects impossible days (e.g. 31 April) with ValueError.
        parsed = datetime.date(year, month, day)
    except ValueError:
        return None
    return parsed.isoformat(), Precision.DAY
|
||||||
|
|
||||||
|
|
||||||
|
# Matchers are tried in order. Later tasks append to this list.
|
||||||
|
# Each matcher receives the preprocessed string and returns an
# (iso, precision) pair on success, or None when it does not apply.
_MATCHERS = [_match_iso, _match_numeric]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse a raw date string into a ``ParsedDate``.

    Args:
        raw: The date text as found in the source.
        date_overrides: Optional mapping from the stripped raw text to an
            ``(iso, precision)`` pair. A hit bypasses parsing entirely; an
            empty ``iso`` is stored as None and ``precision`` is converted
            with ``Precision(...)``.

    Returns:
        The override if one applies; otherwise the result of the first
        matcher in ``_MATCHERS`` that accepts the preprocessed text, with the
        precision replaced by ``Precision.APPROX`` when preprocessing saw an
        uncertainty marker. When nothing matches, or the text is empty after
        preprocessing, ``iso`` is None and precision is ``Precision.UNKNOWN``.
        ``raw`` is always carried through unchanged.
    """
    if date_overrides:
        lookup = (raw or "").strip()
        if lookup in date_overrides:
            forced_iso, forced_precision = date_overrides[lookup]
            return ParsedDate(forced_iso or None, Precision(forced_precision), raw)

    text, is_approximate = _preprocess(raw)
    if text:
        for match in _MATCHERS:
            hit = match(text)
            if not hit:
                continue
            iso, precision = hit
            # An uncertainty marker overrides whatever the matcher reported.
            final_precision = Precision.APPROX if is_approximate else precision
            return ParsedDate(iso, final_precision, raw)

    return ParsedDate(None, Precision.UNKNOWN, raw)
|
||||||
|
|
||||||
|
|
||||||
def easter(year: int) -> datetime.date:
|
def easter(year: int) -> datetime.date:
|
||||||
"""Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm."""
|
"""Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm."""
|
||||||
a = year % 19
|
a = year % 19
|
||||||
|
|||||||
@@ -36,3 +36,25 @@ def test_expand_year():
|
|||||||
assert dates.expand_year("99") == 1899
|
assert dates.expand_year("99") == 1899
|
||||||
assert dates.expand_year("65") is None # 58..72 ambiguous
|
assert dates.expand_year("65") is None # 58..72 ambiguous
|
||||||
assert dates.expand_year("x") is None
|
assert dates.expand_year("x") is None
|
||||||
|
|
||||||
|
def test_parse_iso_and_empty():
    """ISO input parses to day precision; empty and "?" input stay unknown."""
    expectations = {
        "1910-04-23": dates.ParsedDate("1910-04-23", Precision.DAY, "1910-04-23"),
        "": dates.ParsedDate(None, Precision.UNKNOWN, ""),
        "?": dates.ParsedDate(None, Precision.UNKNOWN, "?"),
    }
    for raw, expected in expectations.items():
        assert dates.parse_date(raw) == expected
|
||||||
|
|
||||||
|
def test_parse_numeric_forms():
    """Numeric day/month/year spellings resolve to the expected ISO date."""
    cases = [
        ("15.2.1888", "1888-02-15"),
        ("13.5.09", "1909-05-13"),
        ("17/6. 1916", "1916-06-17"),
        ("11.10.08", "1908-10-11"),
        ("30.1.889", "1889-01-30"),
    ]
    for raw, expected_iso in cases:
        assert dates.parse_date(raw).iso == expected_iso
    assert dates.parse_date("15.2.1888").precision == Precision.DAY
|
||||||
|
|
||||||
|
def test_parse_numeric_unparseable():
    """Numeric input without a usable year is reported as unknown."""
    missing_year = dates.parse_date("8.9.")  # no year
    ambiguous_year = dates.parse_date("13.5.65")  # ambiguous 2-digit year
    assert missing_year.precision == Precision.UNKNOWN
    assert ambiguous_year.precision == Precision.UNKNOWN
|
||||||
|
|
||||||
|
def test_parse_approx_marker_upgrades_precision():
    """An uncertainty marker turns an otherwise parsed date into APPROX."""
    r = dates.parse_date("17.Nov (?) 1887")  # month-name handled in a later task; here just the marker path
    # after the marker is detected, a parsed date becomes APPROX (verified fully in Task 8)
    assert r.raw == "17.Nov (?) 1887"

    # Numeric dates are already parseable, so the upgrade itself can be
    # checked here: both the "(?)" and the leading "ca." marker must apply.
    questioned = dates.parse_date("15.2.1888 (?)")
    assert questioned.iso == "1888-02-15"
    assert questioned.precision == Precision.APPROX
    assert questioned.raw == "15.2.1888 (?)"

    circa = dates.parse_date("ca. 15.2.1888")
    assert circa.iso == "1888-02-15"
    assert circa.precision == Precision.APPROX
|
||||||
|
|||||||
Reference in New Issue
Block a user