feat(normalizer): month/year, feast/season, range matchers + overrides
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -157,8 +157,85 @@ def _match_monthname_a(s):
|
||||
return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3)))
|
||||
|
||||
|
||||
# Matchers are tried in order. Later tasks append to this list.
|
||||
# Initial matcher list. NOTE(review): `_MATCHERS` is assigned again further down in this
# file with a longer list, so this assignment looks superseded — confirm before relying on it.
_MATCHERS = [_match_iso, _match_numeric, _match_roman, _match_monthname_a]
|
||||
# Month-name-first date: letters (ASCII plus German umlauts), optional dot, 1-2 digit day,
# optional dot, 2-4 digit year, with optional whitespace between the parts,
# e.g. "Sept. 7. 1923".
# NOTE(review): because every separator is optional, a bare "Sept 1923" also fullmatches
# (day "19", year "23"); presumably matcher ordering guards against that — confirm.
_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\.?\s*(\d{2,4})")
|
||||
|
||||
|
||||
def _match_monthname_b(s):
    """Parse a month-name-first date such as ``"Sept. 7. 1923"``.

    The whole string must match ``_MONTH_B_RE``. The captured month token is
    resolved with ``_lookup_month`` and the year token with ``expand_year``;
    the pieces are then handed to ``_build_day_month_year``.

    Returns:
        ``None`` when *s* does not match the pattern, otherwise whatever
        ``_build_day_month_year`` returns for the extracted day/month/year.
    """
    hit = _MONTH_B_RE.fullmatch(s)
    if hit is None:
        return None
    # Pattern order is month, day, year; the builder expects day, month, year.
    month_token, day_token, year_token = hit.groups()
    return _build_day_month_year(
        int(day_token),
        _lookup_month(month_token),
        expand_year(year_token),
    )
|
||||
|
||||
|
||||
# Month name + year with no day, e.g. "Sept 1923", "Sept. 1923" or "Sept.1923".
# The separator is either a dot (optionally followed by whitespace) or whitespace alone.
# Accepting the dot-only form keeps "Sept.1923" from falling through to _MONTH_B_RE,
# which would split "1923" into day "19" + year "23".
_MONTH_YEAR_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)(?:\.\s*|\s+)(\d{2,4})")

# Arbitrary token (matched lazily), optional dot, whitespace, then a 2-4 digit year.
_TOKEN_YEAR_RE = re.compile(r"(.+?)\.?\s+(\d{2,4})")

# Exactly four digits.
_YEAR_ONLY_RE = re.compile(r"\d{4}")

# "YYYY/YY" year span, e.g. "1914/15"; only the four-digit start year is captured.
_RANGE_YY_RE = re.compile(r"(\d{4})\s*/\s*\d{2}")

# Hyphen or en-dash range: group 1 is everything up to the last digit before a dash
# that is itself followed by a digit; the text after the dash is not captured.
_RANGE_HYPHEN_RE = re.compile(r"(.*\d)\s*[-–]\s*\d.*")

# Intra-month day range, e.g. "7./8. Sept.1923". A dot is required before the slash so
# the pattern does NOT swallow slash-as-dot single dates like "17/6. 1916" (which has
# no dot before "/"). Groups: first day, second day, remainder after the second dot.
_RANGE_DAY_RE = re.compile(r"(\d{1,2})\./(\d{1,2})\.\s*(.+)")
|
||||
|
||||
|
||||
def _match_month_year(s):
    """Parse a month-plus-year string such as ``"Sept. 1923"`` (no day given).

    The whole string must match ``_MONTH_YEAR_RE``. The month token goes
    through ``_lookup_month`` and the year token through ``expand_year``.

    Returns:
        ``(iso_date, Precision.MONTH)`` with the date pinned to the first of
        the month, or ``None`` when the string does not match, the month
        lookup yields a falsy value, or ``expand_year`` yields ``None``.
    """
    hit = _MONTH_YEAR_RE.fullmatch(s)
    if hit is None:
        return None
    month_token, year_token = hit.groups()
    month = _lookup_month(month_token)
    year = expand_year(year_token)
    if month and year is not None:
        # No day in the input: represent the month by its first day.
        first_of_month = datetime.date(year, month, 1)
        return first_of_month.isoformat(), Precision.MONTH
    return None
|
||||
|
||||
|
||||
def _match_feast_season(s):
    """Parse a ``"<token> <year>"`` string by delegating to ``resolve_feast_or_season``.

    The whole string must match ``_TOKEN_YEAR_RE``: an arbitrary token, an
    optional dot, whitespace, then a 2-4 digit year. The year text is passed
    through ``expand_year`` first.

    Returns:
        ``None`` when the string does not match or ``expand_year`` yields
        ``None``; otherwise the result of ``resolve_feast_or_season(token, year)``.
    """
    hit = _TOKEN_YEAR_RE.fullmatch(s)
    if hit is None:
        return None
    token, year_text = hit.groups()
    year = expand_year(year_text)
    return None if year is None else resolve_feast_or_season(token, year)
|
||||
|
||||
|
||||
def _match_year_only(s):
    """Parse a bare four-digit year such as ``"1923"``.

    Returns:
        ``(iso_date, Precision.YEAR)`` with the date pinned to 1 January of
        that year, or ``None`` when *s* is not exactly four digits or the
        year cannot be represented by ``datetime.date``.
    """
    if not _YEAR_ONLY_RE.fullmatch(s):
        return None
    try:
        first_of_year = datetime.date(int(s), 1, 1)
    except ValueError:
        # "0000" passes the regex but year 0 is below datetime.MINYEAR;
        # report "no match" like every other failed parse instead of raising.
        return None
    return first_of_year.isoformat(), Precision.YEAR
|
||||
|
||||
|
||||
def _match_range(s):
    """Parse a date range and return its start with ``Precision.RANGE``.

    Three shapes are tried in order:

    1. ``"YYYY/YY"`` (``_RANGE_YY_RE``): the start is 1 January of the
       four-digit year.
    2. Intra-month day range such as ``"7./8. Sept.1923"``
       (``_RANGE_DAY_RE``): the first day is glued to the remainder
       (``"7.Sept.1923"``) and offered to ``_match_numeric`` and
       ``_match_monthname_a``.
    3. Hyphen / en-dash range (``_RANGE_HYPHEN_RE``): the text before the
       dash is offered to ``_match_numeric``, ``_match_roman``,
       ``_match_monthname_a`` and ``_match_year_only``.

    A shape whose sub-matchers all decline falls through to the next shape.

    Returns:
        ``(iso_start_date, Precision.RANGE)``, or ``None`` when no shape
        yields a start date.
    """
    m = _RANGE_YY_RE.fullmatch(s)
    if m:
        try:
            start_date = datetime.date(int(m.group(1)), 1, 1)
        except ValueError:
            # "0000/01" passes the regex but year 0 is below datetime.MINYEAR;
            # report "no match" instead of raising out of the matcher.
            return None
        return start_date.isoformat(), Precision.RANGE

    m = _RANGE_DAY_RE.fullmatch(s)
    if m:
        first = f"{m.group(1)}.{m.group(3)}"  # "7." + "Sept.1923" -> "7.Sept.1923"
        for matcher in (_match_numeric, _match_monthname_a):
            r = matcher(first)
            if r:
                # Keep the sub-matcher's date but report range precision.
                return r[0], Precision.RANGE

    m = _RANGE_HYPHEN_RE.fullmatch(s)
    if m:
        start = m.group(1).strip()
        for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only):
            r = matcher(start)
            if r:
                return r[0], Precision.RANGE

    return None
|
||||
|
||||
|
||||
# Matchers are tried in list order.
_MATCHERS = [
    _match_iso,
    # Ranges come before the single-date matchers (presumably so a range is not
    # claimed as one plain date — confirm).
    _match_range,
    _match_numeric,
    _match_roman,
    _match_monthname_a,
    # Month/year precedes month-name-first: _MONTH_B_RE also fullmatches "Sept 1923"
    # (as day "19", year "23"), so the day-less form has to be tried first.
    _match_month_year,
    _match_monthname_b,
    _match_feast_season,
    _match_year_only,
]
|
||||
|
||||
|
||||
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
|
||||
|
||||
Reference in New Issue
Block a user