feat(normalizer): parse Spanish month names + Month DD-YYYY hyphen form
All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m31s
CI / OCR Service Tests (pull_request) Successful in 22s
CI / Backend Unit Tests (pull_request) Successful in 3m42s
CI / fail2ban Regex (pull_request) Successful in 45s
CI / Semgrep Security Scan (pull_request) Successful in 20s
CI / Compose Bucket Idempotency (pull_request) Successful in 1m2s

Add Spanish month names (Mexican-branch letters) to config.MONTHS and let
the month-first matcher accept a hyphen (not just a dot) before the year, so
"Mayo 18-1929"/"Junio 7-904" parse without manual overrides. Also bound
4-digit years to 1700-2100 so gross typos ("23-9003") stay in review instead
of producing a bogus year. Cuts unknown-date rate 9.2% -> 7.9%.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 17:00:33 +02:00
parent 0f1f9055c3
commit 5efe3b8a7c
3 changed files with 28 additions and 3 deletions

View File

@@ -85,6 +85,10 @@ MONTHS = {
"oktober": 10, "okt": 10, "oct": 10, "october": 10, "oktober": 10, "okt": 10, "oct": 10, "october": 10,
"november": 11, "nov": 11, "november": 11, "nov": 11,
"dezember": 12, "dez": 12, "dec": 12, "december": 12, "dezember": 12, "dez": 12, "dec": 12, "december": 12,
# Spanish (Mexican-branch correspondence)
"enero": 1, "febrero": 2, "marzo": 3, "abril": 4, "mayo": 5, "junio": 6,
"julio": 7, "agosto": 8, "septiembre": 9, "setiembre": 9, "octubre": 10,
"noviembre": 11, "diciembre": 12,
} }
ROMAN_MONTHS = { ROMAN_MONTHS = {

View File

@@ -48,7 +48,8 @@ def expand_year(token: str):
return None return None
n, v = len(token), int(token) n, v = len(token), int(token)
if n == 4: if n == 4:
return v # reject gross typos (e.g. "9003") so they go to review instead of a bogus year
return v if 1700 <= v <= 2100 else None
if n == 3: if n == 3:
return 1000 + v return 1000 + v
if n == 2: if n == 2:
@@ -157,8 +158,9 @@ def _match_monthname_a(s):
return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3))) return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3)))
# dot after day is REQUIRED so this can't match "Mai 1895" (MONTH YYYY) as day=18 # A separator (dot OR hyphen/en-dash) after the day is REQUIRED so this can't match
_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\.\s*(\d{2,4})") # "Mai 1895" (MONTH YYYY) as day=18; the hyphen form also covers Spanish "Mayo 18-1929".
_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\s*[.\-]\s*(\d{2,4})")
def _match_monthname_b(s): def _match_monthname_b(s):

View File

@@ -35,6 +35,7 @@ def test_expand_year():
assert dates.expand_year("73") == 1873 # 73..99 -> 18xx assert dates.expand_year("73") == 1873 # 73..99 -> 18xx
assert dates.expand_year("99") == 1899 assert dates.expand_year("99") == 1899
assert dates.expand_year("65") is None # 58..72 ambiguous assert dates.expand_year("65") is None # 58..72 ambiguous
assert dates.expand_year("9003") is None # implausible 4-digit year -> reject (typo)
assert dates.expand_year("x") is None assert dates.expand_year("x") is None
def test_parse_iso_and_empty(): def test_parse_iso_and_empty():
@@ -127,3 +128,21 @@ def test_parse_date_override_wins():
ovr = {"13.5.65": ("1965-05-13", "DAY")} ovr = {"13.5.65": ("1965-05-13", "DAY")}
r = dates.parse_date("13.5.65", ovr) # ambiguous without override r = dates.parse_date("13.5.65", ovr) # ambiguous without override
assert r == dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65") assert r == dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65")
def test_parse_spanish_months():
    """Spanish month names parse in both day-first and month-first spellings.

    Covers the Mexican-branch letters: a dot or a hyphen may precede the year,
    and the year may be written with 2, 3 or 4 digits.
    """
    iso_by_input = [
        ("21.Enero 1911", "1911-01-21"),   # day-first
        ("Junio 17.929", "1929-06-17"),    # month-first, dot, 3-digit year
        ("Mayo 18-1929", "1929-05-18"),    # month-first, hyphen
        ("Abril 10-929", "1929-04-10"),    # hyphen, 3-digit year
        ("Agosto 27-929", "1929-08-27"),
        ("febrero 14-29", "1929-02-14"),   # hyphen, 2-digit year
    ]
    for raw, expected_iso in iso_by_input:
        assert dates.parse_date(raw).iso == expected_iso

    # A fully specified month-first hyphen date must resolve to DAY precision.
    assert dates.parse_date("Mayo 18-1929").precision == Precision.DAY
def test_implausible_year_goes_to_review():
    """A source typo such as "October 23-9003" stays UNKNOWN instead of yielding year 9003."""
    parsed = dates.parse_date("October 23-9003")
    assert parsed.precision == Precision.UNKNOWN
def test_hyphen_month_first_does_not_shadow_month_year():
    """Accepting a hyphen separator must not turn "Mai 1895" into a day=18 date."""
    raw = "Mai 1895"
    parsed = dates.parse_date(raw)
    expected = dates.ParsedDate("1895-05-01", Precision.MONTH, raw)
    assert parsed == expected