feat(normalizer): parse Spanish month names + Month DD-YYYY hyphen form
All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m31s
CI / OCR Service Tests (pull_request) Successful in 22s
CI / Backend Unit Tests (pull_request) Successful in 3m42s
CI / fail2ban Regex (pull_request) Successful in 45s
CI / Semgrep Security Scan (pull_request) Successful in 20s
CI / Compose Bucket Idempotency (pull_request) Successful in 1m2s

Add Spanish month names (Mexican-branch letters) to config.MONTHS and let
the month-first matcher accept a hyphen (not just a dot) before the year, so
"Mayo 18-1929"/"Junio 7-904" parse without manual overrides. Also bound
4-digit years to 1700-2100 so gross typos ("23-9003") stay in review instead
of producing a bogus year. Cuts unknown-date rate 9.2% -> 7.9%.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 17:00:33 +02:00
parent 0f1f9055c3
commit 5efe3b8a7c
3 changed files with 28 additions and 3 deletions

View File

@@ -35,6 +35,7 @@ def test_expand_year():
assert dates.expand_year("73") == 1873 # 73..99 -> 18xx
assert dates.expand_year("99") == 1899
assert dates.expand_year("65") is None # 58..72 ambiguous
assert dates.expand_year("9003") is None # implausible 4-digit year -> reject (typo)
assert dates.expand_year("x") is None
def test_parse_iso_and_empty():
@@ -127,3 +128,21 @@ def test_parse_date_override_wins():
ovr = {"13.5.65": ("1965-05-13", "DAY")}
r = dates.parse_date("13.5.65", ovr) # ambiguous without override
assert r == dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65")
def test_parse_spanish_months():
    """Check that Spanish month names parse, day-first and month-first."""
    # Mexican-branch letters: Spanish month names appear both day-first and
    # month-first, with either a dot or a hyphen in front of the year.
    iso_cases = (
        ("21.Enero 1911", "1911-01-21"),   # day-first
        ("Junio 17.929", "1929-06-17"),    # month-first, dot, 3-digit year
        ("Mayo 18-1929", "1929-05-18"),    # month-first, hyphen
        ("Abril 10-929", "1929-04-10"),    # hyphen, 3-digit year
        ("Agosto 27-929", "1929-08-27"),
        ("febrero 14-29", "1929-02-14"),   # hyphen, 2-digit year
    )
    for raw, expected_iso in iso_cases:
        assert dates.parse_date(raw).iso == expected_iso

    # The hyphen form carries a day, so it must report day-level precision.
    assert dates.parse_date("Mayo 18-1929").precision == Precision.DAY
def test_implausible_year_goes_to_review():
    """Check that the typo'd input "October 23-9003" comes back as UNKNOWN."""
    # A source typo like "October 23-9003" must NOT parse to a bogus year
    # 9003; the result has to stay UNKNOWN.
    parsed = dates.parse_date("October 23-9003")
    assert parsed.precision == Precision.UNKNOWN
def test_hyphen_month_first_does_not_shadow_month_year():
    """Check that "Mai 1895" still parses as a month-precision date."""
    # The hyphen-separator generalization must NOT make "Mai 1895" parse
    # as day=18.
    raw = "Mai 1895"
    actual = dates.parse_date(raw)
    expected = dates.ParsedDate("1895-05-01", Precision.MONTH, raw)
    assert actual == expected