feat(normalizer): parse Spanish month names + Month DD-YYYY hyphen form
All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m31s
CI / OCR Service Tests (pull_request) Successful in 22s
CI / Backend Unit Tests (pull_request) Successful in 3m42s
CI / fail2ban Regex (pull_request) Successful in 45s
CI / Semgrep Security Scan (pull_request) Successful in 20s
CI / Compose Bucket Idempotency (pull_request) Successful in 1m2s
All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m31s
CI / OCR Service Tests (pull_request) Successful in 22s
CI / Backend Unit Tests (pull_request) Successful in 3m42s
CI / fail2ban Regex (pull_request) Successful in 45s
CI / Semgrep Security Scan (pull_request) Successful in 20s
CI / Compose Bucket Idempotency (pull_request) Successful in 1m2s
Add Spanish month names (as written in the Mexican-branch letters) to
config.MONTHS and let the month-first matcher accept a hyphen (not just a dot)
before the year, so "Mayo 18-1929"/"Junio 7-904" parse without manual
overrides. Also bound 4-digit years to 1700-2100 so gross typos ("23-9003")
stay in review instead of producing a bogus year. Cuts the unknown-date rate
from 9.2% to 7.9%.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -35,6 +35,7 @@ def test_expand_year():
|
||||
assert dates.expand_year("73") == 1873 # 73..99 -> 18xx
|
||||
assert dates.expand_year("99") == 1899
|
||||
assert dates.expand_year("65") is None # 58..72 ambiguous
|
||||
assert dates.expand_year("9003") is None # implausible 4-digit year -> reject (typo)
|
||||
assert dates.expand_year("x") is None
|
||||
|
||||
def test_parse_iso_and_empty():
|
||||
@@ -127,3 +128,21 @@ def test_parse_date_override_wins():
|
||||
ovr = {"13.5.65": ("1965-05-13", "DAY")}
|
||||
r = dates.parse_date("13.5.65", ovr) # ambiguous without override
|
||||
assert r == dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65")
|
||||
|
||||
def test_parse_spanish_months():
    """Spanish month names parse in day-first and month-first (dot or hyphen before the year) forms."""
    # Mexican-branch letters: each raw date string is paired with the ISO date it must yield.
    expected_iso = (
        ("21.Enero 1911", "1911-01-21"),  # day-first
        ("Junio 17.929", "1929-06-17"),  # month-first, dot, 3-digit year
        ("Mayo 18-1929", "1929-05-18"),  # month-first, hyphen
        ("Abril 10-929", "1929-04-10"),  # hyphen, 3-digit year
        ("Agosto 27-929", "1929-08-27"),
        ("febrero 14-29", "1929-02-14"),  # hyphen, 2-digit year
    )
    for raw, iso in expected_iso:
        assert dates.parse_date(raw).iso == iso

    # the hyphen form with a full day must report day-level precision
    parsed = dates.parse_date("Mayo 18-1929")
    assert parsed.precision == Precision.DAY
|
||||
|
||||
def test_implausible_year_goes_to_review():
    """An implausible 4-digit year must leave the date UNKNOWN instead of parsing."""
    # source typo "October 23-9003": the parser must not emit a bogus year 9003
    result = dates.parse_date("October 23-9003")
    assert result.precision == Precision.UNKNOWN
|
||||
|
||||
def test_hyphen_month_first_does_not_shadow_month_year():
    """A plain "month year" string still parses at MONTH precision."""
    # with the hyphen-separator generalization in place, "Mai 1895" must NOT parse as day=18
    raw = "Mai 1895"
    got = dates.parse_date(raw)
    want = dates.ParsedDate("1895-05-01", Precision.MONTH, raw)
    assert got == want
|
||||
|
||||
Reference in New Issue
Block a user