diff --git a/tools/import-normalizer/config.py b/tools/import-normalizer/config.py
index d789a2af..f055d422 100644
--- a/tools/import-normalizer/config.py
+++ b/tools/import-normalizer/config.py
@@ -85,6 +85,10 @@ MONTHS = {
     "oktober": 10, "okt": 10, "oct": 10, "october": 10,
     "november": 11, "nov": 11,
     "dezember": 12, "dez": 12, "dec": 12, "december": 12,
+    # Spanish (Mexican-branch correspondence)
+    "enero": 1, "febrero": 2, "marzo": 3, "abril": 4, "mayo": 5, "junio": 6,
+    "julio": 7, "agosto": 8, "septiembre": 9, "setiembre": 9, "octubre": 10,
+    "noviembre": 11, "diciembre": 12,
 }
 
 ROMAN_MONTHS = {
diff --git a/tools/import-normalizer/dates.py b/tools/import-normalizer/dates.py
index b4eaca6a..77245680 100644
--- a/tools/import-normalizer/dates.py
+++ b/tools/import-normalizer/dates.py
@@ -48,7 +48,8 @@ def expand_year(token: str):
         return None
     n, v = len(token), int(token)
     if n == 4:
-        return v
+        # reject gross typos (e.g. "9003") so they go to review instead of a bogus year
+        return v if 1700 <= v <= 2100 else None
     if n == 3:
         return 1000 + v
     if n == 2:
@@ -157,8 +158,10 @@ def _match_monthname_a(s):
     return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3)))
 
 
-# dot after day is REQUIRED so this can't match "Mai 1895" (MONTH YYYY) as day=18
-_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\.\s*(\d{2,4})")
+# A separator (dot OR hyphen/en-dash) after the day is REQUIRED so this can't match
+# "Mai 1895" (MONTH YYYY) as day=18; the hyphen form also covers Spanish "Mayo 18-1929".
+# NOTE(review): a bare day range like "Mai 18-20" also fits this shape (year token "20") — confirm intended.
+_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\s*[.\-–]\s*(\d{2,4})")
 
 
 def _match_monthname_b(s):
diff --git a/tools/import-normalizer/tests/test_dates.py b/tools/import-normalizer/tests/test_dates.py
index a08b6b61..2a43ad61 100644
--- a/tools/import-normalizer/tests/test_dates.py
+++ b/tools/import-normalizer/tests/test_dates.py
@@ -35,6 +35,12 @@ def test_expand_year():
     assert dates.expand_year("73") == 1873  # 73..99 -> 18xx
     assert dates.expand_year("99") == 1899
     assert dates.expand_year("65") is None  # 58..72 ambiguous
+    assert dates.expand_year("9003") is None  # implausible 4-digit year -> reject (typo)
+    # the plausibility window is inclusive at both ends: pin the edges so an off-by-one is caught
+    assert dates.expand_year("1700") == 1700
+    assert dates.expand_year("2100") == 2100
+    assert dates.expand_year("1699") is None
+    assert dates.expand_year("2101") is None
     assert dates.expand_year("x") is None
 
 def test_parse_iso_and_empty():
@@ -127,3 +133,21 @@ def test_parse_date_override_wins():
     ovr = {"13.5.65": ("1965-05-13", "DAY")}
     r = dates.parse_date("13.5.65", ovr)  # ambiguous without override
     assert r == dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65")
+
+def test_parse_spanish_months():
+    # Mexican-branch letters: Spanish month names, day-first and month-first (hyphen/dot before year)
+    assert dates.parse_date("21.Enero 1911").iso == "1911-01-21"  # day-first
+    assert dates.parse_date("Junio 17.929").iso == "1929-06-17"  # month-first, dot, 3-digit year
+    assert dates.parse_date("Mayo 18-1929").iso == "1929-05-18"  # month-first, hyphen
+    assert dates.parse_date("Abril 10-929").iso == "1929-04-10"  # hyphen, 3-digit year
+    assert dates.parse_date("Agosto 27-929").iso == "1929-08-27"
+    assert dates.parse_date("febrero 14-29").iso == "1929-02-14"  # hyphen, 2-digit year
+    assert dates.parse_date("Mayo 18-1929").precision == Precision.DAY
+
+def test_implausible_year_goes_to_review():
+    # a source typo like "October 23-9003" must NOT parse to a bogus year 9003 — stays UNKNOWN
+    assert dates.parse_date("October 23-9003").precision == Precision.UNKNOWN
+
+def test_hyphen_month_first_does_not_shadow_month_year():
+    # the hyphen-separator generalization must NOT make "Mai 1895" parse as day=18
+    assert dates.parse_date("Mai 1895") == dates.ParsedDate("1895-05-01", Precision.MONTH, "Mai 1895")