Files
familienarchiv/tools/import-normalizer/tests/test_dates.py
Marcel 5efe3b8a7c
All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m31s
CI / OCR Service Tests (pull_request) Successful in 22s
CI / Backend Unit Tests (pull_request) Successful in 3m42s
CI / fail2ban Regex (pull_request) Successful in 45s
CI / Semgrep Security Scan (pull_request) Successful in 20s
CI / Compose Bucket Idempotency (pull_request) Successful in 1m2s
feat(normalizer): parse Spanish month names + Month DD-YYYY hyphen form
Add Spanish month names (Mexican-branch letters) to config.MONTHS and let
the month-first matcher accept a hyphen (not just a dot) before the year, so
"Mayo 18-1929"/"Junio 7-904" parse without manual overrides. Also bound
4-digit years to 1700-2100 so gross typos ("23-9003") stay in review instead
of producing a bogus year. Cuts unknown-date rate 9.2% -> 7.9%.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 17:00:33 +02:00

149 lines
7.9 KiB
Python

import datetime
import dates
from dates import Precision
def test_easter_known_years():
    """Check ``dates.easter`` against four known Easter Sundays."""
    # Anonymous Gregorian algorithm; expected values verified against
    # published tables.
    known_sundays = (
        (2024, datetime.date(2024, 3, 31)),
        (2000, datetime.date(2000, 4, 23)),
        (1922, datetime.date(1922, 4, 16)),
        (1888, datetime.date(1888, 4, 1)),
    )
    for year, sunday in known_sundays:
        assert dates.easter(year) == sunday
def test_resolve_feast_movable():
    """Movable feasts resolve to a full ISO date with DAY precision."""
    cases = (
        ("Pfingsten", 1922, "1922-06-04"),
        ("Ostern", 2024, "2024-03-31"),
        ("Pfingstmontag", 1922, "1922-06-05"),
    )
    for feast, year, iso in cases:
        assert dates.resolve_feast_or_season(feast, year) == (iso, Precision.DAY)
def test_resolve_feast_fixed():
    """Fixed-date feasts resolve to their calendar day with DAY precision."""
    christmas = dates.resolve_feast_or_season("Weihnachten", 1900)
    assert christmas == ("1900-12-25", Precision.DAY)

    new_year = dates.resolve_feast_or_season("Neujahr", 1910)
    assert new_year == ("1910-01-01", Precision.DAY)
def test_resolve_season():
    """Season names resolve to the ISO dates below with SEASON precision."""
    for season, year, iso in (
        ("Herbst", 1913, "1913-10-01"),
        ("Sommer", 1910, "1910-07-01"),
    ):
        assert dates.resolve_feast_or_season(season, year) == (iso, Precision.SEASON)
def test_resolve_unknown_token_returns_none():
    """A token that is neither feast nor season (here a weekday) yields None."""
    outcome = dates.resolve_feast_or_season("Freitag", 1919)
    assert outcome is None
def test_expand_year():
    """Check year expansion for 4-, 3- and 2-digit inputs plus rejected inputs."""
    expanded = (
        ("1888", 1888),
        ("889", 1889),  # 3-digit -> 1DDD
        ("923", 1923),
        ("08", 1908),  # 00..57 -> 19xx
        ("17", 1917),
        ("57", 1957),
        ("73", 1873),  # 73..99 -> 18xx
        ("99", 1899),
    )
    for text, year in expanded:
        assert dates.expand_year(text) == year

    rejected = (
        "65",  # 58..72 ambiguous
        "9003",  # implausible 4-digit year -> reject (typo)
        "x",
    )
    for text in rejected:
        assert dates.expand_year(text) is None
def test_parse_iso_and_empty():
    """ISO input is returned as DAY; empty or "?" input is UNKNOWN with no date."""
    iso_text = "1910-04-23"
    assert dates.parse_date(iso_text) == dates.ParsedDate(iso_text, Precision.DAY, iso_text)

    # Inputs carrying no date information keep their raw text but get no ISO value.
    for blank in ("", "?"):
        assert dates.parse_date(blank) == dates.ParsedDate(None, Precision.UNKNOWN, blank)
def test_parse_numeric_forms():
    """Numeric day/month/year spellings parse to the expected ISO date."""
    iso_by_raw = {
        "15.2.1888": "1888-02-15",
        "13.5.09": "1909-05-13",
        "17/6. 1916": "1916-06-17",
        "11.10.08": "1908-10-11",
        "30.1.889": "1889-01-30",
    }
    for raw, iso in iso_by_raw.items():
        assert dates.parse_date(raw).iso == iso

    # A fully specified numeric date carries DAY precision.
    assert dates.parse_date("15.2.1888").precision == Precision.DAY
def test_parse_numeric_unparseable():
    """Numeric inputs without a usable year come back as UNKNOWN."""
    for raw in (
        "8.9.",  # no year
        "13.5.65",  # ambiguous 2-digit year
    ):
        assert dates.parse_date(raw).precision == Precision.UNKNOWN
def test_parse_approx_marker_upgrades_precision():
    """A "(?)" marker inside a parseable date results in APPROX precision."""
    text = "17.Nov (?) 1887"
    parsed = dates.parse_date(text)  # month-name matcher now active; (?) marks approx

    # The raw input is carried through unchanged.
    assert parsed.raw == text
    # The month-name matcher parses the date; the "(?)" upgrades it to APPROX.
    assert parsed.precision == Precision.APPROX
def test_parse_leading_qualifier_is_approx():
    """A leading qualifier ("nach") is stripped and the date is kept as APPROX."""
    # Qualifier stripped, numeric date salvaged, precision APPROX.
    parsed = dates.parse_date("nach 1.5.1900")

    assert parsed.iso == "1900-05-01"
    assert parsed.precision == Precision.APPROX
def test_parse_roman_months():
    """Roman-numeral months parse to the expected ISO date with DAY precision."""
    cases = (
        ("22.III.18", "1918-03-22"),
        ("19.XII.1954", "1954-12-19"),
        ("1.III.27", "1927-03-01"),
    )
    for raw, iso in cases:
        assert dates.parse_date(raw).iso == iso

    assert dates.parse_date("22.III.18").precision == Precision.DAY
def test_parse_monthname_day_first():
    """Day-first dates with a written month parse regardless of spacing or dots."""
    iso_by_raw = {
        "6.März 1888": "1888-03-06",
        "29.Sept.1891": "1891-09-29",
        "10.Oct.95": "1895-10-10",
        "9.December1889": "1889-12-09",
        "18.Dez.1916": "1916-12-18",
        "4Dezember 1936": "1936-12-04",
        "25 August 1968": "1968-08-25",
    }
    for raw, iso in iso_by_raw.items():
        assert dates.parse_date(raw).iso == iso
def test_parse_month_year_year_only():
    """Month+year and bare-year inputs get a first-of-period date and matching precision."""
    cases = (
        ("Mai 1895", "1895-05-01", Precision.MONTH),
        ("October 1903", "1903-10-01", Precision.MONTH),
        ("1905", "1905-01-01", Precision.YEAR),
    )
    for raw, iso, precision in cases:
        assert dates.parse_date(raw) == dates.ParsedDate(iso, precision, raw)
def test_parse_feast_and_season_via_parse_date():
    """Feast and season names followed by a year are handled by ``parse_date``."""
    whitsun = "Pfingsten 1922"
    assert dates.parse_date(whitsun) == dates.ParsedDate("1922-06-04", Precision.DAY, whitsun)

    autumn = "Herbst 1913"
    assert dates.parse_date(autumn) == dates.ParsedDate("1913-10-01", Precision.SEASON, autumn)

    whit_sunday = dates.parse_date("Pfingstsonntag 1915")
    assert whit_sunday.precision == Precision.DAY
def test_parse_ranges():
    """Ranges are reported by their start date with RANGE precision."""
    full_span = "8.1.1916 - 15.3.1916"
    assert dates.parse_date(full_span) == dates.ParsedDate("1916-01-08", Precision.RANGE, full_span)

    year_pair = "1881/82"
    assert dates.parse_date(year_pair) == dates.ParsedDate("1881-01-01", Precision.RANGE, year_pair)

    # '?' stripped -> RANGE, then APPROX
    uncertain_pair = "1945/46?"
    assert dates.parse_date(uncertain_pair).iso == "1945-01-01"
    assert dates.parse_date(uncertain_pair).precision == Precision.APPROX
def test_parse_approx_full():
    """A date with an embedded "(?)" still yields its ISO value, marked APPROX."""
    parsed = dates.parse_date("17.Nov (?) 1887")

    assert parsed.iso == "1887-11-17"
    assert parsed.precision == Precision.APPROX
def test_parse_english_month_first_now_works():
    """Month-first "April 12. 1922" parses, and plain month+year is unaffected."""
    month_first = dates.parse_date("April 12. 1922")
    assert month_first.iso == "1922-04-12"

    # "Mai 1895" must not be shadowed by the month-first matcher.
    month_year = dates.parse_date("Mai 1895")
    assert month_year.iso == "1895-05-01"
def test_parse_unparseable_examples():
    """A weekday followed by a year is not a recognised date form -> UNKNOWN."""
    parsed = dates.parse_date("Freitag 1919")
    assert parsed.precision == Precision.UNKNOWN
def test_parse_invalid_calendar_date_is_unknown():
    """Impossible calendar dates must come back as UNKNOWN, never clamped."""
    # try/except ValueError in the matchers must route impossible dates to
    # UNKNOWN (-> review), never silently clamp. This is the most likely
    # real-data bug class at 7,600 rows.
    impossible = ("30.2.1888", "31.4.1916")
    for raw in impossible:
        assert dates.parse_date(raw).precision == Precision.UNKNOWN
def test_parse_intra_month_day_range():
    """A day range within one month is RANGE; a slash-date stays a single DAY."""
    # "7./8. Sept.1923" -> start day, RANGE.
    day_range = "7./8. Sept.1923"
    assert dates.parse_date(day_range) == dates.ParsedDate("1923-09-07", Precision.RANGE, day_range)

    # Must NOT be confused with the slash-date form "17/6. 1916".
    slash_date = "17/6. 1916"
    assert dates.parse_date(slash_date) == dates.ParsedDate("1916-06-17", Precision.DAY, slash_date)
def test_parse_trailing_note_stripped_but_raw_preserved():
    """A trailing note does not block parsing, and stays in ``raw`` (REQ-DATE-04)."""
    parsed = dates.parse_date("17.Nov 1887, 2. Brief")

    assert parsed.iso == "1887-11-17"
    # The original string is preserved verbatim.
    assert "2. Brief" in parsed.raw
def test_parse_date_override_wins():
    """An override entry decides the result for an otherwise ambiguous input."""
    overrides = {"13.5.65": ("1965-05-13", "DAY")}
    # "13.5.65" is ambiguous without the override.
    parsed = dates.parse_date("13.5.65", overrides)

    wanted = dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65")
    assert parsed == wanted
def test_parse_spanish_months():
    """Spanish month names parse in day-first and month-first layouts."""
    # Mexican-branch letters: Spanish month names, day-first and month-first
    # (hyphen or dot before the year).
    cases = (
        ("21.Enero 1911", "1911-01-21"),  # day-first
        ("Junio 17.929", "1929-06-17"),  # month-first, dot, 3-digit year
        ("Mayo 18-1929", "1929-05-18"),  # month-first, hyphen
        ("Abril 10-929", "1929-04-10"),  # hyphen, 3-digit year
        ("Agosto 27-929", "1929-08-27"),
        ("febrero 14-29", "1929-02-14"),  # hyphen, 2-digit year
    )
    for raw, iso in cases:
        assert dates.parse_date(raw).iso == iso

    assert dates.parse_date("Mayo 18-1929").precision == Precision.DAY
def test_implausible_year_goes_to_review():
    """A typo year such as 9003 must leave the date UNKNOWN, not parse as 9003."""
    # A source typo like "October 23-9003" must NOT produce a bogus year.
    parsed = dates.parse_date("October 23-9003")
    assert parsed.precision == Precision.UNKNOWN
def test_hyphen_month_first_does_not_shadow_month_year():
    """Plain month+year input still parses as MONTH, not as a month-first day."""
    # The hyphen-separator generalization must NOT make "Mai 1895" parse as day=18.
    raw = "Mai 1895"
    wanted = dates.ParsedDate("1895-05-01", Precision.MONTH, raw)
    assert dates.parse_date(raw) == wanted