Import normalizer: offline tool to normalize the raw archive spreadsheets #663

Merged
marcel merged 172 commits from docs/import-migration into main 2026-05-28 15:05:51 +02:00
3 changed files with 28 additions and 3 deletions
Showing only changes of commit 5efe3b8a7c - Show all commits

View File

@@ -85,6 +85,10 @@ MONTHS = {
"oktober": 10, "okt": 10, "oct": 10, "october": 10,
"november": 11, "nov": 11,
"dezember": 12, "dez": 12, "dec": 12, "december": 12,
# Spanish (Mexican-branch correspondence)
"enero": 1, "febrero": 2, "marzo": 3, "abril": 4, "mayo": 5, "junio": 6,
"julio": 7, "agosto": 8, "septiembre": 9, "setiembre": 9, "octubre": 10,
"noviembre": 11, "diciembre": 12,
}
ROMAN_MONTHS = {

View File

@@ -48,7 +48,8 @@ def expand_year(token: str):
return None
n, v = len(token), int(token)
if n == 4:
return v
# reject gross typos (e.g. "9003") so they go to review instead of a bogus year
return v if 1700 <= v <= 2100 else None
if n == 3:
return 1000 + v
if n == 2:
@@ -157,8 +158,9 @@ def _match_monthname_a(s):
return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3)))
# dot after day is REQUIRED so this can't match "Mai 1895" (MONTH YYYY) as day=18
_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\.\s*(\d{2,4})")
# Month-first pattern: MONTH[.] DAY <sep> YEAR, e.g. "Junio 17.929" or "Mayo 18-1929".
# A separator (dot, hyphen, or en-dash U+2013) after the day is REQUIRED so this can't
# match "Mai 1895" (MONTH YYYY) as day=18; the hyphen/en-dash form also covers Spanish
# "Mayo 18-1929". The en-dash is listed explicitly because "\-" alone only matches the
# ASCII hyphen-minus.
# Groups: 1 = month name, 2 = day (1-2 digits), 3 = year token (2-4 digits).
_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\s*[.\-\u2013]\s*(\d{2,4})")
def _match_monthname_b(s):

View File

@@ -35,6 +35,7 @@ def test_expand_year():
assert dates.expand_year("73") == 1873 # 73..99 -> 18xx
assert dates.expand_year("99") == 1899
assert dates.expand_year("65") is None # 58..72 ambiguous
assert dates.expand_year("9003") is None # implausible 4-digit year -> reject (typo)
assert dates.expand_year("x") is None
def test_parse_iso_and_empty():
@@ -127,3 +128,21 @@ def test_parse_date_override_wins():
ovr = {"13.5.65": ("1965-05-13", "DAY")}
r = dates.parse_date("13.5.65", ovr) # ambiguous without override
assert r == dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65")
def test_parse_spanish_months():
    # Mexican-branch letters use Spanish month names. Each row pairs a raw cell
    # value with the ISO date it is expected to parse to.
    expected_iso = (
        ("21.Enero 1911", "1911-01-21"),   # day-first
        ("Junio 17.929", "1929-06-17"),    # month-first, dot, 3-digit year
        ("Mayo 18-1929", "1929-05-18"),    # month-first, hyphen
        ("Abril 10-929", "1929-04-10"),    # hyphen, 3-digit year
        ("Agosto 27-929", "1929-08-27"),
        ("febrero 14-29", "1929-02-14"),   # hyphen, 2-digit year
    )
    for raw, iso in expected_iso:
        assert dates.parse_date(raw).iso == iso
    # a fully specified month-first date is reported at day precision
    assert dates.parse_date("Mayo 18-1929").precision == Precision.DAY
def test_implausible_year_goes_to_review():
    # "October 23-9003" is a source typo: it must be left as UNKNOWN
    # instead of being parsed to the bogus year 9003.
    parsed = dates.parse_date("October 23-9003")
    assert parsed.precision == Precision.UNKNOWN
def test_hyphen_month_first_does_not_shadow_month_year():
    # Guard for the hyphen-separator generalization: "Mai 1895" is MONTH YYYY
    # and must keep month precision, not be read as day=18.
    raw = "Mai 1895"
    actual = dates.parse_date(raw)
    expected = dates.ParsedDate("1895-05-01", Precision.MONTH, raw)
    assert actual == expected