From 443c7a48dbb7d57ce9ea5d9548cc32402cd39060 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 25 May 2026 20:46:42 +0200 Subject: [PATCH] fix(normalizer): don't convert plausible typo years as Excel serials --- tools/import-normalizer/persons_tree.py | 44 ++++++++++++++----- .../tests/test_persons_tree.py | 7 +++ 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/tools/import-normalizer/persons_tree.py b/tools/import-normalizer/persons_tree.py index a52787a6..68e77ffb 100644 --- a/tools/import-normalizer/persons_tree.py +++ b/tools/import-normalizer/persons_tree.py @@ -11,14 +11,26 @@ import dates from persons import _strip_accents +_MIN_YEAR = 1700 +_MAX_YEAR = 2100 +# Threshold: if parse_date parses a pure-digit string as a year outside [_MIN_YEAR, _MAX_YEAR], +# but the year is a plausible typo (1000-3000), don't try serial conversion. +# Years outside this range (e.g., 7568) are implausible and should try serial conversion. +_PLAUSIBLE_TYPO_MIN = 1000 +_PLAUSIBLE_TYPO_MAX = 3000 + + def _parse_year(raw: str | None) -> int | None: """Extract a birth/death year from an Excel cell string. - Handles four cases: - 1. ISO string (openpyxl date-formatted cell) → parse_date() - 2. Numeric string that is an Excel serial (1-80000) → timedelta conversion - 3. Any other string → parse_date() - 4. Unresolvable → None + Handles three cases: + 1. ISO / German / text string parseable by parse_date() → extract year if in range + 2. Pure-integer string (out-of-range or unparseable) → try Excel serial conversion + (unless it's a plausible typo year, e.g., "1023" for "1923") + 3. Mixed-format or unresolvable → None + + Serial conversion only fires for pure-digit strings that parse_date() cannot parse or parses + as an implausible year (outside 1000-3000); typo years like "1023" are never read as serials.
""" if raw is None: return None @@ -26,21 +38,31 @@ def _parse_year(raw: str | None) -> int | None: if not s: return None + # Check if it's a pure-digit string (candidate for serial conversion) + is_pure_digit = re.fullmatch(r"\d+", s) is not None + # Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.) result = dates.parse_date(s) if result.iso: year = int(result.iso[:4]) - # Reject years outside 1700-2100 (same guard as expand_year()) - if 1700 <= year <= 2100: + if _MIN_YEAR <= year <= _MAX_YEAR: return year + # Year is out of range. Only try serial conversion if it's an implausible year. + # Plausible typos (e.g., 1023 for 1923) should not be converted as serials. + if is_pure_digit and not (_PLAUSIBLE_TYPO_MIN <= year <= _PLAUSIBLE_TYPO_MAX): + n = int(s) + if 1 <= n <= 80_000: + d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n) + if _MIN_YEAR <= d.year <= _MAX_YEAR: + return d.year + return None - # If it's a pure integer string, try Excel serial conversion. - # parse_date() may parse large serials like "7568" as year 7568 or other edge cases. - if re.fullmatch(r"\d+", s): + # parse_date() found nothing. Try serial conversion only for pure-digit strings. 
+ if is_pure_digit: n = int(s) if 1 <= n <= 80_000: d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n) - if 1700 <= d.year <= 2100: + if _MIN_YEAR <= d.year <= _MAX_YEAR: return d.year return None diff --git a/tools/import-normalizer/tests/test_persons_tree.py b/tools/import-normalizer/tests/test_persons_tree.py index 30c602ff..0419ebcd 100644 --- a/tools/import-normalizer/tests/test_persons_tree.py +++ b/tools/import-normalizer/tests/test_persons_tree.py @@ -52,3 +52,10 @@ def test_parse_year_unresolvable_truncated(): def test_parse_year_typo_year(): # "4.3.1023" — year 1023 outside 1500-2100 guard — returns None assert persons_tree._parse_year("4.3.1023") is None + + +def test_parse_year_bare_out_of_range_year_is_none(): + # "1023" is a plausible typo for "1923"; it must NOT be read as Excel serial 1023. + # parse_date("1023") parses it as year 1023 (out of 1700-2100 guard). + # The serial branch must NOT re-interpret it as a serial (that would yield 1902). + assert persons_tree._parse_year("1023") is None