fix(normalizer): don't convert plausible typo years as Excel serials
This commit is contained in:
@@ -11,14 +11,26 @@ import dates
|
|||||||
from persons import _strip_accents
|
from persons import _strip_accents
|
||||||
|
|
||||||
|
|
||||||
|
_MIN_YEAR = 1700
|
||||||
|
_MAX_YEAR = 2100
|
||||||
|
# Threshold: if parse_date parses a pure-digit string as a year outside [_MIN_YEAR, _MAX_YEAR],
|
||||||
|
# but the year is a plausible typo (1000-3000), don't try serial conversion.
|
||||||
|
# Years outside the plausible-typo range (e.g., 7568) are implausible as years, so the string is tried as an Excel serial instead.
|
||||||
|
_PLAUSIBLE_TYPO_MIN = 1000
|
||||||
|
_PLAUSIBLE_TYPO_MAX = 3000
|
||||||
|
|
||||||
|
|
||||||
def _parse_year(raw: str | None) -> int | None:
|
def _parse_year(raw: str | None) -> int | None:
|
||||||
"""Extract a birth/death year from an Excel cell string.
|
"""Extract a birth/death year from an Excel cell string.
|
||||||
|
|
||||||
Handles four cases:
|
Handles three cases:
|
||||||
1. ISO string (openpyxl date-formatted cell) → parse_date()
|
1. ISO / German / text string parseable by parse_date() → extract year if in range
|
||||||
2. Numeric string that is an Excel serial (1-80000) → timedelta conversion
|
2. Pure-integer string (out-of-range or unparseable) → try Excel serial conversion
|
||||||
3. Any other string → parse_date()
|
(unless it's a plausible typo year, e.g., "1023" for "1923")
|
||||||
4. Unresolvable → None
|
3. Mixed-format or unresolvable → None
|
||||||
|
|
||||||
|
Serial conversion only fires for pure-digit strings that parse_date() cannot parse or reads as an implausible year,
|
||||||
|
preventing typo years like "1023" from being mis-converted as serials.
|
||||||
"""
|
"""
|
||||||
if raw is None:
|
if raw is None:
|
||||||
return None
|
return None
|
||||||
@@ -26,21 +38,31 @@ def _parse_year(raw: str | None) -> int | None:
|
|||||||
if not s:
|
if not s:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Check if it's a pure-digit string (candidate for serial conversion)
|
||||||
|
is_pure_digit = re.fullmatch(r"\d+", s) is not None
|
||||||
|
|
||||||
# Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.)
|
# Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.)
|
||||||
result = dates.parse_date(s)
|
result = dates.parse_date(s)
|
||||||
if result.iso:
|
if result.iso:
|
||||||
year = int(result.iso[:4])
|
year = int(result.iso[:4])
|
||||||
# Reject years outside 1700-2100 (same guard as expand_year())
|
if _MIN_YEAR <= year <= _MAX_YEAR:
|
||||||
if 1700 <= year <= 2100:
|
|
||||||
return year
|
return year
|
||||||
|
# Year is out of range. Only try serial conversion if it's an implausible year.
|
||||||
|
# Plausible typos (e.g., 1023 for 1923) should not be converted as serials.
|
||||||
|
if is_pure_digit and not (_PLAUSIBLE_TYPO_MIN <= year <= _PLAUSIBLE_TYPO_MAX):
|
||||||
|
n = int(s)
|
||||||
|
if 1 <= n <= 80_000:
|
||||||
|
d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n)
|
||||||
|
if _MIN_YEAR <= d.year <= _MAX_YEAR:
|
||||||
|
return d.year
|
||||||
|
return None
|
||||||
|
|
||||||
# If it's a pure integer string, try Excel serial conversion.
|
# parse_date() found nothing. Try serial conversion only for pure-digit strings.
|
||||||
# parse_date() may parse large serials like "7568" as year 7568 or other edge cases.
|
if is_pure_digit:
|
||||||
if re.fullmatch(r"\d+", s):
|
|
||||||
n = int(s)
|
n = int(s)
|
||||||
if 1 <= n <= 80_000:
|
if 1 <= n <= 80_000:
|
||||||
d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n)
|
d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n)
|
||||||
if 1700 <= d.year <= 2100:
|
if _MIN_YEAR <= d.year <= _MAX_YEAR:
|
||||||
return d.year
|
return d.year
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -52,3 +52,10 @@ def test_parse_year_unresolvable_truncated():
|
|||||||
def test_parse_year_typo_year():
|
def test_parse_year_typo_year():
|
||||||
# "4.3.1023" — year 1023 outside 1700-2100 guard — returns None
|
# "4.3.1023" — year 1023 outside 1700-2100 guard — returns None
|
||||||
assert persons_tree._parse_year("4.3.1023") is None
|
assert persons_tree._parse_year("4.3.1023") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_bare_out_of_range_year_is_none():
|
||||||
|
# "1023" is a plausible typo for "1923" but is NOT an Excel serial.
|
||||||
|
# parse_date("1023") parses it as year 1023 (out of 1700-2100 guard).
|
||||||
|
# The serial branch must NOT re-interpret it as a serial.
|
||||||
|
assert persons_tree._parse_year("1023") is None
|
||||||
|
|||||||
Reference in New Issue
Block a user