"""Normalize Personendatei 2.xlsx into canonical-persons-tree.json.""" import argparse import datetime import json import re import sys from pathlib import Path import config import dates from persons import _strip_accents _MIN_YEAR = 1700 _MAX_YEAR = 2100 # Threshold: if parse_date parses a pure-digit string as a year outside [_MIN_YEAR, _MAX_YEAR], # but the year is a plausible typo (1000-3000), don't try serial conversion. # Years outside this range (e.g., 7568) are implausible and should try serial conversion. _PLAUSIBLE_TYPO_MIN = 1000 _PLAUSIBLE_TYPO_MAX = 3000 def _parse_year(raw: str | None) -> int | None: """Extract a birth/death year from an Excel cell string. Handles three cases: 1. ISO / German / text string parseable by parse_date() → extract year if in range 2. Pure-integer string (out-of-range or unparseable) → try Excel serial conversion (unless it's a plausible typo year, e.g., "1023" for "1923") 3. Mixed-format or unresolvable → None Serial conversion only fires for pure-digit strings and implausible years, preventing typo years like "1023" from being mis-converted as serials. """ if raw is None: return None s = str(raw).strip() if not s: return None # Check if it's a pure-digit string (candidate for serial conversion) is_pure_digit = re.fullmatch(r"\d+", s) is not None # Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.) result = dates.parse_date(s) if result.iso: year = int(result.iso[:4]) if _MIN_YEAR <= year <= _MAX_YEAR: return year # Year is out of range. Only try serial conversion if it's an implausible year. # Plausible typos (e.g., 1023 for 1923) should not be converted as serials. if is_pure_digit and not (_PLAUSIBLE_TYPO_MIN <= year <= _PLAUSIBLE_TYPO_MAX): n = int(s) if 1 <= n <= 80_000: d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n) if _MIN_YEAR <= d.year <= _MAX_YEAR: return d.year return None # parse_date() found nothing. Try serial conversion only for pure-digit strings. if is_pure_digit: n = int(s) if 1 <= n <= 80_000: d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n) if _MIN_YEAR <= d.year <= _MAX_YEAR: return d.year return None def _parse_generation(raw: str | None) -> int | None: """Extract the generation integer from column A values like 'G 3', 'G3', 'G 0'.""" if not raw: return None m = re.search(r"\d+", str(raw)) return int(m.group()) if m else None _GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"} def _norm_tree(s: str) -> str: """Normalize a name string for tree matching. - Strip surrounding quotes, remove parenthetical substrings - Diacritic → ASCII (ä→ae etc.), lowercase, dots → spaces - Remove known geographic/honorific suffix tokens - Collapse whitespace """ s = (s or "").strip().strip("\"'") s = re.sub(r"\([^)]*\)", "", s) s = _strip_accents(s).lower().replace(".", " ") tokens = [t for t in s.split() if t and t not in _GEO_SUFFIXES] return " ".join(tokens).strip("., ") def _build_index(persons: list[dict]) -> dict[str, list[str]]: """Build a name → [rowId, …] lookup index with four keys per person.""" index: dict[str, list[str]] = {} def _add(key: str, row_id: str) -> None: if key: index.setdefault(key, []).append(row_id) for p in persons: row_id = p["rowId"] first = p.get("firstName") or "" last = p.get("lastName") or "" maiden = p.get("maidenName") or "" _add(_norm_tree(f"{first} {last}"), row_id) _add(_norm_tree(f"{last} {first}"), row_id) if maiden: _add(_norm_tree(f"{first} {maiden}"), row_id) _add(_norm_tree(last), row_id) return index def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]: """Return (row_id, None) on unique match, (None, reason) otherwise.""" key = _norm_tree(raw) if not key: return None, "empty" hits = index.get(key, []) if len(hits) == 1: return hits[0], None if len(hits) == 0: return None, "not_found" return None, "ambiguous"