familienarchiv/tools/import-normalizer/persons_tree.py

"""Normalize Personendatei 2.xlsx into canonical-persons-tree.json."""
import argparse
import datetime
import json
import re
import sys
from pathlib import Path

import config
import dates
from persons import _strip_accents


# Inclusive range of years that _parse_year accepts as a birth/death year.
_MIN_YEAR = 1700
_MAX_YEAR = 2100
# Inclusive "plausible typo" band used by _parse_year. When parse_date() reads a
# pure-digit cell as a year outside [_MIN_YEAR, _MAX_YEAR] but inside this band
# (e.g. "1023" typed for "1923"), the value is treated as a mistyped year and is
# NOT reinterpreted as an Excel serial date. Years outside the band (e.g. 7568)
# are implausible as years, so serial conversion is attempted for them.
_PLAUSIBLE_TYPO_MIN = 1000
_PLAUSIBLE_TYPO_MAX = 3000


def _parse_year(raw: str | None) -> int | None:
    """Extract a birth/death year from an Excel cell string.

    Handles three cases:
    1. ISO / German / text string parseable by parse_date() → extract year if in range
    2. Pure-integer string (out-of-range or unparseable) → try Excel serial conversion
       (unless it's a plausible typo year, e.g., "1023" for "1923")
    3. Mixed-format or unresolvable → None

    Serial conversion only fires for pure-digit strings and implausible years,
    preventing typo years like "1023" from being mis-converted as serials.
    """
    if raw is None:
        return None
    s = str(raw).strip()
    if not s:
        return None

    # Check if it's a pure-digit string (candidate for serial conversion)
    is_pure_digit = re.fullmatch(r"\d+", s) is not None

    # Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.)
    result = dates.parse_date(s)
    if result.iso:
        year = int(result.iso[:4])
        if _MIN_YEAR <= year <= _MAX_YEAR:
            return year
        # Year is out of range. Only try serial conversion if it's an implausible year.
        # Plausible typos (e.g., 1023 for 1923) should not be converted as serials.
        if is_pure_digit and not (_PLAUSIBLE_TYPO_MIN <= year <= _PLAUSIBLE_TYPO_MAX):
            n = int(s)
            if 1 <= n <= 80_000:
                d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n)
                if _MIN_YEAR <= d.year <= _MAX_YEAR:
                    return d.year
        return None

    # parse_date() found nothing. Try serial conversion only for pure-digit strings.
    if is_pure_digit:
        n = int(s)
        if 1 <= n <= 80_000:
            d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n)
            if _MIN_YEAR <= d.year <= _MAX_YEAR:
                return d.year

    return None


def _parse_generation(raw: str | None) -> int | None:
    """Extract the generation integer from column A values like 'G 3', 'G3', 'G  0'."""
    if not raw:
        return None
    m = re.search(r"\d+", str(raw))
    return int(m.group()) if m else None


# Tokens that _norm_tree drops from a name before matching: geographic markers
# ("aachen", "mex", "mexiko") and, presumably, senior/junior suffixes ("sen",
# "jun", "jr"). _norm_tree compares against lowercased tokens with dots already
# replaced by spaces, so entries here must be lowercase and dot-free.
_GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"}


def _norm_tree(s: str) -> str:
    """Reduce a name string to the canonical key used for tree matching.

    Steps, in order:

    1. Trim whitespace, then trim surrounding double/single quotes.
    2. Delete every parenthesised substring.
    3. Fold accents via ``_strip_accents`` (rules defined in ``persons``),
       lowercase, and turn dots into spaces.
    4. Drop every token listed in ``_GEO_SUFFIXES``, wherever it occurs.
    5. Re-join the remaining tokens with single spaces and trim leftover
       dots, commas and spaces from both ends.

    A falsy input (None or "") yields an empty string.
    """
    unquoted = (s or "").strip().strip("\"'")
    without_parens = re.sub(r"\([^)]*\)", "", unquoted)
    folded = _strip_accents(without_parens).lower()
    # split() without arguments never produces empty tokens.
    words = folded.replace(".", " ").split()
    kept = [word for word in words if word not in _GEO_SUFFIXES]
    return " ".join(kept).strip("., ")


def _build_index(persons: list[dict]) -> dict[str, list[str]]:
    """Build a name → [rowId, …] lookup index with four keys per person."""
    index: dict[str, list[str]] = {}

    def _add(key: str, row_id: str) -> None:
        if key:
            index.setdefault(key, []).append(row_id)

    for p in persons:
        row_id = p["rowId"]
        first = p.get("firstName") or ""
        last = p.get("lastName") or ""
        maiden = p.get("maidenName") or ""

        _add(_norm_tree(f"{first} {last}"), row_id)
        _add(_norm_tree(f"{last} {first}"), row_id)
        if maiden:
            _add(_norm_tree(f"{first} {maiden}"), row_id)
        _add(_norm_tree(last), row_id)

    return index


def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]:
    """Look up a raw name in the index and return ``(row_id, reason)``.

    Exactly one of the two tuple members is set:

    * ``(row_id, None)`` when the normalized name maps to exactly one entry.
    * ``(None, "empty")`` when the name normalizes to an empty key.
    * ``(None, "not_found")`` when the key has no entries in the index.
    * ``(None, "ambiguous")`` when the key has more than one entry.
    """
    key = _norm_tree(raw)
    if not key:
        return None, "empty"

    candidates = index.get(key, [])
    if not candidates:
        return None, "not_found"
    if len(candidates) > 1:
        return None, "ambiguous"
    return candidates[0], None