feat(normalizer): add persons_tree skeleton + year extraction
This commit is contained in:
46
tools/import-normalizer/persons_tree.py
Normal file
46
tools/import-normalizer/persons_tree.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""Normalize Personendatei 2.xlsx into canonical-persons-tree.json."""
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import config
|
||||
import dates
|
||||
from persons import _strip_accents
|
||||
|
||||
|
||||
def _parse_year(raw: str | None) -> int | None:
|
||||
"""Extract a birth/death year from an Excel cell string.
|
||||
|
||||
Handles four cases:
|
||||
1. ISO string (openpyxl date-formatted cell) → parse_date()
|
||||
2. Numeric string that is an Excel serial (1-80000) → timedelta conversion
|
||||
3. Any other string → parse_date()
|
||||
4. Unresolvable → None
|
||||
"""
|
||||
if raw is None:
|
||||
return None
|
||||
s = str(raw).strip()
|
||||
if not s:
|
||||
return None
|
||||
|
||||
# Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.)
|
||||
result = dates.parse_date(s)
|
||||
if result.iso:
|
||||
year = int(result.iso[:4])
|
||||
# Reject years outside 1700-2100 (same guard as expand_year())
|
||||
if 1700 <= year <= 2100:
|
||||
return year
|
||||
|
||||
# If it's a pure integer string, try Excel serial conversion.
|
||||
# parse_date() may parse large serials like "7568" as year 7568 or other edge cases.
|
||||
if re.fullmatch(r"\d+", s):
|
||||
n = int(s)
|
||||
if 1 <= n <= 80_000:
|
||||
d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n)
|
||||
if 1700 <= d.year <= 2100:
|
||||
return d.year
|
||||
|
||||
return None
|
||||
Reference in New Issue
Block a user