feat(normalizer): add persons_tree skeleton + year extraction
This commit is contained in:
46
tools/import-normalizer/persons_tree.py
Normal file
46
tools/import-normalizer/persons_tree.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
"""Normalize Personendatei 2.xlsx into canonical-persons-tree.json."""
|
||||||
|
import argparse
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import config
|
||||||
|
import dates
|
||||||
|
from persons import _strip_accents
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_year(raw: str | None) -> int | None:
|
||||||
|
"""Extract a birth/death year from an Excel cell string.
|
||||||
|
|
||||||
|
Handles four cases:
|
||||||
|
1. ISO string (openpyxl date-formatted cell) → parse_date()
|
||||||
|
2. Numeric string that is an Excel serial (1-80000) → timedelta conversion
|
||||||
|
3. Any other string → parse_date()
|
||||||
|
4. Unresolvable → None
|
||||||
|
"""
|
||||||
|
if raw is None:
|
||||||
|
return None
|
||||||
|
s = str(raw).strip()
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.)
|
||||||
|
result = dates.parse_date(s)
|
||||||
|
if result.iso:
|
||||||
|
year = int(result.iso[:4])
|
||||||
|
# Reject years outside 1700-2100 (same guard as expand_year())
|
||||||
|
if 1700 <= year <= 2100:
|
||||||
|
return year
|
||||||
|
|
||||||
|
# If it's a pure integer string, try Excel serial conversion.
|
||||||
|
# parse_date() may parse large serials like "7568" as year 7568 or other edge cases.
|
||||||
|
if re.fullmatch(r"\d+", s):
|
||||||
|
n = int(s)
|
||||||
|
if 1 <= n <= 80_000:
|
||||||
|
d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n)
|
||||||
|
if 1700 <= d.year <= 2100:
|
||||||
|
return d.year
|
||||||
|
|
||||||
|
return None
|
||||||
54
tools/import-normalizer/tests/test_persons_tree.py
Normal file
54
tools/import-normalizer/tests/test_persons_tree.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
import persons_tree
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_iso_string():
    """An ISO ``YYYY-MM-DD`` string resolves to its year."""
    year = persons_tree._parse_year("1920-09-20")
    assert year == 1920
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_excel_serial_birth():
    """A digits-only Excel serial resolves to the year of the date it encodes."""
    # 7568 days after 1899-12-30 is 1920-09-19.
    year = persons_tree._parse_year("7568")
    assert year == 1920
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_excel_serial_death():
    """A five-digit Excel serial resolves to a late-20th-century year."""
    # 36222 days after 1899-12-30 falls in 1999.
    year = persons_tree._parse_year("36222")
    assert year == 1999
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_excel_serial_small():
    """A small Excel serial still resolves to a year via serial conversion."""
    # 177 days after 1899-12-30 is 1900-06-25.
    year = persons_tree._parse_year("177")
    assert year == 1900
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_german_date_string():
    """A ``D.M.YYYY`` date string resolves to its year."""
    year = persons_tree._parse_year("30.8.1862")
    assert year == 1862
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_year_only():
    """A bare four-digit year is returned as that year, not read as a serial."""
    year = persons_tree._parse_year("1930")
    assert year == 1930
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_free_text():
    """Month-name plus year text resolves to the year."""
    year = persons_tree._parse_year("August 1941")
    assert year == 1941
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_none():
    """A ``None`` cell yields ``None``."""
    outcome = persons_tree._parse_year(None)
    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_empty():
    """An empty string yields ``None``."""
    outcome = persons_tree._parse_year("")
    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_unresolvable_truncated():
    """A truncated date with no valid four-digit year yields ``None``."""
    # "2.9.196" is missing the last digit of its year.
    outcome = persons_tree._parse_year("2.9.196")
    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_typo_year():
    """A date whose year is implausibly early yields ``None``."""
    # "4.3.1023": year 1023 lies below the 1700-2100 window that
    # _parse_year accepts, so no year is returned.
    assert persons_tree._parse_year("4.3.1023") is None
|
||||||
Reference in New Issue
Block a user