From 9ae1196d1ca064525e526a701638103a532f0590 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 25 May 2026 20:41:25 +0200 Subject: [PATCH] feat(normalizer): add persons_tree skeleton + year extraction --- tools/import-normalizer/persons_tree.py | 46 ++++++++++++++++ .../tests/test_persons_tree.py | 54 +++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 tools/import-normalizer/persons_tree.py create mode 100644 tools/import-normalizer/tests/test_persons_tree.py diff --git a/tools/import-normalizer/persons_tree.py b/tools/import-normalizer/persons_tree.py new file mode 100644 index 00000000..a52787a6 --- /dev/null +++ b/tools/import-normalizer/persons_tree.py @@ -0,0 +1,46 @@ +"""Normalize Personendatei 2.xlsx into canonical-persons-tree.json.""" +import argparse +import datetime +import json +import re +import sys +from pathlib import Path + +import config +import dates +from persons import _strip_accents + + +def _parse_year(raw: str | None) -> int | None: + """Extract a birth/death year from an Excel cell string. + + Handles four cases (parse_date() is always tried before the serial fallback): + 1. ISO string (openpyxl date-formatted cell) → parse_date() + 2. Any other string parse_date() resolves (DD.MM.YYYY, year-only, month+year) → parse_date() + 3. Pure-digit string in 1-80000 that parse_date() did not resolve to 1700-2100 → Excel serial via timedelta + 4. Unresolvable, or resolved year outside 1700-2100 → None + """ + if raw is None: + return None + s = str(raw).strip() + if not s: + return None + + # Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.) + result = dates.parse_date(s) + if result.iso: + year = int(result.iso[:4]) + # Reject years outside 1700-2100 (same guard as expand_year()) + if 1700 <= year <= 2100: + return year + + # Fallback, reached only when parse_date() gave no year in 1700-2100: if s is a pure integer string, try Excel serial conversion. + # parse_date() may parse large serials like "7568" as year 7568 (rejected by the guard above) or other edge cases.
+ if re.fullmatch(r"\d+", s): + n = int(s) + if 1 <= n <= 80_000: + d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n) + if 1700 <= d.year <= 2100: + return d.year + + return None diff --git a/tools/import-normalizer/tests/test_persons_tree.py b/tools/import-normalizer/tests/test_persons_tree.py new file mode 100644 index 00000000..30c602ff --- /dev/null +++ b/tools/import-normalizer/tests/test_persons_tree.py @@ -0,0 +1,54 @@ +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import persons_tree + + +def test_parse_year_iso_string(): + assert persons_tree._parse_year("1920-09-20") == 1920 + + +def test_parse_year_excel_serial_birth(): + # 7568 days from 1899-12-30 = 1920-09-19 + assert persons_tree._parse_year("7568") == 1920 + + +def test_parse_year_excel_serial_death(): + # 36222 days from 1899-12-30 = 1999-03-03 + assert persons_tree._parse_year("36222") == 1999 + + +def test_parse_year_excel_serial_small(): + # 177 days from 1899-12-30 = 1900-06-25 + assert persons_tree._parse_year("177") == 1900 + + +def test_parse_year_german_date_string(): + assert persons_tree._parse_year("30.8.1862") == 1862 + + +def test_parse_year_year_only(): + assert persons_tree._parse_year("1930") == 1930 + + +def test_parse_year_free_text(): + assert persons_tree._parse_year("August 1941") == 1941 + + +def test_parse_year_none(): + assert persons_tree._parse_year(None) is None + + +def test_parse_year_empty(): + assert persons_tree._parse_year("") is None + + +def test_parse_year_unresolvable_truncated(): + # "2.9.196" has no valid 4-digit year — returns None + assert persons_tree._parse_year("2.9.196") is None + + +def test_parse_year_typo_year(): + # "4.3.1023" — year 1023 outside 1700-2100 guard — returns None + assert persons_tree._parse_year("4.3.1023") is None