Wires the two-pass pipeline (parse → deduplicate → index → resolve) into a runnable CLI with --input, --output, and --dry-run flags. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
458 lines
15 KiB
Python
458 lines
15 KiB
Python
import sys
|
|
from pathlib import Path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
import persons_tree
|
|
|
|
|
|
def test_parse_year_iso_string():
    """An ISO-style ``YYYY-MM-DD`` string is expected to yield its year."""
    year = persons_tree._parse_year("1920-09-20")
    assert year == 1920
|
|
|
|
|
|
def test_parse_year_excel_serial_birth():
    """The numeric string ``"7568"`` is expected to resolve to 1920."""
    # 7568 days from 1899-12-30 = 1920-09-19 or -20 depending on leap counting
    year = persons_tree._parse_year("7568")
    assert year == 1920
|
|
|
|
|
|
def test_parse_year_excel_serial_death():
    """The numeric string ``"36222"`` is expected to resolve to 1999."""
    # 36222 days from 1899-12-30 ≈ 1999
    year = persons_tree._parse_year("36222")
    assert year == 1999
|
|
|
|
|
|
def test_parse_year_excel_serial_small():
    """The small numeric string ``"177"`` is expected to resolve to 1900."""
    # 177 days from 1899-12-30 = 1900-06-25
    year = persons_tree._parse_year("177")
    assert year == 1900
|
|
|
|
|
|
def test_parse_year_german_date_string():
    """A ``D.M.YYYY`` string is expected to yield its year."""
    year = persons_tree._parse_year("30.8.1862")
    assert year == 1862
|
|
|
|
|
|
def test_parse_year_year_only():
    """A bare four-digit year string is expected to come back as that year."""
    year = persons_tree._parse_year("1930")
    assert year == 1930
|
|
|
|
|
|
def test_parse_year_free_text():
    """Free text containing a year (``"August 1941"``) is expected to yield 1941."""
    year = persons_tree._parse_year("August 1941")
    assert year == 1941
|
|
|
|
|
|
def test_parse_year_none():
    """``None`` input is expected to produce ``None``."""
    year = persons_tree._parse_year(None)
    assert year is None
|
|
|
|
|
|
def test_parse_year_empty():
    """An empty string is expected to produce ``None``."""
    year = persons_tree._parse_year("")
    assert year is None
|
|
|
|
|
|
def test_parse_year_unresolvable_truncated():
    """A date whose year is truncated to three digits is expected to give ``None``."""
    # "2.9.196" has no valid 4-digit year — returns None
    year = persons_tree._parse_year("2.9.196")
    assert year is None
|
|
|
|
|
|
def test_parse_year_typo_year():
    """A date with the implausible year 1023 is expected to give ``None``."""
    # "4.3.1023" — year 1023 outside 1700-2100 guard — returns None
    year = persons_tree._parse_year("4.3.1023")
    assert year is None
|
|
|
|
|
|
def test_parse_year_bare_out_of_range_year_is_none():
    """The bare string ``"1023"`` is expected to give ``None``."""
    # "1023" is a plausible typo for "1923" but is NOT an Excel serial.
    # parse_date("1023") parses it as year 1023 (out of 1700-2100 guard).
    # The serial branch must NOT re-interpret it as a serial.
    year = persons_tree._parse_year("1023")
    assert year is None
|
|
|
|
|
|
def test_parse_generation_space():
    """``"G 3"`` (letter, space, digit) is expected to parse as generation 3."""
    generation = persons_tree._parse_generation("G 3")
    assert generation == 3
|
|
|
|
|
|
def test_parse_generation_no_space():
    """``"G3"`` (no separator) is expected to parse as generation 3."""
    generation = persons_tree._parse_generation("G3")
    assert generation == 3
|
|
|
|
|
|
def test_parse_generation_extra_spaces():
    """``"G 0"`` is expected to parse as generation 0."""
    # NOTE(review): the test name mentions extra spaces, but the literal below
    # contains a single space — confirm the intended input.
    generation = persons_tree._parse_generation("G 0")
    assert generation == 0
|
|
|
|
|
|
def test_parse_generation_trailing_garbage():
    """Text following the generation marker is expected to be ignored."""
    generation = persons_tree._parse_generation("G 2 de Gruyter")
    assert generation == 2
|
|
|
|
|
|
def test_parse_generation_empty():
    """An empty string is expected to produce ``None``."""
    generation = persons_tree._parse_generation("")
    assert generation is None
|
|
|
|
|
|
def test_parse_generation_none():
    """``None`` input is expected to produce ``None``."""
    generation = persons_tree._parse_generation(None)
    assert generation is None
|
|
|
|
|
|
def test_norm_tree_basic():
    """A plain two-word name is expected to come back lower-cased."""
    normalised = persons_tree._norm_tree("Werner Allemeyer")
    assert normalised == "werner allemeyer"
|
|
|
|
|
|
def test_norm_tree_diacritics():
    """``"Wöhler"`` is expected to normalise to ``"woehler"``."""
    normalised = persons_tree._norm_tree("Wöhler")
    assert normalised == "woehler"
|
|
|
|
|
|
def test_norm_tree_strips_parens():
    """A parenthesised part of the name is expected to be dropped."""
    normalised = persons_tree._norm_tree("Otto (Herbert)")
    assert normalised == "otto"
|
|
|
|
|
|
def test_norm_tree_strips_quotes():
    """Surrounding double quotes are expected to be removed."""
    normalised = persons_tree._norm_tree('"Tante Lolly"')
    assert normalised == "tante lolly"
|
|
|
|
|
|
def test_norm_tree_strips_geographic_suffix():
    """A trailing place name (``Aachen``) is expected to be removed."""
    normalised = persons_tree._norm_tree("Walter Cram Aachen")
    assert normalised == "walter cram"
|
|
|
|
|
|
def test_norm_tree_strips_mexiko():
    """A trailing ``Mexiko`` is expected to be removed."""
    normalised = persons_tree._norm_tree("Hans Cram Mexiko")
    assert normalised == "hans cram"
|
|
|
|
|
|
def test_norm_tree_collapses_whitespace():
    """Leading and trailing spaces are expected to be removed from the name."""
    # NOTE(review): the test name mentions collapsing whitespace, but the
    # literal below has no repeated interior spaces — confirm the intended input.
    normalised = persons_tree._norm_tree(" Clara de Gruyter ")
    assert normalised == "clara de gruyter"
|
|
|
|
|
|
def test_build_index_forward_lookup():
    """The index is expected to hold a "first last" key for a person."""
    werner = {
        "rowId": "row_002",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    index = persons_tree._build_index([werner])

    assert "werner allemeyer" in index
    assert index["werner allemeyer"] == ["row_002"]
|
|
|
|
|
|
def test_build_index_reversed_lookup():
    """The index is expected to hold a "last first" key for a person."""
    werner = {
        "rowId": "row_002",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    index = persons_tree._build_index([werner])

    assert index.get("allemeyer werner") == ["row_002"]
|
|
|
|
|
|
def test_build_index_maiden_name_lookup():
    """The index is expected to hold a "first maiden-name" key for a person."""
    elsgard = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "maidenName": "Wöhler",
    }
    index = persons_tree._build_index([elsgard])

    assert index.get("elsgard woehler") == ["row_002"]
|
|
|
|
|
|
def test_build_index_single_token_fallback():
    """The index is expected to hold a key for the last name on its own."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    index = persons_tree._build_index([herbert])

    assert index.get("cram") == ["row_028"]
|
|
|
|
|
|
def test_build_index_ambiguous_single_token():
    """Two people sharing a last name are both expected under that single key."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    clara = {
        "rowId": "row_019",
        "firstName": "Clara",
        "lastName": "Cram",
        "maidenName": None,
    }
    index = persons_tree._build_index([herbert, clara])

    # Order of the row ids is not part of the expectation.
    assert set(index["cram"]) == {"row_028", "row_019"}
|
|
|
|
|
|
def test_resolve_one_found():
    """A "last first" name matching one person resolves to that row, no reason."""
    werner = {
        "rowId": "row_003",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    index = persons_tree._build_index([werner])

    resolved = persons_tree._resolve_one("Allemeyer Werner", index)
    row_id, reason = resolved

    assert row_id == "row_003"
    assert reason is None
|
|
|
|
|
|
def test_resolve_one_not_found():
    """Against an empty index the result is ``(None, "not_found")``."""
    empty_index = {}

    resolved = persons_tree._resolve_one("Nobody Unknown", empty_index)
    row_id, reason = resolved

    assert row_id is None
    assert reason == "not_found"
|
|
|
|
|
|
def test_resolve_one_ambiguous():
    """A last name shared by two people resolves to ``(None, "ambiguous")``."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    clara = {
        "rowId": "row_019",
        "firstName": "Clara",
        "lastName": "Cram",
        "maidenName": None,
    }
    index = persons_tree._build_index([herbert, clara])

    resolved = persons_tree._resolve_one("Cram", index)
    row_id, reason = resolved

    assert row_id is None
    assert reason == "ambiguous"
|
|
|
|
|
|
def test_parse_row_serial_dates():
    """A row with numeric date strings maps onto the expected person fields."""
    fields = {
        "generation": "G 3",
        "last_name": "Allemeyer",
        "first_name": "Elsgard",
        "maiden_name": "Wöhler",
        "birth_date": "7568",
        "birth_place": "Garz",
        "death_date": "36222",
        "death_place": "Espelkamp",
        "spouse": "Allemeyer Werner",
        "notes": "Nichte von Herbert",
    }

    p = persons_tree._parse_row(2, fields)

    # Fields compared by equality, checked one by one so a failure names the key.
    expected = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "maidenName": "Wöhler",
        "birthYear": 1920,
        "deathYear": 1999,
        "birthPlace": "Garz",
        "deathPlace": "Espelkamp",
        "generation": 3,
        "_spouse_raw": "Allemeyer Werner",
        "_bemerkung_raw": "Nichte von Herbert",
    }
    for key, wanted in expected.items():
        assert p[key] == wanted, key

    # Must be the literal True, not merely truthy.
    assert p["familyMember"] is True

    # Both dates resolved, so no birth-date marker should appear in the notes.
    notes_text = p["notes"] or ""
    assert "[Geburtsdatum" not in notes_text
|
|
|
|
|
|
def test_parse_row_string_birth_date():
    """A ``D.M.YYYY`` birth date gives a birth year; empty fields stay empty."""
    fields = {
        "generation": "G 2",
        "last_name": "Cram",
        "first_name": "Herbert",
        "maiden_name": "",
        "birth_date": "25.6.1890",
        "birth_place": "Texas",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }

    p = persons_tree._parse_row(28, fields)

    assert p["birthYear"] == 1890
    assert p["deathYear"] is None

    # Either representation of "no notes" is accepted.
    notes = p["notes"]
    assert notes is None or notes == ""
|
|
|
|
|
|
def test_parse_row_unresolvable_date_goes_to_notes():
    """A birth date without a year is kept in the notes beside the remark."""
    fields = {
        "generation": "G 3",
        "last_name": "Heydrich",
        "first_name": "Dieter",
        "maiden_name": "",
        "birth_date": "28.9.",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "Bruder v Ingrid",
    }

    p = persons_tree._parse_row(96, fields)

    assert p["birthYear"] is None
    # The raw date and the original remark must both be present in the notes.
    for fragment in ("[Geburtsdatum: 28.9.]", "Bruder v Ingrid"):
        assert fragment in p["notes"]
|
|
|
|
|
|
def test_parse_row_empty_spouse_and_notes():
    """Empty spouse and notes cells give ``None`` for both raw fields."""
    fields = {
        "generation": "G 4",
        "last_name": "Allemeyer",
        "first_name": "Jürgen",
        "maiden_name": "",
        "birth_date": "",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }

    p = persons_tree._parse_row(4, fields)

    for raw_key in ("_spouse_raw", "_bemerkung_raw"):
        assert p[raw_key] is None
|
|
|
|
|
|
def test_deduplicate_no_duplicates():
    """Two distinct people are both kept and nothing is reported as skipped."""
    elsgard = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "birthYear": 1920,
    }
    werner = {
        "rowId": "row_003",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "birthYear": 1923,
    }

    kept, skipped = persons_tree._deduplicate([elsgard, werner])

    assert len(kept) == 2
    assert skipped == []
|
|
|
|
|
|
def test_deduplicate_exact_match():
    """Identical name and birth year: the first row is kept, the second skipped."""
    # rows 127/138: same firstName, lastName, birthYear
    first = {
        "rowId": "row_127",
        "firstName": "Christa",
        "lastName": "Schütz",
        "birthYear": 1951,
    }
    second = {
        "rowId": "row_138",
        "firstName": "Christa",
        "lastName": "Schütz",
        "birthYear": 1951,
    }

    kept, skipped = persons_tree._deduplicate([first, second])

    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_127"]
    assert len(skipped) == 1
    # The skip entry must mention the dropped row.
    assert "row_138" in skipped[0]
|
|
|
|
|
|
def test_deduplicate_none_birth_year_after_known():
    """A same-named row lacking a birth year is skipped after a dated one."""
    # rows 129/139: row 129 has birthYear=1964, row 139 has birthYear=None
    dated = {
        "rowId": "row_129",
        "firstName": "Christoph",
        "lastName": "Seils",
        "birthYear": 1964,
    }
    undated = {
        "rowId": "row_139",
        "firstName": "Christoph",
        "lastName": "Seils",
        "birthYear": None,
    }

    kept, skipped = persons_tree._deduplicate([dated, undated])

    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_129"]
    assert len(skipped) == 1
|
|
|
|
|
|
def test_deduplicate_both_none_birth_year_kept():
    """Same-named rows that both lack a birth year: only the first survives."""
    # Two people with no birth year but same name: keep first only
    first = {
        "rowId": "row_A",
        "firstName": "Hans",
        "lastName": "Cram",
        "birthYear": None,
    }
    second = {
        "rowId": "row_B",
        "firstName": "Hans",
        "lastName": "Cram",
        "birthYear": None,
    }

    kept, skipped = persons_tree._deduplicate([first, second])

    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_A"]
    assert len(skipped) == 1
|
|
|
|
|
|
def _make_persons(*args):
|
|
"""Helper: args are (rowId, firstName, lastName, maidenName, spouse_raw) tuples."""
|
|
return [
|
|
{"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3],
|
|
"_spouse_raw": a[4], "_bemerkung_raw": None,
|
|
"birthYear": None, "deathYear": None, "birthPlace": None, "deathPlace": None,
|
|
"generation": None, "familyMember": True, "alias": None, "notes": None}
|
|
for a in args
|
|
]
|
|
|
|
|
|
def test_resolve_spouses_success():
    """Two people naming each other as spouse give one ``SPOUSE_OF`` relation."""
    couple = _make_persons(
        ("row_002", "Elsgard", "Allemeyer", "Wöhler", "Allemeyer Werner"),
        ("row_003", "Werner", "Allemeyer", None, "Elsgard Wöhler"),
    )
    index = persons_tree._build_index(couple)

    rels, unres = persons_tree._resolve_spouses(couple, index)

    assert len(rels) == 1
    relation = rels[0]
    assert relation["type"] == "SPOUSE_OF"
    # Which side is personId and which is relatedPersonId is not checked.
    endpoints = {relation["personId"], relation["relatedPersonId"]}
    assert endpoints == {"row_002", "row_003"}
    assert unres == []
|
|
|
|
|
|
def test_resolve_spouses_not_found():
    """A spouse string matching nobody is reported as unresolved, ``not_found``."""
    people = _make_persons(
        ("row_007", "Charlotte", "Blomquist", "Ruge", '"Tante Lolly"'),
    )
    index = persons_tree._build_index(people)

    rels, unres = persons_tree._resolve_spouses(people, index)

    assert rels == []
    assert len(unres) == 1
    entry = unres[0]
    assert entry["rowId"] == "row_007"
    assert entry["reason"] == "not_found"
|
|
|
|
|
|
def test_resolve_spouses_empty_spouse_field():
    """A person with no spouse text gives no relations and nothing unresolved."""
    people = _make_persons(
        ("row_004", "Jürgen", "Allemeyer", None, None),
    )
    index = persons_tree._build_index(people)

    rels, unres = persons_tree._resolve_spouses(people, index)

    assert rels == []
    assert unres == []
|
|
|
|
|
|
def _register(*args):
    """Return ``(persons, index)`` built from ``(rowId, first, last, maiden)`` tuples.

    ``persons`` is the list of minimal person dicts; ``index`` is the result of
    passing that list to ``persons_tree._build_index``.
    """
    persons = []
    for entry in args:
        persons.append(
            {
                "rowId": entry[0],
                "firstName": entry[1],
                "lastName": entry[2],
                "maidenName": entry[3],
            }
        )
    index = persons_tree._build_index(persons)
    return persons, index
|
|
|
|
|
|
def test_parse_bemerkung_sohn_two_parents():
    """"Sohn v A u B" gives one ``PARENT_OF`` relation per named parent."""
    _, index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    remark = "Sohn v Clara Cram u Herbert Cram"

    rels, unres, notes = persons_tree._parse_bemerkung("row_021", remark, index)

    assert len(rels) == 2
    assert all(rel["type"] == "PARENT_OF" for rel in rels)

    # The remark's owner is the child in every relation; both parents appear.
    children = {rel["relatedPersonId"] for rel in rels}
    parents = {rel["personId"] for rel in rels}
    assert children == {"row_021"}
    assert {"row_019", "row_028"} <= parents

    assert unres == []
    assert notes == ""
|
|
|
|
|
|
def test_parse_bemerkung_tochter_von():
    """"Tochter von X" gives exactly one fully populated ``PARENT_OF`` relation."""
    _, index = _register(("row_019", "Clara", "Cram", None))
    remark = "Tochter von Clara Cram"

    rels, unres, notes = persons_tree._parse_bemerkung("row_036", remark, index)

    expected_relation = {
        "personId": "row_019",
        "relatedPersonId": "row_036",
        "type": "PARENT_OF",
        "source": "bemerkung",
        "rawBemerkung": remark,
    }
    assert len(rels) == 1
    assert rels[0] == expected_relation
    assert notes == ""
|
|
|
|
|
|
def test_parse_bemerkung_vater():
    """"Vater v X" makes the remark's owner the parent of X."""
    _, index = _register(("row_028", "Herbert", "Cram", None))

    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_031", "Vater v Herbert Cram", index
    )

    assert len(rels) == 1
    relation = rels[0]
    assert relation["personId"] == "row_031"
    assert relation["relatedPersonId"] == "row_028"
    assert relation["type"] == "PARENT_OF"
|
|
|
|
|
|
def test_parse_bemerkung_unmatched_parent_name():
    """A parent name missing from the index is reported as ``not_found``."""
    _, index = _register()  # empty index

    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_004", "Sohn v Elsgard A.", index
    )

    assert rels == []
    assert len(unres) == 1
    assert unres[0]["reason"] == "not_found"
    assert notes == ""
|
|
|
|
|
|
def test_parse_bemerkung_skip_nichte():
    """A "Nichte von" remark gives no relation and is returned as notes."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Nichte von Herbert"

    rels, unres, notes = persons_tree._parse_bemerkung("row_002", remark, index)

    assert rels == []
    assert unres == []
    assert notes == "Nichte von Herbert"
|
|
|
|
|
|
def test_parse_bemerkung_skip_bruder():
    """A "Bruder v" remark gives no relation and is returned as notes."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Bruder v Herbert"

    rels, unres, notes = persons_tree._parse_bemerkung("row_033", remark, index)

    assert rels == []
    assert unres == []
    assert notes == "Bruder v Herbert"
|
|
|
|
|
|
def test_parse_bemerkung_empty():
    """An empty remark gives no relations, nothing unresolved and empty notes."""
    _, index = _register()

    rels, unres, notes = persons_tree._parse_bemerkung("row_004", "", index)

    assert rels == []
    assert unres == []
    assert notes == ""
|
|
|
|
|
|
def test_parse_bemerkung_plain_remark():
    """A remark with no relationship wording is returned unchanged as notes."""
    _, index = _register()
    remark = "Verfasserin der Cram-Chronik !!"

    rels, unres, notes = persons_tree._parse_bemerkung("row_029", remark, index)

    assert rels == []
    assert unres == []
    assert notes == "Verfasserin der Cram-Chronik !!"
|
|
|
|
|
|
def test_parse_bemerkung_sohn_with_trailing_remark():
    """Text after the comma in a "Sohn v ..." remark is returned as notes."""
    _, index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    remark = "Sohn v Clara Cram u Herbert Cram, nach Mexiko emigriert"

    rels, unres, notes = persons_tree._parse_bemerkung("row_021", remark, index)

    assert len(rels) == 2
    assert unres == []
    assert notes == "nach Mexiko emigriert"
|
|
|
|
|
|
import subprocess
|
|
|
|
|
|
def test_dry_run_exits_zero(tmp_path):
    """Run the CLI with ``--dry-run``: expect exit code 0 and no output file.

    The test is skipped when the source Excel workbook is not present.
    """
    this_file = Path(__file__)
    workbook = (
        this_file.parent.parent.parent.parent / "import" / "Personendatei 2.xlsx"
    )
    if not workbook.exists():
        import pytest

        pytest.skip("source Excel file not present")

    script = this_file.parent.parent / "persons_tree.py"
    target = tmp_path / "out.json"
    command = [
        sys.executable, str(script),
        "--input", str(workbook),
        "--output", str(target),
        "--dry-run",
    ]

    # Run in a child interpreter so the script's own CLI entry point is used.
    result = subprocess.run(command, capture_output=True, text=True)

    # stderr is attached so a non-zero exit shows the script's error output.
    assert result.returncode == 0, result.stderr
    assert not target.exists()
    assert "persons parsed" in result.stdout
|