Files
familienarchiv/tools/import-normalizer/tests/test_persons_tree.py
Marcel ace41ad209 fix(normalizer): remove unauthorized first-name index key from _build_index
Remove the 5th unauthorized index key (_norm_tree(first)) from _build_index.
The spec requires exactly 4 keys per person:
1. forward (first last)
2. reversed (last first)
3. maiden name (first maiden) if maiden set
4. lastName only (last)

Update test data to use full names in Bemerkung fields (e.g., 'Clara Cram'
instead of 'Clara') since single first names alone are no longer resolvable.
All 52 tests pass.
2026-05-25 21:08:49 +02:00

421 lines
14 KiB
Python

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import persons_tree
def test_parse_year_iso_string():
    """An ISO ``YYYY-MM-DD`` string is reduced to its year."""
    year = persons_tree._parse_year("1920-09-20")
    assert year == 1920
def test_parse_year_excel_serial_birth():
    """A numeric string in the Excel-serial range is read as a day count."""
    # Counting 7568 days on from 1899-12-30 lands in September 1920.
    serial = "7568"
    assert persons_tree._parse_year(serial) == 1920
def test_parse_year_excel_serial_death():
    """A larger Excel serial (death-date column) resolves to its year."""
    # Counting 36222 days on from 1899-12-30 lands in 1999.
    serial = "36222"
    assert persons_tree._parse_year(serial) == 1999
def test_parse_year_excel_serial_small():
    """A three-digit serial is still treated as a day count."""
    # Counting 177 days on from 1899-12-30 gives 1900-06-25.
    serial = "177"
    assert persons_tree._parse_year(serial) == 1900
def test_parse_year_german_date_string():
    """A German ``D.M.YYYY`` date yields the trailing year."""
    year = persons_tree._parse_year("30.8.1862")
    assert year == 1862
def test_parse_year_year_only():
    """A bare four-digit year is returned as an int."""
    year = persons_tree._parse_year("1930")
    assert year == 1930
def test_parse_year_free_text():
    """A year embedded in free text (month name + year) is extracted."""
    year = persons_tree._parse_year("August 1941")
    assert year == 1941
def test_parse_year_none():
    """``None`` input produces ``None``."""
    year = persons_tree._parse_year(None)
    assert year is None
def test_parse_year_empty():
    """An empty string produces ``None``."""
    year = persons_tree._parse_year("")
    assert year is None
def test_parse_year_unresolvable_truncated():
    """A date whose year was cut off cannot be resolved."""
    # "2.9.196" contains no valid four-digit year, so None is expected.
    truncated = "2.9.196"
    assert persons_tree._parse_year(truncated) is None
def test_parse_year_typo_year():
    """A date with an implausible year is rejected."""
    # Year 1023 falls outside the 1700-2100 guard, so None is expected.
    typo = "4.3.1023"
    assert persons_tree._parse_year(typo) is None
def test_parse_year_bare_out_of_range_year_is_none():
    """A bare out-of-range year must not be re-read as an Excel serial."""
    # "1023" looks like a typo for "1923"; it is not an Excel serial.
    # parse_date("1023") reads it as year 1023, which the 1700-2100 guard
    # rejects, and the serial branch must not pick it up afterwards.
    bare_year = "1023"
    assert persons_tree._parse_year(bare_year) is None
def test_parse_generation_space():
    """``G`` followed by a space and a digit gives that digit."""
    generation = persons_tree._parse_generation("G 3")
    assert generation == 3
def test_parse_generation_no_space():
    """``G`` directly followed by a digit gives that digit."""
    generation = persons_tree._parse_generation("G3")
    assert generation == 3
def test_parse_generation_extra_spaces():
    """Generation 0 is parsed with one or with several spaces after ``G``.

    The single-space input alone has the same shape as the input of
    ``test_parse_generation_space``; the second assertion supplies the run of
    spaces that the test name refers to.
    """
    assert persons_tree._parse_generation("G 0") == 0
    # NOTE(review): assumes repeated spaces between "G" and the digit are
    # tolerated, as the test name implies -- confirm against _parse_generation.
    assert persons_tree._parse_generation("G   0") == 0
def test_parse_generation_trailing_garbage():
    """Text after the generation marker is ignored."""
    generation = persons_tree._parse_generation("G 2 de Gruyter")
    assert generation == 2
def test_parse_generation_empty():
    """An empty string produces ``None``."""
    generation = persons_tree._parse_generation("")
    assert generation is None
def test_parse_generation_none():
    """``None`` input produces ``None``."""
    generation = persons_tree._parse_generation(None)
    assert generation is None
def test_norm_tree_basic():
    """A plain two-part name is lower-cased."""
    normalized = persons_tree._norm_tree("Werner Allemeyer")
    assert normalized == "werner allemeyer"
def test_norm_tree_diacritics():
    """The umlaut in "Wöhler" is transliterated to "oe"."""
    normalized = persons_tree._norm_tree("Wöhler")
    assert normalized == "woehler"
def test_norm_tree_strips_parens():
    """A parenthesised part of the name is dropped."""
    normalized = persons_tree._norm_tree("Otto (Herbert)")
    assert normalized == "otto"
def test_norm_tree_strips_quotes():
    """Double quotes around a name are removed."""
    quoted = '"Tante Lolly"'
    assert persons_tree._norm_tree(quoted) == "tante lolly"
def test_norm_tree_strips_geographic_suffix():
    """A trailing place name ("Aachen") is not part of the normalized key."""
    normalized = persons_tree._norm_tree("Walter Cram Aachen")
    assert normalized == "walter cram"
def test_norm_tree_strips_mexiko():
    """A trailing "Mexiko" is not part of the normalized key."""
    normalized = persons_tree._norm_tree("Hans Cram Mexiko")
    assert normalized == "hans cram"
def test_norm_tree_collapses_whitespace():
    """Padding and repeated inner whitespace do not survive normalisation.

    The first input only carries whitespace at both ends; the second also
    contains runs of several spaces between the name parts, so that inner
    collapsing -- the behaviour named by this test -- is exercised as well.
    """
    expected = "clara de gruyter"
    assert persons_tree._norm_tree(" Clara de Gruyter ") == expected
    assert persons_tree._norm_tree("  Clara   de  Gruyter  ") == expected
def test_build_index_forward_lookup():
    """The "first last" key maps to the person's row id."""
    werner = {
        "rowId": "row_002",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    index = persons_tree._build_index([werner])
    forward_key = "werner allemeyer"
    assert forward_key in index
    assert index[forward_key] == ["row_002"]
def test_build_index_reversed_lookup():
    """The "last first" key maps to the person's row id."""
    werner = {
        "rowId": "row_002",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    index = persons_tree._build_index([werner])
    assert index.get("allemeyer werner") == ["row_002"]
def test_build_index_maiden_name_lookup():
    """The "first maiden" key (normalized) maps to the person's row id."""
    elsgard = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "maidenName": "Wöhler",
    }
    index = persons_tree._build_index([elsgard])
    assert index.get("elsgard woehler") == ["row_002"]
def test_build_index_single_token_fallback():
    """The last name on its own is an index key as well."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    index = persons_tree._build_index([herbert])
    assert index.get("cram") == ["row_028"]
def test_build_index_ambiguous_single_token():
    """Two people sharing a last name are both listed under that key."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert, clara])
    # Order of the row ids is not asserted, only membership.
    assert set(index["cram"]) == {"row_028", "row_019"}
def test_resolve_one_found():
    """A "last first" query resolves to the single matching row, no reason."""
    werner = {
        "rowId": "row_003",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    index = persons_tree._build_index([werner])
    resolved = persons_tree._resolve_one("Allemeyer Werner", index)
    row_id, reason = resolved
    assert row_id == "row_003"
    assert reason is None
def test_resolve_one_not_found():
    """Looking a name up in an empty index reports ``not_found``."""
    row_id, reason = persons_tree._resolve_one("Nobody Unknown", {})
    assert row_id is None
    assert reason == "not_found"
def test_resolve_one_ambiguous():
    """A last-name-only query matching two people reports ``ambiguous``."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert, clara])
    row_id, reason = persons_tree._resolve_one("Cram", index)
    assert row_id is None
    assert reason == "ambiguous"
def test_parse_row_serial_dates():
    """A fully populated row with Excel-serial dates maps onto a person dict."""
    raw_fields = {
        "generation": "G 3",
        "last_name": "Allemeyer",
        "first_name": "Elsgard",
        "maiden_name": "Wöhler",
        "birth_date": "7568",
        "birth_place": "Garz",
        "death_date": "36222",
        "death_place": "Espelkamp",
        "spouse": "Allemeyer Werner",
        "notes": "Nichte von Herbert",
    }
    person = persons_tree._parse_row(2, raw_fields)

    # Identity and name fields.
    assert person["rowId"] == "row_002"
    assert person["firstName"] == "Elsgard"
    assert person["lastName"] == "Allemeyer"
    assert person["maidenName"] == "Wöhler"

    # Both serial dates resolve to years; places pass through.
    assert person["birthYear"] == 1920
    assert person["deathYear"] == 1999
    assert person["birthPlace"] == "Garz"
    assert person["deathPlace"] == "Espelkamp"

    assert person["generation"] == 3
    assert person["familyMember"] is True

    # Raw spouse/remark text is kept for the later resolution passes.
    assert person["_spouse_raw"] == "Allemeyer Werner"
    assert person["_bemerkung_raw"] == "Nichte von Herbert"

    # The birth date was resolved, so no "[Geburtsdatum ...]" marker in notes.
    notes_text = person["notes"] or ""
    assert "[Geburtsdatum" not in notes_text
def test_parse_row_string_birth_date():
    """A German date string gives the birth year; blank fields stay empty."""
    raw_fields = {
        "generation": "G 2",
        "last_name": "Cram",
        "first_name": "Herbert",
        "maiden_name": "",
        "birth_date": "25.6.1890",
        "birth_place": "Texas",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }
    person = persons_tree._parse_row(28, raw_fields)
    assert person["birthYear"] == 1890
    assert person["deathYear"] is None
    # Either representation of "no notes" is accepted.
    notes = person["notes"]
    assert notes is None or notes == ""
def test_parse_row_unresolvable_date_goes_to_notes():
    """An unparseable birth date is preserved in notes next to the remark."""
    raw_fields = {
        "generation": "G 3",
        "last_name": "Heydrich",
        "first_name": "Dieter",
        "maiden_name": "",
        "birth_date": "28.9.",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "Bruder v Ingrid",
    }
    person = persons_tree._parse_row(96, raw_fields)
    assert person["birthYear"] is None
    notes = person["notes"]
    assert "[Geburtsdatum: 28.9.]" in notes
    assert "Bruder v Ingrid" in notes
def test_parse_row_empty_spouse_and_notes():
    """Blank spouse and notes cells become ``None`` in the raw fields."""
    raw_fields = {
        "generation": "G 4",
        "last_name": "Allemeyer",
        "first_name": "Jürgen",
        "maiden_name": "",
        "birth_date": "",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }
    person = persons_tree._parse_row(4, raw_fields)
    assert person["_spouse_raw"] is None
    assert person["_bemerkung_raw"] is None
def test_deduplicate_no_duplicates():
    """Distinct people are all kept and nothing is reported as skipped."""
    elsgard = {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "birthYear": 1920}
    werner = {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "birthYear": 1923}
    kept, skipped = persons_tree._deduplicate([elsgard, werner])
    assert len(kept) == 2
    assert skipped == []
def test_deduplicate_exact_match():
    """Identical first name, last name and birth year: the later row is skipped."""
    # Mirrors rows 127/138 of the sheet.
    first = {"rowId": "row_127", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}
    repeat = {"rowId": "row_138", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}
    kept, skipped = persons_tree._deduplicate([first, repeat])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_127"]
    assert len(skipped) == 1
    assert "row_138" in skipped[0]
def test_deduplicate_none_birth_year_after_known():
    """A same-name row without birth year is skipped after one with a year."""
    # Mirrors rows 129/139: row 129 has birthYear=1964, row 139 has none.
    with_year = {"rowId": "row_129", "firstName": "Christoph", "lastName": "Seils", "birthYear": 1964}
    without_year = {"rowId": "row_139", "firstName": "Christoph", "lastName": "Seils", "birthYear": None}
    kept, skipped = persons_tree._deduplicate([with_year, without_year])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_129"]
    assert len(skipped) == 1
def test_deduplicate_both_none_birth_year_kept():
    """Same name, both without birth year: only the first row is retained."""
    earlier = {"rowId": "row_A", "firstName": "Hans", "lastName": "Cram", "birthYear": None}
    later = {"rowId": "row_B", "firstName": "Hans", "lastName": "Cram", "birthYear": None}
    kept, skipped = persons_tree._deduplicate([earlier, later])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_A"]
    assert len(skipped) == 1
def _make_persons(*args):
"""Helper: args are (rowId, firstName, lastName, maidenName, spouse_raw) tuples."""
return [
{"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3],
"_spouse_raw": a[4], "_bemerkung_raw": None,
"birthYear": None, "deathYear": None, "birthPlace": None, "deathPlace": None,
"generation": None, "familyMember": True, "alias": None, "notes": None}
for a in args
]
def test_resolve_spouses_success():
    """Two rows naming each other as spouse yield a single SPOUSE_OF relation."""
    persons = _make_persons(
        ("row_002", "Elsgard", "Allemeyer", "Wöhler", "Allemeyer Werner"),
        ("row_003", "Werner", "Allemeyer", None, "Elsgard Wöhler"),
    )
    index = persons_tree._build_index(persons)
    relations, unresolved = persons_tree._resolve_spouses(persons, index)
    assert len(relations) == 1
    only = relations[0]
    assert only["type"] == "SPOUSE_OF"
    # Direction of the pair is not asserted, only its members.
    assert {only["personId"], only["relatedPersonId"]} == {"row_002", "row_003"}
    assert unresolved == []
def test_resolve_spouses_not_found():
    """A spouse text matching nobody is reported as unresolved/``not_found``."""
    persons = _make_persons(
        ("row_007", "Charlotte", "Blomquist", "Ruge", '"Tante Lolly"'),
    )
    index = persons_tree._build_index(persons)
    relations, unresolved = persons_tree._resolve_spouses(persons, index)
    assert relations == []
    assert len(unresolved) == 1
    entry = unresolved[0]
    assert entry["rowId"] == "row_007"
    assert entry["reason"] == "not_found"
def test_resolve_spouses_empty_spouse_field():
    """A person without spouse text produces no relation and no unresolved entry."""
    persons = _make_persons(
        ("row_004", "Jürgen", "Allemeyer", None, None),
    )
    index = persons_tree._build_index(persons)
    relations, unresolved = persons_tree._resolve_spouses(persons, index)
    assert relations == []
    assert unresolved == []
def _register(*args):
    """Return ``(persons, index)`` for ``(rowId, first, last, maiden)`` tuples.

    The index is whatever ``persons_tree._build_index`` produces for the
    minimal person dicts built here.
    """
    persons = []
    for entry in args:
        persons.append({
            "rowId": entry[0],
            "firstName": entry[1],
            "lastName": entry[2],
            "maidenName": entry[3],
        })
    index = persons_tree._build_index(persons)
    return persons, index
def test_parse_bemerkung_sohn_two_parents():
    """"Sohn v A u B" yields one PARENT_OF relation per named parent."""
    _, index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    remark = "Sohn v Clara Cram u Herbert Cram"
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_021", remark, index)
    assert len(relations) == 2
    assert all(rel["type"] == "PARENT_OF" for rel in relations)
    # The row being parsed is the child in every relation.
    children = {rel["relatedPersonId"] for rel in relations}
    parents = {rel["personId"] for rel in relations}
    assert children == {"row_021"}
    assert "row_019" in parents and "row_028" in parents
    assert unresolved == []
    assert notes == ""
def test_parse_bemerkung_tochter_von():
    """"Tochter von X" yields exactly one fully populated PARENT_OF record."""
    _, index = _register(("row_019", "Clara", "Cram", None))
    remark = "Tochter von Clara Cram"
    relations, _unresolved, notes = persons_tree._parse_bemerkung("row_036", remark, index)
    expected = {
        "personId": "row_019",
        "relatedPersonId": "row_036",
        "type": "PARENT_OF",
        "source": "bemerkung",
        "rawBemerkung": "Tochter von Clara Cram",
    }
    assert len(relations) == 1
    assert relations[0] == expected
    assert notes == ""
def test_parse_bemerkung_vater():
    """"Vater v X" makes the parsed row the parent of X."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    relations, _unresolved, _notes = persons_tree._parse_bemerkung(
        "row_031", "Vater v Herbert Cram", index
    )
    assert len(relations) == 1
    relation = relations[0]
    assert relation["personId"] == "row_031"
    assert relation["relatedPersonId"] == "row_028"
    assert relation["type"] == "PARENT_OF"
def test_parse_bemerkung_unmatched_parent_name():
    """A parent name absent from the index is reported as ``not_found``."""
    # No persons registered, so the index has nothing to match against.
    _, index = _register()
    relations, unresolved, notes = persons_tree._parse_bemerkung(
        "row_004", "Sohn v Elsgard A.", index
    )
    assert relations == []
    assert len(unresolved) == 1
    assert unresolved[0]["reason"] == "not_found"
    assert notes == ""
def test_parse_bemerkung_skip_nichte():
    """A "Nichte von" remark creates no relation and is returned as notes."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Nichte von Herbert"
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_002", remark, index)
    assert relations == []
    assert unresolved == []
    assert notes == "Nichte von Herbert"
def test_parse_bemerkung_skip_bruder():
    """A "Bruder v" remark creates no relation and is returned as notes."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Bruder v Herbert"
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_033", remark, index)
    assert relations == []
    assert unresolved == []
    assert notes == "Bruder v Herbert"
def test_parse_bemerkung_empty():
    """An empty remark yields no relations, nothing unresolved, empty notes."""
    _, index = _register()
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_004", "", index)
    assert relations == []
    assert unresolved == []
    assert notes == ""
def test_parse_bemerkung_plain_remark():
    """A remark without a relationship phrase is passed through as notes."""
    _, index = _register()
    remark = "Verfasserin der Cram-Chronik !!"
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_029", remark, index)
    assert relations == []
    assert unresolved == []
    assert notes == "Verfasserin der Cram-Chronik !!"