# Unit tests for the private helpers of ``persons_tree``: year and generation
# parsing, name normalisation, the name index, spouse resolution, row parsing,
# de-duplication and "Bemerkung" (remark) parsing.
import sys
from pathlib import Path

# Make the parent directory importable so ``persons_tree`` resolves when the
# tests are run from inside this directory; must happen before the import below.
sys.path.insert(0, str(Path(__file__).parent.parent))

import persons_tree  # noqa: E402  (import must follow the sys.path tweak)


# --------------------------------------------------------------------------
# _parse_year
# --------------------------------------------------------------------------

def test_parse_year_iso_string():
    """An ISO date string yields its year."""
    assert persons_tree._parse_year("1920-09-20") == 1920


def test_parse_year_excel_serial_birth():
    """A numeric Excel serial is converted to the year of the date it encodes."""
    # 7568 days from 1899-12-30 = 1920-09-19 or -20 depending on leap counting
    assert persons_tree._parse_year("7568") == 1920


def test_parse_year_excel_serial_death():
    """A larger Excel serial resolves to a late-20th-century year."""
    # 36222 days from 1899-12-30 ≈ 1999
    assert persons_tree._parse_year("36222") == 1999


def test_parse_year_excel_serial_small():
    """A very small serial is still treated as a serial."""
    # 177 days from 1899-12-30 = 1900-06-25
    assert persons_tree._parse_year("177") == 1900


def test_parse_year_german_date_string():
    """A German ``d.m.yyyy`` date yields its year."""
    assert persons_tree._parse_year("30.8.1862") == 1862


def test_parse_year_year_only():
    """A bare four-digit year is returned as-is."""
    assert persons_tree._parse_year("1930") == 1930


def test_parse_year_free_text():
    """A year embedded in free text is extracted."""
    assert persons_tree._parse_year("August 1941") == 1941


def test_parse_year_none():
    """``None`` input yields ``None``."""
    assert persons_tree._parse_year(None) is None


def test_parse_year_empty():
    """An empty string yields ``None``."""
    assert persons_tree._parse_year("") is None


def test_parse_year_unresolvable_truncated():
    """A truncated date without a four-digit year is unresolvable."""
    # "2.9.196" has no valid 4-digit year — returns None
    assert persons_tree._parse_year("2.9.196") is None


def test_parse_year_typo_year():
    """A year outside the accepted range is rejected."""
    # "4.3.1023" — year 1023 outside 1700-2100 guard — returns None
    assert persons_tree._parse_year("4.3.1023") is None


def test_parse_year_bare_out_of_range_year_is_none():
    """A bare out-of-range year must not be re-read as an Excel serial."""
    # "1023" is a plausible typo for "1923" but is NOT an Excel serial.
    # parse_date("1023") parses it as year 1023 (out of 1700-2100 guard).
    # The serial branch must NOT re-interpret it as a serial.
    assert persons_tree._parse_year("1023") is None


# --------------------------------------------------------------------------
# _parse_generation
# --------------------------------------------------------------------------

def test_parse_generation_space():
    """``"G 3"`` parses to generation 3."""
    assert persons_tree._parse_generation("G 3") == 3


def test_parse_generation_no_space():
    """``"G3"`` (no separator) parses to generation 3."""
    assert persons_tree._parse_generation("G3") == 3


def test_parse_generation_extra_spaces():
    """Generation 0 is parsed (and is distinct from ``None``)."""
    # NOTE(review): the test name implies several spaces between "G" and the
    # digit, but the literal has exactly one — confirm the intended input.
    assert persons_tree._parse_generation("G 0") == 0


def test_parse_generation_trailing_garbage():
    """Text after the generation number is ignored."""
    assert persons_tree._parse_generation("G 2 de Gruyter") == 2


def test_parse_generation_empty():
    """An empty string yields ``None``."""
    assert persons_tree._parse_generation("") is None


def test_parse_generation_none():
    """``None`` input yields ``None``."""
    assert persons_tree._parse_generation(None) is None


# --------------------------------------------------------------------------
# _norm_tree
# --------------------------------------------------------------------------

def test_norm_tree_basic():
    """Names are lower-cased."""
    assert persons_tree._norm_tree("Werner Allemeyer") == "werner allemeyer"


def test_norm_tree_diacritics():
    """German umlauts are transliterated (ö -> oe)."""
    assert persons_tree._norm_tree("Wöhler") == "woehler"


def test_norm_tree_strips_parens():
    """Parenthesised parts are removed."""
    assert persons_tree._norm_tree("Otto (Herbert)") == "otto"


def test_norm_tree_strips_quotes():
    """Surrounding double quotes are removed."""
    assert persons_tree._norm_tree('"Tante Lolly"') == "tante lolly"


def test_norm_tree_strips_geographic_suffix():
    """A trailing place name ("Aachen") is dropped."""
    assert persons_tree._norm_tree("Walter Cram Aachen") == "walter cram"


def test_norm_tree_strips_mexiko():
    """A trailing "Mexiko" is dropped."""
    assert persons_tree._norm_tree("Hans Cram Mexiko") == "hans cram"


def test_norm_tree_collapses_whitespace():
    """Leading/trailing whitespace is stripped from the normalised name."""
    # NOTE(review): the test name implies runs of inner whitespace, but the
    # literal only has single spaces — confirm the intended input.
    assert persons_tree._norm_tree(" Clara de Gruyter ") == "clara de gruyter"


# --------------------------------------------------------------------------
# _build_index
# --------------------------------------------------------------------------

def test_build_index_forward_lookup():
    """"first last" is an index key mapping to the person's row id."""
    persons = [
        {"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    ]
    idx = persons_tree._build_index(persons)
    assert "werner allemeyer" in idx
    assert idx["werner allemeyer"] == ["row_002"]


def test_build_index_reversed_lookup():
    """"last first" is indexed as well."""
    persons = [
        {"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    ]
    idx = persons_tree._build_index(persons)
    assert idx.get("allemeyer werner") == ["row_002"]


def test_build_index_maiden_name_lookup():
    """"first maiden-name" is indexed (after normalisation)."""
    persons = [
        {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "maidenName": "Wöhler"}
    ]
    idx = persons_tree._build_index(persons)
    assert idx.get("elsgard woehler") == ["row_002"]


def test_build_index_single_token_fallback():
    """The bare last name is indexed as a single-token key."""
    persons = [
        {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    ]
    idx = persons_tree._build_index(persons)
    assert idx.get("cram") == ["row_028"]


def test_build_index_ambiguous_single_token():
    """A shared last name maps to every matching row id."""
    persons = [
        {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None},
        {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None},
    ]
    idx = persons_tree._build_index(persons)
    assert set(idx["cram"]) == {"row_028", "row_019"}


# --------------------------------------------------------------------------
# _resolve_one
# --------------------------------------------------------------------------

def test_resolve_one_found():
    """A unique match returns the row id and no failure reason."""
    persons = [
        {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    ]
    idx = persons_tree._build_index(persons)
    row_id, reason = persons_tree._resolve_one("Allemeyer Werner", idx)
    assert row_id == "row_003"
    assert reason is None


def test_resolve_one_not_found():
    """A name missing from the index is reported as ``not_found``."""
    idx = {}
    row_id, reason = persons_tree._resolve_one("Nobody Unknown", idx)
    assert row_id is None
    assert reason == "not_found"


def test_resolve_one_ambiguous():
    """A key with several candidates is reported as ``ambiguous``."""
    persons = [
        {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None},
        {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None},
    ]
    idx = persons_tree._build_index(persons)
    row_id, reason = persons_tree._resolve_one("Cram", idx)
    assert row_id is None
    assert reason == "ambiguous"


# --------------------------------------------------------------------------
# _parse_row
# --------------------------------------------------------------------------

def test_parse_row_serial_dates():
    """A full row with Excel-serial dates maps onto the person dict."""
    fields = {
        "generation": "G 3",
        "last_name": "Allemeyer",
        "first_name": "Elsgard",
        "maiden_name": "Wöhler",
        "birth_date": "7568",
        "birth_place": "Garz",
        "death_date": "36222",
        "death_place": "Espelkamp",
        "spouse": "Allemeyer Werner",
        "notes": "Nichte von Herbert",
    }
    p = persons_tree._parse_row(2, fields)
    assert p["rowId"] == "row_002"
    assert p["firstName"] == "Elsgard"
    assert p["lastName"] == "Allemeyer"
    assert p["maidenName"] == "Wöhler"
    assert p["birthYear"] == 1920
    assert p["deathYear"] == 1999
    assert p["birthPlace"] == "Garz"
    assert p["deathPlace"] == "Espelkamp"
    assert p["generation"] == 3
    assert p["familyMember"] is True
    assert p["_spouse_raw"] == "Allemeyer Werner"
    assert p["_bemerkung_raw"] == "Nichte von Herbert"
    # A resolvable birth date must not leave a "[Geburtsdatum ...]" marker in notes.
    assert "[Geburtsdatum" not in (p["notes"] or "")


def test_parse_row_string_birth_date():
    """A German date string is parsed; empty death date and notes stay empty."""
    fields = {
        "generation": "G 2",
        "last_name": "Cram",
        "first_name": "Herbert",
        "maiden_name": "",
        "birth_date": "25.6.1890",
        "birth_place": "Texas",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }
    p = persons_tree._parse_row(28, fields)
    assert p["birthYear"] == 1890
    assert p["deathYear"] is None
    # Either representation of "no notes" is accepted.
    assert p["notes"] in (None, "")


def test_parse_row_unresolvable_date_goes_to_notes():
    """An unparseable birth date is preserved verbatim inside the notes."""
    fields = {
        "generation": "G 3",
        "last_name": "Heydrich",
        "first_name": "Dieter",
        "maiden_name": "",
        "birth_date": "28.9.",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "Bruder v Ingrid",
    }
    p = persons_tree._parse_row(96, fields)
    assert p["birthYear"] is None
    assert "[Geburtsdatum: 28.9.]" in p["notes"]
    assert "Bruder v Ingrid" in p["notes"]


def test_parse_row_empty_spouse_and_notes():
    """Empty spouse/notes cells become ``None`` in the raw fields."""
    fields = {
        "generation": "G 4",
        "last_name": "Allemeyer",
        "first_name": "Jürgen",
        "maiden_name": "",
        "birth_date": "",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }
    p = persons_tree._parse_row(4, fields)
    assert p["_spouse_raw"] is None
    assert p["_bemerkung_raw"] is None


# --------------------------------------------------------------------------
# _deduplicate
# --------------------------------------------------------------------------

def test_deduplicate_no_duplicates():
    """Distinct persons are all kept and nothing is reported as skipped."""
    persons = [
        {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "birthYear": 1920},
        {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "birthYear": 1923},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert len(result) == 2
    assert skipped == []


def test_deduplicate_exact_match():
    """An exact duplicate is dropped and its row id appears in the skip entry."""
    # rows 127/138: same firstName, lastName, birthYear
    persons = [
        {"rowId": "row_127", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951},
        {"rowId": "row_138", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert [p["rowId"] for p in result] == ["row_127"]
    assert len(skipped) == 1
    assert "row_138" in skipped[0]


def test_deduplicate_none_birth_year_after_known():
    """A same-name row without birth year after a dated one is a duplicate."""
    # rows 129/139: row 129 has birthYear=1964, row 139 has birthYear=None
    persons = [
        {"rowId": "row_129", "firstName": "Christoph", "lastName": "Seils", "birthYear": 1964},
        {"rowId": "row_139", "firstName": "Christoph", "lastName": "Seils", "birthYear": None},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert [p["rowId"] for p in result] == ["row_129"]
    assert len(skipped) == 1


def test_deduplicate_both_none_birth_year_kept():
    """Two same-name rows without birth year collapse to the first one."""
    # Two people with no birth year but same name: keep first only
    persons = [
        {"rowId": "row_A", "firstName": "Hans", "lastName": "Cram", "birthYear": None},
        {"rowId": "row_B", "firstName": "Hans", "lastName": "Cram", "birthYear": None},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert [p["rowId"] for p in result] == ["row_A"]
    assert len(skipped) == 1


# --------------------------------------------------------------------------
# _resolve_spouses
# --------------------------------------------------------------------------

def _make_persons(*args):
    """Helper: args are (rowId, firstName, lastName, maidenName, spouse_raw) tuples.

    Returns one person dict per tuple; every field not supplied by the tuple
    is ``None``, except ``familyMember`` which is ``True``.
    """
    return [
        {
            "rowId": row_id,
            "firstName": first_name,
            "lastName": last_name,
            "maidenName": maiden_name,
            "_spouse_raw": spouse_raw,
            "_bemerkung_raw": None,
            "birthYear": None,
            "deathYear": None,
            "birthPlace": None,
            "deathPlace": None,
            "generation": None,
            "familyMember": True,
            "alias": None,
            "notes": None,
        }
        for row_id, first_name, last_name, maiden_name, spouse_raw in args
    ]


def test_resolve_spouses_success():
    """Two rows naming each other produce a single SPOUSE_OF relation."""
    persons = _make_persons(
        ("row_002", "Elsgard", "Allemeyer", "Wöhler", "Allemeyer Werner"),
        ("row_003", "Werner", "Allemeyer", None, "Elsgard Wöhler"),
    )
    idx = persons_tree._build_index(persons)
    rels, unres = persons_tree._resolve_spouses(persons, idx)
    assert len(rels) == 1
    assert rels[0]["type"] == "SPOUSE_OF"
    # Direction of the pair is not fixed by the test, hence the set comparison.
    assert {rels[0]["personId"], rels[0]["relatedPersonId"]} == {"row_002", "row_003"}
    assert unres == []


def test_resolve_spouses_not_found():
    """A spouse name with no index entry is reported as unresolved."""
    persons = _make_persons(
        ("row_007", "Charlotte", "Blomquist", "Ruge", '"Tante Lolly"'),
    )
    idx = persons_tree._build_index(persons)
    rels, unres = persons_tree._resolve_spouses(persons, idx)
    assert rels == []
    assert len(unres) == 1
    assert unres[0]["rowId"] == "row_007"
    assert unres[0]["reason"] == "not_found"


def test_resolve_spouses_empty_spouse_field():
    """A person without spouse text yields neither relation nor unresolved entry."""
    persons = _make_persons(
        ("row_004", "Jürgen", "Allemeyer", None, None),
    )
    idx = persons_tree._build_index(persons)
    rels, unres = persons_tree._resolve_spouses(persons, idx)
    assert rels == []
    assert unres == []


# --------------------------------------------------------------------------
# _parse_bemerkung
# --------------------------------------------------------------------------

def _register(*args):
    """Build index from (rowId, first, last, maiden) tuples.

    Returns ``(persons, index)`` where ``index`` is the result of
    ``persons_tree._build_index(persons)``.
    """
    persons = [
        {"rowId": row_id, "firstName": first, "lastName": last, "maidenName": maiden}
        for row_id, first, last, maiden in args
    ]
    return persons, persons_tree._build_index(persons)


def test_parse_bemerkung_sohn_two_parents():
    """"Sohn v A u B" yields a PARENT_OF relation from each parent to the child."""
    _, idx = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_021", "Sohn v Clara Cram u Herbert Cram", idx
    )
    assert len(rels) == 2
    assert all(r["type"] == "PARENT_OF" for r in rels)
    child_ids = {r["relatedPersonId"] for r in rels}
    parent_ids = {r["personId"] for r in rels}
    assert child_ids == {"row_021"}
    assert "row_019" in parent_ids
    assert "row_028" in parent_ids
    assert unres == []
    assert notes == ""


def test_parse_bemerkung_tochter_von():
    """"Tochter von X" yields exactly one fully populated PARENT_OF relation."""
    _, idx = _register(("row_019", "Clara", "Cram", None))
    rels, _unres, notes = persons_tree._parse_bemerkung(
        "row_036", "Tochter von Clara Cram", idx
    )
    assert len(rels) == 1
    assert rels[0] == {
        "personId": "row_019",
        "relatedPersonId": "row_036",
        "type": "PARENT_OF",
        "source": "bemerkung",
        "rawBemerkung": "Tochter von Clara Cram",
    }
    assert notes == ""


def test_parse_bemerkung_vater():
    """"Vater v X" makes the current row the parent of X."""
    _, idx = _register(("row_028", "Herbert", "Cram", None))
    rels, _unres, _notes = persons_tree._parse_bemerkung(
        "row_031", "Vater v Herbert Cram", idx
    )
    assert len(rels) == 1
    assert rels[0]["personId"] == "row_031"
    assert rels[0]["relatedPersonId"] == "row_028"
    assert rels[0]["type"] == "PARENT_OF"


def test_parse_bemerkung_unmatched_parent_name():
    """A parent name absent from the index is reported, not kept as a note."""
    _, idx = _register()  # empty index
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_004", "Sohn v Elsgard A.", idx
    )
    assert rels == []
    assert len(unres) == 1
    assert unres[0]["reason"] == "not_found"
    assert notes == ""


def test_parse_bemerkung_skip_nichte():
    """A niece remark creates no relation and is passed through as a note."""
    _, idx = _register(("row_028", "Herbert", "Cram", None))
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_002", "Nichte von Herbert", idx
    )
    assert rels == []
    assert unres == []
    assert notes == "Nichte von Herbert"


def test_parse_bemerkung_skip_bruder():
    """A brother remark creates no relation and is passed through as a note."""
    _, idx = _register(("row_028", "Herbert", "Cram", None))
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_033", "Bruder v Herbert", idx
    )
    assert rels == []
    assert unres == []
    assert notes == "Bruder v Herbert"


def test_parse_bemerkung_empty():
    """An empty remark yields empty results across the board."""
    _, idx = _register()
    rels, unres, notes = persons_tree._parse_bemerkung("row_004", "", idx)
    assert rels == []
    assert unres == []
    assert notes == ""


def test_parse_bemerkung_plain_remark():
    """A remark without a relationship keyword is returned unchanged as a note."""
    _, idx = _register()
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_029", "Verfasserin der Cram-Chronik !!", idx
    )
    assert rels == []
    assert unres == []
    assert notes == "Verfasserin der Cram-Chronik !!"
def test_parse_bemerkung_sohn_with_trailing_remark():
    """Text after the parent list (following a comma) is kept as the note."""
    _, idx = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_021", "Sohn v Clara Cram u Herbert Cram, nach Mexiko emigriert", idx
    )
    assert len(rels) == 2
    assert unres == []
    assert notes == "nach Mexiko emigriert"


def test_generated_at_is_fixed_for_reproducibility():
    """The module-level generation timestamp is a pinned constant."""
    # NFR-IDEM-01: a pinned timestamp so the committed tree JSON doesn't churn on every run
    assert persons_tree._GENERATED_AT == "2020-01-01T00:00:00"


def _blank_register_row(generation, last_name, first_name, maiden_name="", birth_date=""):
    """Return a raw register row dict with every unspecified cell set to ``""``.

    The key set and key order match the raw row dicts these tests pass to
    ``persons_tree._parse_row`` and ``persons_tree._attach_person_ids``.
    """
    return {
        "generation": generation,
        "last_name": last_name,
        "first_name": first_name,
        "maiden_name": maiden_name,
        "birth_date": birth_date,
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }


def test_attach_person_ids_propagates_register_slug():
    """Each tree person receives the register's person id for its row."""
    # the tree person must carry the register's verbatim person_id (slug), not a recomputed one
    raw_dicts = [
        _blank_register_row("G 1", "de Gruyter", "Walter"),
        _blank_register_row("G 1", "de Gruyter", "Eugenie", maiden_name="Müller"),
    ]
    # Row numbering starts at 2, matching the other tests in this file.
    tree_persons = [persons_tree._parse_row(n, d) for n, d in enumerate(raw_dicts, start=2)]
    persons_tree._attach_person_ids(tree_persons, raw_dicts)
    assert tree_persons[0]["personId"] == "de-gruyter-walter"
    assert tree_persons[1]["personId"] == "de-gruyter-eugenie"


def test_attach_person_ids_raises_on_length_divergence():
    """Mismatched list lengths raise ``ValueError`` mentioning "length"."""
    # The propagation is a positional zip; if tree_persons and the register drift in
    # length (e.g. a future filter change), zip would silently truncate and mis-join ids.
    # The guard must fail loudly instead.
    raw_dicts = [
        _blank_register_row("G 1", "de Gruyter", "Walter"),
        # second register row has a last name -> parse_register keeps it ...
        _blank_register_row("G 1", "de Gruyter", "Eugenie", maiden_name="Müller"),
    ]
    # ... but the tree side only has one person -> lengths diverge.
    tree_persons = [persons_tree._parse_row(2, raw_dicts[0])]
    import pytest

    with pytest.raises(ValueError, match="length"):
        persons_tree._attach_person_ids(tree_persons, raw_dicts)


def test_attach_person_ids_carries_register_collision_suffix():
    """Colliding slugs keep the register's ``-1`` / ``-2`` suffixes."""
    # when two register rows slug-collide, the register suffixes the ids (-1, -2);
    # those exact suffixed ids must reach the tree persons, never a recomputed bare slug
    raw_dicts = [
        _blank_register_row("G 2", "Cram", "Hans", birth_date="1890"),
        _blank_register_row("G 3", "Cram", "Hans", birth_date="1925"),
    ]
    tree_persons = [persons_tree._parse_row(n, d) for n, d in enumerate(raw_dicts, start=2)]
    persons_tree._attach_person_ids(tree_persons, raw_dicts)
    assert tree_persons[0]["personId"] == "cram-hans-1"
    assert tree_persons[1]["personId"] == "cram-hans-2"


import subprocess  # noqa: E402  (kept at its original mid-file position)


def test_dry_run_exits_zero(tmp_path):
    """dry-run should complete without writing any file and exit 0."""
    input_path = Path(__file__).parent.parent.parent.parent / "import" / "Personendatei 2.xlsx"
    if not input_path.exists():
        import pytest

        pytest.skip("source Excel file not present")
    script_path = Path(__file__).parent.parent / "persons_tree.py"
    output_path = tmp_path / "out.json"
    result = subprocess.run(
        [
            sys.executable,
            str(script_path),
            "--input",
            str(input_path),
            "--output",
            str(output_path),
            "--dry-run",
        ],
        capture_output=True,
        text=True,
        # The return code is asserted below so that stderr shows up in the
        # failure message; do not let run() raise on a non-zero exit.
        check=False,
    )
    assert result.returncode == 0, result.stderr
    assert not output_path.exists()
    assert "persons parsed" in result.stdout