Files
familienarchiv/tools/import-normalizer/tests/test_normalize.py
Marcel 94a40237f4 feat(normalizer): generate structured tags from Schlagwort + Inhalt fields
Adds tags.py module implementing a three-outcome heuristic:
- Individual-to-individual correspondence tags ("Clara an Herbert") → dropped
- Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value>
- Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value>

Three correspondence patterns detected: space-an-space, starts-with-"an ",
and abbreviated-sender form ("Maria W.an Clara").

COLLECTIVE_TERMS in config.py extended with 17 plural/group relational terms
(söhne, brüder, schwiegereltern, cousinen, etc.) confirmed against the full Excel.

Also adds two-phase summary mining: every run emits review/tag-candidates.csv;
subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags.

Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths;
canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-25 19:47:36 +02:00

122 lines
6.4 KiB
Python

import openpyxl
import normalize
def _doc_wb(tmp_path):
    """Write the document workbook fixture and return its path.

    The workbook has a single sheet titled "Familienarchiv" holding one
    header row followed by five data rows, and is saved as ``docs.xlsx``
    inside ``tmp_path``.
    """
    header = ["Index", "Datei", "Box", "Mappe", "BriefeschreiberIn", "EmpfängerIn",
              "Datum des Briefes", "Ort", "Schlagwort", "Inhalt"]
    rows = [
        # Fully populated letter.
        ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
         "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"],
        # Index carrying an "x" suffix.
        ["W-0001x", r"..\__scan\W-0001x.pdf", "", "", "Walter de Gruyter",
         "Eugenie Müller", "", "", "", ""],
        # Row without an index value.
        ["", "", "", "", "Section banner row", "", "", "", "", ""],
        # "?" receiver and a free-text date.
        ["C-0001", "", "", "", "Hans Wittkopf", "?", "Freitag 1919", "", "", ""],
        # Same index as the first row, different Inhalt.
        ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
         "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "dup"],
    ]

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Familienarchiv"
    for row in [header, *rows]:
        sheet.append(row)

    target = tmp_path / "docs.xlsx"
    workbook.save(target)
    return target
def _person_wb(tmp_path):
    """Write the person workbook fixture and return its path.

    The workbook has a single sheet titled "Tabelle1" holding one header
    row and two person rows, and is saved as ``persons.xlsx`` inside
    ``tmp_path``.
    """
    header = ["Generation", "Familienname", "Vorname", "geb als", "Geburtsdatum",
              "Geburtsort", "Todesdatum", "Sterbeort", "verheiratet mit", "Bemerkung"]
    people = [
        ["G 1", "de Gruyter", "Walter", "", "", "", "", "", "", ""],
        ["G 1", "de Gruyter", "Eugenie", "Müller", "", "", "", "", "", ""],
    ]

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Tabelle1"
    for row in [header, *people]:
        sheet.append(row)

    target = tmp_path / "persons.xlsx"
    workbook.save(target)
    return target
def test_run_end_to_end(tmp_path):
    """Run the normalizer on the fixtures and check outputs, stats and determinism."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    docs_xlsx = out_dir / "canonical-documents.xlsx"
    persons_xlsx = out_dir / "canonical-persons.xlsx"
    unparsed_csv = review_dir / "unparsed-dates.csv"
    unresolved_csv = review_dir / "unresolved-names.csv"

    def _run():
        # Both fixture workbooks are rebuilt on every invocation.
        return normalize.run(
            document_workbook=_doc_wb(tmp_path),
            document_sheet="Familienarchiv",
            person_workbook=_person_wb(tmp_path),
            person_sheet="Tabelle1",
            out_dir=out_dir,
            review_dir=review_dir,
            date_overrides={},
            name_overrides={},
        )

    def _cell_grid(path):
        # Active sheet as a list of rows, each row a list of cell values.
        sheet = openpyxl.load_workbook(path).active
        return [list(values) for values in sheet.iter_rows(values_only=True)]

    stats = _run()

    assert docs_xlsx.exists()
    assert persons_xlsx.exists()

    # W-0001, C-0001, W-0001 (dup) — x and blank excluded
    assert stats["documents_emitted"] == 3
    assert stats["skipped_x_suffix"] == 1
    assert stats["blank_index_rows"] == 1
    assert stats["duplicate_index_rows"] == 2
    # the "?" receiver is an UNKNOWN-class name
    assert stats["unresolved_unknown"] >= 1

    assert (review_dir / "skipped-x-suffix.csv").exists()
    assert unparsed_csv.exists()
    # C-0001's "Freitag 1919" is unparseable -> must appear in the review file (NFR-DATA-01)
    assert "Freitag 1919" in unparsed_csv.read_text(encoding="utf-8")

    assert unresolved_csv.exists()
    unresolved_text = unresolved_csv.read_text(encoding="utf-8")
    # the "?" receiver
    assert "unknown" in unresolved_text
    assert "?" in unresolved_text
    # replaced
    assert not (review_dir / "ambiguous-receivers.csv").exists()

    # determinism (NFR-IDEM-01): a second run yields identical canonical content + review files
    first_docs = _cell_grid(docs_xlsx)
    first_persons = _cell_grid(persons_xlsx)
    first_unparsed = unparsed_csv.read_text(encoding="utf-8")

    _run()

    assert _cell_grid(docs_xlsx) == first_docs
    assert _cell_grid(persons_xlsx) == first_persons
    assert unparsed_csv.read_text(encoding="utf-8") == first_unparsed
    # header + 3 docs
    assert len(first_docs) == 4
def test_tag_tree_output_emitted(tmp_path):
    """A run must write ``canonical-tag-tree.xlsx`` into the output directory."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"

    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
    )

    tag_tree = out_dir / "canonical-tag-tree.xlsx"
    assert tag_tree.exists()
def test_tag_candidates_review_emitted(tmp_path):
    """A run must write ``tag-candidates.csv`` mentioning "candidate" and "count"."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"

    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
    )

    candidates_csv = review_dir / "tag-candidates.csv"
    assert candidates_csv.exists()

    contents = candidates_csv.read_text(encoding="utf-8")
    assert "candidate" in contents
    assert "count" in contents
def test_schlagwort_encoded_as_themen_in_documents(tmp_path):
    """The "Brautbriefe" Schlagwort must appear as ``Themen/Brautbriefe``.

    No cell in the ``tags`` column of ``canonical-documents.xlsx`` may hold
    the bare value "Brautbriefe" on its own.
    """
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"

    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
    )

    sheet = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx").active
    column_names = [cell.value for cell in sheet[1]]
    tags_column = column_names.index("tags") + 1  # worksheet columns are 1-based

    # Read only the "tags" column, starting below the header row.
    tag_values = [
        value
        for (value,) in sheet.iter_rows(
            min_row=2, min_col=tags_column, max_col=tags_column, values_only=True
        )
    ]

    prefixed = [v for v in tag_values if v and "Themen/Brautbriefe" in v]
    bare = [v for v in tag_values if v and v.strip() == "Brautbriefe"]
    assert prefixed
    assert not bare
def test_approved_themes_applied(tmp_path):
    """A keyword listed in the approved-themes file must surface as a Themen tag."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"

    # Approved-themes CSV: a "candidate" header line followed by one keyword.
    themes_file = tmp_path / "approved-themes.csv"
    themes_file.write_text("candidate\ngeschäftsreise\n", encoding="utf-8")

    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
        approved_themes_path=themes_file,
    )

    sheet = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx").active
    column_names = [cell.value for cell in sheet[1]]
    tags_column = column_names.index("tags") + 1  # worksheet columns are 1-based

    # Read only the "tags" column, starting below the header row.
    tag_values = [
        value
        for (value,) in sheet.iter_rows(
            min_row=2, min_col=tags_column, max_col=tags_column, values_only=True
        )
    ]

    # W-0001 has Inhalt "Geschäftsreise" — should get an extra Themen/geschäftsreise tag
    matching = [v for v in tag_values if v and "Themen/geschäftsreise" in v]
    assert matching