feat(normalizer): orchestrator + end-to-end integration test
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
59
tools/import-normalizer/tests/test_normalize.py
Normal file
59
tools/import-normalizer/tests/test_normalize.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import openpyxl
|
||||
import normalize
|
||||
|
||||
|
||||
def _doc_wb(tmp_path):
    """Write the document workbook fixture and return its path.

    The single sheet "Familienarchiv" holds a header row followed by five
    data rows: a fully populated W-0001, an "x"-suffixed W-0001x, a row with
    an empty index (a section banner), C-0001 with the free-text date
    "Freitag 1919", and a second W-0001 row whose content is "dup".

    The file is saved as ``docs.xlsx`` inside *tmp_path*.
    """
    header = [
        "Index", "Datei", "Box", "Mappe", "BriefeschreiberIn", "EmpfängerIn",
        "Datum des Briefes", "Ort", "Schlagwort", "Inhalt",
    ]
    rows = [
        ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
         "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"],
        ["W-0001x", r"..\__scan\W-0001x.pdf", "", "", "Walter de Gruyter",
         "Eugenie Müller", "", "", "", ""],
        ["", "", "", "", "Section banner row", "", "", "", "", ""],
        ["C-0001", "", "", "", "Hans Wittkopf", "", "Freitag 1919", "", "", ""],
        ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
         "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "dup"],
    ]

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Familienarchiv"
    sheet.append(header)
    for row in rows:
        sheet.append(row)

    target = tmp_path / "docs.xlsx"
    workbook.save(target)
    return target
|
||||
|
||||
|
||||
def _person_wb(tmp_path):
    """Write the person workbook fixture and return its path.

    The single sheet "Tabelle1" holds a header row and two persons of
    generation "G 1" with family name "de Gruyter": Walter, and Eugenie
    (whose "geb als" column is "Müller"). All remaining columns are empty
    strings.

    The file is saved as ``persons.xlsx`` inside *tmp_path*.
    """
    header = [
        "Generation", "Familienname", "Vorname", "geb als", "Geburtsdatum",
        "Geburtsort", "Todesdatum", "Sterbeort", "verheiratet mit", "Bemerkung",
    ]
    persons = [
        ["G 1", "de Gruyter", "Walter", "", "", "", "", "", "", ""],
        ["G 1", "de Gruyter", "Eugenie", "Müller", "", "", "", "", "", ""],
    ]

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Tabelle1"
    sheet.append(header)
    for person in persons:
        sheet.append(person)

    target = tmp_path / "persons.xlsx"
    workbook.save(target)
    return target
|
||||
|
||||
|
||||
def test_run_end_to_end(tmp_path):
    """Run the normalizer on the fixture workbooks and check outputs, stats and determinism.

    The first run must produce both canonical workbooks, the expected
    counters, and the review CSVs. A second run over freshly written (but
    identical) fixtures must reproduce the same canonical cell values and the
    same unparsed-dates review file.
    """
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    canonical_docs = out_dir / "canonical-documents.xlsx"
    canonical_persons = out_dir / "canonical-persons.xlsx"
    unparsed_csv = review_dir / "unparsed-dates.csv"

    def _run_once():
        # Fixtures are rebuilt for every run, exactly as two independent invocations would see them.
        return normalize.run(
            document_workbook=_doc_wb(tmp_path),
            document_sheet="Familienarchiv",
            person_workbook=_person_wb(tmp_path),
            person_sheet="Tabelle1",
            out_dir=out_dir,
            review_dir=review_dir,
            date_overrides={},
            name_overrides={},
        )

    def _matrix(p):
        # Cell values of the active sheet as a list of row lists.
        sheet = openpyxl.load_workbook(p).active
        grid = []
        for cells in sheet.iter_rows():
            grid.append([cell.value for cell in cells])
        return grid

    stats = _run_once()

    assert canonical_docs.exists()
    assert canonical_persons.exists()
    # W-0001, C-0001, W-0001 (dup) are emitted; the x-suffixed and blank-index rows are excluded
    assert stats["documents_emitted"] == 3
    assert stats["skipped_x_suffix"] == 1
    assert stats["blank_index_rows"] == 1
    assert stats["duplicate_index_rows"] == 2
    assert (review_dir / "skipped-x-suffix.csv").exists()
    assert unparsed_csv.exists()
    # C-0001's "Freitag 1919" is unparseable -> must appear in the review file (NFR-DATA-01)
    assert "Freitag 1919" in unparsed_csv.read_text(encoding="utf-8")

    # determinism (NFR-IDEM-01): a second run yields identical canonical content + review files
    docs1 = _matrix(canonical_docs)
    persons1 = _matrix(canonical_persons)
    unparsed1 = unparsed_csv.read_text(encoding="utf-8")

    _run_once()

    assert _matrix(canonical_docs) == docs1
    assert _matrix(canonical_persons) == persons1
    assert unparsed_csv.read_text(encoding="utf-8") == unparsed1
    assert len(docs1) == 4  # header + 3 docs
|
||||
Reference in New Issue
Block a user