feat(normalizer): unresolved-names report + fix ambiguous-pair over-flagging
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -10,7 +10,7 @@ def _doc_wb(tmp_path):
|
||||
"Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"])
|
||||
ws.append(["W-0001x", r"..\__scan\W-0001x.pdf", "", "", "Walter de Gruyter", "Eugenie Müller", "", "", "", ""])
|
||||
ws.append(["", "", "", "", "Section banner row", "", "", "", "", ""])
|
||||
ws.append(["C-0001", "", "", "", "Hans Wittkopf", "", "Freitag 1919", "", "", ""])
|
||||
ws.append(["C-0001", "", "", "", "Hans Wittkopf", "?", "Freitag 1919", "", "", ""])
|
||||
ws.append(["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
|
||||
"Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "dup"])
|
||||
p = tmp_path / "docs.xlsx"; wb.save(p); return p
|
||||
@@ -42,6 +42,11 @@ def test_run_end_to_end(tmp_path):
|
||||
assert (review_dir / "unparsed-dates.csv").exists()
|
||||
# C-0001's "Freitag 1919" is unparseable -> must appear in the review file (NFR-DATA-01)
|
||||
assert "Freitag 1919" in (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8")
|
||||
assert (out_dir / "canonical-documents.xlsx").exists() # (keep existing asserts above)
|
||||
assert (review_dir / "unresolved-names.csv").exists()
|
||||
unresolved_text = (review_dir / "unresolved-names.csv").read_text(encoding="utf-8")
|
||||
assert "unknown" in unresolved_text and "?" in unresolved_text # the "?" receiver
|
||||
assert not (review_dir / "ambiguous-receivers.csv").exists() # replaced
|
||||
|
||||
# determinism (NFR-IDEM-01): a second run yields identical canonical content + review files
|
||||
def _matrix(p):
|
||||
|
||||
Reference in New Issue
Block a user