feat(normalizer): row extraction, triage, canonical record
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
31
tools/import-normalizer/tests/test_documents.py
Normal file
31
tools/import-normalizer/tests/test_documents.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import documents
|
||||
from documents import Triage
|
||||
|
||||
def test_extract_row():
    """Check that extract_row exposes cell values and the source row as attributes."""
    # Column names in sheet order; each name maps to its positional index.
    column_order = ("index", "file", "box", "folder", "sender",
                    "receivers", "date", "location", "tags", "summary")
    column_of = {name: position for position, name in enumerate(column_order)}
    row_cells = [
        "W-0001",
        r"..\__scan\W-0001.pdf",
        "V",
        "1",
        "Walter de Gruyter",
        "Eugenie Müller",
        "15.2.1888",
        "Rotterdam",
        "Brautbriefe",
        "Geschäftsreise",
    ]

    record = documents.extract_row(row_cells, column_of, source_row=3)

    # Checked one attribute at a time, in the same order as before.
    expected_fields = (
        ("index", "W-0001"),
        ("sender", "Walter de Gruyter"),
        ("date", "15.2.1888"),
        ("source_row", 3),
    )
    for attribute, wanted in expected_fields:
        assert getattr(record, attribute) == wanted
|
||||
|
||||
def test_triage():
    """Check the Triage member that documents.triage returns for sample rows."""
    scenarios = (
        (["", "", ""], Triage.EMPTY),
        (["", "", "Walter"], Triage.BLANK_INDEX),  # data but no index
        (["W-0001x", "x"], Triage.X_SUFFIX),
        (["W-0001", "x"], Triage.OK),
    )
    for row_cells, verdict in scenarios:
        assert documents.triage(row_cells) == verdict
|
||||
|
||||
def test_classify_blank_index():
    """Check the labels classify_blank_index gives to two index-less rows."""
    column_of = {"sender": 4, "receivers": 5}
    # Only the sender column is filled in.
    banner_row = ["", "", "", "", "Brautbriefe von Walter an Eugenie", ""]
    # Sender is empty, but other columns (including receivers) have values.
    data_row = ["", "", "V", "1", "", "Eugenie"]

    labelled_rows = (
        (banner_row, "section_banner"),
        (data_row, "data_no_index"),
    )
    for row_cells, label in labelled_rows:
        assert documents.classify_blank_index(row_cells, column_of) == label
|
||||
|
||||
def test_index_file_mismatch():
    """Check index_file_mismatch for a differing, a matching and an empty file path."""
    scenarios = (
        ("W-0010x", r"..\__scan\W-0011x.pdf", True),
        ("W-0001", r"..\__scan\W-0001.pdf", False),
        ("W-0001", "", False),
    )
    for index_value, file_path, mismatch in scenarios:
        # Identity check keeps the strict `is True` / `is False` contract.
        assert documents.index_file_mismatch(index_value, file_path) is mismatch
|
||||
Reference in New Issue
Block a user