Import normalizer: offline tool to normalize the raw archive spreadsheets #663

Merged
marcel merged 172 commits from docs/import-migration into main 2026-05-28 15:05:51 +02:00
2 changed files with 4 additions and 1 deletion
Showing only changes of commit 3066d3d3ff - Show all commits

View File

@@ -60,7 +60,7 @@ def triage(cells: list[str], index_col: int = 0) -> Triage:
nonempty = [c for c in cells if c and str(c).strip()] nonempty = [c for c in cells if c and str(c).strip()]
if not nonempty: if not nonempty:
return Triage.EMPTY return Triage.EMPTY
index = (cells[index_col] or "").strip() if index_col < len(cells) else "" index = (cells[index_col] or "").strip() if 0 <= index_col < len(cells) else ""
if not index: if not index:
return Triage.BLANK_INDEX return Triage.BLANK_INDEX
if index.endswith("x"): if index.endswith("x"):
@@ -78,6 +78,7 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
def index_file_mismatch(index: str, file_path: str) -> bool: def index_file_mismatch(index: str, file_path: str) -> bool:
# Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
if not file_path.strip(): if not file_path.strip():
return False return False
basename = file_path.replace("\\", "/").rsplit("/", 1)[-1] basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]

View File

@@ -29,3 +29,5 @@ def test_index_file_mismatch():
assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
assert documents.index_file_mismatch("W-0001", "") is False assert documents.index_file_mismatch("W-0001", "") is False
assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path
assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir