diff --git a/tools/import-normalizer/documents.py b/tools/import-normalizer/documents.py index e33f2901..f9b56e72 100644 --- a/tools/import-normalizer/documents.py +++ b/tools/import-normalizer/documents.py @@ -60,7 +60,7 @@ def triage(cells: list[str], index_col: int = 0) -> Triage: nonempty = [c for c in cells if c and str(c).strip()] if not nonempty: return Triage.EMPTY - index = (cells[index_col] or "").strip() if index_col < len(cells) else "" + index = (cells[index_col] or "").strip() if 0 <= index_col < len(cells) else "" if not index: return Triage.BLANK_INDEX if index.endswith("x"): @@ -78,6 +78,7 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str: def index_file_mismatch(index: str, file_path: str) -> bool: + # Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf). if not file_path.strip(): return False basename = file_path.replace("\\", "/").rsplit("/", 1)[-1] diff --git a/tools/import-normalizer/tests/test_documents.py b/tools/import-normalizer/tests/test_documents.py index 4c4f76a4..ec3066d6 100644 --- a/tools/import-normalizer/tests/test_documents.py +++ b/tools/import-normalizer/tests/test_documents.py @@ -29,3 +29,5 @@ def test_index_file_mismatch(): assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False assert documents.index_file_mismatch("W-0001", "") is False + assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path + assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir