2026-05-28 15:05:51 +02:00
2 changed files with 4 additions and 1 deletions
--- a/tools/import-normalizer/documents.py
+++ b/tools/import-normalizer/documents.py
@@ -60,7 +60,7 @@ def triage(cells: list[str], index_col: int = 0) -> Triage:
    nonempty = [c for c in cells if c and str(c).strip()]
    if not nonempty:
        return Triage.EMPTY
-    index = (cells[index_col] or "").strip() if index_col < len(cells) else ""
+    index = (cells[index_col] or "").strip() if 0 <= index_col < len(cells) else ""
    if not index:
        return Triage.BLANK_INDEX
    if index.endswith("x"):
@@ -78,6 +78,7 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
 def index_file_mismatch(index: str, file_path: str) -> bool:
    # Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
    if not file_path.strip():
        return False
    basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
--- a/tools/import-normalizer/tests/test_documents.py
+++ b/tools/import-normalizer/tests/test_documents.py
@@ -29,3 +29,5 @@ def test_index_file_mismatch():
    assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
    assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
    assert documents.index_file_mismatch("W-0001", "") is False
    assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False  # unix path
    assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False         # no dir