refactor(normalizer): harden triage index guard + index_file_mismatch tests

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 14:15:50 +02:00
parent 3e7ddea90a
commit 3066d3d3ff
2 changed files with 4 additions and 1 deletion

View File

@@ -60,7 +60,7 @@ def triage(cells: list[str], index_col: int = 0) -> Triage:
nonempty = [c for c in cells if c and str(c).strip()]
if not nonempty:
return Triage.EMPTY
index = (cells[index_col] or "").strip() if index_col < len(cells) else ""
index = (cells[index_col] or "").strip() if 0 <= index_col < len(cells) else ""
if not index:
return Triage.BLANK_INDEX
if index.endswith("x"):
@@ -78,6 +78,7 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
def index_file_mismatch(index: str, file_path: str) -> bool:
# Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
if not file_path.strip():
return False
basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]

View File

@@ -29,3 +29,5 @@ def test_index_file_mismatch():
assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
assert documents.index_file_mismatch("W-0001", "") is False
assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path
assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir