refactor(normalizer): harden triage index guard + index_file_mismatch tests
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -60,7 +60,7 @@ def triage(cells: list[str], index_col: int = 0) -> Triage:
|
||||
nonempty = [c for c in cells if c and str(c).strip()]
|
||||
if not nonempty:
|
||||
return Triage.EMPTY
|
||||
index = (cells[index_col] or "").strip() if index_col < len(cells) else ""
|
||||
index = (cells[index_col] or "").strip() if 0 <= index_col < len(cells) else ""
|
||||
if not index:
|
||||
return Triage.BLANK_INDEX
|
||||
if index.endswith("x"):
|
||||
@@ -78,6 +78,7 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
|
||||
|
||||
|
||||
def index_file_mismatch(index: str, file_path: str) -> bool:
|
||||
# Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
|
||||
if not file_path.strip():
|
||||
return False
|
||||
basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
|
||||
|
||||
@@ -29,3 +29,5 @@ def test_index_file_mismatch():
|
||||
assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
|
||||
assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
|
||||
assert documents.index_file_mismatch("W-0001", "") is False
|
||||
assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path
|
||||
assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir
|
||||
|
||||
Reference in New Issue
Block a user