refactor(normalizer): harden triage index guard + index_file_mismatch tests
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -60,7 +60,7 @@ def triage(cells: list[str], index_col: int = 0) -> Triage:
|
|||||||
nonempty = [c for c in cells if c and str(c).strip()]
|
nonempty = [c for c in cells if c and str(c).strip()]
|
||||||
if not nonempty:
|
if not nonempty:
|
||||||
return Triage.EMPTY
|
return Triage.EMPTY
|
||||||
index = (cells[index_col] or "").strip() if index_col < len(cells) else ""
|
index = (cells[index_col] or "").strip() if 0 <= index_col < len(cells) else ""
|
||||||
if not index:
|
if not index:
|
||||||
return Triage.BLANK_INDEX
|
return Triage.BLANK_INDEX
|
||||||
if index.endswith("x"):
|
if index.endswith("x"):
|
||||||
@@ -78,6 +78,7 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def index_file_mismatch(index: str, file_path: str) -> bool:
|
def index_file_mismatch(index: str, file_path: str) -> bool:
|
||||||
|
# Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
|
||||||
if not file_path.strip():
|
if not file_path.strip():
|
||||||
return False
|
return False
|
||||||
basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
|
basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
|
||||||
|
|||||||
@@ -29,3 +29,5 @@ def test_index_file_mismatch():
|
|||||||
assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
|
assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
|
||||||
assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
|
assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
|
||||||
assert documents.index_file_mismatch("W-0001", "") is False
|
assert documents.index_file_mismatch("W-0001", "") is False
|
||||||
|
assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path
|
||||||
|
assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir
|
||||||
|
|||||||
Reference in New Issue
Block a user