From a1f96f60ac1baa9c6c7f0a4055e4eaa427827749 Mon Sep 17 00:00:00 2001 From: Marcel Date: Wed, 27 May 2026 21:20:08 +0200 Subject: [PATCH] feat(importing): log import-row breadcrumbs and distinguish skip outcomes Address PR #687 review concerns on DocumentImporter: - Tobias: thread a 1-based source row number into importRow so the "index rejected" skip log carries a breadcrumb (the row number, never the raw hostile index) for post-import triage. - Elicit: emit a distinct log when a valid index has no .pdf on disk (normal PLACEHOLDER) so it is not conflated with a rejected index. - Nora: add a log.warn in resolvePdfByIndex's getCanonicalPath IOException branch so the quiet fail-safe skip surfaces in ops, distinct from the deliberate symlink-escape abort. - Felix: replace inline fully-qualified java.util.regex.Pattern with an import. - Nora: document that \d is intentionally ASCII-only (do not add UNICODE_CHARACTER_CLASS). Refs #686 Co-Authored-By: Claude Opus 4.7 --- .../importing/DocumentImporter.java | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java index 21494447..ab693920 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java @@ -37,6 +37,7 @@ import java.util.List; import java.util.Optional; import java.util.Set; import java.util.UUID; +import java.util.regex.Pattern; /** * Loads {@code canonical-documents.xlsx} into the document domain. Java performs no @@ -67,8 +68,11 @@ public class DocumentImporter { // "Mü-0001"), one or more hyphens (the corpus has a few "C--0029" data-entry artefacts), // digits, and an optional trailing "x" the normalizer recognises. Anchored, with no // separator / dot / slash characters in the class, so ".pdf" can never traverse. - private static final java.util.regex.Pattern INDEX_PATTERN = - java.util.regex.Pattern.compile("[A-Za-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF]{1,4}-+\\d+x?"); + // NOTE: `\d` here is intentionally ASCII-only ([0-9]). Java's java.util.regex matches `\d` + // against [0-9] unless Pattern.UNICODE_CHARACTER_CLASS is set — do NOT add that flag, or + // Arabic-Indic / fullwidth digits would silently widen the accepted set. + private static final Pattern INDEX_PATTERN = + Pattern.compile("[A-Za-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF]{1,4}-+\\d+x?"); private final DocumentService documentService; private final PersonService personService; @@ -93,10 +97,14 @@ public class DocumentImporter { List rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS); int processed = 0; List skipped = new ArrayList<>(); + // 1-based source row number for ops triage breadcrumbs (the spreadsheet header is row 1, + // so the first data row is row 2 — matches what an operator sees in the .xlsx). + int rowNumber = 1; for (CanonicalSheetReader.Row row : rows) { + rowNumber++; String index = row.get("index"); if (index.isBlank()) continue; - Optional skipReason = importRow(row, index); + Optional skipReason = importRow(row, index, rowNumber); if (skipReason.isPresent()) { skipped.add(new ImportStatus.SkippedFile(index, skipReason.get())); } else { @@ -107,13 +115,24 @@ public class DocumentImporter { return new LoadResult(processed, skipped); } - private Optional importRow(CanonicalSheetReader.Row row, String index) { + private Optional importRow(CanonicalSheetReader.Row row, String index, int rowNumber) { if (!isValidImportIndex(index)) { - log.warn("Skipping import row: index rejected"); + // Breadcrumb is the source row number, NOT the raw (possibly-hostile) index — an + // operator triaging the import can find the offending row in the .xlsx without us + // echoing attacker-controlled input into the log. + log.warn("Skipping import row {}: index rejected (fails catalog-shape validation)", rowNumber); return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); } - Optional resolved = resolvePdfByIndex(index); - if (resolved.isPresent()) { + Optional resolved = resolvePdfByIndex(index, rowNumber); + if (resolved.isEmpty()) { + // Distinct from the "index rejected" skip above: the index is VALID but no + // .pdf is on disk, so the row becomes a normal PLACEHOLDER (not skipped). The + // index is a validated catalog id (no hostile content), so it is safe to log here — + // this surfaces a corpus that drifts from the ".pdf" assumption (e.g. a file + // that arrived under a different name) rather than dropping it silently. + log.info("Import row {}: index {} is valid but {}.pdf is absent — creating PLACEHOLDER", + rowNumber, index, index); + } else { try { if (!isPdfMagicBytes(resolved.get())) { return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE); @@ -319,7 +338,7 @@ public class DocumentImporter { // O(1) direct lookup: the PDF is exactly importDir/.pdf. The caller has already // validated the index shape; the canonical-path containment assertion below is // defense-in-depth so even a symlinked .pdf cannot read outside importDir. - private Optional resolvePdfByIndex(String index) { + private Optional resolvePdfByIndex(String index, int rowNumber) { File baseDir = new File(importDir); File candidate = baseDir.toPath().resolve(index + ".pdf").toFile(); try { @@ -330,6 +349,11 @@ public class DocumentImporter { } return Optional.of(candidate); } catch (IOException e) { + // Distinct from the deliberate symlink-escape abort above (which throws): canonical + // resolution itself failed (e.g. the OS rejected the path mid-resolution). We fail + // safe to a PLACEHOLDER, but never silently — log it so the asymmetry surfaces in ops. + log.warn("Canonical path resolution failed for import row {}: treating {}.pdf as absent", + rowNumber, index, e); return Optional.empty(); } }