feat(importing): log import-row breadcrumbs and distinguish skip outcomes
Address PR #687 review concerns on DocumentImporter: - Tobias: thread a 1-based source row number into importRow so the "index rejected" skip log carries a breadcrumb (the row number, never the raw hostile index) for post-import triage. - Elicit: emit a distinct log when a valid index has no <index>.pdf on disk (normal PLACEHOLDER) so it is not conflated with a rejected index. - Nora: add a log.warn in resolvePdfByIndex's getCanonicalPath IOException branch so the quiet fail-safe skip surfaces in ops, distinct from the deliberate symlink-escape abort. - Felix: replace inline fully-qualified java.util.regex.Pattern with an import. - Nora: document that \d is intentionally ASCII-only (do not add UNICODE_CHARACTER_CLASS). Refs #686 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -37,6 +37,7 @@ import java.util.List;
|
|||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
|
* Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
|
||||||
@@ -67,8 +68,11 @@ public class DocumentImporter {
|
|||||||
// "Mü-0001"), one or more hyphens (the corpus has a few "C--0029" data-entry artefacts),
|
// "Mü-0001"), one or more hyphens (the corpus has a few "C--0029" data-entry artefacts),
|
||||||
// digits, and an optional trailing "x" the normalizer recognises. Anchored, with no
|
// digits, and an optional trailing "x" the normalizer recognises. Anchored, with no
|
||||||
// separator / dot / slash characters in the class, so "<index>.pdf" can never traverse.
|
// separator / dot / slash characters in the class, so "<index>.pdf" can never traverse.
|
||||||
private static final java.util.regex.Pattern INDEX_PATTERN =
|
// NOTE: `\d` here is intentionally ASCII-only ([0-9]). Java's java.util.regex matches `\d`
|
||||||
java.util.regex.Pattern.compile("[A-Za-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF]{1,4}-+\\d+x?");
|
// against [0-9] unless Pattern.UNICODE_CHARACTER_CLASS is set — do NOT add that flag, or
|
||||||
|
// Arabic-Indic / fullwidth digits would silently widen the accepted set.
|
||||||
|
private static final Pattern INDEX_PATTERN =
|
||||||
|
Pattern.compile("[A-Za-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF]{1,4}-+\\d+x?");
|
||||||
|
|
||||||
private final DocumentService documentService;
|
private final DocumentService documentService;
|
||||||
private final PersonService personService;
|
private final PersonService personService;
|
||||||
@@ -93,10 +97,14 @@ public class DocumentImporter {
|
|||||||
List<CanonicalSheetReader.Row> rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS);
|
List<CanonicalSheetReader.Row> rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS);
|
||||||
int processed = 0;
|
int processed = 0;
|
||||||
List<ImportStatus.SkippedFile> skipped = new ArrayList<>();
|
List<ImportStatus.SkippedFile> skipped = new ArrayList<>();
|
||||||
|
// 1-based source row number for ops triage breadcrumbs (the spreadsheet header is row 1,
|
||||||
|
// so the first data row is row 2 — matches what an operator sees in the .xlsx).
|
||||||
|
int rowNumber = 1;
|
||||||
for (CanonicalSheetReader.Row row : rows) {
|
for (CanonicalSheetReader.Row row : rows) {
|
||||||
|
rowNumber++;
|
||||||
String index = row.get("index");
|
String index = row.get("index");
|
||||||
if (index.isBlank()) continue;
|
if (index.isBlank()) continue;
|
||||||
Optional<ImportStatus.SkipReason> skipReason = importRow(row, index);
|
Optional<ImportStatus.SkipReason> skipReason = importRow(row, index, rowNumber);
|
||||||
if (skipReason.isPresent()) {
|
if (skipReason.isPresent()) {
|
||||||
skipped.add(new ImportStatus.SkippedFile(index, skipReason.get()));
|
skipped.add(new ImportStatus.SkippedFile(index, skipReason.get()));
|
||||||
} else {
|
} else {
|
||||||
@@ -107,13 +115,24 @@ public class DocumentImporter {
|
|||||||
return new LoadResult(processed, skipped);
|
return new LoadResult(processed, skipped);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index) {
|
private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index, int rowNumber) {
|
||||||
if (!isValidImportIndex(index)) {
|
if (!isValidImportIndex(index)) {
|
||||||
log.warn("Skipping import row: index rejected");
|
// Breadcrumb is the source row number, NOT the raw (possibly-hostile) index — an
|
||||||
|
// operator triaging the import can find the offending row in the .xlsx without us
|
||||||
|
// echoing attacker-controlled input into the log.
|
||||||
|
log.warn("Skipping import row {}: index rejected (fails catalog-shape validation)", rowNumber);
|
||||||
return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
|
return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
|
||||||
}
|
}
|
||||||
Optional<File> resolved = resolvePdfByIndex(index);
|
Optional<File> resolved = resolvePdfByIndex(index, rowNumber);
|
||||||
if (resolved.isPresent()) {
|
if (resolved.isEmpty()) {
|
||||||
|
// Distinct from the "index rejected" skip above: the index is VALID but no
|
||||||
|
// <index>.pdf is on disk, so the row becomes a normal PLACEHOLDER (not skipped). The
|
||||||
|
// index is a validated catalog id (no hostile content), so it is safe to log here —
|
||||||
|
// this surfaces a corpus that drifts from the "<index>.pdf" assumption (e.g. a file
|
||||||
|
// that arrived under a different name) rather than dropping it silently.
|
||||||
|
log.info("Import row {}: index {} is valid but {}.pdf is absent — creating PLACEHOLDER",
|
||||||
|
rowNumber, index, index);
|
||||||
|
} else {
|
||||||
try {
|
try {
|
||||||
if (!isPdfMagicBytes(resolved.get())) {
|
if (!isPdfMagicBytes(resolved.get())) {
|
||||||
return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
|
return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
|
||||||
@@ -319,7 +338,7 @@ public class DocumentImporter {
|
|||||||
// O(1) direct lookup: the PDF is exactly importDir/<index>.pdf. The caller has already
|
// O(1) direct lookup: the PDF is exactly importDir/<index>.pdf. The caller has already
|
||||||
// validated the index shape; the canonical-path containment assertion below is
|
// validated the index shape; the canonical-path containment assertion below is
|
||||||
// defense-in-depth so even a symlinked <index>.pdf cannot read outside importDir.
|
// defense-in-depth so even a symlinked <index>.pdf cannot read outside importDir.
|
||||||
private Optional<File> resolvePdfByIndex(String index) {
|
private Optional<File> resolvePdfByIndex(String index, int rowNumber) {
|
||||||
File baseDir = new File(importDir);
|
File baseDir = new File(importDir);
|
||||||
File candidate = baseDir.toPath().resolve(index + ".pdf").toFile();
|
File candidate = baseDir.toPath().resolve(index + ".pdf").toFile();
|
||||||
try {
|
try {
|
||||||
@@ -330,6 +349,11 @@ public class DocumentImporter {
|
|||||||
}
|
}
|
||||||
return Optional.of(candidate);
|
return Optional.of(candidate);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
// Distinct from the deliberate symlink-escape abort above (which throws): canonical
|
||||||
|
// resolution itself failed (e.g. the OS rejected the path mid-resolution). We fail
|
||||||
|
// safe to a PLACEHOLDER, but never silently — log it so the asymmetry surfaces in ops.
|
||||||
|
log.warn("Canonical path resolution failed for import row {}: treating {}.pdf as absent",
|
||||||
|
rowNumber, index, e);
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user