feat(importing): resolve import PDFs directly by index
The corpus is uniform — every PDF is <index>.pdf flat in the import dir — so resolve a document's PDF with an O(1) importDir.resolve(index + ".pdf") lookup instead of a recursive directory walk over the file column. The index is validated against a strict catalog pattern (1–4 Latin letters incl. umlauts, hyphen(s), digits, optional x) plus the ported separator/dot/dotdot/null/slash-homoglyph/absolute-path guards, and the resolved canonical path is asserted to stay inside the import dir as defense-in-depth. The %PDF magic-byte check still gates upload; status UPLOADED/PLACEHOLDER and the index→originalFilename upsert key are unchanged. The file column and findFileRecursive walk are gone, and the security regression tests now assert a malicious or garbage index is rejected and a valid index resolves to exactly importDir/<index>.pdf within containment. Closes #686 Closes #676 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -28,7 +28,6 @@ import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeParseException;
|
||||
@@ -38,19 +37,22 @@ import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
|
||||
* semantic transformation: the normalizer already resolved people to slugs and dates to
|
||||
* ISO values. This loader maps columns by header name, routes each attribution
|
||||
* register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}),
|
||||
* parses clean dates, and keeps the file/S3/thumbnail plumbing.
|
||||
* parses clean dates, and keeps the S3/thumbnail plumbing.
|
||||
*
|
||||
* <p>The {@code file} value is hostile input regardless of upstream trust (CWE-22 does not
|
||||
* care that it came from our Python tool): its basename is validated with
|
||||
* {@link #isValidImportFilename} and then resolved with canonical-path containment in
|
||||
* {@link #findFileRecursive}.
|
||||
* <p>The import corpus is uniform — every PDF is named {@code <index>.pdf} flat in the import
|
||||
* dir — so a document's PDF is resolved <em>directly by its index</em>:
|
||||
* {@code importDir.resolve(index + ".pdf")}. The {@code index} is still hostile input
|
||||
* regardless of upstream trust (CWE-22 does not care it came from our Python tool): it is
|
||||
* validated against a strict catalog pattern with {@link #isValidImportIndex} (no path
|
||||
* separators, no {@code .}/{@code ..}, no absolute path, no slash homoglyphs) and the
|
||||
* resolved path is asserted to stay inside the import dir in {@link #resolvePdfByIndex} as
|
||||
* defense-in-depth. The {@code %PDF} magic-byte check still gates upload.
|
||||
*/
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@@ -58,9 +60,16 @@ import java.util.stream.Stream;
|
||||
public class DocumentImporter {
|
||||
|
||||
static final List<String> REQUIRED_HEADERS = List.of(
|
||||
"index", "file", "sender_person_id", "sender_name",
|
||||
"index", "sender_person_id", "sender_name",
|
||||
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision");
|
||||
|
||||
// Catalog index shape: 1–4 letters (ASCII + Latin-1 letters, e.g. the German "ü" in
|
||||
// "Mü-0001"), one or more hyphens (the corpus has a few "C--0029" data-entry artefacts),
|
||||
// digits, and an optional trailing "x" the normalizer recognises. Anchored, with no
|
||||
// separator / dot / slash characters in the class, so "<index>.pdf" can never traverse.
|
||||
private static final java.util.regex.Pattern INDEX_PATTERN =
|
||||
java.util.regex.Pattern.compile("[A-Za-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF]{1,4}-+\\d+x?");
|
||||
|
||||
private final DocumentService documentService;
|
||||
private final PersonService personService;
|
||||
private final TagService tagService;
|
||||
@@ -87,9 +96,9 @@ public class DocumentImporter {
|
||||
for (CanonicalSheetReader.Row row : rows) {
|
||||
String index = row.get("index");
|
||||
if (index.isBlank()) continue;
|
||||
Optional<ImportStatus.SkipReason> skipReason = importRow(row, index, skipped);
|
||||
Optional<ImportStatus.SkipReason> skipReason = importRow(row, index);
|
||||
if (skipReason.isPresent()) {
|
||||
skipped.add(new ImportStatus.SkippedFile(displayName(row, index), skipReason.get()));
|
||||
skipped.add(new ImportStatus.SkippedFile(index, skipReason.get()));
|
||||
} else {
|
||||
processed++;
|
||||
}
|
||||
@@ -98,15 +107,12 @@ public class DocumentImporter {
|
||||
return new LoadResult(processed, skipped);
|
||||
}
|
||||
|
||||
private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index,
|
||||
List<ImportStatus.SkippedFile> skipped) {
|
||||
Optional<File> resolved;
|
||||
try {
|
||||
resolved = resolveFile(row.get("file"));
|
||||
} catch (InvalidImportFilenameException e) {
|
||||
log.warn("Skipping import row {}: filename rejected", index);
|
||||
private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index) {
|
||||
if (!isValidImportIndex(index)) {
|
||||
log.warn("Skipping import row: index rejected");
|
||||
return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
|
||||
}
|
||||
Optional<File> resolved = resolvePdfByIndex(index);
|
||||
if (resolved.isPresent()) {
|
||||
try {
|
||||
if (!isPdfMagicBytes(resolved.get())) {
|
||||
@@ -257,21 +263,6 @@ public class DocumentImporter {
|
||||
|
||||
// ─── file handling + S3 (small ≤20-line methods) ─────────────────────────────────
|
||||
|
||||
private Optional<File> resolveFile(String fileColumn) {
|
||||
if (fileColumn == null || fileColumn.isBlank()) return Optional.empty();
|
||||
String basename = basenameOf(fileColumn);
|
||||
if (!isValidImportFilename(basename)) {
|
||||
throw new InvalidImportFilenameException();
|
||||
}
|
||||
return findFileRecursive(basename);
|
||||
}
|
||||
|
||||
private static String basenameOf(String fileColumn) {
|
||||
String normalized = fileColumn.replace('\\', '/');
|
||||
int lastSlash = normalized.lastIndexOf('/');
|
||||
return lastSlash < 0 ? normalized.trim() : normalized.substring(lastSlash + 1).trim();
|
||||
}
|
||||
|
||||
private String probeContentType(File file) {
|
||||
try {
|
||||
String probed = Files.probeContentType(file.toPath());
|
||||
@@ -290,20 +281,23 @@ public class DocumentImporter {
|
||||
RequestBody.fromFile(file));
|
||||
}
|
||||
|
||||
// ─── security guards — ported verbatim from MassImportService — do not weaken ────
|
||||
// ─── index validation + containment — defense-in-depth, do not weaken ────────────
|
||||
|
||||
private boolean isValidImportFilename(String filename) {
|
||||
if (filename == null || filename.isBlank()) return false;
|
||||
if (filename.contains("/")) return false;
|
||||
if (filename.contains("\\")) return false;
|
||||
if (filename.contains("∕")) return false; // U+2215 DIVISION SLASH
|
||||
if (filename.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS
|
||||
if (filename.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR
|
||||
if (filename.contains("..")) return false;
|
||||
if (filename.equals(".")) return false;
|
||||
if (filename.contains("\0")) return false;
|
||||
if (Paths.get(filename).isAbsolute()) return false;
|
||||
return true;
|
||||
// The index is the only thing that drives the on-disk lookup, so it must never contain a
|
||||
// path separator, traversal token, slash homoglyph, null byte, or absolute-path marker —
|
||||
// each guard mirrors the filename guards ported from MassImportService — and it must match
|
||||
// the strict catalog shape so anything unexpected is skipped loudly rather than read.
|
||||
private boolean isValidImportIndex(String index) {
|
||||
if (index == null || index.isBlank()) return false;
|
||||
if (index.contains("/")) return false;
|
||||
if (index.contains("\\")) return false;
|
||||
if (index.contains("∕")) return false; // U+2215 DIVISION SLASH
|
||||
if (index.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS
|
||||
if (index.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR
|
||||
if (index.contains(".")) return false; // no dots — "<index>.pdf" is the only extension
|
||||
if (index.contains("\0")) return false;
|
||||
if (Paths.get(index).isAbsolute()) return false;
|
||||
return INDEX_PATTERN.matcher(index).matches();
|
||||
}
|
||||
|
||||
// package-private: a Mockito spy in tests can override to inject IOException
|
||||
@@ -322,14 +316,14 @@ public class DocumentImporter {
|
||||
}
|
||||
}
|
||||
|
||||
private Optional<File> findFileRecursive(String filename) {
|
||||
// O(1) direct lookup: the PDF is exactly importDir/<index>.pdf. The caller has already
|
||||
// validated the index shape; the canonical-path containment assertion below is
|
||||
// defense-in-depth so even a symlinked <index>.pdf cannot read outside importDir.
|
||||
private Optional<File> resolvePdfByIndex(String index) {
|
||||
File baseDir = new File(importDir);
|
||||
try (Stream<Path> walk = Files.walk(baseDir.toPath())) {
|
||||
Optional<Path> match = walk.filter(p -> !Files.isDirectory(p))
|
||||
.filter(p -> p.getFileName().toString().equals(filename))
|
||||
.findFirst();
|
||||
if (match.isEmpty()) return Optional.empty();
|
||||
File candidate = match.get().toFile();
|
||||
File candidate = baseDir.toPath().resolve(index + ".pdf").toFile();
|
||||
try {
|
||||
if (!candidate.isFile()) return Optional.empty();
|
||||
String baseDirCanonical = baseDir.getCanonicalPath();
|
||||
if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) {
|
||||
throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate);
|
||||
@@ -340,15 +334,7 @@ public class DocumentImporter {
|
||||
}
|
||||
}
|
||||
|
||||
private static String displayName(CanonicalSheetReader.Row row, String index) {
|
||||
String file = row.get("file");
|
||||
return file.isBlank() ? index : basenameOf(file);
|
||||
}
|
||||
|
||||
private static String blankToNull(String s) {
|
||||
return (s == null || s.isBlank()) ? null : s;
|
||||
}
|
||||
|
||||
private static final class InvalidImportFilenameException extends RuntimeException {
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user