From f5eb22723952319e4697008549b241bd343c962c Mon Sep 17 00:00:00 2001 From: Marcel Date: Wed, 27 May 2026 21:02:03 +0200 Subject: [PATCH] feat(importing): resolve import PDFs directly by index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The corpus is uniform — every PDF is .pdf flat in the import dir — so resolve a document's PDF with an O(1) importDir.resolve(index + ".pdf") lookup instead of a recursive directory walk over the file column. The index is validated against a strict catalog pattern (1–4 Latin letters incl. umlauts, hyphen(s), digits, optional x) plus the ported separator/dot/dotdot/null/slash-homoglyph/absolute-path guards, and the resolved canonical path is asserted to stay inside the import dir as defense-in-depth. The %PDF magic-byte check still gates upload; status UPLOADED/PLACEHOLDER and the index→originalFilename upsert key are unchanged. The file column and findFileRecursive walk are gone, and the security regression tests now assert a malicious or garbage index is rejected and a valid index resolves to exactly importDir/.pdf within containment. Closes #686 Closes #676 Co-Authored-By: Claude Opus 4.7 --- .../importing/DocumentImporter.java | 106 ++++---- .../CanonicalImportIntegrationTest.java | 12 +- .../importing/DocumentImporterTest.java | 232 +++++++++++------- 3 files changed, 193 insertions(+), 157 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java index 085a4ff4..21494447 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java @@ -28,7 +28,6 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; -import java.nio.file.Path; import java.nio.file.Paths; import java.time.LocalDate; import java.time.format.DateTimeParseException; @@ -38,19 +37,22 @@ import java.util.List; import java.util.Optional; import java.util.Set; import java.util.UUID; -import java.util.stream.Stream; /** * Loads {@code canonical-documents.xlsx} into the document domain. Java performs no * semantic transformation: the normalizer already resolved people to slugs and dates to * ISO values. This loader maps columns by header name, routes each attribution * register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}), - * parses clean dates, and keeps the file/S3/thumbnail plumbing. + * parses clean dates, and keeps the S3/thumbnail plumbing. * - *

The {@code file} value is hostile input regardless of upstream trust (CWE-22 does not - * care that it came from our Python tool): its basename is validated with - * {@link #isValidImportFilename} and then resolved with canonical-path containment in - * {@link #findFileRecursive}. + *

The import corpus is uniform — every PDF is named {@code .pdf} flat in the import + * dir — so a document's PDF is resolved directly by its index: + * {@code importDir.resolve(index + ".pdf")}. The {@code index} is still hostile input + * regardless of upstream trust (CWE-22 does not care it came from our Python tool): it is + * validated against a strict catalog pattern with {@link #isValidImportIndex} (no path + * separators, no {@code .}/{@code ..}, no absolute path, no slash homoglyphs) and the + * resolved path is asserted to stay inside the import dir in {@link #resolvePdfByIndex} as + * defense-in-depth. The {@code %PDF} magic-byte check still gates upload. */ @Component @RequiredArgsConstructor @@ -58,9 +60,16 @@ import java.util.stream.Stream; public class DocumentImporter { static final List REQUIRED_HEADERS = List.of( - "index", "file", "sender_person_id", "sender_name", + "index", "sender_person_id", "sender_name", "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision"); + // Catalog index shape: 1–4 letters (ASCII + Latin-1 letters, e.g. the German "ü" in + // "Mü-0001"), one or more hyphens (the corpus has a few "C--0029" data-entry artefacts), + // digits, and an optional trailing "x" the normalizer recognises. Anchored, with no + // separator / dot / slash characters in the class, so ".pdf" can never traverse. + private static final java.util.regex.Pattern INDEX_PATTERN = + java.util.regex.Pattern.compile("[A-Za-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF]{1,4}-+\\d+x?"); + private final DocumentService documentService; private final PersonService personService; private final TagService tagService; @@ -87,9 +96,9 @@ public class DocumentImporter { for (CanonicalSheetReader.Row row : rows) { String index = row.get("index"); if (index.isBlank()) continue; - Optional skipReason = importRow(row, index, skipped); + Optional skipReason = importRow(row, index); if (skipReason.isPresent()) { - skipped.add(new ImportStatus.SkippedFile(displayName(row, index), skipReason.get())); + skipped.add(new ImportStatus.SkippedFile(index, skipReason.get())); } else { processed++; } @@ -98,15 +107,12 @@ public class DocumentImporter { return new LoadResult(processed, skipped); } - private Optional importRow(CanonicalSheetReader.Row row, String index, - List skipped) { - Optional resolved; - try { - resolved = resolveFile(row.get("file")); - } catch (InvalidImportFilenameException e) { - log.warn("Skipping import row {}: filename rejected", index); + private Optional importRow(CanonicalSheetReader.Row row, String index) { + if (!isValidImportIndex(index)) { + log.warn("Skipping import row: index rejected"); return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); } + Optional resolved = resolvePdfByIndex(index); if (resolved.isPresent()) { try { if (!isPdfMagicBytes(resolved.get())) { @@ -257,21 +263,6 @@ public class DocumentImporter { // ─── file handling + S3 (small ≤20-line methods) ───────────────────────────────── - private Optional resolveFile(String fileColumn) { - if (fileColumn == null || fileColumn.isBlank()) return Optional.empty(); - String basename = basenameOf(fileColumn); - if (!isValidImportFilename(basename)) { - throw new InvalidImportFilenameException(); - } - return findFileRecursive(basename); - } - - private static String basenameOf(String fileColumn) { - String normalized = fileColumn.replace('\\', '/'); - int lastSlash = normalized.lastIndexOf('/'); - return lastSlash < 0 ? normalized.trim() : normalized.substring(lastSlash + 1).trim(); - } - private String probeContentType(File file) { try { String probed = Files.probeContentType(file.toPath()); @@ -290,20 +281,23 @@ public class DocumentImporter { RequestBody.fromFile(file)); } - // ─── security guards — ported verbatim from MassImportService — do not weaken ──── + // ─── index validation + containment — defense-in-depth, do not weaken ──────────── - private boolean isValidImportFilename(String filename) { - if (filename == null || filename.isBlank()) return false; - if (filename.contains("/")) return false; - if (filename.contains("\\")) return false; - if (filename.contains("∕")) return false; // U+2215 DIVISION SLASH - if (filename.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS - if (filename.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR - if (filename.contains("..")) return false; - if (filename.equals(".")) return false; - if (filename.contains("\0")) return false; - if (Paths.get(filename).isAbsolute()) return false; - return true; + // The index is the only thing that drives the on-disk lookup, so it must never contain a + // path separator, traversal token, slash homoglyph, null byte, or absolute-path marker — + // each guard mirrors the filename guards ported from MassImportService — and it must match + // the strict catalog shape so anything unexpected is skipped loudly rather than read. + private boolean isValidImportIndex(String index) { + if (index == null || index.isBlank()) return false; + if (index.contains("/")) return false; + if (index.contains("\\")) return false; + if (index.contains("∕")) return false; // U+2215 DIVISION SLASH + if (index.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS + if (index.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR + if (index.contains(".")) return false; // no dots — ".pdf" is the only extension + if (index.contains("\0")) return false; + if (Paths.get(index).isAbsolute()) return false; + return INDEX_PATTERN.matcher(index).matches(); } // package-private: a Mockito spy in tests can override to inject IOException @@ -322,14 +316,14 @@ public class DocumentImporter { } } - private Optional findFileRecursive(String filename) { + // O(1) direct lookup: the PDF is exactly importDir/.pdf. The caller has already + // validated the index shape; the canonical-path containment assertion below is + // defense-in-depth so even a symlinked .pdf cannot read outside importDir. + private Optional resolvePdfByIndex(String index) { File baseDir = new File(importDir); - try (Stream walk = Files.walk(baseDir.toPath())) { - Optional match = walk.filter(p -> !Files.isDirectory(p)) - .filter(p -> p.getFileName().toString().equals(filename)) - .findFirst(); - if (match.isEmpty()) return Optional.empty(); - File candidate = match.get().toFile(); + File candidate = baseDir.toPath().resolve(index + ".pdf").toFile(); + try { + if (!candidate.isFile()) return Optional.empty(); String baseDirCanonical = baseDir.getCanonicalPath(); if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) { throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate); @@ -340,15 +334,7 @@ public class DocumentImporter { } } - private static String displayName(CanonicalSheetReader.Row row, String index) { - String file = row.get("file"); - return file.isBlank() ? index : basenameOf(file); - } - private static String blankToNull(String s) { return (s == null || s.isBlank()) ? null : s; } - - private static final class InvalidImportFilenameException extends RuntimeException { - } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java index bd2b8d8f..090ffe31 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java @@ -140,12 +140,12 @@ class CanonicalImportIntegrationTest { // Re-stage the document sheet with W-0001's receiver and tag removed. writeSheet(artifactDir.resolve("canonical-documents.xlsx"), - List.of("index", "file", "sender_person_id", "sender_name", "receiver_person_ids", + List.of("index", "sender_person_id", "sender_name", "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"), List.of( - List.of("W-0001", "", "de-gruyter-walter", "Walter de Gruyter", + List.of("W-0001", "de-gruyter-walter", "Walter de Gruyter", "", "", "1888-02-15", "15.2.1888", "DAY", "", "Rotterdam", "", "Geschäftsreise"), - List.of("W-0002", "", "de-gruyter-eugenie", "Eugenie de Gruyter", + List.of("W-0002", "de-gruyter-eugenie", "Eugenie de Gruyter", "de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "", "Middelburg", "Themen/Brautbriefe", "Reisepläne"))); @@ -196,13 +196,13 @@ class CanonicalImportIntegrationTest { """); writeSheet(dir.resolve("canonical-documents.xlsx"), - List.of("index", "file", "sender_person_id", "sender_name", "receiver_person_ids", + List.of("index", "sender_person_id", "sender_name", "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"), List.of( - List.of("W-0001", "", "de-gruyter-walter", "Walter de Gruyter", + List.of("W-0001", "de-gruyter-walter", "Walter de Gruyter", "de-gruyter-eugenie", "Eugenie de Gruyter", "1888-02-15", "15.2.1888", "DAY", "", "Rotterdam", "Themen/Brautbriefe", "Geschäftsreise"), - List.of("W-0002", "", "de-gruyter-eugenie", "Eugenie de Gruyter", + List.of("W-0002", "de-gruyter-eugenie", "Eugenie de Gruyter", "de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "", "Middelburg", "Themen/Brautbriefe", "Reisepläne"))); } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java index 99d7bd5c..c6d78df3 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java @@ -24,7 +24,6 @@ import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.PutObjectRequest; import java.io.File; -import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.time.LocalDate; @@ -58,99 +57,149 @@ class DocumentImporterTest { ReflectionTestUtils.setField(importer, "bucketName", "test-bucket"); } - // ─── security regression — ported from MassImportServiceTest — do not remove ───── + // ─── index validation — a malicious/garbage index can never reach disk I/O ───────── @Test - void isValidImportFilename_returnsFalse_whenNull() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", (String) null)).isFalse(); + void isValidImportIndex_returnsFalse_whenNull() { + assertThat(validIndex(null)).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenBlank() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", " ")).isFalse(); + void isValidImportIndex_returnsFalse_whenBlank() { + assertThat(validIndex(" ")).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenForwardSlash() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "etc/passwd")).isFalse(); + void isValidImportIndex_returnsFalse_whenForwardSlash() { + assertThat(validIndex("etc/passwd")).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenBackslash() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..\\etc\\passwd")).isFalse(); + void isValidImportIndex_returnsFalse_whenBackslash() { + assertThat(validIndex("..\\etc\\passwd")).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenDotDot() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "doc..evil.pdf")).isFalse(); + void isValidImportIndex_returnsFalse_whenDotDot() { + assertThat(validIndex("W-..0001")).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenIsDotDot() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..")).isFalse(); + void isValidImportIndex_returnsFalse_whenIsDotDot() { + assertThat(validIndex("..")).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenAbsolutePath() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "/etc/passwd")).isFalse(); + void isValidImportIndex_returnsFalse_whenSingleDot() { + assertThat(validIndex(".")).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenNullByte() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "file\0.pdf")).isFalse(); + void isValidImportIndex_returnsFalse_whenAbsolutePath() { + assertThat(validIndex("/etc/passwd")).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenUnicodeDivisionSlash() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo∕bar.pdf")).isFalse(); + void isValidImportIndex_returnsFalse_whenNullByte() { + assertThat(validIndex("W-0001\0")).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenFullwidthSlash() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo/bar.pdf")).isFalse(); + void isValidImportIndex_returnsFalse_whenUnicodeDivisionSlash() { + assertThat(validIndex("W∕0001")).isFalse(); } @Test - void isValidImportFilename_returnsFalse_whenReverseSolidusOperator() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo⧵bar.pdf")).isFalse(); + void isValidImportIndex_returnsFalse_whenFullwidthSlash() { + assertThat(validIndex("W/0001")).isFalse(); } @Test - void isValidImportFilename_returnsTrue_whenPlainBasename() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "document.pdf")).isTrue(); + void isValidImportIndex_returnsFalse_whenReverseSolidusOperator() { + assertThat(validIndex("W⧵0001")).isFalse(); } @Test - void isValidImportFilename_returnsTrue_whenLeadingDot() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", ".hidden.pdf")).isTrue(); + void isValidImportIndex_returnsFalse_whenContainsDotPdfExtension() { + // The index is the bare catalog id; appending ".pdf" is the importer's job. A dot in + // the index would let "W-0001.pdf" become "W-0001.pdf.pdf" or smuggle an extension. + assertThat(validIndex("W-0001.pdf")).isFalse(); } @Test - void isValidImportFilename_returnsTrue_whenHasSpaces() { - assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "Brief an Oma.pdf")).isTrue(); + void isValidImportIndex_returnsTrue_whenPlainCatalogIndex() { + assertThat(validIndex("W-0124")).isTrue(); } @Test - void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir( - @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception { - Path outsideFile = outsideDir.resolve("secret.pdf"); - Files.writeString(outsideFile, "sensitive"); - Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile); - ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString()); - - org.assertj.core.api.Assertions.assertThatThrownBy( - () -> ReflectionTestUtils.invokeMethod(importer, "findFileRecursive", "secret.pdf")) - .isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class); + void isValidImportIndex_returnsTrue_whenTwoLetterPrefix() { + assertThat(validIndex("Al-0001")).isTrue(); } - // ─── path traversal in the file column cannot escape importDir ─────────────────── + @Test + void isValidImportIndex_returnsTrue_whenThreeLetterPrefix() { + assertThat(validIndex("CuH-0010")).isTrue(); + } @Test - void load_rejectsFileColumn_whenBasenameIsTraversalToken(@TempDir Path tempDir) throws Exception { - // A file column whose basename is itself a traversal token must be rejected - // outright, never used for disk I/O. + void isValidImportIndex_returnsTrue_whenUmlautPrefix() { + // Real corpus indices carry a German umlaut, e.g. "Mü-0001.pdf" exists on disk. + assertThat(validIndex("Mü-0001")).isTrue(); + } + + @Test + void isValidImportIndex_returnsTrue_whenDoubleHyphen() { + // Real corpus: "C--0029" appears in the spreadsheet (a data-entry artefact, but a + // legitimate catalog shape that must still resolve, not crash). + assertThat(validIndex("C--0029")).isTrue(); + } + + @Test + void isValidImportIndex_returnsTrue_whenXSuffix() { + // The normalizer recognises an x-suffix catalog id; allow it defensively. + assertThat(validIndex("W-0001x")).isTrue(); + } + + // ─── a valid index resolves to exactly importDir/.pdf within containment ───── + + @Test + void load_resolvesPdfByIndex_uploadsToS3_andSetsStatusUploaded(@TempDir Path tempDir) throws Exception { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); - Path xlsx = writeDocs(tempDir, docRow("W-0001", "evil/..", "", "", "", "", "", "", "", "")); + byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D}; + Files.write(tempDir.resolve("W-0124.pdf"), pdf); + when(documentService.findByOriginalFilename("W-0124")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0124", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + // exactly importDir/.pdf was uploaded — the S3 key carries that basename + org.mockito.ArgumentCaptor bodyCaptor = org.mockito.ArgumentCaptor.forClass(RequestBody.class); + verify(s3Client).putObject(any(PutObjectRequest.class), bodyCaptor.capture()); + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getStatus() == DocumentStatus.UPLOADED + && d.getFilePath() != null + && d.getFilePath().endsWith("_W-0124.pdf"))); + } + + @Test + void load_yieldsPlaceholder_whenIndexedPdfMissing(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("X-9999")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("X-9999", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER)); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + } + + @Test + void load_rejectsMaliciousIndex_neverReadsOutsideImportDir(@TempDir Path tempDir) throws Exception { + // An index with a path separator must be skipped outright, never used for disk I/O. + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Path xlsx = writeDocs(tempDir, docRow("../../etc/cron.d/x", "", "", "", "", "", "", "", "")); DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); @@ -158,22 +207,35 @@ class DocumentImporterTest { .extracting(ImportStatus.SkippedFile::reason) .containsExactly(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); verify(documentService, never()).save(any()); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); } @Test - void load_traversalFileColumn_cannotEscapeImportDir_yieldsPlaceholder(@TempDir Path tempDir) throws Exception { - // ../../etc/cron.d/x reduces to basename "x"; the disk lookup is confined to - // importDir, so no file is found, nothing is uploaded, and the row becomes a - // metadata-only PLACEHOLDER — the file outside importDir is never read. + void resolvePdfByIndex_throwsWhenResolvedPathEscapesImportDir_viaSymlink( + @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception { + // Containment defense-in-depth: even a syntactically valid index whose .pdf is a + // symlink pointing outside importDir must be refused — the resolved canonical path is + // asserted to stay inside importDir. + Path outsideFile = outsideDir.resolve("secret.pdf"); + Files.writeString(outsideFile, "sensitive"); + Files.createSymbolicLink(importDirPath.resolve("W-0001.pdf"), outsideFile); + ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString()); + + org.assertj.core.api.Assertions.assertThatThrownBy( + () -> ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "W-0001")) + .isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class); + } + + @Test + void resolvePdfByIndex_returnsExactlyImportDirIndexPdf_whenPresent(@TempDir Path tempDir) throws Exception { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); - when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - Path xlsx = writeDocs(tempDir, docRow("W-0001", "../../etc/cron.d/x", "", "", "", "", "", "", "", "")); + Path expected = tempDir.resolve("Eu-0628.pdf"); + Files.writeString(expected, "%PDF-1.4"); - importer.load(xlsx.toFile()); + Optional resolved = ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "Eu-0628"); - verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER)); + assertThat(resolved).isPresent(); + assertThat(resolved.get().getCanonicalFile()).isEqualTo(expected.toFile().getCanonicalFile()); } // ─── PDF magic-byte guard — ported — do not remove ────────────────────────────── @@ -183,7 +245,7 @@ class DocumentImporterTest { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); Files.writeString(tempDir.resolve("W-0001.pdf"), "not a pdf"); lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", "")); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "")); DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); @@ -198,7 +260,7 @@ class DocumentImporterTest { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); Files.writeString(tempDir.resolve("W-0001.pdf"), "content"); lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", "")); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "")); DocumentImporter spyImporter = org.mockito.Mockito.spy(importer); org.mockito.Mockito.doThrow(new java.io.IOException("read error")) @@ -217,7 +279,7 @@ class DocumentImporterTest { Document existing = Document.builder().id(UUID.randomUUID()) .originalFilename("W-0001").status(DocumentStatus.UPLOADED).build(); when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.of(existing)); - Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "", "")); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "")); DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); @@ -227,29 +289,14 @@ class DocumentImporterTest { verify(documentService, never()).save(any()); } - // ─── file column drives status: present → UPLOADED, empty → PLACEHOLDER ─────────── + // ─── presence of importDir/.pdf drives status: present → UPLOADED, absent → PLACEHOLDER ─ @Test - void load_uploadsToS3_andSetsStatusUploaded_whenFilePresent(@TempDir Path tempDir) throws Exception { - ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); - byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D}; - Files.write(tempDir.resolve("W-0001.pdf"), pdf); - when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", "")); - - importer.load(xlsx.toFile()); - - verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.UPLOADED)); - } - - @Test - void load_setsStatusPlaceholder_whenFileColumnEmpty(@TempDir Path tempDir) throws Exception { + void load_setsStatusPlaceholder_whenNoIndexedPdf(@TempDir Path tempDir) throws Exception { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); when(documentService.findByOriginalFilename("W-0099")).thenReturn(Optional.empty()); when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", "", "")); + Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", "")); importer.load(xlsx.toFile()); @@ -267,7 +314,7 @@ class DocumentImporterTest { when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); when(personService.findBySourceRef("de-gruyter-walter")).thenReturn(Optional.of(walter)); - Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "de-gruyter-walter", "Walter de Gruyter", + Path xlsx = writeDocs(tempDir, docRow("W-0001", "de-gruyter-walter", "Walter de Gruyter", "", "", "", "", "", "")); importer.load(xlsx.toFile()); @@ -285,7 +332,7 @@ class DocumentImporterTest { when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); when(personService.findBySourceRef("schwester-hanni")).thenReturn(Optional.empty()); when(personService.upsertBySourceRef(any())).thenReturn(provisional); - Path xlsx = writeDocs(tempDir, docRow("W-0002", "", "schwester-hanni", "Schwester Hanni", + Path xlsx = writeDocs(tempDir, docRow("W-0002", "schwester-hanni", "Schwester Hanni", "", "", "", "", "", "")); importer.load(xlsx.toFile()); @@ -302,7 +349,7 @@ class DocumentImporterTest { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); when(documentService.findByOriginalFilename("W-0003")).thenReturn(Optional.empty()); when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "", "?", + Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "?", "", "", "", "", "", "")); importer.load(xlsx.toFile()); @@ -322,7 +369,7 @@ class DocumentImporterTest { when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); when(personService.findBySourceRef("cram-herbert")).thenReturn(Optional.of(herbert)); when(personService.findBySourceRef("clara")).thenReturn(Optional.of(clara)); - Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "", "", + Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "", "cram-herbert|clara", "Herbert Cram|Clara", "", "", "", "")); importer.load(xlsx.toFile()); @@ -341,7 +388,7 @@ class DocumentImporterTest { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); when(documentService.findByOriginalFilename("W-0005")).thenReturn(Optional.empty()); when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "", "", + Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "", "", "", "1916-06-01", "1.6.1916", "MONTH", "")); importer.load(xlsx.toFile()); @@ -375,7 +422,7 @@ class DocumentImporterTest { .originalFilename("W-0007").status(DocumentStatus.PLACEHOLDER).build(); when(documentService.findByOriginalFilename("W-0007")).thenReturn(Optional.of(existing)); when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", "", "")); + Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", "")); importer.load(xlsx.toFile()); @@ -396,7 +443,7 @@ class DocumentImporterTest { when(documentService.findByOriginalFilename("W-0008")).thenReturn(Optional.of(existing)); when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); // The canonical row now carries no receiver and no tag: both stale links must go. - Path xlsx = writeDocs(tempDir, docRow("W-0008", "", "", "", "", "", "", "", "", "")); + Path xlsx = writeDocs(tempDir, docRow("W-0008", "", "", "", "", "", "", "", "")); importer.load(xlsx.toFile()); @@ -411,7 +458,7 @@ class DocumentImporterTest { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); when(documentService.findByOriginalFilename("W-0100")).thenReturn(Optional.empty()); when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - Path xlsx = writeDocs(tempDir, docRow("W-0100", "", "", "", "", "", + Path xlsx = writeDocs(tempDir, docRow("W-0100", "", "", "", "", "1916-06-01", "Juni 1916", "MONTH", "")); importer.load(xlsx.toFile()); @@ -425,7 +472,7 @@ class DocumentImporterTest { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); when(documentService.findByOriginalFilename("W-0101")).thenReturn(Optional.empty()); when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - Path xlsx = writeDocs(tempDir, docRow("W-0101", "", "", "", "", "", + Path xlsx = writeDocs(tempDir, docRow("W-0101", "", "", "", "", "1943-12-24", "24.12.1943", "DAY", "")); importer.load(xlsx.toFile()); @@ -439,7 +486,7 @@ class DocumentImporterTest { ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); when(documentService.findByOriginalFilename("W-0102")).thenReturn(Optional.empty()); when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - Path xlsx = writeDocs(tempDir, docRow("W-0102", "", "", "", "", "", + Path xlsx = writeDocs(tempDir, docRow("W-0102", "", "", "", "", "", "?", "UNKNOWN", "")); importer.load(xlsx.toFile()); @@ -450,12 +497,15 @@ class DocumentImporterTest { // ─── helpers ───────────────────────────────────────────────────────────────────── - private Map docRow(String index, String file, String senderId, String senderName, + private Boolean validIndex(String index) { + return ReflectionTestUtils.invokeMethod(importer, "isValidImportIndex", index); + } + + private Map docRow(String index, String senderId, String senderName, String receiverIds, String receiverNames, String dateIso, String dateRaw, String datePrecision, String dateEnd) { Map r = new LinkedHashMap<>(); r.put("index", index); - r.put("file", file); r.put("sender_person_id", senderId); r.put("sender_name", senderName); r.put("receiver_person_ids", receiverIds); @@ -471,7 +521,7 @@ class DocumentImporterTest { } private Map docRowWithTag(String index, String tagPath) { - Map r = docRow(index, "", "", "", "", "", "", "", "", ""); + Map r = docRow(index, "", "", "", "", "", "", "", ""); r.put("tags", tagPath); return r; } @@ -479,7 +529,7 @@ class DocumentImporterTest { @SafeVarargs private Path writeDocs(Path dir, Map... rows) throws Exception { Path xlsx = dir.resolve("canonical-documents.xlsx"); - List headers = List.of("index", "file", "sender_person_id", "sender_name", + List headers = List.of("index", "sender_person_id", "sender_name", "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"); try (XSSFWorkbook wb = new XSSFWorkbook()) { @@ -494,7 +544,7 @@ class DocumentImporterTest { row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), "")); } } - try (OutputStream out = Files.newOutputStream(xlsx)) { + try (java.io.OutputStream out = Files.newOutputStream(xlsx)) { wb.write(out); } }