diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java
index 085a4ff4..21494447 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java
@@ -28,7 +28,6 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
-import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.format.DateTimeParseException;
@@ -38,19 +37,22 @@ import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
-import java.util.stream.Stream;
/**
* Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
* semantic transformation: the normalizer already resolved people to slugs and dates to
* ISO values. This loader maps columns by header name, routes each attribution
* register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}),
- * parses clean dates, and keeps the file/S3/thumbnail plumbing.
+ * parses clean dates, and keeps the S3/thumbnail plumbing.
*
- *
The {@code file} value is hostile input regardless of upstream trust (CWE-22 does not
- * care that it came from our Python tool): its basename is validated with
- * {@link #isValidImportFilename} and then resolved with canonical-path containment in
- * {@link #findFileRecursive}.
+ *
The import corpus is uniform — every PDF is named {@code .pdf} flat in the import
+ * dir — so a document's PDF is resolved directly by its index:
+ * {@code importDir.resolve(index + ".pdf")}. The {@code index} is still hostile input
+ * regardless of upstream trust (CWE-22 does not care it came from our Python tool): it is
+ * validated against a strict catalog pattern with {@link #isValidImportIndex} (no path
+ * separators, no {@code .}/{@code ..}, no absolute path, no slash homoglyphs) and the
+ * resolved path is asserted to stay inside the import dir in {@link #resolvePdfByIndex} as
+ * defense-in-depth. The {@code %PDF} magic-byte check still gates upload.
*/
@Component
@RequiredArgsConstructor
@@ -58,9 +60,16 @@ import java.util.stream.Stream;
public class DocumentImporter {
static final List REQUIRED_HEADERS = List.of(
- "index", "file", "sender_person_id", "sender_name",
+ "index", "sender_person_id", "sender_name",
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision");
+ // Catalog index shape: 1–4 letters (ASCII + Latin-1 letters, e.g. the German "ü" in
+ // "Mü-0001"), one or more hyphens (the corpus has a few "C--0029" data-entry artefacts),
+ // digits, and an optional trailing "x" the normalizer recognises. Anchored, with no
+ // separator / dot / slash characters in the class, so ".pdf" can never traverse.
+ private static final java.util.regex.Pattern INDEX_PATTERN =
+ java.util.regex.Pattern.compile("[A-Za-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF]{1,4}-+\\d+x?");
+
private final DocumentService documentService;
private final PersonService personService;
private final TagService tagService;
@@ -87,9 +96,9 @@ public class DocumentImporter {
for (CanonicalSheetReader.Row row : rows) {
String index = row.get("index");
if (index.isBlank()) continue;
- Optional skipReason = importRow(row, index, skipped);
+ Optional skipReason = importRow(row, index);
if (skipReason.isPresent()) {
- skipped.add(new ImportStatus.SkippedFile(displayName(row, index), skipReason.get()));
+ skipped.add(new ImportStatus.SkippedFile(index, skipReason.get()));
} else {
processed++;
}
@@ -98,15 +107,12 @@ public class DocumentImporter {
return new LoadResult(processed, skipped);
}
- private Optional importRow(CanonicalSheetReader.Row row, String index,
- List skipped) {
- Optional resolved;
- try {
- resolved = resolveFile(row.get("file"));
- } catch (InvalidImportFilenameException e) {
- log.warn("Skipping import row {}: filename rejected", index);
+ private Optional importRow(CanonicalSheetReader.Row row, String index) {
+ if (!isValidImportIndex(index)) {
+ log.warn("Skipping import row: index rejected");
return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
}
+ Optional resolved = resolvePdfByIndex(index);
if (resolved.isPresent()) {
try {
if (!isPdfMagicBytes(resolved.get())) {
@@ -257,21 +263,6 @@ public class DocumentImporter {
// ─── file handling + S3 (small ≤20-line methods) ─────────────────────────────────
- private Optional resolveFile(String fileColumn) {
- if (fileColumn == null || fileColumn.isBlank()) return Optional.empty();
- String basename = basenameOf(fileColumn);
- if (!isValidImportFilename(basename)) {
- throw new InvalidImportFilenameException();
- }
- return findFileRecursive(basename);
- }
-
- private static String basenameOf(String fileColumn) {
- String normalized = fileColumn.replace('\\', '/');
- int lastSlash = normalized.lastIndexOf('/');
- return lastSlash < 0 ? normalized.trim() : normalized.substring(lastSlash + 1).trim();
- }
-
private String probeContentType(File file) {
try {
String probed = Files.probeContentType(file.toPath());
@@ -290,20 +281,23 @@ public class DocumentImporter {
RequestBody.fromFile(file));
}
- // ─── security guards — ported verbatim from MassImportService — do not weaken ────
+ // ─── index validation + containment — defense-in-depth, do not weaken ────────────
- private boolean isValidImportFilename(String filename) {
- if (filename == null || filename.isBlank()) return false;
- if (filename.contains("/")) return false;
- if (filename.contains("\\")) return false;
- if (filename.contains("∕")) return false; // U+2215 DIVISION SLASH
- if (filename.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS
- if (filename.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR
- if (filename.contains("..")) return false;
- if (filename.equals(".")) return false;
- if (filename.contains("\0")) return false;
- if (Paths.get(filename).isAbsolute()) return false;
- return true;
+ // The index is the only thing that drives the on-disk lookup, so it must never contain a
+ // path separator, traversal token, slash homoglyph, null byte, or absolute-path marker —
+ // each guard mirrors the filename guards ported from MassImportService — and it must match
+ // the strict catalog shape so anything unexpected is skipped loudly rather than read.
+ private boolean isValidImportIndex(String index) {
+ if (index == null || index.isBlank()) return false;
+ if (index.contains("/")) return false;
+ if (index.contains("\\")) return false;
+ if (index.contains("∕")) return false; // U+2215 DIVISION SLASH
+ if (index.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS
+ if (index.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR
+ if (index.contains(".")) return false; // no dots — ".pdf" is the only extension
+ if (index.contains("\0")) return false;
+ if (Paths.get(index).isAbsolute()) return false;
+ return INDEX_PATTERN.matcher(index).matches();
}
// package-private: a Mockito spy in tests can override to inject IOException
@@ -322,14 +316,14 @@ public class DocumentImporter {
}
}
- private Optional findFileRecursive(String filename) {
+ // O(1) direct lookup: the PDF is exactly importDir/.pdf. The caller has already
+ // validated the index shape; the canonical-path containment assertion below is
+ // defense-in-depth so even a symlinked .pdf cannot read outside importDir.
+ private Optional resolvePdfByIndex(String index) {
File baseDir = new File(importDir);
- try (Stream walk = Files.walk(baseDir.toPath())) {
- Optional match = walk.filter(p -> !Files.isDirectory(p))
- .filter(p -> p.getFileName().toString().equals(filename))
- .findFirst();
- if (match.isEmpty()) return Optional.empty();
- File candidate = match.get().toFile();
+ File candidate = baseDir.toPath().resolve(index + ".pdf").toFile();
+ try {
+ if (!candidate.isFile()) return Optional.empty();
String baseDirCanonical = baseDir.getCanonicalPath();
if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) {
throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate);
@@ -340,15 +334,7 @@ public class DocumentImporter {
}
}
- private static String displayName(CanonicalSheetReader.Row row, String index) {
- String file = row.get("file");
- return file.isBlank() ? index : basenameOf(file);
- }
-
private static String blankToNull(String s) {
return (s == null || s.isBlank()) ? null : s;
}
-
- private static final class InvalidImportFilenameException extends RuntimeException {
- }
}
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java
index bd2b8d8f..090ffe31 100644
--- a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java
@@ -140,12 +140,12 @@ class CanonicalImportIntegrationTest {
// Re-stage the document sheet with W-0001's receiver and tag removed.
writeSheet(artifactDir.resolve("canonical-documents.xlsx"),
- List.of("index", "file", "sender_person_id", "sender_name", "receiver_person_ids",
+ List.of("index", "sender_person_id", "sender_name", "receiver_person_ids",
"receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"),
List.of(
- List.of("W-0001", "", "de-gruyter-walter", "Walter de Gruyter",
+ List.of("W-0001", "de-gruyter-walter", "Walter de Gruyter",
"", "", "1888-02-15", "15.2.1888", "DAY", "", "Rotterdam", "", "Geschäftsreise"),
- List.of("W-0002", "", "de-gruyter-eugenie", "Eugenie de Gruyter",
+ List.of("W-0002", "de-gruyter-eugenie", "Eugenie de Gruyter",
"de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "",
"Middelburg", "Themen/Brautbriefe", "Reisepläne")));
@@ -196,13 +196,13 @@ class CanonicalImportIntegrationTest {
""");
writeSheet(dir.resolve("canonical-documents.xlsx"),
- List.of("index", "file", "sender_person_id", "sender_name", "receiver_person_ids",
+ List.of("index", "sender_person_id", "sender_name", "receiver_person_ids",
"receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"),
List.of(
- List.of("W-0001", "", "de-gruyter-walter", "Walter de Gruyter",
+ List.of("W-0001", "de-gruyter-walter", "Walter de Gruyter",
"de-gruyter-eugenie", "Eugenie de Gruyter", "1888-02-15", "15.2.1888", "DAY", "",
"Rotterdam", "Themen/Brautbriefe", "Geschäftsreise"),
- List.of("W-0002", "", "de-gruyter-eugenie", "Eugenie de Gruyter",
+ List.of("W-0002", "de-gruyter-eugenie", "Eugenie de Gruyter",
"de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "",
"Middelburg", "Themen/Brautbriefe", "Reisepläne")));
}
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java
index 99d7bd5c..c6d78df3 100644
--- a/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java
@@ -24,7 +24,6 @@ import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
import java.io.File;
-import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDate;
@@ -58,99 +57,149 @@ class DocumentImporterTest {
ReflectionTestUtils.setField(importer, "bucketName", "test-bucket");
}
- // ─── security regression — ported from MassImportServiceTest — do not remove ─────
+ // ─── index validation — a malicious/garbage index can never reach disk I/O ─────────
@Test
- void isValidImportFilename_returnsFalse_whenNull() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", (String) null)).isFalse();
+ void isValidImportIndex_returnsFalse_whenNull() {
+ assertThat(validIndex(null)).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenBlank() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", " ")).isFalse();
+ void isValidImportIndex_returnsFalse_whenBlank() {
+ assertThat(validIndex(" ")).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenForwardSlash() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "etc/passwd")).isFalse();
+ void isValidImportIndex_returnsFalse_whenForwardSlash() {
+ assertThat(validIndex("etc/passwd")).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenBackslash() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..\\etc\\passwd")).isFalse();
+ void isValidImportIndex_returnsFalse_whenBackslash() {
+ assertThat(validIndex("..\\etc\\passwd")).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenDotDot() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "doc..evil.pdf")).isFalse();
+ void isValidImportIndex_returnsFalse_whenDotDot() {
+ assertThat(validIndex("W-..0001")).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenIsDotDot() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..")).isFalse();
+ void isValidImportIndex_returnsFalse_whenIsDotDot() {
+ assertThat(validIndex("..")).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenAbsolutePath() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "/etc/passwd")).isFalse();
+ void isValidImportIndex_returnsFalse_whenSingleDot() {
+ assertThat(validIndex(".")).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenNullByte() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "file\0.pdf")).isFalse();
+ void isValidImportIndex_returnsFalse_whenAbsolutePath() {
+ assertThat(validIndex("/etc/passwd")).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenUnicodeDivisionSlash() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo∕bar.pdf")).isFalse();
+ void isValidImportIndex_returnsFalse_whenNullByte() {
+ assertThat(validIndex("W-0001\0")).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenFullwidthSlash() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo/bar.pdf")).isFalse();
+ void isValidImportIndex_returnsFalse_whenUnicodeDivisionSlash() {
+ assertThat(validIndex("W∕0001")).isFalse();
}
@Test
- void isValidImportFilename_returnsFalse_whenReverseSolidusOperator() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo⧵bar.pdf")).isFalse();
+ void isValidImportIndex_returnsFalse_whenFullwidthSlash() {
+ assertThat(validIndex("W/0001")).isFalse();
}
@Test
- void isValidImportFilename_returnsTrue_whenPlainBasename() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "document.pdf")).isTrue();
+ void isValidImportIndex_returnsFalse_whenReverseSolidusOperator() {
+ assertThat(validIndex("W⧵0001")).isFalse();
}
@Test
- void isValidImportFilename_returnsTrue_whenLeadingDot() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", ".hidden.pdf")).isTrue();
+ void isValidImportIndex_returnsFalse_whenContainsDotPdfExtension() {
+ // The index is the bare catalog id; appending ".pdf" is the importer's job. A dot in
+ // the index would let "W-0001.pdf" become "W-0001.pdf.pdf" or smuggle an extension.
+ assertThat(validIndex("W-0001.pdf")).isFalse();
}
@Test
- void isValidImportFilename_returnsTrue_whenHasSpaces() {
- assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "Brief an Oma.pdf")).isTrue();
+ void isValidImportIndex_returnsTrue_whenPlainCatalogIndex() {
+ assertThat(validIndex("W-0124")).isTrue();
}
@Test
- void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir(
- @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception {
- Path outsideFile = outsideDir.resolve("secret.pdf");
- Files.writeString(outsideFile, "sensitive");
- Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile);
- ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString());
-
- org.assertj.core.api.Assertions.assertThatThrownBy(
- () -> ReflectionTestUtils.invokeMethod(importer, "findFileRecursive", "secret.pdf"))
- .isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class);
+ void isValidImportIndex_returnsTrue_whenTwoLetterPrefix() {
+ assertThat(validIndex("Al-0001")).isTrue();
}
- // ─── path traversal in the file column cannot escape importDir ───────────────────
+ @Test
+ void isValidImportIndex_returnsTrue_whenThreeLetterPrefix() {
+ assertThat(validIndex("CuH-0010")).isTrue();
+ }
@Test
- void load_rejectsFileColumn_whenBasenameIsTraversalToken(@TempDir Path tempDir) throws Exception {
- // A file column whose basename is itself a traversal token must be rejected
- // outright, never used for disk I/O.
+ void isValidImportIndex_returnsTrue_whenUmlautPrefix() {
+ // Real corpus indices carry a German umlaut, e.g. "Mü-0001.pdf" exists on disk.
+ assertThat(validIndex("Mü-0001")).isTrue();
+ }
+
+ @Test
+ void isValidImportIndex_returnsTrue_whenDoubleHyphen() {
+ // Real corpus: "C--0029" appears in the spreadsheet (a data-entry artefact, but a
+ // legitimate catalog shape that must still resolve, not crash).
+ assertThat(validIndex("C--0029")).isTrue();
+ }
+
+ @Test
+ void isValidImportIndex_returnsTrue_whenXSuffix() {
+ // The normalizer recognises an x-suffix catalog id; allow it defensively.
+ assertThat(validIndex("W-0001x")).isTrue();
+ }
+
+ // ─── a valid index resolves to exactly importDir/.pdf within containment ─────
+
+ @Test
+ void load_resolvesPdfByIndex_uploadsToS3_andSetsStatusUploaded(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
- Path xlsx = writeDocs(tempDir, docRow("W-0001", "evil/..", "", "", "", "", "", "", "", ""));
+ byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D};
+ Files.write(tempDir.resolve("W-0124.pdf"), pdf);
+ when(documentService.findByOriginalFilename("W-0124")).thenReturn(Optional.empty());
+ when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
+ Path xlsx = writeDocs(tempDir, docRow("W-0124", "", "", "", "", "", "", "", ""));
+
+ importer.load(xlsx.toFile());
+
+ // exactly importDir/.pdf was uploaded — the S3 key carries that basename
+ org.mockito.ArgumentCaptor bodyCaptor = org.mockito.ArgumentCaptor.forClass(RequestBody.class);
+ verify(s3Client).putObject(any(PutObjectRequest.class), bodyCaptor.capture());
+ verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
+ d.getStatus() == DocumentStatus.UPLOADED
+ && d.getFilePath() != null
+ && d.getFilePath().endsWith("_W-0124.pdf")));
+ }
+
+ @Test
+ void load_yieldsPlaceholder_whenIndexedPdfMissing(@TempDir Path tempDir) throws Exception {
+ ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
+ when(documentService.findByOriginalFilename("X-9999")).thenReturn(Optional.empty());
+ when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
+ Path xlsx = writeDocs(tempDir, docRow("X-9999", "", "", "", "", "", "", "", ""));
+
+ importer.load(xlsx.toFile());
+
+ verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER));
+ verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
+ }
+
+ @Test
+ void load_rejectsMaliciousIndex_neverReadsOutsideImportDir(@TempDir Path tempDir) throws Exception {
+ // An index with a path separator must be skipped outright, never used for disk I/O.
+ ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
+ Path xlsx = writeDocs(tempDir, docRow("../../etc/cron.d/x", "", "", "", "", "", "", "", ""));
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
@@ -158,22 +207,35 @@ class DocumentImporterTest {
.extracting(ImportStatus.SkippedFile::reason)
.containsExactly(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
verify(documentService, never()).save(any());
+ verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
}
@Test
- void load_traversalFileColumn_cannotEscapeImportDir_yieldsPlaceholder(@TempDir Path tempDir) throws Exception {
- // ../../etc/cron.d/x reduces to basename "x"; the disk lookup is confined to
- // importDir, so no file is found, nothing is uploaded, and the row becomes a
- // metadata-only PLACEHOLDER — the file outside importDir is never read.
+ void resolvePdfByIndex_throwsWhenResolvedPathEscapesImportDir_viaSymlink(
+ @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception {
+ // Containment defense-in-depth: even a syntactically valid index whose .pdf is a
+ // symlink pointing outside importDir must be refused — the resolved canonical path is
+ // asserted to stay inside importDir.
+ Path outsideFile = outsideDir.resolve("secret.pdf");
+ Files.writeString(outsideFile, "sensitive");
+ Files.createSymbolicLink(importDirPath.resolve("W-0001.pdf"), outsideFile);
+ ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString());
+
+ org.assertj.core.api.Assertions.assertThatThrownBy(
+ () -> ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "W-0001"))
+ .isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class);
+ }
+
+ @Test
+ void resolvePdfByIndex_returnsExactlyImportDirIndexPdf_whenPresent(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
- when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
- when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
- Path xlsx = writeDocs(tempDir, docRow("W-0001", "../../etc/cron.d/x", "", "", "", "", "", "", "", ""));
+ Path expected = tempDir.resolve("Eu-0628.pdf");
+ Files.writeString(expected, "%PDF-1.4");
- importer.load(xlsx.toFile());
+ Optional resolved = ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "Eu-0628");
- verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
- verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER));
+ assertThat(resolved).isPresent();
+ assertThat(resolved.get().getCanonicalFile()).isEqualTo(expected.toFile().getCanonicalFile());
}
// ─── PDF magic-byte guard — ported — do not remove ──────────────────────────────
@@ -183,7 +245,7 @@ class DocumentImporterTest {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Files.writeString(tempDir.resolve("W-0001.pdf"), "not a pdf");
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
- Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
+ Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", ""));
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
@@ -198,7 +260,7 @@ class DocumentImporterTest {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Files.writeString(tempDir.resolve("W-0001.pdf"), "content");
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
- Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
+ Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", ""));
DocumentImporter spyImporter = org.mockito.Mockito.spy(importer);
org.mockito.Mockito.doThrow(new java.io.IOException("read error"))
@@ -217,7 +279,7 @@ class DocumentImporterTest {
Document existing = Document.builder().id(UUID.randomUUID())
.originalFilename("W-0001").status(DocumentStatus.UPLOADED).build();
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.of(existing));
- Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "", ""));
+ Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", ""));
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
@@ -227,29 +289,14 @@ class DocumentImporterTest {
verify(documentService, never()).save(any());
}
- // ─── file column drives status: present → UPLOADED, empty → PLACEHOLDER ───────────
+ // ─── presence of importDir/.pdf drives status: present → UPLOADED, absent → PLACEHOLDER ─
@Test
- void load_uploadsToS3_andSetsStatusUploaded_whenFilePresent(@TempDir Path tempDir) throws Exception {
- ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
- byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D};
- Files.write(tempDir.resolve("W-0001.pdf"), pdf);
- when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
- when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
- Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
-
- importer.load(xlsx.toFile());
-
- verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class));
- verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.UPLOADED));
- }
-
- @Test
- void load_setsStatusPlaceholder_whenFileColumnEmpty(@TempDir Path tempDir) throws Exception {
+ void load_setsStatusPlaceholder_whenNoIndexedPdf(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0099")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
- Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", "", ""));
+ Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", ""));
importer.load(xlsx.toFile());
@@ -267,7 +314,7 @@ class DocumentImporterTest {
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(personService.findBySourceRef("de-gruyter-walter")).thenReturn(Optional.of(walter));
- Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "de-gruyter-walter", "Walter de Gruyter",
+ Path xlsx = writeDocs(tempDir, docRow("W-0001", "de-gruyter-walter", "Walter de Gruyter",
"", "", "", "", "", ""));
importer.load(xlsx.toFile());
@@ -285,7 +332,7 @@ class DocumentImporterTest {
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(personService.findBySourceRef("schwester-hanni")).thenReturn(Optional.empty());
when(personService.upsertBySourceRef(any())).thenReturn(provisional);
- Path xlsx = writeDocs(tempDir, docRow("W-0002", "", "schwester-hanni", "Schwester Hanni",
+ Path xlsx = writeDocs(tempDir, docRow("W-0002", "schwester-hanni", "Schwester Hanni",
"", "", "", "", "", ""));
importer.load(xlsx.toFile());
@@ -302,7 +349,7 @@ class DocumentImporterTest {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0003")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
- Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "", "?",
+ Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "?",
"", "", "", "", "", ""));
importer.load(xlsx.toFile());
@@ -322,7 +369,7 @@ class DocumentImporterTest {
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(personService.findBySourceRef("cram-herbert")).thenReturn(Optional.of(herbert));
when(personService.findBySourceRef("clara")).thenReturn(Optional.of(clara));
- Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "", "",
+ Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "",
"cram-herbert|clara", "Herbert Cram|Clara", "", "", "", ""));
importer.load(xlsx.toFile());
@@ -341,7 +388,7 @@ class DocumentImporterTest {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0005")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
- Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "", "",
+ Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "",
"", "", "1916-06-01", "1.6.1916", "MONTH", ""));
importer.load(xlsx.toFile());
@@ -375,7 +422,7 @@ class DocumentImporterTest {
.originalFilename("W-0007").status(DocumentStatus.PLACEHOLDER).build();
when(documentService.findByOriginalFilename("W-0007")).thenReturn(Optional.of(existing));
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
- Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", "", ""));
+ Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", ""));
importer.load(xlsx.toFile());
@@ -396,7 +443,7 @@ class DocumentImporterTest {
when(documentService.findByOriginalFilename("W-0008")).thenReturn(Optional.of(existing));
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
// The canonical row now carries no receiver and no tag: both stale links must go.
- Path xlsx = writeDocs(tempDir, docRow("W-0008", "", "", "", "", "", "", "", "", ""));
+ Path xlsx = writeDocs(tempDir, docRow("W-0008", "", "", "", "", "", "", "", ""));
importer.load(xlsx.toFile());
@@ -411,7 +458,7 @@ class DocumentImporterTest {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0100")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
- Path xlsx = writeDocs(tempDir, docRow("W-0100", "", "", "", "", "",
+ Path xlsx = writeDocs(tempDir, docRow("W-0100", "", "", "", "",
"1916-06-01", "Juni 1916", "MONTH", ""));
importer.load(xlsx.toFile());
@@ -425,7 +472,7 @@ class DocumentImporterTest {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0101")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
- Path xlsx = writeDocs(tempDir, docRow("W-0101", "", "", "", "", "",
+ Path xlsx = writeDocs(tempDir, docRow("W-0101", "", "", "", "",
"1943-12-24", "24.12.1943", "DAY", ""));
importer.load(xlsx.toFile());
@@ -439,7 +486,7 @@ class DocumentImporterTest {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0102")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
- Path xlsx = writeDocs(tempDir, docRow("W-0102", "", "", "", "", "",
+ Path xlsx = writeDocs(tempDir, docRow("W-0102", "", "", "", "",
"", "?", "UNKNOWN", ""));
importer.load(xlsx.toFile());
@@ -450,12 +497,15 @@ class DocumentImporterTest {
// ─── helpers ─────────────────────────────────────────────────────────────────────
- private Map docRow(String index, String file, String senderId, String senderName,
+ private Boolean validIndex(String index) {
+ return ReflectionTestUtils.invokeMethod(importer, "isValidImportIndex", index);
+ }
+
+ private Map docRow(String index, String senderId, String senderName,
String receiverIds, String receiverNames, String dateIso,
String dateRaw, String datePrecision, String dateEnd) {
Map r = new LinkedHashMap<>();
r.put("index", index);
- r.put("file", file);
r.put("sender_person_id", senderId);
r.put("sender_name", senderName);
r.put("receiver_person_ids", receiverIds);
@@ -471,7 +521,7 @@ class DocumentImporterTest {
}
private Map docRowWithTag(String index, String tagPath) {
- Map r = docRow(index, "", "", "", "", "", "", "", "", "");
+ Map r = docRow(index, "", "", "", "", "", "", "", "");
r.put("tags", tagPath);
return r;
}
@@ -479,7 +529,7 @@ class DocumentImporterTest {
@SafeVarargs
private Path writeDocs(Path dir, Map... rows) throws Exception {
Path xlsx = dir.resolve("canonical-documents.xlsx");
- List headers = List.of("index", "file", "sender_person_id", "sender_name",
+ List headers = List.of("index", "sender_person_id", "sender_name",
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision",
"date_end", "location", "tags", "summary");
try (XSSFWorkbook wb = new XSSFWorkbook()) {
@@ -494,7 +544,7 @@ class DocumentImporterTest {
row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), ""));
}
}
- try (OutputStream out = Files.newOutputStream(xlsx)) {
+ try (java.io.OutputStream out = Files.newOutputStream(xlsx)) {
wb.write(out);
}
}