diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java
new file mode 100644
index 00000000..fb021d0f
--- /dev/null
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java
@@ -0,0 +1,363 @@
+package org.raddatz.familienarchiv.importing;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.raddatz.familienarchiv.document.DatePrecision;
+import org.raddatz.familienarchiv.document.Document;
+import org.raddatz.familienarchiv.document.DocumentService;
+import org.raddatz.familienarchiv.document.DocumentStatus;
+import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner;
+import org.raddatz.familienarchiv.exception.DomainException;
+import org.raddatz.familienarchiv.exception.ErrorCode;
+import org.raddatz.familienarchiv.person.Person;
+import org.raddatz.familienarchiv.person.PersonService;
+import org.raddatz.familienarchiv.person.PersonType;
+import org.raddatz.familienarchiv.person.PersonUpsertCommand;
+import org.raddatz.familienarchiv.tag.Tag;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+import org.springframework.transaction.annotation.Transactional;
+import software.amazon.awssdk.core.sync.RequestBody;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.model.PutObjectRequest;
+
+import org.raddatz.familienarchiv.tag.TagService;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UncheckedIOException;
+import java.nio.file.Files;
+import java.nio.file.InvalidPathException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.UUID;
+import java.util.stream.Stream;
+
+/**
+ * Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
+ * semantic transformation: the normalizer already resolved people to slugs and dates to
+ * ISO values. This loader maps columns by header name, routes each attribution
+ * register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}),
+ * parses clean dates, and keeps the file/S3/thumbnail plumbing.
+ *
+ * <p>The {@code file} value is hostile input regardless of upstream trust (CWE-22 does not
+ * care that it came from our Python tool): its basename is validated with
+ * {@link #isValidImportFilename} and then resolved with canonical-path containment in
+ * {@link #findFileRecursive}.
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class DocumentImporter {
+
+    static final List<String> REQUIRED_HEADERS = List.of(
+            "index", "file", "sender_person_id", "sender_name",
+            "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision");
+
+    private final DocumentService documentService;
+    private final PersonService personService;
+    private final TagService tagService;
+    private final S3Client s3Client;
+    private final ThumbnailAsyncRunner thumbnailAsyncRunner;
+
+    @Value("${app.s3.bucket:familienarchiv}")
+    private String bucketName;
+
+    @Value("${app.import.dir:/import}")
+    private String importDir;
+
+    /** Outcome of loading the document sheet: processed count + per-file skips. */
+    public record LoadResult(int processed, List<ImportStatus.SkippedFile> skippedFiles) {}
+
+    /**
+     * Imports every row of the canonical sheet that carries a non-blank {@code index}.
+     *
+     * @param artifact the {@code canonical-documents.xlsx} workbook to read
+     * @return the count of persisted rows plus one entry per skipped row
+     */
+    public LoadResult load(File artifact) {
+        List<CanonicalSheetReader.Row> rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS);
+        int processed = 0;
+        List<ImportStatus.SkippedFile> skipped = new ArrayList<>();
+        for (CanonicalSheetReader.Row row : rows) {
+            String index = row.get("index");
+            if (index.isBlank()) continue;
+            Optional<ImportStatus.SkipReason> skipReason = importRow(row, index);
+            if (skipReason.isPresent()) {
+                skipped.add(new ImportStatus.SkippedFile(displayName(row, index), skipReason.get()));
+            } else {
+                processed++;
+            }
+        }
+        log.info("Imported {} documents from {} ({} skipped)", processed, artifact.getName(), skipped.size());
+        return new LoadResult(processed, skipped);
+    }
+
+    /**
+     * Runs the file guards for one row (filename validation, PDF signature) and then persists it.
+     *
+     * @return the reason the row was skipped, or empty when it was persisted
+     */
+    private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index) {
+        Optional<File> resolved;
+        try {
+            resolved = resolveFile(row.get("file"));
+        } catch (InvalidImportFilenameException e) {
+            log.warn("Skipping import row {}: filename rejected", index);
+            return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
+        }
+        if (resolved.isPresent()) {
+            try {
+                if (!isPdfMagicBytes(resolved.get())) {
+                    return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
+                }
+            } catch (IOException e) {
+                log.error("Magic-byte check failed for row {}", index, e);
+                return Optional.of(ImportStatus.SkipReason.FILE_READ_ERROR);
+            }
+        }
+        return persist(row, index, resolved);
+    }
+
+    /**
+     * Uploads the row's file (if any) to S3 and saves the document, creating it or updating an
+     * existing {@code PLACEHOLDER} with the same index in place.
+     *
+     * <p>NOTE(review): this method is only reached through {@code this.persist(...)} in
+     * {@link #importRow}. Proxy-based {@code @Transactional} does not intercept self-invocation,
+     * so confirm a transaction is really active here before relying on
+     * {@code dispatchAfterCommit}.
+     *
+     * @return the reason the row was skipped, or empty when the document was saved
+     */
+    @Transactional
+    protected Optional<ImportStatus.SkipReason> persist(CanonicalSheetReader.Row row, String index, Optional<File> file) {
+        Document existing = documentService.findByOriginalFilename(index).orElse(null);
+        if (existing != null && existing.getStatus() != DocumentStatus.PLACEHOLDER) {
+            return Optional.of(ImportStatus.SkipReason.ALREADY_EXISTS);
+        }
+
+        String s3Key = null;
+        String contentType = null;
+        DocumentStatus status = DocumentStatus.PLACEHOLDER;
+        if (file.isPresent()) {
+            contentType = probeContentType(file.get());
+            s3Key = "documents/" + UUID.randomUUID() + "_" + file.get().getName();
+            try {
+                uploadToS3(file.get(), s3Key, contentType);
+                status = DocumentStatus.UPLOADED;
+            } catch (Exception e) {
+                log.error("S3 upload failed for {}", file.get().getName(), e);
+                return Optional.of(ImportStatus.SkipReason.S3_UPLOAD_FAILED);
+            }
+        }
+
+        Document doc = buildDocument(row, index, existing, s3Key, contentType, status);
+        Document saved = documentService.save(doc);
+        if (file.isPresent()) {
+            thumbnailAsyncRunner.dispatchAfterCommit(saved.getId());
+        }
+        return Optional.empty();
+    }
+
+    /** Copies the row's canonical values onto a new document or onto {@code existing}. */
+    private Document buildDocument(CanonicalSheetReader.Row row, String index, Document existing,
+                                   String s3Key, String contentType, DocumentStatus status) {
+        Document doc = existing != null ? existing
+                : Document.builder().originalFilename(index).build();
+
+        String senderName = row.get("sender_name");
+        String receiverNames = row.get("receiver_names");
+        Person sender = resolveSender(row.get("sender_person_id"), senderName);
+        Set<Person> receivers = resolveReceivers(row.get("receiver_person_ids"));
+
+        doc.setTitle(index);
+        doc.setStatus(status);
+        doc.setFilePath(s3Key);
+        doc.setContentType(contentType);
+        doc.setSender(sender);
+        doc.setSenderText(blankToNull(senderName));
+        doc.getReceivers().addAll(receivers);
+        doc.setReceiverText(blankToNull(receiverNames));
+        doc.setDocumentDate(parseIsoDate(row.get("date_iso")));
+        doc.setMetaDatePrecision(parsePrecision(row.get("date_precision")));
+        doc.setMetaDateEnd(parseIsoDate(row.get("date_end")));
+        doc.setMetaDateRaw(blankToNull(row.get("date_raw")));
+        doc.setLocation(blankToNull(row.get("location")));
+        doc.setSummary(blankToNull(row.get("summary")));
+        attachTag(doc, row.get("tags"));
+        doc.setMetadataComplete(doc.getDocumentDate() != null || sender != null || !receivers.isEmpty());
+        return doc;
+    }
+
+    // ─── attribution routing — register-first, always retain raw ─────────────────────
+
+    private Person resolveSender(String slug, String rawName) {
+        if (slug.isBlank()) return null;
+        return resolvePerson(slug, rawName);
+    }
+
+    private Set<Person> resolveReceivers(String slugs) {
+        Set<Person> receivers = new LinkedHashSet<>();
+        for (String slug : CanonicalSheetReader.splitList(slugs)) {
+            receivers.add(resolvePerson(slug, slug));
+        }
+        return receivers;
+    }
+
+    /** Looks the slug up in the register; creates a provisional person when it is unknown. */
+    private Person resolvePerson(String slug, String rawName) {
+        return personService.findBySourceRef(slug)
+                .orElseGet(() -> personService.upsertBySourceRef(PersonUpsertCommand.builder()
+                        .sourceRef(slug)
+                        .lastName(blankToNull(rawName) == null ? slug : rawName)
+                        .personType(PersonType.PERSON)
+                        .provisional(true)
+                        .build()));
+    }
+
+    private void attachTag(Document doc, String tagPath) {
+        if (tagPath.isBlank()) return;
+        tagService.findBySourceRef(tagPath).ifPresent(tag -> doc.getTags().add(tag));
+    }
+
+    // ─── clean-value parsing (no semantic logic) ─────────────────────────────────────
+
+    private static LocalDate parseIsoDate(String value) {
+        if (value == null || value.isBlank()) return null;
+        try {
+            return LocalDate.parse(value.trim());
+        } catch (DateTimeParseException e) {
+            return null;
+        }
+    }
+
+    private static DatePrecision parsePrecision(String value) {
+        if (value == null || value.isBlank()) return DatePrecision.UNKNOWN;
+        try {
+            return DatePrecision.valueOf(value.trim());
+        } catch (IllegalArgumentException e) {
+            return DatePrecision.UNKNOWN;
+        }
+    }
+
+    // ─── file handling + S3 (small ≤20-line methods) ─────────────────────────────────
+
+    private Optional<File> resolveFile(String fileColumn) {
+        if (fileColumn == null || fileColumn.isBlank()) return Optional.empty();
+        String basename = basenameOf(fileColumn);
+        if (!isValidImportFilename(basename)) {
+            throw new InvalidImportFilenameException();
+        }
+        return findFileRecursive(basename);
+    }
+
+    private static String basenameOf(String fileColumn) {
+        String normalized = fileColumn.replace('\\', '/');
+        int lastSlash = normalized.lastIndexOf('/');
+        return lastSlash < 0 ? normalized.trim() : normalized.substring(lastSlash + 1).trim();
+    }
+
+    private String probeContentType(File file) {
+        try {
+            String probed = Files.probeContentType(file.toPath());
+            return probed != null ? probed : "application/octet-stream";
+        } catch (IOException e) {
+            return "application/octet-stream";
+        }
+    }
+
+    private void uploadToS3(File file, String s3Key, String contentType) {
+        s3Client.putObject(PutObjectRequest.builder()
+                        .bucket(bucketName)
+                        .key(s3Key)
+                        .contentType(contentType)
+                        .build(),
+                RequestBody.fromFile(file));
+    }
+
+    // ─── security guards — ported verbatim from MassImportService — do not weaken ────
+
+    private boolean isValidImportFilename(String filename) {
+        if (filename == null || filename.isBlank()) return false;
+        if (filename.contains("/")) return false;
+        if (filename.contains("\\")) return false;
+        if (filename.contains("\u2215")) return false; // U+2215 DIVISION SLASH
+        if (filename.contains("\uFF0F")) return false; // U+FF0F FULLWIDTH SOLIDUS
+        if (filename.contains("\u29F5")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR
+        if (filename.contains("..")) return false;
+        if (filename.equals(".")) return false;
+        if (filename.contains("\0")) return false;
+        try {
+            if (Paths.get(filename).isAbsolute()) return false;
+        } catch (InvalidPathException e) {
+            // A name the filesystem cannot parse is rejected instead of aborting the whole import.
+            return false;
+        }
+        return true;
+    }
+
+    // package-private: a Mockito spy in tests can override to inject IOException
+    InputStream openFileStream(File file) throws IOException {
+        return new FileInputStream(file);
+    }
+
+    private boolean isPdfMagicBytes(File file) throws IOException {
+        try (InputStream is = openFileStream(file)) {
+            byte[] header = is.readNBytes(4);
+            return header.length == 4
+                    && header[0] == 0x25 // %
+                    && header[1] == 0x50 // P
+                    && header[2] == 0x44 // D
+                    && header[3] == 0x46; // F
+        }
+    }
+
+    /**
+     * Finds the first non-directory entry named {@code filename} anywhere below the import directory.
+     *
+     * @return the file, or empty when none matches or the directory cannot be traversed
+     * @throws DomainException when the match resolves (e.g. via symlink) outside the import directory
+     */
+    private Optional<File> findFileRecursive(String filename) {
+        File baseDir = new File(importDir);
+        try (Stream<Path> walk = Files.walk(baseDir.toPath())) {
+            Optional<Path> match = walk.filter(p -> !Files.isDirectory(p))
+                    .filter(p -> p.getFileName().toString().equals(filename))
+                    .findFirst();
+            if (match.isEmpty()) return Optional.empty();
+            File candidate = match.get().toFile();
+            String baseDirCanonical = baseDir.getCanonicalPath();
+            if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) {
+                throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate);
+            }
+            return Optional.of(candidate);
+        } catch (IOException | UncheckedIOException e) {
+            // Files.walk reports traversal failures lazily as UncheckedIOException; either way the
+            // row degrades to "no file found", but the cause is logged instead of being swallowed.
+            log.warn("Could not search import dir {}", importDir, e);
+            return Optional.empty();
+        }
+    }
+
+    private static String displayName(CanonicalSheetReader.Row row, String index) {
+        String file = row.get("file");
+        return file.isBlank() ? index : basenameOf(file);
+    }
+
+    private static String blankToNull(String s) {
+        return (s == null || s.isBlank()) ? null : s;
+    }
+
+    private static final class InvalidImportFilenameException extends RuntimeException {
+    }
+}
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java
new file mode 100644
index 00000000..ae21adc2
--- /dev/null
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java
@@ -0,0 +1,50 @@
+package org.raddatz.familienarchiv.importing;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import io.swagger.v3.oas.annotations.media.Schema;
+
+import java.time.LocalDateTime;
+import java.util.List;
+
+/**
+ * Async import state surfaced to {@code admin/system/ImportStatusCard.svelte} via the
+ * generated types. The shape ({@code state, statusCode, processed, skippedFiles, skipped})
+ * is kept verbatim from the retired MassImportService so the admin UI keeps working.
+ */
+public record ImportStatus(
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) State state,
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String statusCode,
+        @JsonIgnore String message,
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) int processed,
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) List<SkippedFile> skippedFiles,
+        LocalDateTime startedAt
+) {
+
+    public enum State { IDLE, RUNNING, DONE, FAILED }
+
+    public enum SkipReason {
+        INVALID_FILENAME_PATH_TRAVERSAL,
+        INVALID_PDF_SIGNATURE,
+        FILE_READ_ERROR,
+        ALREADY_EXISTS,
+        S3_UPLOAD_FAILED
+    }
+
+    public record SkippedFile(
+            @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String filename,
+            @Schema(requiredMode = Schema.RequiredMode.REQUIRED) SkipReason reason
+    ) {}
+
+    // Note: @Schema on a record accessor method is not picked up by SpringDoc; the
+    // "skipped" count is a computed convenience field derived from skippedFiles.size().
+    @JsonProperty("skipped")
+    public int skipped() {
+        return skippedFiles.size();
+    }
+
+    /** Defensive-copy constructor — callers cannot mutate the stored list after construction. */
+    public ImportStatus {
+        skippedFiles = List.copyOf(skippedFiles);
+    }
+}
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java
index d02dcef8..6ad17454 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java
@@ -80,6 +80,11 @@ public class PersonService {
         return personRepository.findByFirstNameIgnoreCaseAndLastNameIgnoreCase(firstName, lastName);
     }
 
+    /** Lookup by the normalizer person_id — used by the canonical importer for register-first matching. */
+    public Optional<Person> findBySourceRef(String sourceRef) {
+        return personRepository.findBySourceRef(sourceRef);
+    }
+
     @Nullable
     @Transactional
     public Person findOrCreateByAlias(String rawName) {
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java
index 46a25712..14e1e9fa 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java
@@ -7,6 +7,7 @@ import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import java.util.UUID;
 import java.util.stream.Collectors;
@@ -49,6 +50,11 @@ public class TagService {
                 .orElseThrow(() -> DomainException.notFound(ErrorCode.TAG_NOT_FOUND, "Tag not found: " + id));
     }
 
+    /** Lookup by the canonical tag_path — used by the canonical importer to attach a document's tag. */
+    public Optional<Tag> findBySourceRef(String sourceRef) {
+        return tagRepository.findBySourceRef(sourceRef);
+    }
+
     public Tag findOrCreate(String name) {
         String cleanName = name.trim();
         return tagRepository.findByNameIgnoreCase(cleanName)
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java
new file mode 100644
index 00000000..bdcaa76b
--- /dev/null
+++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java
@@ -0,0 +1,437 @@
+package org.raddatz.familienarchiv.importing;
+
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.api.io.TempDir;
+import org.mockito.Mock;
+import 
org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.document.Document; +import org.raddatz.familienarchiv.document.DocumentService; +import org.raddatz.familienarchiv.document.DocumentStatus; +import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.raddatz.familienarchiv.tag.Tag; +import org.raddatz.familienarchiv.tag.TagService; +import org.springframework.test.util.ReflectionTestUtils; +import software.amazon.awssdk.core.sync.RequestBody; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; + +import java.io.File; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.LocalDate; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.lenient; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class DocumentImporterTest { + + @Mock DocumentService documentService; + @Mock PersonService personService; + @Mock TagService tagService; + @Mock S3Client s3Client; + @Mock ThumbnailAsyncRunner thumbnailAsyncRunner; + + DocumentImporter importer; + + @BeforeEach + void setUp() { + importer = new DocumentImporter(documentService, personService, tagService, s3Client, thumbnailAsyncRunner); + ReflectionTestUtils.setField(importer, "bucketName", "test-bucket"); + } + + // ─── security regression — ported from MassImportServiceTest — do not remove ───── + + @Test + void isValidImportFilename_returnsFalse_whenNull() { + 
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", (String) null)).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenBlank() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", " ")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenForwardSlash() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "etc/passwd")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenBackslash() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..\\etc\\passwd")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenDotDot() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "doc..evil.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenIsDotDot() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenAbsolutePath() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "/etc/passwd")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenNullByte() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "file\0.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenUnicodeDivisionSlash() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo∕bar.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenFullwidthSlash() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo/bar.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenReverseSolidusOperator() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, 
"isValidImportFilename", "foo⧵bar.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsTrue_whenPlainBasename() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "document.pdf")).isTrue(); + } + + @Test + void isValidImportFilename_returnsTrue_whenLeadingDot() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", ".hidden.pdf")).isTrue(); + } + + @Test + void isValidImportFilename_returnsTrue_whenHasSpaces() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "Brief an Oma.pdf")).isTrue(); + } + + @Test + void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir( + @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception { + Path outsideFile = outsideDir.resolve("secret.pdf"); + Files.writeString(outsideFile, "sensitive"); + Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile); + ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString()); + + org.assertj.core.api.Assertions.assertThatThrownBy( + () -> ReflectionTestUtils.invokeMethod(importer, "findFileRecursive", "secret.pdf")) + .isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class); + } + + // ─── path traversal in the file column cannot escape importDir ─────────────────── + + @Test + void load_rejectsFileColumn_whenBasenameIsTraversalToken(@TempDir Path tempDir) throws Exception { + // A file column whose basename is itself a traversal token must be rejected + // outright, never used for disk I/O. 
+ ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "evil/..", "", "", "", "", "", "", "", "")); + + DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); + verify(documentService, never()).save(any()); + } + + @Test + void load_traversalFileColumn_cannotEscapeImportDir_yieldsPlaceholder(@TempDir Path tempDir) throws Exception { + // ../../etc/cron.d/x reduces to basename "x"; the disk lookup is confined to + // importDir, so no file is found, nothing is uploaded, and the row becomes a + // metadata-only PLACEHOLDER — the file outside importDir is never read. + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "../../etc/cron.d/x", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER)); + } + + // ─── PDF magic-byte guard — ported — do not remove ────────────────────────────── + + @Test + void load_skipsFile_whenNotPdfMagicBytes(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Files.writeString(tempDir.resolve("W-0001.pdf"), "not a pdf"); + lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", "")); + + DocumentImporter.LoadResult result = 
importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + } + + @Test + void load_skipsFile_whenMagicByteCheckThrowsIoException(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Files.writeString(tempDir.resolve("W-0001.pdf"), "content"); + lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", "")); + + DocumentImporter spyImporter = org.mockito.Mockito.spy(importer); + org.mockito.Mockito.doThrow(new java.io.IOException("read error")) + .when(spyImporter).openFileStream(any(File.class)); + + DocumentImporter.LoadResult result = spyImporter.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.FILE_READ_ERROR); + } + + @Test + void load_skipsAlreadyExists_whenDocumentUploadedNotPlaceholder(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Document existing = Document.builder().id(UUID.randomUUID()) + .originalFilename("W-0001").status(DocumentStatus.UPLOADED).build(); + when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.of(existing)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "", "")); + + DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.ALREADY_EXISTS); + verify(documentService, never()).save(any()); + } + + // ─── file column drives status: present → UPLOADED, empty → 
PLACEHOLDER ─────────── + + @Test + void load_uploadsToS3_andSetsStatusUploaded_whenFilePresent(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D}; + Files.write(tempDir.resolve("W-0001.pdf"), pdf); + when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.UPLOADED)); + } + + @Test + void load_setsStatusPlaceholder_whenFileColumnEmpty(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0099")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER)); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + } + + // ─── attribution routing — register-first + always retain raw ──────────────────── + + @Test + void load_linksRegisterSender_andRetainsRawSenderText(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person walter = Person.builder().id(UUID.randomUUID()).sourceRef("de-gruyter-walter") + .firstName("Walter").lastName("de Gruyter").build(); + 
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("de-gruyter-walter")).thenReturn(Optional.of(walter)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "de-gruyter-walter", "Walter de Gruyter", + "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getSender() == walter && "Walter de Gruyter".equals(d.getSenderText()))); + } + + @Test + void load_createsProvisionalSender_whenSlugUnmatchedInRegister(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person provisional = Person.builder().id(UUID.randomUUID()).sourceRef("schwester-hanni") + .lastName("Schwester Hanni").provisional(true).build(); + when(documentService.findByOriginalFilename("W-0002")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("schwester-hanni")).thenReturn(Optional.empty()); + when(personService.upsertBySourceRef(any())).thenReturn(provisional); + Path xlsx = writeDocs(tempDir, docRow("W-0002", "", "schwester-hanni", "Schwester Hanni", + "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + org.mockito.ArgumentCaptor captor = + org.mockito.ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + assertThat(captor.getValue().provisional()).isTrue(); + assertThat(captor.getValue().lastName()).isEqualTo("Schwester Hanni"); + } + + @Test + void load_createsNoSenderPerson_whenSlugEmptyButRawPresent(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0003")).thenReturn(Optional.empty()); + 
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "", "?", + "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(personService, never()).findBySourceRef(any()); + verify(personService, never()).upsertBySourceRef(any()); + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getSender() == null && "?".equals(d.getSenderText()))); + } + + @Test + void load_splitsMultipleReceivers_andRetainsRawReceiverText(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person herbert = Person.builder().id(UUID.randomUUID()).sourceRef("cram-herbert").lastName("Cram").build(); + Person clara = Person.builder().id(UUID.randomUUID()).sourceRef("clara").lastName("Clara").build(); + when(documentService.findByOriginalFilename("W-0004")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("cram-herbert")).thenReturn(Optional.of(herbert)); + when(personService.findBySourceRef("clara")).thenReturn(Optional.of(clara)); + Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "", "", + "cram-herbert|clara", "Herbert Cram|Clara", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getReceivers().size() == 2 + && d.getReceivers().contains(herbert) + && d.getReceivers().contains(clara) + && "Herbert Cram|Clara".equals(d.getReceiverText()))); + } + + // ─── clean date values parse without semantic logic ────────────────────────────── + + @Test + void load_parsesCleanDateAndPrecision(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0005")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> 
inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "", "", + "", "", "1916-06-01", "1.6.1916", "MONTH", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + LocalDate.of(1916, 6, 1).equals(d.getDocumentDate()) + && d.getMetaDatePrecision() == org.raddatz.familienarchiv.document.DatePrecision.MONTH + && "1.6.1916".equals(d.getMetaDateRaw()))); + } + + @Test + void load_attachesTagBySourceRef(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Tag tag = Tag.builder().id(UUID.randomUUID()).name("Brautbriefe").sourceRef("Themen/Brautbriefe").build(); + when(documentService.findByOriginalFilename("W-0006")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(tagService.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.of(tag)); + Path xlsx = writeDocs(tempDir, docRowWithTag("W-0006", "Themen/Brautbriefe")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getTags().contains(tag))); + } + + // ─── idempotency — update existing document in place by index ───────────────────── + + @Test + void load_updatesExistingDocumentInPlace_whenIndexExists(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Document existing = Document.builder().id(UUID.randomUUID()) + .originalFilename("W-0007").status(DocumentStatus.PLACEHOLDER).build(); + when(documentService.findByOriginalFilename("W-0007")).thenReturn(Optional.of(existing)); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getId().equals(existing.getId()))); + } 
+ + // ─── helpers ───────────────────────────────────────────────────────────────────── + + private Map docRow(String index, String file, String senderId, String senderName, + String receiverIds, String receiverNames, String dateIso, + String dateRaw, String datePrecision, String dateEnd) { + Map r = new LinkedHashMap<>(); + r.put("index", index); + r.put("file", file); + r.put("sender_person_id", senderId); + r.put("sender_name", senderName); + r.put("receiver_person_ids", receiverIds); + r.put("receiver_names", receiverNames); + r.put("date_iso", dateIso); + r.put("date_raw", dateRaw); + r.put("date_precision", datePrecision); + r.put("date_end", dateEnd); + r.put("location", ""); + r.put("tags", ""); + r.put("summary", ""); + return r; + } + + private Map docRowWithTag(String index, String tagPath) { + Map r = docRow(index, "", "", "", "", "", "", "", "", ""); + r.put("tags", tagPath); + return r; + } + + @SafeVarargs + private Path writeDocs(Path dir, Map... rows) throws Exception { + Path xlsx = dir.resolve("canonical-documents.xlsx"); + List headers = List.of("index", "file", "sender_person_id", "sender_name", + "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision", + "date_end", "location", "tags", "summary"); + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + for (int i = 0; i < headers.size(); i++) { + header.createCell(i).setCellValue(headers.get(i)); + } + for (int r = 0; r < rows.length; r++) { + Row row = sheet.createRow(r + 1); + for (int c = 0; c < headers.size(); c++) { + row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), "")); + } + } + try (OutputStream out = Files.newOutputStream(xlsx)) { + wb.write(out); + } + } + return xlsx; + } +}