feat(importing): add DocumentImporter loader with ported security guards

Fourth canonical loader. Maps canonical-documents.xlsx by header name,
routes each attribution register-first by source_ref (provisional person
when a slug is unmatched), ALWAYS retains the raw sender_name/receiver_names
in sender_text/receiver_text, splits pipe-delimited receivers, parses clean
date_iso/date_precision/date_end/date_raw with no semantic logic, attaches
the tag by canonical tag_path, and keeps the S3 upload + thumbnail plumbing
in small resolveFile/uploadToS3/buildDocument methods. Documents upsert by
index (originalFilename); UPLOADED when a file resolves on disk, PLACEHOLDER
otherwise.

Security guards ported intact from MassImportService BEFORE retiring it:
isValidImportFilename (forward/back slash, three Unicode slash homoglyphs,
.., null byte, absolute path), findFileRecursive canonical-path containment
(symlink-escape), and the %PDF magic-byte check + FILE_READ_ERROR path. The
file column is treated as hostile input (CWE-22): its basename is validated
then resolved only inside importDir, so a traversal value cannot escape.

Extracts the verbatim ImportStatus/SkipReason/SkippedFile shape into its own
class so the admin UI contract is unchanged.

Assumption: the committed canonical-documents.xlsx carries no
sender_category/receiver_category columns (the issue's described schema) —
the normalizer already resolved Option-A routing into slugs + raw names, so
the loader routes by slug presence rather than a category enum.

Refs #669

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-27 10:33:17 +02:00
parent cbf1984430
commit c56ba6219c
5 changed files with 822 additions and 0 deletions

View File

@@ -0,0 +1,437 @@
package org.raddatz.familienarchiv.importing;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.raddatz.familienarchiv.document.Document;
import org.raddatz.familienarchiv.document.DocumentService;
import org.raddatz.familienarchiv.document.DocumentStatus;
import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner;
import org.raddatz.familienarchiv.person.Person;
import org.raddatz.familienarchiv.person.PersonService;
import org.raddatz.familienarchiv.person.PersonUpsertCommand;
import org.raddatz.familienarchiv.tag.Tag;
import org.raddatz.familienarchiv.tag.TagService;
import org.springframework.test.util.ReflectionTestUtils;
import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
import java.io.File;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDate;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.lenient;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
@ExtendWith(MockitoExtension.class)
class DocumentImporterTest {
@Mock DocumentService documentService;
@Mock PersonService personService;
@Mock TagService tagService;
@Mock S3Client s3Client;
@Mock ThumbnailAsyncRunner thumbnailAsyncRunner;
DocumentImporter importer;
@BeforeEach
void setUp() {
importer = new DocumentImporter(documentService, personService, tagService, s3Client, thumbnailAsyncRunner);
ReflectionTestUtils.setField(importer, "bucketName", "test-bucket");
}
// ─── security regression — ported from MassImportServiceTest — do not remove ─────
@Test
void isValidImportFilename_returnsFalse_whenNull() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", (String) null)).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenBlank() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", " ")).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenForwardSlash() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "etc/passwd")).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenBackslash() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..\\etc\\passwd")).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenDotDot() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "doc..evil.pdf")).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenIsDotDot() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..")).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenAbsolutePath() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "/etc/passwd")).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenNullByte() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "file\0.pdf")).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenUnicodeDivisionSlash() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foobar.pdf")).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenFullwidthSlash() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foobar.pdf")).isFalse();
}
@Test
void isValidImportFilename_returnsFalse_whenReverseSolidusOperator() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foobar.pdf")).isFalse();
}
@Test
void isValidImportFilename_returnsTrue_whenPlainBasename() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "document.pdf")).isTrue();
}
@Test
void isValidImportFilename_returnsTrue_whenLeadingDot() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", ".hidden.pdf")).isTrue();
}
@Test
void isValidImportFilename_returnsTrue_whenHasSpaces() {
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "Brief an Oma.pdf")).isTrue();
}
@Test
void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir(
@TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception {
Path outsideFile = outsideDir.resolve("secret.pdf");
Files.writeString(outsideFile, "sensitive");
Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile);
ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString());
org.assertj.core.api.Assertions.assertThatThrownBy(
() -> ReflectionTestUtils.invokeMethod(importer, "findFileRecursive", "secret.pdf"))
.isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class);
}
// ─── path traversal in the file column cannot escape importDir ───────────────────
@Test
void load_rejectsFileColumn_whenBasenameIsTraversalToken(@TempDir Path tempDir) throws Exception {
// A file column whose basename is itself a traversal token must be rejected
// outright, never used for disk I/O.
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Path xlsx = writeDocs(tempDir, docRow("W-0001", "evil/..", "", "", "", "", "", "", "", ""));
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
assertThat(result.skippedFiles())
.extracting(ImportStatus.SkippedFile::reason)
.containsExactly(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
verify(documentService, never()).save(any());
}
@Test
void load_traversalFileColumn_cannotEscapeImportDir_yieldsPlaceholder(@TempDir Path tempDir) throws Exception {
// ../../etc/cron.d/x reduces to basename "x"; the disk lookup is confined to
// importDir, so no file is found, nothing is uploaded, and the row becomes a
// metadata-only PLACEHOLDER — the file outside importDir is never read.
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
Path xlsx = writeDocs(tempDir, docRow("W-0001", "../../etc/cron.d/x", "", "", "", "", "", "", "", ""));
importer.load(xlsx.toFile());
verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER));
}
// ─── PDF magic-byte guard — ported — do not remove ──────────────────────────────
@Test
void load_skipsFile_whenNotPdfMagicBytes(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Files.writeString(tempDir.resolve("W-0001.pdf"), "not a pdf");
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
assertThat(result.skippedFiles())
.extracting(ImportStatus.SkippedFile::reason)
.containsExactly(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
}
@Test
void load_skipsFile_whenMagicByteCheckThrowsIoException(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Files.writeString(tempDir.resolve("W-0001.pdf"), "content");
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
DocumentImporter spyImporter = org.mockito.Mockito.spy(importer);
org.mockito.Mockito.doThrow(new java.io.IOException("read error"))
.when(spyImporter).openFileStream(any(File.class));
DocumentImporter.LoadResult result = spyImporter.load(xlsx.toFile());
assertThat(result.skippedFiles())
.extracting(ImportStatus.SkippedFile::reason)
.containsExactly(ImportStatus.SkipReason.FILE_READ_ERROR);
}
@Test
void load_skipsAlreadyExists_whenDocumentUploadedNotPlaceholder(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Document existing = Document.builder().id(UUID.randomUUID())
.originalFilename("W-0001").status(DocumentStatus.UPLOADED).build();
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.of(existing));
Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "", ""));
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
assertThat(result.skippedFiles())
.extracting(ImportStatus.SkippedFile::reason)
.containsExactly(ImportStatus.SkipReason.ALREADY_EXISTS);
verify(documentService, never()).save(any());
}
// ─── file column drives status: present → UPLOADED, empty → PLACEHOLDER ───────────
@Test
void load_uploadsToS3_andSetsStatusUploaded_whenFilePresent(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D};
Files.write(tempDir.resolve("W-0001.pdf"), pdf);
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
importer.load(xlsx.toFile());
verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class));
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.UPLOADED));
}
@Test
void load_setsStatusPlaceholder_whenFileColumnEmpty(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0099")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", "", ""));
importer.load(xlsx.toFile());
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER));
verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
}
// ─── attribution routing — register-first + always retain raw ────────────────────
@Test
void load_linksRegisterSender_andRetainsRawSenderText(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Person walter = Person.builder().id(UUID.randomUUID()).sourceRef("de-gruyter-walter")
.firstName("Walter").lastName("de Gruyter").build();
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(personService.findBySourceRef("de-gruyter-walter")).thenReturn(Optional.of(walter));
Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "de-gruyter-walter", "Walter de Gruyter",
"", "", "", "", "", ""));
importer.load(xlsx.toFile());
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
d.getSender() == walter && "Walter de Gruyter".equals(d.getSenderText())));
}
@Test
void load_createsProvisionalSender_whenSlugUnmatchedInRegister(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Person provisional = Person.builder().id(UUID.randomUUID()).sourceRef("schwester-hanni")
.lastName("Schwester Hanni").provisional(true).build();
when(documentService.findByOriginalFilename("W-0002")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(personService.findBySourceRef("schwester-hanni")).thenReturn(Optional.empty());
when(personService.upsertBySourceRef(any())).thenReturn(provisional);
Path xlsx = writeDocs(tempDir, docRow("W-0002", "", "schwester-hanni", "Schwester Hanni",
"", "", "", "", "", ""));
importer.load(xlsx.toFile());
org.mockito.ArgumentCaptor<PersonUpsertCommand> captor =
org.mockito.ArgumentCaptor.forClass(PersonUpsertCommand.class);
verify(personService).upsertBySourceRef(captor.capture());
assertThat(captor.getValue().provisional()).isTrue();
assertThat(captor.getValue().lastName()).isEqualTo("Schwester Hanni");
}
@Test
void load_createsNoSenderPerson_whenSlugEmptyButRawPresent(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0003")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "", "?",
"", "", "", "", "", ""));
importer.load(xlsx.toFile());
verify(personService, never()).findBySourceRef(any());
verify(personService, never()).upsertBySourceRef(any());
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
d.getSender() == null && "?".equals(d.getSenderText())));
}
@Test
void load_splitsMultipleReceivers_andRetainsRawReceiverText(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Person herbert = Person.builder().id(UUID.randomUUID()).sourceRef("cram-herbert").lastName("Cram").build();
Person clara = Person.builder().id(UUID.randomUUID()).sourceRef("clara").lastName("Clara").build();
when(documentService.findByOriginalFilename("W-0004")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(personService.findBySourceRef("cram-herbert")).thenReturn(Optional.of(herbert));
when(personService.findBySourceRef("clara")).thenReturn(Optional.of(clara));
Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "", "",
"cram-herbert|clara", "Herbert Cram|Clara", "", "", "", ""));
importer.load(xlsx.toFile());
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
d.getReceivers().size() == 2
&& d.getReceivers().contains(herbert)
&& d.getReceivers().contains(clara)
&& "Herbert Cram|Clara".equals(d.getReceiverText())));
}
// ─── clean date values parse without semantic logic ──────────────────────────────
@Test
void load_parsesCleanDateAndPrecision(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
when(documentService.findByOriginalFilename("W-0005")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "", "",
"", "", "1916-06-01", "1.6.1916", "MONTH", ""));
importer.load(xlsx.toFile());
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
LocalDate.of(1916, 6, 1).equals(d.getDocumentDate())
&& d.getMetaDatePrecision() == org.raddatz.familienarchiv.document.DatePrecision.MONTH
&& "1.6.1916".equals(d.getMetaDateRaw())));
}
@Test
void load_attachesTagBySourceRef(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Tag tag = Tag.builder().id(UUID.randomUUID()).name("Brautbriefe").sourceRef("Themen/Brautbriefe").build();
when(documentService.findByOriginalFilename("W-0006")).thenReturn(Optional.empty());
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(tagService.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.of(tag));
Path xlsx = writeDocs(tempDir, docRowWithTag("W-0006", "Themen/Brautbriefe"));
importer.load(xlsx.toFile());
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getTags().contains(tag)));
}
// ─── idempotency — update existing document in place by index ─────────────────────
@Test
void load_updatesExistingDocumentInPlace_whenIndexExists(@TempDir Path tempDir) throws Exception {
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
Document existing = Document.builder().id(UUID.randomUUID())
.originalFilename("W-0007").status(DocumentStatus.PLACEHOLDER).build();
when(documentService.findByOriginalFilename("W-0007")).thenReturn(Optional.of(existing));
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", "", ""));
importer.load(xlsx.toFile());
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getId().equals(existing.getId())));
}
// ─── helpers ─────────────────────────────────────────────────────────────────────
private Map<String, String> docRow(String index, String file, String senderId, String senderName,
String receiverIds, String receiverNames, String dateIso,
String dateRaw, String datePrecision, String dateEnd) {
Map<String, String> r = new LinkedHashMap<>();
r.put("index", index);
r.put("file", file);
r.put("sender_person_id", senderId);
r.put("sender_name", senderName);
r.put("receiver_person_ids", receiverIds);
r.put("receiver_names", receiverNames);
r.put("date_iso", dateIso);
r.put("date_raw", dateRaw);
r.put("date_precision", datePrecision);
r.put("date_end", dateEnd);
r.put("location", "");
r.put("tags", "");
r.put("summary", "");
return r;
}
private Map<String, String> docRowWithTag(String index, String tagPath) {
Map<String, String> r = docRow(index, "", "", "", "", "", "", "", "", "");
r.put("tags", tagPath);
return r;
}
@SafeVarargs
private Path writeDocs(Path dir, Map<String, String>... rows) throws Exception {
Path xlsx = dir.resolve("canonical-documents.xlsx");
List<String> headers = List.of("index", "file", "sender_person_id", "sender_name",
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision",
"date_end", "location", "tags", "summary");
try (XSSFWorkbook wb = new XSSFWorkbook()) {
Sheet sheet = wb.createSheet("Sheet1");
Row header = sheet.createRow(0);
for (int i = 0; i < headers.size(); i++) {
header.createCell(i).setCellValue(headers.get(i));
}
for (int r = 0; r < rows.length; r++) {
Row row = sheet.createRow(r + 1);
for (int c = 0; c < headers.size(); c++) {
row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), ""));
}
}
try (OutputStream out = Files.newOutputStream(xlsx)) {
wb.write(out);
}
}
return xlsx;
}
}