feat(import): resolve PDFs directly by index, drop the file column (#686) #687
@@ -28,7 +28,6 @@ import java.io.FileInputStream;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
import java.time.format.DateTimeParseException;
|
import java.time.format.DateTimeParseException;
|
||||||
@@ -38,19 +37,23 @@ import java.util.List;
|
|||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import java.util.stream.Stream;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
|
* Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
|
||||||
* semantic transformation: the normalizer already resolved people to slugs and dates to
|
* semantic transformation: the normalizer already resolved people to slugs and dates to
|
||||||
* ISO values. This loader maps columns by header name, routes each attribution
|
* ISO values. This loader maps columns by header name, routes each attribution
|
||||||
* register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}),
|
* register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}),
|
||||||
* parses clean dates, and keeps the file/S3/thumbnail plumbing.
|
* parses clean dates, and keeps the S3/thumbnail plumbing.
|
||||||
*
|
*
|
||||||
* <p>The {@code file} value is hostile input regardless of upstream trust (CWE-22 does not
|
* <p>The import corpus is uniform — every PDF is named {@code <index>.pdf} flat in the import
|
||||||
* care that it came from our Python tool): its basename is validated with
|
* dir — so a document's PDF is resolved <em>directly by its index</em>:
|
||||||
* {@link #isValidImportFilename} and then resolved with canonical-path containment in
|
* {@code importDir.resolve(index + ".pdf")}. The {@code index} is still hostile input
|
||||||
* {@link #findFileRecursive}.
|
* regardless of upstream trust (CWE-22 does not care it came from our Python tool): it is
|
||||||
|
* validated against a strict catalog pattern with {@link #isValidImportIndex} (no path
|
||||||
|
* separators, no {@code .}/{@code ..}, no absolute path, no slash homoglyphs) and the
|
||||||
|
* resolved path is asserted to stay inside the import dir in {@link #resolvePdfByIndex} as
|
||||||
|
* defense-in-depth. The {@code %PDF} magic-byte check still gates upload.
|
||||||
*/
|
*/
|
||||||
@Component
|
@Component
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@@ -58,9 +61,19 @@ import java.util.stream.Stream;
|
|||||||
public class DocumentImporter {
|
public class DocumentImporter {
|
||||||
|
|
||||||
static final List<String> REQUIRED_HEADERS = List.of(
|
static final List<String> REQUIRED_HEADERS = List.of(
|
||||||
"index", "file", "sender_person_id", "sender_name",
|
"index", "sender_person_id", "sender_name",
|
||||||
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision");
|
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision");
|
||||||
|
|
||||||
|
// Catalog index shape: 1–4 letters (ASCII + Latin-1 letters, e.g. the German "ü" in
|
||||||
|
// "Mü-0001"), one or more hyphens (the corpus has a few "C--0029" data-entry artefacts),
|
||||||
|
// digits, and an optional trailing "x" the normalizer recognises. Anchored, with no
|
||||||
|
// separator / dot / slash characters in the class, so "<index>.pdf" can never traverse.
|
||||||
|
// NOTE: `\d` here is intentionally ASCII-only ([0-9]). Java's java.util.regex matches `\d`
|
||||||
|
// against [0-9] unless Pattern.UNICODE_CHARACTER_CLASS is set — do NOT add that flag, or
|
||||||
|
// Arabic-Indic / fullwidth digits would silently widen the accepted set.
|
||||||
|
private static final Pattern INDEX_PATTERN =
|
||||||
|
Pattern.compile("[A-Za-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF]{1,4}-+\\d+x?");
|
||||||
|
|
||||||
private final DocumentService documentService;
|
private final DocumentService documentService;
|
||||||
private final PersonService personService;
|
private final PersonService personService;
|
||||||
private final TagService tagService;
|
private final TagService tagService;
|
||||||
@@ -84,12 +97,16 @@ public class DocumentImporter {
|
|||||||
List<CanonicalSheetReader.Row> rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS);
|
List<CanonicalSheetReader.Row> rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS);
|
||||||
int processed = 0;
|
int processed = 0;
|
||||||
List<ImportStatus.SkippedFile> skipped = new ArrayList<>();
|
List<ImportStatus.SkippedFile> skipped = new ArrayList<>();
|
||||||
|
// 1-based source row number for ops triage breadcrumbs (the spreadsheet header is row 1,
|
||||||
|
// so the first data row is row 2 — matches what an operator sees in the .xlsx).
|
||||||
|
int rowNumber = 1;
|
||||||
for (CanonicalSheetReader.Row row : rows) {
|
for (CanonicalSheetReader.Row row : rows) {
|
||||||
|
rowNumber++;
|
||||||
String index = row.get("index");
|
String index = row.get("index");
|
||||||
if (index.isBlank()) continue;
|
if (index.isBlank()) continue;
|
||||||
Optional<ImportStatus.SkipReason> skipReason = importRow(row, index, skipped);
|
Optional<ImportStatus.SkipReason> skipReason = importRow(row, index, rowNumber);
|
||||||
if (skipReason.isPresent()) {
|
if (skipReason.isPresent()) {
|
||||||
skipped.add(new ImportStatus.SkippedFile(displayName(row, index), skipReason.get()));
|
skipped.add(new ImportStatus.SkippedFile(index, skipReason.get()));
|
||||||
} else {
|
} else {
|
||||||
processed++;
|
processed++;
|
||||||
}
|
}
|
||||||
@@ -98,16 +115,24 @@ public class DocumentImporter {
|
|||||||
return new LoadResult(processed, skipped);
|
return new LoadResult(processed, skipped);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index,
|
private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index, int rowNumber) {
|
||||||
List<ImportStatus.SkippedFile> skipped) {
|
if (!isValidImportIndex(index)) {
|
||||||
Optional<File> resolved;
|
// Breadcrumb is the source row number, NOT the raw (possibly-hostile) index — an
|
||||||
try {
|
// operator triaging the import can find the offending row in the .xlsx without us
|
||||||
resolved = resolveFile(row.get("file"));
|
// echoing attacker-controlled input into the log.
|
||||||
} catch (InvalidImportFilenameException e) {
|
log.warn("Skipping import row {}: index rejected (fails catalog-shape validation)", rowNumber);
|
||||||
log.warn("Skipping import row {}: filename rejected", index);
|
|
||||||
return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
|
return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
|
||||||
}
|
}
|
||||||
if (resolved.isPresent()) {
|
Optional<File> resolved = resolvePdfByIndex(index, rowNumber);
|
||||||
|
if (resolved.isEmpty()) {
|
||||||
|
// Distinct from the "index rejected" skip above: the index is VALID but no
|
||||||
|
// <index>.pdf is on disk, so the row becomes a normal PLACEHOLDER (not skipped). The
|
||||||
|
// index is a validated catalog id (no hostile content), so it is safe to log here —
|
||||||
|
// this surfaces a corpus that drifts from the "<index>.pdf" assumption (e.g. a file
|
||||||
|
// that arrived under a different name) rather than dropping it silently.
|
||||||
|
log.info("Import row {}: index {} is valid but {}.pdf is absent — creating PLACEHOLDER",
|
||||||
|
rowNumber, index, index);
|
||||||
|
} else {
|
||||||
try {
|
try {
|
||||||
if (!isPdfMagicBytes(resolved.get())) {
|
if (!isPdfMagicBytes(resolved.get())) {
|
||||||
return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
|
return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
|
||||||
@@ -257,21 +282,6 @@ public class DocumentImporter {
|
|||||||
|
|
||||||
// ─── file handling + S3 (small ≤20-line methods) ─────────────────────────────────
|
// ─── file handling + S3 (small ≤20-line methods) ─────────────────────────────────
|
||||||
|
|
||||||
private Optional<File> resolveFile(String fileColumn) {
|
|
||||||
if (fileColumn == null || fileColumn.isBlank()) return Optional.empty();
|
|
||||||
String basename = basenameOf(fileColumn);
|
|
||||||
if (!isValidImportFilename(basename)) {
|
|
||||||
throw new InvalidImportFilenameException();
|
|
||||||
}
|
|
||||||
return findFileRecursive(basename);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String basenameOf(String fileColumn) {
|
|
||||||
String normalized = fileColumn.replace('\\', '/');
|
|
||||||
int lastSlash = normalized.lastIndexOf('/');
|
|
||||||
return lastSlash < 0 ? normalized.trim() : normalized.substring(lastSlash + 1).trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
private String probeContentType(File file) {
|
private String probeContentType(File file) {
|
||||||
try {
|
try {
|
||||||
String probed = Files.probeContentType(file.toPath());
|
String probed = Files.probeContentType(file.toPath());
|
||||||
@@ -290,20 +300,23 @@ public class DocumentImporter {
|
|||||||
RequestBody.fromFile(file));
|
RequestBody.fromFile(file));
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── security guards — ported verbatim from MassImportService — do not weaken ────
|
// ─── index validation + containment — defense-in-depth, do not weaken ────────────
|
||||||
|
|
||||||
private boolean isValidImportFilename(String filename) {
|
// The index is the only thing that drives the on-disk lookup, so it must never contain a
|
||||||
if (filename == null || filename.isBlank()) return false;
|
// path separator, traversal token, slash homoglyph, null byte, or absolute-path marker —
|
||||||
if (filename.contains("/")) return false;
|
// each guard mirrors the filename guards ported from MassImportService — and it must match
|
||||||
if (filename.contains("\\")) return false;
|
// the strict catalog shape so anything unexpected is skipped loudly rather than read.
|
||||||
if (filename.contains("∕")) return false; // U+2215 DIVISION SLASH
|
private boolean isValidImportIndex(String index) {
|
||||||
if (filename.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS
|
if (index == null || index.isBlank()) return false;
|
||||||
if (filename.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR
|
if (index.contains("/")) return false;
|
||||||
if (filename.contains("..")) return false;
|
if (index.contains("\\")) return false;
|
||||||
if (filename.equals(".")) return false;
|
if (index.contains("∕")) return false; // U+2215 DIVISION SLASH
|
||||||
if (filename.contains("\0")) return false;
|
if (index.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS
|
||||||
if (Paths.get(filename).isAbsolute()) return false;
|
if (index.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR
|
||||||
return true;
|
if (index.contains(".")) return false; // no dots — "<index>.pdf" is the only extension
|
||||||
|
if (index.contains("\0")) return false;
|
||||||
|
if (Paths.get(index).isAbsolute()) return false;
|
||||||
|
return INDEX_PATTERN.matcher(index).matches();
|
||||||
}
|
}
|
||||||
|
|
||||||
// package-private: a Mockito spy in tests can override to inject IOException
|
// package-private: a Mockito spy in tests can override to inject IOException
|
||||||
@@ -322,33 +335,30 @@ public class DocumentImporter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Optional<File> findFileRecursive(String filename) {
|
// O(1) direct lookup: the PDF is exactly importDir/<index>.pdf. The caller has already
|
||||||
|
// validated the index shape; the canonical-path containment assertion below is
|
||||||
|
// defense-in-depth so even a symlinked <index>.pdf cannot read outside importDir.
|
||||||
|
private Optional<File> resolvePdfByIndex(String index, int rowNumber) {
|
||||||
File baseDir = new File(importDir);
|
File baseDir = new File(importDir);
|
||||||
try (Stream<Path> walk = Files.walk(baseDir.toPath())) {
|
File candidate = baseDir.toPath().resolve(index + ".pdf").toFile();
|
||||||
Optional<Path> match = walk.filter(p -> !Files.isDirectory(p))
|
try {
|
||||||
.filter(p -> p.getFileName().toString().equals(filename))
|
if (!candidate.isFile()) return Optional.empty();
|
||||||
.findFirst();
|
|
||||||
if (match.isEmpty()) return Optional.empty();
|
|
||||||
File candidate = match.get().toFile();
|
|
||||||
String baseDirCanonical = baseDir.getCanonicalPath();
|
String baseDirCanonical = baseDir.getCanonicalPath();
|
||||||
if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) {
|
if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) {
|
||||||
throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate);
|
throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate);
|
||||||
}
|
}
|
||||||
return Optional.of(candidate);
|
return Optional.of(candidate);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
// Distinct from the deliberate symlink-escape abort above (which throws): canonical
|
||||||
|
// resolution itself failed (e.g. the OS rejected the path mid-resolution). We fail
|
||||||
|
// safe to a PLACEHOLDER, but never silently — log it so the asymmetry surfaces in ops.
|
||||||
|
log.warn("Canonical path resolution failed for import row {}: treating {}.pdf as absent",
|
||||||
|
rowNumber, index, e);
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String displayName(CanonicalSheetReader.Row row, String index) {
|
|
||||||
String file = row.get("file");
|
|
||||||
return file.isBlank() ? index : basenameOf(file);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String blankToNull(String s) {
|
private static String blankToNull(String s) {
|
||||||
return (s == null || s.isBlank()) ? null : s;
|
return (s == null || s.isBlank()) ? null : s;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final class InvalidImportFilenameException extends RuntimeException {
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -140,12 +140,12 @@ class CanonicalImportIntegrationTest {
|
|||||||
|
|
||||||
// Re-stage the document sheet with W-0001's receiver and tag removed.
|
// Re-stage the document sheet with W-0001's receiver and tag removed.
|
||||||
writeSheet(artifactDir.resolve("canonical-documents.xlsx"),
|
writeSheet(artifactDir.resolve("canonical-documents.xlsx"),
|
||||||
List.of("index", "file", "sender_person_id", "sender_name", "receiver_person_ids",
|
List.of("index", "sender_person_id", "sender_name", "receiver_person_ids",
|
||||||
"receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"),
|
"receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"),
|
||||||
List.of(
|
List.of(
|
||||||
List.of("W-0001", "", "de-gruyter-walter", "Walter de Gruyter",
|
List.of("W-0001", "de-gruyter-walter", "Walter de Gruyter",
|
||||||
"", "", "1888-02-15", "15.2.1888", "DAY", "", "Rotterdam", "", "Geschäftsreise"),
|
"", "", "1888-02-15", "15.2.1888", "DAY", "", "Rotterdam", "", "Geschäftsreise"),
|
||||||
List.of("W-0002", "", "de-gruyter-eugenie", "Eugenie de Gruyter",
|
List.of("W-0002", "de-gruyter-eugenie", "Eugenie de Gruyter",
|
||||||
"de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "",
|
"de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "",
|
||||||
"Middelburg", "Themen/Brautbriefe", "Reisepläne")));
|
"Middelburg", "Themen/Brautbriefe", "Reisepläne")));
|
||||||
|
|
||||||
@@ -196,13 +196,13 @@ class CanonicalImportIntegrationTest {
|
|||||||
""");
|
""");
|
||||||
|
|
||||||
writeSheet(dir.resolve("canonical-documents.xlsx"),
|
writeSheet(dir.resolve("canonical-documents.xlsx"),
|
||||||
List.of("index", "file", "sender_person_id", "sender_name", "receiver_person_ids",
|
List.of("index", "sender_person_id", "sender_name", "receiver_person_ids",
|
||||||
"receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"),
|
"receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"),
|
||||||
List.of(
|
List.of(
|
||||||
List.of("W-0001", "", "de-gruyter-walter", "Walter de Gruyter",
|
List.of("W-0001", "de-gruyter-walter", "Walter de Gruyter",
|
||||||
"de-gruyter-eugenie", "Eugenie de Gruyter", "1888-02-15", "15.2.1888", "DAY", "",
|
"de-gruyter-eugenie", "Eugenie de Gruyter", "1888-02-15", "15.2.1888", "DAY", "",
|
||||||
"Rotterdam", "Themen/Brautbriefe", "Geschäftsreise"),
|
"Rotterdam", "Themen/Brautbriefe", "Geschäftsreise"),
|
||||||
List.of("W-0002", "", "de-gruyter-eugenie", "Eugenie de Gruyter",
|
List.of("W-0002", "de-gruyter-eugenie", "Eugenie de Gruyter",
|
||||||
"de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "",
|
"de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "",
|
||||||
"Middelburg", "Themen/Brautbriefe", "Reisepläne")));
|
"Middelburg", "Themen/Brautbriefe", "Reisepläne")));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -58,99 +58,179 @@ class DocumentImporterTest {
|
|||||||
ReflectionTestUtils.setField(importer, "bucketName", "test-bucket");
|
ReflectionTestUtils.setField(importer, "bucketName", "test-bucket");
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── security regression — ported from MassImportServiceTest — do not remove ─────
|
// ─── index validation — a malicious/garbage index can never reach disk I/O ─────────
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenNull() {
|
void isValidImportIndex_returnsFalse_whenNull() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", (String) null)).isFalse();
|
assertThat(validIndex(null)).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenBlank() {
|
void isValidImportIndex_returnsFalse_whenBlank() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", " ")).isFalse();
|
assertThat(validIndex(" ")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenForwardSlash() {
|
void isValidImportIndex_returnsFalse_whenForwardSlash() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "etc/passwd")).isFalse();
|
assertThat(validIndex("etc/passwd")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenBackslash() {
|
void isValidImportIndex_returnsFalse_whenBackslash() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..\\etc\\passwd")).isFalse();
|
assertThat(validIndex("..\\etc\\passwd")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenDotDot() {
|
void isValidImportIndex_returnsFalse_whenDotDot() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "doc..evil.pdf")).isFalse();
|
assertThat(validIndex("W-..0001")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenIsDotDot() {
|
void isValidImportIndex_returnsFalse_whenIsDotDot() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..")).isFalse();
|
assertThat(validIndex("..")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenAbsolutePath() {
|
void isValidImportIndex_returnsFalse_whenSingleDot() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "/etc/passwd")).isFalse();
|
assertThat(validIndex(".")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenNullByte() {
|
void isValidImportIndex_returnsFalse_whenAbsolutePath() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "file\0.pdf")).isFalse();
|
assertThat(validIndex("/etc/passwd")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenUnicodeDivisionSlash() {
|
void isValidImportIndex_returnsFalse_whenNullByte() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo∕bar.pdf")).isFalse();
|
assertThat(validIndex("W-0001\0")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenFullwidthSlash() {
|
void isValidImportIndex_returnsFalse_whenUnicodeDivisionSlash() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo/bar.pdf")).isFalse();
|
assertThat(validIndex("W∕0001")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsFalse_whenReverseSolidusOperator() {
|
void isValidImportIndex_returnsFalse_whenFullwidthSlash() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo⧵bar.pdf")).isFalse();
|
assertThat(validIndex("W/0001")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsTrue_whenPlainBasename() {
|
void isValidImportIndex_returnsFalse_whenReverseSolidusOperator() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "document.pdf")).isTrue();
|
assertThat(validIndex("W⧵0001")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsTrue_whenLeadingDot() {
|
void isValidImportIndex_returnsFalse_whenContainsDotPdfExtension() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", ".hidden.pdf")).isTrue();
|
// The index is the bare catalog id; appending ".pdf" is the importer's job. A dot in
|
||||||
|
// the index would let "W-0001.pdf" become "W-0001.pdf.pdf" or smuggle an extension.
|
||||||
|
assertThat(validIndex("W-0001.pdf")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── catalog-shape rejects — pass the char pre-checks but must fail INDEX_PATTERN ────
|
||||||
|
// These pin the regex branch itself: each string contains no separator, dot, slash
|
||||||
|
// homoglyph, null byte, or absolute marker, so it sails past every char guard and is
|
||||||
|
// rejected *only* because INDEX_PATTERN.matches() returns false. A weaker pattern would
|
||||||
|
// let them through — these tests would then go red.
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsFalse_whenSpaceInIndex() {
|
||||||
|
// The real-world reject: "J 0070" is a space-typo with no PDF on disk.
|
||||||
|
assertThat(validIndex("J 0070")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportFilename_returnsTrue_whenHasSpaces() {
|
void isValidImportIndex_returnsFalse_whenFiveLetterPrefix() {
|
||||||
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "Brief an Oma.pdf")).isTrue();
|
// The catalog prefix is at most 4 letters; 5 must not match.
|
||||||
|
assertThat(validIndex("WXYZA-0001")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir(
|
void isValidImportIndex_returnsFalse_whenNoLetterPrefix() {
|
||||||
@TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception {
|
// A digit-led id (no letter prefix) is not a catalog shape.
|
||||||
Path outsideFile = outsideDir.resolve("secret.pdf");
|
assertThat(validIndex("12-0001")).isFalse();
|
||||||
Files.writeString(outsideFile, "sensitive");
|
|
||||||
Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile);
|
|
||||||
ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString());
|
|
||||||
|
|
||||||
org.assertj.core.api.Assertions.assertThatThrownBy(
|
|
||||||
() -> ReflectionTestUtils.invokeMethod(importer, "findFileRecursive", "secret.pdf"))
|
|
||||||
.isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── path traversal in the file column cannot escape importDir ───────────────────
|
@Test
|
||||||
|
void isValidImportIndex_returnsFalse_whenUppercaseXSuffix() {
|
||||||
|
// Only a lowercase trailing "x" is allowed; an uppercase "X" suffix must fail.
|
||||||
|
assertThat(validIndex("W-0001X")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void load_rejectsFileColumn_whenBasenameIsTraversalToken(@TempDir Path tempDir) throws Exception {
|
void isValidImportIndex_returnsTrue_whenPlainCatalogIndex() {
|
||||||
// A file column whose basename is itself a traversal token must be rejected
|
assertThat(validIndex("W-0124")).isTrue();
|
||||||
// outright, never used for disk I/O.
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsTrue_whenTwoLetterPrefix() {
|
||||||
|
assertThat(validIndex("Al-0001")).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsTrue_whenThreeLetterPrefix() {
|
||||||
|
assertThat(validIndex("CuH-0010")).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsTrue_whenUmlautPrefix() {
|
||||||
|
// Real corpus indices carry a German umlaut, e.g. "Mü-0001.pdf" exists on disk.
|
||||||
|
assertThat(validIndex("Mü-0001")).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsTrue_whenDoubleHyphen() {
|
||||||
|
// Real corpus: "C--0029" appears in the spreadsheet (a data-entry artefact, but a
|
||||||
|
// legitimate catalog shape that must still resolve, not crash).
|
||||||
|
assertThat(validIndex("C--0029")).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsTrue_whenXSuffix() {
|
||||||
|
// The normalizer recognises an x-suffix catalog id; allow it defensively.
|
||||||
|
assertThat(validIndex("W-0001x")).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── a valid index resolves to exactly importDir/<index>.pdf within containment ─────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_resolvesPdfByIndex_uploadsToS3_andSetsStatusUploaded(@TempDir Path tempDir) throws Exception {
|
||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0001", "evil/..", "", "", "", "", "", "", "", ""));
|
byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D};
|
||||||
|
Files.write(tempDir.resolve("W-0124.pdf"), pdf);
|
||||||
|
when(documentService.findByOriginalFilename("W-0124")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0124", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
// exactly importDir/<index>.pdf was uploaded — the S3 key carries that basename
|
||||||
|
org.mockito.ArgumentCaptor<RequestBody> bodyCaptor = org.mockito.ArgumentCaptor.forClass(RequestBody.class);
|
||||||
|
verify(s3Client).putObject(any(PutObjectRequest.class), bodyCaptor.capture());
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
|
||||||
|
d.getStatus() == DocumentStatus.UPLOADED
|
||||||
|
&& d.getFilePath() != null
|
||||||
|
&& d.getFilePath().endsWith("_W-0124.pdf")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_yieldsPlaceholder_whenIndexedPdfMissing(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
when(documentService.findByOriginalFilename("X-9999")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("X-9999", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER));
|
||||||
|
verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_rejectsMaliciousIndex_neverReadsOutsideImportDir(@TempDir Path tempDir) throws Exception {
|
||||||
|
// An index with a path separator must be skipped outright, never used for disk I/O.
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("../../etc/cron.d/x", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
|
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
|
||||||
|
|
||||||
@@ -158,24 +238,49 @@ class DocumentImporterTest {
|
|||||||
.extracting(ImportStatus.SkippedFile::reason)
|
.extracting(ImportStatus.SkippedFile::reason)
|
||||||
.containsExactly(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
|
.containsExactly(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
|
||||||
verify(documentService, never()).save(any());
|
verify(documentService, never()).save(any());
|
||||||
|
verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void load_traversalFileColumn_cannotEscapeImportDir_yieldsPlaceholder(@TempDir Path tempDir) throws Exception {
|
void resolvePdfByIndex_throwsWhenResolvedPathEscapesImportDir_viaSymlink(
|
||||||
// ../../etc/cron.d/x reduces to basename "x"; the disk lookup is confined to
|
@TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception {
|
||||||
// importDir, so no file is found, nothing is uploaded, and the row becomes a
|
// Containment defense-in-depth: even a syntactically valid index whose <index>.pdf is a
|
||||||
// metadata-only PLACEHOLDER — the file outside importDir is never read.
|
// symlink pointing outside importDir must be refused — the resolved canonical path is
|
||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
// asserted to stay inside importDir.
|
||||||
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
|
Path outsideFile = outsideDir.resolve("secret.pdf");
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
Files.writeString(outsideFile, "sensitive");
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0001", "../../etc/cron.d/x", "", "", "", "", "", "", "", ""));
|
Files.createSymbolicLink(importDirPath.resolve("W-0001.pdf"), outsideFile);
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString());
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
org.assertj.core.api.Assertions.assertThatThrownBy(
|
||||||
|
() -> ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "W-0001", 2))
|
||||||
verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
|
.isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class);
|
||||||
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void resolvePdfByIndex_returnsExactlyImportDirIndexPdf_whenPresent(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Path expected = tempDir.resolve("Eu-0628.pdf");
|
||||||
|
Files.writeString(expected, "%PDF-1.4");
|
||||||
|
|
||||||
|
Optional<File> resolved = ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "Eu-0628", 2);
|
||||||
|
|
||||||
|
assertThat(resolved).isPresent();
|
||||||
|
assertThat(resolved.get().getCanonicalFile()).isEqualTo(expected.toFile().getCanonicalFile());
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE (Sara, PR #687): the IOException branch of resolvePdfByIndex — where
|
||||||
|
// File.getCanonicalPath() itself throws (an OS-level failure mid-resolution, not the
|
||||||
|
// symlink-escape DomainException) — is intentionally NOT covered by a test. Unlike
|
||||||
|
// isPdfMagicBytes, which has the package-private openFileStream(File) seam a Mockito spy can
|
||||||
|
// make throw, getCanonicalPath() is called on a File built internally with no injection seam,
|
||||||
|
// and there is no portable, deterministic way to make it throw on a temp file (it does not
|
||||||
|
// throw for missing/symlinked paths — those are handled by isFile()/the containment check).
|
||||||
|
// Adding a seam purely to test this would be production code in service of a non-defect; the
|
||||||
|
// substantive fix is the log.warn() now emitted in that branch so the quiet skip surfaces in
|
||||||
|
// ops. Left uncovered by deliberate decision, documented here so the branch is not assumed
|
||||||
|
// tested.
|
||||||
|
|
||||||
// ─── PDF magic-byte guard — ported — do not remove ──────────────────────────────
|
// ─── PDF magic-byte guard — ported — do not remove ──────────────────────────────
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -183,7 +288,7 @@ class DocumentImporterTest {
|
|||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
Files.writeString(tempDir.resolve("W-0001.pdf"), "not a pdf");
|
Files.writeString(tempDir.resolve("W-0001.pdf"), "not a pdf");
|
||||||
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
|
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
|
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
|
||||||
|
|
||||||
@@ -198,7 +303,7 @@ class DocumentImporterTest {
|
|||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
Files.writeString(tempDir.resolve("W-0001.pdf"), "content");
|
Files.writeString(tempDir.resolve("W-0001.pdf"), "content");
|
||||||
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
|
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
DocumentImporter spyImporter = org.mockito.Mockito.spy(importer);
|
DocumentImporter spyImporter = org.mockito.Mockito.spy(importer);
|
||||||
org.mockito.Mockito.doThrow(new java.io.IOException("read error"))
|
org.mockito.Mockito.doThrow(new java.io.IOException("read error"))
|
||||||
@@ -217,7 +322,7 @@ class DocumentImporterTest {
|
|||||||
Document existing = Document.builder().id(UUID.randomUUID())
|
Document existing = Document.builder().id(UUID.randomUUID())
|
||||||
.originalFilename("W-0001").status(DocumentStatus.UPLOADED).build();
|
.originalFilename("W-0001").status(DocumentStatus.UPLOADED).build();
|
||||||
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.of(existing));
|
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.of(existing));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "", ""));
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
|
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
|
||||||
|
|
||||||
@@ -227,29 +332,14 @@ class DocumentImporterTest {
|
|||||||
verify(documentService, never()).save(any());
|
verify(documentService, never()).save(any());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── file column drives status: present → UPLOADED, empty → PLACEHOLDER ───────────
|
// ─── presence of importDir/<index>.pdf drives status: present → UPLOADED, absent → PLACEHOLDER ─
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void load_uploadsToS3_andSetsStatusUploaded_whenFilePresent(@TempDir Path tempDir) throws Exception {
|
void load_setsStatusPlaceholder_whenNoIndexedPdf(@TempDir Path tempDir) throws Exception {
|
||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
|
||||||
byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D};
|
|
||||||
Files.write(tempDir.resolve("W-0001.pdf"), pdf);
|
|
||||||
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
|
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
|
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
|
||||||
|
|
||||||
verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class));
|
|
||||||
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.UPLOADED));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void load_setsStatusPlaceholder_whenFileColumnEmpty(@TempDir Path tempDir) throws Exception {
|
|
||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
when(documentService.findByOriginalFilename("W-0099")).thenReturn(Optional.empty());
|
when(documentService.findByOriginalFilename("W-0099")).thenReturn(Optional.empty());
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", "", ""));
|
Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
@@ -267,7 +357,7 @@ class DocumentImporterTest {
|
|||||||
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
|
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
when(personService.findBySourceRef("de-gruyter-walter")).thenReturn(Optional.of(walter));
|
when(personService.findBySourceRef("de-gruyter-walter")).thenReturn(Optional.of(walter));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "de-gruyter-walter", "Walter de Gruyter",
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "de-gruyter-walter", "Walter de Gruyter",
|
||||||
"", "", "", "", "", ""));
|
"", "", "", "", "", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
@@ -285,7 +375,7 @@ class DocumentImporterTest {
|
|||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
when(personService.findBySourceRef("schwester-hanni")).thenReturn(Optional.empty());
|
when(personService.findBySourceRef("schwester-hanni")).thenReturn(Optional.empty());
|
||||||
when(personService.upsertBySourceRef(any())).thenReturn(provisional);
|
when(personService.upsertBySourceRef(any())).thenReturn(provisional);
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0002", "", "schwester-hanni", "Schwester Hanni",
|
Path xlsx = writeDocs(tempDir, docRow("W-0002", "schwester-hanni", "Schwester Hanni",
|
||||||
"", "", "", "", "", ""));
|
"", "", "", "", "", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
@@ -302,7 +392,7 @@ class DocumentImporterTest {
|
|||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
when(documentService.findByOriginalFilename("W-0003")).thenReturn(Optional.empty());
|
when(documentService.findByOriginalFilename("W-0003")).thenReturn(Optional.empty());
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "", "?",
|
Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "?",
|
||||||
"", "", "", "", "", ""));
|
"", "", "", "", "", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
@@ -322,7 +412,7 @@ class DocumentImporterTest {
|
|||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
when(personService.findBySourceRef("cram-herbert")).thenReturn(Optional.of(herbert));
|
when(personService.findBySourceRef("cram-herbert")).thenReturn(Optional.of(herbert));
|
||||||
when(personService.findBySourceRef("clara")).thenReturn(Optional.of(clara));
|
when(personService.findBySourceRef("clara")).thenReturn(Optional.of(clara));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "", "",
|
Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "",
|
||||||
"cram-herbert|clara", "Herbert Cram|Clara", "", "", "", ""));
|
"cram-herbert|clara", "Herbert Cram|Clara", "", "", "", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
@@ -341,7 +431,7 @@ class DocumentImporterTest {
|
|||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
when(documentService.findByOriginalFilename("W-0005")).thenReturn(Optional.empty());
|
when(documentService.findByOriginalFilename("W-0005")).thenReturn(Optional.empty());
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "", "",
|
Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "",
|
||||||
"", "", "1916-06-01", "1.6.1916", "MONTH", ""));
|
"", "", "1916-06-01", "1.6.1916", "MONTH", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
@@ -375,7 +465,7 @@ class DocumentImporterTest {
|
|||||||
.originalFilename("W-0007").status(DocumentStatus.PLACEHOLDER).build();
|
.originalFilename("W-0007").status(DocumentStatus.PLACEHOLDER).build();
|
||||||
when(documentService.findByOriginalFilename("W-0007")).thenReturn(Optional.of(existing));
|
when(documentService.findByOriginalFilename("W-0007")).thenReturn(Optional.of(existing));
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", "", ""));
|
Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
@@ -396,7 +486,7 @@ class DocumentImporterTest {
|
|||||||
when(documentService.findByOriginalFilename("W-0008")).thenReturn(Optional.of(existing));
|
when(documentService.findByOriginalFilename("W-0008")).thenReturn(Optional.of(existing));
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
// The canonical row now carries no receiver and no tag: both stale links must go.
|
// The canonical row now carries no receiver and no tag: both stale links must go.
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0008", "", "", "", "", "", "", "", "", ""));
|
Path xlsx = writeDocs(tempDir, docRow("W-0008", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
@@ -411,7 +501,7 @@ class DocumentImporterTest {
|
|||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
when(documentService.findByOriginalFilename("W-0100")).thenReturn(Optional.empty());
|
when(documentService.findByOriginalFilename("W-0100")).thenReturn(Optional.empty());
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0100", "", "", "", "", "",
|
Path xlsx = writeDocs(tempDir, docRow("W-0100", "", "", "", "",
|
||||||
"1916-06-01", "Juni 1916", "MONTH", ""));
|
"1916-06-01", "Juni 1916", "MONTH", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
@@ -425,7 +515,7 @@ class DocumentImporterTest {
|
|||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
when(documentService.findByOriginalFilename("W-0101")).thenReturn(Optional.empty());
|
when(documentService.findByOriginalFilename("W-0101")).thenReturn(Optional.empty());
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0101", "", "", "", "", "",
|
Path xlsx = writeDocs(tempDir, docRow("W-0101", "", "", "", "",
|
||||||
"1943-12-24", "24.12.1943", "DAY", ""));
|
"1943-12-24", "24.12.1943", "DAY", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
@@ -439,7 +529,7 @@ class DocumentImporterTest {
|
|||||||
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
when(documentService.findByOriginalFilename("W-0102")).thenReturn(Optional.empty());
|
when(documentService.findByOriginalFilename("W-0102")).thenReturn(Optional.empty());
|
||||||
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
Path xlsx = writeDocs(tempDir, docRow("W-0102", "", "", "", "", "",
|
Path xlsx = writeDocs(tempDir, docRow("W-0102", "", "", "", "",
|
||||||
"", "?", "UNKNOWN", ""));
|
"", "?", "UNKNOWN", ""));
|
||||||
|
|
||||||
importer.load(xlsx.toFile());
|
importer.load(xlsx.toFile());
|
||||||
@@ -450,12 +540,15 @@ class DocumentImporterTest {
|
|||||||
|
|
||||||
// ─── helpers ─────────────────────────────────────────────────────────────────────
|
// ─── helpers ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
private Map<String, String> docRow(String index, String file, String senderId, String senderName,
|
private Boolean validIndex(String index) {
|
||||||
|
return ReflectionTestUtils.invokeMethod(importer, "isValidImportIndex", index);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, String> docRow(String index, String senderId, String senderName,
|
||||||
String receiverIds, String receiverNames, String dateIso,
|
String receiverIds, String receiverNames, String dateIso,
|
||||||
String dateRaw, String datePrecision, String dateEnd) {
|
String dateRaw, String datePrecision, String dateEnd) {
|
||||||
Map<String, String> r = new LinkedHashMap<>();
|
Map<String, String> r = new LinkedHashMap<>();
|
||||||
r.put("index", index);
|
r.put("index", index);
|
||||||
r.put("file", file);
|
|
||||||
r.put("sender_person_id", senderId);
|
r.put("sender_person_id", senderId);
|
||||||
r.put("sender_name", senderName);
|
r.put("sender_name", senderName);
|
||||||
r.put("receiver_person_ids", receiverIds);
|
r.put("receiver_person_ids", receiverIds);
|
||||||
@@ -471,7 +564,7 @@ class DocumentImporterTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private Map<String, String> docRowWithTag(String index, String tagPath) {
|
private Map<String, String> docRowWithTag(String index, String tagPath) {
|
||||||
Map<String, String> r = docRow(index, "", "", "", "", "", "", "", "", "");
|
Map<String, String> r = docRow(index, "", "", "", "", "", "", "", "");
|
||||||
r.put("tags", tagPath);
|
r.put("tags", tagPath);
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
@@ -479,7 +572,7 @@ class DocumentImporterTest {
|
|||||||
@SafeVarargs
|
@SafeVarargs
|
||||||
private Path writeDocs(Path dir, Map<String, String>... rows) throws Exception {
|
private Path writeDocs(Path dir, Map<String, String>... rows) throws Exception {
|
||||||
Path xlsx = dir.resolve("canonical-documents.xlsx");
|
Path xlsx = dir.resolve("canonical-documents.xlsx");
|
||||||
List<String> headers = List.of("index", "file", "sender_person_id", "sender_name",
|
List<String> headers = List.of("index", "sender_person_id", "sender_name",
|
||||||
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision",
|
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision",
|
||||||
"date_end", "location", "tags", "summary");
|
"date_end", "location", "tags", "summary");
|
||||||
try (XSSFWorkbook wb = new XSSFWorkbook()) {
|
try (XSSFWorkbook wb = new XSSFWorkbook()) {
|
||||||
|
|||||||
@@ -26,8 +26,10 @@
|
|||||||
# MAIL_HOST, MAIL_PORT, SMTP relay (production only; staging uses mailpit)
|
# MAIL_HOST, MAIL_PORT, SMTP relay (production only; staging uses mailpit)
|
||||||
# MAIL_USERNAME, MAIL_PASSWORD
|
# MAIL_USERNAME, MAIL_PASSWORD
|
||||||
# APP_MAIL_FROM sender address (e.g. noreply@raddatz.cloud)
|
# APP_MAIL_FROM sender address (e.g. noreply@raddatz.cloud)
|
||||||
# IMPORT_HOST_DIR absolute host path holding ONLY the ODS
|
# IMPORT_HOST_DIR absolute host path holding the canonical
|
||||||
# spreadsheet and PDFs for /admin/system mass
|
# import artifacts (canonical-*.xlsx +
|
||||||
|
# canonical-persons-tree.json) and the
|
||||||
|
# <index>.pdf files for /admin/system
|
||||||
# import — mounted read-only at /import inside
|
# import — mounted read-only at /import inside
|
||||||
# the backend. Compose refuses to start when
|
# the backend. Compose refuses to start when
|
||||||
# this var is unset, so staging and prod cannot
|
# this var is unset, so staging and prod cannot
|
||||||
@@ -217,12 +219,13 @@ services:
|
|||||||
# Bound to localhost only — Caddy fronts external traffic.
|
# Bound to localhost only — Caddy fronts external traffic.
|
||||||
ports:
|
ports:
|
||||||
- "127.0.0.1:${PORT_BACKEND}:8080"
|
- "127.0.0.1:${PORT_BACKEND}:8080"
|
||||||
# Host path holding the ODS spreadsheet + PDFs for the mass-import endpoint.
|
# Host path holding the canonical import artifacts (canonical-*.xlsx +
|
||||||
# Read-only; MassImportService only reads (Files.list / Files.walk on /import).
|
# canonical-persons-tree.json) + <index>.pdf files for the import endpoint.
|
||||||
|
# Read-only; the canonical importer only reads them from /import.
|
||||||
# Required — no default — so staging and prod cannot accidentally share an
|
# Required — no default — so staging and prod cannot accidentally share an
|
||||||
# import source. CI workflows pin this per-env (see .gitea/workflows/).
|
# import source. CI workflows pin this per-env (see .gitea/workflows/).
|
||||||
volumes:
|
volumes:
|
||||||
- ${IMPORT_HOST_DIR:?Set IMPORT_HOST_DIR to a host path holding the mass-import payload (ODS + PDFs). See docs/DEPLOYMENT.md.}:/import:ro
|
- ${IMPORT_HOST_DIR:?Set IMPORT_HOST_DIR to a host path holding the import payload (canonical artifacts + <index>.pdf files). See docs/DEPLOYMENT.md.}:/import:ro
|
||||||
environment:
|
environment:
|
||||||
SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/archiv
|
SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/archiv
|
||||||
SPRING_DATASOURCE_USERNAME: archiv
|
SPRING_DATASOURCE_USERNAME: archiv
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ Members of the cross-cutting layer have no entity of their own, no user-facing C
|
|||||||
| `dashboard` | Stats aggregation for the admin dashboard and Family Pulse widget | Aggregates from 3+ domains; no owned entities |
|
| `dashboard` | Stats aggregation for the admin dashboard and Family Pulse widget | Aggregates from 3+ domains; no owned entities |
|
||||||
| `exception` | `DomainException`, `ErrorCode` enum, `GlobalExceptionHandler` | Framework infra; consumed by every controller and service. Adding a new `ErrorCode` requires matching updates in `frontend/src/lib/shared/errors.ts` and all three `messages/*.json` locale files. Current security-related codes: `CSRF_TOKEN_MISSING` (403 on mutating request without valid `X-XSRF-TOKEN` header), `TOO_MANY_LOGIN_ATTEMPTS` (429 when login rate limit exceeded). |
|
| `exception` | `DomainException`, `ErrorCode` enum, `GlobalExceptionHandler` | Framework infra; consumed by every controller and service. Adding a new `ErrorCode` requires matching updates in `frontend/src/lib/shared/errors.ts` and all three `messages/*.json` locale files. Current security-related codes: `CSRF_TOKEN_MISSING` (403 on mutating request without valid `X-XSRF-TOKEN` header), `TOO_MANY_LOGIN_ATTEMPTS` (429 when login rate limit exceeded). |
|
||||||
| `filestorage` | `FileService` — MinIO/S3 upload, download, presigned-URL generation | Generic service; consumed by `document` and `ocr` |
|
| `filestorage` | `FileService` — MinIO/S3 upload, download, presigned-URL generation | Generic service; consumed by `document` and `ocr` |
|
||||||
| `importing` | `MassImportService` — async ODS/Excel batch import | Orchestrates across `person`, `tag`, `document` |
|
| `importing` | `CanonicalImportOrchestrator` — async canonical import running four idempotent loaders (`TagTreeImporter` → `PersonRegisterImporter` → `PersonTreeImporter` → `DocumentImporter`) over the normalizer's committed canonical artifacts (`canonical-*.xlsx` + `canonical-persons-tree.json`) | Orchestrates across `person`, `tag`, `document` |
|
||||||
| `security` | `SecurityConfig`, `Permission` enum, `@RequirePermission` annotation, `PermissionAspect` (AOP) | Framework infra; enforced globally across all controllers |
|
| `security` | `SecurityConfig`, `Permission` enum, `@RequirePermission` annotation, `PermissionAspect` (AOP) | Framework infra; enforced globally across all controllers |
|
||||||
|
|
||||||
**Frontend `shared/`** follows the same admission criteria. Key members: `api.server.ts` (typed openapi-fetch client factory), `errors.ts` (backend `ErrorCode` → i18n mapping), `shared/primitives/` (generic UI components used across ≥2 domains), `shared/discussion/` (comment/mention editor used by `document` and `geschichte`), `shared/utils/` (pure date/sort/debounce utilities).
|
**Frontend `shared/`** follows the same admission criteria. Key members: `api.server.ts` (typed openapi-fetch client factory), `errors.ts` (backend `ErrorCode` → i18n mapping), `shared/primitives/` (generic UI components used across ≥2 domains), `shared/discussion/` (comment/mention editor used by `document` and `geschichte`), `shared/utils/` (pure date/sort/debounce utilities).
|
||||||
|
|||||||
@@ -99,7 +99,7 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back
|
|||||||
| `APP_BASE_URL` | Public-facing URL for email links | `http://localhost:3000` | YES (prod) | — |
|
| `APP_BASE_URL` | Public-facing URL for email links | `http://localhost:3000` | YES (prod) | — |
|
||||||
| `APP_OCR_BASE_URL` | Internal URL of the OCR service | — | YES | — |
|
| `APP_OCR_BASE_URL` | Internal URL of the OCR service | — | YES | — |
|
||||||
| `APP_OCR_TRAINING_TOKEN` | Secret token for OCR training endpoints | — | YES (prod) | YES |
|
| `APP_OCR_TRAINING_TOKEN` | Secret token for OCR training endpoints | — | YES (prod) | YES |
|
||||||
| `IMPORT_HOST_DIR` | Absolute host path holding the ODS spreadsheet + PDFs for the `/admin/system` mass-import card. Mounted read-only at `/import` inside the backend (compose-only — backend reads via `app.import.dir`). Compose refuses to start when unset, so staging and prod cannot accidentally share the source. Convention: `/srv/familienarchiv-staging/import` and `/srv/familienarchiv-production/import` | — | YES (prod compose) | — |
|
| `IMPORT_HOST_DIR` | Absolute host path holding the normalizer's canonical artifacts (`canonical-{documents,persons,tag-tree}.xlsx` + `canonical-persons-tree.json`) **plus the `<index>.pdf` files** for the `/admin/system` import. Mounted read-only at `/import` inside the backend (the canonical importer reads via `app.import.dir`). Compose refuses to start when unset, so staging and prod cannot accidentally share the source. Convention: `/srv/familienarchiv-staging/import` and `/srv/familienarchiv-production/import` | — | YES (prod compose) | — |
|
||||||
| `MAIL_HOST` | SMTP host | `mailpit` (dev) | YES (prod) | — |
|
| `MAIL_HOST` | SMTP host | `mailpit` (dev) | YES (prod) | — |
|
||||||
| `MAIL_PORT` | SMTP port | `1025` (dev) | YES (prod) | — |
|
| `MAIL_PORT` | SMTP port | `1025` (dev) | YES (prod) | — |
|
||||||
| `MAIL_USERNAME` | SMTP username | — | YES (prod) | YES |
|
| `MAIL_USERNAME` | SMTP username | — | YES (prod) | YES |
|
||||||
@@ -577,10 +577,15 @@ python3 -m venv .venv && .venv/bin/pip install -r requirements.txt # once, on
|
|||||||
# writes the four canonical artifacts into ./out/
|
# writes the four canonical artifacts into ./out/
|
||||||
```
|
```
|
||||||
|
|
||||||
**Dev:** place all four canonical artifacts **plus** the referenced PDFs into `./import/`
|
**Dev:** place all four canonical artifacts **plus** the PDFs into `./import/`
|
||||||
at the repo root (the dev compose bind-mounts it to `/import`, which is `app.import.dir`).
|
at the repo root (the dev compose bind-mounts it to `/import`, which is `app.import.dir`).
|
||||||
The orchestrator smoke-checks that all four artifacts are present before starting and fails
|
Each PDF must be named `<index>.pdf` (e.g. `W-0124.pdf`, `Mü-0001.pdf`) and live flat in the
|
||||||
closed (`IMPORT_ARTIFACT_INVALID`) if any is missing.
|
import dir: since #686 the importer resolves a document's PDF directly by its index
|
||||||
|
(`importDir/<index>.pdf`), not via a `datei`/`file` column — the recursive directory walk and
|
||||||
|
its basename/homoglyph guards are gone, replaced by strict index validation plus a
|
||||||
|
canonical-path containment assertion (a document whose `<index>.pdf` is absent simply becomes a
|
||||||
|
`PLACEHOLDER`). The orchestrator smoke-checks that all four artifacts are present before
|
||||||
|
starting and fails closed (`IMPORT_ARTIFACT_INVALID`) if any is missing.
|
||||||
|
|
||||||
**Staging/production:**
|
**Staging/production:**
|
||||||
|
|
||||||
|
|||||||
@@ -94,17 +94,6 @@ The schema includes `spring_session` and `spring_session_attributes` tables, but
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### `MassImportService` provides no status or error feedback
|
|
||||||
**File:** `service/MassImportService.java`, `controller/AdminController.java`
|
|
||||||
|
|
||||||
`/api/admin/trigger-import` returns immediately (async), but there is no way for the admin to know whether the import succeeded, failed, or is still running. Errors during async execution are silently swallowed.
|
|
||||||
|
|
||||||
**Fix options:**
|
|
||||||
- Store import job status in a DB table (`import_jobs`) with state (`RUNNING`, `DONE`, `FAILED`) and expose a `GET /api/admin/import-status` endpoint
|
|
||||||
- Alternatively, make the endpoint synchronous since it already blocks on file I/O — only use async if you need true non-blocking behaviour
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Missing Capabilities
|
## Missing Capabilities
|
||||||
|
|
||||||
### No test coverage
|
### No test coverage
|
||||||
@@ -114,7 +103,7 @@ The only test is a Spring context load test. No unit or integration tests exist
|
|||||||
|
|
||||||
**Suggested starting points (highest value for effort):**
|
**Suggested starting points (highest value for effort):**
|
||||||
1. `DocumentSpecifications` — pure logic, easy to unit test with an in-memory H2 or Testcontainers PostgreSQL
|
1. `DocumentSpecifications` — pure logic, easy to unit test with an in-memory H2 or Testcontainers PostgreSQL
|
||||||
2. `ExcelService` — parsing logic, test with fixture `.xlsx` files (one exists in `api_tests/`)
|
2. Canonical import loaders (`CanonicalSheetReader`, `DocumentImporter`, etc.) — parsing/upsert logic, test with fixture canonical `.xlsx` files
|
||||||
3. `PermissionAspect` — security logic should be tested; use `@WithMockUser` from Spring Security Test
|
3. `PermissionAspect` — security logic should be tested; use `@WithMockUser` from Spring Security Test
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -102,12 +102,23 @@ Settled sub-decisions:
|
|||||||
- **`provisional` is now populated.** Importer-minted persons are `provisional = true`;
|
- **`provisional` is now populated.** Importer-minted persons are `provisional = true`;
|
||||||
register and tree persons stay `false`. This is the Phase-3 contract the schema (decision 1)
|
register and tree persons stay `false`. This is the Phase-3 contract the schema (decision 1)
|
||||||
left at default-`false`.
|
left at default-`false`.
|
||||||
- **Security guards are defense-in-depth, not upstream-trust.** The `file` column is treated as
|
- **PDFs resolve directly by index (`<index>.pdf`), not by a `file` column.** The corpus is
|
||||||
hostile (CWE-22 does not care it came from our tool): its basename is validated
|
uniform — all PDFs are named `<index>.pdf` flat in the import dir (e.g. `W-0124.pdf`,
|
||||||
(`isValidImportFilename` — slash/backslash, three Unicode slash homoglyphs, `..`, null byte,
|
`Mü-0001.pdf`) — so `DocumentImporter` resolves a document's PDF with an O(1)
|
||||||
absolute path) and resolved only inside the import dir with canonical-path containment, so a
|
`importDir.resolve(index + ".pdf")` lookup. The redundant `file` column (carrying the
|
||||||
traversal value can never escape. The `%PDF` magic-byte check gates upload. These guards and
|
spreadsheet's messy `datei` value) and the recursive directory walk that resolved it were
|
||||||
their tests were ported from `MassImportService` **before** it was deleted.
|
removed (#686, which also closed #676 — the O(rows×tree) walk is gone). The normalizer no
|
||||||
|
longer emits `file` or the `index_file_mismatch` review flag.
|
||||||
|
- **Security guards are defense-in-depth, not upstream-trust.** The `index` is the only thing
|
||||||
|
that drives the on-disk lookup, so it is treated as hostile (CWE-22 does not care it came from
|
||||||
|
our tool): `isValidImportIndex` rejects slash/backslash, three Unicode slash homoglyphs, any
|
||||||
|
`.` (so `<index>.pdf` is the only extension and `..` can never appear), null byte, and
|
||||||
|
absolute paths, and requires a strict catalog shape (1–4 Latin letters incl. umlauts, one or
|
||||||
|
more hyphens, digits, optional trailing `x`). A bad index skips the row with a clear
|
||||||
|
`SkipReason` (`INVALID_FILENAME_PATH_TRAVERSAL`). The resolved canonical path is still asserted
|
||||||
|
to stay inside the import dir as a second line of defense (a symlinked `<index>.pdf` cannot
|
||||||
|
escape), and the `%PDF` magic-byte check still gates upload. These guards and their tests were
|
||||||
|
ported from the file-column resolution (originally from `MassImportService`).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -138,11 +149,25 @@ Settled sub-decisions:
|
|||||||
the same state, so the operational recovery for a partial failure is simply to fix the
|
the same state, so the operational recovery for a partial failure is simply to fix the
|
||||||
offending artifact and re-trigger the import — no manual cleanup of half-written data is
|
offending artifact and re-trigger the import — no manual cleanup of half-written data is
|
||||||
required. A future maintainer must not assume all-or-nothing semantics.
|
required. A future maintainer must not assume all-or-nothing semantics.
|
||||||
- **Path-escape aborts the whole import (fail-closed), by design.** A path-traversal or
|
- **The index pattern is corpus-specific and must be revisited if the catalog scheme grows.**
|
||||||
symlink-escape in a row's file path is treated as an attack signal: the import aborts rather
|
`INDEX_PATTERN` accepts only the *current* corpus shape — at most four Latin-1 letters (incl.
|
||||||
than recording the row as a `SkippedFile` and continuing. This is a deliberate owner decision
|
umlauts) followed by one or more hyphens, ASCII digits, and an optional trailing `x`. This is a
|
||||||
(2026-05-27) over a per-file skip — a malicious path must surface loudly, not be silently
|
conscious constraint, not a general filename validator: a future sub-collection catalogued with
|
||||||
tolerated.
|
a 5-letter prefix, a digit-led id, or a non-Latin-1 letter (e.g. `Č` or a Cyrillic id) would
|
||||||
|
fail `isValidImportIndex` and its rows would be **skipped** (`INVALID_FILENAME_PATH_TRAVERSAL`),
|
||||||
|
not imported. Likewise a real PDF that does not follow `<index>.pdf` produces a `PLACEHOLDER`
|
||||||
|
(the importer logs both cases distinctly — see #686). If the catalog scheme ever changes, the
|
||||||
|
pattern and its tests must be widened deliberately; do not loosen it casually, as it is the
|
||||||
|
allowlist that keeps the on-disk lookup safe. Note `\d` is intentionally ASCII-only — adding
|
||||||
|
`Pattern.UNICODE_CHARACTER_CLASS` would silently widen the accepted digit set.
|
||||||
|
- **A malicious/garbage index skips its row with a loud `SkipReason`, by design.** Since #686
|
||||||
|
the index is the only on-disk lookup key. An index that fails `isValidImportIndex`
|
||||||
|
(path separator, traversal token, slash homoglyph, null byte, absolute path, or a non-catalog
|
||||||
|
shape) is recorded as a `SkippedFile` with reason `INVALID_FILENAME_PATH_TRAVERSAL` and the
|
||||||
|
import continues with the remaining rows — nothing outside the import dir is ever read. A
|
||||||
|
symlinked `<index>.pdf` whose canonical path escapes the import dir is the one case that still
|
||||||
|
aborts the import (a `DomainException` from the containment assertion), because a syntactically
|
||||||
|
valid index resolving outside the dir is an environment-level attack signal, not a row typo.
|
||||||
- **`PersonSummaryDTO` coupling.** `provisional` was added to the `PersonSummaryDTO` native
|
- **`PersonSummaryDTO` coupling.** `provisional` was added to the `PersonSummaryDTO` native
|
||||||
interface projection; because the projection is backed by native SQL, the column had to be
|
interface projection; because the projection is backed by native SQL, the column had to be
|
||||||
added to all three native `SELECT`s (`findAllWithDocumentCount`, `searchWithDocumentCount`,
|
added to all three native `SELECT`s (`findAllWithDocumentCount`, `searchWithDocumentCount`,
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ C4Component
|
|||||||
|
|
||||||
### 3b — Document Management & Import
|
### 3b — Document Management & Import
|
||||||
|
|
||||||
Document management, file storage, and bulk Excel/ODS import.
|
Document management, file storage, and the canonical import.
|
||||||
|
|
||||||
```mermaid
|
```mermaid
|
||||||
C4Component
|
C4Component
|
||||||
@@ -105,12 +105,11 @@ C4Component
|
|||||||
|
|
||||||
System_Boundary(backend, "API Backend (Spring Boot)") {
|
System_Boundary(backend, "API Backend (Spring Boot)") {
|
||||||
Component(docCtrl, "DocumentController", "Spring MVC — /api/documents", "CRUD for documents: search, get by ID, update metadata, upload/download file, conversation thread, and batch metadata updates.")
|
Component(docCtrl, "DocumentController", "Spring MVC — /api/documents", "CRUD for documents: search, get by ID, update metadata, upload/download file, conversation thread, and batch metadata updates.")
|
||||||
Component(adminCtrl, "AdminController", "Spring MVC — /api/admin", "Triggers asynchronous Excel/ODS mass import (requires ADMIN permission). Reports import state (IDLE/RUNNING/DONE/FAILED).")
|
Component(adminCtrl, "AdminController", "Spring MVC — /api/admin", "Triggers the asynchronous canonical import (requires ADMIN permission). Reports import state via GET /api/admin/import-status (IDLE/RUNNING/DONE/FAILED).")
|
||||||
|
|
||||||
Component(docSvc, "DocumentService", "Spring Service", "Core document business logic: store, update, search. Resolves persons and tags, delegates file I/O to FileService, builds dynamic JPA Specifications, and integrates with audit logging.")
|
Component(docSvc, "DocumentService", "Spring Service", "Core document business logic: store, update, search. Resolves persons and tags, delegates file I/O to FileService, builds dynamic JPA Specifications, and integrates with audit logging.")
|
||||||
Component(fileSvc, "FileService", "Spring Service", "Wraps AWS SDK v2 S3Client. Uploads files with UUID-keyed paths, computes SHA-256 hash, downloads with content-type detection, and generates presigned URLs for OCR access.")
|
Component(fileSvc, "FileService", "Spring Service", "Wraps AWS SDK v2 S3Client. Uploads files with UUID-keyed paths, computes SHA-256 hash, downloads with content-type detection, and generates presigned URLs for OCR access.")
|
||||||
Component(massImport, "MassImportService", "Spring Service — @Async", "Reads Excel/ODS files from /import mount. Tracks import state (IDLE/RUNNING/DONE/FAILED) and delegates to ExcelService. Returns immediately; processing runs asynchronously.")
|
Component(importOrch, "CanonicalImportOrchestrator", "Spring Service — @Async", "Runs four idempotent loaders (TagTree → PersonRegister → PersonTree → Document) in a fixed DAG over the normalizer's committed canonical artifacts (canonical-*.xlsx + canonical-persons-tree.json) from /import — see diagram 3b. Owns the IDLE/RUNNING/DONE/FAILED state machine.")
|
||||||
Component(excelSvc, "ExcelService", "Spring Service", "Parses Excel/ODS workbooks (Apache POI). Column indices configurable via application.properties. Creates/updates document records per row.")
|
|
||||||
Component(minioConf, "MinioConfig", "Spring @Configuration", "Creates the S3Client and S3Presigner beans with path-style access for MinIO. Validates MinIO connectivity on startup.")
|
Component(minioConf, "MinioConfig", "Spring @Configuration", "Creates the S3Client and S3Presigner beans with path-style access for MinIO. Validates MinIO connectivity on startup.")
|
||||||
|
|
||||||
Component(docRepo, "DocumentRepository", "Spring Data JPA", "Queries documents with Specification-based dynamic search, bidirectional conversation thread queries, full-text search with ranking and match highlighting, and transcription pipeline queue projections.")
|
Component(docRepo, "DocumentRepository", "Spring Data JPA", "Queries documents with Specification-based dynamic search, bidirectional conversation thread queries, full-text search with ranking and match highlighting, and transcription pipeline queue projections.")
|
||||||
@@ -123,14 +122,15 @@ C4Component
|
|||||||
Rel(frontend, docCtrl, "Document requests", "HTTP / JSON")
|
Rel(frontend, docCtrl, "Document requests", "HTTP / JSON")
|
||||||
Rel(frontend, adminCtrl, "Trigger import", "HTTP / JSON")
|
Rel(frontend, adminCtrl, "Trigger import", "HTTP / JSON")
|
||||||
Rel(docCtrl, docSvc, "Delegates to", "")
|
Rel(docCtrl, docSvc, "Delegates to", "")
|
||||||
Rel(adminCtrl, massImport, "Triggers", "")
|
Rel(adminCtrl, importOrch, "Triggers", "")
|
||||||
Rel(docSvc, fileSvc, "Upload / download files", "")
|
Rel(docSvc, fileSvc, "Upload / download files", "")
|
||||||
Rel(docSvc, docRepo, "Reads / writes documents", "")
|
Rel(docSvc, docRepo, "Reads / writes documents", "")
|
||||||
Rel(docSvc, docSpec, "Builds search predicates", "")
|
Rel(docSvc, docSpec, "Builds search predicates", "")
|
||||||
Rel(docSvc, personSvc, "Resolves sender / receivers", "")
|
Rel(docSvc, personSvc, "Resolves sender / receivers", "")
|
||||||
Rel(docSvc, tagSvc, "Finds or creates tags", "")
|
Rel(docSvc, tagSvc, "Finds or creates tags", "")
|
||||||
Rel(massImport, excelSvc, "Parses Excel/ODS file", "")
|
Rel(importOrch, docSvc, "Upserts documents (PDF by index) — see 3b", "")
|
||||||
Rel(excelSvc, docSvc, "Creates / updates documents", "")
|
Rel(importOrch, personSvc, "Upserts persons + relationships", "")
|
||||||
|
Rel(importOrch, tagSvc, "Upserts tag hierarchy", "")
|
||||||
Rel(minioConf, fileSvc, "Provides S3Client and S3Presigner beans", "")
|
Rel(minioConf, fileSvc, "Provides S3Client and S3Presigner beans", "")
|
||||||
Rel(fileSvc, minio, "PUT / GET / presigned URL objects", "S3 API / HTTP")
|
Rel(fileSvc, minio, "PUT / GET / presigned URL objects", "S3 API / HTTP")
|
||||||
Rel(docRepo, db, "SQL queries", "JDBC")
|
Rel(docRepo, db, "SQL queries", "JDBC")
|
||||||
@@ -492,7 +492,7 @@ C4Component
|
|||||||
Component(adminGroups, "/admin/groups, /admin/groups/[id], /admin/groups/new", "SvelteKit Routes", "Permission group management: create/edit groups and their permission sets.")
|
Component(adminGroups, "/admin/groups, /admin/groups/[id], /admin/groups/new", "SvelteKit Routes", "Permission group management: create/edit groups and their permission sets.")
|
||||||
Component(adminTags, "/admin/tags and /admin/tags/[id]", "SvelteKit Routes", "Tag administration: edit tag hierarchy, merge tags, delete subtrees.")
|
Component(adminTags, "/admin/tags and /admin/tags/[id]", "SvelteKit Routes", "Tag administration: edit tag hierarchy, merge tags, delete subtrees.")
|
||||||
Component(adminOcr, "/admin/ocr and /admin/ocr/[personId]", "SvelteKit Routes", "Global and per-person OCR configuration. Manages script types and triggers sender model training.")
|
Component(adminOcr, "/admin/ocr and /admin/ocr/[personId]", "SvelteKit Routes", "Global and per-person OCR configuration. Manages script types and triggers sender model training.")
|
||||||
Component(adminSystem, "/admin/system", "SvelteKit Route", "System status panel. Triggers Excel/ODS mass import (POST /api/admin/trigger-import). Displays import state.")
|
Component(adminSystem, "/admin/system", "SvelteKit Route", "System status panel. Triggers the canonical import (POST /api/admin/trigger-import). Displays import state.")
|
||||||
Component(hilfe, "/hilfe/transkription", "SvelteKit Route", "Static transcription style guide for Kurrent and Sütterlin character recognition. No backend calls.")
|
Component(hilfe, "/hilfe/transkription", "SvelteKit Route", "Static transcription style guide for Kurrent and Sütterlin character recognition. No backend calls.")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ System_Boundary(backend, "API Backend (Spring Boot)") {
|
|||||||
Component(tagTreeLoader, "TagTreeImporter", "Spring Component", "Upserts the tag hierarchy from canonical-tag-tree.xlsx via TagService (by canonical tag_path).")
|
Component(tagTreeLoader, "TagTreeImporter", "Spring Component", "Upserts the tag hierarchy from canonical-tag-tree.xlsx via TagService (by canonical tag_path).")
|
||||||
Component(personRegLoader, "PersonRegisterImporter", "Spring Component", "Upserts register persons from canonical-persons.xlsx via PersonService (by normalizer person_id).")
|
Component(personRegLoader, "PersonRegisterImporter", "Spring Component", "Upserts register persons from canonical-persons.xlsx via PersonService (by normalizer person_id).")
|
||||||
Component(personTreeLoader, "PersonTreeImporter", "Spring Component", "Upserts tree persons + relationships from canonical-persons-tree.json via PersonService and RelationshipService.")
|
Component(personTreeLoader, "PersonTreeImporter", "Spring Component", "Upserts tree persons + relationships from canonical-persons-tree.json via PersonService and RelationshipService.")
|
||||||
Component(docLoader, "DocumentImporter", "Spring Component", "Loads canonical-documents.xlsx: routes attribution register-first (raw cell always retained in sender_text/receiver_text), parses clean dates, builds an honest precision-aware title via DocumentTitleFormatter, keeps the S3 upload + thumbnail plumbing, and ports the path-traversal / homoglyph / absolute-path / %PDF magic-byte security guards.")
|
Component(docLoader, "DocumentImporter", "Spring Component", "Loads canonical-documents.xlsx: routes attribution register-first (raw cell always retained in sender_text/receiver_text), parses clean dates, builds an honest precision-aware title via DocumentTitleFormatter, keeps the S3 upload + thumbnail plumbing, and resolves each PDF by index (importDir/<index>.pdf) guarded by strict index validation + canonical-path containment + %PDF magic-byte check (no recursive walk).")
|
||||||
Component(titleFmt, "DocumentTitleFormatter", "Pure helper", "Formats the date label baked into an import title at exactly the data's precision (MONTH -> 'Juni 1916', never a fabricated day). Mirrors the frontend formatDocumentDate; both are pinned to docs/date-label-fixtures.json (#666).")
|
Component(titleFmt, "DocumentTitleFormatter", "Pure helper", "Formats the date label baked into an import title at exactly the data's precision (MONTH -> 'Juni 1916', never a fabricated day). Mirrors the frontend formatDocumentDate; both are pinned to docs/date-label-fixtures.json (#666).")
|
||||||
Component(sheetReader, "CanonicalSheetReader", "POI helper", "Maps a canonical .xlsx by header name (no positional indices), splits pipe-delimited list columns, fails closed (IMPORT_ARTIFACT_INVALID) on a missing required header.")
|
Component(sheetReader, "CanonicalSheetReader", "POI helper", "Maps a canonical .xlsx by header name (no positional indices), splits pipe-delimited list columns, fails closed (IMPORT_ARTIFACT_INVALID) on a missing required header.")
|
||||||
Component(minioConf, "MinioConfig", "Spring @Configuration", "Creates the S3Client and S3Presigner beans with path-style access for MinIO. Validates MinIO connectivity on startup.")
|
Component(minioConf, "MinioConfig", "Spring @Configuration", "Creates the S3Client and S3Presigner beans with path-style access for MinIO. Validates MinIO connectivity on startup.")
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ System_Boundary(frontend, "Web Frontend (SvelteKit / SSR)") {
|
|||||||
Component(adminGroups, "/admin/groups, /admin/groups/[id], /admin/groups/new", "SvelteKit Routes", "Permission group management: create/edit groups and their permission sets.")
|
Component(adminGroups, "/admin/groups, /admin/groups/[id], /admin/groups/new", "SvelteKit Routes", "Permission group management: create/edit groups and their permission sets.")
|
||||||
Component(adminTags, "/admin/tags and /admin/tags/[id]", "SvelteKit Routes", "Tag administration: edit tag hierarchy, merge tags, delete subtrees.")
|
Component(adminTags, "/admin/tags and /admin/tags/[id]", "SvelteKit Routes", "Tag administration: edit tag hierarchy, merge tags, delete subtrees.")
|
||||||
Component(adminOcr, "/admin/ocr and /admin/ocr/[personId]", "SvelteKit Routes", "Global and per-person OCR configuration. Manages script types and triggers sender model training.")
|
Component(adminOcr, "/admin/ocr and /admin/ocr/[personId]", "SvelteKit Routes", "Global and per-person OCR configuration. Manages script types and triggers sender model training.")
|
||||||
Component(adminSystem, "/admin/system", "SvelteKit Route", "System status panel. Triggers Excel/ODS mass import (POST /api/admin/trigger-import). Displays import state.")
|
Component(adminSystem, "/admin/system", "SvelteKit Route", "System status panel. Triggers the canonical import (POST /api/admin/trigger-import). Displays import state.")
|
||||||
Component(hilfe, "/hilfe/transkription", "SvelteKit Route", "Static transcription style guide for Kurrent and Sütterlin character recognition. No backend calls.")
|
Component(hilfe, "/hilfe/transkription", "SvelteKit Route", "Static transcription style guide for Kurrent and Sütterlin character recognition. No backend calls.")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ Outputs:
|
|||||||
| --- | --- |
|
| --- | --- |
|
||||||
| `unparsed-dates.csv` | For each `raw` (sorted by frequency), fill `suggested_iso` + `suggested_precision`, then paste `raw,suggested_iso,suggested_precision` into `overrides/dates.csv` (header `raw,iso,precision`). |
|
| `unparsed-dates.csv` | For each `raw` (sorted by frequency), fill `suggested_iso` + `suggested_precision`, then paste `raw,suggested_iso,suggested_precision` into `overrides/dates.csv` (header `raw,iso,precision`). |
|
||||||
| `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv` (look up valid ids in `out/canonical-persons.xlsx`). |
|
| `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv` (look up valid ids in `out/canonical-persons.xlsx`). |
|
||||||
| `index-file-mismatch.csv` | The `Datei` path disagrees with the index-derived filename — reconcile when the PDFs arrive. |
|
|
||||||
| `duplicate-index.csv`, `blank-index-rows.csv`, `skipped-x-suffix.csv` | Inspect; fix in the source spreadsheet if needed. |
|
| `duplicate-index.csv`, `blank-index-rows.csv`, `skipped-x-suffix.csv` | Inspect; fix in the source spreadsheet if needed. |
|
||||||
|
|
||||||
> `unresolved-names.csv` is the focused "names that need a human" list. Non-family
|
> `unresolved-names.csv` is the focused "names that need a human" list. Non-family
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ OVERRIDES_DIR = BASE_DIR / "overrides"
|
|||||||
# --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
|
# --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
|
||||||
DOCUMENT_HEADER_MAP = {
|
DOCUMENT_HEADER_MAP = {
|
||||||
"index": "index",
|
"index": "index",
|
||||||
"datei": "file",
|
|
||||||
"box": "box",
|
"box": "box",
|
||||||
"mappe": "folder",
|
"mappe": "folder",
|
||||||
"briefeschreiberin": "sender",
|
"briefeschreiberin": "sender",
|
||||||
|
|||||||
@@ -17,7 +17,6 @@ class Triage(Enum):
|
|||||||
class RawRow:
|
class RawRow:
|
||||||
source_row: int
|
source_row: int
|
||||||
index: str = ""
|
index: str = ""
|
||||||
file: str = ""
|
|
||||||
box: str = ""
|
box: str = ""
|
||||||
folder: str = ""
|
folder: str = ""
|
||||||
sender: str = ""
|
sender: str = ""
|
||||||
@@ -31,7 +30,6 @@ class RawRow:
|
|||||||
@dataclass
|
@dataclass
|
||||||
class CanonicalDocument:
|
class CanonicalDocument:
|
||||||
index: str
|
index: str
|
||||||
file: str = ""
|
|
||||||
box: str = ""
|
box: str = ""
|
||||||
folder: str = ""
|
folder: str = ""
|
||||||
sender_person_id: str = ""
|
sender_person_id: str = ""
|
||||||
@@ -49,7 +47,7 @@ class CanonicalDocument:
|
|||||||
needs_review: list = field(default_factory=list)
|
needs_review: list = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
|
_FIELDS = ["index", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
|
||||||
|
|
||||||
|
|
||||||
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
|
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
|
||||||
@@ -82,15 +80,6 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
|
|||||||
return "data_no_index"
|
return "data_no_index"
|
||||||
|
|
||||||
|
|
||||||
def index_file_mismatch(index: str, file_path: str) -> bool:
|
|
||||||
# Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
|
|
||||||
if not file_path.strip():
|
|
||||||
return False
|
|
||||||
basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
|
|
||||||
stem = basename.rsplit(".", 1)[0]
|
|
||||||
return stem != index
|
|
||||||
|
|
||||||
|
|
||||||
def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
|
def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
|
||||||
pd = _dates.parse_date(raw.date, date_overrides)
|
pd = _dates.parse_date(raw.date, date_overrides)
|
||||||
flags = []
|
flags = []
|
||||||
@@ -109,11 +98,9 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr
|
|||||||
flags.append("unparsed_date")
|
flags.append("unparsed_date")
|
||||||
if pd.needs_review:
|
if pd.needs_review:
|
||||||
flags.append("range_end_unparsed")
|
flags.append("range_end_unparsed")
|
||||||
if index_file_mismatch(raw.index, raw.file):
|
|
||||||
flags.append("index_file_mismatch")
|
|
||||||
|
|
||||||
return CanonicalDocument(
|
return CanonicalDocument(
|
||||||
index=raw.index, file=raw.file, box=raw.box, folder=raw.folder,
|
index=raw.index, box=raw.box, folder=raw.folder,
|
||||||
sender_person_id=sender_id, sender_name=sender_name,
|
sender_person_id=sender_id, sender_name=sender_name,
|
||||||
receiver_person_ids=[r[0] for r in receivers],
|
receiver_person_ids=[r[0] for r in receivers],
|
||||||
receiver_names=[r[1] for r in receivers],
|
receiver_names=[r[1] for r in receivers],
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
|||||||
d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS)
|
d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS)
|
||||||
index_col = d_fields["index"]
|
index_col = d_fields["index"]
|
||||||
|
|
||||||
canon_docs, blank_index, skipped_x, mismatches = [], [], [], []
|
canon_docs, blank_index, skipped_x = [], [], []
|
||||||
unparsed_by_raw: dict[str, list] = {}
|
unparsed_by_raw: dict[str, list] = {}
|
||||||
dates_by_override = 0
|
dates_by_override = 0
|
||||||
empty_count = 0
|
empty_count = 0
|
||||||
@@ -59,8 +59,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
|||||||
doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes))
|
doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes))
|
||||||
if "unparsed_date" in doc.needs_review:
|
if "unparsed_date" in doc.needs_review:
|
||||||
unparsed_by_raw.setdefault(raw.date, []).append(source_row)
|
unparsed_by_raw.setdefault(raw.date, []).append(source_row)
|
||||||
if "index_file_mismatch" in doc.needs_review:
|
|
||||||
mismatches.append([source_row, raw.index, raw.file])
|
|
||||||
canon_docs.append(doc)
|
canon_docs.append(doc)
|
||||||
|
|
||||||
# REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them.
|
# REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them.
|
||||||
@@ -102,7 +100,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
|||||||
key=lambda r: (r[0], -r[2], r[1]))
|
key=lambda r: (r[0], -r[2], r[1]))
|
||||||
writers.write_review_csv(review_dir / "unresolved-names.csv",
|
writers.write_review_csv(review_dir / "unresolved-names.csv",
|
||||||
["category", "raw", "count", "example_rows"], unresolved_rows)
|
["category", "raw", "count", "example_rows"], unresolved_rows)
|
||||||
writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches)
|
|
||||||
|
|
||||||
all_summaries = [doc.summary for doc in canon_docs if doc.summary]
|
all_summaries = [doc.summary for doc in canon_docs if doc.summary]
|
||||||
candidates = _tags.mine_summary_candidates(all_summaries)
|
candidates = _tags.mine_summary_candidates(all_summaries)
|
||||||
@@ -140,7 +137,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
|||||||
"blank_index_rows": len(blank_index),
|
"blank_index_rows": len(blank_index),
|
||||||
"skipped_x_suffix": len(skipped_x),
|
"skipped_x_suffix": len(skipped_x),
|
||||||
"duplicate_index_rows": len(duplicates),
|
"duplicate_index_rows": len(duplicates),
|
||||||
"index_file_mismatches": len(mismatches),
|
|
||||||
"# OVERRIDES": "",
|
"# OVERRIDES": "",
|
||||||
"date_overrides_loaded": len(date_overrides),
|
"date_overrides_loaded": len(date_overrides),
|
||||||
"name_overrides_loaded": len(name_overrides),
|
"name_overrides_loaded": len(name_overrides),
|
||||||
|
|||||||
Binary file not shown.
@@ -3,9 +3,9 @@ import documents
|
|||||||
from documents import Triage
|
from documents import Triage
|
||||||
|
|
||||||
def test_extract_row():
|
def test_extract_row():
|
||||||
header = {"index": 0, "file": 1, "box": 2, "folder": 3, "sender": 4,
|
header = {"index": 0, "box": 1, "folder": 2, "sender": 3,
|
||||||
"receivers": 5, "date": 6, "location": 7, "tags": 8, "summary": 9}
|
"receivers": 4, "date": 5, "location": 6, "tags": 7, "summary": 8}
|
||||||
cells = ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
|
cells = ["W-0001", "V", "1", "Walter de Gruyter",
|
||||||
"Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]
|
"Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]
|
||||||
raw = documents.extract_row(cells, header, source_row=3)
|
raw = documents.extract_row(cells, header, source_row=3)
|
||||||
assert raw.index == "W-0001"
|
assert raw.index == "W-0001"
|
||||||
@@ -26,14 +26,6 @@ def test_classify_blank_index():
|
|||||||
assert documents.classify_blank_index(banner, header) == "section_banner"
|
assert documents.classify_blank_index(banner, header) == "section_banner"
|
||||||
assert documents.classify_blank_index(data, header) == "data_no_index"
|
assert documents.classify_blank_index(data, header) == "data_no_index"
|
||||||
|
|
||||||
def test_index_file_mismatch():
|
|
||||||
assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
|
|
||||||
assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
|
|
||||||
assert documents.index_file_mismatch("W-0001", "") is False
|
|
||||||
assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path
|
|
||||||
assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir
|
|
||||||
|
|
||||||
|
|
||||||
def _ctx():
|
def _ctx():
|
||||||
people = persons.parse_register([
|
people = persons.parse_register([
|
||||||
{"last_name": "de Gruyter", "first_name": "Walter"},
|
{"last_name": "de Gruyter", "first_name": "Walter"},
|
||||||
@@ -46,22 +38,19 @@ def test_to_canonical_resolves_and_flags():
|
|||||||
raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1",
|
raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1",
|
||||||
sender="Walter de Gruyter", receivers="Eugenie Müller",
|
sender="Walter de Gruyter", receivers="Eugenie Müller",
|
||||||
date="15.2.1888", location="Rotterdam", tags="Brautbriefe",
|
date="15.2.1888", location="Rotterdam", tags="Brautbriefe",
|
||||||
summary="Geschäftsreise", file=r"..\__scan\W-0001.pdf")
|
summary="Geschäftsreise")
|
||||||
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
||||||
assert doc.sender_person_id == "de-gruyter-walter"
|
assert doc.sender_person_id == "de-gruyter-walter"
|
||||||
assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias
|
assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias
|
||||||
assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
|
assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
|
||||||
assert doc.tags == ["Themen/Brautbriefe"]
|
assert doc.tags == ["Themen/Brautbriefe"]
|
||||||
assert doc.file == r"..\__scan\W-0001.pdf" # file name carried through for the importer
|
|
||||||
assert doc.needs_review == []
|
assert doc.needs_review == []
|
||||||
|
|
||||||
|
|
||||||
def test_to_canonical_carries_file_name():
|
def test_canonical_document_has_no_file_field():
|
||||||
ctx = _ctx()
|
# #686: PDFs resolve by index (<index>.pdf) in the importer; the file field is gone.
|
||||||
raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="",
|
doc = documents.CanonicalDocument(index="W-0001")
|
||||||
file="H-0730.pdf")
|
assert not hasattr(doc, "file")
|
||||||
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
|
||||||
assert doc.file == "H-0730.pdf"
|
|
||||||
|
|
||||||
|
|
||||||
def test_to_canonical_range_carries_date_end():
|
def test_to_canonical_range_carries_date_end():
|
||||||
|
|||||||
@@ -32,18 +32,19 @@ def test_write_documents_xlsx_joins_lists(tmp_path):
|
|||||||
assert row["needs_review"] == "unparsed_date"
|
assert row["needs_review"] == "unparsed_date"
|
||||||
|
|
||||||
|
|
||||||
def test_write_documents_xlsx_carries_file_and_date_end(tmp_path):
|
def test_write_documents_xlsx_carries_date_end_and_has_no_file_column(tmp_path):
|
||||||
|
# #686: PDFs resolve by index (<index>.pdf), so the redundant "file" column is dropped.
|
||||||
doc = documents.CanonicalDocument(
|
doc = documents.CanonicalDocument(
|
||||||
index="H-0730", file="H-0730.pdf", date_iso="1917-01-10",
|
index="H-0730", date_iso="1917-01-10",
|
||||||
date_precision="RANGE", date_end="1917-01-11")
|
date_precision="RANGE", date_end="1917-01-11")
|
||||||
out = tmp_path / "docs.xlsx"
|
out = tmp_path / "docs.xlsx"
|
||||||
writers.write_documents_xlsx([doc], out)
|
writers.write_documents_xlsx([doc], out)
|
||||||
wb = openpyxl.load_workbook(out)
|
wb = openpyxl.load_workbook(out)
|
||||||
ws = wb.active
|
ws = wb.active
|
||||||
header = [c.value for c in ws[1]]
|
header = [c.value for c in ws[1]]
|
||||||
assert "file" in header and "date_end" in header
|
assert "file" not in header
|
||||||
|
assert "date_end" in header
|
||||||
row = {h: c.value for h, c in zip(header, ws[2])}
|
row = {h: c.value for h, c in zip(header, ws[2])}
|
||||||
assert row["file"] == "H-0730.pdf"
|
|
||||||
assert row["date_end"] == "1917-01-11"
|
assert row["date_end"] == "1917-01-11"
|
||||||
|
|
||||||
def test_write_documents_xlsx_pins_timestamp(tmp_path):
|
def test_write_documents_xlsx_pins_timestamp(tmp_path):
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ def _csv_safe(value):
|
|||||||
return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s
|
return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s
|
||||||
|
|
||||||
|
|
||||||
DOC_COLUMNS = ["index", "file", "box", "folder", "sender_person_id", "sender_name",
|
DOC_COLUMNS = ["index", "box", "folder", "sender_person_id", "sender_name",
|
||||||
"receiver_person_ids", "receiver_names", "date_iso", "date_raw",
|
"receiver_person_ids", "receiver_names", "date_iso", "date_raw",
|
||||||
"date_precision", "date_end", "location", "tags", "summary",
|
"date_precision", "date_end", "location", "tags", "summary",
|
||||||
"source_row", "needs_review"]
|
"source_row", "needs_review"]
|
||||||
|
|||||||
Reference in New Issue
Block a user