Import normalizer: offline tool to normalize the raw archive spreadsheets #663
@@ -24,6 +24,7 @@ import software.amazon.awssdk.services.s3.S3Client;
|
|||||||
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
|
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.OutputStream;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
@@ -126,6 +127,36 @@ class DocumentImporterTest {
|
|||||||
assertThat(validIndex("W-0001.pdf")).isFalse();
|
assertThat(validIndex("W-0001.pdf")).isFalse();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── catalog-shape rejects — pass the char pre-checks but must fail INDEX_PATTERN ────
|
||||||
|
// These pin the regex branch itself: each string contains no path separator, dot, slash
|
||||||
|
// homoglyph, null byte, or absolute marker, so it sails past every char guard and is
|
||||||
|
// rejected *only* because INDEX_PATTERN.matches() returns false. A weaker pattern would
|
||||||
|
// let them through — these tests would then go red.
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsFalse_whenSpaceInIndex() {
|
||||||
|
// The real-world reject: "J 0070" is a space-typo with no PDF on disk.
|
||||||
|
assertThat(validIndex("J 0070")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsFalse_whenFiveLetterPrefix() {
|
||||||
|
// The catalog prefix is at most 4 letters; 5 must not match.
|
||||||
|
assertThat(validIndex("WXYZA-0001")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsFalse_whenNoLetterPrefix() {
|
||||||
|
// A digit-led id (no letter prefix) is not a catalog shape.
|
||||||
|
assertThat(validIndex("12-0001")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportIndex_returnsFalse_whenUppercaseXSuffix() {
|
||||||
|
// Only a lowercase trailing "x" is allowed; an uppercase "X" suffix must fail.
|
||||||
|
assertThat(validIndex("W-0001X")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void isValidImportIndex_returnsTrue_whenPlainCatalogIndex() {
|
void isValidImportIndex_returnsTrue_whenPlainCatalogIndex() {
|
||||||
assertThat(validIndex("W-0124")).isTrue();
|
assertThat(validIndex("W-0124")).isTrue();
|
||||||
@@ -222,7 +253,7 @@ class DocumentImporterTest {
|
|||||||
ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString());
|
ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString());
|
||||||
|
|
||||||
org.assertj.core.api.Assertions.assertThatThrownBy(
|
org.assertj.core.api.Assertions.assertThatThrownBy(
|
||||||
() -> ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "W-0001"))
|
() -> ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "W-0001", 2))
|
||||||
.isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class);
|
.isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -232,12 +263,24 @@ class DocumentImporterTest {
|
|||||||
Path expected = tempDir.resolve("Eu-0628.pdf");
|
Path expected = tempDir.resolve("Eu-0628.pdf");
|
||||||
Files.writeString(expected, "%PDF-1.4");
|
Files.writeString(expected, "%PDF-1.4");
|
||||||
|
|
||||||
Optional<File> resolved = ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "Eu-0628");
|
Optional<File> resolved = ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "Eu-0628", 2);
|
||||||
|
|
||||||
assertThat(resolved).isPresent();
|
assertThat(resolved).isPresent();
|
||||||
assertThat(resolved.get().getCanonicalFile()).isEqualTo(expected.toFile().getCanonicalFile());
|
assertThat(resolved.get().getCanonicalFile()).isEqualTo(expected.toFile().getCanonicalFile());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE (Sara, PR #687): the IOException branch of resolvePdfByIndex — where
|
||||||
|
// File.getCanonicalPath() itself throws (an OS-level failure mid-resolution, not the
|
||||||
|
// symlink-escape DomainException) — is intentionally NOT covered by a test. Unlike
|
||||||
|
// isPdfMagicBytes, which has the package-private openFileStream(File) seam a Mockito spy can
|
||||||
|
// make throw, getCanonicalPath() is called on a File built internally with no injection seam,
|
||||||
|
// and there is no portable, deterministic way to make it throw on a temp file (it does not
|
||||||
|
// throw for missing/symlinked paths — those are handled by isFile()/the containment check).
|
||||||
|
// Adding a seam purely to test this would be production code in service of a non-defect; the
|
||||||
|
// substantive fix is the log.warn() now emitted in that branch so the quiet skip surfaces in
|
||||||
|
// ops. Left uncovered by deliberate decision, documented here so the branch is not assumed
|
||||||
|
// tested.
|
||||||
|
|
||||||
// ─── PDF magic-byte guard — ported — do not remove ──────────────────────────────
|
// ─── PDF magic-byte guard — ported — do not remove ──────────────────────────────
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -544,7 +587,7 @@ class DocumentImporterTest {
|
|||||||
row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), ""));
|
row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), ""));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
try (java.io.OutputStream out = Files.newOutputStream(xlsx)) {
|
try (OutputStream out = Files.newOutputStream(xlsx)) {
|
||||||
wb.write(out);
|
wb.write(out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user