feat(training): add recognition training data export

- TrainingDataExportService: PDFBox rendering at 300 DPI, crop by
  annotation coordinates, ZIP with <uuid>.png + <uuid>.gt.txt pairs
- Skips documents with missing S3 files (logs WARN, continues)
- GET /api/ocr/training-data/export (ADMIN); 204 when no enrolled blocks

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 14:35:06 +02:00
parent fdf1eb92ad
commit cfa3c4df67
4 changed files with 459 additions and 0 deletions

View File

@@ -43,6 +43,7 @@ class OcrControllerTest {
// Collaborators declared with @MockitoBean; each test stubs only the calls it needs.
@MockitoBean OcrProgressService ocrProgressService;
@MockitoBean UserService userService;
@MockitoBean CustomUserDetailsService customUserDetailsService;
// Stubbed by the exportTrainingData_* tests for GET /api/ocr/training-data/export.
@MockitoBean TrainingDataExportService trainingDataExportService;
@Test
@WithMockUser(authorities = "WRITE_ALL")
@@ -121,6 +122,47 @@ class OcrControllerTest {
.andExpect(jsonPath("$.jobId").value(jobId.toString()));
}
// ─── GET /api/ocr/training-data/export ───────────────────────────────────
@Test
void exportTrainingData_returns401_whenUnauthenticated() throws Exception {
    // Deliberately no @WithMockUser: the request carries no authentication at all.
    var anonymousRequest = get("/api/ocr/training-data/export");

    mockMvc.perform(anonymousRequest)
            .andExpect(status().isUnauthorized());
}
@Test
@WithMockUser(authorities = "READ_ALL")
void exportTrainingData_returns403_whenNotAdmin() throws Exception {
    // Authenticated, but holding only READ_ALL rather than ADMIN.
    var nonAdminRequest = get("/api/ocr/training-data/export");

    mockMvc.perform(nonAdminRequest)
            .andExpect(status().isForbidden());
}
@Test
@WithMockUser(authorities = "ADMIN")
void exportTrainingData_returns204_whenNoEligibleBlocks() throws Exception {
    // Arrange: the service reports that nothing is eligible for export.
    when(trainingDataExportService.queryEligibleBlocks())
            .thenReturn(List.of());

    // Act + assert: an admin request is answered with 204 No Content.
    var adminRequest = get("/api/ocr/training-data/export");
    mockMvc.perform(adminRequest)
            .andExpect(status().isNoContent());
}
@Test
@WithMockUser(authorities = "ADMIN")
void exportTrainingData_returns200_withZipContentType_whenBlocksExist() throws Exception {
    // Arrange: one eligible block, and an export body that writes nothing.
    var eligibleBlock = org.raddatz.familienarchiv.model.TranscriptionBlock.builder()
            .id(UUID.randomUUID())
            .documentId(UUID.randomUUID())
            .annotationId(UUID.randomUUID())
            .text("x")
            .sortOrder(0)
            .build();
    when(trainingDataExportService.queryEligibleBlocks())
            .thenReturn(List.of(eligibleBlock));
    when(trainingDataExportService.exportToZip())
            .thenReturn(out -> {});

    // Act: perform the request and keep the result for the content-type check.
    var mvcResult = mockMvc.perform(get("/api/ocr/training-data/export"))
            .andExpect(status().isOk())
            .andReturn();

    // Assert: the response is declared as a ZIP archive.
    String contentType = mvcResult.getResponse().getContentType();
    org.assertj.core.api.Assertions.assertThat(contentType).contains("application/zip");
}
@Test
@WithMockUser(authorities = "READ_ALL")
void getDocumentOcrStatus_returnsNone_whenNoOcrJobExists() throws Exception {

View File

@@ -0,0 +1,258 @@
package org.raddatz.familienarchiv.service;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.raddatz.familienarchiv.PostgresContainerConfig;
import org.raddatz.familienarchiv.config.FlywayConfig;
import org.raddatz.familienarchiv.model.*;
import org.raddatz.familienarchiv.repository.AnnotationRepository;
import org.raddatz.familienarchiv.repository.DocumentRepository;
import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
import org.springframework.context.annotation.Import;
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.HashSet;
import java.util.Set;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.*;
/**
 * Tests for {@link TrainingDataExportService} that run against the real JPA repositories
 * (a {@code @DataJpaTest} slice whose datasource is not replaced) with a mocked
 * {@link FileService} standing in for file storage.
 *
 * <p>Each test persists documents, annotations and transcription blocks, runs the export
 * and inspects the resulting ZIP bytes.
 */
@DataJpaTest
@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
@Import({PostgresContainerConfig.class, FlywayConfig.class})
class TrainingDataExportServiceTest {

    @Autowired TranscriptionBlockRepository blockRepository;
    @Autowired DocumentRepository documentRepository;
    @Autowired AnnotationRepository annotationRepository;

    /** A PDF with a single blank A4 page; served by the mocked {@link FileService}. */
    static byte[] minimalPdfBytes;

    /** Builds {@link #minimalPdfBytes} once for the whole class. */
    @BeforeAll
    static void createMinimalPdf() throws Exception {
        try (PDDocument doc = new PDDocument()) {
            doc.addPage(new PDPage(PDRectangle.A4));
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            doc.save(out);
            minimalPdfBytes = out.toByteArray();
        }
    }

    // ─── Query: enrollment filter ─────────────────────────────────────────────

    @Test
    void export_includesManualBlockFromEnrolledDocument() throws Exception {
        UUID docId = enrolledDoc("enrolled.pdf");
        UUID annotId = annotation(docId);
        blockRepository.save(manualBlock(docId, annotId, "Liebe Mutter"));

        byte[] zipBytes = exportZip(mockFileService());

        assertThat(zipEntryNames(zipBytes)).isNotEmpty();
    }

    @Test
    void export_excludesManualBlockFromNonEnrolledDocument() throws Exception {
        UUID docId = nonEnrolledDoc("notenrolled.pdf");
        UUID annotId = annotation(docId);
        blockRepository.save(manualBlock(docId, annotId, "Liebe Tante"));

        byte[] zipBytes = exportZip(mockFileService());

        assertThat(zipEntryNames(zipBytes)).isEmpty();
    }

    @Test
    void export_includesReviewedOcrBlockFromEnrolledDocument() throws Exception {
        UUID docId = enrolledDoc("ocr-reviewed.pdf");
        UUID annotId = annotation(docId);
        blockRepository.save(ocrBlock(docId, annotId, "OCR text", true));

        byte[] zipBytes = exportZip(mockFileService());

        assertThat(zipEntryNames(zipBytes)).isNotEmpty();
    }

    @Test
    void export_excludesUnreviewedOcrBlockFromEnrolledDocument() throws Exception {
        UUID docId = enrolledDoc("ocr-unreviewed.pdf");
        UUID annotId = annotation(docId);
        blockRepository.save(ocrBlock(docId, annotId, "Raw OCR", false));

        byte[] zipBytes = exportZip(mockFileService());

        assertThat(zipEntryNames(zipBytes)).isEmpty();
    }

    // ─── ZIP structure ────────────────────────────────────────────────────────

    @Test
    void export_producesExactly2EntriesPerBlock_pngAndTxt() throws Exception {
        UUID docId = enrolledDoc("zip-struct.pdf");
        UUID annotId = annotation(docId);
        blockRepository.save(manualBlock(docId, annotId, "Erste Zeile"));
        blockRepository.save(manualBlock(docId, annotId, "Zweite Zeile"));

        byte[] zipBytes = exportZip(mockFileService());

        var names = zipEntryNames(zipBytes);
        assertThat(names).hasSize(4); // 2 blocks × 2 entries each
        assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2);
        assertThat(names.stream().filter(n -> n.endsWith(".gt.txt")).count()).isEqualTo(2);
    }

    @Test
    void export_gtTxtContainsBlockText() throws Exception {
        UUID docId = enrolledDoc("txt-content.pdf");
        UUID annotId = annotation(docId);
        String expectedText = "Sehr geehrte Frau";
        blockRepository.save(manualBlock(docId, annotId, expectedText));

        byte[] zipBytes = exportZip(mockFileService());

        String txtContent = readZipEntry(zipBytes, ".gt.txt");
        assertThat(txtContent).isEqualTo(expectedText);
    }

    // ─── S3 failure resilience ────────────────────────────────────────────────

    @Test
    void export_skipsDocumentWhenS3DownloadFails_andStillIncludesOtherDocuments() throws Exception {
        UUID failDocId = enrolledDoc("fail.pdf");
        UUID okDocId = enrolledDoc("ok.pdf");
        UUID failAnnotId = annotation(failDocId);
        UUID okAnnotId = annotation(okDocId);
        blockRepository.save(manualBlock(failDocId, failAnnotId, "Will fail"));
        blockRepository.save(manualBlock(okDocId, okAnnotId, "Will succeed"));

        // The helper documents store the filename as filePath, so the download can be
        // stubbed per document: one fails, the other yields the blank PDF.
        FileService fileService = mock(FileService.class);
        when(fileService.downloadFileBytes("fail.pdf")).thenThrow(new FileService.StorageFileNotFoundException("missing"));
        when(fileService.downloadFileBytes("ok.pdf")).thenReturn(minimalPdfBytes);

        byte[] zipBytes = exportZip(fileService);

        var names = zipEntryNames(zipBytes);
        // ok.pdf block produces 2 entries; fail.pdf block is skipped
        assertThat(names).hasSize(2);
    }

    // ─── Empty export ─────────────────────────────────────────────────────────

    @Test
    void queryEligibleBlocks_returnsEmpty_whenNoEnrolledDocuments() {
        TrainingDataExportService service = newService(mockFileService());

        assertThat(service.queryEligibleBlocks()).isEmpty();
    }

    // ─── helpers ─────────────────────────────────────────────────────────────

    /** Creates the service under test, wired to the autowired repositories and the given file service. */
    private TrainingDataExportService newService(FileService fileService) {
        return new TrainingDataExportService(
                blockRepository, annotationRepository, documentRepository, fileService);
    }

    /** Runs a full export with the given file service and returns the raw ZIP bytes. */
    private byte[] exportZip(FileService fileService) throws Exception {
        StreamingResponseBody body = newService(fileService).exportToZip();
        return stream(body);
    }

    /** Persists a document labelled {@link TrainingLabel#KURRENT_RECOGNITION} and returns its id. */
    private UUID enrolledDoc(String filename) {
        Document doc = documentRepository.save(Document.builder()
                .title(filename).originalFilename(filename).filePath(filename)
                .status(DocumentStatus.UPLOADED)
                .trainingLabels(new HashSet<>(Set.of(TrainingLabel.KURRENT_RECOGNITION)))
                .build());
        return doc.getId();
    }

    /** Persists a document without any training labels set and returns its id. */
    private UUID nonEnrolledDoc(String filename) {
        Document doc = documentRepository.save(Document.builder()
                .title(filename).originalFilename(filename).filePath(filename)
                .status(DocumentStatus.UPLOADED)
                .build());
        return doc.getId();
    }

    /**
     * Persists an annotation on page 1 of the given document and returns its id.
     * NOTE(review): the coordinates look like page-relative fractions; confirm against
     * the service's cropping logic.
     */
    private UUID annotation(UUID docId) {
        return annotationRepository.save(DocumentAnnotation.builder()
                .documentId(docId).pageNumber(1)
                .x(0.1).y(0.1).width(0.8).height(0.1).color("#00C7B1")
                .build()).getId();
    }

    /** Builds (does not persist) a {@link BlockSource#MANUAL} block that is not marked reviewed. */
    private TranscriptionBlock manualBlock(UUID docId, UUID annotId, String text) {
        return TranscriptionBlock.builder()
                .annotationId(annotId).documentId(docId)
                .text(text).sortOrder(0)
                .source(BlockSource.MANUAL).reviewed(false).build();
    }

    /** Builds (does not persist) a {@link BlockSource#OCR} block with the given reviewed flag. */
    private TranscriptionBlock ocrBlock(UUID docId, UUID annotId, String text, boolean reviewed) {
        return TranscriptionBlock.builder()
                .annotationId(annotId).documentId(docId)
                .text(text).sortOrder(0)
                .source(BlockSource.OCR).reviewed(reviewed).build();
    }

    /** Returns a {@link FileService} mock that serves {@link #minimalPdfBytes} for every path. */
    private FileService mockFileService() {
        FileService fs = mock(FileService.class);
        try {
            when(fs.downloadFileBytes(anyString())).thenReturn(minimalPdfBytes);
        } catch (Exception e) {
            // Caught broadly so this helper needs no throws clause; stubbing a mock is not
            // expected to throw.
            throw new IllegalStateException("Failed to stub FileService.downloadFileBytes", e);
        }
        return fs;
    }

    /** Drains the streaming body into memory and returns everything it wrote. */
    private static byte[] stream(StreamingResponseBody body) throws Exception {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        body.writeTo(out);
        return out.toByteArray();
    }

    /** Lists the names of all entries in the given ZIP archive, in archive order. */
    private static java.util.List<String> zipEntryNames(byte[] zipBytes) throws Exception {
        var names = new java.util.ArrayList<String>();
        try (ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(zipBytes))) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                names.add(entry.getName());
                zis.closeEntry();
            }
        }
        return names;
    }

    /**
     * Returns the UTF-8 content of the first ZIP entry whose name ends with {@code suffix}.
     *
     * @throws AssertionError if no entry matches, so the failing test reports the actual
     *         archive contents instead of a {@code null} comparison
     */
    private static String readZipEntry(byte[] zipBytes, String suffix) throws Exception {
        try (ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(zipBytes))) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                if (entry.getName().endsWith(suffix)) {
                    return new String(zis.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
                }
                zis.closeEntry();
            }
        }
        throw new AssertionError(
                "ZIP contains no entry ending with '" + suffix + "'; entries: " + zipEntryNames(zipBytes));
    }
}