feat(training): add recognition training data export
- TrainingDataExportService: renders pages with PDFBox at 300 DPI, crops them by annotation coordinates, and writes a ZIP of <uuid>.png + <uuid>.gt.txt pairs
- Skips documents with missing S3 files (logs WARN, continues)
- GET /api/ocr/training-data/export (ADMIN); returns 204 when there are no enrolled blocks

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,11 +12,15 @@ import org.raddatz.familienarchiv.security.RequirePermission;
|
||||
import org.raddatz.familienarchiv.service.OcrBatchService;
|
||||
import org.raddatz.familienarchiv.service.OcrProgressService;
|
||||
import org.raddatz.familienarchiv.service.OcrService;
|
||||
import org.raddatz.familienarchiv.service.TrainingDataExportService;
|
||||
import org.raddatz.familienarchiv.service.UserService;
|
||||
import org.springframework.http.HttpHeaders;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.security.core.Authentication;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;
|
||||
import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;
|
||||
|
||||
import jakarta.validation.Valid;
|
||||
@@ -32,6 +36,7 @@ public class OcrController {
|
||||
private final OcrBatchService ocrBatchService;
|
||||
private final OcrProgressService ocrProgressService;
|
||||
private final UserService userService;
|
||||
private final TrainingDataExportService trainingDataExportService;
|
||||
|
||||
@PostMapping("/api/documents/{documentId}/ocr")
|
||||
@ResponseStatus(HttpStatus.ACCEPTED)
|
||||
@@ -75,6 +80,19 @@ public class OcrController {
|
||||
return ocrService.getDocumentOcrStatus(documentId);
|
||||
}
|
||||
|
||||
@GetMapping("/api/ocr/training-data/export")
|
||||
@RequirePermission(Permission.ADMIN)
|
||||
public ResponseEntity<StreamingResponseBody> exportTrainingData() {
|
||||
if (trainingDataExportService.queryEligibleBlocks().isEmpty()) {
|
||||
return ResponseEntity.noContent().build();
|
||||
}
|
||||
StreamingResponseBody body = trainingDataExportService.exportToZip();
|
||||
return ResponseEntity.ok()
|
||||
.contentType(MediaType.parseMediaType("application/zip"))
|
||||
.header(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=\"training-data.zip\"")
|
||||
.body(body);
|
||||
}
|
||||
|
||||
private UUID resolveUserId(Authentication authentication) {
|
||||
if (authentication == null || !authentication.isAuthenticated()) return null;
|
||||
try {
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
package org.raddatz.familienarchiv.service;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.raddatz.familienarchiv.model.Document;
import org.raddatz.familienarchiv.model.DocumentAnnotation;
import org.raddatz.familienarchiv.model.TranscriptionBlock;
import org.raddatz.familienarchiv.repository.AnnotationRepository;
import org.raddatz.familienarchiv.repository.DocumentRepository;
import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
import org.springframework.stereotype.Service;
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipOutputStream;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class TrainingDataExportService {
|
||||
|
||||
private final TranscriptionBlockRepository blockRepository;
|
||||
private final AnnotationRepository annotationRepository;
|
||||
private final DocumentRepository documentRepository;
|
||||
private final FileService fileService;
|
||||
|
||||
public List<TranscriptionBlock> queryEligibleBlocks() {
|
||||
return blockRepository.findEligibleKurrentBlocks();
|
||||
}
|
||||
|
||||
public StreamingResponseBody exportToZip() {
|
||||
// Collect all data before entering the lambda — no open DB txn during streaming
|
||||
List<TranscriptionBlock> blocks = queryEligibleBlocks();
|
||||
if (blocks.isEmpty()) {
|
||||
return out -> {}; // caller checks isEmpty() for 204 response
|
||||
}
|
||||
|
||||
// Group blocks by documentId so we only download each PDF once
|
||||
Map<UUID, List<TranscriptionBlock>> byDoc = new LinkedHashMap<>();
|
||||
for (TranscriptionBlock b : blocks) {
|
||||
byDoc.computeIfAbsent(b.getDocumentId(), k -> new ArrayList<>()).add(b);
|
||||
}
|
||||
|
||||
// Pre-fetch annotations keyed by id
|
||||
Map<UUID, DocumentAnnotation> annotations = new HashMap<>();
|
||||
for (TranscriptionBlock b : blocks) {
|
||||
annotationRepository.findById(b.getAnnotationId())
|
||||
.ifPresent(a -> annotations.put(a.getId(), a));
|
||||
}
|
||||
|
||||
// Pre-fetch documents keyed by id
|
||||
Map<UUID, Document> documents = new HashMap<>();
|
||||
for (UUID docId : byDoc.keySet()) {
|
||||
documentRepository.findById(docId).ifPresent(d -> documents.put(d.getId(), d));
|
||||
}
|
||||
|
||||
return out -> {
|
||||
try (ZipOutputStream zip = new ZipOutputStream(out)) {
|
||||
for (Map.Entry<UUID, List<TranscriptionBlock>> entry : byDoc.entrySet()) {
|
||||
UUID docId = entry.getKey();
|
||||
Document doc = documents.get(docId);
|
||||
if (doc == null || doc.getFilePath() == null) {
|
||||
log.warn("Skipping document {} — no file path", docId);
|
||||
continue;
|
||||
}
|
||||
|
||||
byte[] pdfBytes;
|
||||
try {
|
||||
pdfBytes = fileService.downloadFileBytes(doc.getFilePath());
|
||||
} catch (FileService.StorageFileNotFoundException | IOException e) {
|
||||
log.warn("Skipping document {} — S3 download failed: {}", docId, e.getMessage());
|
||||
continue;
|
||||
}
|
||||
|
||||
try (PDDocument pdf = Loader.loadPDF(pdfBytes)) {
|
||||
PDFRenderer renderer = new PDFRenderer(pdf);
|
||||
for (TranscriptionBlock block : entry.getValue()) {
|
||||
DocumentAnnotation ann = annotations.get(block.getAnnotationId());
|
||||
if (ann == null) continue;
|
||||
|
||||
int pageIdx = ann.getPageNumber() - 1; // pageNumber is 1-based
|
||||
if (pageIdx < 0 || pageIdx >= pdf.getNumberOfPages()) continue;
|
||||
|
||||
BufferedImage pageImage = renderPageImage(renderer, pageIdx);
|
||||
BufferedImage cropped = cropBlockImage(pageImage, ann);
|
||||
|
||||
writeTrainingPair(zip, block.getId(), cropped, block.getText());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("Skipping document {} — rendering failed: {}", docId, e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
BufferedImage renderPageImage(PDFRenderer renderer, int pageIdx) throws IOException {
|
||||
return renderer.renderImageWithDPI(pageIdx, 300);
|
||||
}
|
||||
|
||||
BufferedImage cropBlockImage(BufferedImage page, DocumentAnnotation ann) {
|
||||
int imgW = page.getWidth();
|
||||
int imgH = page.getHeight();
|
||||
|
||||
int x = (int) (ann.getX() * imgW);
|
||||
int y = (int) (ann.getY() * imgH);
|
||||
int w = (int) (ann.getWidth() * imgW);
|
||||
int h = (int) (ann.getHeight() * imgH);
|
||||
|
||||
// Clamp to image bounds
|
||||
x = Math.max(0, Math.min(x, imgW - 1));
|
||||
y = Math.max(0, Math.min(y, imgH - 1));
|
||||
w = Math.max(1, Math.min(w, imgW - x));
|
||||
h = Math.max(1, Math.min(h, imgH - y));
|
||||
|
||||
return page.getSubimage(x, y, w, h);
|
||||
}
|
||||
|
||||
void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException {
|
||||
String base = blockId.toString();
|
||||
|
||||
// Write PNG
|
||||
zip.putNextEntry(new ZipEntry(base + ".png"));
|
||||
ImageIO.write(image, "PNG", zip);
|
||||
zip.closeEntry();
|
||||
|
||||
// Write ground-truth text
|
||||
zip.putNextEntry(new ZipEntry(base + ".gt.txt"));
|
||||
zip.write((text != null ? text : "").getBytes(StandardCharsets.UTF_8));
|
||||
zip.closeEntry();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user