feat(training): add segmentation training pipeline and complete Part 6

- Add /segtrain endpoint to OCR service (ZIP upload, ketos.segtrain,
  backup rotation, in-process model reload)
- Add segtrainModel() to OcrClient and RestClientOcrClient (10-min timeout,
  X-Training-Token header)
- Add SegmentationTrainingExportService: PAGE XML export with polygon
  de-normalization and per-page PNG rendering via PDFBox
- Add GET /api/ocr/segmentation-training-data/export endpoint
- Make TranscriptionBlock.text nullable for segmentation-only blocks
  (V31 migration)
- Add Paraglide i18n translation keys for all training UI strings (de/en/es)
- Pass source prop from TranscriptionEditView to TranscriptionBlock

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 15:15:17 +02:00
parent 86e9c05aaf
commit 9b2f91ee59
13 changed files with 383 additions and 5 deletions

View File

@@ -14,6 +14,7 @@ import org.raddatz.familienarchiv.service.OcrBatchService;
import org.raddatz.familienarchiv.service.OcrProgressService;
import org.raddatz.familienarchiv.service.OcrService;
import org.raddatz.familienarchiv.service.OcrTrainingService;
import org.raddatz.familienarchiv.service.SegmentationTrainingExportService;
import org.raddatz.familienarchiv.service.TrainingDataExportService;
import org.raddatz.familienarchiv.service.UserService;
import org.springframework.http.HttpHeaders;
@@ -39,6 +40,7 @@ public class OcrController {
private final OcrProgressService ocrProgressService;
private final UserService userService;
private final TrainingDataExportService trainingDataExportService;
private final SegmentationTrainingExportService segmentationTrainingExportService;
private final OcrTrainingService ocrTrainingService;
@PostMapping("/api/documents/{documentId}/ocr")
@@ -96,6 +98,19 @@ public class OcrController {
.body(body);
}
/**
 * Admin-only export of all segmentation training data as a streaming ZIP
 * (page PNG renders plus PAGE XML, produced by
 * {@code SegmentationTrainingExportService}).
 *
 * @return 204 No Content when no segmentation blocks exist; otherwise 200
 *         with an {@code application/zip} attachment named
 *         {@code segmentation-data.zip}
 */
@GetMapping("/api/ocr/segmentation-training-data/export")
@RequirePermission(Permission.ADMIN)
public ResponseEntity<StreamingResponseBody> exportSegmentationTrainingData() {
// NOTE(review): this emptiness check queries the blocks once, and the service
// presumably queries them again inside exportToZip() — confirm the double
// round-trip is acceptable, or expose a cheaper exists-style check.
if (segmentationTrainingExportService.querySegmentationBlocks().isEmpty()) {
return ResponseEntity.noContent().build();
}
StreamingResponseBody body = segmentationTrainingExportService.exportToZip();
return ResponseEntity.ok()
.contentType(MediaType.parseMediaType("application/zip"))
.header(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=\"segmentation-data.zip\"")
.body(body);
}
@PostMapping("/api/ocr/train")
@ResponseStatus(HttpStatus.CREATED)
@RequirePermission(Permission.ADMIN)

View File

@@ -30,8 +30,7 @@ public class TranscriptionBlock {
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
private UUID documentId;
@Column(nullable = false, columnDefinition = "TEXT")
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
@Column(columnDefinition = "TEXT")
private String text;
@Column(length = 200)

View File

@@ -20,6 +20,14 @@ public interface OcrClient {
record TrainingResult(Double loss, Double accuracy, Integer epochs) {}
/**
 * Send a segmentation training ZIP to the OCR service for fine-tuning the blla model.
 * <p>
 * This is a synchronous call that returns only after training completes —
 * NOTE(review): presumably long-running; confirm the implementation's HTTP
 * client timeout covers a full training run.
 *
 * @param trainingDataZip raw ZIP bytes produced by SegmentationTrainingExportService
 * @return training result metrics
 */
TrainingResult segtrainModel(byte[] trainingDataZip);
/**
* Stream OCR results page-by-page via NDJSON. Implementations should override
* this method. The default exists only for backward compatibility during migration

View File

@@ -131,6 +131,35 @@ public class RestClientOcrClient implements OcrClient, OcrHealthClient {
return new OcrClient.TrainingResult(result.loss(), result.accuracy(), result.epochs());
}
/**
 * Uploads the segmentation training ZIP as a multipart request to the OCR
 * service's {@code /segtrain} endpoint and maps the JSON response to
 * {@link OcrClient.TrainingResult}. Attaches the training token header only
 * when one is configured; a null/empty response body yields an all-null result.
 */
@Override
public OcrClient.TrainingResult segtrainModel(byte[] trainingDataZip) {
    // Wrap the raw bytes so the multipart part advertises a filename.
    ByteArrayResource payload = new ByteArrayResource(trainingDataZip) {
        @Override
        public String getFilename() { return "segmentation-data.zip"; }
    };
    HttpHeaders zipPartHeaders = new HttpHeaders();
    zipPartHeaders.setContentType(MediaType.parseMediaType("application/zip"));
    MultiValueMap<String, Object> form = new LinkedMultiValueMap<>();
    form.add("file", new HttpEntity<>(payload, zipPartHeaders));

    var request = trainingRestClient.post()
            .uri("/segtrain")
            .contentType(MediaType.MULTIPART_FORM_DATA);
    // Auth header is optional: only sent when a non-blank token is configured.
    if (trainingToken != null && !trainingToken.isBlank()) {
        request = request.header("X-Training-Token", trainingToken);
    }

    TrainingResultJson json = request
            .body(form)
            .retrieve()
            .body(TrainingResultJson.class);
    return json == null
            ? new OcrClient.TrainingResult(null, null, null)
            : new OcrClient.TrainingResult(json.loss(), json.accuracy(), json.epochs());
}
@Override
public boolean isHealthy() {
try {

View File

@@ -0,0 +1,174 @@
package org.raddatz.familienarchiv.service;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.raddatz.familienarchiv.model.Document;
import org.raddatz.familienarchiv.model.DocumentAnnotation;
import org.raddatz.familienarchiv.model.TranscriptionBlock;
import org.raddatz.familienarchiv.repository.AnnotationRepository;
import org.raddatz.familienarchiv.repository.DocumentRepository;
import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
import org.springframework.stereotype.Service;
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
@Service
@RequiredArgsConstructor
@Slf4j
public class SegmentationTrainingExportService {

    // DPI for page rasterization; polygon coordinates are de-normalized
    // against the image produced at this resolution.
    private static final float RENDER_DPI = 300;

    private final TranscriptionBlockRepository blockRepository;
    private final AnnotationRepository annotationRepository;
    private final DocumentRepository documentRepository;
    private final FileService fileService;

    /**
     * Returns every transcription block usable as a segmentation training
     * sample, as selected by the repository query.
     */
    public List<TranscriptionBlock> querySegmentationBlocks() {
        return blockRepository.findSegmentationBlocks();
    }

    /**
     * Builds a streaming ZIP containing, for each annotated page, a PNG
     * render of the page and a PAGE XML file with one {@code <TextRegion>}
     * per annotation. Documents whose record, file, or PDF is unusable are
     * skipped with a warning so one bad document cannot abort the export.
     *
     * @return a body that writes the ZIP when executed; writes nothing when
     *         there are no segmentation blocks
     */
    public StreamingResponseBody exportToZip() {
        List<TranscriptionBlock> blocks = querySegmentationBlocks();
        if (blocks.isEmpty()) {
            return out -> {};
        }
        // Group by documentId so we download each PDF only once
        Map<UUID, List<TranscriptionBlock>> byDoc = new LinkedHashMap<>();
        for (TranscriptionBlock b : blocks) {
            byDoc.computeIfAbsent(b.getDocumentId(), k -> new ArrayList<>()).add(b);
        }
        // Batch-fetch annotations and documents in a single query each,
        // instead of one findById round-trip per block/document (N+1).
        Set<UUID> annotationIds = new HashSet<>();
        for (TranscriptionBlock b : blocks) {
            // Guard against blocks without an annotation reference.
            if (b.getAnnotationId() != null) {
                annotationIds.add(b.getAnnotationId());
            }
        }
        Map<UUID, DocumentAnnotation> annotations = new HashMap<>();
        for (DocumentAnnotation a : annotationRepository.findAllById(annotationIds)) {
            annotations.put(a.getId(), a);
        }
        Map<UUID, Document> documents = new HashMap<>();
        for (Document d : documentRepository.findAllById(byDoc.keySet())) {
            documents.put(d.getId(), d);
        }
        return out -> {
            try (ZipOutputStream zip = new ZipOutputStream(out)) {
                for (Map.Entry<UUID, List<TranscriptionBlock>> entry : byDoc.entrySet()) {
                    exportDocument(zip, entry.getKey(), entry.getValue(),
                            documents.get(entry.getKey()), annotations);
                }
            }
        };
    }

    /**
     * Writes all annotated pages of one document into the ZIP. Returns
     * silently (after a warning) when the document record is missing, has no
     * file path, its file cannot be downloaded, or the PDF fails to render.
     */
    private void exportDocument(ZipOutputStream zip, UUID docId,
                                List<TranscriptionBlock> docBlocks,
                                Document doc,
                                Map<UUID, DocumentAnnotation> annotations) throws IOException {
        if (doc == null || doc.getFilePath() == null) {
            log.warn("Skipping document {} — no file path", docId);
            return;
        }
        byte[] pdfBytes;
        try {
            pdfBytes = fileService.downloadFileBytes(doc.getFilePath());
        } catch (FileService.StorageFileNotFoundException | IOException e) {
            log.warn("Skipping document {} — S3 download failed: {}", docId, e.getMessage());
            return;
        }
        // Group blocks by page number for this document
        Map<Integer, List<TranscriptionBlock>> byPage = new LinkedHashMap<>();
        for (TranscriptionBlock b : docBlocks) {
            DocumentAnnotation ann = annotations.get(b.getAnnotationId());
            if (ann != null) {
                byPage.computeIfAbsent(ann.getPageNumber(), k -> new ArrayList<>()).add(b);
            }
        }
        try (PDDocument pdf = Loader.loadPDF(pdfBytes)) {
            PDFRenderer renderer = new PDFRenderer(pdf);
            for (Map.Entry<Integer, List<TranscriptionBlock>> pageEntry : byPage.entrySet()) {
                int pageNumber = pageEntry.getKey();
                // Page numbers are 1-based; PDFBox indices are 0-based.
                int pageIdx = pageNumber - 1;
                if (pageIdx < 0 || pageIdx >= pdf.getNumberOfPages()) {
                    continue;
                }
                BufferedImage pageImage = renderer.renderImageWithDPI(pageIdx, RENDER_DPI);
                String basename = "page-" + docId + "-" + pageNumber;
                // Collect annotations for this page
                List<DocumentAnnotation> pageAnnotations = new ArrayList<>();
                for (TranscriptionBlock b : pageEntry.getValue()) {
                    DocumentAnnotation ann = annotations.get(b.getAnnotationId());
                    if (ann != null) {
                        pageAnnotations.add(ann);
                    }
                }
                writePngEntry(zip, basename, pageImage);
                writePageXmlEntry(zip, basename, pageImage, pageAnnotations);
            }
        } catch (Exception e) {
            // Deliberately broad: a corrupt or unrenderable PDF must not
            // abort the remaining documents in the export.
            log.warn("Skipping document {} — rendering failed: {}", docId, e.getMessage());
        }
    }

    /** Writes the page image as {@code <basename>.png} into the ZIP. */
    private void writePngEntry(ZipOutputStream zip, String basename, BufferedImage image) throws IOException {
        zip.putNextEntry(new ZipEntry(basename + ".png"));
        ImageIO.write(image, "PNG", zip);
        zip.closeEntry();
    }

    /**
     * Writes {@code <basename>.xml}: a minimal PAGE XML (2019-07-15 schema)
     * document with one {@code <TextRegion>} per annotation, coordinates in
     * pixels of the rendered image.
     */
    private void writePageXmlEntry(ZipOutputStream zip, String basename,
                                   BufferedImage pageImage,
                                   List<DocumentAnnotation> annotations) throws IOException {
        int imgW = pageImage.getWidth();
        int imgH = pageImage.getHeight();
        StringBuilder regions = new StringBuilder();
        for (DocumentAnnotation ann : annotations) {
            String coords = buildPolygonCoords(ann, imgW, imgH);
            // UUIDs contain only [0-9a-f-], so no XML escaping is needed here.
            String regionId = ann.getId().toString();
            regions.append("    <TextRegion id=\"").append(regionId).append("\">\n");
            regions.append("      <Coords points=\"").append(coords).append("\"/>\n");
            regions.append("    </TextRegion>\n");
        }
        String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                + "<PcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15\">\n"
                + "  <Page imageFilename=\"" + basename + ".png\""
                + " imageWidth=\"" + imgW + "\""
                + " imageHeight=\"" + imgH + "\">\n"
                + regions
                + "  </Page>\n"
                + "</PcGts>\n";
        zip.putNextEntry(new ZipEntry(basename + ".xml"));
        zip.write(xml.getBytes(StandardCharsets.UTF_8));
        zip.closeEntry();
    }

    /**
     * De-normalizes an annotation's geometry to pixel coordinates of an
     * image of the given size, as a PAGE XML points string ("x,y x,y ...").
     * Uses the explicit polygon when present, otherwise falls back to the
     * annotation's bounding box (x/y/width/height), clockwise from top-left.
     * Assumes annotation coordinates are normalized to [0,1] — values are
     * multiplied by the image dimensions.
     */
    String buildPolygonCoords(DocumentAnnotation ann, int imgW, int imgH) {
        List<List<Double>> polygon = ann.getPolygon();
        if (polygon != null && !polygon.isEmpty()) {
            // Use explicit polygon — de-normalize to pixel coordinates
            StringBuilder sb = new StringBuilder();
            for (List<Double> pt : polygon) {
                if (sb.length() > 0) sb.append(' ');
                int px = (int) (pt.get(0) * imgW);
                int py = (int) (pt.get(1) * imgH);
                sb.append(px).append(',').append(py);
            }
            return sb.toString();
        }
        // Fall back to bounding box from x/y/width/height
        int x = (int) (ann.getX() * imgW);
        int y = (int) (ann.getY() * imgH);
        int w = (int) (ann.getWidth() * imgW);
        int h = (int) (ann.getHeight() * imgH);
        return x + "," + y + " " + (x + w) + "," + y + " " + (x + w) + "," + (y + h) + " " + x + "," + (y + h);
    }
}

View File

@@ -0,0 +1,5 @@
-- Intentional: segmentation-only blocks have no text.
-- This migration is irreversible without a data cleanup step
-- (cannot re-add NOT NULL if null rows exist).
ALTER TABLE transcription_blocks ALTER COLUMN text DROP NOT NULL;
-- The default applies only to future INSERTs that omit the column;
-- existing rows and explicit NULL inserts remain NULL.
ALTER TABLE transcription_blocks ALTER COLUMN text SET DEFAULT '';

View File

@@ -20,6 +20,7 @@ class OcrClientDefaultStreamTest {
new OcrBlockResult(1, 0.1, 0.1, 0.8, 0.04, null, "Line 3"));
}
@Override public TrainingResult trainModel(byte[] zip) { return null; }
@Override public TrainingResult segtrainModel(byte[] zip) { return null; }
};
List<OcrStreamEvent> events = new ArrayList<>();
@@ -50,6 +51,7 @@ class OcrClientDefaultStreamTest {
OcrClient client = new OcrClient() {
@Override public List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType) { return List.of(); }
@Override public TrainingResult trainModel(byte[] zip) { return null; }
@Override public TrainingResult segtrainModel(byte[] zip) { return null; }
};
List<OcrStreamEvent> events = new ArrayList<>();