diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java
index 06a23946..cf0b10e3 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java
@@ -127,15 +127,47 @@ public class TrainingDataExportService {
void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException {
String base = blockId.toString();
+ int w = image.getWidth();
+ int h = image.getHeight();
+ // Baseline at 75 % height — typical text baseline position in a cropped line image
+ int baselineY = (h * 3) / 4;
// Write PNG
zip.putNextEntry(new ZipEntry(base + ".png"));
ImageIO.write(image, "PNG", zip);
zip.closeEntry();
- // Write ground-truth text
- zip.putNextEntry(new ZipEntry(base + ".gt.txt"));
- zip.write((text != null ? text : "").getBytes(StandardCharsets.UTF_8));
+ // Write PAGE XML (Kraken 7+ dropped the legacy "path" format)
+ String safeText = escapeXml(text != null ? text : "");
+ String xml = String.format(
+ "\n" +
+ "\n" +
+ " familienarchiv\n" +
+ " \n" +
+ " \n" +
+ " \n" +
+ " \n" +
+ " \n" +
+ " \n" +
+ " %s\n" +
+ " \n" +
+ " \n" +
+ " \n" +
+ "\n",
+ base, w, h,
+ w - 1, w - 1, h - 1, h - 1,
+ w - 1, w - 1, h - 1, h - 1,
+ baselineY, w - 1, baselineY,
+ safeText);
+
+ zip.putNextEntry(new ZipEntry(base + ".xml"));
+ zip.write(xml.getBytes(StandardCharsets.UTF_8));
zip.closeEntry();
}
+
+ /**
+ * Escapes the characters that are not allowed verbatim in XML element content.
+ *
+ * @param text raw text, must not be {@code null}
+ * @return {@code text} with {@code &}, {@code <} and {@code >} replaced by entity references
+ */
+ private static String escapeXml(String text) {
+ return text.replace("&", "&amp;") // ampersand first, so the entities added below are not escaped again
+ .replace("<", "&lt;")
+ .replace(">", "&gt;");
+ }
}
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java
index 12523084..cce70601 100644
--- a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java
@@ -149,11 +149,11 @@ class TrainingDataExportServiceTest {
var names = zipEntryNames(zipBytes);
assertThat(names).hasSize(4); // 2 blocks × 2 entries each
assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2);
- assertThat(names.stream().filter(n -> n.endsWith(".gt.txt")).count()).isEqualTo(2);
+ assertThat(names.stream().filter(n -> n.endsWith(".xml")).count()).isEqualTo(2);
}
@Test
- void export_gtTxtContainsBlockText() throws Exception {
+ void export_pageXmlContainsBlockText() throws Exception {
UUID docId = enrolledDoc("txt-content.pdf");
UUID annotId = annotation(docId);
String expectedText = "Sehr geehrte Frau";
@@ -163,8 +163,22 @@ class TrainingDataExportServiceTest {
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
byte[] zipBytes = stream(service.exportToZip());
- String txtContent = readZipEntry(zipBytes, ".gt.txt");
- assertThat(txtContent).isEqualTo(expectedText);
+ String xmlContent = readZipEntry(zipBytes, ".xml");
+ // The transcription must be the content of the PAGE <Unicode> element, not merely appear somewhere in the file
+ assertThat(xmlContent).contains("<Unicode>" + expectedText + "</Unicode>");
+ }
+
+ @Test
+ void export_pageXmlEscapesSpecialCharacters() throws Exception {
+ UUID docId = enrolledDoc("special-chars.pdf");
+ UUID annotId = annotation(docId);
+ blockRepository.save(manualBlock(docId, annotId, "A & B < C > D"));
+
+ FileService fileService = mockFileService();
+ TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
+
+ byte[] zipBytes = stream(service.exportToZip());
+ String xmlContent = readZipEntry(zipBytes, ".xml");
+ // &, < and > in the block text must reach the file as entity references, never verbatim
+ assertThat(xmlContent).contains("A &amp; B &lt; C &gt; D");
}
// ─── S3 failure resilience ────────────────────────────────────────────────
diff --git a/frontend/src/lib/generated/api.ts b/frontend/src/lib/generated/api.ts
index 02e96063..fba18932 100644
--- a/frontend/src/lib/generated/api.ts
+++ b/frontend/src/lib/generated/api.ts
@@ -244,6 +244,22 @@ export interface paths {
patch?: never;
trace?: never;
};
+ "/api/ocr/segtrain": {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ get?: never;
+ put?: never;
+ post: operations["triggerSegTraining"];
+ delete?: never;
+ options?: never;
+ head?: never;
+ patch?: never;
+ trace?: never;
+ };
"/api/ocr/batch": {
parameters: {
query?: never;
@@ -740,6 +756,22 @@ export interface paths {
patch?: never;
trace?: never;
};
+ "/api/ocr/segmentation-training-data/export": {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ get: operations["exportSegmentationTrainingData"];
+ put?: never;
+ post?: never;
+ delete?: never;
+ options?: never;
+ head?: never;
+ patch?: never;
+ trace?: never;
+ };
"/api/ocr/jobs/{jobId}": {
parameters: {
query?: never;
@@ -1183,7 +1215,7 @@ export interface components {
annotationId: string;
/** Format: uuid */
documentId: string;
- text: string;
+ text?: string;
label?: string;
/** Format: int32 */
sortOrder: number;
@@ -1247,6 +1279,14 @@ export interface components {
/** Format: int32 */
documentCount: number;
modelName: string;
+ /** Format: double */
+ cer?: number;
+ /** Format: double */
+ loss?: number;
+ /** Format: double */
+ accuracy?: number;
+ /** Format: int32 */
+ epochs?: number;
errorMessage?: string;
/** Format: uuid */
triggeredBy?: string;
@@ -1311,6 +1351,7 @@ export interface components {
TriggerOcrDTO: {
/** @enum {string} */
scriptType?: "UNKNOWN" | "TYPEWRITER" | "HANDWRITING_LATIN" | "HANDWRITING_KURRENT";
+ useExistingAnnotations?: boolean;
};
CreateAnnotationDTO: {
/** Format: int32 */
@@ -1473,18 +1514,18 @@ export interface components {
/** Format: int32 */
number?: number;
sort?: components["schemas"]["SortObject"];
- first?: boolean;
- last?: boolean;
/** Format: int32 */
numberOfElements?: number;
+ first?: boolean;
+ last?: boolean;
empty?: boolean;
};
PageableObject: {
+ paged?: boolean;
/** Format: int32 */
pageNumber?: number;
/** Format: int32 */
pageSize?: number;
- paged?: boolean;
/** Format: int64 */
offset?: number;
sort?: components["schemas"]["SortObject"];
@@ -2201,6 +2242,26 @@ export interface operations {
};
};
};
+ triggerSegTraining: {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ requestBody?: never;
+ responses: {
+ /** @description Created */
+ 201: {
+ headers: {
+ [name: string]: unknown;
+ };
+ content: {
+ "*/*": components["schemas"]["OcrTrainingRun"];
+ };
+ };
+ };
+ };
triggerBatch: {
parameters: {
query?: never;
@@ -3106,6 +3167,26 @@ export interface operations {
};
};
};
+ exportSegmentationTrainingData: {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ requestBody?: never;
+ responses: {
+ /** @description OK */
+ 200: {
+ headers: {
+ [name: string]: unknown;
+ };
+ content: {
+ "*/*": components["schemas"]["StreamingResponseBody"];
+ };
+ };
+ };
+ };
getJobStatus: {
parameters: {
query?: never;
diff --git a/ocr-service/main.py b/ocr-service/main.py
index cdaac88b..16b2d955 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -357,7 +357,7 @@ async def train_model(
log.info("Extracted %d ZIP entries to %s", len(os.listdir(tmp_dir)), tmp_dir)
- ground_truth = glob.glob(os.path.join(tmp_dir, "*.gt.txt"))
+ ground_truth = glob.glob(os.path.join(tmp_dir, "*.xml"))
if not ground_truth:
raise HTTPException(status_code=422, detail="No ground-truth files found in ZIP")
@@ -368,7 +368,7 @@ async def train_model(
cmd = [
"ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
"train",
- "-f", "path",
+ "-f", "page",
"-o", checkpoint_dir,
"-q", "fixed",
"-N", "10",