From 49c9022285fded350b6d3f5147a80ea0afa8bfe0 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Mon, 13 Apr 2026 21:45:08 +0200
Subject: [PATCH] fix(training): switch to PAGE XML format for kurrent
 recognition training

Kraken 7 removed support for the legacy `path` format (image + .gt.txt
pairs) in VGSLRecognitionDataModule despite the CLI still advertising it.
Switch to the PAGE XML format (-f page), which is the supported standard.

- Java export now writes .xml alongside .png (PAGE XML with TextLine,
  Baseline at 75% height, and Unicode transcription)
- XML special characters in the transcription text are escaped
  (& < > become &amp; &lt; &gt;)
- Python trainer globs *.xml and passes -f page to ketos train
- Regenerated frontend API types to include cer/loss/accuracy/epochs on
  OcrTrainingRun (they were missing, which left the CER column in the
  history empty)
- Updated and extended TrainingDataExportServiceTest

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../service/TrainingDataExportService.java    | 38 +++++++-
 .../TrainingDataExportServiceTest.java        | 22 ++++-
 frontend/src/lib/generated/api.ts             | 89 ++++++++++++++++++-
 ocr-service/main.py                           |  4 +-
 4 files changed, 140 insertions(+), 13 deletions(-)
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java
index 06a23946..cf0b10e3 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java
@@ -127,15 +127,47 @@ public class TrainingDataExportService {
 
     void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException {
         String base = blockId.toString();
+        int w = image.getWidth();
+        int h = image.getHeight();
+        // Baseline at 75 % height — typical text baseline position in a cropped line image
+        int baselineY = (h * 3) / 4;
 
         // Write PNG
         zip.putNextEntry(new ZipEntry(base + ".png"));
         ImageIO.write(image, "PNG", zip);
         zip.closeEntry();
 
-        // Write ground-truth text
-        zip.putNextEntry(new ZipEntry(base + ".gt.txt"));
-        zip.write((text != null ? text : "").getBytes(StandardCharsets.UTF_8));
+        // Write PAGE XML (Kraken 7+ dropped the legacy "path" format)
+        String safeText = escapeXml(text != null ? text : "");
+        String xml = String.format(
+                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+                "<PcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15\">\n" +
+                "  <Metadata><Creator>familienarchiv</Creator></Metadata>\n" +
+                "  <Page imageFilename=\"%s.png\" imageWidth=\"%d\" imageHeight=\"%d\">\n" +
+                "    <TextRegion id=\"r0\" type=\"paragraph\">\n" +
+                "      <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
+                "      <TextLine id=\"l0\">\n" +
+                "        <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
+                "        <Baseline points=\"0,%d %d,%d\"/>\n" +
+                "        <TextEquiv><Unicode>%s</Unicode></TextEquiv>\n" +
+                "      </TextLine>\n" +
+                "    </TextRegion>\n" +
+                "  </Page>\n" +
+                "</PcGts>\n",
+                base, w, h,
+                w - 1, w - 1, h - 1, h - 1,
+                w - 1, w - 1, h - 1, h - 1,
+                baselineY, w - 1, baselineY,
+                safeText);
+
+        zip.putNextEntry(new ZipEntry(base + ".xml"));
+        zip.write(xml.getBytes(StandardCharsets.UTF_8));
         zip.closeEntry();
     }
+
+    private static String escapeXml(String text) {
+        return text.replace("&", "&amp;")
+                   .replace("<", "&lt;")
+                   .replace(">", "&gt;");
+    }
 }
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java
index 12523084..cce70601 100644
--- a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java
@@ -149,11 +149,11 @@ class TrainingDataExportServiceTest {
         var names = zipEntryNames(zipBytes);
         assertThat(names).hasSize(4); // 2 blocks × 2 entries each
         assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2);
-        assertThat(names.stream().filter(n -> n.endsWith(".gt.txt")).count()).isEqualTo(2);
+        assertThat(names.stream().filter(n -> n.endsWith(".xml")).count()).isEqualTo(2);
     }
 
     @Test
-    void export_gtTxtContainsBlockText() throws Exception {
+    void export_pageXmlContainsBlockText() throws Exception {
         UUID docId = enrolledDoc("txt-content.pdf");
         UUID annotId = annotation(docId);
         String expectedText = "Sehr geehrte Frau";
@@ -163,8 +163,22 @@ class TrainingDataExportServiceTest {
         TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
 
         byte[] zipBytes = stream(service.exportToZip());
-        String txtContent = readZipEntry(zipBytes, ".gt.txt");
-        assertThat(txtContent).isEqualTo(expectedText);
+        String xmlContent = readZipEntry(zipBytes, ".xml");
+        assertThat(xmlContent).contains("<Unicode>" + expectedText + "</Unicode>");
+    }
+
+    @Test
+    void export_pageXmlEscapesSpecialCharacters() throws Exception {
+        UUID docId = enrolledDoc("special-chars.pdf");
+        UUID annotId = annotation(docId);
+        blockRepository.save(manualBlock(docId, annotId, "A & B < C > D"));
+
+        FileService fileService = mockFileService();
+        TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
+
+        byte[] zipBytes = stream(service.exportToZip());
+        String xmlContent = readZipEntry(zipBytes, ".xml");
+        assertThat(xmlContent).contains("<Unicode>A &amp; B &lt; C &gt; D</Unicode>");
     }
 
     // ─── S3 failure resilience ────────────────────────────────────────────────
diff --git a/frontend/src/lib/generated/api.ts b/frontend/src/lib/generated/api.ts
index 02e96063..fba18932 100644
--- a/frontend/src/lib/generated/api.ts
+++ b/frontend/src/lib/generated/api.ts
@@ -244,6 +244,22 @@ export interface paths {
         patch?: never;
         trace?: never;
     };
+    "/api/ocr/segtrain": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        post: operations["triggerSegTraining"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
     "/api/ocr/batch": {
         parameters: {
             query?: never;
@@ -740,6 +756,22 @@ export interface paths {
         patch?: never;
         trace?: never;
     };
+    "/api/ocr/segmentation-training-data/export": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get: operations["exportSegmentationTrainingData"];
+        put?: never;
+        post?: never;
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
     "/api/ocr/jobs/{jobId}": {
         parameters: {
             query?: never;
@@ -1183,7 +1215,7 @@ export interface components {
             annotationId: string;
             /** Format: uuid */
             documentId: string;
-            text: string;
+            text?: string;
             label?: string;
             /** Format: int32 */
             sortOrder: number;
@@ -1247,6 +1279,14 @@ export interface components {
             /** Format: int32 */
             documentCount: number;
             modelName: string;
+            /** Format: double */
+            cer?: number;
+            /** Format: double */
+            loss?: number;
+            /** Format: double */
+            accuracy?: number;
+            /** Format: int32 */
+            epochs?: number;
             errorMessage?: string;
             /** Format: uuid */
             triggeredBy?: string;
@@ -1311,6 +1351,7 @@ export interface components {
         TriggerOcrDTO: {
             /** @enum {string} */
             scriptType?: "UNKNOWN" | "TYPEWRITER" | "HANDWRITING_LATIN" | "HANDWRITING_KURRENT";
+            useExistingAnnotations?: boolean;
         };
         CreateAnnotationDTO: {
             /** Format: int32 */
@@ -1473,18 +1514,18 @@ export interface components {
             /** Format: int32 */
             number?: number;
             sort?: components["schemas"]["SortObject"];
-            first?: boolean;
-            last?: boolean;
             /** Format: int32 */
             numberOfElements?: number;
+            first?: boolean;
+            last?: boolean;
             empty?: boolean;
         };
         PageableObject: {
+            paged?: boolean;
             /** Format: int32 */
             pageNumber?: number;
             /** Format: int32 */
             pageSize?: number;
-            paged?: boolean;
             /** Format: int64 */
             offset?: number;
             sort?: components["schemas"]["SortObject"];
@@ -2201,6 +2242,26 @@ export interface operations {
             };
         };
     };
+    triggerSegTraining: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Created */
+            201: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "*/*": components["schemas"]["OcrTrainingRun"];
+                };
+            };
+        };
+    };
     triggerBatch: {
         parameters: {
             query?: never;
@@ -3106,6 +3167,26 @@ export interface operations {
             };
         };
     };
+    exportSegmentationTrainingData: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description OK */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "*/*": components["schemas"]["StreamingResponseBody"];
+                };
+            };
+        };
+    };
     getJobStatus: {
         parameters: {
             query?: never;
diff --git a/ocr-service/main.py b/ocr-service/main.py
index cdaac88b..16b2d955 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -357,7 +357,7 @@ async def train_model(
 
             log.info("Extracted %d ZIP entries to %s", len(os.listdir(tmp_dir)), tmp_dir)
 
-            ground_truth = glob.glob(os.path.join(tmp_dir, "*.gt.txt"))
+            ground_truth = glob.glob(os.path.join(tmp_dir, "*.xml"))
             if not ground_truth:
                 raise HTTPException(status_code=422, detail="No ground-truth files found in ZIP")
 
@@ -368,7 +368,7 @@ async def train_model(
             cmd = [
                 "ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
                 "train",
-                "-f", "path",
+                "-f", "page",
                 "-o", checkpoint_dir,
                 "-q", "fixed",
                 "-N", "10",