fix(training): switch to PAGE XML format for kurrent recognition training

Kraken 7 removed support for the legacy `path` format (image + .gt.txt pairs) in VGSLRecognitionDataModule, even though the CLI still advertises it. Switch to the PAGE XML format (`-f page`), which is the supported standard.

- Java export now writes a .xml file alongside each .png (PAGE XML with a TextLine, a Baseline at 75% of the image height, and the Unicode transcription)
- XML special characters in the transcription text are escaped (& < >)
- Python trainer globs *.xml and passes `-f page` to `ketos train`
- Regenerated frontend API types to include cer/loss/accuracy/epochs on OcrTrainingRun (they were missing, causing an empty CER column in the history)
- Updated and extended TrainingDataExportServiceTest

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 21:45:08 +02:00
parent 94b9c56527
commit 49c9022285
4 changed files with 140 additions and 13 deletions
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java
@@ -127,15 +127,47 @@ public class TrainingDataExportService {

    void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException {
        String base = blockId.toString();
+        int w = image.getWidth();
+        int h = image.getHeight();
+        // Baseline at 75 % height — typical text baseline position in a cropped line image
+        int baselineY = (h * 3) / 4;

        // Write PNG
        zip.putNextEntry(new ZipEntry(base + ".png"));
        ImageIO.write(image, "PNG", zip);
        zip.closeEntry();

-        // Write ground-truth text
-        zip.putNextEntry(new ZipEntry(base + ".gt.txt"));
-        zip.write((text != null ? text : "").getBytes(StandardCharsets.UTF_8));
+        // Write PAGE XML (Kraken 7+ dropped the legacy "path" format)
+        String safeText = escapeXml(text != null ? text : "");
+        String xml = String.format(
+                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+                "<PcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15\">\n" +
+                "  <Metadata><Creator>familienarchiv</Creator></Metadata>\n" +
+                "  <Page imageFilename=\"%s.png\" imageWidth=\"%d\" imageHeight=\"%d\">\n" +
+                "    <TextRegion id=\"r0\" type=\"paragraph\">\n" +
+                "      <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
+                "      <TextLine id=\"l0\">\n" +
+                "        <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
+                "        <Baseline points=\"0,%d %d,%d\"/>\n" +
+                "        <TextEquiv><Unicode>%s</Unicode></TextEquiv>\n" +
+                "      </TextLine>\n" +
+                "    </TextRegion>\n" +
+                "  </Page>\n" +
+                "</PcGts>\n",
+                base, w, h,
+                w - 1, w - 1, h - 1, h - 1,
+                w - 1, w - 1, h - 1, h - 1,
+                baselineY, w - 1, baselineY,
+                safeText);
+
+        zip.putNextEntry(new ZipEntry(base + ".xml"));
+        zip.write(xml.getBytes(StandardCharsets.UTF_8));
        zip.closeEntry();
    }
+
+    private static String escapeXml(String text) {
+        return text.replace("&", "&amp;")
+                   .replace("<", "&lt;")
+                   .replace(">", "&gt;");
+    }
 }
--- a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java
@@ -149,11 +149,11 @@ class TrainingDataExportServiceTest {
        var names = zipEntryNames(zipBytes);
        assertThat(names).hasSize(4); // 2 blocks × 2 entries each
        assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2);
-        assertThat(names.stream().filter(n -> n.endsWith(".gt.txt")).count()).isEqualTo(2);
+        assertThat(names.stream().filter(n -> n.endsWith(".xml")).count()).isEqualTo(2);
    }

    @Test
-    void export_gtTxtContainsBlockText() throws Exception {
+    void export_pageXmlContainsBlockText() throws Exception {
        UUID docId = enrolledDoc("txt-content.pdf");
        UUID annotId = annotation(docId);
        String expectedText = "Sehr geehrte Frau";
@@ -163,8 +163,22 @@ class TrainingDataExportServiceTest {
        TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);

        byte[] zipBytes = stream(service.exportToZip());
-        String txtContent = readZipEntry(zipBytes, ".gt.txt");
-        assertThat(txtContent).isEqualTo(expectedText);
+        String xmlContent = readZipEntry(zipBytes, ".xml");
+        assertThat(xmlContent).contains("<Unicode>" + expectedText + "</Unicode>");
+    }
+
+    @Test
+    void export_pageXmlEscapesSpecialCharacters() throws Exception {
+        UUID docId = enrolledDoc("special-chars.pdf");
+        UUID annotId = annotation(docId);
+        blockRepository.save(manualBlock(docId, annotId, "A & B < C > D"));
+
+        FileService fileService = mockFileService();
+        TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
+
+        byte[] zipBytes = stream(service.exportToZip());
+        String xmlContent = readZipEntry(zipBytes, ".xml");
+        assertThat(xmlContent).contains("<Unicode>A &amp; B &lt; C &gt; D</Unicode>");
    }

    // ─── S3 failure resilience ────────────────────────────────────────────────