From 49c9022285fded350b6d3f5147a80ea0afa8bfe0 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 13 Apr 2026 21:45:08 +0200 Subject: [PATCH] fix(training): switch to PAGE XML format for kurrent recognition training Kraken 7 removed support for the legacy `path` format (image + .gt.txt pairs) in VGSLRecognitionDataModule despite the CLI still advertising it. Switching to PAGE XML (-f page) format which is the supported standard. - Java export now writes .xml alongside .png (PAGE XML with TextLine, Baseline at 75% height, and Unicode transcription) - XML special characters in transcription text are escaped (& < >) - Python trainer globs *.xml and passes -f page to ketos train - Regenerated frontend API types to include cer/loss/accuracy/epochs on OcrTrainingRun (were missing, causing empty CER column in history) - Updated and extended TrainingDataExportServiceTest Co-Authored-By: Claude Sonnet 4.6 --- .../service/TrainingDataExportService.java | 38 +++++++- .../TrainingDataExportServiceTest.java | 22 ++++- frontend/src/lib/generated/api.ts | 89 ++++++++++++++++++- ocr-service/main.py | 4 +- 4 files changed, 140 insertions(+), 13 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java index 06a23946..cf0b10e3 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/TrainingDataExportService.java @@ -127,15 +127,47 @@ public class TrainingDataExportService { void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException { String base = blockId.toString(); + int w = image.getWidth(); + int h = image.getHeight(); + // Baseline at 75 % height — typical text baseline position in a cropped line image + int baselineY = (h * 3) / 4; // Write PNG zip.putNextEntry(new ZipEntry(base + ".png")); ImageIO.write(image, "PNG", zip); zip.closeEntry(); - // Write ground-truth text - zip.putNextEntry(new ZipEntry(base + ".gt.txt")); - zip.write((text != null ? text : "").getBytes(StandardCharsets.UTF_8)); + // Write PAGE XML (Kraken 7+ dropped the legacy "path" format) + String safeText = escapeXml(text != null ? text : ""); + String xml = String.format( + "\n" + + "\n" + + " familienarchiv\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " %s\n" + + " \n" + + " \n" + + " \n" + + "\n", + base, w, h, + w - 1, w - 1, h - 1, h - 1, + w - 1, w - 1, h - 1, h - 1, + baselineY, w - 1, baselineY, + safeText); + + zip.putNextEntry(new ZipEntry(base + ".xml")); + zip.write(xml.getBytes(StandardCharsets.UTF_8)); zip.closeEntry(); } + + private static String escapeXml(String text) { + return text.replace("&", "&") + .replace("<", "<") + .replace(">", ">"); + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java index 12523084..cce70601 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java @@ -149,11 +149,11 @@ class TrainingDataExportServiceTest { var names = zipEntryNames(zipBytes); assertThat(names).hasSize(4); // 2 blocks × 2 entries each assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2); - assertThat(names.stream().filter(n -> n.endsWith(".gt.txt")).count()).isEqualTo(2); + assertThat(names.stream().filter(n -> n.endsWith(".xml")).count()).isEqualTo(2); } @Test - void export_gtTxtContainsBlockText() throws Exception { + void export_pageXmlContainsBlockText() throws Exception { UUID docId = enrolledDoc("txt-content.pdf"); UUID annotId = annotation(docId); String expectedText = "Sehr geehrte Frau"; @@ -163,8 +163,22 @@ class TrainingDataExportServiceTest { TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService); byte[] zipBytes = stream(service.exportToZip()); - String txtContent = readZipEntry(zipBytes, ".gt.txt"); - assertThat(txtContent).isEqualTo(expectedText); + String xmlContent = readZipEntry(zipBytes, ".xml"); + assertThat(xmlContent).contains("" + expectedText + ""); + } + + @Test + void export_pageXmlEscapesSpecialCharacters() throws Exception { + UUID docId = enrolledDoc("special-chars.pdf"); + UUID annotId = annotation(docId); + blockRepository.save(manualBlock(docId, annotId, "A & B < C > D")); + + FileService fileService = mockFileService(); + TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService); + + byte[] zipBytes = stream(service.exportToZip()); + String xmlContent = readZipEntry(zipBytes, ".xml"); + assertThat(xmlContent).contains("A & B < C > D"); } // ─── S3 failure resilience ──────────────────────────────────────────────── diff --git a/frontend/src/lib/generated/api.ts b/frontend/src/lib/generated/api.ts index 02e96063..fba18932 100644 --- a/frontend/src/lib/generated/api.ts +++ b/frontend/src/lib/generated/api.ts @@ -244,6 +244,22 @@ export interface paths { patch?: never; trace?: never; }; + "/api/ocr/segtrain": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + post: operations["triggerSegTraining"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/ocr/batch": { parameters: { query?: never; @@ -740,6 +756,22 @@ export interface paths { patch?: never; trace?: never; }; + "/api/ocr/segmentation-training-data/export": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get: operations["exportSegmentationTrainingData"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/ocr/jobs/{jobId}": { parameters: { query?: never; @@ -1183,7 +1215,7 @@ export interface components { annotationId: string; /** Format: uuid */ documentId: string; - text: string; + text?: string; label?: string; /** Format: int32 */ sortOrder: number; @@ -1247,6 +1279,14 @@ export interface components { /** Format: int32 */ documentCount: number; modelName: string; + /** Format: double */ + cer?: number; + /** Format: double */ + loss?: number; + /** Format: double */ + accuracy?: number; + /** Format: int32 */ + epochs?: number; errorMessage?: string; /** Format: uuid */ triggeredBy?: string; @@ -1311,6 +1351,7 @@ export interface components { TriggerOcrDTO: { /** @enum {string} */ scriptType?: "UNKNOWN" | "TYPEWRITER" | "HANDWRITING_LATIN" | "HANDWRITING_KURRENT"; + useExistingAnnotations?: boolean; }; CreateAnnotationDTO: { /** Format: int32 */ @@ -1473,18 +1514,18 @@ export interface components { /** Format: int32 */ number?: number; sort?: components["schemas"]["SortObject"]; - first?: boolean; - last?: boolean; /** Format: int32 */ numberOfElements?: number; + first?: boolean; + last?: boolean; empty?: boolean; }; PageableObject: { + paged?: boolean; /** Format: int32 */ pageNumber?: number; /** Format: int32 */ pageSize?: number; - paged?: boolean; /** Format: int64 */ offset?: number; sort?: components["schemas"]["SortObject"]; @@ -2201,6 +2242,26 @@ export interface operations { }; }; }; + triggerSegTraining: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Created */ + 201: { + headers: { + [name: string]: unknown; + }; + content: { + "*/*": components["schemas"]["OcrTrainingRun"]; + }; + }; + }; + }; triggerBatch: { parameters: { query?: never; @@ -3106,6 +3167,26 @@ export interface operations { }; }; }; + exportSegmentationTrainingData: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description OK */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "*/*": components["schemas"]["StreamingResponseBody"]; + }; + }; + }; + }; getJobStatus: { parameters: { query?: never; diff --git a/ocr-service/main.py b/ocr-service/main.py index cdaac88b..16b2d955 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -357,7 +357,7 @@ async def train_model( log.info("Extracted %d ZIP entries to %s", len(os.listdir(tmp_dir)), tmp_dir) - ground_truth = glob.glob(os.path.join(tmp_dir, "*.gt.txt")) + ground_truth = glob.glob(os.path.join(tmp_dir, "*.xml")) if not ground_truth: raise HTTPException(status_code=422, detail="No ground-truth files found in ZIP") @@ -368,7 +368,7 @@ async def train_model( cmd = [ "ketos", "--workers", "0", "--device", "cpu", "--threads", "2", "train", - "-f", "path", + "-f", "page", "-o", checkpoint_dir, "-q", "fixed", "-N", "10",