fix(training): switch to PAGE XML format for kurrent recognition training

Kraken 7 removed support for the legacy `path` format (image + .gt.txt
pairs) in VGSLRecognitionDataModule despite the CLI still advertising it.
Switch to the PAGE XML (`-f page`) format, which is the supported standard.

- Java export now writes .xml alongside .png (PAGE XML with TextLine,
  Baseline at 75% height, and Unicode transcription)
- XML special characters in transcription text are escaped (& < >)
- Python trainer globs *.xml and passes -f page to ketos train
- Regenerated frontend API types to include cer/loss/accuracy/epochs on
  OcrTrainingRun (were missing, causing empty CER column in history)
- Updated and extended TrainingDataExportServiceTest

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 21:45:08 +02:00
parent 94b9c56527
commit 49c9022285
4 changed files with 140 additions and 13 deletions

View File

@@ -127,15 +127,47 @@ public class TrainingDataExportService {
void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException { void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException {
String base = blockId.toString(); String base = blockId.toString();
int w = image.getWidth();
int h = image.getHeight();
// Baseline at 75 % height — typical text baseline position in a cropped line image
int baselineY = (h * 3) / 4;
// Write PNG // Write PNG
zip.putNextEntry(new ZipEntry(base + ".png")); zip.putNextEntry(new ZipEntry(base + ".png"));
ImageIO.write(image, "PNG", zip); ImageIO.write(image, "PNG", zip);
zip.closeEntry(); zip.closeEntry();
// Write ground-truth text // Write PAGE XML (Kraken 7+ dropped the legacy "path" format)
zip.putNextEntry(new ZipEntry(base + ".gt.txt")); String safeText = escapeXml(text != null ? text : "");
zip.write((text != null ? text : "").getBytes(StandardCharsets.UTF_8)); String xml = String.format(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
"<PcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15\">\n" +
" <Metadata><Creator>familienarchiv</Creator></Metadata>\n" +
" <Page imageFilename=\"%s.png\" imageWidth=\"%d\" imageHeight=\"%d\">\n" +
" <TextRegion id=\"r0\" type=\"paragraph\">\n" +
" <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
" <TextLine id=\"l0\">\n" +
" <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
" <Baseline points=\"0,%d %d,%d\"/>\n" +
" <TextEquiv><Unicode>%s</Unicode></TextEquiv>\n" +
" </TextLine>\n" +
" </TextRegion>\n" +
" </Page>\n" +
"</PcGts>\n",
base, w, h,
w - 1, w - 1, h - 1, h - 1,
w - 1, w - 1, h - 1, h - 1,
baselineY, w - 1, baselineY,
safeText);
zip.putNextEntry(new ZipEntry(base + ".xml"));
zip.write(xml.getBytes(StandardCharsets.UTF_8));
zip.closeEntry(); zip.closeEntry();
} }
/**
 * Escapes XML special characters so {@code text} can be embedded safely in
 * element content or attribute values. The ampersand is replaced first so
 * the entities produced by the later replacements are not double-escaped.
 * Also escapes quote characters so the helper stays correct if it is ever
 * reused for attribute values, not just element text.
 *
 * @param text raw text (must not be {@code null}; callers substitute "")
 * @return text with {@code & < > " '} replaced by their XML entities
 */
private static String escapeXml(String text) {
    return text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace("\"", "&quot;")
            .replace("'", "&apos;");
}
} }

View File

@@ -149,11 +149,11 @@ class TrainingDataExportServiceTest {
var names = zipEntryNames(zipBytes); var names = zipEntryNames(zipBytes);
assertThat(names).hasSize(4); // 2 blocks × 2 entries each assertThat(names).hasSize(4); // 2 blocks × 2 entries each
assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2); assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2);
assertThat(names.stream().filter(n -> n.endsWith(".gt.txt")).count()).isEqualTo(2); assertThat(names.stream().filter(n -> n.endsWith(".xml")).count()).isEqualTo(2);
} }
@Test @Test
void export_gtTxtContainsBlockText() throws Exception { void export_pageXmlContainsBlockText() throws Exception {
UUID docId = enrolledDoc("txt-content.pdf"); UUID docId = enrolledDoc("txt-content.pdf");
UUID annotId = annotation(docId); UUID annotId = annotation(docId);
String expectedText = "Sehr geehrte Frau"; String expectedText = "Sehr geehrte Frau";
@@ -163,8 +163,22 @@ class TrainingDataExportServiceTest {
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService); TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
byte[] zipBytes = stream(service.exportToZip()); byte[] zipBytes = stream(service.exportToZip());
String txtContent = readZipEntry(zipBytes, ".gt.txt"); String xmlContent = readZipEntry(zipBytes, ".xml");
assertThat(txtContent).isEqualTo(expectedText); assertThat(xmlContent).contains("<Unicode>" + expectedText + "</Unicode>");
}
@Test
void export_pageXmlEscapesSpecialCharacters() throws Exception {
// Arrange: one enrolled document with a block whose transcription contains
// all three XML-special characters that must be entity-escaped.
UUID docId = enrolledDoc("special-chars.pdf");
UUID annotId = annotation(docId);
blockRepository.save(manualBlock(docId, annotId, "A & B < C > D"));
FileService fileService = mockFileService();
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
// Act: export and pull the generated PAGE XML entry out of the ZIP.
byte[] zipBytes = stream(service.exportToZip());
String xmlContent = readZipEntry(zipBytes, ".xml");
// Assert: & < > appear as entities inside the <Unicode> transcription,
// so the exported PAGE XML stays well-formed for ketos train -f page.
assertThat(xmlContent).contains("<Unicode>A &amp; B &lt; C &gt; D</Unicode>");
} }
// ─── S3 failure resilience ──────────────────────────────────────────────── // ─── S3 failure resilience ────────────────────────────────────────────────

View File

@@ -244,6 +244,22 @@ export interface paths {
patch?: never; patch?: never;
trace?: never; trace?: never;
}; };
"/api/ocr/segtrain": {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
get?: never;
put?: never;
post: operations["triggerSegTraining"];
delete?: never;
options?: never;
head?: never;
patch?: never;
trace?: never;
};
"/api/ocr/batch": { "/api/ocr/batch": {
parameters: { parameters: {
query?: never; query?: never;
@@ -740,6 +756,22 @@ export interface paths {
patch?: never; patch?: never;
trace?: never; trace?: never;
}; };
"/api/ocr/segmentation-training-data/export": {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
get: operations["exportSegmentationTrainingData"];
put?: never;
post?: never;
delete?: never;
options?: never;
head?: never;
patch?: never;
trace?: never;
};
"/api/ocr/jobs/{jobId}": { "/api/ocr/jobs/{jobId}": {
parameters: { parameters: {
query?: never; query?: never;
@@ -1183,7 +1215,7 @@ export interface components {
annotationId: string; annotationId: string;
/** Format: uuid */ /** Format: uuid */
documentId: string; documentId: string;
text: string; text?: string;
label?: string; label?: string;
/** Format: int32 */ /** Format: int32 */
sortOrder: number; sortOrder: number;
@@ -1247,6 +1279,14 @@ export interface components {
/** Format: int32 */ /** Format: int32 */
documentCount: number; documentCount: number;
modelName: string; modelName: string;
/** Format: double */
cer?: number;
/** Format: double */
loss?: number;
/** Format: double */
accuracy?: number;
/** Format: int32 */
epochs?: number;
errorMessage?: string; errorMessage?: string;
/** Format: uuid */ /** Format: uuid */
triggeredBy?: string; triggeredBy?: string;
@@ -1311,6 +1351,7 @@ export interface components {
TriggerOcrDTO: { TriggerOcrDTO: {
/** @enum {string} */ /** @enum {string} */
scriptType?: "UNKNOWN" | "TYPEWRITER" | "HANDWRITING_LATIN" | "HANDWRITING_KURRENT"; scriptType?: "UNKNOWN" | "TYPEWRITER" | "HANDWRITING_LATIN" | "HANDWRITING_KURRENT";
useExistingAnnotations?: boolean;
}; };
CreateAnnotationDTO: { CreateAnnotationDTO: {
/** Format: int32 */ /** Format: int32 */
@@ -1473,18 +1514,18 @@ export interface components {
/** Format: int32 */ /** Format: int32 */
number?: number; number?: number;
sort?: components["schemas"]["SortObject"]; sort?: components["schemas"]["SortObject"];
first?: boolean;
last?: boolean;
/** Format: int32 */ /** Format: int32 */
numberOfElements?: number; numberOfElements?: number;
first?: boolean;
last?: boolean;
empty?: boolean; empty?: boolean;
}; };
PageableObject: { PageableObject: {
paged?: boolean;
/** Format: int32 */ /** Format: int32 */
pageNumber?: number; pageNumber?: number;
/** Format: int32 */ /** Format: int32 */
pageSize?: number; pageSize?: number;
paged?: boolean;
/** Format: int64 */ /** Format: int64 */
offset?: number; offset?: number;
sort?: components["schemas"]["SortObject"]; sort?: components["schemas"]["SortObject"];
@@ -2201,6 +2242,26 @@ export interface operations {
}; };
}; };
}; };
triggerSegTraining: {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
requestBody?: never;
responses: {
/** @description Created */
201: {
headers: {
[name: string]: unknown;
};
content: {
"*/*": components["schemas"]["OcrTrainingRun"];
};
};
};
};
triggerBatch: { triggerBatch: {
parameters: { parameters: {
query?: never; query?: never;
@@ -3106,6 +3167,26 @@ export interface operations {
}; };
}; };
}; };
exportSegmentationTrainingData: {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
requestBody?: never;
responses: {
/** @description OK */
200: {
headers: {
[name: string]: unknown;
};
content: {
"*/*": components["schemas"]["StreamingResponseBody"];
};
};
};
};
getJobStatus: { getJobStatus: {
parameters: { parameters: {
query?: never; query?: never;

View File

@@ -357,7 +357,7 @@ async def train_model(
log.info("Extracted %d ZIP entries to %s", len(os.listdir(tmp_dir)), tmp_dir) log.info("Extracted %d ZIP entries to %s", len(os.listdir(tmp_dir)), tmp_dir)
ground_truth = glob.glob(os.path.join(tmp_dir, "*.gt.txt")) ground_truth = glob.glob(os.path.join(tmp_dir, "*.xml"))
if not ground_truth: if not ground_truth:
raise HTTPException(status_code=422, detail="No ground-truth files found in ZIP") raise HTTPException(status_code=422, detail="No ground-truth files found in ZIP")
@@ -368,7 +368,7 @@ async def train_model(
cmd = [ cmd = [
"ketos", "--workers", "0", "--device", "cpu", "--threads", "2", "ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
"train", "train",
"-f", "path", "-f", "page",
"-o", checkpoint_dir, "-o", checkpoint_dir,
"-q", "fixed", "-q", "fixed",
"-N", "10", "-N", "10",