fix(training): switch to PAGE XML format for kurrent recognition training

Kraken 7 removed support for the legacy `path` format (image + .gt.txt
pairs) in VGSLRecognitionDataModule despite the CLI still advertising it.
Switch to the PAGE XML format (-f page), which is the supported standard.

- Java export now writes .xml alongside .png (PAGE XML with TextLine,
  Baseline at 75% height, and Unicode transcription)
- XML special characters in transcription text are escaped (& < >)
- Python trainer globs *.xml and passes -f page to ketos train
- Regenerated frontend API types to include cer/loss/accuracy/epochs on
  OcrTrainingRun (the fields were missing, which caused the empty CER
  column in history)
- Updated and extended TrainingDataExportServiceTest

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 21:45:08 +02:00
parent 94b9c56527
commit 49c9022285
4 changed files with 140 additions and 13 deletions

View File

@@ -244,6 +244,22 @@ export interface paths {
patch?: never;
trace?: never;
};
/**
 * Path item for `/api/ocr/segtrain`.
 * POST is the only HTTP method defined here (operation `triggerSegTraining`);
 * every other method is typed as `never`.
 */
"/api/ocr/segtrain": {
// No path-level query, header, path or cookie parameters are declared.
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
get?: never;
put?: never;
// The single supported method; its request/response shape lives in `operations`.
post: operations["triggerSegTraining"];
delete?: never;
options?: never;
head?: never;
patch?: never;
trace?: never;
};
"/api/ocr/batch": {
parameters: {
query?: never;
@@ -740,6 +756,22 @@ export interface paths {
patch?: never;
trace?: never;
};
/**
 * Path item for `/api/ocr/segmentation-training-data/export`.
 * GET is the only HTTP method defined here (operation
 * `exportSegmentationTrainingData`); every other method is typed as `never`.
 */
"/api/ocr/segmentation-training-data/export": {
// No path-level query, header, path or cookie parameters are declared.
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
// The single supported method; its request/response shape lives in `operations`.
get: operations["exportSegmentationTrainingData"];
put?: never;
post?: never;
delete?: never;
options?: never;
head?: never;
patch?: never;
trace?: never;
};
"/api/ocr/jobs/{jobId}": {
parameters: {
query?: never;
@@ -1183,7 +1215,7 @@ export interface components {
annotationId: string;
/** Format: uuid */
documentId: string;
text: string;
text?: string;
label?: string;
/** Format: int32 */
sortOrder: number;
@@ -1247,6 +1279,14 @@ export interface components {
/** Format: int32 */
documentCount: number;
modelName: string;
/** Format: double */
cer?: number;
/** Format: double */
loss?: number;
/** Format: double */
accuracy?: number;
/** Format: int32 */
epochs?: number;
errorMessage?: string;
/** Format: uuid */
triggeredBy?: string;
@@ -1311,6 +1351,7 @@ export interface components {
/** Schema `TriggerOcrDTO`; both fields are optional. */
TriggerOcrDTO: {
/** @enum {string} */
scriptType?: "UNKNOWN" | "TYPEWRITER" | "HANDWRITING_LATIN" | "HANDWRITING_KURRENT";
// NOTE(review): presumably tells the OCR run to reuse annotations that already
// exist instead of creating new ones — confirm against the backend DTO.
useExistingAnnotations?: boolean;
};
CreateAnnotationDTO: {
/** Format: int32 */
@@ -1473,18 +1514,18 @@ export interface components {
/** Format: int32 */
number?: number;
sort?: components["schemas"]["SortObject"];
first?: boolean;
last?: boolean;
/** Format: int32 */
numberOfElements?: number;
first?: boolean;
last?: boolean;
empty?: boolean;
};
PageableObject: {
paged?: boolean;
/** Format: int32 */
pageNumber?: number;
/** Format: int32 */
pageSize?: number;
paged?: boolean;
/** Format: int64 */
offset?: number;
sort?: components["schemas"]["SortObject"];
@@ -2201,6 +2242,26 @@ export interface operations {
};
};
};
/**
 * Operation `triggerSegTraining`.
 * Declares no parameters and no request body. The only documented response
 * is 201 (Created), whose body is typed as the `OcrTrainingRun` schema.
 */
triggerSegTraining: {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
// The operation takes no request body.
requestBody?: never;
responses: {
/** @description Created */
201: {
headers: {
[name: string]: unknown;
};
// The media type is the wildcard "*/*"; no specific content type is declared.
content: {
"*/*": components["schemas"]["OcrTrainingRun"];
};
};
};
};
triggerBatch: {
parameters: {
query?: never;
@@ -3106,6 +3167,26 @@ export interface operations {
};
};
};
/**
 * Operation `exportSegmentationTrainingData`.
 * Declares no parameters and no request body. The only documented response
 * is 200 (OK), whose body is typed as the `StreamingResponseBody` schema.
 */
exportSegmentationTrainingData: {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
// The operation takes no request body.
requestBody?: never;
responses: {
/** @description OK */
200: {
headers: {
[name: string]: unknown;
};
// The media type is the wildcard "*/*"; no specific content type is declared.
content: {
"*/*": components["schemas"]["StreamingResponseBody"];
};
};
};
};
getJobStatus: {
parameters: {
query?: never;