fix(training): switch to PAGE XML format for kurrent recognition training
Kraken 7 removed support for the legacy `path` format (image + .gt.txt pairs) in VGSLRecognitionDataModule despite the CLI still advertising it. Switching to PAGE XML (-f page) format which is the supported standard. - Java export now writes .xml alongside .png (PAGE XML with TextLine, Baseline at 75% height, and Unicode transcription) - XML special characters in transcription text are escaped (& < >) - Python trainer globs *.xml and passes -f page to ketos train - Regenerated frontend API types to include cer/loss/accuracy/epochs on OcrTrainingRun (were missing, causing empty CER column in history) - Updated and extended TrainingDataExportServiceTest Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -244,6 +244,22 @@ export interface paths {
|
||||
patch?: never;
|
||||
trace?: never;
|
||||
};
|
||||
"/api/ocr/segtrain": {
|
||||
parameters: {
|
||||
query?: never;
|
||||
header?: never;
|
||||
path?: never;
|
||||
cookie?: never;
|
||||
};
|
||||
get?: never;
|
||||
put?: never;
|
||||
post: operations["triggerSegTraining"];
|
||||
delete?: never;
|
||||
options?: never;
|
||||
head?: never;
|
||||
patch?: never;
|
||||
trace?: never;
|
||||
};
|
||||
"/api/ocr/batch": {
|
||||
parameters: {
|
||||
query?: never;
|
||||
@@ -740,6 +756,22 @@ export interface paths {
|
||||
patch?: never;
|
||||
trace?: never;
|
||||
};
|
||||
"/api/ocr/segmentation-training-data/export": {
|
||||
parameters: {
|
||||
query?: never;
|
||||
header?: never;
|
||||
path?: never;
|
||||
cookie?: never;
|
||||
};
|
||||
get: operations["exportSegmentationTrainingData"];
|
||||
put?: never;
|
||||
post?: never;
|
||||
delete?: never;
|
||||
options?: never;
|
||||
head?: never;
|
||||
patch?: never;
|
||||
trace?: never;
|
||||
};
|
||||
"/api/ocr/jobs/{jobId}": {
|
||||
parameters: {
|
||||
query?: never;
|
||||
@@ -1183,7 +1215,7 @@ export interface components {
|
||||
annotationId: string;
|
||||
/** Format: uuid */
|
||||
documentId: string;
|
||||
text: string;
|
||||
text?: string;
|
||||
label?: string;
|
||||
/** Format: int32 */
|
||||
sortOrder: number;
|
||||
@@ -1247,6 +1279,14 @@ export interface components {
|
||||
/** Format: int32 */
|
||||
documentCount: number;
|
||||
modelName: string;
|
||||
/** Format: double */
|
||||
cer?: number;
|
||||
/** Format: double */
|
||||
loss?: number;
|
||||
/** Format: double */
|
||||
accuracy?: number;
|
||||
/** Format: int32 */
|
||||
epochs?: number;
|
||||
errorMessage?: string;
|
||||
/** Format: uuid */
|
||||
triggeredBy?: string;
|
||||
@@ -1311,6 +1351,7 @@ export interface components {
|
||||
TriggerOcrDTO: {
|
||||
/** @enum {string} */
|
||||
scriptType?: "UNKNOWN" | "TYPEWRITER" | "HANDWRITING_LATIN" | "HANDWRITING_KURRENT";
|
||||
useExistingAnnotations?: boolean;
|
||||
};
|
||||
CreateAnnotationDTO: {
|
||||
/** Format: int32 */
|
||||
@@ -1473,18 +1514,18 @@ export interface components {
|
||||
/** Format: int32 */
|
||||
number?: number;
|
||||
sort?: components["schemas"]["SortObject"];
|
||||
first?: boolean;
|
||||
last?: boolean;
|
||||
/** Format: int32 */
|
||||
numberOfElements?: number;
|
||||
first?: boolean;
|
||||
last?: boolean;
|
||||
empty?: boolean;
|
||||
};
|
||||
PageableObject: {
|
||||
paged?: boolean;
|
||||
/** Format: int32 */
|
||||
pageNumber?: number;
|
||||
/** Format: int32 */
|
||||
pageSize?: number;
|
||||
paged?: boolean;
|
||||
/** Format: int64 */
|
||||
offset?: number;
|
||||
sort?: components["schemas"]["SortObject"];
|
||||
@@ -2201,6 +2242,26 @@ export interface operations {
|
||||
};
|
||||
};
|
||||
};
|
||||
triggerSegTraining: {
|
||||
parameters: {
|
||||
query?: never;
|
||||
header?: never;
|
||||
path?: never;
|
||||
cookie?: never;
|
||||
};
|
||||
requestBody?: never;
|
||||
responses: {
|
||||
/** @description Created */
|
||||
201: {
|
||||
headers: {
|
||||
[name: string]: unknown;
|
||||
};
|
||||
content: {
|
||||
"*/*": components["schemas"]["OcrTrainingRun"];
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
triggerBatch: {
|
||||
parameters: {
|
||||
query?: never;
|
||||
@@ -3106,6 +3167,26 @@ export interface operations {
|
||||
};
|
||||
};
|
||||
};
|
||||
exportSegmentationTrainingData: {
|
||||
parameters: {
|
||||
query?: never;
|
||||
header?: never;
|
||||
path?: never;
|
||||
cookie?: never;
|
||||
};
|
||||
requestBody?: never;
|
||||
responses: {
|
||||
/** @description OK */
|
||||
200: {
|
||||
headers: {
|
||||
[name: string]: unknown;
|
||||
};
|
||||
content: {
|
||||
"*/*": components["schemas"]["StreamingResponseBody"];
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
getJobStatus: {
|
||||
parameters: {
|
||||
query?: never;
|
||||
|
||||
Reference in New Issue
Block a user