fix(training): switch to PAGE XML format for kurrent recognition training
Kraken 7 removed support for the legacy `path` format (image + .gt.txt pairs) in VGSLRecognitionDataModule despite the CLI still advertising it. Switch to the PAGE XML format (`-f page`), which is the supported standard.

- Java export now writes .xml alongside .png (PAGE XML with TextLine, Baseline at 75% height, and Unicode transcription)
- XML special characters in the transcription text are escaped (&, <, >)
- Python trainer globs *.xml and passes `-f page` to ketos train
- Regenerated frontend API types to include cer/loss/accuracy/epochs on OcrTrainingRun (they were missing, causing an empty CER column in the history view)
- Updated and extended TrainingDataExportServiceTest

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -127,15 +127,47 @@ public class TrainingDataExportService {
|
|||||||
|
|
||||||
void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException {
|
void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException {
|
||||||
String base = blockId.toString();
|
String base = blockId.toString();
|
||||||
|
int w = image.getWidth();
|
||||||
|
int h = image.getHeight();
|
||||||
|
// Baseline at 75 % height — typical text baseline position in a cropped line image
|
||||||
|
int baselineY = (h * 3) / 4;
|
||||||
|
|
||||||
// Write PNG
|
// Write PNG
|
||||||
zip.putNextEntry(new ZipEntry(base + ".png"));
|
zip.putNextEntry(new ZipEntry(base + ".png"));
|
||||||
ImageIO.write(image, "PNG", zip);
|
ImageIO.write(image, "PNG", zip);
|
||||||
zip.closeEntry();
|
zip.closeEntry();
|
||||||
|
|
||||||
// Write ground-truth text
|
// Write PAGE XML (Kraken 7+ dropped the legacy "path" format)
|
||||||
zip.putNextEntry(new ZipEntry(base + ".gt.txt"));
|
String safeText = escapeXml(text != null ? text : "");
|
||||||
zip.write((text != null ? text : "").getBytes(StandardCharsets.UTF_8));
|
String xml = String.format(
|
||||||
|
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
|
||||||
|
"<PcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15\">\n" +
|
||||||
|
" <Metadata><Creator>familienarchiv</Creator></Metadata>\n" +
|
||||||
|
" <Page imageFilename=\"%s.png\" imageWidth=\"%d\" imageHeight=\"%d\">\n" +
|
||||||
|
" <TextRegion id=\"r0\" type=\"paragraph\">\n" +
|
||||||
|
" <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
|
||||||
|
" <TextLine id=\"l0\">\n" +
|
||||||
|
" <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
|
||||||
|
" <Baseline points=\"0,%d %d,%d\"/>\n" +
|
||||||
|
" <TextEquiv><Unicode>%s</Unicode></TextEquiv>\n" +
|
||||||
|
" </TextLine>\n" +
|
||||||
|
" </TextRegion>\n" +
|
||||||
|
" </Page>\n" +
|
||||||
|
"</PcGts>\n",
|
||||||
|
base, w, h,
|
||||||
|
w - 1, w - 1, h - 1, h - 1,
|
||||||
|
w - 1, w - 1, h - 1, h - 1,
|
||||||
|
baselineY, w - 1, baselineY,
|
||||||
|
safeText);
|
||||||
|
|
||||||
|
zip.putNextEntry(new ZipEntry(base + ".xml"));
|
||||||
|
zip.write(xml.getBytes(StandardCharsets.UTF_8));
|
||||||
zip.closeEntry();
|
zip.closeEntry();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static String escapeXml(String text) {
|
||||||
|
return text.replace("&", "&")
|
||||||
|
.replace("<", "<")
|
||||||
|
.replace(">", ">");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -149,11 +149,11 @@ class TrainingDataExportServiceTest {
|
|||||||
var names = zipEntryNames(zipBytes);
|
var names = zipEntryNames(zipBytes);
|
||||||
assertThat(names).hasSize(4); // 2 blocks × 2 entries each
|
assertThat(names).hasSize(4); // 2 blocks × 2 entries each
|
||||||
assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2);
|
assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2);
|
||||||
assertThat(names.stream().filter(n -> n.endsWith(".gt.txt")).count()).isEqualTo(2);
|
assertThat(names.stream().filter(n -> n.endsWith(".xml")).count()).isEqualTo(2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void export_gtTxtContainsBlockText() throws Exception {
|
void export_pageXmlContainsBlockText() throws Exception {
|
||||||
UUID docId = enrolledDoc("txt-content.pdf");
|
UUID docId = enrolledDoc("txt-content.pdf");
|
||||||
UUID annotId = annotation(docId);
|
UUID annotId = annotation(docId);
|
||||||
String expectedText = "Sehr geehrte Frau";
|
String expectedText = "Sehr geehrte Frau";
|
||||||
@@ -163,8 +163,22 @@ class TrainingDataExportServiceTest {
|
|||||||
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
|
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
|
||||||
|
|
||||||
byte[] zipBytes = stream(service.exportToZip());
|
byte[] zipBytes = stream(service.exportToZip());
|
||||||
String txtContent = readZipEntry(zipBytes, ".gt.txt");
|
String xmlContent = readZipEntry(zipBytes, ".xml");
|
||||||
assertThat(txtContent).isEqualTo(expectedText);
|
assertThat(xmlContent).contains("<Unicode>" + expectedText + "</Unicode>");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void export_pageXmlEscapesSpecialCharacters() throws Exception {
|
||||||
|
UUID docId = enrolledDoc("special-chars.pdf");
|
||||||
|
UUID annotId = annotation(docId);
|
||||||
|
blockRepository.save(manualBlock(docId, annotId, "A & B < C > D"));
|
||||||
|
|
||||||
|
FileService fileService = mockFileService();
|
||||||
|
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
|
||||||
|
|
||||||
|
byte[] zipBytes = stream(service.exportToZip());
|
||||||
|
String xmlContent = readZipEntry(zipBytes, ".xml");
|
||||||
|
assertThat(xmlContent).contains("<Unicode>A & B < C > D</Unicode>");
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── S3 failure resilience ────────────────────────────────────────────────
|
// ─── S3 failure resilience ────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -244,6 +244,22 @@ export interface paths {
|
|||||||
patch?: never;
|
patch?: never;
|
||||||
trace?: never;
|
trace?: never;
|
||||||
};
|
};
|
||||||
|
"/api/ocr/segtrain": {
|
||||||
|
parameters: {
|
||||||
|
query?: never;
|
||||||
|
header?: never;
|
||||||
|
path?: never;
|
||||||
|
cookie?: never;
|
||||||
|
};
|
||||||
|
get?: never;
|
||||||
|
put?: never;
|
||||||
|
post: operations["triggerSegTraining"];
|
||||||
|
delete?: never;
|
||||||
|
options?: never;
|
||||||
|
head?: never;
|
||||||
|
patch?: never;
|
||||||
|
trace?: never;
|
||||||
|
};
|
||||||
"/api/ocr/batch": {
|
"/api/ocr/batch": {
|
||||||
parameters: {
|
parameters: {
|
||||||
query?: never;
|
query?: never;
|
||||||
@@ -740,6 +756,22 @@ export interface paths {
|
|||||||
patch?: never;
|
patch?: never;
|
||||||
trace?: never;
|
trace?: never;
|
||||||
};
|
};
|
||||||
|
"/api/ocr/segmentation-training-data/export": {
|
||||||
|
parameters: {
|
||||||
|
query?: never;
|
||||||
|
header?: never;
|
||||||
|
path?: never;
|
||||||
|
cookie?: never;
|
||||||
|
};
|
||||||
|
get: operations["exportSegmentationTrainingData"];
|
||||||
|
put?: never;
|
||||||
|
post?: never;
|
||||||
|
delete?: never;
|
||||||
|
options?: never;
|
||||||
|
head?: never;
|
||||||
|
patch?: never;
|
||||||
|
trace?: never;
|
||||||
|
};
|
||||||
"/api/ocr/jobs/{jobId}": {
|
"/api/ocr/jobs/{jobId}": {
|
||||||
parameters: {
|
parameters: {
|
||||||
query?: never;
|
query?: never;
|
||||||
@@ -1183,7 +1215,7 @@ export interface components {
|
|||||||
annotationId: string;
|
annotationId: string;
|
||||||
/** Format: uuid */
|
/** Format: uuid */
|
||||||
documentId: string;
|
documentId: string;
|
||||||
text: string;
|
text?: string;
|
||||||
label?: string;
|
label?: string;
|
||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
sortOrder: number;
|
sortOrder: number;
|
||||||
@@ -1247,6 +1279,14 @@ export interface components {
|
|||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
documentCount: number;
|
documentCount: number;
|
||||||
modelName: string;
|
modelName: string;
|
||||||
|
/** Format: double */
|
||||||
|
cer?: number;
|
||||||
|
/** Format: double */
|
||||||
|
loss?: number;
|
||||||
|
/** Format: double */
|
||||||
|
accuracy?: number;
|
||||||
|
/** Format: int32 */
|
||||||
|
epochs?: number;
|
||||||
errorMessage?: string;
|
errorMessage?: string;
|
||||||
/** Format: uuid */
|
/** Format: uuid */
|
||||||
triggeredBy?: string;
|
triggeredBy?: string;
|
||||||
@@ -1311,6 +1351,7 @@ export interface components {
|
|||||||
TriggerOcrDTO: {
|
TriggerOcrDTO: {
|
||||||
/** @enum {string} */
|
/** @enum {string} */
|
||||||
scriptType?: "UNKNOWN" | "TYPEWRITER" | "HANDWRITING_LATIN" | "HANDWRITING_KURRENT";
|
scriptType?: "UNKNOWN" | "TYPEWRITER" | "HANDWRITING_LATIN" | "HANDWRITING_KURRENT";
|
||||||
|
useExistingAnnotations?: boolean;
|
||||||
};
|
};
|
||||||
CreateAnnotationDTO: {
|
CreateAnnotationDTO: {
|
||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
@@ -1473,18 +1514,18 @@ export interface components {
|
|||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
number?: number;
|
number?: number;
|
||||||
sort?: components["schemas"]["SortObject"];
|
sort?: components["schemas"]["SortObject"];
|
||||||
first?: boolean;
|
|
||||||
last?: boolean;
|
|
||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
numberOfElements?: number;
|
numberOfElements?: number;
|
||||||
|
first?: boolean;
|
||||||
|
last?: boolean;
|
||||||
empty?: boolean;
|
empty?: boolean;
|
||||||
};
|
};
|
||||||
PageableObject: {
|
PageableObject: {
|
||||||
|
paged?: boolean;
|
||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
pageNumber?: number;
|
pageNumber?: number;
|
||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
pageSize?: number;
|
pageSize?: number;
|
||||||
paged?: boolean;
|
|
||||||
/** Format: int64 */
|
/** Format: int64 */
|
||||||
offset?: number;
|
offset?: number;
|
||||||
sort?: components["schemas"]["SortObject"];
|
sort?: components["schemas"]["SortObject"];
|
||||||
@@ -2201,6 +2242,26 @@ export interface operations {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
triggerSegTraining: {
|
||||||
|
parameters: {
|
||||||
|
query?: never;
|
||||||
|
header?: never;
|
||||||
|
path?: never;
|
||||||
|
cookie?: never;
|
||||||
|
};
|
||||||
|
requestBody?: never;
|
||||||
|
responses: {
|
||||||
|
/** @description Created */
|
||||||
|
201: {
|
||||||
|
headers: {
|
||||||
|
[name: string]: unknown;
|
||||||
|
};
|
||||||
|
content: {
|
||||||
|
"*/*": components["schemas"]["OcrTrainingRun"];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
triggerBatch: {
|
triggerBatch: {
|
||||||
parameters: {
|
parameters: {
|
||||||
query?: never;
|
query?: never;
|
||||||
@@ -3106,6 +3167,26 @@ export interface operations {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
exportSegmentationTrainingData: {
|
||||||
|
parameters: {
|
||||||
|
query?: never;
|
||||||
|
header?: never;
|
||||||
|
path?: never;
|
||||||
|
cookie?: never;
|
||||||
|
};
|
||||||
|
requestBody?: never;
|
||||||
|
responses: {
|
||||||
|
/** @description OK */
|
||||||
|
200: {
|
||||||
|
headers: {
|
||||||
|
[name: string]: unknown;
|
||||||
|
};
|
||||||
|
content: {
|
||||||
|
"*/*": components["schemas"]["StreamingResponseBody"];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
getJobStatus: {
|
getJobStatus: {
|
||||||
parameters: {
|
parameters: {
|
||||||
query?: never;
|
query?: never;
|
||||||
|
|||||||
@@ -357,7 +357,7 @@ async def train_model(
|
|||||||
|
|
||||||
log.info("Extracted %d ZIP entries to %s", len(os.listdir(tmp_dir)), tmp_dir)
|
log.info("Extracted %d ZIP entries to %s", len(os.listdir(tmp_dir)), tmp_dir)
|
||||||
|
|
||||||
ground_truth = glob.glob(os.path.join(tmp_dir, "*.gt.txt"))
|
ground_truth = glob.glob(os.path.join(tmp_dir, "*.xml"))
|
||||||
if not ground_truth:
|
if not ground_truth:
|
||||||
raise HTTPException(status_code=422, detail="No ground-truth files found in ZIP")
|
raise HTTPException(status_code=422, detail="No ground-truth files found in ZIP")
|
||||||
|
|
||||||
@@ -368,7 +368,7 @@ async def train_model(
|
|||||||
cmd = [
|
cmd = [
|
||||||
"ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
|
"ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
|
||||||
"train",
|
"train",
|
||||||
"-f", "path",
|
"-f", "page",
|
||||||
"-o", checkpoint_dir,
|
"-o", checkpoint_dir,
|
||||||
"-q", "fixed",
|
"-q", "fixed",
|
||||||
"-N", "10",
|
"-N", "10",
|
||||||
|
|||||||
Reference in New Issue
Block a user