fix(training): switch to PAGE XML format for kurrent recognition training
Kraken 7 removed support for the legacy `path` format (image + .gt.txt pairs) in VGSLRecognitionDataModule, even though the CLI still advertises it. Switch to the PAGE XML format (`-f page`), which is the supported standard.

- Java export now writes a .xml file alongside each .png (PAGE XML with a TextLine, a Baseline at 75% of the image height, and the Unicode transcription)
- XML special characters in the transcription text are escaped (& < >)
- Python trainer globs *.xml and passes -f page to ketos train
- Regenerated frontend API types to include cer/loss/accuracy/epochs on OcrTrainingRun (they were missing, causing an empty CER column in the history)
- Updated and extended TrainingDataExportServiceTest

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -127,15 +127,47 @@ public class TrainingDataExportService {
|
||||
|
||||
void writeTrainingPair(ZipOutputStream zip, UUID blockId, BufferedImage image, String text) throws IOException {
|
||||
String base = blockId.toString();
|
||||
int w = image.getWidth();
|
||||
int h = image.getHeight();
|
||||
// Baseline at 75 % height — typical text baseline position in a cropped line image
|
||||
int baselineY = (h * 3) / 4;
|
||||
|
||||
// Write PNG
|
||||
zip.putNextEntry(new ZipEntry(base + ".png"));
|
||||
ImageIO.write(image, "PNG", zip);
|
||||
zip.closeEntry();
|
||||
|
||||
// Write ground-truth text
|
||||
zip.putNextEntry(new ZipEntry(base + ".gt.txt"));
|
||||
zip.write((text != null ? text : "").getBytes(StandardCharsets.UTF_8));
|
||||
// Write PAGE XML (Kraken 7+ dropped the legacy "path" format)
|
||||
String safeText = escapeXml(text != null ? text : "");
|
||||
String xml = String.format(
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
|
||||
"<PcGts xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15\">\n" +
|
||||
" <Metadata><Creator>familienarchiv</Creator></Metadata>\n" +
|
||||
" <Page imageFilename=\"%s.png\" imageWidth=\"%d\" imageHeight=\"%d\">\n" +
|
||||
" <TextRegion id=\"r0\" type=\"paragraph\">\n" +
|
||||
" <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
|
||||
" <TextLine id=\"l0\">\n" +
|
||||
" <Coords points=\"0,0 %d,0 %d,%d 0,%d\"/>\n" +
|
||||
" <Baseline points=\"0,%d %d,%d\"/>\n" +
|
||||
" <TextEquiv><Unicode>%s</Unicode></TextEquiv>\n" +
|
||||
" </TextLine>\n" +
|
||||
" </TextRegion>\n" +
|
||||
" </Page>\n" +
|
||||
"</PcGts>\n",
|
||||
base, w, h,
|
||||
w - 1, w - 1, h - 1, h - 1,
|
||||
w - 1, w - 1, h - 1, h - 1,
|
||||
baselineY, w - 1, baselineY,
|
||||
safeText);
|
||||
|
||||
zip.putNextEntry(new ZipEntry(base + ".xml"));
|
||||
zip.write(xml.getBytes(StandardCharsets.UTF_8));
|
||||
zip.closeEntry();
|
||||
}
|
||||
|
||||
private static String escapeXml(String text) {
|
||||
return text.replace("&", "&")
|
||||
.replace("<", "<")
|
||||
.replace(">", ">");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user