fix(training): switch to PAGE XML format for kurrent recognition training
Kraken 7 removed support for the legacy `path` format (image + .gt.txt pairs) in VGSLRecognitionDataModule, even though the CLI still advertises it. Switching to the PAGE XML format (-f page), which is the supported standard.

- Java export now writes .xml alongside .png (PAGE XML with TextLine, Baseline at 75% height, and Unicode transcription)
- XML special characters in transcription text are escaped (& < >)
- Python trainer globs *.xml and passes -f page to ketos train
- Regenerated frontend API types to include cer/loss/accuracy/epochs on OcrTrainingRun (were missing, causing empty CER column in history)
- Updated and extended TrainingDataExportServiceTest

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -149,11 +149,11 @@ class TrainingDataExportServiceTest {
|
||||
var names = zipEntryNames(zipBytes);
|
||||
assertThat(names).hasSize(4); // 2 blocks × 2 entries each
|
||||
assertThat(names.stream().filter(n -> n.endsWith(".png")).count()).isEqualTo(2);
|
||||
assertThat(names.stream().filter(n -> n.endsWith(".gt.txt")).count()).isEqualTo(2);
|
||||
assertThat(names.stream().filter(n -> n.endsWith(".xml")).count()).isEqualTo(2);
|
||||
}
|
||||
|
||||
@Test
|
||||
void export_gtTxtContainsBlockText() throws Exception {
|
||||
void export_pageXmlContainsBlockText() throws Exception {
|
||||
UUID docId = enrolledDoc("txt-content.pdf");
|
||||
UUID annotId = annotation(docId);
|
||||
String expectedText = "Sehr geehrte Frau";
|
||||
@@ -163,8 +163,22 @@ class TrainingDataExportServiceTest {
|
||||
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
|
||||
|
||||
byte[] zipBytes = stream(service.exportToZip());
|
||||
String txtContent = readZipEntry(zipBytes, ".gt.txt");
|
||||
assertThat(txtContent).isEqualTo(expectedText);
|
||||
String xmlContent = readZipEntry(zipBytes, ".xml");
|
||||
assertThat(xmlContent).contains("<Unicode>" + expectedText + "</Unicode>");
|
||||
}
|
||||
|
||||
@Test
|
||||
void export_pageXmlEscapesSpecialCharacters() throws Exception {
|
||||
UUID docId = enrolledDoc("special-chars.pdf");
|
||||
UUID annotId = annotation(docId);
|
||||
blockRepository.save(manualBlock(docId, annotId, "A & B < C > D"));
|
||||
|
||||
FileService fileService = mockFileService();
|
||||
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
|
||||
|
||||
byte[] zipBytes = stream(service.exportToZip());
|
||||
String xmlContent = readZipEntry(zipBytes, ".xml");
|
||||
assertThat(xmlContent).contains("<Unicode>A &amp; B &lt; C &gt; D</Unicode>");
|
||||
}
|
||||
|
||||
// ─── S3 failure resilience ────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user