feat(ocr): full OCR pipeline with polygon annotations, training, and guided mode #232

Merged
marcel merged 40 commits from feat/issue-226-227-ocr-pipeline-polygon into main 2026-04-14 10:31:35 +02:00
3 changed files with 36 additions and 7 deletions
Showing only changes of commit a99afef319 - Show all commits

View File

@@ -22,7 +22,7 @@ public interface TranscriptionBlockRepository extends JpaRepository<Transcriptio
SELECT b FROM TranscriptionBlock b
JOIN DocumentAnnotation a ON a.id = b.annotationId
JOIN Document d ON d.id = b.documentId
WHERE (b.source = 'MANUAL' OR (b.source = 'OCR' AND b.reviewed = true))
WHERE b.reviewed = true
AND 'KURRENT_RECOGNITION' MEMBER OF d.trainingLabels
""")
List<TranscriptionBlock> findEligibleKurrentBlocks();

View File

@@ -52,8 +52,8 @@ class TrainingBlockQueryTest {
}
@Test
void findEligibleKurrentBlocks_includesManualBlock() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
void findEligibleKurrentBlocks_includesReviewedManualBlock() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true));
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
@@ -61,6 +61,16 @@ class TrainingBlockQueryTest {
assertThat(result.get(0).getSource()).isEqualTo(BlockSource.MANUAL);
}
/**
 * A MANUAL block saved with the reviewed flag off must not be returned by
 * {@code findEligibleKurrentBlocks()}.
 */
@Test
void findEligibleKurrentBlocks_excludesUnreviewedManualBlock() {
    // Guided OCR can fill MANUAL blocks; until they are reviewed they do not count as checked text.
    TranscriptionBlock unreviewedManual =
            block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false);
    blockRepository.save(unreviewedManual);

    assertThat(blockRepository.findEligibleKurrentBlocks()).isEmpty();
}
@Test
void findEligibleKurrentBlocks_includesReviewedOcrBlock() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
@@ -90,10 +100,11 @@ class TrainingBlockQueryTest {
}
@Test
void findEligibleKurrentBlocks_returnsAllEligibleAcrossBothSources() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
void findEligibleKurrentBlocks_returnsAllReviewedBlocksAcrossBothSources() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true));
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); // excluded
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();

View File

@@ -67,6 +67,24 @@ class TrainingDataExportServiceTest {
assertThat(zipEntryNames(zipBytes)).isNotEmpty();
}
/**
 * When the only saved block is an unreviewed MANUAL block, the exported ZIP
 * must contain no entries.
 */
@Test
void export_excludesUnreviewedManualBlockFromEnrolledDocument() throws Exception {
    // MANUAL text that has not been verified yet is not training data.
    UUID enrolledDocId = enrolledDoc("unreviewed-manual.pdf");
    UUID annotationId = annotation(enrolledDocId);
    blockRepository.save(
            TranscriptionBlock.builder()
                    .annotationId(annotationId)
                    .documentId(enrolledDocId)
                    .text("Liebe Tante")
                    .sortOrder(0)
                    .source(BlockSource.MANUAL)
                    .reviewed(false)
                    .build());

    TrainingDataExportService exportService = new TrainingDataExportService(
            blockRepository, annotationRepository, documentRepository, mockFileService());
    StreamingResponseBody zipBody = exportService.exportToZip();

    assertThat(zipEntryNames(stream(zipBody))).isEmpty();
}
@Test
void export_excludesManualBlockFromNonEnrolledDocument() throws Exception {
UUID docId = nonEnrolledDoc("notenrolled.pdf");
@@ -212,7 +230,7 @@ class TrainingDataExportServiceTest {
return TranscriptionBlock.builder()
.annotationId(annotId).documentId(docId)
.text(text).sortOrder(0)
.source(BlockSource.MANUAL).reviewed(false).build();
.source(BlockSource.MANUAL).reviewed(true).build();
}
private FileService mockFileService() {