feat(ocr): full OCR pipeline with polygon annotations, training, and guided mode #232
@@ -22,7 +22,7 @@ public interface TranscriptionBlockRepository extends JpaRepository<Transcriptio
|
||||
SELECT b FROM TranscriptionBlock b
|
||||
JOIN DocumentAnnotation a ON a.id = b.annotationId
|
||||
JOIN Document d ON d.id = b.documentId
|
||||
WHERE (b.source = 'MANUAL' OR (b.source = 'OCR' AND b.reviewed = true))
|
||||
WHERE b.reviewed = true
|
||||
AND 'KURRENT_RECOGNITION' MEMBER OF d.trainingLabels
|
||||
""")
|
||||
List<TranscriptionBlock> findEligibleKurrentBlocks();
|
||||
|
||||
@@ -52,8 +52,8 @@ class TrainingBlockQueryTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void findEligibleKurrentBlocks_includesManualBlock() {
|
||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
|
||||
void findEligibleKurrentBlocks_includesReviewedManualBlock() {
|
||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true));
|
||||
|
||||
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
|
||||
|
||||
@@ -61,6 +61,16 @@ class TrainingBlockQueryTest {
|
||||
assertThat(result.get(0).getSource()).isEqualTo(BlockSource.MANUAL);
|
||||
}
|
||||
|
||||
/**
 * Checks that {@code findEligibleKurrentBlocks()} returns an empty list when the only
 * block saved by this test is a MANUAL block created with {@code false} as the last
 * helper argument.
 *
 * NOTE(review): the {@code block(...)} helper is not visible here; its last argument is
 * presumably the "reviewed" flag — confirm against the helper definition.
 */
@Test
|
||||
void findEligibleKurrentBlocks_excludesUnreviewedManualBlock() {
|
||||
// MANUAL blocks filled by guided OCR but not yet reviewed should not count as checked text
|
||||
// Arrange: store a single MANUAL block with the last helper argument set to false.
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
|
||||
|
||||
// Act: run the repository query under test.
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
|
||||
|
||||
// Assert: the block saved above must not be returned.
assertThat(result).isEmpty();
|
||||
}
|
||||
|
||||
@Test
|
||||
void findEligibleKurrentBlocks_includesReviewedOcrBlock() {
|
||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
|
||||
@@ -90,10 +100,11 @@ class TrainingBlockQueryTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void findEligibleKurrentBlocks_returnsAllEligibleAcrossBothSources() {
|
||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
|
||||
void findEligibleKurrentBlocks_returnsAllReviewedBlocksAcrossBothSources() {
|
||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true));
|
||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
|
||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded
|
||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); // excluded
|
||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded
|
||||
|
||||
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
|
||||
|
||||
|
||||
@@ -67,6 +67,24 @@ class TrainingDataExportServiceTest {
|
||||
assertThat(zipEntryNames(zipBytes)).isNotEmpty();
|
||||
}
|
||||
|
||||
/**
 * Checks that {@code exportToZip()} produces a ZIP with no entries when the only block
 * saved by this test is a MANUAL block built with {@code reviewed(false)}.
 *
 * NOTE(review): {@code enrolledDoc}, {@code annotation}, {@code mockFileService},
 * {@code zipEntryNames} and {@code stream} are helpers defined outside this view; going by
 * its name, {@code enrolledDoc} presumably creates a document enrolled for training —
 * confirm against the helper definitions.
 */
@Test
|
||||
void export_excludesUnreviewedManualBlockFromEnrolledDocument() throws Exception {
|
||||
// MANUAL blocks whose text hasn't been verified yet should not count as training data
|
||||
// Arrange: a document and an annotation for the block to reference.
UUID docId = enrolledDoc("unreviewed-manual.pdf");
|
||||
UUID annotId = annotation(docId);
|
||||
// The block under test: source MANUAL, reviewed == false.
TranscriptionBlock block = TranscriptionBlock.builder()
|
||||
.annotationId(annotId).documentId(docId)
|
||||
.text("Liebe Tante").sortOrder(0)
|
||||
.source(BlockSource.MANUAL).reviewed(false).build();
|
||||
blockRepository.save(block);
|
||||
|
||||
// The service is constructed by hand from the repositories and the FileService
// returned by mockFileService().
FileService fileService = mockFileService();
|
||||
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
|
||||
|
||||
// Act: request the export, then inspect the entry names of the resulting ZIP.
StreamingResponseBody body = service.exportToZip();
|
||||
// Assert: the block saved above must not produce any ZIP entry.
assertThat(zipEntryNames(stream(body))).isEmpty();
|
||||
}
|
||||
|
||||
@Test
|
||||
void export_excludesManualBlockFromNonEnrolledDocument() throws Exception {
|
||||
UUID docId = nonEnrolledDoc("notenrolled.pdf");
|
||||
@@ -212,7 +230,7 @@ class TrainingDataExportServiceTest {
|
||||
return TranscriptionBlock.builder()
|
||||
.annotationId(annotId).documentId(docId)
|
||||
.text(text).sortOrder(0)
|
||||
.source(BlockSource.MANUAL).reviewed(false).build();
|
||||
.source(BlockSource.MANUAL).reviewed(true).build();
|
||||
}
|
||||
|
||||
private FileService mockFileService() {
|
||||
|
||||
Reference in New Issue
Block a user