feat(ocr): full OCR pipeline with polygon annotations, training, and guided mode #232
@@ -22,7 +22,7 @@ public interface TranscriptionBlockRepository extends JpaRepository<Transcriptio
|
|||||||
SELECT b FROM TranscriptionBlock b
|
SELECT b FROM TranscriptionBlock b
|
||||||
JOIN DocumentAnnotation a ON a.id = b.annotationId
|
JOIN DocumentAnnotation a ON a.id = b.annotationId
|
||||||
JOIN Document d ON d.id = b.documentId
|
JOIN Document d ON d.id = b.documentId
|
||||||
WHERE (b.source = 'MANUAL' OR (b.source = 'OCR' AND b.reviewed = true))
|
WHERE b.reviewed = true
|
||||||
AND 'KURRENT_RECOGNITION' MEMBER OF d.trainingLabels
|
AND 'KURRENT_RECOGNITION' MEMBER OF d.trainingLabels
|
||||||
""")
|
""")
|
||||||
List<TranscriptionBlock> findEligibleKurrentBlocks();
|
List<TranscriptionBlock> findEligibleKurrentBlocks();
|
||||||
|
|||||||
@@ -52,8 +52,8 @@ class TrainingBlockQueryTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void findEligibleKurrentBlocks_includesManualBlock() {
|
void findEligibleKurrentBlocks_includesReviewedManualBlock() {
|
||||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
|
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true));
|
||||||
|
|
||||||
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
|
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
|
||||||
|
|
||||||
@@ -61,6 +61,16 @@ class TrainingBlockQueryTest {
|
|||||||
assertThat(result.get(0).getSource()).isEqualTo(BlockSource.MANUAL);
|
assertThat(result.get(0).getSource()).isEqualTo(BlockSource.MANUAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Verifies that {@code findEligibleKurrentBlocks()} returns nothing when the only
 * saved block is a MANUAL block with {@code reviewed = false}.
 *
 * NOTE(review): relies on the kurrentDocId / kurrentAnnotationId fixtures and the
 * block(...) helper defined elsewhere in this test class; presumably the document
 * carries the KURRENT_RECOGNITION training label. Confirm against the setup code.
 */
@Test
|
||||||
|
void findEligibleKurrentBlocks_excludesUnreviewedManualBlock() {
|
||||||
|
// MANUAL blocks filled by guided OCR but not yet reviewed should not count as checked text
|
||||||
|
// Arrange: a single MANUAL block whose last argument (the reviewed flag) is false.
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
|
||||||
|
|
||||||
|
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
|
||||||
|
|
||||||
|
// Assert: the unreviewed block must not be reported as eligible.
assertThat(result).isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void findEligibleKurrentBlocks_includesReviewedOcrBlock() {
|
void findEligibleKurrentBlocks_includesReviewedOcrBlock() {
|
||||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
|
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
|
||||||
@@ -90,9 +100,10 @@ class TrainingBlockQueryTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void findEligibleKurrentBlocks_returnsAllEligibleAcrossBothSources() {
|
void findEligibleKurrentBlocks_returnsAllReviewedBlocksAcrossBothSources() {
|
||||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
|
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true));
|
||||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
|
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
|
||||||
|
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); // excluded
|
||||||
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded
|
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded
|
||||||
|
|
||||||
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
|
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
|
||||||
|
|||||||
@@ -67,6 +67,24 @@ class TrainingDataExportServiceTest {
|
|||||||
assertThat(zipEntryNames(zipBytes)).isNotEmpty();
|
assertThat(zipEntryNames(zipBytes)).isNotEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Verifies that {@code exportToZip()} produces a ZIP with no entries when the only
 * block on the document is a MANUAL block with {@code reviewed = false}.
 *
 * NOTE(review): enrolledDoc(...), annotation(...), mockFileService(), stream(...) and
 * zipEntryNames(...) are helpers defined elsewhere in this test class; their exact
 * behavior is not visible here. Confirm before relying on these comments.
 */
@Test
|
||||||
|
void export_excludesUnreviewedManualBlockFromEnrolledDocument() throws Exception {
|
||||||
|
// MANUAL blocks whose text hasn't been verified yet should not count as training data
|
||||||
|
// Arrange: a document from enrolledDoc(...) with one annotation attached to it.
UUID docId = enrolledDoc("unreviewed-manual.pdf");
|
||||||
|
UUID annotId = annotation(docId);
|
||||||
|
// Built inline so the reviewed flag is explicitly false for this case.
TranscriptionBlock block = TranscriptionBlock.builder()
|
||||||
|
.annotationId(annotId).documentId(docId)
|
||||||
|
.text("Liebe Tante").sortOrder(0)
|
||||||
|
.source(BlockSource.MANUAL).reviewed(false).build();
|
||||||
|
blockRepository.save(block);
|
||||||
|
|
||||||
|
FileService fileService = mockFileService();
|
||||||
|
// The service under test is constructed directly from the three repositories and the FileService.
TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService);
|
||||||
|
|
||||||
|
// Act + assert: the exported archive must contain no entries at all.
StreamingResponseBody body = service.exportToZip();
|
||||||
|
assertThat(zipEntryNames(stream(body))).isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void export_excludesManualBlockFromNonEnrolledDocument() throws Exception {
|
void export_excludesManualBlockFromNonEnrolledDocument() throws Exception {
|
||||||
UUID docId = nonEnrolledDoc("notenrolled.pdf");
|
UUID docId = nonEnrolledDoc("notenrolled.pdf");
|
||||||
@@ -212,7 +230,7 @@ class TrainingDataExportServiceTest {
|
|||||||
return TranscriptionBlock.builder()
|
return TranscriptionBlock.builder()
|
||||||
.annotationId(annotId).documentId(docId)
|
.annotationId(annotId).documentId(docId)
|
||||||
.text(text).sortOrder(0)
|
.text(text).sortOrder(0)
|
||||||
.source(BlockSource.MANUAL).reviewed(false).build();
|
.source(BlockSource.MANUAL).reviewed(true).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
private FileService mockFileService() {
|
private FileService mockFileService() {
|
||||||
|
|||||||
Reference in New Issue
Block a user