feat(ocr): add guided OCR mode using existing annotation regions
When a document has manually drawn annotation boxes, the user can now enable "Nur annotierte Bereiche" in the OCR trigger panel. The engine skips layout detection entirely and runs recognition only within the pre-drawn bounding boxes, preserving manual transcription blocks. - Python: adds OcrRegion model, extends OcrRequest/OcrBlock; guided branch in /ocr/stream groups regions by page and crops each region - Engines: adds extract_region_text() to both Kraken and Surya - Java: adds OcrBlockResult.annotationId, OcrClient.OcrRegion, TriggerOcrDTO.useExistingAnnotations; OcrAsyncRunner dispatches to upsertGuidedBlock when annotationId is present; OcrService threads the flag through to runSingleDocument - TranscriptionService: adds upsertGuidedBlock (creates, updates OCR, or preserves MANUAL blocks) - Frontend: guided OCR toggle in OcrTrigger shown when blocks exist; skips destructive-replace confirmation in guided mode Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -108,7 +108,12 @@ async def run_ocr(request: OcrRequest):
|
||||
|
||||
@app.post("/ocr/stream")
|
||||
async def run_ocr_stream(request: OcrRequest):
|
||||
"""Run OCR on a PDF with NDJSON streaming — one JSON line per completed page."""
|
||||
"""Run OCR on a PDF with NDJSON streaming — one JSON line per completed page.
|
||||
|
||||
When request.regions is provided, runs in guided mode: each region is cropped
|
||||
and recognized individually, skipping full-page layout detection. The response
|
||||
blocks include the annotationId from the region.
|
||||
"""
|
||||
if not _models_ready:
|
||||
raise HTTPException(status_code=503, detail="Models not loaded yet")
|
||||
|
||||
@@ -123,6 +128,81 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
detail="Kraken model not available — cannot process Kurrent script",
|
||||
)
|
||||
|
||||
engine = kraken_engine if use_kraken else surya_engine
|
||||
|
||||
if request.regions:
|
||||
# Guided mode: recognize only the user-drawn annotation regions
|
||||
regions_by_page: dict[int, list] = {}
|
||||
for region in request.regions:
|
||||
regions_by_page.setdefault(region.pageNumber, []).append(region)
|
||||
|
||||
async def generate_guided():
    """Stream guided-mode OCR results as newline-delimited JSON.

    Event order: one "start" line carrying the page count; then, for every
    page in ``images``, either a "page" line (with an empty ``blocks`` list
    when no region targets that page) or an "error" line if handling the
    page raised; finally one "done" line with the block and failure totals.
    """

    def as_ndjson(event):
        # One serialized JSON object per newline-terminated line.
        return json.dumps(event) + "\n"

    total_pages = len(images)
    yield as_ndjson({"type": "start", "totalPages": total_pages})

    total_blocks = 0
    skipped_pages = 0

    for page_idx, image in enumerate(images, start=1):
        page_regions = regions_by_page.get(page_idx)

        if not page_regions:
            # No region targets this page: still report it, with no blocks,
            # so the consumer sees one event per page.
            yield as_ndjson({
                "type": "page",
                "pageNumber": page_idx,
                "blocks": [],
            })
            del image
            continue

        try:
            blocks = []
            for region in page_regions:
                x, y = region.x, region.y
                width, height = region.width, region.height

                # Recognition runs in a worker thread so the event loop is
                # not blocked while the engine works on this region.
                text = await asyncio.to_thread(
                    engine.extract_region_text, image, x, y, width, height,
                )

                # The block echoes the region geometry unchanged and carries
                # the region's annotationId alongside the recognized text.
                blocks.append({
                    "pageNumber": page_idx,
                    "x": x,
                    "y": y,
                    "width": width,
                    "height": height,
                    "polygon": None,
                    "text": text,
                    "annotationId": region.annotationId,
                })

            total_blocks += len(blocks)
            yield as_ndjson({
                "type": "page",
                "pageNumber": page_idx,
                "blocks": blocks,
            })

        except Exception:
            # A failure anywhere on the page drops all of that page's blocks;
            # the stream itself continues with the next page.
            logger.exception("Guided OCR failed on page %d", page_idx)
            skipped_pages += 1
            yield as_ndjson({
                "type": "error",
                "pageNumber": page_idx,
                "message": f"Guided OCR processing failed on page {page_idx}",
            })

        finally:
            # Drops this loop's reference to the page image.
            # NOTE(review): `images` itself may still hold the object —
            # confirm whether this actually releases memory.
            del image

    yield as_ndjson({
        "type": "done",
        "totalBlocks": total_blocks,
        "skippedPages": skipped_pages,
    })
|
||||
|
||||
return StreamingResponse(
|
||||
generate_guided(),
|
||||
media_type="application/x-ndjson",
|
||||
headers={"X-Accel-Buffering": "no", "Cache-Control": "no-cache"},
|
||||
)
|
||||
|
||||
async def generate():
|
||||
total_pages = len(images)
|
||||
yield json.dumps({"type": "start", "totalPages": total_pages}) + "\n"
|
||||
@@ -132,7 +212,6 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
|
||||
for page_idx, image in enumerate(images, start=1):
|
||||
try:
|
||||
engine = kraken_engine if use_kraken else surya_engine
|
||||
blocks = await asyncio.to_thread(
|
||||
engine.extract_page_blocks, image, page_idx, request.language
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user