feat: OCR pipeline with NDJSON streaming and real-time progress (#226, #227, #231) #229

Merged
marcel merged 74 commits from feat/issue-226-227-ocr-pipeline-polygon into main 2026-04-13 12:39:04 +02:00
2 changed files with 19 additions and 2 deletions
Showing only changes of commit d8dcba1a71 - Show all commits

View File

@@ -7,6 +7,7 @@ import TranscriptionEditView from '$lib/components/TranscriptionEditView.svelte'
 import TranscriptionReadView from '$lib/components/TranscriptionReadView.svelte';
 import TranscriptionPanelHeader from '$lib/components/TranscriptionPanelHeader.svelte';
 import type { TranscriptionBlockData } from '$lib/types';
+import { getErrorMessage } from '$lib/errors';
 let { data } = $props();
@@ -129,6 +130,7 @@ async function reviewToggle(blockId: string) {
 let ocrRunning = $state(false);
 let ocrProgressMessage = $state('');
+let ocrErrorMessage = $state('');
 let ocrPollTimer = $state<ReturnType<typeof setInterval> | null>(null);
 function translateOcrProgress(code: string): string {
@@ -154,6 +156,7 @@ function translateOcrProgress(code: string): string {
 async function triggerOcr(scriptType: string) {
 ocrRunning = true;
+ocrErrorMessage = '';
 try {
 const res = await fetch(`/api/documents/${doc.id}/ocr`, {
 method: 'POST',
@@ -165,10 +168,14 @@ async function triggerOcr(scriptType: string) {
 pollOcrJob(data.jobId);
 } else {
 ocrRunning = false;
+const body = await res.json().catch(() => null);
+const code = (body as { code?: string } | null)?.code;
+ocrErrorMessage = code ? getErrorMessage(code) : m.ocr_status_error();
 }
 } catch (e) {
 console.error('Failed to trigger OCR:', e);
 ocrRunning = false;
+ocrErrorMessage = m.ocr_status_error();
 }
 }
@@ -185,6 +192,9 @@ function pollOcrJob(jobId: string) {
 ocrPollTimer = null;
 ocrRunning = false;
 ocrProgressMessage = '';
+if (job.status === 'FAILED') {
+ocrErrorMessage = m.ocr_status_error();
+}
 await loadTranscriptionBlocks();
 annotationReloadKey++;
 panelMode = transcriptionBlocks.length > 0 ? 'read' : 'edit';
@@ -399,6 +409,11 @@ onMount(() => {
 onClose={() => (transcribeMode = false)}
 />
 <div class="flex-1 overflow-y-auto">
+{#if ocrErrorMessage}
+<div class="mx-4 mt-4 rounded-sm border border-red-200 bg-red-50 px-4 py-3">
+<p class="text-sm text-red-700">{ocrErrorMessage}</p>
+</div>
+{/if}
 {#if ocrRunning}
 <div class="flex flex-1 flex-col items-center justify-center px-6 py-12 text-center">
 <svg

View File

@@ -1,5 +1,6 @@
"""OCR microservice — FastAPI app with Surya and Kraken engine support.""" """OCR microservice — FastAPI app with Surya and Kraken engine support."""
+import asyncio
 import io
 import logging
 from contextlib import asynccontextmanager
@@ -52,6 +53,7 @@ async def run_ocr(request: OcrRequest):
 Downloads the PDF from the provided URL, converts pages to images,
 and runs the appropriate OCR engine based on scriptType.
+OCR engines run in a thread pool so the event loop stays free for /health.
 """
 if not _models_ready:
 raise HTTPException(status_code=503, detail="Models not loaded yet")
@@ -66,10 +68,10 @@ async def run_ocr(request: OcrRequest):
 status_code=400,
 detail="Kraken model not available — cannot process Kurrent script",
 )
-blocks = kraken_engine.extract_blocks(images, request.language)
+blocks = await asyncio.to_thread(kraken_engine.extract_blocks, images, request.language)
 else:
 # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
-blocks = surya_engine.extract_blocks(images, request.language)
+blocks = await asyncio.to_thread(surya_engine.extract_blocks, images, request.language)
 threshold = get_threshold(script_type)
 for block in blocks: