feat: OCR pipeline with NDJSON streaming and real-time progress (#226, #227, #231) #229

Merged
marcel merged 74 commits from feat/issue-226-227-ocr-pipeline-polygon into main 2026-04-13 12:39:04 +02:00
2 changed files with 5 additions and 4 deletions
Showing only changes of commit 7f78bc9cf4 - Show all commits

View File

@@ -78,8 +78,8 @@ services:
dockerfile: Dockerfile
container_name: archive-ocr
restart: unless-stopped
-mem_limit: 6g
-memswap_limit: 6g
+mem_limit: 10g
+memswap_limit: 10g
volumes:
- ocr_models:/app/models
environment:

View File

@@ -92,8 +92,9 @@ async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
for page_idx in range(len(pdf)):
page = pdf[page_idx]
-# Render at 300 DPI for good OCR quality
-bitmap = page.render(scale=300 / 72)
+# Render at 200 DPI — balances OCR quality vs memory usage
+# (Surya 0.17 models use ~5GB idle; 300 DPI causes OOM on multi-page docs)
+bitmap = page.render(scale=200 / 72)
pil_image = bitmap.to_pil()
images.append(pil_image)