diff --git a/docker-compose.yml b/docker-compose.yml index 6d68e1aa..72a70e13 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -78,8 +78,8 @@ services: dockerfile: Dockerfile container_name: archive-ocr restart: unless-stopped - mem_limit: 6g - memswap_limit: 6g + mem_limit: 10g + memswap_limit: 10g volumes: - ocr_models:/app/models environment: diff --git a/ocr-service/main.py b/ocr-service/main.py index 34e996f3..ee8d9935 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -92,8 +92,9 @@ async def _download_and_convert_pdf(url: str) -> list[Image.Image]: for page_idx in range(len(pdf)): page = pdf[page_idx] - # Render at 300 DPI for good OCR quality - bitmap = page.render(scale=300 / 72) + # Render at 200 DPI — balances OCR quality vs memory usage + # (Surya 0.17 models use ~5GB idle; 300 DPI causes OOM on multi-page docs) + bitmap = page.render(scale=200 / 72) pil_image = bitmap.to_pil() images.append(pil_image)