feat(ocr): add SSRF protection for PDF URL downloads

Validates PDF download URLs against an ALLOWED_PDF_HOSTS allowlist (default: minio,localhost,127.0.0.1) and disables redirect following to prevent redirect-based SSRF.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 12:29:42 +02:00
parent 0beaf351f0
commit 70689b8f7b
2 changed files with 50 additions and 1 deletions
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -4,7 +4,9 @@ import asyncio
 import io
 import json
 import logging
+import os
 from contextlib import asynccontextmanager
+from urllib.parse import urlparse

 import httpx
 import pypdfium2 as pdfium
@@ -22,6 +24,18 @@ logger = logging.getLogger(__name__)

 _models_ready = False

+ALLOWED_PDF_HOSTS = {
+    h.strip().lower() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",") if h.strip()
+}  # lowercased because urlparse().hostname is lowercase; blank entries dropped so "" never matches
+
+
+def _validate_url(url: str) -> None:
+    """Raise HTTPException(400) unless the URL's host is in ALLOWED_PDF_HOSTS (SSRF protection)."""
+    parsed = urlparse(url)
+    hostname = parsed.hostname or ""  # urlparse yields None when the URL has no host component
+    if not hostname or hostname not in ALLOWED_PDF_HOSTS:  # a missing host never passes, even if "" is allowlisted
+        raise HTTPException(status_code=400, detail=f"PDF host not allowed: {hostname}")
+

@asynccontextmanager
 async def lifespan(app: FastAPI):
@@ -157,7 +171,10 @@ async def run_ocr_stream(request: OcrRequest):

 async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
    """Download a PDF from a presigned URL and convert each page to a PIL Image."""
-    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
+    _validate_url(url)
+    async with httpx.AsyncClient(
+        timeout=httpx.Timeout(300.0), follow_redirects=False
+    ) as client:
        response = await client.get(url)
        response.raise_for_status()