From 70689b8f7ba7e1de34024a08e202530030d1029a Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 13 Apr 2026 12:29:42 +0200
Subject: [PATCH] feat(ocr): add SSRF protection for PDF URL downloads

Validates PDF download URLs against an ALLOWED_PDF_HOSTS allowlist
(default: minio,localhost,127.0.0.1) and disables redirect following
to prevent redirect-based SSRF.

Allowlist entries are trimmed and lower-cased, and empty entries are
dropped. Malformed URLs and non-http(s) schemes are rejected with 400.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 ocr-service/main.py        | 37 ++++++++++++++++++++++++++++++++++++-
 ocr-service/test_stream.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/ocr-service/main.py b/ocr-service/main.py
index 66cb01dc..11f8b520 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -4,7 +4,9 @@ import asyncio
 import io
 import json
 import logging
+import os
 from contextlib import asynccontextmanager
+from urllib.parse import urlparse
 
 import httpx
 import pypdfium2 as pdfium
@@ -22,6 +24,36 @@ logger = logging.getLogger(__name__)
 
 _models_ready = False
 
+# Hosts the service may fetch PDFs from (comma-separated; override via env).
+# Entries are trimmed and lower-cased to match urlparse()'s lower-cased hostname;
+# empty entries are dropped so a blank value or trailing comma can never
+# allowlist host-less URLs.
+ALLOWED_PDF_HOSTS = {
+    h.strip().lower()
+    for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
+    if h.strip()
+}
+ALLOWED_PDF_SCHEMES = {"http", "https"}
+
+
+def _validate_url(url: str) -> None:
+    """Validate that the PDF URL points to an allowed host (SSRF protection).
+
+    Raises:
+        HTTPException: 400 if the URL is malformed, uses a scheme other than
+            http(s), or its host is not in ALLOWED_PDF_HOSTS.
+    """
+    try:
+        parsed = urlparse(url)
+        hostname = parsed.hostname or ""
+    except ValueError:
+        # urlparse rejects e.g. unbalanced IPv6 brackets; treat as a bad request.
+        raise HTTPException(status_code=400, detail="PDF URL is malformed") from None
+    if parsed.scheme not in ALLOWED_PDF_SCHEMES:
+        raise HTTPException(status_code=400, detail=f"PDF URL scheme not allowed: {parsed.scheme}")
+    if hostname not in ALLOWED_PDF_HOSTS:
+        raise HTTPException(status_code=400, detail=f"PDF host not allowed: {hostname}")
+
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
@@ -157,7 +189,10 @@ async def run_ocr_stream(request: OcrRequest):
 
 async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
     """Download a PDF from a presigned URL and convert each page to a PIL Image."""
-    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
+    _validate_url(url)
+    async with httpx.AsyncClient(
+        timeout=httpx.Timeout(300.0), follow_redirects=False
+    ) as client:
         response = await client.get(url)
         response.raise_for_status()
 
diff --git a/ocr-service/test_stream.py b/ocr-service/test_stream.py
index 5b9a9332..73164602 100644
--- a/ocr-service/test_stream.py
+++ b/ocr-service/test_stream.py
@@ -239,6 +239,61 @@ async def test_ocr_stream_returns_400_when_kraken_unavailable_for_kurrent(mock_i
     assert response.status_code == 400
 
 
+# ─── SSRF protection ─────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_ocr_stream_rejects_disallowed_host():
+    with patch("main._models_ready", True):
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            response = await client.post("/ocr/stream", json={
+                "pdfUrl": "http://evil.example.com/malicious.pdf",
+                "scriptType": "TYPEWRITER",
+            })
+
+    assert response.status_code == 400
+    assert "not allowed" in response.json()["detail"]
+
+
+@pytest.mark.asyncio
+async def test_ocr_stream_allows_minio_host(mock_images):
+    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
+         patch("main._models_ready", True), \
+         patch("main.surya_engine") as mock_surya:
+        mock_surya.extract_page_blocks.return_value = [_make_block(0)]
+
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            response = await client.post("/ocr/stream", json={
+                "pdfUrl": "http://minio/test.pdf",
+                "scriptType": "TYPEWRITER",
+            })
+
+    assert response.status_code == 200
+
+
+def test_validate_url_accepts_allowlisted_host():
+    import main
+
+    with patch("main.ALLOWED_PDF_HOSTS", {"minio"}):
+        main._validate_url("http://minio:9000/bucket/test.pdf")
+
+
+@pytest.mark.parametrize("url", [
+    "http://evil.example.com/malicious.pdf",
+    "http://minio.evil.example.com/test.pdf",
+    "ftp://minio/test.pdf",
+    "file:///etc/passwd",
+])
+def test_validate_url_rejects_disallowed_urls(url):
+    import main
+    from fastapi import HTTPException
+
+    with patch("main.ALLOWED_PDF_HOSTS", {"minio"}), pytest.raises(HTTPException) as exc_info:
+        main._validate_url(url)
+
+    assert exc_info.value.status_code == 400
+
+
 @pytest.mark.asyncio
 async def test_ocr_stream_applies_confidence_markers(mock_images):
     """Low-confidence words should be replaced with [unleserlich] in the stream output."""