The PDF viewer uses 1-based currentPage (starting at 1) but the OCR engines produced 0-based pageNumber from enumerate(). Annotations created by OCR were assigned to page 0, which doesn't exist in the viewer. Change enumerate() to start=1 in both engines and the streaming endpoint. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
164 lines
6.6 KiB
Python
164 lines
6.6 KiB
Python
"""Tests for the NDJSON streaming OCR endpoint POST /ocr/stream."""
|
|
|
|
import json
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
from httpx import ASGITransport, AsyncClient
|
|
|
|
from main import app
|
|
|
|
|
|
@pytest.fixture
def mock_images():
    """Provide three blank 100x200 RGB PIL images.

    The tests in this module hand these to the patched
    ``main._download_and_convert_pdf`` so that each image acts as one page.
    """
    from PIL import Image

    page_count = 3
    page_size = (100, 200)
    return [Image.new("RGB", page_size) for _ in range(page_count)]
|
|
|
|
|
|
def _make_block(page_idx, text="Test"):
|
|
return {
|
|
"pageNumber": page_idx,
|
|
"x": 0.1, "y": 0.2, "width": 0.8, "height": 0.1,
|
|
"polygon": None, "text": text,
|
|
"words": [{"text": text, "confidence": 0.95}],
|
|
}
|
|
|
|
|
|
# ─── P3: start event with total pages ────────────────────────────────────────
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_ocr_stream_emits_start_event_with_total_pages(mock_images):
    """The first NDJSON line of POST /ocr/stream is a start event with the page count.

    ``main._download_and_convert_pdf`` is patched to return the three fake
    images from ``mock_images``, so the expected ``totalPages`` is 3.
    """
    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main._models_ready", True), \
         patch("main.surya_engine") as mock_surya:
        # Page numbers are 1-based in the other tests of this module, so the
        # stub block is placed on page 1 rather than on a non-existent page 0.
        mock_surya.extract_page_blocks.return_value = [_make_block(1)]

        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
            response = await client.post("/ocr/stream", json={
                "pdfUrl": "http://minio/test.pdf",
                "scriptType": "TYPEWRITER",
                "language": "de",
            })

    # One JSON document per non-empty line of the response body.
    lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()]
    assert lines[0] == {"type": "start", "totalPages": 3}
|
|
|
|
|
|
# ─── P4: page events per completed page ──────────────────────────────────────
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_ocr_stream_emits_page_event_per_page_with_blocks(mock_images):
    """Each of the three pages produces one page event, numbered 1, 2 and 3."""

    def page_blocks(image, page_idx, language="de"):
        # One block per page, labelled with the page index the engine received.
        label = f"Page {page_idx}"
        return [_make_block(page_idx, label)]

    request_body = {
        "pdfUrl": "http://minio/test.pdf",
        "scriptType": "TYPEWRITER",
    }

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main._models_ready", True), \
         patch("main.surya_engine") as engine_stub:
        engine_stub.extract_page_blocks.side_effect = page_blocks

        asgi_transport = ASGITransport(app=app)
        async with AsyncClient(transport=asgi_transport, base_url="http://test") as http:
            response = await http.post("/ocr/stream", json=request_body)

    raw_lines = response.text.strip().split("\n")
    events = [json.loads(raw) for raw in raw_lines if raw.strip()]
    page_events = [event for event in events if event["type"] == "page"]

    assert len(page_events) == 3
    assert [event["pageNumber"] for event in page_events] == [1, 2, 3]
    assert len(page_events[0]["blocks"]) == 1
|
|
|
|
|
|
# ─── P5: done event with total blocks and skipped ────────────────────────────
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_ocr_stream_emits_done_with_total_blocks(mock_images):
    """The done event totals the blocks of all pages and reports no skipped pages."""

    def page_blocks(image, page_idx, language="de"):
        # Two entries per page (the same block object listed twice).
        block = _make_block(page_idx)
        return [block, block]

    request_body = {
        "pdfUrl": "http://minio/test.pdf",
        "scriptType": "TYPEWRITER",
    }

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main._models_ready", True), \
         patch("main.surya_engine") as engine_stub:
        engine_stub.extract_page_blocks.side_effect = page_blocks

        asgi_transport = ASGITransport(app=app)
        async with AsyncClient(transport=asgi_transport, base_url="http://test") as http:
            response = await http.post("/ocr/stream", json=request_body)

    raw_lines = response.text.strip().split("\n")
    events = [json.loads(raw) for raw in raw_lines if raw.strip()]
    done_events = [event for event in events if event["type"] == "done"]
    done = done_events[0]

    # 3 pages x 2 blocks each.
    assert done["totalBlocks"] == 6
    assert done["skippedPages"] == 0
|
|
|
|
|
|
# ─── P6: error event on page failure, continues ──────────────────────────────
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_images):
    """A failing page yields an error event and the remaining pages are still processed.

    The stubbed engine raises for page 2 of 3. The stream must report a
    generic error for that page, still emit page events for pages 1 and 3,
    and count one skipped page in the final done event.
    """

    def page_blocks(image, page_idx, language="de"):
        if page_idx == 2:
            raise RuntimeError("Engine crashed on page 2")
        return [_make_block(page_idx)]

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main._models_ready", True), \
         patch("main.surya_engine") as mock_surya:
        mock_surya.extract_page_blocks.side_effect = page_blocks

        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
            response = await client.post("/ocr/stream", json={
                "pdfUrl": "http://minio/test.pdf",
                "scriptType": "TYPEWRITER",
            })

    lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()]
    types = [event["type"] for event in lines]
    assert "error" in types
    error_event = [event for event in lines if event["type"] == "error"][0]
    assert error_event["pageNumber"] == 2
    # Error message must be generic, not the raw traceback
    assert "Engine crashed" not in error_event["message"]

    # Processing continued: pages 1 and 3 (1-based) still have page events
    page_events = [event for event in lines if event["type"] == "page"]
    assert len(page_events) == 2
    assert [event["pageNumber"] for event in page_events] == [1, 3]

    done = [event for event in lines if event["type"] == "done"][0]
    assert done["skippedPages"] == 1
|
|
|
|
|
|
# ─── P7: old /ocr endpoint still works ───────────────────────────────────────
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
    """POST /ocr still answers 200 with a flat JSON list of blocks."""
    stub_blocks = [_make_block(1), _make_block(2)]
    request_body = {
        "pdfUrl": "http://minio/test.pdf",
        "scriptType": "TYPEWRITER",
    }

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main._models_ready", True), \
         patch("main.surya_engine") as engine_stub:
        engine_stub.extract_blocks.return_value = stub_blocks

        asgi_transport = ASGITransport(app=app)
        async with AsyncClient(transport=asgi_transport, base_url="http://test") as http:
            response = await http.post("/ocr", json=request_body)

    assert response.status_code == 200
    payload = response.json()
    assert isinstance(payload, list)
    assert len(payload) == 2
    assert payload[0]["pageNumber"] == 1
|