fix(ocr): use 1-based page numbers to match frontend PDF viewer

The PDF viewer uses a 1-based currentPage (starting at 1), but the OCR
engines produced a 0-based pageNumber from enumerate(). Every OCR
annotation was therefore shifted one page back, and annotations for the
first page were assigned to page 0, which doesn't exist in the viewer.
Change enumerate() to start=1 in both engines and the streaming
endpoint, and update the tests to expect 1-based page numbers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 10:32:08 +02:00
parent bac67706b9
commit 97e5138934
5 changed files with 15 additions and 15 deletions

View File

@@ -88,7 +88,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
"""
all_blocks = []
for page_idx, image in enumerate(images):
for page_idx, image in enumerate(images, start=1):
all_blocks.extend(extract_page_blocks(image, page_idx, language))
return all_blocks

View File

@@ -90,7 +90,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
"""
all_blocks = []
for page_idx, image in enumerate(images):
for page_idx, image in enumerate(images, start=1):
all_blocks.extend(extract_page_blocks(image, page_idx, language))
del image

View File

@@ -108,7 +108,7 @@ async def run_ocr_stream(request: OcrRequest):
total_blocks = 0
skipped_pages = 0
for page_idx, image in enumerate(images):
for page_idx, image in enumerate(images, start=1):
try:
engine = kraken_engine if use_kraken else surya_engine
blocks = await asyncio.to_thread(

View File

@@ -61,8 +61,8 @@ def test_surya_extract_blocks_delegates_to_extract_page_blocks():
blocks = surya.extract_blocks([image1, image2])
assert len(blocks) == 2
assert blocks[0]["pageNumber"] == 0
assert blocks[1]["pageNumber"] == 1
assert blocks[0]["pageNumber"] == 1
assert blocks[1]["pageNumber"] == 2
# ─── Kraken extract_page_blocks ──────────────────────────────────────────────
@@ -128,5 +128,5 @@ def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
blocks = _run_kraken_with_mocks(kraken.extract_blocks, [image1, image2])
assert len(blocks) == 2
assert blocks[0]["pageNumber"] == 0
assert blocks[1]["pageNumber"] == 1
assert blocks[0]["pageNumber"] == 1
assert blocks[1]["pageNumber"] == 2

View File

@@ -68,9 +68,9 @@ async def test_ocr_stream_emits_page_event_per_page_with_blocks(mock_images):
lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()]
page_events = [l for l in lines if l["type"] == "page"]
assert len(page_events) == 3
assert page_events[0]["pageNumber"] == 0
assert page_events[1]["pageNumber"] == 1
assert page_events[2]["pageNumber"] == 2
assert page_events[0]["pageNumber"] == 1
assert page_events[1]["pageNumber"] == 2
assert page_events[2]["pageNumber"] == 3
assert len(page_events[0]["blocks"]) == 1
@@ -109,8 +109,8 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i
def page_blocks(image, page_idx, language="de"):
nonlocal call_count
call_count += 1
if page_idx == 1:
raise RuntimeError("Engine crashed on page 1")
if page_idx == 2:
raise RuntimeError("Engine crashed on page 2")
return [_make_block(page_idx)]
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
@@ -128,7 +128,7 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i
types = [l["type"] for l in lines]
assert "error" in types
error_event = [l for l in lines if l["type"] == "error"][0]
assert error_event["pageNumber"] == 1
assert error_event["pageNumber"] == 2
# Error message must be generic, not the raw traceback
assert "Engine crashed" not in error_event["message"]
@@ -148,7 +148,7 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
patch("main._models_ready", True), \
patch("main.surya_engine") as mock_surya:
mock_surya.extract_blocks.return_value = [_make_block(0), _make_block(1)]
mock_surya.extract_blocks.return_value = [_make_block(1), _make_block(2)]
async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
response = await client.post("/ocr", json={
@@ -160,4 +160,4 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
data = response.json()
assert isinstance(data, list)
assert len(data) == 2
assert data[0]["pageNumber"] == 0
assert data[0]["pageNumber"] == 1