fix(ocr): use 1-based page numbers to match frontend PDF viewer

The PDF viewer's currentPage is 1-based, but the OCR engines derived
pageNumber from a bare enumerate(), which is 0-based. Annotations
created by OCR were therefore assigned to page 0, which doesn't exist
in the viewer. Pass start=1 to enumerate() in both engines and in the
streaming endpoint, and update the affected test expectations to match.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 10:32:08 +02:00
parent bac67706b9
commit 97e5138934
5 changed files with 15 additions and 15 deletions

View File

@@ -88,7 +88,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
""" """
all_blocks = [] all_blocks = []
for page_idx, image in enumerate(images): for page_idx, image in enumerate(images, start=1):
all_blocks.extend(extract_page_blocks(image, page_idx, language)) all_blocks.extend(extract_page_blocks(image, page_idx, language))
return all_blocks return all_blocks

View File

@@ -90,7 +90,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
""" """
all_blocks = [] all_blocks = []
for page_idx, image in enumerate(images): for page_idx, image in enumerate(images, start=1):
all_blocks.extend(extract_page_blocks(image, page_idx, language)) all_blocks.extend(extract_page_blocks(image, page_idx, language))
del image del image

View File

@@ -108,7 +108,7 @@ async def run_ocr_stream(request: OcrRequest):
total_blocks = 0 total_blocks = 0
skipped_pages = 0 skipped_pages = 0
for page_idx, image in enumerate(images): for page_idx, image in enumerate(images, start=1):
try: try:
engine = kraken_engine if use_kraken else surya_engine engine = kraken_engine if use_kraken else surya_engine
blocks = await asyncio.to_thread( blocks = await asyncio.to_thread(

View File

@@ -61,8 +61,8 @@ def test_surya_extract_blocks_delegates_to_extract_page_blocks():
blocks = surya.extract_blocks([image1, image2]) blocks = surya.extract_blocks([image1, image2])
assert len(blocks) == 2 assert len(blocks) == 2
assert blocks[0]["pageNumber"] == 0 assert blocks[0]["pageNumber"] == 1
assert blocks[1]["pageNumber"] == 1 assert blocks[1]["pageNumber"] == 2
# ─── Kraken extract_page_blocks ────────────────────────────────────────────── # ─── Kraken extract_page_blocks ──────────────────────────────────────────────
@@ -128,5 +128,5 @@ def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
blocks = _run_kraken_with_mocks(kraken.extract_blocks, [image1, image2]) blocks = _run_kraken_with_mocks(kraken.extract_blocks, [image1, image2])
assert len(blocks) == 2 assert len(blocks) == 2
assert blocks[0]["pageNumber"] == 0 assert blocks[0]["pageNumber"] == 1
assert blocks[1]["pageNumber"] == 1 assert blocks[1]["pageNumber"] == 2

View File

@@ -68,9 +68,9 @@ async def test_ocr_stream_emits_page_event_per_page_with_blocks(mock_images):
lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()] lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()]
page_events = [l for l in lines if l["type"] == "page"] page_events = [l for l in lines if l["type"] == "page"]
assert len(page_events) == 3 assert len(page_events) == 3
assert page_events[0]["pageNumber"] == 0 assert page_events[0]["pageNumber"] == 1
assert page_events[1]["pageNumber"] == 1 assert page_events[1]["pageNumber"] == 2
assert page_events[2]["pageNumber"] == 2 assert page_events[2]["pageNumber"] == 3
assert len(page_events[0]["blocks"]) == 1 assert len(page_events[0]["blocks"]) == 1
@@ -109,8 +109,8 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i
def page_blocks(image, page_idx, language="de"): def page_blocks(image, page_idx, language="de"):
nonlocal call_count nonlocal call_count
call_count += 1 call_count += 1
if page_idx == 1: if page_idx == 2:
raise RuntimeError("Engine crashed on page 1") raise RuntimeError("Engine crashed on page 2")
return [_make_block(page_idx)] return [_make_block(page_idx)]
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
@@ -128,7 +128,7 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i
types = [l["type"] for l in lines] types = [l["type"] for l in lines]
assert "error" in types assert "error" in types
error_event = [l for l in lines if l["type"] == "error"][0] error_event = [l for l in lines if l["type"] == "error"][0]
assert error_event["pageNumber"] == 1 assert error_event["pageNumber"] == 2
# Error message must be generic, not the raw traceback # Error message must be generic, not the raw traceback
assert "Engine crashed" not in error_event["message"] assert "Engine crashed" not in error_event["message"]
@@ -148,7 +148,7 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
patch("main._models_ready", True), \ patch("main._models_ready", True), \
patch("main.surya_engine") as mock_surya: patch("main.surya_engine") as mock_surya:
mock_surya.extract_blocks.return_value = [_make_block(0), _make_block(1)] mock_surya.extract_blocks.return_value = [_make_block(1), _make_block(2)]
async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
response = await client.post("/ocr", json={ response = await client.post("/ocr", json={
@@ -160,4 +160,4 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
data = response.json() data = response.json()
assert isinstance(data, list) assert isinstance(data, list)
assert len(data) == 2 assert len(data) == 2
assert data[0]["pageNumber"] == 0 assert data[0]["pageNumber"] == 1