fix(ocr): use 1-based page numbers to match frontend PDF viewer
The PDF viewer uses a 1-based currentPage, but the OCR engines produced a 0-based pageNumber from enumerate(). As a result, annotations created by OCR were assigned to page 0, which doesn't exist in the viewer. Change the enumerate() calls to use start=1 in both engines and the streaming endpoint, and update the tests to expect 1-based page numbers. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -88,7 +88,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||
"""
|
||||
all_blocks = []
|
||||
|
||||
for page_idx, image in enumerate(images):
|
||||
for page_idx, image in enumerate(images, start=1):
|
||||
all_blocks.extend(extract_page_blocks(image, page_idx, language))
|
||||
|
||||
return all_blocks
|
||||
|
||||
@@ -90,7 +90,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||
"""
|
||||
all_blocks = []
|
||||
|
||||
for page_idx, image in enumerate(images):
|
||||
for page_idx, image in enumerate(images, start=1):
|
||||
all_blocks.extend(extract_page_blocks(image, page_idx, language))
|
||||
del image
|
||||
|
||||
|
||||
@@ -108,7 +108,7 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
total_blocks = 0
|
||||
skipped_pages = 0
|
||||
|
||||
for page_idx, image in enumerate(images):
|
||||
for page_idx, image in enumerate(images, start=1):
|
||||
try:
|
||||
engine = kraken_engine if use_kraken else surya_engine
|
||||
blocks = await asyncio.to_thread(
|
||||
|
||||
@@ -61,8 +61,8 @@ def test_surya_extract_blocks_delegates_to_extract_page_blocks():
|
||||
blocks = surya.extract_blocks([image1, image2])
|
||||
|
||||
assert len(blocks) == 2
|
||||
assert blocks[0]["pageNumber"] == 0
|
||||
assert blocks[1]["pageNumber"] == 1
|
||||
assert blocks[0]["pageNumber"] == 1
|
||||
assert blocks[1]["pageNumber"] == 2
|
||||
|
||||
|
||||
# ─── Kraken extract_page_blocks ──────────────────────────────────────────────
|
||||
@@ -128,5 +128,5 @@ def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
|
||||
blocks = _run_kraken_with_mocks(kraken.extract_blocks, [image1, image2])
|
||||
|
||||
assert len(blocks) == 2
|
||||
assert blocks[0]["pageNumber"] == 0
|
||||
assert blocks[1]["pageNumber"] == 1
|
||||
assert blocks[0]["pageNumber"] == 1
|
||||
assert blocks[1]["pageNumber"] == 2
|
||||
|
||||
@@ -68,9 +68,9 @@ async def test_ocr_stream_emits_page_event_per_page_with_blocks(mock_images):
|
||||
lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()]
|
||||
page_events = [l for l in lines if l["type"] == "page"]
|
||||
assert len(page_events) == 3
|
||||
assert page_events[0]["pageNumber"] == 0
|
||||
assert page_events[1]["pageNumber"] == 1
|
||||
assert page_events[2]["pageNumber"] == 2
|
||||
assert page_events[0]["pageNumber"] == 1
|
||||
assert page_events[1]["pageNumber"] == 2
|
||||
assert page_events[2]["pageNumber"] == 3
|
||||
assert len(page_events[0]["blocks"]) == 1
|
||||
|
||||
|
||||
@@ -109,8 +109,8 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i
|
||||
def page_blocks(image, page_idx, language="de"):
|
||||
nonlocal call_count
|
||||
call_count += 1
|
||||
if page_idx == 1:
|
||||
raise RuntimeError("Engine crashed on page 1")
|
||||
if page_idx == 2:
|
||||
raise RuntimeError("Engine crashed on page 2")
|
||||
return [_make_block(page_idx)]
|
||||
|
||||
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
|
||||
@@ -128,7 +128,7 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i
|
||||
types = [l["type"] for l in lines]
|
||||
assert "error" in types
|
||||
error_event = [l for l in lines if l["type"] == "error"][0]
|
||||
assert error_event["pageNumber"] == 1
|
||||
assert error_event["pageNumber"] == 2
|
||||
# Error message must be generic, not the raw traceback
|
||||
assert "Engine crashed" not in error_event["message"]
|
||||
|
||||
@@ -148,7 +148,7 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
|
||||
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
|
||||
patch("main._models_ready", True), \
|
||||
patch("main.surya_engine") as mock_surya:
|
||||
mock_surya.extract_blocks.return_value = [_make_block(0), _make_block(1)]
|
||||
mock_surya.extract_blocks.return_value = [_make_block(1), _make_block(2)]
|
||||
|
||||
async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
|
||||
response = await client.post("/ocr", json={
|
||||
@@ -160,4 +160,4 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
|
||||
data = response.json()
|
||||
assert isinstance(data, list)
|
||||
assert len(data) == 2
|
||||
assert data[0]["pageNumber"] == 0
|
||||
assert data[0]["pageNumber"] == 1
|
||||
|
||||
Reference in New Issue
Block a user