fix(ocr): use 1-based page numbers to match frontend PDF viewer
The PDF viewer uses a 1-based currentPage (starting at 1), but the OCR engines produced a 0-based pageNumber from enumerate(). As a result, annotations created by OCR were assigned to page 0, which doesn't exist in the viewer. Change enumerate() to use start=1 in both engines and the streaming endpoint.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -88,7 +88,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
|||||||
"""
|
"""
|
||||||
all_blocks = []
|
all_blocks = []
|
||||||
|
|
||||||
for page_idx, image in enumerate(images):
|
for page_idx, image in enumerate(images, start=1):
|
||||||
all_blocks.extend(extract_page_blocks(image, page_idx, language))
|
all_blocks.extend(extract_page_blocks(image, page_idx, language))
|
||||||
|
|
||||||
return all_blocks
|
return all_blocks
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
|||||||
"""
|
"""
|
||||||
all_blocks = []
|
all_blocks = []
|
||||||
|
|
||||||
for page_idx, image in enumerate(images):
|
for page_idx, image in enumerate(images, start=1):
|
||||||
all_blocks.extend(extract_page_blocks(image, page_idx, language))
|
all_blocks.extend(extract_page_blocks(image, page_idx, language))
|
||||||
del image
|
del image
|
||||||
|
|
||||||
|
|||||||
@@ -108,7 +108,7 @@ async def run_ocr_stream(request: OcrRequest):
|
|||||||
total_blocks = 0
|
total_blocks = 0
|
||||||
skipped_pages = 0
|
skipped_pages = 0
|
||||||
|
|
||||||
for page_idx, image in enumerate(images):
|
for page_idx, image in enumerate(images, start=1):
|
||||||
try:
|
try:
|
||||||
engine = kraken_engine if use_kraken else surya_engine
|
engine = kraken_engine if use_kraken else surya_engine
|
||||||
blocks = await asyncio.to_thread(
|
blocks = await asyncio.to_thread(
|
||||||
|
|||||||
@@ -61,8 +61,8 @@ def test_surya_extract_blocks_delegates_to_extract_page_blocks():
|
|||||||
blocks = surya.extract_blocks([image1, image2])
|
blocks = surya.extract_blocks([image1, image2])
|
||||||
|
|
||||||
assert len(blocks) == 2
|
assert len(blocks) == 2
|
||||||
assert blocks[0]["pageNumber"] == 0
|
assert blocks[0]["pageNumber"] == 1
|
||||||
assert blocks[1]["pageNumber"] == 1
|
assert blocks[1]["pageNumber"] == 2
|
||||||
|
|
||||||
|
|
||||||
# ─── Kraken extract_page_blocks ──────────────────────────────────────────────
|
# ─── Kraken extract_page_blocks ──────────────────────────────────────────────
|
||||||
@@ -128,5 +128,5 @@ def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
|
|||||||
blocks = _run_kraken_with_mocks(kraken.extract_blocks, [image1, image2])
|
blocks = _run_kraken_with_mocks(kraken.extract_blocks, [image1, image2])
|
||||||
|
|
||||||
assert len(blocks) == 2
|
assert len(blocks) == 2
|
||||||
assert blocks[0]["pageNumber"] == 0
|
assert blocks[0]["pageNumber"] == 1
|
||||||
assert blocks[1]["pageNumber"] == 1
|
assert blocks[1]["pageNumber"] == 2
|
||||||
|
|||||||
@@ -68,9 +68,9 @@ async def test_ocr_stream_emits_page_event_per_page_with_blocks(mock_images):
|
|||||||
lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()]
|
lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()]
|
||||||
page_events = [l for l in lines if l["type"] == "page"]
|
page_events = [l for l in lines if l["type"] == "page"]
|
||||||
assert len(page_events) == 3
|
assert len(page_events) == 3
|
||||||
assert page_events[0]["pageNumber"] == 0
|
assert page_events[0]["pageNumber"] == 1
|
||||||
assert page_events[1]["pageNumber"] == 1
|
assert page_events[1]["pageNumber"] == 2
|
||||||
assert page_events[2]["pageNumber"] == 2
|
assert page_events[2]["pageNumber"] == 3
|
||||||
assert len(page_events[0]["blocks"]) == 1
|
assert len(page_events[0]["blocks"]) == 1
|
||||||
|
|
||||||
|
|
||||||
@@ -109,8 +109,8 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i
|
|||||||
def page_blocks(image, page_idx, language="de"):
|
def page_blocks(image, page_idx, language="de"):
|
||||||
nonlocal call_count
|
nonlocal call_count
|
||||||
call_count += 1
|
call_count += 1
|
||||||
if page_idx == 1:
|
if page_idx == 2:
|
||||||
raise RuntimeError("Engine crashed on page 1")
|
raise RuntimeError("Engine crashed on page 2")
|
||||||
return [_make_block(page_idx)]
|
return [_make_block(page_idx)]
|
||||||
|
|
||||||
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
|
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
|
||||||
@@ -128,7 +128,7 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i
|
|||||||
types = [l["type"] for l in lines]
|
types = [l["type"] for l in lines]
|
||||||
assert "error" in types
|
assert "error" in types
|
||||||
error_event = [l for l in lines if l["type"] == "error"][0]
|
error_event = [l for l in lines if l["type"] == "error"][0]
|
||||||
assert error_event["pageNumber"] == 1
|
assert error_event["pageNumber"] == 2
|
||||||
# Error message must be generic, not the raw traceback
|
# Error message must be generic, not the raw traceback
|
||||||
assert "Engine crashed" not in error_event["message"]
|
assert "Engine crashed" not in error_event["message"]
|
||||||
|
|
||||||
@@ -148,7 +148,7 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
|
|||||||
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
|
with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
|
||||||
patch("main._models_ready", True), \
|
patch("main._models_ready", True), \
|
||||||
patch("main.surya_engine") as mock_surya:
|
patch("main.surya_engine") as mock_surya:
|
||||||
mock_surya.extract_blocks.return_value = [_make_block(0), _make_block(1)]
|
mock_surya.extract_blocks.return_value = [_make_block(1), _make_block(2)]
|
||||||
|
|
||||||
async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
|
async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
|
||||||
response = await client.post("/ocr", json={
|
response = await client.post("/ocr", json={
|
||||||
@@ -160,4 +160,4 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
|
|||||||
data = response.json()
|
data = response.json()
|
||||||
assert isinstance(data, list)
|
assert isinstance(data, list)
|
||||||
assert len(data) == 2
|
assert len(data) == 2
|
||||||
assert data[0]["pageNumber"] == 0
|
assert data[0]["pageNumber"] == 1
|
||||||
|
|||||||
Reference in New Issue
Block a user