diff --git a/ocr-service/engines/kraken.py b/ocr-service/engines/kraken.py index b2b75787..ce994dd7 100644 --- a/ocr-service/engines/kraken.py +++ b/ocr-service/engines/kraken.py @@ -88,7 +88,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: """ all_blocks = [] - for page_idx, image in enumerate(images): + for page_idx, image in enumerate(images, start=1): all_blocks.extend(extract_page_blocks(image, page_idx, language)) return all_blocks diff --git a/ocr-service/engines/surya.py b/ocr-service/engines/surya.py index a82968ce..a0907cb6 100644 --- a/ocr-service/engines/surya.py +++ b/ocr-service/engines/surya.py @@ -90,7 +90,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: """ all_blocks = [] - for page_idx, image in enumerate(images): + for page_idx, image in enumerate(images, start=1): all_blocks.extend(extract_page_blocks(image, page_idx, language)) del image diff --git a/ocr-service/main.py b/ocr-service/main.py index 2b0f7785..66cb01dc 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -108,7 +108,7 @@ async def run_ocr_stream(request: OcrRequest): total_blocks = 0 skipped_pages = 0 - for page_idx, image in enumerate(images): + for page_idx, image in enumerate(images, start=1): try: engine = kraken_engine if use_kraken else surya_engine blocks = await asyncio.to_thread( diff --git a/ocr-service/test_engines.py b/ocr-service/test_engines.py index a6966bd8..8218978e 100644 --- a/ocr-service/test_engines.py +++ b/ocr-service/test_engines.py @@ -61,8 +61,8 @@ def test_surya_extract_blocks_delegates_to_extract_page_blocks(): blocks = surya.extract_blocks([image1, image2]) assert len(blocks) == 2 - assert blocks[0]["pageNumber"] == 0 - assert blocks[1]["pageNumber"] == 1 + assert blocks[0]["pageNumber"] == 1 + assert blocks[1]["pageNumber"] == 2 # ─── Kraken extract_page_blocks ────────────────────────────────────────────── @@ -128,5 +128,5 @@ def test_kraken_extract_blocks_delegates_to_extract_page_blocks(): blocks = _run_kraken_with_mocks(kraken.extract_blocks, [image1, image2]) assert len(blocks) == 2 - assert blocks[0]["pageNumber"] == 0 - assert blocks[1]["pageNumber"] == 1 + assert blocks[0]["pageNumber"] == 1 + assert blocks[1]["pageNumber"] == 2 diff --git a/ocr-service/test_stream.py b/ocr-service/test_stream.py index 13d41644..8b070430 100644 --- a/ocr-service/test_stream.py +++ b/ocr-service/test_stream.py @@ -68,9 +68,9 @@ async def test_ocr_stream_emits_page_event_per_page_with_blocks(mock_images): lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()] page_events = [l for l in lines if l["type"] == "page"] assert len(page_events) == 3 - assert page_events[0]["pageNumber"] == 0 - assert page_events[1]["pageNumber"] == 1 - assert page_events[2]["pageNumber"] == 2 + assert page_events[0]["pageNumber"] == 1 + assert page_events[1]["pageNumber"] == 2 + assert page_events[2]["pageNumber"] == 3 assert len(page_events[0]["blocks"]) == 1 @@ -109,8 +109,8 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i def page_blocks(image, page_idx, language="de"): nonlocal call_count call_count += 1 - if page_idx == 1: - raise RuntimeError("Engine crashed on page 1") + if page_idx == 2: + raise RuntimeError("Engine crashed on page 2") return [_make_block(page_idx)] with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ @@ -128,7 +128,7 @@ async def test_ocr_stream_emits_error_event_on_page_failure_and_continues(mock_i types = [l["type"] for l in lines] assert "error" in types error_event = [l for l in lines if l["type"] == "error"][0] - assert error_event["pageNumber"] == 1 + assert error_event["pageNumber"] == 2 # Error message must be generic, not the raw traceback assert "Engine crashed" not in error_event["message"] @@ -148,7 +148,7 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images): with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main._models_ready", True), \ patch("main.surya_engine") as mock_surya: - mock_surya.extract_blocks.return_value = [_make_block(0), _make_block(1)] + mock_surya.extract_blocks.return_value = [_make_block(1), _make_block(2)] async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: response = await client.post("/ocr", json={ @@ -160,4 +160,4 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images): data = response.json() assert isinstance(data, list) assert len(data) == 2 - assert data[0]["pageNumber"] == 0 + assert data[0]["pageNumber"] == 1