From ddf6cf4cbca0360a5310ecd1642ec56de35ddec9 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 17:03:29 +0200 Subject: [PATCH] test(ocr): collapse shared client setup into ocr_client helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each metrics test was repeating the same five-line block — patch kraken_engine.load_models, patch load_spell_checker, instantiate the AsyncClient, force _models_ready True, restore it. Lift the lot into a single async context manager so each test body shrinks to its real arrange / act / assert intent. Tests that drive the lifespan directly (models_ready gauge) or stub asyncio.to_thread for /train (which already patches _models_ready) stay unchanged. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_metrics.py | 255 +++++++++++++++--------------------- 1 file changed, 102 insertions(+), 153 deletions(-) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 1e6ab7fa..253cae14 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -4,6 +4,7 @@ Each test that asserts on a counter/gauge value uses a fresh CollectorRegistry (see decision #3 on issue #652) to keep the metrics isolated between tests. """ +import contextlib import io import zipfile from unittest.mock import AsyncMock, patch @@ -17,6 +18,26 @@ from main import app from metrics import build_metrics +@contextlib.asynccontextmanager +async def ocr_client(*, raise_app_exceptions: bool = True): + """Yield an AsyncClient with model-loaders patched and _models_ready forced on. + + The shared setup for almost every metrics test: stub the heavy lifecycle + hooks (kraken_engine.load_models, load_spell_checker), flip the readiness + flag so request handlers do not 503, and restore it afterwards. + """ + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"): + transport = ASGITransport(app=app, raise_app_exceptions=raise_app_exceptions) + async with AsyncClient(transport=transport, base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + yield client + finally: + main_module._models_ready = False + + def _minimal_zip() -> bytes: """Return a ZIP containing one fake .xml so endpoint validation passes.""" buf = io.BytesIO() @@ -57,26 +78,18 @@ async def test_metrics_includes_http_request_metrics_after_ocr_call(): mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "hi", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - # Models need to be loaded for /ocr to accept requests; force the flag. - import main as main_module - main_module._models_ready = True - try: - ocr_response = await client.post("/ocr", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) - assert ocr_response.status_code == 200, ocr_response.text + async with ocr_client() as client: + ocr_response = await client.post("/ocr", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) + assert ocr_response.status_code == 200, ocr_response.text - metrics_response = await client.get("/metrics") - finally: - main_module._models_ready = False + metrics_response = await client.get("/metrics") body = metrics_response.text assert "http_requests_total" in body @@ -124,20 +137,13 @@ async def test_ocr_jobs_total_incremented_with_kraken_engine_label_for_kurrent(f mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "hi", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main.correct_text", side_effect=lambda t: t), \ + with patch("main.correct_text", side_effect=lambda t: t), \ patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.kraken_engine.is_available", return_value=True), \ patch("main.kraken_engine.extract_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - await _drive_ocr(client, script_type="HANDWRITING_KURRENT") - finally: - main_module._models_ready = False + async with ocr_client() as client: + await _drive_ocr(client, script_type="HANDWRITING_KURRENT") value = fresh_metrics.ocr_jobs_total.labels( engine="kraken", script_type="HANDWRITING_KURRENT" @@ -152,18 +158,11 @@ async def test_ocr_jobs_total_incremented_with_surya_engine_label_for_typewriter mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "hi", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - await _drive_ocr(client, script_type="TYPEWRITER") - finally: - main_module._models_ready = False + async with ocr_client() as client: + await _drive_ocr(client, script_type="TYPEWRITER") value = fresh_metrics.ocr_jobs_total.labels( engine="surya", script_type="TYPEWRITER" @@ -178,26 +177,19 @@ async def test_ocr_pages_total_incremented_once_per_page_in_stream(fresh_metrics mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "hi", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_page_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) as response: - assert response.status_code == 200 - # Drain the stream so all per-page increments fire. - async for _ in response.aiter_lines(): - pass - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) as response: + assert response.status_code == 200 + # Drain the stream so all per-page increments fire. + async for _ in response.aiter_lines(): + pass value = fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() assert value == 3.0 @@ -218,28 +210,21 @@ async def test_ocr_skipped_pages_total_incremented_when_engine_raises_for_a_page raise RuntimeError("synthetic engine failure") return good_blocks - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_page_blocks", side_effect=extract_side_effect): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) as response: - assert response.status_code == 200 - saw_error = False - async for line in response.aiter_lines(): - if line and '"type": "error"' in line: - saw_error = True - assert saw_error - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) as response: + assert response.status_code == 200 + saw_error = False + async for line in response.aiter_lines(): + if line and '"type": "error"' in line: + saw_error = True + assert saw_error assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0 # The second page still succeeds. @@ -266,18 +251,11 @@ async def test_ocr_words_and_illegible_words_total_sum_across_blocks(fresh_metri {"text": "Karl", "confidence": 0.95}]}, ] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - await _drive_ocr(client, script_type="TYPEWRITER") - finally: - main_module._models_ready = False + async with ocr_client() as client: + await _drive_ocr(client, script_type="TYPEWRITER") assert fresh_metrics.ocr_words_total._value.get() == 5.0 assert fresh_metrics.ocr_illegible_words_total._value.get() == 2.0 @@ -296,25 +274,18 @@ async def test_ocr_processing_seconds_histogram_observed_per_page_in_stream(fres mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "ok", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_page_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) as response: - assert response.status_code == 200 - async for _ in response.aiter_lines(): - pass - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass sum_seconds, count = _histogram_count_sum( fresh_metrics.ocr_processing_seconds, engine="surya" @@ -507,26 +478,19 @@ async def test_ocr_processing_seconds_histogram_observed_per_page_in_guided_stre {"pageNumber": 2, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "annotationId": "a2"}, ] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_region_text", return_value="text"): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - "regions": regions, - }) as response: - assert response.status_code == 200 - async for _ in response.aiter_lines(): - pass - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + "regions": regions, + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass sum_seconds, count = _histogram_count_sum( fresh_metrics.ocr_processing_seconds, engine="surya" @@ -549,28 +513,21 @@ async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_gui _time.sleep(0.05) return text - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.kraken_engine.is_available", return_value=True), \ patch("main.kraken_engine.extract_region_text", return_value="text"), \ patch("main.correct_text", side_effect=slow_correct): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "HANDWRITING_KURRENT", - "language": "de", - "regions": regions, - }) as response: - assert response.status_code == 200 - async for _ in response.aiter_lines(): - pass - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "HANDWRITING_KURRENT", + "language": "de", + "regions": regions, + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass sum_seconds, _ = _histogram_count_sum( fresh_metrics.ocr_processing_seconds, engine="kraken" @@ -591,21 +548,13 @@ async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream( async def fail_download(url): raise RuntimeError("synthetic download failure") - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new=fail_download): - transport = ASGITransport(app=app, raise_app_exceptions=False) - async with AsyncClient(transport=transport, base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - response = await client.post("/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) - finally: - main_module._models_ready = False + with patch("main._download_and_convert_pdf", new=fail_download): + async with ocr_client(raise_app_exceptions=False) as client: + response = await client.post("/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) assert response.status_code == 500 assert fresh_metrics.ocr_jobs_total.labels(