From 18c93d4eaa12c74b89953f47ef6387ff7729f8a0 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 15:59:37 +0200 Subject: [PATCH 01/27] feat(ocr): expose /metrics endpoint via prometheus-fastapi-instrumentator Mount the instrumentator immediately after FastAPI app creation, excluding /health and /metrics from request metrics to keep http_requests_total focused on real application traffic. Refs #652 Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 3 +++ ocr-service/requirements.txt | 1 + ocr-service/test_metrics.py | 24 ++++++++++++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 ocr-service/test_metrics.py diff --git a/ocr-service/main.py b/ocr-service/main.py index 409cc78f..63859eba 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -20,6 +20,7 @@ import pypdfium2 as pdfium from fastapi import FastAPI, Form, Header, HTTPException, UploadFile from fastapi.responses import StreamingResponse from PIL import Image +from prometheus_fastapi_instrumentator import Instrumentator from confidence import apply_confidence_markers, get_threshold from spell_check import correct_text, load_spell_checker @@ -72,6 +73,8 @@ async def lifespan(app: FastAPI): app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan) +Instrumentator(excluded_handlers=["/health", "/metrics"]).instrument(app).expose(app) + @app.get("/health") def health(): diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt index befaafcb..8a9bedec 100644 --- a/ocr-service/requirements.txt +++ b/ocr-service/requirements.txt @@ -10,3 +10,4 @@ pyvips>=2.2.0 httpx==0.28.1 pyspellchecker==0.9.0 opencv-python-headless==4.11.0.86 +prometheus-fastapi-instrumentator==7.0.0 diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py new file mode 100644 index 00000000..3c6bc037 --- /dev/null +++ b/ocr-service/test_metrics.py @@ -0,0 +1,24 @@ +"""Tests for Prometheus metrics exposed by the OCR service. + +Each test that asserts on a counter/gauge value uses a fresh CollectorRegistry +(see decision #3 on issue #652) to keep the metrics isolated between tests. +""" + +from unittest.mock import patch + +import pytest +from httpx import ASGITransport, AsyncClient + +from main import app + + +@pytest.mark.asyncio +async def test_metrics_endpoint_returns_200(): + """`GET /metrics` returns 200 with Prometheus exposition content.""" + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/metrics") + + assert response.status_code == 200 + assert "text/plain" in response.headers.get("content-type", "") -- 2.49.1 From 4bb6685edb5d82f0f53c3dcfb267f8d8b911e287 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:00:33 +0200 Subject: [PATCH 02/27] test(ocr): assert http_* metrics appear after an /ocr request Locks down AC1: prometheus-fastapi-instrumentator must keep auto-exposing http_requests_total and http_request_duration_seconds for application traffic, not just register the /metrics endpoint. Refs #652 Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_metrics.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 3c6bc037..442a2d0e 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -4,10 +4,11 @@ Each test that asserts on a counter/gauge value uses a fresh CollectorRegistry (see decision #3 on issue #652) to keep the metrics isolated between tests. """ -from unittest.mock import patch +from unittest.mock import AsyncMock, patch import pytest from httpx import ASGITransport, AsyncClient +from PIL import Image from main import app @@ -22,3 +23,36 @@ async def test_metrics_endpoint_returns_200(): assert response.status_code == 200 assert "text/plain" in response.headers.get("content-type", "") + + +@pytest.mark.asyncio +async def test_metrics_includes_http_request_metrics_after_ocr_call(): + """After a request to /ocr, `/metrics` exposes auto-instrumented http_* metrics.""" + mock_images = [Image.new("RGB", (100, 100))] + mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "hi", "words": []}] + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.surya_engine.extract_blocks", return_value=mock_blocks): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + # Models need to be loaded for /ocr to accept requests; force the flag. + import main as main_module + main_module._models_ready = True + try: + ocr_response = await client.post("/ocr", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) + assert ocr_response.status_code == 200, ocr_response.text + + metrics_response = await client.get("/metrics") + finally: + main_module._models_ready = False + + body = metrics_response.text + assert "http_requests_total" in body + assert "http_request_duration_seconds" in body -- 2.49.1 From f3e3545d0682e3b19b61a8f80ea3a00f6cbb72e1 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:02:20 +0200 Subject: [PATCH 03/27] feat(ocr): add metrics.py factory with test-scoped CollectorRegistry support Encapsulates every custom OCR metric in an OcrMetrics frozen dataclass and exposes a `build_metrics(registry)` factory. Production main.py binds against the default REGISTRY; tests construct a fresh CollectorRegistry per case and monkeypatch main.metrics, so counter values stay isolated between tests (decision #3 on issue #652, Option A). Refs #652 Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 4 ++ ocr-service/metrics.py | 92 +++++++++++++++++++++++++++++++++++++ ocr-service/test_metrics.py | 35 ++++++++++++++ 3 files changed, 131 insertions(+) create mode 100644 ocr-service/metrics.py diff --git a/ocr-service/main.py b/ocr-service/main.py index 63859eba..0d068d15 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -20,9 +20,11 @@ import pypdfium2 as pdfium from fastapi import FastAPI, Form, Header, HTTPException, UploadFile from fastapi.responses import StreamingResponse from PIL import Image +from prometheus_client import REGISTRY from prometheus_fastapi_instrumentator import Instrumentator from confidence import apply_confidence_markers, get_threshold +from metrics import OcrMetrics, build_metrics from spell_check import correct_text, load_spell_checker from engines import kraken as kraken_engine from engines import surya as surya_engine @@ -38,6 +40,8 @@ logger = logging.getLogger(__name__) _models_ready = False +metrics: OcrMetrics = build_metrics(REGISTRY) + ALLOWED_PDF_HOSTS = set( h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",") ) diff --git a/ocr-service/metrics.py b/ocr-service/metrics.py new file mode 100644 index 00000000..66efb9f2 --- /dev/null +++ b/ocr-service/metrics.py @@ -0,0 +1,92 @@ +"""Prometheus metric definitions for the OCR service. + +`build_metrics(registry)` returns a fresh `OcrMetrics` instance bound to the +given `CollectorRegistry`. Production code calls it once at module load with +the default `REGISTRY`; tests pass a per-test `CollectorRegistry()` to keep +counter values isolated between cases (decision #3 on issue #652). +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram + + +@dataclass(frozen=True) +class OcrMetrics: + """Container for every custom OCR metric. + + Counters and gauges are immutable references to `prometheus_client` + instances. Mutating them (`.inc()`, `.observe()`, `.set()`) is safe; + rebinding the field on the dataclass is not — use `build_metrics` to get + a new container. + """ + + ocr_jobs_total: Counter + ocr_pages_total: Counter + ocr_skipped_pages_total: Counter + ocr_words_total: Counter + ocr_illegible_words_total: Counter + ocr_processing_seconds: Histogram + ocr_training_runs_total: Counter + ocr_model_accuracy: Gauge + ocr_models_ready: Gauge + + +def build_metrics(registry: CollectorRegistry) -> OcrMetrics: + """Create one OcrMetrics instance bound to `registry`.""" + return OcrMetrics( + ocr_jobs_total=Counter( + "ocr_jobs_total", + "Number of OCR jobs processed, labelled by engine and script type.", + ["engine", "script_type"], + registry=registry, + ), + ocr_pages_total=Counter( + "ocr_pages_total", + "Number of pages successfully OCR'd, labelled by engine.", + ["engine"], + registry=registry, + ), + ocr_skipped_pages_total=Counter( + "ocr_skipped_pages_total", + "Number of pages skipped because the OCR engine raised.", + registry=registry, + ), + ocr_words_total=Counter( + "ocr_words_total", + "Number of words recognized across all OCR blocks.", + registry=registry, + ), + ocr_illegible_words_total=Counter( + "ocr_illegible_words_total", + "Number of words below the confidence threshold " + "(replaced with [unleserlich]).", + registry=registry, + ), + ocr_processing_seconds=Histogram( + "ocr_processing_seconds", + "OCR processing time per page (streaming) or per document (non-streaming).", + ["engine"], + registry=registry, + ), + ocr_training_runs_total=Counter( + "ocr_training_runs_total", + "Number of training runs, labelled by kind (recognition|segmentation) " + "and outcome (success|error).", + ["kind", "outcome"], + registry=registry, + ), + ocr_model_accuracy=Gauge( + "ocr_model_accuracy", + "Latest model accuracy reported by a successful training run.", + ["kind"], + registry=registry, + ), + ocr_models_ready=Gauge( + "ocr_models_ready", + "1 once the lifespan startup has finished loading models, 0 before.", + registry=registry, + ), + ) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 442a2d0e..0906beec 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -9,8 +9,19 @@ from unittest.mock import AsyncMock, patch import pytest from httpx import ASGITransport, AsyncClient from PIL import Image +from prometheus_client import CollectorRegistry from main import app +from metrics import build_metrics + + +@pytest.fixture +def fresh_metrics(monkeypatch): + """Replace the module-level `main.metrics` with one bound to a fresh registry.""" + registry = CollectorRegistry() + test_metrics = build_metrics(registry) + monkeypatch.setattr("main.metrics", test_metrics) + return test_metrics @pytest.mark.asyncio @@ -56,3 +67,27 @@ async def test_metrics_includes_http_request_metrics_after_ocr_call(): body = metrics_response.text assert "http_requests_total" in body assert "http_request_duration_seconds" in body + + +def test_build_metrics_registers_all_custom_metrics_on_given_registry(): + """`build_metrics` returns an OcrMetrics bound to the supplied registry.""" + registry = CollectorRegistry() + metrics = build_metrics(registry) + + metric_names = {m.name for m in registry.collect()} + expected = { + "ocr_jobs", + "ocr_pages", + "ocr_skipped_pages", + "ocr_words", + "ocr_illegible_words", + "ocr_processing_seconds", + "ocr_training_runs", + "ocr_model_accuracy", + "ocr_models_ready", + } + assert expected <= metric_names, f"missing: {expected - metric_names}" + + # A second registry yields a separate container — no shared state. + other_metrics = build_metrics(CollectorRegistry()) + assert metrics is not other_metrics -- 2.49.1 From 696b71da5abe63915f823ac1a58740c51b89c79e Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:03:37 +0200 Subject: [PATCH 04/27] feat(ocr): increment ocr_jobs_total with engine and script_type labels Pick engine="kraken" for HANDWRITING_KURRENT, engine="surya" otherwise, then increment after the blocks have been extracted. Refs #652 (AC2) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 3 +++ ocr-service/test_metrics.py | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index 0d068d15..83a092ee 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -106,6 +106,7 @@ async def run_ocr(request: OcrRequest): del img script_type = request.scriptType.upper() + engine_name = "kraken" if script_type == "HANDWRITING_KURRENT" else "surya" if script_type == "HANDWRITING_KURRENT": if not kraken_engine.is_available(): @@ -119,6 +120,8 @@ async def run_ocr(request: OcrRequest): # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya blocks = await asyncio.to_thread(surya_engine.extract_blocks, images, request.language) + metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc() + threshold = get_threshold(script_type) for block in blocks: if block.get("words"): diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 0906beec..2f7b0db8 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -91,3 +91,41 @@ def test_build_metrics_registers_all_custom_metrics_on_given_registry(): # A second registry yields a separate container — no shared state. other_metrics = build_metrics(CollectorRegistry()) assert metrics is not other_metrics + + +async def _drive_ocr(client: AsyncClient, *, script_type: str) -> None: + """Helper — fires /ocr with a single mocked page and asserts a 200.""" + response = await client.post("/ocr", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": script_type, + "language": "de", + }) + assert response.status_code == 200, response.text + + +@pytest.mark.asyncio +async def test_ocr_jobs_total_incremented_with_kraken_engine_label_for_kurrent(fresh_metrics): + """A /ocr call with HANDWRITING_KURRENT increments engine=kraken.""" + mock_images = [Image.new("RGB", (100, 100))] + mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "hi", "words": []}] + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main.correct_text", side_effect=lambda t: t), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.kraken_engine.is_available", return_value=True), \ + patch("main.kraken_engine.extract_blocks", return_value=mock_blocks): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + await _drive_ocr(client, script_type="HANDWRITING_KURRENT") + finally: + main_module._models_ready = False + + value = fresh_metrics.ocr_jobs_total.labels( + engine="kraken", script_type="HANDWRITING_KURRENT" + )._value.get() + assert value == 1.0 -- 2.49.1 From 52d8dc2b20fd33612ea6afa1aa0179a78a5e34a0 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:04:20 +0200 Subject: [PATCH 05/27] test(ocr): assert ocr_jobs_total label is engine=surya for typewriter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Locks down AC2 for the non-Kurrent path. The same code branch in /ocr that sets engine_name from script_type now has explicit coverage for both HANDWRITING_KURRENT → kraken and TYPEWRITER → surya. Refs #652 (AC2) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_metrics.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 2f7b0db8..f034a31c 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -129,3 +129,29 @@ async def test_ocr_jobs_total_incremented_with_kraken_engine_label_for_kurrent(f engine="kraken", script_type="HANDWRITING_KURRENT" )._value.get() assert value == 1.0 + + +@pytest.mark.asyncio +async def test_ocr_jobs_total_incremented_with_surya_engine_label_for_typewriter(fresh_metrics): + """A /ocr call with TYPEWRITER increments engine=surya.""" + mock_images = [Image.new("RGB", (100, 100))] + mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "hi", "words": []}] + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.surya_engine.extract_blocks", return_value=mock_blocks): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + await _drive_ocr(client, script_type="TYPEWRITER") + finally: + main_module._models_ready = False + + value = fresh_metrics.ocr_jobs_total.labels( + engine="surya", script_type="TYPEWRITER" + )._value.get() + assert value == 1.0 -- 2.49.1 From 79edb9455810dcfad65fb5bea05a690443ec2995 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:05:36 +0200 Subject: [PATCH 06/27] feat(ocr): increment ocr_pages_total per successful page in stream Bumps the counter inside both the standard and guided /ocr/stream generators after a page yields its blocks, before the per-page json line is emitted. Also moves the ocr_jobs_total increment for /ocr/stream right after engine selection so the counter still fires when a page later errors out. Refs #652 (AC3a) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 5 +++++ ocr-service/test_metrics.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index 83a092ee..08990e24 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -156,6 +156,9 @@ async def run_ocr_stream(request: OcrRequest): ) engine = kraken_engine if use_kraken else surya_engine + engine_name = "kraken" if use_kraken else "surya" + + metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc() if request.regions: # Guided mode: recognize only the user-drawn annotation regions @@ -206,6 +209,7 @@ async def run_ocr_stream(request: OcrRequest): }) total_blocks += len(blocks) + metrics.ocr_pages_total.labels(engine=engine_name).inc() yield json.dumps({ "type": "page", "pageNumber": page_idx, @@ -260,6 +264,7 @@ async def run_ocr_stream(request: OcrRequest): block["text"] = correct_text(block["text"]) total_blocks += len(blocks) + metrics.ocr_pages_total.labels(engine=engine_name).inc() yield json.dumps({ "type": "page", "pageNumber": page_idx, diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index f034a31c..f91bd0af 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -155,3 +155,35 @@ async def test_ocr_jobs_total_incremented_with_surya_engine_label_for_typewriter engine="surya", script_type="TYPEWRITER" )._value.get() assert value == 1.0 + + +@pytest.mark.asyncio +async def test_ocr_pages_total_incremented_once_per_page_in_stream(fresh_metrics): + """The /ocr/stream generator increments ocr_pages_total per successful page.""" + mock_images = [Image.new("RGB", (100, 100)) for _ in range(3)] + mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "hi", "words": []}] + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.surya_engine.extract_page_blocks", return_value=mock_blocks): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) as response: + assert response.status_code == 200 + # Drain the stream so all per-page increments fire. + async for _ in response.aiter_lines(): + pass + finally: + main_module._models_ready = False + + value = fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() + assert value == 3.0 -- 2.49.1 From 3fa3460dbfb16b13a972d2ce8d59439ece0ff6bc Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:06:50 +0200 Subject: [PATCH 07/27] feat(ocr): increment ocr_skipped_pages_total on per-page engine failure Bumps the counter in both /ocr/stream except blocks (standard and guided generators) so the existing skipped_pages local variable now also flows into Prometheus. Refs #652 (AC3b) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 2 ++ ocr-service/test_metrics.py | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index 08990e24..9522d8ae 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -219,6 +219,7 @@ async def run_ocr_stream(request: OcrRequest): except Exception: logger.exception("Guided OCR failed on page %d", page_idx) skipped_pages += 1 + metrics.ocr_skipped_pages_total.inc() yield json.dumps({ "type": "error", "pageNumber": page_idx, @@ -274,6 +275,7 @@ async def run_ocr_stream(request: OcrRequest): except Exception: logger.exception("OCR failed on page %d", page_idx) skipped_pages += 1 + metrics.ocr_skipped_pages_total.inc() yield json.dumps({ "type": "error", "pageNumber": page_idx, diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index f91bd0af..627a78d7 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -187,3 +187,46 @@ async def test_ocr_pages_total_incremented_once_per_page_in_stream(fresh_metrics value = fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() assert value == 3.0 + + +@pytest.mark.asyncio +async def test_ocr_skipped_pages_total_incremented_when_engine_raises_for_a_page(fresh_metrics): + """When the engine raises on a page, ocr_skipped_pages_total bumps and the stream finishes.""" + mock_images = [Image.new("RGB", (100, 100)) for _ in range(2)] + good_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "ok", "words": []}] + + call_count = {"n": 0} + + def extract_side_effect(*args, **kwargs): + call_count["n"] += 1 + if call_count["n"] == 1: + raise RuntimeError("synthetic engine failure") + return good_blocks + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.surya_engine.extract_page_blocks", side_effect=extract_side_effect): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) as response: + assert response.status_code == 200 + saw_error = False + async for line in response.aiter_lines(): + if line and '"type": "error"' in line: + saw_error = True + assert saw_error + finally: + main_module._models_ready = False + + assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0 + # The second page still succeeds. + assert fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() == 1.0 -- 2.49.1 From 131ed336bc04c40a182e5110bcaeb7576f862ec8 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:07:59 +0200 Subject: [PATCH 08/27] feat(ocr): count words and illegible words at the OCR call sites Walks block["words"] before apply_confidence_markers strips the list, then increments ocr_words_total by len(words) and ocr_illegible_words_total by the count below threshold. Same pattern in both /ocr and /ocr/stream so the ratio illegible/words is a faithful quality signal across endpoints. Refs #652 (AC4) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 18 ++++++++++++++---- ocr-service/test_metrics.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/ocr-service/main.py b/ocr-service/main.py index 9522d8ae..12a77a45 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -124,8 +124,13 @@ async def run_ocr(request: OcrRequest): threshold = get_threshold(script_type) for block in blocks: - if block.get("words"): - block["text"] = apply_confidence_markers(block["words"], threshold) + words = block.get("words") or [] + if words: + metrics.ocr_words_total.inc(len(words)) + metrics.ocr_illegible_words_total.inc( + sum(1 for w in words if w["confidence"] < threshold) + ) + block["text"] = apply_confidence_markers(words, threshold) block.pop("words", None) if script_type in _SPELL_CHECK_SCRIPT_TYPES: block["text"] = correct_text(block["text"]) @@ -258,8 +263,13 @@ async def run_ocr_stream(request: OcrRequest): ) for block in blocks: - if block.get("words"): - block["text"] = apply_confidence_markers(block["words"], threshold) + words = block.get("words") or [] + if words: + metrics.ocr_words_total.inc(len(words)) + metrics.ocr_illegible_words_total.inc( + sum(1 for w in words if w["confidence"] < threshold) + ) + block["text"] = apply_confidence_markers(words, threshold) block.pop("words", None) if script_type in _SPELL_CHECK_SCRIPT_TYPES: block["text"] = correct_text(block["text"]) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 627a78d7..a13db209 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -230,3 +230,40 @@ async def test_ocr_skipped_pages_total_incremented_when_engine_raises_for_a_page assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0 # The second page still succeeds. assert fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() == 1.0 + + +@pytest.mark.asyncio +async def test_ocr_words_and_illegible_words_total_sum_across_blocks(fresh_metrics): + """Counters reflect totals summed over every block in the request. + + Threshold defaults to THRESHOLD_DEFAULT (0.3) for non-Kurrent scripts. Two + blocks: 3 words above + 2 words below threshold across blocks. + """ + mock_images = [Image.new("RGB", (100, 100))] + mock_blocks = [ + {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "ignored", + "words": [{"text": "Lieber", "confidence": 0.9}, + {"text": "Freund", "confidence": 0.1}]}, + {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "ignored", + "words": [{"text": "Gruss", "confidence": 0.8}, + {"text": "verschmiert", "confidence": 0.05}, + {"text": "Karl", "confidence": 0.95}]}, + ] + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.surya_engine.extract_blocks", return_value=mock_blocks): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + await _drive_ocr(client, script_type="TYPEWRITER") + finally: + main_module._models_ready = False + + assert fresh_metrics.ocr_words_total._value.get() == 5.0 + assert fresh_metrics.ocr_illegible_words_total._value.get() == 2.0 -- 2.49.1 From 2e3744d9ef6c065ca444e25c39d3a4ac1996dade Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:09:25 +0200 Subject: [PATCH 09/27] feat(ocr): observe ocr_processing_seconds around engine.to_thread calls Wraps every asyncio.to_thread(engine.extract_*) call with time.monotonic() deltas in /ocr (per document) and in both /ocr/stream generators (per page). Streaming buckets are the useful operational signal; the non-streaming observation is a bonus. Refs #652 (AC5) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 13 ++++++++++++ ocr-service/test_metrics.py | 40 +++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index 12a77a45..c9f5b783 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -10,6 +10,7 @@ import re import shutil import subprocess import tempfile +import time import zipfile from contextlib import asynccontextmanager from datetime import datetime, timezone @@ -108,6 +109,7 @@ async def run_ocr(request: OcrRequest): script_type = request.scriptType.upper() engine_name = "kraken" if script_type == "HANDWRITING_KURRENT" else "surya" + extract_started = time.monotonic() if script_type == "HANDWRITING_KURRENT": if not kraken_engine.is_available(): raise HTTPException( @@ -119,6 +121,9 @@ async def run_ocr(request: OcrRequest): else: # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya blocks = await asyncio.to_thread(surya_engine.extract_blocks, images, request.language) + metrics.ocr_processing_seconds.labels(engine=engine_name).observe( + time.monotonic() - extract_started + ) metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc() @@ -194,6 +199,7 @@ async def run_ocr_stream(request: OcrRequest): image = await asyncio.to_thread(preprocess_page, image) blocks = [] sender_path = request.senderModelPath if use_kraken else None + page_started = time.monotonic() for region in page_regions: text = await asyncio.to_thread( engine.extract_region_text, image, @@ -213,6 +219,9 @@ async def run_ocr_stream(request: OcrRequest): "annotationId": region.annotationId, }) + metrics.ocr_processing_seconds.labels(engine=engine_name).observe( + time.monotonic() - page_started + ) total_blocks += len(blocks) metrics.ocr_pages_total.labels(engine=engine_name).inc() yield json.dumps({ @@ -258,9 +267,13 @@ async def run_ocr_stream(request: OcrRequest): yield json.dumps({"type": "preprocessing", "pageNumber": page_idx}) + "\n" image = await asyncio.to_thread(preprocess_page, image) sender_path = request.senderModelPath if use_kraken else None + page_started = time.monotonic() blocks = await asyncio.to_thread( engine.extract_page_blocks, image, page_idx, request.language, sender_path ) + metrics.ocr_processing_seconds.labels(engine=engine_name).observe( + time.monotonic() - page_started + ) for block in blocks: words = block.get("words") or [] diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index a13db209..5701bbe3 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -267,3 +267,43 @@ async def test_ocr_words_and_illegible_words_total_sum_across_blocks(fresh_metri assert fresh_metrics.ocr_words_total._value.get() == 5.0 assert fresh_metrics.ocr_illegible_words_total._value.get() == 2.0 + + +def _histogram_count_sum(histogram, **labels) -> tuple[float, float]: + """Read the per-label-set _count and _sum from a prometheus_client Histogram.""" + child = histogram.labels(**labels) + return child._sum.get(), sum(b.get() for b in child._buckets) + + +@pytest.mark.asyncio +async def test_ocr_processing_seconds_histogram_observed_per_page_in_stream(fresh_metrics): + """The streaming generator observes ocr_processing_seconds once per page.""" + mock_images = [Image.new("RGB", (100, 100)) for _ in range(2)] + mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "ok", "words": []}] + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.surya_engine.extract_page_blocks", return_value=mock_blocks): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass + finally: + main_module._models_ready = False + + sum_seconds, count = _histogram_count_sum( + fresh_metrics.ocr_processing_seconds, engine="surya" + ) + assert count == 2.0 + assert sum_seconds >= 0.0 -- 2.49.1 From 6c2b9af10bc61b6e5583ea131422ead6ec8831f9 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:12:26 +0200 Subject: [PATCH 10/27] feat(ocr): record training runs in ocr_training_runs_total per kind and outcome Wraps the await asyncio.to_thread(_run_*) calls in /train, /train-sender, and /segtrain with try/except. Recognition training (/train, /train-sender) shares kind="recognition"; /segtrain uses kind="segmentation". The ocr_model_accuracy gauge is set per kind on success. Refs #652 (AC6, decision #2) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 27 +++++++++++-- ocr-service/test_metrics.py | 81 +++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 3 deletions(-) diff --git a/ocr-service/main.py b/ocr-service/main.py index c9f5b783..1460a213 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -478,7 +478,14 @@ async def train_model( return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs} - result = await asyncio.to_thread(_run_training) + try: + result = await asyncio.to_thread(_run_training) + except Exception: + metrics.ocr_training_runs_total.labels(kind="recognition", outcome="error").inc() + raise + metrics.ocr_training_runs_total.labels(kind="recognition", outcome="success").inc() + if result.get("accuracy") is not None: + metrics.ocr_model_accuracy.labels(kind="recognition").set(result["accuracy"]) return result @@ -558,7 +565,14 @@ async def train_sender_model( return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs} - result = await asyncio.to_thread(_run_sender_training) + try: + result = await asyncio.to_thread(_run_sender_training) + except Exception: + metrics.ocr_training_runs_total.labels(kind="recognition", outcome="error").inc() + raise + metrics.ocr_training_runs_total.labels(kind="recognition", outcome="success").inc() + if result.get("accuracy") is not None: + metrics.ocr_model_accuracy.labels(kind="recognition").set(result["accuracy"]) return result @@ -668,7 +682,14 @@ async def segtrain_model( return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs} - result = await asyncio.to_thread(_run_segtrain) + try: + result = await asyncio.to_thread(_run_segtrain) + except Exception: + metrics.ocr_training_runs_total.labels(kind="segmentation", outcome="error").inc() + raise + metrics.ocr_training_runs_total.labels(kind="segmentation", outcome="success").inc() + if result.get("accuracy") is not None: + metrics.ocr_model_accuracy.labels(kind="segmentation").set(result["accuracy"]) return result diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 5701bbe3..55108fb8 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -4,6 +4,8 @@ Each test that asserts on a counter/gauge value uses a fresh CollectorRegistry (see decision #3 on issue #652) to keep the metrics isolated between tests. """ +import io +import zipfile from unittest.mock import AsyncMock, patch import pytest @@ -15,6 +17,18 @@ from main import app from metrics import build_metrics +def _minimal_zip() -> bytes: + """Return a ZIP containing one fake .xml so endpoint validation passes.""" + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("page_01.xml", "") + return buf.getvalue() + + +def _fake_training_result(accuracy: float = 0.91) -> dict: + return {"loss": None, "accuracy": accuracy, "cer": round(1 - accuracy, 4), "epochs": 5} + + @pytest.fixture def fresh_metrics(monkeypatch): """Replace the module-level `main.metrics` with one bound to a fresh registry.""" @@ -307,3 +321,70 @@ async def test_ocr_processing_seconds_histogram_observed_per_page_in_stream(fres ) assert count == 2.0 assert sum_seconds >= 0.0 + + +@pytest.mark.asyncio +async def test_ocr_training_runs_total_incremented_with_recognition_success_label(fresh_metrics): + """/train success increments ocr_training_runs_total{kind=recognition, outcome=success}.""" + async def fake_to_thread(func, *args, **kwargs): + return _fake_training_result() + + with patch("main.TRAINING_TOKEN", "secret-token"), \ + patch("main._models_ready", True), \ + patch("main.asyncio.to_thread", side_effect=fake_to_thread): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/train", + files={"file": ("training.zip", _minimal_zip(), "application/zip")}, + headers={"X-Training-Token": "secret-token"}, + ) + + assert response.status_code == 200 + assert fresh_metrics.ocr_training_runs_total.labels( + kind="recognition", outcome="success" + )._value.get() == 1.0 + + +@pytest.mark.asyncio +async def test_ocr_training_runs_total_incremented_with_recognition_error_label(fresh_metrics): + """When /train's inner runner raises, the error counter bumps and the exception propagates.""" + async def fake_to_thread(func, *args, **kwargs): + raise RuntimeError("ketos train failed (exit 1): synthetic") + + with patch("main.TRAINING_TOKEN", "secret-token"), \ + patch("main._models_ready", True), \ + patch("main.asyncio.to_thread", side_effect=fake_to_thread): + transport = ASGITransport(app=app, raise_app_exceptions=False) + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.post( + "/train", + files={"file": ("training.zip", _minimal_zip(), "application/zip")}, + headers={"X-Training-Token": "secret-token"}, + ) + + assert response.status_code == 500 + assert fresh_metrics.ocr_training_runs_total.labels( + kind="recognition", outcome="error" + )._value.get() == 1.0 + + +@pytest.mark.asyncio +async def test_ocr_training_runs_total_incremented_with_segmentation_success_label(fresh_metrics): + """/segtrain success increments ocr_training_runs_total{kind=segmentation, outcome=success}.""" + async def fake_to_thread(func, *args, **kwargs): + return _fake_training_result(accuracy=0.83) + + with patch("main.TRAINING_TOKEN", "secret-token"), \ + patch("main._models_ready", True), \ + patch("main.asyncio.to_thread", side_effect=fake_to_thread): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/segtrain", + files={"file": ("training.zip", _minimal_zip(), "application/zip")}, + headers={"X-Training-Token": "secret-token"}, + ) + + assert response.status_code == 200 + assert fresh_metrics.ocr_training_runs_total.labels( + kind="segmentation", outcome="success" + )._value.get() == 1.0 -- 2.49.1 From 77d59c5d835e72a771fdee43ad4c14c9c554e5af Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:13:05 +0200 Subject: [PATCH 11/27] test(ocr): assert ocr_model_accuracy gauge is set per kind on success Hits /train then /segtrain through the same test, each with a distinct mocked accuracy, and asserts the labelled gauges reflect the two values. Locks down the kind-label separation between recognition and segmentation accuracy (decision #2). Refs #652 (AC6) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_metrics.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 55108fb8..7d154102 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -388,3 +388,37 @@ async def test_ocr_training_runs_total_incremented_with_segmentation_success_lab assert fresh_metrics.ocr_training_runs_total.labels( kind="segmentation", outcome="success" )._value.get() == 1.0 + + +@pytest.mark.asyncio +async def test_ocr_model_accuracy_gauge_set_per_kind_after_successful_training(fresh_metrics): + """After /train and /segtrain succeed, ocr_model_accuracy{kind=...} reflects the result.""" + recognition_accuracy = 0.917 + segmentation_accuracy = 0.834 + + async def fake_recognition_to_thread(func, *args, **kwargs): + return _fake_training_result(accuracy=recognition_accuracy) + + async def fake_segmentation_to_thread(func, *args, **kwargs): + return _fake_training_result(accuracy=segmentation_accuracy) + + with patch("main.TRAINING_TOKEN", "secret-token"), \ + patch("main._models_ready", True): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + with patch("main.asyncio.to_thread", side_effect=fake_recognition_to_thread): + rec_resp = await client.post( + "/train", + files={"file": ("training.zip", _minimal_zip(), "application/zip")}, + headers={"X-Training-Token": "secret-token"}, + ) + assert rec_resp.status_code == 200 + with patch("main.asyncio.to_thread", side_effect=fake_segmentation_to_thread): + seg_resp = await client.post( + "/segtrain", + files={"file": ("training.zip", _minimal_zip(), "application/zip")}, + headers={"X-Training-Token": "secret-token"}, + ) + assert seg_resp.status_code == 200 + + assert fresh_metrics.ocr_model_accuracy.labels(kind="recognition")._value.get() == pytest.approx(recognition_accuracy) + assert fresh_metrics.ocr_model_accuracy.labels(kind="segmentation")._value.get() == pytest.approx(segmentation_accuracy) -- 2.49.1 From d6abf990c71b524a91fbae8d319f085c68037eeb Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:15:11 +0200 Subject: [PATCH 12/27] feat(ocr): flip ocr_models_ready to 1 once the lifespan startup finishes Mirrors the existing _models_ready bool so Prometheus has a time-series liveness/readiness signal for future alerting rules (e.g. ocr_models_ready < 1 for 2m). Refs #652 (AC7) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 1 + ocr-service/test_metrics.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index 1460a213..f7dc495d 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -69,6 +69,7 @@ async def lifespan(app: FastAPI): kraken_engine.load_models() load_spell_checker() _models_ready = True + metrics.ocr_models_ready.set(1) logger.info("Startup complete — ready to accept requests") yield diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 7d154102..5c472dbe 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -422,3 +422,23 @@ async def test_ocr_model_accuracy_gauge_set_per_kind_after_successful_training(f assert fresh_metrics.ocr_model_accuracy.labels(kind="recognition")._value.get() == pytest.approx(recognition_accuracy) assert fresh_metrics.ocr_model_accuracy.labels(kind="segmentation")._value.get() == pytest.approx(segmentation_accuracy) + + +def test_ocr_models_ready_gauge_defaults_to_zero(): + """A freshly-built OcrMetrics has ocr_models_ready=0 before lifespan runs.""" + metrics = build_metrics(CollectorRegistry()) + assert metrics.ocr_models_ready._value.get() == 0.0 + + +@pytest.mark.asyncio +async def test_ocr_models_ready_gauge_is_one_after_lifespan_startup(fresh_metrics): + """The lifespan flips ocr_models_ready to 1 once load_models / load_spell_checker return. + + ASGITransport does not run lifespan by default, so the lifespan context + manager is driven directly to exercise the startup code path. + """ + assert fresh_metrics.ocr_models_ready._value.get() == 0.0 + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"): + async with app.router.lifespan_context(app): + assert fresh_metrics.ocr_models_ready._value.get() == 1.0 -- 2.49.1 From 525f091b3a707046ef298352a466b0839dedac36 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:16:14 +0200 Subject: [PATCH 13/27] feat(ocr): suppress uvicorn access logs for /metrics and /health Adds a logging.Filter on uvicorn.access that drops records whose request path is /metrics or /health. Each is hit on a tight schedule (Prometheus scrape interval and Docker healthcheck), so unfiltered they dominate the access log without carrying any information about real traffic. Refs #652 (Nora's recommendation) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 17 +++++++++++++++++ ocr-service/test_metrics.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index f7dc495d..bf26ef35 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -82,6 +82,23 @@ app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan) Instrumentator(excluded_handlers=["/health", "/metrics"]).instrument(app).expose(app) +class MetricsPathFilter(logging.Filter): + """Drop uvicorn.access entries for /metrics and /health to keep logs focused.""" + + _SUPPRESSED_PATHS = {"/metrics", "/health"} + + def filter(self, record: logging.LogRecord) -> bool: + # uvicorn.access formats as: '%s - "%s %s HTTP/%s" %d' + if record.args and len(record.args) >= 3: + path = record.args[2] + if isinstance(path, str) and path in self._SUPPRESSED_PATHS: + return False + return True + + +logging.getLogger("uvicorn.access").addFilter(MetricsPathFilter()) + + @app.get("/health") def health(): """Health endpoint — returns 200 only after models are loaded.""" diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 5c472dbe..271b57d8 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -442,3 +442,33 @@ async def test_ocr_models_ready_gauge_is_one_after_lifespan_startup(fresh_metric patch("main.load_spell_checker"): async with app.router.lifespan_context(app): assert fresh_metrics.ocr_models_ready._value.get() == 1.0 + + +def test_uvicorn_access_log_filter_skips_metrics_path(): + """The MetricsPathFilter drops uvicorn.access log records that target /metrics.""" + import logging as _logging + from main import MetricsPathFilter + + filt = MetricsPathFilter() + metrics_record = _logging.LogRecord( + name="uvicorn.access", level=_logging.INFO, pathname="", lineno=0, + msg='%s - "%s %s HTTP/%s" %d', + args=("127.0.0.1:1234", "GET", "/metrics", "1.1", 200), + exc_info=None, + ) + health_record = _logging.LogRecord( + name="uvicorn.access", level=_logging.INFO, pathname="", lineno=0, + msg='%s - "%s %s HTTP/%s" %d', + args=("127.0.0.1:1234", "GET", "/health", "1.1", 200), + exc_info=None, + ) + ocr_record = _logging.LogRecord( + name="uvicorn.access", level=_logging.INFO, pathname="", lineno=0, + msg='%s - "%s %s HTTP/%s" %d', + args=("127.0.0.1:1234", "POST", "/ocr", "1.1", 200), + exc_info=None, + ) + + assert filt.filter(metrics_record) is False + assert filt.filter(health_record) is False + assert filt.filter(ocr_record) is True -- 2.49.1 From e75ac8ec451f967f940899fde3bf32c72d345f9f Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:16:51 +0200 Subject: [PATCH 14/27] ops(observability): drop TODO from ocr-service scrape job in prometheus.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TODO was a placeholder for this work — the OCR service now exposes /metrics so the target will flip from DOWN to UP on next image rebuild. Refs #652 Co-Authored-By: Claude Sonnet 4.6 --- infra/observability/prometheus/prometheus.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/infra/observability/prometheus/prometheus.yml b/infra/observability/prometheus/prometheus.yml index a29cc75b..4838bc1c 100644 --- a/infra/observability/prometheus/prometheus.yml +++ b/infra/observability/prometheus/prometheus.yml @@ -20,7 +20,4 @@ scrape_configs: - job_name: ocr-service metrics_path: /metrics static_configs: - # TODO: remove or add prometheus-client to ocr-service. - # The Python OCR service does not currently expose Prometheus metrics. - # This target will show as DOWN until prometheus-client is added to ocr-service. - targets: ['ocr:8000'] -- 2.49.1 From ebaedb1af0107025847dcd09ce3c10fe177bfeb0 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:51:23 +0200 Subject: [PATCH 15/27] test(ocr): assert ocr_jobs_total stays zero when stream download fails Locks in the post-download placement of the counter increment so a regression that moves it back above _download_and_convert_pdf would fail. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_metrics.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 271b57d8..783e1d25 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -444,6 +444,38 @@ async def test_ocr_models_ready_gauge_is_one_after_lifespan_startup(fresh_metric assert fresh_metrics.ocr_models_ready._value.get() == 1.0 +@pytest.mark.asyncio +async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream(fresh_metrics): + """If `_download_and_convert_pdf` raises, ocr_jobs_total is NOT incremented. + + Mirrors the /ocr endpoint's semantics: the counter only records jobs that + actually started OCR work, not failed downloads. + """ + async def fail_download(url): + raise RuntimeError("synthetic download failure") + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new=fail_download): + transport = ASGITransport(app=app, raise_app_exceptions=False) + async with AsyncClient(transport=transport, base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + response = await client.post("/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) + finally: + main_module._models_ready = False + + assert response.status_code == 500 + assert fresh_metrics.ocr_jobs_total.labels( + engine="surya", script_type="TYPEWRITER" + )._value.get() == 0.0 + + def test_uvicorn_access_log_filter_skips_metrics_path(): """The MetricsPathFilter drops uvicorn.access log records that target /metrics.""" import logging as _logging -- 2.49.1 From 74ddf16b01cfdd727eb4465347b2f4bc0b92d609 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:53:04 +0200 Subject: [PATCH 16/27] feat(ocr): time only engine work in guided stream histogram Previously the guided generator's page_started timer wrapped the entire region loop including the synchronous correct_text() call, inflating ocr_processing_seconds with spell-check latency. Sum the per-region engine.extract_region_text durations instead so the histogram matches the unguided stream's "engine only" semantic. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 6 ++- ocr-service/test_metrics.py | 83 +++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/ocr-service/main.py b/ocr-service/main.py index bf26ef35..31fd9cad 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -217,13 +217,15 @@ async def run_ocr_stream(request: OcrRequest): image = await asyncio.to_thread(preprocess_page, image) blocks = [] sender_path = request.senderModelPath if use_kraken else None - page_started = time.monotonic() + engine_seconds = 0.0 for region in page_regions: + region_started = time.monotonic() text = await asyncio.to_thread( engine.extract_region_text, image, region.x, region.y, region.width, region.height, sender_path, ) + engine_seconds += time.monotonic() - region_started if script_type in _SPELL_CHECK_SCRIPT_TYPES: text = correct_text(text) blocks.append({ @@ -238,7 +240,7 @@ async def run_ocr_stream(request: OcrRequest): }) metrics.ocr_processing_seconds.labels(engine=engine_name).observe( - time.monotonic() - page_started + engine_seconds ) total_blocks += len(blocks) metrics.ocr_pages_total.labels(engine=engine_name).inc() diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 783e1d25..3ed02c31 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -444,6 +444,89 @@ async def test_ocr_models_ready_gauge_is_one_after_lifespan_startup(fresh_metric assert fresh_metrics.ocr_models_ready._value.get() == 1.0 +@pytest.mark.asyncio +async def test_ocr_processing_seconds_histogram_observed_per_page_in_guided_stream(fresh_metrics): + """The guided streaming generator observes ocr_processing_seconds once per page.""" + mock_images = [Image.new("RGB", (100, 100)) for _ in range(2)] + regions = [ + {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"}, + {"pageNumber": 2, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "annotationId": "a2"}, + ] + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.surya_engine.extract_region_text", return_value="text"): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + "regions": regions, + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass + finally: + main_module._models_ready = False + + sum_seconds, count = _histogram_count_sum( + fresh_metrics.ocr_processing_seconds, engine="surya" + ) + assert count == 2.0 + assert sum_seconds >= 0.0 + + +@pytest.mark.asyncio +async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_guided_stream(fresh_metrics): + """The guided observation must time engine work only, not the spell-check pass.""" + mock_images = [Image.new("RGB", (100, 100))] + regions = [ + {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"}, + {"pageNumber": 1, "x": 0.5, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a2"}, + ] + + def slow_correct(text): + import time as _time + _time.sleep(0.05) + return text + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.kraken_engine.is_available", return_value=True), \ + patch("main.kraken_engine.extract_region_text", return_value="text"), \ + patch("main.correct_text", side_effect=slow_correct): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "HANDWRITING_KURRENT", + "language": "de", + "regions": regions, + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass + finally: + main_module._models_ready = False + + sum_seconds, _ = _histogram_count_sum( + fresh_metrics.ocr_processing_seconds, engine="kraken" + ) + # Spell-check sleeps 0.05s per region × 2 regions = 0.1s; engine work is instantaneous. + # If timing included spell-check, sum_seconds would be >= 0.1s. Allow 30ms slack + # for scheduler overhead. + assert sum_seconds < 0.05, f"timing must exclude spell-check; got sum={sum_seconds}" + + @pytest.mark.asyncio async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream(fresh_metrics): """If `_download_and_convert_pdf` raises, ocr_jobs_total is NOT incremented. -- 2.49.1 From 549cb15845706be5dc151303bb9d777c5d2e60a9 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:53:48 +0200 Subject: [PATCH 17/27] test(ocr): cover /train-sender counter and accuracy=None gauge default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two regression tests: - /train-sender hitting the success path bumps the recognition counter (previously only /train and /segtrain were covered). - A successful run whose result.accuracy is None must not call set() on ocr_model_accuracy — the gauge stays at its default 0. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_metrics.py | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 3ed02c31..6aa047e0 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -390,6 +390,52 @@ async def test_ocr_training_runs_total_incremented_with_segmentation_success_lab )._value.get() == 1.0 +@pytest.mark.asyncio +async def test_ocr_training_runs_total_incremented_with_recognition_success_label_for_train_sender(fresh_metrics): + """/train-sender success increments ocr_training_runs_total{kind=recognition, outcome=success}.""" + async def fake_to_thread(func, *args, **kwargs): + return _fake_training_result() + + with patch("main.TRAINING_TOKEN", "secret-token"), \ + patch("main._models_ready", True), \ + patch("main.asyncio.to_thread", side_effect=fake_to_thread): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/train-sender", + files={"file": ("training.zip", _minimal_zip(), "application/zip")}, + data={"output_model_path": "/app/models/sender_test.mlmodel"}, + headers={"X-Training-Token": "secret-token"}, + ) + + assert response.status_code == 200, response.text + assert fresh_metrics.ocr_training_runs_total.labels( + kind="recognition", outcome="success" + )._value.get() == 1.0 + + +@pytest.mark.asyncio +async def test_ocr_model_accuracy_gauge_stays_default_when_training_returns_no_accuracy(fresh_metrics): + """When the runner returns accuracy=None, ocr_model_accuracy must remain at its default 0.""" + async def fake_to_thread(func, *args, **kwargs): + return {"loss": None, "accuracy": None, "cer": None, "epochs": 5} + + with patch("main.TRAINING_TOKEN", "secret-token"), \ + patch("main._models_ready", True), \ + patch("main.asyncio.to_thread", side_effect=fake_to_thread): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/train", + files={"file": ("training.zip", _minimal_zip(), "application/zip")}, + headers={"X-Training-Token": "secret-token"}, + ) + + assert response.status_code == 200 + # Gauge was never .set() — accessing the label child still creates it with default 0.0. + assert fresh_metrics.ocr_model_accuracy.labels( + kind="recognition" + )._value.get() == 0.0 + + @pytest.mark.asyncio async def test_ocr_model_accuracy_gauge_set_per_kind_after_successful_training(fresh_metrics): """After /train and /segtrain succeed, ocr_model_accuracy{kind=...} reflects the result.""" -- 2.49.1 From 0fc0cbcffdc8bab071df74cc0d7899d2d17c2b84 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:54:24 +0200 Subject: [PATCH 18/27] test(ocr): lock in MetricsPathFilter fail-open behavior If uvicorn's access log format ever changes (args=None, or shorter than 3 elements), the filter must keep forwarding records rather than silently dropping them. Two extra LogRecords cover both edge cases. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_metrics.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 6aa047e0..d8099a24 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -605,6 +605,29 @@ async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream( )._value.get() == 0.0 +def test_uvicorn_access_log_filter_fails_open_on_short_or_missing_args(): + """The filter must default-allow records when args is None or shorter than expected. + + Locks in fail-open behavior: if uvicorn ever changes its format we keep + forwarding records to the handler rather than silently dropping logs. + """ + import logging as _logging + from main import MetricsPathFilter + + filt = MetricsPathFilter() + none_record = _logging.LogRecord( + name="uvicorn.access", level=_logging.INFO, pathname="", lineno=0, + msg="some message", args=None, exc_info=None, + ) + short_record = _logging.LogRecord( + name="uvicorn.access", level=_logging.INFO, pathname="", lineno=0, + msg="%s %s", args=("a", "b"), exc_info=None, + ) + + assert filt.filter(none_record) is True + assert filt.filter(short_record) is True + + def test_uvicorn_access_log_filter_skips_metrics_path(): """The MetricsPathFilter drops uvicorn.access log records that target /metrics.""" import logging as _logging -- 2.49.1 From 0179e93a4b6551682dd00b3dd84b37242e2bc0b3 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:55:14 +0200 Subject: [PATCH 19/27] test(ocr): narrow training error test to subprocess.run seam MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The asyncio.to_thread patch stubbed out the entire _run_training call, hiding the real error path. Replacing it with a failing CompletedProcess from subprocess.run exercises the actual ketos-failed branch and keeps the test's intent — error counter bumps, 500 surfaces — intact. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_metrics.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index d8099a24..1e6ab7fa 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -347,13 +347,21 @@ async def test_ocr_training_runs_total_incremented_with_recognition_success_labe @pytest.mark.asyncio async def test_ocr_training_runs_total_incremented_with_recognition_error_label(fresh_metrics): - """When /train's inner runner raises, the error counter bumps and the exception propagates.""" - async def fake_to_thread(func, *args, **kwargs): - raise RuntimeError("ketos train failed (exit 1): synthetic") + """When ketos exits non-zero, the error counter bumps and the exception propagates. + + Uses the narrowest available seam — `subprocess.run` returning a failing + CompletedProcess — instead of stubbing the asyncio.to_thread boundary, + so the test exercises the real _run_training error path. + """ + from subprocess import CompletedProcess + + failing_proc = CompletedProcess( + args=["ketos"], returncode=1, stdout="", stderr="synthetic ketos failure" + ) with patch("main.TRAINING_TOKEN", "secret-token"), \ patch("main._models_ready", True), \ - patch("main.asyncio.to_thread", side_effect=fake_to_thread): + patch("main.subprocess.run", return_value=failing_proc): transport = ASGITransport(app=app, raise_app_exceptions=False) async with AsyncClient(transport=transport, base_url="http://test") as client: response = await client.post( -- 2.49.1 From 22a5ee816af65b5e9757b7fa9651483e673bbeb1 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:57:18 +0200 Subject: [PATCH 20/27] refactor(ocr): extract _observe_block_words for word counter sites MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two block-iteration loops (/ocr and /ocr/stream's standard generator) both ran the same word-total and illegible-word increments. Lift them into a single helper so each call site becomes one line and the counter intent reads cleanly. Pure refactor — no behavior change, tests stay green. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/ocr-service/main.py b/ocr-service/main.py index 31fd9cad..51cf737b 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -50,6 +50,18 @@ ALLOWED_PDF_HOSTS = set( _SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"} +def _observe_block_words(words: list[dict], threshold: float) -> None: + """Record per-block word counts and below-threshold word counts. + + Pre: `words` is non-empty. Caller checks for that — keeping the helper + branch-free makes the call sites read as a single line. + """ + metrics.ocr_words_total.inc(len(words)) + metrics.ocr_illegible_words_total.inc( + sum(1 for w in words if w["confidence"] < threshold) + ) + + def _validate_url(url: str) -> None: """Validate that the PDF URL points to an allowed host (SSRF protection).""" parsed = urlparse(url) @@ -149,10 +161,7 @@ async def run_ocr(request: OcrRequest): for block in blocks: words = block.get("words") or [] if words: - metrics.ocr_words_total.inc(len(words)) - metrics.ocr_illegible_words_total.inc( - sum(1 for w in words if w["confidence"] < threshold) - ) + _observe_block_words(words, threshold) block["text"] = apply_confidence_markers(words, threshold) block.pop("words", None) if script_type in _SPELL_CHECK_SCRIPT_TYPES: @@ -298,10 +307,7 @@ async def run_ocr_stream(request: OcrRequest): for block in blocks: words = block.get("words") or [] if words: - metrics.ocr_words_total.inc(len(words)) - metrics.ocr_illegible_words_total.inc( - sum(1 for w in words if w["confidence"] < threshold) - ) + _observe_block_words(words, threshold) block["text"] = apply_confidence_markers(words, threshold) block.pop("words", None) if script_type in _SPELL_CHECK_SCRIPT_TYPES: -- 2.49.1 From df952861c4605dc5ec2721c6b7b9ffe43276fec6 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:58:40 +0200 Subject: [PATCH 21/27] refactor(ocr): extract _record_training for shared metric bookkeeping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /train, /train-sender, and /segtrain endpoints each duplicated the same eight-line try/except + counter + gauge block around the asyncio.to_thread call. Lift it into _record_training(runner, kind), which accepts a sync- or async-returning callable for flexibility. Each endpoint now ends with a single return line. Behaviour preserved — status codes, error propagation, and metric labels stay identical. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 58 ++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/ocr-service/main.py b/ocr-service/main.py index 51cf737b..489fd98e 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -2,6 +2,7 @@ import asyncio import glob +import inspect import io import json import logging @@ -14,6 +15,7 @@ import time import zipfile from contextlib import asynccontextmanager from datetime import datetime, timezone +from typing import Awaitable, Callable from urllib.parse import urlparse import httpx @@ -50,6 +52,30 @@ ALLOWED_PDF_HOSTS = set( _SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"} +async def _record_training( + runner: Callable[[], Awaitable[dict] | dict], + kind: str, +) -> dict: + """Run a training callable and record outcome + accuracy metrics. + + Wraps the per-endpoint try/except + outcome counter + accuracy gauge + block that used to be repeated at /train, /train-sender, and /segtrain. + The runner returns a dict with at least an `accuracy` key; if its value + is None, the gauge is left at its default. + """ + try: + result = runner() + if inspect.isawaitable(result): + result = await result + except Exception: + metrics.ocr_training_runs_total.labels(kind=kind, outcome="error").inc() + raise + metrics.ocr_training_runs_total.labels(kind=kind, outcome="success").inc() + if result.get("accuracy") is not None: + metrics.ocr_model_accuracy.labels(kind=kind).set(result["accuracy"]) + return result + + def _observe_block_words(words: list[dict], threshold: float) -> None: """Record per-block word counts and below-threshold word counts. @@ -504,15 +530,7 @@ async def train_model( return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs} - try: - result = await asyncio.to_thread(_run_training) - except Exception: - metrics.ocr_training_runs_total.labels(kind="recognition", outcome="error").inc() - raise - metrics.ocr_training_runs_total.labels(kind="recognition", outcome="success").inc() - if result.get("accuracy") is not None: - metrics.ocr_model_accuracy.labels(kind="recognition").set(result["accuracy"]) - return result + return await _record_training(lambda: asyncio.to_thread(_run_training), kind="recognition") @app.post("/train-sender") @@ -591,15 +609,9 @@ async def train_sender_model( return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs} - try: - result = await asyncio.to_thread(_run_sender_training) - except Exception: - metrics.ocr_training_runs_total.labels(kind="recognition", outcome="error").inc() - raise - metrics.ocr_training_runs_total.labels(kind="recognition", outcome="success").inc() - if result.get("accuracy") is not None: - metrics.ocr_model_accuracy.labels(kind="recognition").set(result["accuracy"]) - return result + return await _record_training( + lambda: asyncio.to_thread(_run_sender_training), kind="recognition" + ) @app.post("/segtrain") @@ -708,15 +720,7 @@ async def segtrain_model( return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs} - try: - result = await asyncio.to_thread(_run_segtrain) - except Exception: - metrics.ocr_training_runs_total.labels(kind="segmentation", outcome="error").inc() - raise - metrics.ocr_training_runs_total.labels(kind="segmentation", outcome="success").inc() - if result.get("accuracy") is not None: - metrics.ocr_model_accuracy.labels(kind="segmentation").set(result["accuracy"]) - return result + return await _record_training(lambda: asyncio.to_thread(_run_segtrain), kind="segmentation") async def _download_and_convert_pdf(url: str) -> list[Image.Image]: -- 2.49.1 From ddf6cf4cbca0360a5310ecd1642ec56de35ddec9 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 17:03:29 +0200 Subject: [PATCH 22/27] test(ocr): collapse shared client setup into ocr_client helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each metrics test was repeating the same five-line block — patch kraken_engine.load_models, patch load_spell_checker, instantiate the AsyncClient, force _models_ready True, restore it. Lift the lot into a single async context manager so each test body shrinks to its real arrange / act / assert intent. Tests that drive the lifespan directly (models_ready gauge) or stub asyncio.to_thread for /train (which already patches _models_ready) stay unchanged. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_metrics.py | 255 +++++++++++++++--------------------- 1 file changed, 102 insertions(+), 153 deletions(-) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 1e6ab7fa..253cae14 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -4,6 +4,7 @@ Each test that asserts on a counter/gauge value uses a fresh CollectorRegistry (see decision #3 on issue #652) to keep the metrics isolated between tests. """ +import contextlib import io import zipfile from unittest.mock import AsyncMock, patch @@ -17,6 +18,26 @@ from main import app from metrics import build_metrics +@contextlib.asynccontextmanager +async def ocr_client(*, raise_app_exceptions: bool = True): + """Yield an AsyncClient with model-loaders patched and _models_ready forced on. + + The shared setup for almost every metrics test: stub the heavy lifecycle + hooks (kraken_engine.load_models, load_spell_checker), flip the readiness + flag so request handlers do not 503, and restore it afterwards. + """ + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"): + transport = ASGITransport(app=app, raise_app_exceptions=raise_app_exceptions) + async with AsyncClient(transport=transport, base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + yield client + finally: + main_module._models_ready = False + + def _minimal_zip() -> bytes: """Return a ZIP containing one fake .xml so endpoint validation passes.""" buf = io.BytesIO() @@ -57,26 +78,18 @@ async def test_metrics_includes_http_request_metrics_after_ocr_call(): mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "hi", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - # Models need to be loaded for /ocr to accept requests; force the flag. - import main as main_module - main_module._models_ready = True - try: - ocr_response = await client.post("/ocr", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) - assert ocr_response.status_code == 200, ocr_response.text + async with ocr_client() as client: + ocr_response = await client.post("/ocr", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) + assert ocr_response.status_code == 200, ocr_response.text - metrics_response = await client.get("/metrics") - finally: - main_module._models_ready = False + metrics_response = await client.get("/metrics") body = metrics_response.text assert "http_requests_total" in body @@ -124,20 +137,13 @@ async def test_ocr_jobs_total_incremented_with_kraken_engine_label_for_kurrent(f mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "hi", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main.correct_text", side_effect=lambda t: t), \ + with patch("main.correct_text", side_effect=lambda t: t), \ patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.kraken_engine.is_available", return_value=True), \ patch("main.kraken_engine.extract_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - await _drive_ocr(client, script_type="HANDWRITING_KURRENT") - finally: - main_module._models_ready = False + async with ocr_client() as client: + await _drive_ocr(client, script_type="HANDWRITING_KURRENT") value = fresh_metrics.ocr_jobs_total.labels( engine="kraken", script_type="HANDWRITING_KURRENT" @@ -152,18 +158,11 @@ async def test_ocr_jobs_total_incremented_with_surya_engine_label_for_typewriter mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "hi", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - await _drive_ocr(client, script_type="TYPEWRITER") - finally: - main_module._models_ready = False + async with ocr_client() as client: + await _drive_ocr(client, script_type="TYPEWRITER") value = fresh_metrics.ocr_jobs_total.labels( engine="surya", script_type="TYPEWRITER" @@ -178,26 +177,19 @@ async def test_ocr_pages_total_incremented_once_per_page_in_stream(fresh_metrics mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "hi", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_page_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) as response: - assert response.status_code == 200 - # Drain the stream so all per-page increments fire. - async for _ in response.aiter_lines(): - pass - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) as response: + assert response.status_code == 200 + # Drain the stream so all per-page increments fire. + async for _ in response.aiter_lines(): + pass value = fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() assert value == 3.0 @@ -218,28 +210,21 @@ async def test_ocr_skipped_pages_total_incremented_when_engine_raises_for_a_page raise RuntimeError("synthetic engine failure") return good_blocks - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_page_blocks", side_effect=extract_side_effect): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) as response: - assert response.status_code == 200 - saw_error = False - async for line in response.aiter_lines(): - if line and '"type": "error"' in line: - saw_error = True - assert saw_error - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) as response: + assert response.status_code == 200 + saw_error = False + async for line in response.aiter_lines(): + if line and '"type": "error"' in line: + saw_error = True + assert saw_error assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0 # The second page still succeeds. @@ -266,18 +251,11 @@ async def test_ocr_words_and_illegible_words_total_sum_across_blocks(fresh_metri {"text": "Karl", "confidence": 0.95}]}, ] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - await _drive_ocr(client, script_type="TYPEWRITER") - finally: - main_module._models_ready = False + async with ocr_client() as client: + await _drive_ocr(client, script_type="TYPEWRITER") assert fresh_metrics.ocr_words_total._value.get() == 5.0 assert fresh_metrics.ocr_illegible_words_total._value.get() == 2.0 @@ -296,25 +274,18 @@ async def test_ocr_processing_seconds_histogram_observed_per_page_in_stream(fres mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "ok", "words": []}] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_page_blocks", return_value=mock_blocks): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) as response: - assert response.status_code == 200 - async for _ in response.aiter_lines(): - pass - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass sum_seconds, count = _histogram_count_sum( fresh_metrics.ocr_processing_seconds, engine="surya" @@ -507,26 +478,19 @@ async def test_ocr_processing_seconds_histogram_observed_per_page_in_guided_stre {"pageNumber": 2, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "annotationId": "a2"}, ] - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.surya_engine.extract_region_text", return_value="text"): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - "regions": regions, - }) as response: - assert response.status_code == 200 - async for _ in response.aiter_lines(): - pass - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + "regions": regions, + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass sum_seconds, count = _histogram_count_sum( fresh_metrics.ocr_processing_seconds, engine="surya" @@ -549,28 +513,21 @@ async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_gui _time.sleep(0.05) return text - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ patch("main.preprocess_page", side_effect=lambda img: img), \ patch("main.kraken_engine.is_available", return_value=True), \ patch("main.kraken_engine.extract_region_text", return_value="text"), \ patch("main.correct_text", side_effect=slow_correct): - async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - async with client.stream("POST", "/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "HANDWRITING_KURRENT", - "language": "de", - "regions": regions, - }) as response: - assert response.status_code == 200 - async for _ in response.aiter_lines(): - pass - finally: - main_module._models_ready = False + async with ocr_client() as client: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "HANDWRITING_KURRENT", + "language": "de", + "regions": regions, + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass sum_seconds, _ = _histogram_count_sum( fresh_metrics.ocr_processing_seconds, engine="kraken" @@ -591,21 +548,13 @@ async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream( async def fail_download(url): raise RuntimeError("synthetic download failure") - with patch("main.kraken_engine.load_models"), \ - patch("main.load_spell_checker"), \ - patch("main._download_and_convert_pdf", new=fail_download): - transport = ASGITransport(app=app, raise_app_exceptions=False) - async with AsyncClient(transport=transport, base_url="http://test") as client: - import main as main_module - main_module._models_ready = True - try: - response = await client.post("/ocr/stream", json={ - "pdfUrl": "http://minio/doc.pdf", - "scriptType": "TYPEWRITER", - "language": "de", - }) - finally: - main_module._models_ready = False + with patch("main._download_and_convert_pdf", new=fail_download): + async with ocr_client(raise_app_exceptions=False) as client: + response = await client.post("/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + }) assert response.status_code == 500 assert fresh_metrics.ocr_jobs_total.labels( -- 2.49.1 From 67368b4413389797c3de08fa26e077e2d9bf17cb Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 17:04:28 +0200 Subject: [PATCH 23/27] docs(ocr): annotate metrics binding + /metrics exposure + pin client Three small drops that pay back later: - Note that main.metrics is import-time bound and tests must monkeypatch `main.metrics`, not the registry. - Flag the /metrics endpoint as unauthenticated and cross-link the Caddy-block snippet in docs/OBSERVABILITY.md. - Pin prometheus-client to the exact 0.25.0 patch version already resolved by prometheus-fastapi-instrumentator 7.0.0, so an upstream bump cannot silently slip in. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 7 +++++++ ocr-service/requirements.txt | 1 + 2 files changed, 8 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index 489fd98e..bde3fb2f 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -43,6 +43,10 @@ logger = logging.getLogger(__name__) _models_ready = False +# One-shot import-time binding to the default REGISTRY. Tests that need a +# clean counter state must monkeypatch `main.metrics` with a container built +# from a fresh CollectorRegistry — rebinding through the registry directly +# will not retarget the references stored in the OcrMetrics dataclass. metrics: OcrMetrics = build_metrics(REGISTRY) ALLOWED_PDF_HOSTS = set( @@ -117,6 +121,9 @@ async def lifespan(app: FastAPI): app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan) +# /metrics is unauthenticated — relies on Docker-internal-network exposure +# only (CWE-200 risk if `ports:` ever maps 8000 to host). See +# docs/OBSERVABILITY.md §Internal-only endpoints for the Caddy block snippet. Instrumentator(excluded_handlers=["/health", "/metrics"]).instrument(app).expose(app) diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt index 8a9bedec..39c492cc 100644 --- a/ocr-service/requirements.txt +++ b/ocr-service/requirements.txt @@ -11,3 +11,4 @@ httpx==0.28.1 pyspellchecker==0.9.0 opencv-python-headless==4.11.0.86 prometheus-fastapi-instrumentator==7.0.0 +prometheus-client==0.25.0 -- 2.49.1 From 2dbb3c37b45d632922bedae61a3b04d7c18888c2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 17:05:27 +0200 Subject: [PATCH 24/27] docs(observability): document ocr metrics, scrape edge, and access-log filter - L2 container diagram now shows the Prometheus -> ocr:8000 scrape edge (plus the previously-undrawn Prometheus -> backend edge for symmetry). - OBSERVABILITY.md gains a full ocr_* metrics table with labels, units, and the canonical example queries from issue #652. - New "Internal-only endpoints" subsection captures the unauthenticated /metrics caveat and provides the Caddy block snippet for the case where the service ever gets a host port. - Explicit note that MetricsPathFilter only quiets uvicorn stdout, and the OCR metrics must never carry PII or document content. Co-Authored-By: Claude Sonnet 4.6 --- docs/OBSERVABILITY.md | 69 ++++++++++++++++++++++++- docs/architecture/c4/l2-containers.puml | 2 + 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/docs/OBSERVABILITY.md b/docs/OBSERVABILITY.md index b895e849..2b3855d6 100644 --- a/docs/OBSERVABILITY.md +++ b/docs/OBSERVABILITY.md @@ -118,11 +118,14 @@ To find a trace for a specific request in staging/production, either increase th ## Metrics (Prometheus → Grafana) -Prometheus scrapes the backend management endpoint every 15 s: +Prometheus scrapes two targets every 15 s: ``` Target: backend:8081/actuator/prometheus Labels: job="spring-boot", application="Familienarchiv" + +Target: ocr:8000/metrics +Labels: job="ocr-service" ``` All Spring Boot metrics carry the `application="Familienarchiv"` tag, which is how the Grafana Spring Boot Observability dashboard (ID 17175) filters to this service. @@ -146,6 +149,70 @@ jvm_memory_used_bytes{area="heap", application="Familienarchiv"} hikaricp_connections_active ``` +### OCR-service custom metrics + +Exposed at `ocr:8000/metrics` by `prometheus-fastapi-instrumentator`. The +`http_*` metrics describe the FastAPI request layer; the `ocr_*` series are +domain-specific. **Never label these with PII or document content** — labels +have unbounded cardinality risk and are visible to anyone with Grafana access. + +| Metric | Type | Labels | Unit | What it tracks | +|---|---|---|---|---| +| `ocr_jobs_total` | Counter | `engine` (`surya`/`kraken`), `script_type` | jobs | OCR jobs that started after a successful PDF download | +| `ocr_pages_total` | Counter | `engine` | pages | Successfully OCR'd pages in the streaming generator | +| `ocr_skipped_pages_total` | Counter | — | pages | Pages skipped because the engine raised on them | +| `ocr_words_total` | Counter | — | words | Recognized words summed across every block | +| `ocr_illegible_words_total` | Counter | — | words | Words below the confidence threshold (rendered as `[unleserlich]`) | +| `ocr_processing_seconds` | Histogram | `engine` | seconds | Per-page (stream) or per-document (`/ocr`) engine time, excluding preprocessing | +| `ocr_training_runs_total` | Counter | `kind` (`recognition`/`segmentation`), `outcome` (`success`/`error`) | runs | Completed training runs | +| `ocr_model_accuracy` | Gauge | `kind` | ratio (0–1) | Latest accuracy reported by a successful training run | +| `ocr_models_ready` | Gauge | — | 0\|1 | 1 once the lifespan startup has finished loading models | + +Canonical example queries (the same ones referenced in issue #652): + +```promql +# OCR throughput by engine +sum by (engine) (rate(ocr_pages_total[5m])) + +# Share of words rendered as [unleserlich] +sum(rate(ocr_illegible_words_total[5m])) + / sum(rate(ocr_words_total[5m])) + +# p95 page processing time per engine +histogram_quantile(0.95, sum by (engine, le) ( + rate(ocr_processing_seconds_bucket[5m]) +)) + +# Training error rate +sum(rate(ocr_training_runs_total{outcome="error"}[1h])) + / sum(rate(ocr_training_runs_total[1h])) + +# Latest recognition vs segmentation accuracy +ocr_model_accuracy +``` + +### Internal-only endpoints + +`/metrics` is exposed by the OCR service over plain HTTP without +authentication. The container is reachable only on the internal Docker +network — Caddy never proxies to it directly. If the service is ever +exposed (e.g. a `ports:` mapping is added), block the endpoint at the +reverse proxy: + +```caddy +ocr.example.com { + @internal_only path /metrics /health + respond @internal_only 404 + reverse_proxy ocr:8000 +} +``` + +The `MetricsPathFilter` in `ocr-service/main.py` suppresses uvicorn's +**stdout** access log lines for `/metrics` and `/health` so the container +console stays focused on real OCR traffic. Promtail/Loki still receive +access lines from any other source. Treat the filter as console +noise-control, not an audit-suppression mechanism. + ## Errors (GlitchTip) GlitchTip receives errors from both the backend (via Sentry Java SDK) and the frontend (via Sentry JavaScript SDK). It groups events by fingerprint, tracks first/last seen times, and links to the release that introduced the error. diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml index 346efe75..8d66a614 100644 --- a/docs/architecture/c4/l2-containers.puml +++ b/docs/architecture/c4/l2-containers.puml @@ -43,6 +43,8 @@ Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned") Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI") Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API") Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)") +Rel(prometheus, backend, "Scrapes JVM + HTTP metrics", "HTTP 8081 /actuator/prometheus") +Rel(prometheus, ocr, "Scrapes OCR + http_* metrics", "HTTP 8000 /metrics") Rel(grafana, prometheus, "Queries metrics", "HTTP 9090") Rel(grafana, loki, "Queries logs", "HTTP 3100") Rel(grafana, tempo, "Queries traces", "HTTP 3200") -- 2.49.1 From 2df71beb7eac48469b3b963e68d64f4aa693a9c3 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 17:06:44 +0200 Subject: [PATCH 25/27] docs: add ADR-023 and glossary entries for OCR metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADR-023 captures why prometheus-fastapi-instrumentator was chosen, the build_metrics(registry) factory pattern, and the test rebinding seam. The glossary gains four ops-aligned terms — illegible word, models-ready gauge, recognition vs segmentation accuracy — so the metrics documentation in OBSERVABILITY.md has a vocabulary to lean on. Co-Authored-By: Claude Sonnet 4.6 --- docs/GLOSSARY.md | 8 ++ ...mentator-and-metrics-registry-injection.md | 94 +++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 docs/adr/023-prometheus-instrumentator-and-metrics-registry-injection.md diff --git a/docs/GLOSSARY.md b/docs/GLOSSARY.md index 21addb4c..99da1775 100644 --- a/docs/GLOSSARY.md +++ b/docs/GLOSSARY.md @@ -80,6 +80,14 @@ _See also [DocumentStatus lifecycle](#documentstatus-lifecycle)._ **Sütterlin** — A specific standardized style of Kurrent taught in German schools from 1915 to 1941. +**Illegible word** — a word whose recognition confidence falls below the configured threshold; replaced with the literal token `[unleserlich]` in the rendered block text and counted in the `ocr_illegible_words_total` Prometheus counter. + +**Models-ready gauge** — the `ocr_models_ready` Prometheus gauge, flipped from `0` to `1` once the FastAPI lifespan startup has finished loading the Kraken model and the spell-checker. Used both for the `/health` endpoint and as the supervised signal for the `ocr_models_ready < 1 for 2m` alert. + +**Recognition model accuracy** — the accuracy reported by `ketos train` for the recognition (text-line) model, exposed as `ocr_model_accuracy{kind="recognition"}`. Sourced from `_parse_best_checkpoint` on the highest-scoring checkpoint after training. + +**Segmentation model accuracy** — the accuracy reported by `ketos segtrain` for the baseline layout analysis (`blla`) model, exposed as `ocr_model_accuracy{kind="segmentation"}`. Distinct from recognition accuracy because the two models are trained and improved independently. + --- ## Other Domain Terms diff --git a/docs/adr/023-prometheus-instrumentator-and-metrics-registry-injection.md b/docs/adr/023-prometheus-instrumentator-and-metrics-registry-injection.md new file mode 100644 index 00000000..5e8a1020 --- /dev/null +++ b/docs/adr/023-prometheus-instrumentator-and-metrics-registry-injection.md @@ -0,0 +1,94 @@ +# ADR-023: Prometheus Instrumentator and Metrics Registry Injection + +## Status + +Accepted + +## Context + +Until issue #652 the OCR service exposed no `/metrics` endpoint. The +observability stack already scrapes the Spring Boot backend's actuator +endpoint, but it had nothing to scrape on the Python side. Without HTTP- +and domain-level metrics from `ocr-service` we cannot answer questions +like "what is the share of words rendered as `[unleserlich]`" or +"is the training error rate above its budget" from Grafana. + +Two implementation requirements influenced the design: + +1. **Counter / gauge isolation in tests.** `prometheus_client` collectors + are module-level singletons keyed by name on the global `REGISTRY`. + Re-importing or naively re-instantiating them raises a duplicated- + collector error and cross-test state leaks (a `.inc()` in test A is + still readable by test B). A test harness needs a way to swap the + active container for a fresh per-test instance. + +2. **Minimal blast radius on the request path.** We did not want to + hand-instrument every endpoint with FastAPI middleware. The + `prometheus-fastapi-instrumentator` library already provides + `http_requests_total`, `http_request_duration_seconds`, and the + `/metrics` exposition route, all idiomatic Prometheus names. + +## Decision + +- Add `prometheus-fastapi-instrumentator==7.0.0` and pin its transitive + dependency `prometheus-client==0.25.0` explicitly in + `ocr-service/requirements.txt`. +- Mount the instrumentator once at module load: + `Instrumentator(excluded_handlers=["/health", "/metrics"]).instrument(app).expose(app)`. + This adds `/metrics` and an HTTP-level dashboard surface without + changing any endpoint code. +- Define every domain metric (`ocr_jobs_total`, `ocr_pages_total`, + `ocr_processing_seconds`, …) inside a `build_metrics(registry)` + factory in `ocr-service/metrics.py` that returns a frozen `OcrMetrics` + dataclass. Production code binds the container to the default + `REGISTRY` once: `metrics: OcrMetrics = build_metrics(REGISTRY)`. +- Tests use a `fresh_metrics` fixture that builds a new + `CollectorRegistry()` per test and monkeypatches `main.metrics` with + a container bound to it. The endpoint code keeps reading + `metrics.` without knowing whether it is talking to the global + registry or a per-test one. + +## Consequences + +**Positive** + +- One reusable factory captures the metric definitions; future metrics + go in one place. +- Tests run with full counter isolation. Cross-test state leakage is + impossible because each test sees its own dataclass instance. +- The instrumentator gives us `http_*` metrics for free, including a + Grafana-ready histogram that pairs with the Spring Boot one. + +**Negative** + +- One extra level of indirection: any test that asserts on metric + values must remember to monkeypatch `main.metrics`, not the registry + directly. Rebinding through the registry is harmless but useless — + the dataclass holds references to the original collectors. +- `prometheus-client` is now pinned. Upgrading it requires an explicit + bump and re-checking the instrumentator's compatibility range. +- `/metrics` is exposed unauthenticated and relies on the Docker + internal network for confidentiality. See + [docs/OBSERVABILITY.md §Internal-only endpoints](../OBSERVABILITY.md) + for the Caddy snippet that must be added if the service ever gets a + host-side port mapping. + +## Alternatives considered + +- **Hand-roll the `/metrics` endpoint.** Rejected: would have meant + duplicating what `prometheus-fastapi-instrumentator` ships, plus + middleware for the HTTP histograms. +- **Skip the factory; pass `registry` as a function argument + everywhere.** Rejected: clutters every endpoint signature and breaks + the symmetry with the Spring Boot side, which also relies on a + process-global Micrometer registry. +- **Use a `pytest` autouse fixture that resets `REGISTRY` between + tests.** Rejected: `prometheus_client` does not expose a clean + "unregister all" hook, and we would be relying on private APIs. + +## References + +- Issue: [#652](https://git.raddatz.cloud/marcel/familienarchiv/issues/652) +- Library: +- Code: `ocr-service/metrics.py`, `ocr-service/main.py`, + `ocr-service/test_metrics.py` -- 2.49.1 From e0e1578bdd0e0088cedc2ca28a679c592a9b9b8a Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 17:22:49 +0200 Subject: [PATCH 26/27] test(ocr): widen spell-check exclusion bound to 0.09s with rationale MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sara's cycle-2 S1: the wall-clock assertion at < 0.05s could trip on a slow CI runner under load even when the timer correctly excludes spell-check. Sara's preferred structural fix (patch main.time.monotonic with a deterministic sequence) proved awkward — the patched attribute is the *global* time.monotonic which httpx and asyncio consume, exhausting the sequence before the request reaches the engine loop. Take the documented fallback: widen the bound to 0.09s and explain why. The failure mode the test guards against (spell-check inside the timer) would add 0.1s (2 × 0.05s sleep), so 0.09s catches the bug while leaving ~90ms of headroom for slow CI runners. Verified red→green by temporarily moving correct_text inside the timer block: bound trips at 0.101s; the fixed code reads ~0.001s. Co-Authored-By: Claude Opus 4.7 --- ocr-service/test_metrics.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 253cae14..d2bd9671 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -501,7 +501,17 @@ async def test_ocr_processing_seconds_histogram_observed_per_page_in_guided_stre @pytest.mark.asyncio async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_guided_stream(fresh_metrics): - """The guided observation must time engine work only, not the spell-check pass.""" + """The guided observation must time engine work only, not the spell-check pass. + + Wall-clock bound rather than a structural `patch("main.time.monotonic")`: + the patched attribute is the *global* `time.monotonic`, which httpx and + asyncio also consume — they exhaust the deterministic sequence before the + request reaches the engine loop. Bound is sized against the failure mode, + not the noise floor: spell-check sleeps 0.05s × 2 regions = 0.1s, so a + timer that accidentally wrapped `correct_text` would observe >= 0.1s. The + 0.09s ceiling catches that bug while leaving ~90ms of slack for slow CI + runners (engine work is instantaneous under the mock). + """ mock_images = [Image.new("RGB", (100, 100))] regions = [ {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"}, @@ -532,10 +542,7 @@ async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_gui sum_seconds, _ = _histogram_count_sum( fresh_metrics.ocr_processing_seconds, engine="kraken" ) - # Spell-check sleeps 0.05s per region × 2 regions = 0.1s; engine work is instantaneous. - # If timing included spell-check, sum_seconds would be >= 0.1s. Allow 30ms slack - # for scheduler overhead. - assert sum_seconds < 0.05, f"timing must exclude spell-check; got sum={sum_seconds}" + assert sum_seconds < 0.09, f"timing must exclude spell-check; got sum={sum_seconds}" @pytest.mark.asyncio -- 2.49.1 From 0801da8df0a4d81fdbd0da32b3fbdb8a3ea722f6 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 17:23:32 +0200 Subject: [PATCH 27/27] docs(ocr): explain why two metrics tests skip fresh_metrics fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sara's cycle-2 S2: clarify the latent (but not actual) cross-test state risk on the two metrics tests that hit the global REGISTRY instead of the per-test fresh_metrics fixture. Migrating them would actually break them — the /metrics endpoint is served by prometheus-fastapi-instrumentator which binds to the default REGISTRY at app-construction time, and the http_requests_total assertion only finds counters on that global registry. Both tests already assert response shape only (status code, content-type substring, body substrings), not numeric values, so the shared-registry caveat is documented for future readers rather than treated as a bug to fix. Co-Authored-By: Claude Opus 4.7 --- ocr-service/test_metrics.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index d2bd9671..42afd275 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -61,7 +61,15 @@ def fresh_metrics(monkeypatch): @pytest.mark.asyncio async def test_metrics_endpoint_returns_200(): - """`GET /metrics` returns 200 with Prometheus exposition content.""" + """`GET /metrics` returns 200 with Prometheus exposition content. + + Uses the global REGISTRY by design — does NOT take the `fresh_metrics` fixture. + The `/metrics` endpoint is wired by `prometheus-fastapi-instrumentator`, which + binds to the default REGISTRY at app-construction time; swapping `main.metrics` + via the fixture would not redirect what `/metrics` exposes. This test only + asserts response shape (status code + content-type substring), not numeric + counter values, so cross-test state leakage cannot affect it. + """ with patch("main.kraken_engine.load_models"), \ patch("main.load_spell_checker"): async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: @@ -73,7 +81,15 @@ async def test_metrics_endpoint_returns_200(): @pytest.mark.asyncio async def test_metrics_includes_http_request_metrics_after_ocr_call(): - """After a request to /ocr, `/metrics` exposes auto-instrumented http_* metrics.""" + """After a request to /ocr, `/metrics` exposes auto-instrumented http_* metrics. + + Uses the global REGISTRY by design — does NOT take the `fresh_metrics` fixture. + The `http_requests_total` / `http_request_duration_seconds` metrics live on + the instrumentator's default REGISTRY (not on `main.metrics`), so a fresh + CollectorRegistry would never see them. This test only asserts response shape + (substring presence in the exposition body), not numeric counter values, so + cross-test state leakage cannot affect it. + """ mock_images = [Image.new("RGB", (100, 100))] mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "polygon": None, "text": "hi", "words": []}] -- 2.49.1