feat(ocr): add metrics.py factory with test-scoped CollectorRegistry support

Encapsulates every custom OCR metric in an OcrMetrics frozen dataclass and
exposes a `build_metrics(registry)` factory. Production main.py binds against
the default REGISTRY; tests construct a fresh CollectorRegistry per case and
monkeypatch main.metrics, so counter values stay isolated between tests
(decision #3 on issue #652, Option A).

Refs #652

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-21 16:02:20 +02:00
parent 4bb6685edb
commit f3e3545d06
3 changed files with 131 additions and 0 deletions

View File

@@ -20,9 +20,11 @@ import pypdfium2 as pdfium
from fastapi import FastAPI, Form, Header, HTTPException, UploadFile from fastapi import FastAPI, Form, Header, HTTPException, UploadFile
from fastapi.responses import StreamingResponse from fastapi.responses import StreamingResponse
from PIL import Image from PIL import Image
from prometheus_client import REGISTRY
from prometheus_fastapi_instrumentator import Instrumentator from prometheus_fastapi_instrumentator import Instrumentator
from confidence import apply_confidence_markers, get_threshold from confidence import apply_confidence_markers, get_threshold
from metrics import OcrMetrics, build_metrics
from spell_check import correct_text, load_spell_checker from spell_check import correct_text, load_spell_checker
from engines import kraken as kraken_engine from engines import kraken as kraken_engine
from engines import surya as surya_engine from engines import surya as surya_engine
@@ -38,6 +40,8 @@ logger = logging.getLogger(__name__)
_models_ready = False _models_ready = False
metrics: OcrMetrics = build_metrics(REGISTRY)
ALLOWED_PDF_HOSTS = set( ALLOWED_PDF_HOSTS = set(
h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",") h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
) )

92
ocr-service/metrics.py Normal file
View File

@@ -0,0 +1,92 @@
"""Prometheus metric definitions for the OCR service.
`build_metrics(registry)` returns a fresh `OcrMetrics` instance bound to the
given `CollectorRegistry`. Production code calls it once at module load with
the default `REGISTRY`; tests pass a per-test `CollectorRegistry()` to keep
counter values isolated between cases (decision #3 on issue #652).
"""
from __future__ import annotations
from dataclasses import dataclass
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram
@dataclass(frozen=True)
class OcrMetrics:
"""Container for every custom OCR metric.
Counters and gauges are immutable references to `prometheus_client`
instances. Mutating them (`.inc()`, `.observe()`, `.set()`) is safe;
rebinding the field on the dataclass is not — use `build_metrics` to get
a new container.
"""
ocr_jobs_total: Counter
ocr_pages_total: Counter
ocr_skipped_pages_total: Counter
ocr_words_total: Counter
ocr_illegible_words_total: Counter
ocr_processing_seconds: Histogram
ocr_training_runs_total: Counter
ocr_model_accuracy: Gauge
ocr_models_ready: Gauge
def build_metrics(registry: CollectorRegistry) -> OcrMetrics:
"""Create one OcrMetrics instance bound to `registry`."""
return OcrMetrics(
ocr_jobs_total=Counter(
"ocr_jobs_total",
"Number of OCR jobs processed, labelled by engine and script type.",
["engine", "script_type"],
registry=registry,
),
ocr_pages_total=Counter(
"ocr_pages_total",
"Number of pages successfully OCR'd, labelled by engine.",
["engine"],
registry=registry,
),
ocr_skipped_pages_total=Counter(
"ocr_skipped_pages_total",
"Number of pages skipped because the OCR engine raised.",
registry=registry,
),
ocr_words_total=Counter(
"ocr_words_total",
"Number of words recognized across all OCR blocks.",
registry=registry,
),
ocr_illegible_words_total=Counter(
"ocr_illegible_words_total",
"Number of words below the confidence threshold "
"(replaced with [unleserlich]).",
registry=registry,
),
ocr_processing_seconds=Histogram(
"ocr_processing_seconds",
"OCR processing time per page (streaming) or per document (non-streaming).",
["engine"],
registry=registry,
),
ocr_training_runs_total=Counter(
"ocr_training_runs_total",
"Number of training runs, labelled by kind (recognition|segmentation) "
"and outcome (success|error).",
["kind", "outcome"],
registry=registry,
),
ocr_model_accuracy=Gauge(
"ocr_model_accuracy",
"Latest model accuracy reported by a successful training run.",
["kind"],
registry=registry,
),
ocr_models_ready=Gauge(
"ocr_models_ready",
"1 once the lifespan startup has finished loading models, 0 before.",
registry=registry,
),
)

View File

@@ -9,8 +9,19 @@ from unittest.mock import AsyncMock, patch
import pytest import pytest
from httpx import ASGITransport, AsyncClient from httpx import ASGITransport, AsyncClient
from PIL import Image from PIL import Image
from prometheus_client import CollectorRegistry
from main import app from main import app
from metrics import build_metrics
@pytest.fixture
def fresh_metrics(monkeypatch):
"""Replace the module-level `main.metrics` with one bound to a fresh registry."""
registry = CollectorRegistry()
test_metrics = build_metrics(registry)
monkeypatch.setattr("main.metrics", test_metrics)
return test_metrics
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -56,3 +67,27 @@ async def test_metrics_includes_http_request_metrics_after_ocr_call():
body = metrics_response.text body = metrics_response.text
assert "http_requests_total" in body assert "http_requests_total" in body
assert "http_request_duration_seconds" in body assert "http_request_duration_seconds" in body
def test_build_metrics_registers_all_custom_metrics_on_given_registry():
"""`build_metrics` returns an OcrMetrics bound to the supplied registry."""
registry = CollectorRegistry()
metrics = build_metrics(registry)
metric_names = {m.name for m in registry.collect()}
expected = {
"ocr_jobs",
"ocr_pages",
"ocr_skipped_pages",
"ocr_words",
"ocr_illegible_words",
"ocr_processing_seconds",
"ocr_training_runs",
"ocr_model_accuracy",
"ocr_models_ready",
}
assert expected <= metric_names, f"missing: {expected - metric_names}"
# A second registry yields a separate container — no shared state.
other_metrics = build_metrics(CollectorRegistry())
assert metrics is not other_metrics