feat(ocr): expose Prometheus /metrics endpoint with OCR-domain counters #653

Merged
marcel merged 27 commits from feat/issue-652-ocr-metrics into main 2026-05-21 18:16:48 +02:00
3 changed files with 131 additions and 0 deletions
Showing only changes of commit f3e3545d06 - Show all commits

View File

@@ -20,9 +20,11 @@ import pypdfium2 as pdfium
from fastapi import FastAPI, Form, Header, HTTPException, UploadFile from fastapi import FastAPI, Form, Header, HTTPException, UploadFile
from fastapi.responses import StreamingResponse from fastapi.responses import StreamingResponse
from PIL import Image from PIL import Image
from prometheus_client import REGISTRY
from prometheus_fastapi_instrumentator import Instrumentator from prometheus_fastapi_instrumentator import Instrumentator
from confidence import apply_confidence_markers, get_threshold from confidence import apply_confidence_markers, get_threshold
from metrics import OcrMetrics, build_metrics
from spell_check import correct_text, load_spell_checker from spell_check import correct_text, load_spell_checker
from engines import kraken as kraken_engine from engines import kraken as kraken_engine
from engines import surya as surya_engine from engines import surya as surya_engine
@@ -38,6 +40,8 @@ logger = logging.getLogger(__name__)
_models_ready = False _models_ready = False
metrics: OcrMetrics = build_metrics(REGISTRY)
ALLOWED_PDF_HOSTS = set( ALLOWED_PDF_HOSTS = set(
h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",") h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
) )

92
ocr-service/metrics.py Normal file
View File

@@ -0,0 +1,92 @@
"""Prometheus metric definitions for the OCR service.
`build_metrics(registry)` returns a fresh `OcrMetrics` instance bound to the
given `CollectorRegistry`. Production code calls it once at module load with
the default `REGISTRY`; tests pass a per-test `CollectorRegistry()` to keep
counter values isolated between cases (decision #3 on issue #652).
"""
from __future__ import annotations
from dataclasses import dataclass
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram
@dataclass(frozen=True)
class OcrMetrics:
    """Immutable bundle of the OCR service's custom Prometheus collectors.

    Every field holds one `prometheus_client` collector (counter, histogram
    or gauge). Recording through a collector (`.inc()`, `.observe()`,
    `.set()`) is the intended use. Re-pointing a field at another collector
    is rejected because the dataclass is frozen; call `build_metrics` to
    obtain a separate container instead.
    """

    # Job / page / word throughput.
    ocr_jobs_total: Counter
    ocr_pages_total: Counter
    ocr_skipped_pages_total: Counter
    ocr_words_total: Counter
    ocr_illegible_words_total: Counter

    # Latency.
    ocr_processing_seconds: Histogram

    # Training.
    ocr_training_runs_total: Counter
    ocr_model_accuracy: Gauge

    # Service readiness.
    ocr_models_ready: Gauge
# Upper bounds, in seconds, for the `ocr_processing_seconds` histogram.
# The prometheus_client defaults stop at 10 s, so any longer per-page or
# per-document run would be counted only in the implicit +Inf bucket.
# NOTE(review): these bounds are a starting point — tune them against the
# latencies actually observed in production.
_PROCESSING_SECONDS_BUCKETS = (
    0.5,
    1.0,
    2.5,
    5.0,
    10.0,
    30.0,
    60.0,
    120.0,
    300.0,
    600.0,
    1800.0,
)


def build_metrics(registry: CollectorRegistry) -> OcrMetrics:
    """Create one `OcrMetrics` container whose collectors live on `registry`.

    Each call constructs a new set of collectors and registers them on the
    supplied registry, in the order of the fields below.

    Args:
        registry: The `CollectorRegistry` every collector is registered on.

    Returns:
        A frozen `OcrMetrics` holding the newly created collectors.
    """
    return OcrMetrics(
        ocr_jobs_total=Counter(
            "ocr_jobs_total",
            "Number of OCR jobs processed, labelled by engine and script type.",
            ["engine", "script_type"],
            registry=registry,
        ),
        ocr_pages_total=Counter(
            "ocr_pages_total",
            "Number of pages successfully OCR'd, labelled by engine.",
            ["engine"],
            registry=registry,
        ),
        ocr_skipped_pages_total=Counter(
            "ocr_skipped_pages_total",
            "Number of pages skipped because the OCR engine raised.",
            registry=registry,
        ),
        ocr_words_total=Counter(
            "ocr_words_total",
            "Number of words recognized across all OCR blocks.",
            registry=registry,
        ),
        ocr_illegible_words_total=Counter(
            "ocr_illegible_words_total",
            "Number of words below the confidence threshold "
            "(replaced with [unleserlich]).",
            registry=registry,
        ),
        ocr_processing_seconds=Histogram(
            "ocr_processing_seconds",
            "OCR processing time per page (streaming) or per document (non-streaming).",
            ["engine"],
            registry=registry,
            # Explicit bounds: the library defaults cap at 10 s.
            buckets=_PROCESSING_SECONDS_BUCKETS,
        ),
        ocr_training_runs_total=Counter(
            "ocr_training_runs_total",
            "Number of training runs, labelled by kind (recognition|segmentation) "
            "and outcome (success|error).",
            ["kind", "outcome"],
            registry=registry,
        ),
        ocr_model_accuracy=Gauge(
            "ocr_model_accuracy",
            "Latest model accuracy reported by a successful training run.",
            ["kind"],
            registry=registry,
        ),
        ocr_models_ready=Gauge(
            "ocr_models_ready",
            "1 once the lifespan startup has finished loading models, 0 before.",
            registry=registry,
        ),
    )

View File

@@ -9,8 +9,19 @@ from unittest.mock import AsyncMock, patch
import pytest import pytest
from httpx import ASGITransport, AsyncClient from httpx import ASGITransport, AsyncClient
from PIL import Image from PIL import Image
from prometheus_client import CollectorRegistry
from main import app from main import app
from metrics import build_metrics
@pytest.fixture
def fresh_metrics(monkeypatch):
    """Swap `main.metrics` for a container built on a brand-new registry.

    The replacement is installed through `monkeypatch`, and the new container
    is returned so the requesting test can inspect its collectors.
    """
    isolated = build_metrics(CollectorRegistry())
    monkeypatch.setattr("main.metrics", isolated)
    return isolated
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -56,3 +67,27 @@ async def test_metrics_includes_http_request_metrics_after_ocr_call():
body = metrics_response.text body = metrics_response.text
assert "http_requests_total" in body assert "http_requests_total" in body
assert "http_request_duration_seconds" in body assert "http_request_duration_seconds" in body
def test_build_metrics_registers_all_custom_metrics_on_given_registry():
    """`build_metrics` registers every custom metric on the supplied registry.

    Also verifies that containers built on different registries are
    independent: recording on one must not be visible through the other.
    """
    registry = CollectorRegistry()
    metrics = build_metrics(registry)

    # Counter families are reported without their `_total` suffix, hence the
    # shortened names in `expected`.
    metric_names = {m.name for m in registry.collect()}
    expected = {
        "ocr_jobs",
        "ocr_pages",
        "ocr_skipped_pages",
        "ocr_words",
        "ocr_illegible_words",
        "ocr_processing_seconds",
        "ocr_training_runs",
        "ocr_model_accuracy",
        "ocr_models_ready",
    }
    assert expected <= metric_names, f"missing: {expected - metric_names}"

    # A second registry yields a separate container — no shared state.
    other_registry = CollectorRegistry()
    other_metrics = build_metrics(other_registry)
    assert metrics is not other_metrics

    # Identity alone does not prove isolation: record on the first container
    # and confirm only its own registry observes the change.
    metrics.ocr_words_total.inc()
    assert registry.get_sample_value("ocr_words_total") == 1.0
    assert other_registry.get_sample_value("ocr_words_total") == 0.0