feat(ocr): add metrics.py factory with test-scoped CollectorRegistry support
Encapsulates every custom OCR metric in an `OcrMetrics` frozen dataclass and exposes a `build_metrics(registry)` factory. Production `main.py` binds against the default `REGISTRY`; tests construct a fresh `CollectorRegistry` per case and monkeypatch `main.metrics`, so counter values stay isolated between tests (decision #3 on issue #652, Option A).

Refs #652

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,9 +20,11 @@ import pypdfium2 as pdfium
|
|||||||
from fastapi import FastAPI, Form, Header, HTTPException, UploadFile
|
from fastapi import FastAPI, Form, Header, HTTPException, UploadFile
|
||||||
from fastapi.responses import StreamingResponse
|
from fastapi.responses import StreamingResponse
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
from prometheus_client import REGISTRY
|
||||||
from prometheus_fastapi_instrumentator import Instrumentator
|
from prometheus_fastapi_instrumentator import Instrumentator
|
||||||
|
|
||||||
from confidence import apply_confidence_markers, get_threshold
|
from confidence import apply_confidence_markers, get_threshold
|
||||||
|
from metrics import OcrMetrics, build_metrics
|
||||||
from spell_check import correct_text, load_spell_checker
|
from spell_check import correct_text, load_spell_checker
|
||||||
from engines import kraken as kraken_engine
|
from engines import kraken as kraken_engine
|
||||||
from engines import surya as surya_engine
|
from engines import surya as surya_engine
|
||||||
@@ -38,6 +40,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
_models_ready = False
|
_models_ready = False
|
||||||
|
|
||||||
|
metrics: OcrMetrics = build_metrics(REGISTRY)
|
||||||
|
|
||||||
ALLOWED_PDF_HOSTS = set(
|
ALLOWED_PDF_HOSTS = set(
|
||||||
h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
|
h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
|
||||||
)
|
)
|
||||||
|
|||||||
92
ocr-service/metrics.py
Normal file
92
ocr-service/metrics.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
"""Prometheus metric definitions for the OCR service.
|
||||||
|
|
||||||
|
`build_metrics(registry)` returns a fresh `OcrMetrics` instance bound to the
|
||||||
|
given `CollectorRegistry`. Production code calls it once at module load with
|
||||||
|
the default `REGISTRY`; tests pass a per-test `CollectorRegistry()` to keep
|
||||||
|
counter values isolated between cases (decision #3 on issue #652).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class OcrMetrics:
    """Immutable bundle of the OCR service's custom Prometheus collectors.

    Each field holds a reference to a `prometheus_client` counter, histogram
    or gauge. Because the dataclass is frozen, a field cannot be rebound after
    construction; call `build_metrics` to obtain a different container.
    Recording values through a collector (`.inc()`, `.observe()`, `.set()`)
    acts on the referenced collector, not on this dataclass, and is
    unaffected by the freeze.
    """

    # Field order is part of the positional constructor signature; keep it.
    ocr_jobs_total: Counter  # jobs processed
    ocr_pages_total: Counter  # pages successfully OCR'd
    ocr_skipped_pages_total: Counter  # pages skipped because the engine raised
    ocr_words_total: Counter  # words recognized across all blocks
    ocr_illegible_words_total: Counter  # words below the confidence threshold
    ocr_processing_seconds: Histogram  # OCR processing time
    ocr_training_runs_total: Counter  # training runs
    ocr_model_accuracy: Gauge  # latest accuracy from a training run
    ocr_models_ready: Gauge  # 1 once startup has loaded the models, else 0
|
||||||
|
|
||||||
|
|
||||||
|
def build_metrics(registry: CollectorRegistry) -> OcrMetrics:
    """Create one OcrMetrics instance bound to `registry`.

    Every collector is constructed with `registry=registry`, so its samples
    are exposed only through that registry.

    Args:
        registry: Collector registry the new metrics are registered on.

    Returns:
        A new `OcrMetrics` holding freshly constructed collectors.

    Note:
        prometheus_client rejects a second collector with an already
        registered name, so call this at most once per registry.
    """
    # prometheus_client's default buckets stop at 10 s. The histogram's help
    # text says it also records whole-document OCR time, and anything slower
    # than 10 s would fall into the +Inf bucket, making latency quantiles
    # unusable. Keep every default bucket (so existing `le` values stay
    # valid) and extend the range up to 10 minutes. The library appends the
    # +Inf bucket itself when it is missing.
    finite_defaults = tuple(
        bound for bound in Histogram.DEFAULT_BUCKETS if bound != float("inf")
    )
    processing_buckets = finite_defaults + (30.0, 60.0, 120.0, 300.0, 600.0)

    return OcrMetrics(
        ocr_jobs_total=Counter(
            "ocr_jobs_total",
            "Number of OCR jobs processed, labelled by engine and script type.",
            ["engine", "script_type"],
            registry=registry,
        ),
        ocr_pages_total=Counter(
            "ocr_pages_total",
            "Number of pages successfully OCR'd, labelled by engine.",
            ["engine"],
            registry=registry,
        ),
        ocr_skipped_pages_total=Counter(
            "ocr_skipped_pages_total",
            "Number of pages skipped because the OCR engine raised.",
            registry=registry,
        ),
        ocr_words_total=Counter(
            "ocr_words_total",
            "Number of words recognized across all OCR blocks.",
            registry=registry,
        ),
        ocr_illegible_words_total=Counter(
            "ocr_illegible_words_total",
            "Number of words below the confidence threshold "
            "(replaced with [unleserlich]).",
            registry=registry,
        ),
        ocr_processing_seconds=Histogram(
            "ocr_processing_seconds",
            "OCR processing time per page (streaming) or per document (non-streaming).",
            ["engine"],
            registry=registry,
            buckets=processing_buckets,
        ),
        ocr_training_runs_total=Counter(
            "ocr_training_runs_total",
            "Number of training runs, labelled by kind (recognition|segmentation) "
            "and outcome (success|error).",
            ["kind", "outcome"],
            registry=registry,
        ),
        ocr_model_accuracy=Gauge(
            "ocr_model_accuracy",
            "Latest model accuracy reported by a successful training run.",
            ["kind"],
            registry=registry,
        ),
        ocr_models_ready=Gauge(
            "ocr_models_ready",
            "1 once the lifespan startup has finished loading models, 0 before.",
            registry=registry,
        ),
    )
|
||||||
@@ -9,8 +9,19 @@ from unittest.mock import AsyncMock, patch
|
|||||||
import pytest
|
import pytest
|
||||||
from httpx import ASGITransport, AsyncClient
|
from httpx import ASGITransport, AsyncClient
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
from prometheus_client import CollectorRegistry
|
||||||
|
|
||||||
from main import app
|
from main import app
|
||||||
|
from metrics import build_metrics
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def fresh_metrics(monkeypatch):
    """Swap `main.metrics` for a container bound to a brand-new registry.

    The replacement is installed through `monkeypatch`, and the same
    container is returned so the test can read the collectors it holds.
    """
    isolated = build_metrics(CollectorRegistry())
    monkeypatch.setattr("main.metrics", isolated)
    return isolated
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@@ -56,3 +67,27 @@ async def test_metrics_includes_http_request_metrics_after_ocr_call():
|
|||||||
body = metrics_response.text
|
body = metrics_response.text
|
||||||
assert "http_requests_total" in body
|
assert "http_requests_total" in body
|
||||||
assert "http_request_duration_seconds" in body
|
assert "http_request_duration_seconds" in body
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_metrics_registers_all_custom_metrics_on_given_registry():
    """`build_metrics` returns an OcrMetrics bound to the supplied registry.

    Checks that every custom metric family is registered on the registry that
    was passed in, and that containers built on different registries do not
    share counter state.
    """
    registry = CollectorRegistry()
    metrics = build_metrics(registry)

    # Counter families are reported without the `_total` suffix.
    metric_names = {m.name for m in registry.collect()}
    expected = {
        "ocr_jobs",
        "ocr_pages",
        "ocr_skipped_pages",
        "ocr_words",
        "ocr_illegible_words",
        "ocr_processing_seconds",
        "ocr_training_runs",
        "ocr_model_accuracy",
        "ocr_models_ready",
    }
    assert expected <= metric_names, f"missing: {expected - metric_names}"

    # A second registry yields a separate container — no shared state.
    other_registry = CollectorRegistry()
    other_metrics = build_metrics(other_registry)
    assert metrics is not other_metrics

    # Identity alone is trivially true for two calls; prove isolation by
    # incrementing a counter on one container and reading both registries.
    metrics.ocr_words_total.inc()
    assert registry.get_sample_value("ocr_words_total") == 1.0
    assert other_registry.get_sample_value("ocr_words_total") == 0.0
|
||||||
|
|||||||
Reference in New Issue
Block a user