diff --git a/ocr-service/main.py b/ocr-service/main.py
index 63859eba..0d068d15 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -20,9 +20,11 @@ import pypdfium2 as pdfium
 from fastapi import FastAPI, Form, Header, HTTPException, UploadFile
 from fastapi.responses import StreamingResponse
 from PIL import Image
+from prometheus_client import REGISTRY
 from prometheus_fastapi_instrumentator import Instrumentator
 
 from confidence import apply_confidence_markers, get_threshold
+from metrics import OcrMetrics, build_metrics
 from spell_check import correct_text, load_spell_checker
 from engines import kraken as kraken_engine
 from engines import surya as surya_engine
@@ -38,6 +40,8 @@ logger = logging.getLogger(__name__)
 
 _models_ready = False
 
+metrics: OcrMetrics = build_metrics(REGISTRY)
+
 ALLOWED_PDF_HOSTS = set(
     h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
 )
diff --git a/ocr-service/metrics.py b/ocr-service/metrics.py
new file mode 100644
index 00000000..66efb9f2
--- /dev/null
+++ b/ocr-service/metrics.py
@@ -0,0 +1,112 @@
+"""Prometheus metric definitions for the OCR service.
+
+`build_metrics(registry)` returns a fresh `OcrMetrics` instance bound to the
+given `CollectorRegistry`. Production code calls it once at module load with
+the default `REGISTRY`; tests pass a per-test `CollectorRegistry()` to keep
+counter values isolated between cases (decision #3 on issue #652).
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram
+
+
+@dataclass(frozen=True)
+class OcrMetrics:
+    """Frozen bundle holding one handle per custom OCR metric.
+
+    The dataclass is frozen, so a field cannot be pointed at a different
+    collector after construction; call `build_metrics` when a new bundle is
+    needed. The collectors themselves stay usable as normal — `.inc()`,
+    `.observe()` and `.set()` update the metric behind the handle.
+    """
+
+    # Counters for OCR requests and their per-page / per-word results.
+    ocr_jobs_total: Counter
+    ocr_pages_total: Counter
+    ocr_skipped_pages_total: Counter
+    ocr_words_total: Counter
+    ocr_illegible_words_total: Counter
+    # Latency histogram, labelled by engine.
+    ocr_processing_seconds: Histogram
+    # Training bookkeeping and startup readiness.
+    ocr_training_runs_total: Counter
+    ocr_model_accuracy: Gauge
+    ocr_models_ready: Gauge
+
+
+def build_metrics(registry: CollectorRegistry) -> OcrMetrics:
+    """Build every OCR collector on `registry` and wrap them in `OcrMetrics`.
+
+    Each call creates new collector objects, so the caller decides which
+    registry (the default one or a throwaway test registry) they live on.
+    """
+    # Collectors are created in the same order as the dataclass fields.
+    jobs = Counter(
+        "ocr_jobs_total",
+        "Number of OCR jobs processed, labelled by engine and script type.",
+        labelnames=["engine", "script_type"],
+        registry=registry,
+    )
+    pages = Counter(
+        "ocr_pages_total",
+        "Number of pages successfully OCR'd, labelled by engine.",
+        labelnames=["engine"],
+        registry=registry,
+    )
+    skipped_pages = Counter(
+        "ocr_skipped_pages_total",
+        "Number of pages skipped because the OCR engine raised.",
+        registry=registry,
+    )
+    words = Counter(
+        "ocr_words_total",
+        "Number of words recognized across all OCR blocks.",
+        registry=registry,
+    )
+    illegible_words = Counter(
+        "ocr_illegible_words_total",
+        "Number of words below the confidence threshold "
+        "(replaced with [unleserlich]).",
+        registry=registry,
+    )
+    # NOTE(review): no `buckets=` is given, so the library's default bucket
+    # boundaries apply — confirm they suit per-document processing times.
+    processing_seconds = Histogram(
+        "ocr_processing_seconds",
+        "OCR processing time per page (streaming) or per document (non-streaming).",
+        labelnames=["engine"],
+        registry=registry,
+    )
+    training_runs = Counter(
+        "ocr_training_runs_total",
+        "Number of training runs, labelled by kind (recognition|segmentation) "
+        "and outcome (success|error).",
+        labelnames=["kind", "outcome"],
+        registry=registry,
+    )
+    model_accuracy = Gauge(
+        "ocr_model_accuracy",
+        "Latest model accuracy reported by a successful training run.",
+        labelnames=["kind"],
+        registry=registry,
+    )
+    models_ready = Gauge(
+        "ocr_models_ready",
+        "1 once the lifespan startup has finished loading models, 0 before.",
+        registry=registry,
+    )
+
+    return OcrMetrics(
+        ocr_jobs_total=jobs,
+        ocr_pages_total=pages,
+        ocr_skipped_pages_total=skipped_pages,
+        ocr_words_total=words,
+        ocr_illegible_words_total=illegible_words,
+        ocr_processing_seconds=processing_seconds,
+        ocr_training_runs_total=training_runs,
+        ocr_model_accuracy=model_accuracy,
+        ocr_models_ready=models_ready,
+    )
diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py
index 442a2d0e..0906beec 100644
--- a/ocr-service/test_metrics.py
+++ b/ocr-service/test_metrics.py
@@ -9,8 +9,24 @@ from unittest.mock import AsyncMock, patch
 import pytest
 from httpx import ASGITransport, AsyncClient
 from PIL import Image
+from prometheus_client import CollectorRegistry
 
 from main import app
+from metrics import build_metrics
+
+
+@pytest.fixture
+def fresh_metrics(monkeypatch):
+    """Swap `main.metrics` for a container bound to a fresh registry.
+
+    Returns the replacement container so a test can assert on its collectors.
+    The swap goes through `monkeypatch`, which restores the original
+    attribute when the test finishes.
+    """
+    registry = CollectorRegistry()
+    test_metrics = build_metrics(registry)
+    monkeypatch.setattr("main.metrics", test_metrics)
+    return test_metrics
 
 
 @pytest.mark.asyncio
@@ -56,3 +72,38 @@ async def test_metrics_includes_http_request_metrics_after_ocr_call():
     body = metrics_response.text
     assert "http_requests_total" in body
     assert "http_request_duration_seconds" in body
+
+
+def test_build_metrics_registers_all_custom_metrics_on_given_registry():
+    """`build_metrics` binds every custom metric to the registry it is given.
+
+    Counter families are collected without their `_total` suffix, which is
+    why the expected names below drop it.
+    """
+    registry = CollectorRegistry()
+    metrics = build_metrics(registry)
+
+    metric_names = {m.name for m in registry.collect()}
+    expected = {
+        "ocr_jobs",
+        "ocr_pages",
+        "ocr_skipped_pages",
+        "ocr_words",
+        "ocr_illegible_words",
+        "ocr_processing_seconds",
+        "ocr_training_runs",
+        "ocr_model_accuracy",
+        "ocr_models_ready",
+    }
+    assert expected <= metric_names, f"missing: {expected - metric_names}"
+
+    # A second registry yields a separate container — no shared state.
+    # The identity check alone would pass even if both containers wrapped the
+    # same collectors, so also show an increment stays on its own registry.
+    other_registry = CollectorRegistry()
+    other_metrics = build_metrics(other_registry)
+    assert metrics is not other_metrics
+
+    metrics.ocr_words_total.inc()
+    assert registry.get_sample_value("ocr_words_total") == 1.0
+    assert other_registry.get_sample_value("ocr_words_total") == 0.0