feat(ocr): expose Prometheus /metrics endpoint with OCR-domain counters #653

Merged
marcel merged 27 commits from feat/issue-652-ocr-metrics into main 2026-05-21 18:16:48 +02:00
Showing only changes of commit 22a5ee816a - Show all commits

View File

@@ -50,6 +50,18 @@ ALLOWED_PDF_HOSTS = set(
_SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"}
def _observe_block_words(words: list[dict], threshold: float) -> None:
    """Feed one block's word statistics into the OCR metric counters.

    Increments ``metrics.ocr_words_total`` by the number of entries in
    ``words``, then increments ``metrics.ocr_illegible_words_total`` by the
    number of entries whose ``"confidence"`` value is strictly below
    ``threshold``.

    Precondition: ``words`` is non-empty. The emptiness check is left to the
    caller so this helper has no early-exit branch and each call site stays
    a single line.
    """
    metrics.ocr_words_total.inc(len(words))

    # Tally the below-threshold words with an explicit counter; the total
    # counter above is bumped first, so the order of side effects is fixed.
    below_threshold = 0
    for word in words:
        if word["confidence"] < threshold:
            below_threshold += 1
    metrics.ocr_illegible_words_total.inc(below_threshold)
def _validate_url(url: str) -> None:
"""Validate that the PDF URL points to an allowed host (SSRF protection)."""
parsed = urlparse(url)
@@ -149,10 +161,7 @@ async def run_ocr(request: OcrRequest):
for block in blocks:
words = block.get("words") or []
if words:
metrics.ocr_words_total.inc(len(words))
metrics.ocr_illegible_words_total.inc(
sum(1 for w in words if w["confidence"] < threshold)
)
_observe_block_words(words, threshold)
block["text"] = apply_confidence_markers(words, threshold)
block.pop("words", None)
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
@@ -298,10 +307,7 @@ async def run_ocr_stream(request: OcrRequest):
for block in blocks:
words = block.get("words") or []
if words:
metrics.ocr_words_total.inc(len(words))
metrics.ocr_illegible_words_total.inc(
sum(1 for w in words if w["confidence"] < threshold)
)
_observe_block_words(words, threshold)
block["text"] = apply_confidence_markers(words, threshold)
block.pop("words", None)
if script_type in _SPELL_CHECK_SCRIPT_TYPES: