familienarchiv/ocr-service/test_metrics.py

"""Tests for Prometheus metrics exposed by the OCR service.

Each test that asserts on a counter/gauge value uses a fresh CollectorRegistry
(see decision #3 on issue #652) to keep the metrics isolated between tests.
"""

import contextlib
import io
import zipfile
from unittest.mock import AsyncMock, patch

import pytest
from httpx import ASGITransport, AsyncClient
from PIL import Image
from prometheus_client import CollectorRegistry

from main import app
from metrics import build_metrics


@contextlib.asynccontextmanager
async def ocr_client(*, raise_app_exceptions: bool = True):
    """Yield an AsyncClient with model-loaders patched and _models_ready forced on.

    The shared setup for almost every metrics test: stub the heavy lifecycle
    hooks (kraken_engine.load_models, load_spell_checker), flip the readiness
    flag so request handlers do not 503, and put the flag back to whatever it
    was before the context was entered.

    Args:
        raise_app_exceptions: Forwarded to ``ASGITransport``. Pass ``False``
            when the test wants to inspect the error response rather than
            have the application's exception propagate into the test.

    Yields:
        An ``AsyncClient`` bound to the app under test at ``http://test``.
    """
    import main as main_module

    with patch("main.kraken_engine.load_models"), \
         patch("main.load_spell_checker"):
        transport = ASGITransport(app=app, raise_app_exceptions=raise_app_exceptions)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            # Remember the incoming state so the teardown is a true restore and
            # does not clobber a flag that was already set by the caller.
            previous_ready = main_module._models_ready
            main_module._models_ready = True
            try:
                yield client
            finally:
                main_module._models_ready = previous_ready


def _minimal_zip() -> bytes:
    """Build an in-memory ZIP holding a single placeholder ``page_01.xml``.

    The archive is just large enough for the upload endpoints' validation to
    accept it as a training payload.
    """
    archive_bytes = io.BytesIO()
    archive = zipfile.ZipFile(archive_bytes, "w")
    try:
        archive.writestr("page_01.xml", "<PcGts/>")
    finally:
        # Closing writes the central directory; without it the ZIP is invalid.
        archive.close()
    return archive_bytes.getvalue()


def _fake_training_result(accuracy: float = 0.91) -> dict:
    """Return a canned training-result dict for the given accuracy.

    ``cer`` is derived as ``1 - accuracy`` rounded to four decimals; ``loss``
    is always ``None`` and ``epochs`` is always 5.
    """
    character_error_rate = round(1 - accuracy, 4)
    return dict(loss=None, accuracy=accuracy, cer=character_error_rate, epochs=5)


@pytest.fixture
def fresh_metrics(monkeypatch):
    """Swap `main.metrics` for a container built on a brand-new CollectorRegistry.

    The replacement is returned so tests can read counter/gauge values from it;
    monkeypatch undoes the swap when the test ends.
    """
    isolated_metrics = build_metrics(CollectorRegistry())
    monkeypatch.setattr("main.metrics", isolated_metrics)
    return isolated_metrics


@pytest.mark.asyncio
async def test_metrics_endpoint_returns_200():
    """Scraping `GET /metrics` yields a 200 carrying Prometheus text exposition.

    Deliberately runs against the global REGISTRY and skips the `fresh_metrics`
    fixture: `/metrics` is served by `prometheus-fastapi-instrumentator`, which
    attaches to the default REGISTRY when the app is constructed, so replacing
    `main.metrics` would not change what the endpoint exposes. Only the response
    shape is checked (status code and a content-type substring), never a numeric
    counter value, so state leaked from other tests cannot influence the result.
    """
    with patch("main.kraken_engine.load_models"), \
         patch("main.load_spell_checker"):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as http:
            response = await http.get("/metrics")

    assert response.status_code == 200
    content_type = response.headers.get("content-type", "")
    assert "text/plain" in content_type


@pytest.mark.asyncio
async def test_metrics_includes_http_request_metrics_after_ocr_call():
    """Once /ocr has been hit, `/metrics` lists the auto-instrumented http_* metrics.

    Deliberately runs against the global REGISTRY and skips the `fresh_metrics`
    fixture: `http_requests_total` and `http_request_duration_seconds` are owned
    by the instrumentator's default REGISTRY rather than `main.metrics`, so a
    fresh CollectorRegistry would never contain them. Only substring presence in
    the exposition body is checked, never a numeric value, so state leaked from
    other tests cannot influence the result.
    """
    page_images = [Image.new("RGB", (100, 100))]
    single_block = {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                    "polygon": None, "text": "hi", "words": []}
    ocr_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.surya_engine.extract_blocks", return_value=[single_block]):
        async with ocr_client() as client:
            ocr_response = await client.post("/ocr", json=ocr_payload)
            assert ocr_response.status_code == 200, ocr_response.text

            metrics_response = await client.get("/metrics")

    exposition = metrics_response.text
    for metric_name in ("http_requests_total", "http_request_duration_seconds"):
        assert metric_name in exposition


def test_build_metrics_registers_all_custom_metrics_on_given_registry():
    """`build_metrics` registers every custom metric on the registry it is handed."""
    registry = CollectorRegistry()
    metrics = build_metrics(registry)

    registered = {family.name for family in registry.collect()}
    required = {
        "ocr_jobs",
        "ocr_pages",
        "ocr_skipped_pages",
        "ocr_words",
        "ocr_illegible_words",
        "ocr_processing_seconds",
        "ocr_training_runs",
        "ocr_model_accuracy",
        "ocr_models_ready",
    }
    missing = required - registered
    assert not missing, f"missing: {missing}"

    # Building against another registry must hand back a distinct container.
    assert metrics is not build_metrics(CollectorRegistry())


async def _drive_ocr(client: AsyncClient, *, script_type: str) -> None:
    """POST a fixed German-language request to /ocr and require a 200 response.

    The response body is used as the assertion message so a failing status is
    self-explanatory in the test output.
    """
    request_body = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": script_type,
        "language": "de",
    }
    result = await client.post("/ocr", json=request_body)
    assert result.status_code == 200, result.text


@pytest.mark.asyncio
async def test_ocr_jobs_total_incremented_with_kraken_engine_label_for_kurrent(fresh_metrics):
    """One HANDWRITING_KURRENT /ocr request counts as one job under engine=kraken."""
    page_images = [Image.new("RGB", (100, 100))]
    single_block = {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                    "polygon": None, "text": "hi", "words": []}

    with patch("main.correct_text", side_effect=lambda t: t), \
         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.kraken_engine.is_available", return_value=True), \
         patch("main.kraken_engine.extract_blocks", return_value=[single_block]):
        async with ocr_client() as client:
            await _drive_ocr(client, script_type="HANDWRITING_KURRENT")

    kraken_jobs = fresh_metrics.ocr_jobs_total.labels(
        engine="kraken", script_type="HANDWRITING_KURRENT"
    )
    assert kraken_jobs._value.get() == 1.0


@pytest.mark.asyncio
async def test_ocr_jobs_total_incremented_with_surya_engine_label_for_typewriter(fresh_metrics):
    """One TYPEWRITER /ocr request counts as one job under engine=surya."""
    page_images = [Image.new("RGB", (100, 100))]
    single_block = {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                    "polygon": None, "text": "hi", "words": []}

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.surya_engine.extract_blocks", return_value=[single_block]):
        async with ocr_client() as client:
            await _drive_ocr(client, script_type="TYPEWRITER")

    surya_jobs = fresh_metrics.ocr_jobs_total.labels(
        engine="surya", script_type="TYPEWRITER"
    )
    assert surya_jobs._value.get() == 1.0


@pytest.mark.asyncio
async def test_ocr_pages_total_incremented_once_per_page_in_stream(fresh_metrics):
    """Streaming three pages through /ocr/stream leaves ocr_pages_total at three."""
    page_images = [Image.new("RGB", (100, 100)) for _ in range(3)]
    single_block = {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                    "polygon": None, "text": "hi", "words": []}
    stream_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.surya_engine.extract_page_blocks", return_value=[single_block]):
        async with ocr_client() as client:
            async with client.stream("POST", "/ocr/stream", json=stream_payload) as response:
                assert response.status_code == 200
                # Consume every line; the per-page increments fire as the
                # generator is drained.
                async for _line in response.aiter_lines():
                    continue

    surya_pages = fresh_metrics.ocr_pages_total.labels(engine="surya")
    assert surya_pages._value.get() == 3.0


@pytest.mark.asyncio
async def test_ocr_skipped_pages_total_incremented_when_engine_raises_for_a_page(fresh_metrics):
    """An engine failure on one page bumps ocr_skipped_pages_total without aborting the stream."""
    page_images = [Image.new("RGB", (100, 100)) for _ in range(2)]
    good_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                    "polygon": None, "text": "ok", "words": []}]
    stream_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }

    attempts = 0

    def extract_side_effect(*args, **kwargs):
        # Fail on the first call only; every later call succeeds.
        nonlocal attempts
        attempts += 1
        if attempts == 1:
            raise RuntimeError("synthetic engine failure")
        return good_blocks

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.surya_engine.extract_page_blocks", side_effect=extract_side_effect):
        async with ocr_client() as client:
            async with client.stream("POST", "/ocr/stream", json=stream_payload) as response:
                assert response.status_code == 200
                streamed_lines = [line async for line in response.aiter_lines()]
                saw_error = any(
                    line and '"type": "error"' in line for line in streamed_lines
                )
                assert saw_error

    assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0
    # The page after the failing one is still processed and counted.
    assert fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() == 1.0


@pytest.mark.asyncio
async def test_ocr_words_and_illegible_words_total_sum_across_blocks(fresh_metrics):
    """Word counters are totals over all blocks of the request, not per block.

    For non-Kurrent scripts the threshold defaults to THRESHOLD_DEFAULT (0.3).
    The two blocks together carry 3 words above and 2 words below that
    threshold.
    """
    def block_with(words):
        # One full-page block whose word list is built from (text, confidence) pairs.
        return {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                "polygon": None, "text": "ignored",
                "words": [{"text": word, "confidence": score} for word, score in words]}

    page_images = [Image.new("RGB", (100, 100))]
    blocks = [
        block_with([("Lieber", 0.9), ("Freund", 0.1)]),
        block_with([("Gruss", 0.8), ("verschmiert", 0.05), ("Karl", 0.95)]),
    ]

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.surya_engine.extract_blocks", return_value=blocks):
        async with ocr_client() as client:
            await _drive_ocr(client, script_type="TYPEWRITER")

    total_words = fresh_metrics.ocr_words_total._value.get()
    assert total_words == 5.0
    illegible_words = fresh_metrics.ocr_illegible_words_total._value.get()
    assert illegible_words == 2.0


def _histogram_count_sum(histogram, **labels) -> tuple[float, float]:
    """Read the observed sum and observation count for one label set of a Histogram.

    Note the order: despite the helper's name, the tuple is ``(sum, count)``,
    which is how every caller in this module unpacks it.

    Args:
        histogram: A labelled histogram exposing ``labels(**labels)``.
        **labels: Label values selecting the child to read.

    Returns:
        ``(sum_of_observations, number_of_observations)``.
    """
    child = histogram.labels(**labels)
    observed_sum = child._sum.get()
    # NOTE(review): relies on prometheus_client internals — assumes each entry
    # of `_buckets` holds that bucket's own (non-cumulative) count, so adding
    # them up gives the total number of observations. Confirm on upgrades.
    observation_count = sum(bucket.get() for bucket in child._buckets)
    return observed_sum, observation_count


@pytest.mark.asyncio
async def test_ocr_processing_seconds_histogram_observed_per_page_in_stream(fresh_metrics):
    """Streaming two pages records two ocr_processing_seconds observations for surya."""
    page_images = [Image.new("RGB", (100, 100)) for _ in range(2)]
    single_block = {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                    "polygon": None, "text": "ok", "words": []}
    stream_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.surya_engine.extract_page_blocks", return_value=[single_block]):
        async with ocr_client() as client:
            async with client.stream("POST", "/ocr/stream", json=stream_payload) as response:
                assert response.status_code == 200
                # Drain the body so the generator runs to completion.
                async for _line in response.aiter_lines():
                    continue

    # The helper returns (sum, count) in that order.
    observed = _histogram_count_sum(fresh_metrics.ocr_processing_seconds, engine="surya")
    sum_seconds, count = observed
    assert count == 2.0
    assert sum_seconds >= 0.0


@pytest.mark.asyncio
async def test_ocr_training_runs_total_incremented_with_recognition_success_label(fresh_metrics):
    """A successful /train call bumps ocr_training_runs_total{kind=recognition, outcome=success}."""
    async def stub_to_thread(func, *args, **kwargs):
        # Stands in for the worker-thread call and returns a canned result.
        return _fake_training_result()

    token = "secret-token"
    with patch("main.TRAINING_TOKEN", token), \
         patch("main._models_ready", True), \
         patch("main.asyncio.to_thread", side_effect=stub_to_thread):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            response = await client.post(
                "/train",
                files={"file": ("training.zip", _minimal_zip(), "application/zip")},
                headers={"X-Training-Token": token},
            )

    assert response.status_code == 200
    success_counter = fresh_metrics.ocr_training_runs_total.labels(
        kind="recognition", outcome="success"
    )
    assert success_counter._value.get() == 1.0


@pytest.mark.asyncio
async def test_ocr_training_runs_total_incremented_with_recognition_error_label(fresh_metrics):
    """A non-zero ketos exit bumps the error counter and surfaces as a 500.

    The seam is kept as narrow as possible: `subprocess.run` hands back a
    failing CompletedProcess rather than the asyncio.to_thread boundary being
    stubbed, so the real _run_training error path is exercised.
    """
    import subprocess

    ketos_failure = subprocess.CompletedProcess(
        args=["ketos"], returncode=1, stdout="", stderr="synthetic ketos failure"
    )
    token = "secret-token"

    with patch("main.TRAINING_TOKEN", token), \
         patch("main._models_ready", True), \
         patch("main.subprocess.run", return_value=ketos_failure):
        # raise_app_exceptions=False turns the propagated exception into a
        # response the test can inspect.
        quiet_transport = ASGITransport(app=app, raise_app_exceptions=False)
        async with AsyncClient(transport=quiet_transport, base_url="http://test") as client:
            response = await client.post(
                "/train",
                files={"file": ("training.zip", _minimal_zip(), "application/zip")},
                headers={"X-Training-Token": token},
            )

    assert response.status_code == 500
    error_counter = fresh_metrics.ocr_training_runs_total.labels(
        kind="recognition", outcome="error"
    )
    assert error_counter._value.get() == 1.0


@pytest.mark.asyncio
async def test_ocr_training_runs_total_incremented_with_segmentation_success_label(fresh_metrics):
    """A successful /segtrain call bumps ocr_training_runs_total{kind=segmentation, outcome=success}."""
    async def stub_to_thread(func, *args, **kwargs):
        # Stands in for the worker-thread call and returns a canned result.
        return _fake_training_result(accuracy=0.83)

    token = "secret-token"
    with patch("main.TRAINING_TOKEN", token), \
         patch("main._models_ready", True), \
         patch("main.asyncio.to_thread", side_effect=stub_to_thread):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            response = await client.post(
                "/segtrain",
                files={"file": ("training.zip", _minimal_zip(), "application/zip")},
                headers={"X-Training-Token": token},
            )

    assert response.status_code == 200
    success_counter = fresh_metrics.ocr_training_runs_total.labels(
        kind="segmentation", outcome="success"
    )
    assert success_counter._value.get() == 1.0


@pytest.mark.asyncio
async def test_ocr_training_runs_total_incremented_with_recognition_success_label_for_train_sender(fresh_metrics):
    """A successful /train-sender call bumps ocr_training_runs_total{kind=recognition, outcome=success}."""
    async def stub_to_thread(func, *args, **kwargs):
        # Stands in for the worker-thread call and returns a canned result.
        return _fake_training_result()

    token = "secret-token"
    with patch("main.TRAINING_TOKEN", token), \
         patch("main._models_ready", True), \
         patch("main.asyncio.to_thread", side_effect=stub_to_thread):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            response = await client.post(
                "/train-sender",
                files={"file": ("training.zip", _minimal_zip(), "application/zip")},
                data={"output_model_path": "/app/models/sender_test.mlmodel"},
                headers={"X-Training-Token": token},
            )

    assert response.status_code == 200, response.text
    success_counter = fresh_metrics.ocr_training_runs_total.labels(
        kind="recognition", outcome="success"
    )
    assert success_counter._value.get() == 1.0


@pytest.mark.asyncio
async def test_ocr_model_accuracy_gauge_stays_default_when_training_returns_no_accuracy(fresh_metrics):
    """A training result with accuracy=None must leave ocr_model_accuracy at its default 0."""
    result_without_accuracy = {"loss": None, "accuracy": None, "cer": None, "epochs": 5}

    async def stub_to_thread(func, *args, **kwargs):
        return result_without_accuracy

    token = "secret-token"
    with patch("main.TRAINING_TOKEN", token), \
         patch("main._models_ready", True), \
         patch("main.asyncio.to_thread", side_effect=stub_to_thread):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            response = await client.post(
                "/train",
                files={"file": ("training.zip", _minimal_zip(), "application/zip")},
                headers={"X-Training-Token": token},
            )

    assert response.status_code == 200
    # No .set() happened; merely looking up the label child creates it at 0.0.
    recognition_gauge = fresh_metrics.ocr_model_accuracy.labels(kind="recognition")
    assert recognition_gauge._value.get() == 0.0


@pytest.mark.asyncio
async def test_ocr_model_accuracy_gauge_set_per_kind_after_successful_training(fresh_metrics):
    """Successful /train and /segtrain runs each publish their accuracy under their own kind label."""
    recognition_accuracy = 0.917
    segmentation_accuracy = 0.834

    async def stub_recognition(func, *args, **kwargs):
        return _fake_training_result(accuracy=recognition_accuracy)

    async def stub_segmentation(func, *args, **kwargs):
        return _fake_training_result(accuracy=segmentation_accuracy)

    token = "secret-token"
    auth_headers = {"X-Training-Token": token}

    with patch("main.TRAINING_TOKEN", token), \
         patch("main._models_ready", True):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            # Each endpoint gets its own to_thread stub so the two runs report
            # different accuracies.
            with patch("main.asyncio.to_thread", side_effect=stub_recognition):
                rec_resp = await client.post(
                    "/train",
                    files={"file": ("training.zip", _minimal_zip(), "application/zip")},
                    headers=auth_headers,
                )
            assert rec_resp.status_code == 200
            with patch("main.asyncio.to_thread", side_effect=stub_segmentation):
                seg_resp = await client.post(
                    "/segtrain",
                    files={"file": ("training.zip", _minimal_zip(), "application/zip")},
                    headers=auth_headers,
                )
            assert seg_resp.status_code == 200

    accuracy_gauge = fresh_metrics.ocr_model_accuracy
    recognition_value = accuracy_gauge.labels(kind="recognition")._value.get()
    assert recognition_value == pytest.approx(recognition_accuracy)
    segmentation_value = accuracy_gauge.labels(kind="segmentation")._value.get()
    assert segmentation_value == pytest.approx(segmentation_accuracy)


def test_ocr_models_ready_gauge_defaults_to_zero():
    """Before any lifespan has run, a newly built container reports ocr_models_ready=0."""
    untouched = build_metrics(CollectorRegistry())
    ready_value = untouched.ocr_models_ready._value.get()
    assert ready_value == 0.0


@pytest.mark.asyncio
async def test_ocr_models_ready_gauge_is_one_after_lifespan_startup(fresh_metrics):
    """Once load_models / load_spell_checker return, the lifespan sets ocr_models_ready to 1.

    ASGITransport does not run the lifespan by default, so the lifespan context
    manager is entered directly to exercise the startup code path.
    """
    def ready_value():
        return fresh_metrics.ocr_models_ready._value.get()

    assert ready_value() == 0.0
    with patch("main.kraken_engine.load_models"), \
         patch("main.load_spell_checker"):
        async with app.router.lifespan_context(app):
            assert ready_value() == 1.0


@pytest.mark.asyncio
async def test_ocr_processing_seconds_histogram_observed_per_page_in_guided_stream(fresh_metrics):
    """A guided stream over two pages records two ocr_processing_seconds observations."""
    page_images = [Image.new("RGB", (100, 100)) for _ in range(2)]
    # One region per page, so each page is visited exactly once.
    first_region = {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"}
    second_region = {"pageNumber": 2, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "annotationId": "a2"}
    stream_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
        "regions": [first_region, second_region],
    }

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.surya_engine.extract_region_text", return_value="text"):
        async with ocr_client() as client:
            async with client.stream("POST", "/ocr/stream", json=stream_payload) as response:
                assert response.status_code == 200
                # Drain the body so the generator runs to completion.
                async for _line in response.aiter_lines():
                    continue

    # The helper returns (sum, count) in that order.
    observed = _histogram_count_sum(fresh_metrics.ocr_processing_seconds, engine="surya")
    sum_seconds, count = observed
    assert count == 2.0
    assert sum_seconds >= 0.0


@pytest.mark.asyncio
async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_guided_stream(fresh_metrics):
    """The guided-stream timer must cover engine work only, never the spell-check pass.

    A wall-clock bound is used instead of a structural
    `patch("main.time.monotonic")`: that attribute is the *global*
    `time.monotonic`, which httpx and asyncio consume too, so they would use up
    a deterministic sequence before the request reached the engine loop. The
    bound targets the failure mode rather than the noise floor: the stubbed
    spell-check sleeps 0.05s for each of the 2 regions (0.1s total), so a timer
    that wrongly wrapped `correct_text` would observe >= 0.1s. A 0.09s ceiling
    catches that while leaving ~90ms of slack for slow CI runners, since the
    mocked engine work is effectively instantaneous.
    """
    page_images = [Image.new("RGB", (100, 100))]
    # Two regions on the same page, hence two spell-check calls.
    left_region = {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"}
    right_region = {"pageNumber": 1, "x": 0.5, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a2"}
    stream_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "HANDWRITING_KURRENT",
        "language": "de",
        "regions": [left_region, right_region],
    }

    def slow_correct(text):
        from time import sleep
        sleep(0.05)
        return text

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.kraken_engine.is_available", return_value=True), \
         patch("main.kraken_engine.extract_region_text", return_value="text"), \
         patch("main.correct_text", side_effect=slow_correct):
        async with ocr_client() as client:
            async with client.stream("POST", "/ocr/stream", json=stream_payload) as response:
                assert response.status_code == 200
                # Drain the body so the generator runs to completion.
                async for _line in response.aiter_lines():
                    continue

    # The helper returns (sum, count); only the sum matters here.
    observed = _histogram_count_sum(fresh_metrics.ocr_processing_seconds, engine="kraken")
    sum_seconds = observed[0]
    assert sum_seconds < 0.09, f"timing must exclude spell-check; got sum={sum_seconds}"


@pytest.mark.asyncio
async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream(fresh_metrics):
    """A failing `_download_and_convert_pdf` must leave ocr_jobs_total untouched.

    Same semantics as the /ocr endpoint: the counter records only jobs that
    actually began OCR work, not downloads that failed beforehand.
    """
    async def fail_download(url):
        raise RuntimeError("synthetic download failure")

    stream_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }

    with patch("main._download_and_convert_pdf", new=fail_download):
        # Exceptions are not re-raised so the 500 response can be inspected.
        async with ocr_client(raise_app_exceptions=False) as client:
            response = await client.post("/ocr/stream", json=stream_payload)

    assert response.status_code == 500
    surya_jobs = fresh_metrics.ocr_jobs_total.labels(
        engine="surya", script_type="TYPEWRITER"
    )
    assert surya_jobs._value.get() == 0.0


def test_uvicorn_access_log_filter_fails_open_on_short_or_missing_args():
    """Records whose args are None or too short must pass through the filter.

    Pins the fail-open behavior: should uvicorn change its log format, records
    keep reaching the handler instead of being silently dropped.
    """
    import logging as _logging
    from main import MetricsPathFilter

    filt = MetricsPathFilter()
    # (msg, args) pairs: first without any args, then with fewer than expected.
    malformed_cases = (
        ("some message", None),
        ("%s %s", ("a", "b")),
    )
    for message, record_args in malformed_cases:
        record = _logging.LogRecord(
            name="uvicorn.access", level=_logging.INFO, pathname="", lineno=0,
            msg=message, args=record_args, exc_info=None,
        )
        assert filt.filter(record) is True


def test_uvicorn_access_log_filter_skips_metrics_path():
    """The MetricsPathFilter drops uvicorn.access log records that target /metrics.

    Records for any other path (here /health and /ocr) must be kept, so that
    only scrape traffic is removed from the access log.
    """
    import logging as _logging
    from main import MetricsPathFilter

    def access_record(method, path):
        # Build a record with the same msg/args layout as the access-log
        # records used throughout this module's filter tests.
        return _logging.LogRecord(
            name="uvicorn.access", level=_logging.INFO, pathname="", lineno=0,
            msg='%s - "%s %s HTTP/%s" %d',
            args=("127.0.0.1:1234", method, path, "1.1", 200),
            exc_info=None,
        )

    filt = MetricsPathFilter()

    assert filt.filter(access_record("GET", "/metrics")) is False
    # NOTE(review): previously asserted `is False`, which contradicted this
    # test's name and docstring; confirm MetricsPathFilter keeps /health.
    assert filt.filter(access_record("GET", "/health")) is True
    assert filt.filter(access_record("POST", "/ocr")) is True