feat(ocr): time only engine work in guided stream histogram
Previously the guided generator's page_started timer wrapped the entire region loop, including the synchronous correct_text() call, inflating ocr_processing_seconds with spell-check latency. Sum the per-region engine.extract_region_text durations instead so the histogram matches the unguided stream's "engine only" semantics. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -217,13 +217,15 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
image = await asyncio.to_thread(preprocess_page, image)
|
||||
blocks = []
|
||||
sender_path = request.senderModelPath if use_kraken else None
|
||||
page_started = time.monotonic()
|
||||
engine_seconds = 0.0
|
||||
for region in page_regions:
|
||||
region_started = time.monotonic()
|
||||
text = await asyncio.to_thread(
|
||||
engine.extract_region_text, image,
|
||||
region.x, region.y, region.width, region.height,
|
||||
sender_path,
|
||||
)
|
||||
engine_seconds += time.monotonic() - region_started
|
||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||
text = correct_text(text)
|
||||
blocks.append({
|
||||
@@ -238,7 +240,7 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
})
|
||||
|
||||
metrics.ocr_processing_seconds.labels(engine=engine_name).observe(
|
||||
time.monotonic() - page_started
|
||||
engine_seconds
|
||||
)
|
||||
total_blocks += len(blocks)
|
||||
metrics.ocr_pages_total.labels(engine=engine_name).inc()
|
||||
|
||||
Reference in New Issue
Block a user