feat(ocr): auto-insert [unleserlich] markers for low-confidence words
New confidence.py module with two functions:
- apply_confidence_markers(): replaces words below the threshold with [unleserlich] and collapses adjacent markers into one
- words_from_characters(): reconstructs word-level confidence from Kraken's character-level data

Surya 0.17 provides native word-level confidence via line.words. Kraken 7.0 provides per-character confidences via record.confidences. Both engines now pass word+confidence data through main.py, which applies the marker post-processing before returning the API response.

The threshold is configurable via the OCR_CONFIDENCE_THRESHOLD env var (default 0.3). The frontend already renders [unleserlich] markers via transcriptionMarkers.ts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -37,6 +37,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||
Coordinates are normalized to [0, 1].
|
||||
"""
|
||||
from kraken import blla, rpred
|
||||
from confidence import words_from_characters
|
||||
|
||||
if _model is None:
|
||||
raise RuntimeError("Kraken model is not loaded")
|
||||
@@ -73,6 +74,10 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||
# Approximate polygon to quadrilateral
|
||||
quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
|
||||
|
||||
# Extract word-level confidence for [unleserlich] marking
|
||||
char_confidences = getattr(record, "confidences", [])
|
||||
words = words_from_characters(record.prediction, char_confidences)
|
||||
|
||||
all_blocks.append({
|
||||
"pageNumber": page_idx,
|
||||
"x": x1 / page_w,
|
||||
@@ -81,6 +86,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||
"height": (y2 - y1) / page_h,
|
||||
"polygon": quad,
|
||||
"text": record.prediction,
|
||||
"words": words,
|
||||
})
|
||||
|
||||
return all_blocks
|
||||
|
||||
@@ -51,6 +51,17 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||
for p in line.polygon
|
||||
]
|
||||
|
||||
# Extract word-level confidence for [unleserlich] marking
|
||||
words = []
|
||||
if hasattr(line, "words") and line.words:
|
||||
for word in line.words:
|
||||
words.append({
|
||||
"text": word.text,
|
||||
"confidence": word.confidence,
|
||||
})
|
||||
else:
|
||||
words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
|
||||
|
||||
all_blocks.append({
|
||||
"pageNumber": page_idx,
|
||||
"x": x1 / page_w,
|
||||
@@ -59,6 +70,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||
"height": (y2 - y1) / page_h,
|
||||
"polygon": polygon,
|
||||
"text": line.text,
|
||||
"words": words,
|
||||
})
|
||||
|
||||
return all_blocks
|
||||
|
||||
Reference in New Issue
Block a user