feat(ocr): auto-insert [unleserlich] markers for low-confidence words
New confidence.py module with two functions:

- apply_confidence_markers(): replaces words below threshold with
  [unleserlich], collapses adjacent markers into one
- words_from_characters(): reconstructs word-level confidence from
  Kraken's character-level data

Surya 0.17 provides native word-level confidence via line.words.
Kraken 7.0 provides per-character confidences via record.confidences.
Both engines now pass word+confidence data through main.py, which
applies the marker post-processing before returning the API response.

Threshold configurable via OCR_CONFIDENCE_THRESHOLD env var (default 0.3).
Frontend already renders [unleserlich] markers via transcriptionMarkers.ts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
79
ocr-service/confidence.py
Normal file
79
ocr-service/confidence.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""Confidence-based [unleserlich] marker insertion for OCR output."""

import os

# Confidence cutoff below which a word is treated as illegible.
# Overridable via the OCR_CONFIDENCE_THRESHOLD env var (default 0.3);
# read once at import time, so env changes after import have no effect.
CONFIDENCE_THRESHOLD = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))

# Marker substituted for low-confidence words ("unleserlich" = German for
# "illegible"). Per the commit notes, the frontend renders this token
# specially via transcriptionMarkers.ts — keep the exact spelling.
ILLEGIBLE_MARKER = "[unleserlich]"
|
||||
|
||||
|
||||
def apply_confidence_markers(words: list[dict], threshold: float | None = None) -> str:
    """Replace low-confidence words with [unleserlich], collapsing adjacent markers.

    Args:
        words: list of {"text": str, "confidence": float} dicts
        threshold: confidence cutoff below which a word is replaced.
            Defaults to the module-level CONFIDENCE_THRESHOLD (configured
            via the OCR_CONFIDENCE_THRESHOLD env var, 0.3 if unset).

    Returns:
        Reconstructed text string with [unleserlich] substitutions.
    """
    # Resolve the default lazily so callers can override per call without
    # touching the environment, while existing callers keep old behavior.
    if threshold is None:
        threshold = CONFIDENCE_THRESHOLD

    if not words:
        return ""

    result: list[str] = []
    prev_was_marker = False

    for word in words:
        if word["confidence"] < threshold:
            # Collapse a run of consecutive low-confidence words into a
            # single marker so the output stays readable.
            if not prev_was_marker:
                result.append(ILLEGIBLE_MARKER)
            prev_was_marker = True
        else:
            result.append(word["text"])
            prev_was_marker = False

    return " ".join(result)
|
||||
|
||||
|
||||
def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
    """Reconstruct word-level confidence from character-level data.

    Splits prediction on whitespace, maps characters to their confidences,
    computes mean confidence per word.

    Args:
        prediction: full line text from Kraken
        confidences: per-character confidence list (same length as prediction)

    Returns:
        list of {"text": str, "confidence": float} dicts
    """
    if not prediction or not prediction.strip():
        return []

    # Defensive fallback: if Kraken's confidence list does not line up with
    # the text we cannot attribute confidences to characters — return the
    # whole line as a single fully-confident "word" rather than guessing.
    if len(confidences) != len(prediction):
        return [{"text": prediction, "confidence": 1.0}]

    result: list[dict] = []
    current_word: list[str] = []
    current_confs: list[float] = []

    def _flush() -> None:
        # Emit the accumulated word (if any) with its mean confidence.
        if current_word:
            result.append({
                "text": "".join(current_word),
                "confidence": sum(current_confs) / len(current_confs),
            })
            current_word.clear()
            current_confs.clear()

    for char, conf in zip(prediction, confidences):
        # Split on ANY whitespace (space, tab, NBSP, ...), matching the
        # docstring; the previous implementation only handled plain " ",
        # so tab-joined words were silently merged into one.
        if char.isspace():
            _flush()
        else:
            current_word.append(char)
            current_confs.append(conf)

    _flush()  # trailing word (no closing whitespace)
    return result
|
||||
Reference in New Issue
Block a user