feat(ocr): auto-insert [unleserlich] markers for low-confidence words
New confidence.py module with two functions:
- apply_confidence_markers(): replaces words below the threshold with [unleserlich] and collapses adjacent markers into one
- words_from_characters(): reconstructs word-level confidence from Kraken's character-level data

Surya 0.17 provides native word-level confidence via line.words. Kraken 7.0 provides per-character confidences via record.confidences. Both engines now pass word+confidence data through main.py, which applies the marker post-processing before returning the API response.

Threshold configurable via OCR_CONFIDENCE_THRESHOLD env var (default 0.3). Frontend already renders [unleserlich] markers via transcriptionMarkers.ts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -84,6 +84,7 @@ services:
|
|||||||
- ocr_models:/app/models
|
- ocr_models:/app/models
|
||||||
environment:
|
environment:
|
||||||
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
||||||
|
OCR_CONFIDENCE_THRESHOLD: "0.3"
|
||||||
networks:
|
networks:
|
||||||
- archive-net
|
- archive-net
|
||||||
healthcheck:
|
healthcheck:
|
||||||
|
|||||||
79
ocr-service/confidence.py
Normal file
79
ocr-service/confidence.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
"""Confidence-based [unleserlich] marker insertion for OCR output."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Words whose confidence is strictly below this value are replaced by the
# marker. Read once at import time from OCR_CONFIDENCE_THRESHOLD (default 0.3).
CONFIDENCE_THRESHOLD = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))

# Placeholder emitted in place of words that could not be read reliably.
ILLEGIBLE_MARKER = "[unleserlich]"


def apply_confidence_markers(
    words: list[dict], *, threshold: "float | None" = None
) -> str:
    """Replace low-confidence words with [unleserlich], collapsing adjacent markers.

    Args:
        words: list of {"text": str, "confidence": float} dicts. An entry
            whose confidence is missing or None is treated as legible and
            its text is kept.
        threshold: optional per-call override; words strictly below it are
            replaced. Defaults to the module-level CONFIDENCE_THRESHOLD.

    Returns:
        The words joined by single spaces, with each run of consecutive
        low-confidence words replaced by one [unleserlich] marker. An empty
        input yields an empty string.
    """
    limit = CONFIDENCE_THRESHOLD if threshold is None else threshold

    pieces: list[str] = []
    prev_was_marker = False
    for word in words:
        confidence = word.get("confidence")
        # Unknown confidence is not evidence of illegibility: keep the text.
        if confidence is not None and confidence < limit:
            # Only the first word of a low-confidence run emits a marker.
            if not prev_was_marker:
                pieces.append(ILLEGIBLE_MARKER)
            prev_was_marker = True
        else:
            pieces.append(word["text"])
            prev_was_marker = False

    return " ".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
    """Reconstruct word-level confidence from character-level data.

    Splits prediction on whitespace, maps characters to their confidences,
    and computes the mean confidence per word. The confidences of the
    whitespace characters themselves are discarded.

    Args:
        prediction: full line text from Kraken
        confidences: per-character confidence list (same length as prediction)

    Returns:
        list of {"text": str, "confidence": float} dicts. An empty or
        whitespace-only prediction yields an empty list. If the two inputs
        differ in length, the whole prediction is returned as a single word
        with confidence 1.0 so that it is never marked illegible.
    """
    if not prediction or not prediction.strip():
        return []

    # Without a one-to-one mapping the confidences cannot be attributed to
    # characters; fall back to passing the line through untouched.
    if len(confidences) != len(prediction):
        return [{"text": prediction, "confidence": 1.0}]

    words: list[dict] = []
    chars: list[str] = []
    scores: list[float] = []

    def flush() -> None:
        """Emit the word accumulated so far, if any, and reset the buffers."""
        if chars:
            words.append({
                "text": "".join(chars),
                "confidence": sum(scores) / len(scores),
            })
            chars.clear()
            scores.clear()

    for char, conf in zip(prediction, confidences):
        # Any whitespace (not only U+0020) terminates the current word.
        if char.isspace():
            flush()
        else:
            chars.append(char)
            scores.append(conf)

    # The last word has no trailing separator to trigger a flush.
    flush()

    return words
|
||||||
@@ -37,6 +37,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
|||||||
Coordinates are normalized to [0, 1].
|
Coordinates are normalized to [0, 1].
|
||||||
"""
|
"""
|
||||||
from kraken import blla, rpred
|
from kraken import blla, rpred
|
||||||
|
from confidence import words_from_characters
|
||||||
|
|
||||||
if _model is None:
|
if _model is None:
|
||||||
raise RuntimeError("Kraken model is not loaded")
|
raise RuntimeError("Kraken model is not loaded")
|
||||||
@@ -73,6 +74,10 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
|||||||
# Approximate polygon to quadrilateral
|
# Approximate polygon to quadrilateral
|
||||||
quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
|
quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
|
||||||
|
|
||||||
|
# Extract word-level confidence for [unleserlich] marking
|
||||||
|
char_confidences = getattr(record, "confidences", [])
|
||||||
|
words = words_from_characters(record.prediction, char_confidences)
|
||||||
|
|
||||||
all_blocks.append({
|
all_blocks.append({
|
||||||
"pageNumber": page_idx,
|
"pageNumber": page_idx,
|
||||||
"x": x1 / page_w,
|
"x": x1 / page_w,
|
||||||
@@ -81,6 +86,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
|||||||
"height": (y2 - y1) / page_h,
|
"height": (y2 - y1) / page_h,
|
||||||
"polygon": quad,
|
"polygon": quad,
|
||||||
"text": record.prediction,
|
"text": record.prediction,
|
||||||
|
"words": words,
|
||||||
})
|
})
|
||||||
|
|
||||||
return all_blocks
|
return all_blocks
|
||||||
|
|||||||
@@ -51,6 +51,17 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
|||||||
for p in line.polygon
|
for p in line.polygon
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Extract word-level confidence for [unleserlich] marking
|
||||||
|
words = []
|
||||||
|
if hasattr(line, "words") and line.words:
|
||||||
|
for word in line.words:
|
||||||
|
words.append({
|
||||||
|
"text": word.text,
|
||||||
|
"confidence": word.confidence,
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
|
||||||
|
|
||||||
all_blocks.append({
|
all_blocks.append({
|
||||||
"pageNumber": page_idx,
|
"pageNumber": page_idx,
|
||||||
"x": x1 / page_w,
|
"x": x1 / page_w,
|
||||||
@@ -59,6 +70,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
|||||||
"height": (y2 - y1) / page_h,
|
"height": (y2 - y1) / page_h,
|
||||||
"polygon": polygon,
|
"polygon": polygon,
|
||||||
"text": line.text,
|
"text": line.text,
|
||||||
|
"words": words,
|
||||||
})
|
})
|
||||||
|
|
||||||
return all_blocks
|
return all_blocks
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import pypdfium2 as pdfium
|
|||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from confidence import apply_confidence_markers
|
||||||
from engines import kraken as kraken_engine
|
from engines import kraken as kraken_engine
|
||||||
from engines import surya as surya_engine
|
from engines import surya as surya_engine
|
||||||
from models import OcrBlock, OcrRequest
|
from models import OcrBlock, OcrRequest
|
||||||
@@ -71,6 +72,11 @@ async def run_ocr(request: OcrRequest):
|
|||||||
# TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
|
# TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
|
||||||
blocks = surya_engine.extract_blocks(images, request.language)
|
blocks = surya_engine.extract_blocks(images, request.language)
|
||||||
|
|
||||||
|
for block in blocks:
|
||||||
|
if block.get("words"):
|
||||||
|
block["text"] = apply_confidence_markers(block["words"])
|
||||||
|
block.pop("words", None)
|
||||||
|
|
||||||
return [OcrBlock(**b) for b in blocks]
|
return [OcrBlock(**b) for b in blocks]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
153
ocr-service/test_confidence.py
Normal file
153
ocr-service/test_confidence.py
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
"""Tests for confidence-based [unleserlich] marker insertion."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from confidence import apply_confidence_markers, words_from_characters
|
||||||
|
|
||||||
|
|
||||||
|
# ─── apply_confidence_markers ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_words_above_threshold_passes_through():
    """Words that all clear the threshold are joined unchanged."""
    scored = [("Lieber", 0.95), ("Freund", 0.88)]
    words = [{"text": text, "confidence": score} for text, score in scored]
    expected = "Lieber Freund"
    assert apply_confidence_markers(words) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_low_confidence_word_replaced():
    """A lone low-confidence word between confident ones becomes one marker."""
    scored = [("Lieber", 0.95), ("xkqz", 0.1), ("Freund", 0.88)]
    words = [{"text": text, "confidence": score} for text, score in scored]
    expected = "Lieber [unleserlich] Freund"
    assert apply_confidence_markers(words) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_adjacent_low_confidence_words_collapsed():
    """Consecutive low-confidence words share a single marker."""
    scored = [
        ("Lieber", 0.95),
        ("xkqz", 0.1),
        ("abc", 0.05),
        ("yyy", 0.2),
        ("Freund", 0.88),
    ]
    words = [{"text": text, "confidence": score} for text, score in scored]
    expected = "Lieber [unleserlich] Freund"
    assert apply_confidence_markers(words) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_mixed_high_low_each_group_gets_marker():
    """Separate low-confidence runs each produce their own marker."""
    scored = [
        ("Lieber", 0.95),
        ("xkqz", 0.1),
        ("wie", 0.9),
        ("abc", 0.05),
        ("dir", 0.88),
    ]
    words = [{"text": text, "confidence": score} for text, score in scored]
    expected = "Lieber [unleserlich] wie [unleserlich] dir"
    assert apply_confidence_markers(words) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_below_threshold_returns_single_marker():
    """A line made only of low-confidence words collapses to one marker."""
    scored = [("xkq", 0.1), ("zzz", 0.05)]
    words = [{"text": text, "confidence": score} for text, score in scored]
    expected = "[unleserlich]"
    assert apply_confidence_markers(words) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_list_returns_empty_string():
    """No words in means an empty string out."""
    result = apply_confidence_markers([])
    assert result == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_word_above_threshold():
    """A single confident word is returned as-is."""
    only_word = {"text": "Hallo", "confidence": 0.9}
    result = apply_confidence_markers([only_word])
    assert result == "Hallo"
|
||||||
|
|
||||||
|
|
||||||
|
def test_exact_threshold_passes_through():
    """Confidence exactly at threshold should NOT be replaced (strict <)."""
    at_threshold = {"text": "Wort", "confidence": 0.3}
    result = apply_confidence_markers([at_threshold])
    assert result == "Wort"
|
||||||
|
|
||||||
|
|
||||||
|
def test_just_below_threshold_replaced():
    """A word with confidence 0.29 is replaced by the marker."""
    below_threshold = {"text": "Wort", "confidence": 0.29}
    result = apply_confidence_markers([below_threshold])
    assert result == "[unleserlich]"
|
||||||
|
|
||||||
|
|
||||||
|
def test_custom_threshold_via_env(monkeypatch):
    """A threshold set through OCR_CONFIDENCE_THRESHOLD is used after a reload."""
    import importlib

    import confidence

    monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.8")
    # Need to reload the module to pick up the new env var
    importlib.reload(confidence)

    try:
        words = [
            {"text": "Lieber", "confidence": 0.95},
            {"text": "Freund", "confidence": 0.5},
        ]
        assert confidence.apply_confidence_markers(words) == "Lieber [unleserlich]"
    finally:
        # Reset even when the assertion fails; otherwise the reloaded module
        # keeps the 0.8 threshold and the remaining tests run against it.
        monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.3")
        importlib.reload(confidence)
|
||||||
|
|
||||||
|
|
||||||
|
def test_low_confidence_at_start():
    """A low-confidence first word is replaced without a leading space."""
    scored = [("xkq", 0.1), ("Freund", 0.88)]
    words = [{"text": text, "confidence": score} for text, score in scored]
    expected = "[unleserlich] Freund"
    assert apply_confidence_markers(words) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_low_confidence_at_end():
    """A low-confidence last word is replaced without a trailing space."""
    scored = [("Lieber", 0.95), ("xkq", 0.1)]
    words = [{"text": text, "confidence": score} for text, score in scored]
    expected = "Lieber [unleserlich]"
    assert apply_confidence_markers(words) == expected
|
||||||
|
|
||||||
|
|
||||||
|
# ─── words_from_characters ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_word_matching_confidences():
    """One word yields one entry whose confidence is the character mean."""
    char_scores = [0.9, 0.8, 0.85, 0.7, 0.95]
    words = words_from_characters("Hallo", char_scores)
    assert len(words) == 1
    only = words[0]
    assert only["text"] == "Hallo"
    deviation = abs(only["confidence"] - 0.84)
    assert deviation < 0.01
|
||||||
|
|
||||||
|
|
||||||
|
def test_multi_word_with_spaces():
    """Two words are split on the space and each gets its own mean confidence."""
    prediction = "Sehr geehrter"
    confidences = [0.9, 0.8, 0.7, 0.6, 0.5, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]
    words = words_from_characters(prediction, confidences)
    assert len(words) == 2
    assert words[0]["text"] == "Sehr"
    assert words[1]["text"] == "geehrter"
    # The separator's confidence (0.5 at index 4) must not enter either mean.
    assert abs(words[0]["confidence"] - 0.75) < 1e-9
    assert abs(words[1]["confidence"] - 0.55) < 1e-9
|
||||||
|
|
||||||
|
|
||||||
|
def test_length_mismatch_falls_back_safely():
    """Mismatched lengths return the whole line as one fully confident word."""
    too_few_scores = [0.9, 0.8]
    words = words_from_characters("Hallo Welt", too_few_scores)
    assert len(words) == 1
    fallback = words[0]
    assert fallback["text"] == "Hallo Welt"
    assert fallback["confidence"] == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_prediction_returns_empty():
    """An empty prediction produces no words."""
    result = words_from_characters("", [])
    assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_character_word():
    """Single-character words keep their own character confidence."""
    words = words_from_characters("A B", [0.9, 0.5, 0.3])
    assert len(words) == 2
    first = words[0]
    second = words[1]
    assert first["text"] == "A"
    assert first["confidence"] == 0.9
    assert second["text"] == "B"
    assert second["confidence"] == 0.3
|
||||||
|
|
||||||
|
|
||||||
|
def test_whitespace_only_prediction():
    """A prediction made only of spaces produces no words."""
    # Three spaces, so the prediction matches its three confidences.
    words = words_from_characters("   ", [0.5, 0.5, 0.5])
    assert words == []
|
||||||
Reference in New Issue
Block a user