feat(ocr): German spell-check post-processing to reduce handwriting gibberish #260

Merged
marcel merged 10 commits from feat/issue-254-german-spell-check into main 2026-04-17 17:28:41 +02:00
Showing only changes of commit 77747aa556 - Show all commits

View File

@@ -6,6 +6,21 @@ THRESHOLD_DEFAULT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))
THRESHOLD_KURRENT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5"))
ILLEGIBLE_MARKER = "[unleserlich]"
CORRECTION_MARKER = "[?]"
def _collapse_adjacent_markers(tokens: list[str]) -> list[str]:
    """Return *tokens* with each run of illegible markers reduced to one.

    Consecutive occurrences of ``ILLEGIBLE_MARKER`` are squeezed into a
    single marker; every other token is passed through unchanged and in
    its original order. The input list is not modified.
    """
    squeezed: list[str] = []
    for item in tokens:
        # Skip a marker only when the token kept just before it is also a
        # marker; the first marker of a run is always kept.
        is_repeat = (
            item == ILLEGIBLE_MARKER
            and bool(squeezed)
            and squeezed[-1] == ILLEGIBLE_MARKER
        )
        if is_repeat:
            continue
        squeezed.append(item)
    return squeezed
def get_threshold(script_type: str) -> float:
@@ -30,19 +45,14 @@ def apply_confidence_markers(words: list[dict], threshold: float | None = None)
if threshold is None:
threshold = THRESHOLD_DEFAULT
result: list[str] = []
prev_was_marker = False
tokens: list[str] = []
for word in words:
if word["confidence"] < threshold:
if not prev_was_marker:
result.append(ILLEGIBLE_MARKER)
prev_was_marker = True
tokens.append(ILLEGIBLE_MARKER)
else:
result.append(word["text"])
prev_was_marker = False
tokens.append(word["text"])
return " ".join(result)
return " ".join(_collapse_adjacent_markers(tokens))
def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]: