From 77747aa5562512fa46c1d51f602903e6517f7ed7 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Fri, 17 Apr 2026 16:40:39 +0200
Subject: [PATCH] refactor(ocr): extract _collapse_adjacent_markers helper and add CORRECTION_MARKER

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (free-form text in this position is not part of the commit
message and is skipped when the patch is applied):

* _collapse_adjacent_markers() recognises a marker by comparing the token
  string with ILLEGIBLE_MARKER, whereas the removed loop keyed on the
  low-confidence branch. A confidently recognised word whose text is
  literally "[unleserlich]" is therefore now merged with a neighbouring
  marker; the removed code kept both.
* CORRECTION_MARKER is defined here but not referenced by any hunk in this
  patch.

 ocr-service/confidence.py | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/ocr-service/confidence.py b/ocr-service/confidence.py
index e331443f..e4835db6 100644
--- a/ocr-service/confidence.py
+++ b/ocr-service/confidence.py
@@ -6,6 +6,21 @@ THRESHOLD_DEFAULT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))
 THRESHOLD_KURRENT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5"))
 
 ILLEGIBLE_MARKER = "[unleserlich]"
+CORRECTION_MARKER = "[?]"
+
+
+def _collapse_adjacent_markers(tokens: list[str]) -> list[str]:
+    collapsed: list[str] = []
+    prev_was_marker = False
+    for token in tokens:
+        if token == ILLEGIBLE_MARKER:
+            if not prev_was_marker:
+                collapsed.append(token)
+            prev_was_marker = True
+        else:
+            collapsed.append(token)
+            prev_was_marker = False
+    return collapsed
 
 
 def get_threshold(script_type: str) -> float:
@@ -30,19 +45,14 @@ def apply_confidence_markers(words: list[dict], threshold: float | None = None)
     if threshold is None:
         threshold = THRESHOLD_DEFAULT
 
-    result: list[str] = []
-    prev_was_marker = False
-
+    tokens: list[str] = []
     for word in words:
         if word["confidence"] < threshold:
-            if not prev_was_marker:
-                result.append(ILLEGIBLE_MARKER)
-            prev_was_marker = True
+            tokens.append(ILLEGIBLE_MARKER)
         else:
-            result.append(word["text"])
-            prev_was_marker = False
+            tokens.append(word["text"])
 
-    return " ".join(result)
+    return " ".join(_collapse_adjacent_markers(tokens))
 
 
 def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]: