refactor(ocr): extract _collapse_adjacent_markers helper and add CORRECTION_MARKER
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,21 @@ THRESHOLD_DEFAULT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))
|
|||||||
THRESHOLD_KURRENT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5"))
|
THRESHOLD_KURRENT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5"))
|
||||||
|
|
||||||
ILLEGIBLE_MARKER = "[unleserlich]"
|
ILLEGIBLE_MARKER = "[unleserlich]"
|
||||||
|
CORRECTION_MARKER = "[?]"
|
||||||
|
|
||||||
|
|
||||||
|
def _collapse_adjacent_markers(tokens: list[str]) -> list[str]:
    """Return a new list where each run of consecutive ILLEGIBLE_MARKER tokens is reduced to one.

    Every other token is passed through unchanged and in its original order;
    the input list is not modified.
    """
    output: list[str] = []
    for item in tokens:
        # A marker is redundant only when the token emitted just before it
        # is itself a marker; anything else resets the run.
        is_repeat = (
            item == ILLEGIBLE_MARKER
            and bool(output)
            and output[-1] == ILLEGIBLE_MARKER
        )
        if is_repeat:
            continue
        output.append(item)
    return output
|
||||||
|
|
||||||
|
|
||||||
def get_threshold(script_type: str) -> float:
|
def get_threshold(script_type: str) -> float:
|
||||||
@@ -30,19 +45,14 @@ def apply_confidence_markers(words: list[dict], threshold: float | None = None)
|
|||||||
if threshold is None:
|
if threshold is None:
|
||||||
threshold = THRESHOLD_DEFAULT
|
threshold = THRESHOLD_DEFAULT
|
||||||
|
|
||||||
result: list[str] = []
|
tokens: list[str] = []
|
||||||
prev_was_marker = False
|
|
||||||
|
|
||||||
for word in words:
|
for word in words:
|
||||||
if word["confidence"] < threshold:
|
if word["confidence"] < threshold:
|
||||||
if not prev_was_marker:
|
tokens.append(ILLEGIBLE_MARKER)
|
||||||
result.append(ILLEGIBLE_MARKER)
|
|
||||||
prev_was_marker = True
|
|
||||||
else:
|
else:
|
||||||
result.append(word["text"])
|
tokens.append(word["text"])
|
||||||
prev_was_marker = False
|
|
||||||
|
|
||||||
return " ".join(result)
|
return " ".join(_collapse_adjacent_markers(tokens))
|
||||||
|
|
||||||
|
|
||||||
def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
|
def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
|
||||||
|
|||||||
Reference in New Issue
Block a user