feat(ocr): German spell-check post-processing to reduce handwriting gibberish #260
@@ -9,7 +9,7 @@ ILLEGIBLE_MARKER = "[unleserlich]"
|
||||
CORRECTION_MARKER = "[?]"
|
||||
|
||||
|
||||
-def _collapse_adjacent_markers(tokens: list[str]) -> list[str]:
+def collapse_adjacent_markers(tokens: list[str]) -> list[str]:
|
||||
collapsed: list[str] = []
|
||||
prev_was_marker = False
|
||||
for token in tokens:
|
||||
@@ -52,7 +52,7 @@ def apply_confidence_markers(words: list[dict], threshold: float | None = None)
|
||||
else:
|
||||
tokens.append(word["text"])
|
||||
|
||||
-return " ".join(_collapse_adjacent_markers(tokens))
+return " ".join(collapse_adjacent_markers(tokens))
|
||||
|
||||
|
||||
def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
|
||||
from spellchecker import SpellChecker
|
||||
|
||||
-from confidence import CORRECTION_MARKER, ILLEGIBLE_MARKER, _collapse_adjacent_markers
+from confidence import CORRECTION_MARKER, ILLEGIBLE_MARKER, collapse_adjacent_markers
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -114,4 +114,4 @@ def correct_text(text: str) -> str:
|
||||
else:
|
||||
checked.append(ILLEGIBLE_MARKER)
|
||||
|
||||
-return " ".join(_collapse_adjacent_markers(checked))
+return " ".join(collapse_adjacent_markers(checked))
|
||||
|
||||
Reference in New Issue
Block a user