feat(ocr): add image preprocessing pipeline to improve transcription quality on aged documents #255

Merged
marcel merged 7 commits from feat/issue-252-ocr-preprocessing into main 2026-04-17 15:50:37 +02:00
2 changed files with 25 additions and 6 deletions
Showing only changes of commit 4cb7c975f5 - Show all commits

View File

@@ -16,7 +16,8 @@ CLAHE_TILE_SIZE = int(os.environ.get("OCR_CLAHE_TILE_SIZE", "8"))
def preprocess_page(image: Image.Image) -> Image.Image:
"""Apply CLAHE + grayscale + Gaussian blur to improve OCR quality on aged documents.
Falls back silently to the original image if any step fails.
Falls back silently to the original image if cv2, numpy, or memory errors occur.
Unexpected exceptions (programming errors) are allowed to propagate.
"""
try:
img_array = np.array(image)
@@ -40,7 +41,7 @@ def preprocess_page(image: Image.Image) -> Image.Image:
del blurred
return result
except Exception as e:
except (cv2.error, ValueError, MemoryError) as e:
logger.warning(
"preprocess_page failed (falling back to original): %s: %s",
type(e).__name__,

View File

@@ -48,17 +48,35 @@ class TestPreprocessPage:
assert l_mean_after > l_mean_before
def test_does_not_crash_on_sub_tile_size_image(self):
"""A 1×1 image is smaller than the CLAHE tile (8×8) in both axes.
preprocess_page must not raise — it either succeeds or falls back silently."""
from preprocessing import preprocess_page
img = Image.new("RGB", (1, 1), color=(128, 100, 80))
result = preprocess_page(img)
assert isinstance(result, Image.Image)
def test_falls_back_to_pixel_identical_original_on_cv2_error(self):
"""When cv2 raises, preprocess_page must return the unmodified original image."""
"""When cv2 raises a known error, preprocess_page returns the unmodified original image."""
from preprocessing import preprocess_page
img = Image.new("RGB", (80, 60), color=(123, 45, 67))
original_pixels = list(img.getdata())
with patch("preprocessing.cv2") as mock_cv2:
mock_cv2.cvtColor.side_effect = RuntimeError("cv2 exploded")
with patch("preprocessing.cv2.cvtColor", side_effect=ValueError("bad input")):
result = preprocess_page(img)
result_pixels = list(result.getdata())
assert result_pixels == original_pixels
def test_unexpected_exception_propagates(self):
"""A RuntimeError (programming error) must propagate — not be swallowed by the cv2 fallback."""
from preprocessing import preprocess_page
img = Image.new("RGB", (80, 60))
with patch("preprocessing.cv2.cvtColor", side_effect=RuntimeError("unexpected")):
with pytest.raises(RuntimeError, match="unexpected"):
preprocess_page(img)