feat(ocr): add image preprocessing module with CLAHE + grayscale + blur

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-17 14:13:42 +02:00
parent bf010a23c3
commit 7183fc4428
2 changed files with 113 additions and 0 deletions

View File

@@ -0,0 +1,64 @@
"""Tests for the image preprocessing pipeline."""
import numpy as np
import pytest
from PIL import Image
from unittest.mock import patch
def _make_yellowed_image(width=100, height=100):
    """Build a deterministic, very dark RGB test page with per-pixel noise.

    Each channel is a low base value plus a small uniform integer jitter:
    R = 30 +/- 8, G = 20 +/- 5, B = 10 +/- 3 (clipped to the uint8 range), so
    every pixel sits in a narrow, dark band. The jitter is intended to give
    contrast enhancement (CLAHE) a non-trivial histogram to stretch instead
    of a single flat value.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        A PIL image in mode "RGB" with size ``(width, height)``.
    """
    # Fixed seed: the generated pixels are identical on every run.
    rng = np.random.default_rng(42)
    arr = np.zeros((height, width, 3), dtype=np.uint8)

    # (base value, jitter half-width) per channel, in R, G, B order. The
    # channels are drawn in this order so the seeded noise stays reproducible.
    channel_specs = ((30, 8), (20, 5), (10, 3))
    for channel, (base, jitter) in enumerate(channel_specs):
        noise = rng.integers(-jitter, jitter + 1, (height, width))
        arr[:, :, channel] = np.clip(base + noise, 0, 255)

    # A (height, width, 3) uint8 array is interpreted as RGB by Pillow, so no
    # explicit mode argument or dtype cast is needed.
    return Image.fromarray(arr)
class TestPreprocessPage:
    """Behavioural tests for ``preprocessing.preprocess_page``."""

    def test_output_has_same_dimensions_as_input(self):
        """Preprocessing must not change the page's width or height."""
        from preprocessing import preprocess_page

        img = Image.new("RGB", (150, 200))
        result = preprocess_page(img)
        assert result.size == img.size

    def test_l_channel_mean_increases_on_yellowed_image(self):
        """CLAHE equalizes the dark narrow-range histogram toward [0-255], raising mean L."""
        from preprocessing import preprocess_page
        import cv2

        img = _make_yellowed_image()

        # Baseline: mean of the L channel of the untouched input in OpenCV's
        # 8-bit LAB representation.
        lab_before = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2LAB)
        l_mean_before = float(lab_before[:, :, 0].mean())

        result = preprocess_page(img)

        # Per the original author's note, the output is grayscale (mode "L")
        # and its values are the CLAHE-enhanced L channel, so the plain pixel
        # mean is compared against the baseline L mean.
        l_mean_after = float(np.array(result).mean())
        assert l_mean_after > l_mean_before

    def test_falls_back_to_pixel_identical_original_on_cv2_error(self):
        """When cv2 raises, preprocess_page must return the unmodified original image."""
        from preprocessing import preprocess_page

        img = Image.new("RGB", (80, 60), color=(123, 45, 67))
        original_pixels = list(img.getdata())
        original_mode = img.mode
        original_size = img.size

        # Replace the cv2 name inside the module under test so the colour
        # conversion fails as soon as it is attempted.
        with patch("preprocessing.cv2") as mock_cv2:
            mock_cv2.cvtColor.side_effect = RuntimeError("cv2 exploded")
            result = preprocess_page(img)

        # The flattened pixel list of a uniform-colour image carries no
        # geometry, so mode and size are checked explicitly as well.
        assert result.mode == original_mode
        assert result.size == original_size
        assert list(result.getdata()) == original_pixels