From 7183fc44286fff0e65d193fd11f2d2390695763c Mon Sep 17 00:00:00 2001
From: Marcel
Date: Fri, 17 Apr 2026 14:13:42 +0200
Subject: [PATCH] feat(ocr): add image preprocessing module with CLAHE + grayscale + blur

Co-Authored-By: Claude Sonnet 4.6
---
 ocr-service/preprocessing.py      | 65 +++++++++++++++++++++++++
 ocr-service/test_preprocessing.py | 80 +++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 ocr-service/preprocessing.py
 create mode 100644 ocr-service/test_preprocessing.py

diff --git a/ocr-service/preprocessing.py b/ocr-service/preprocessing.py
new file mode 100644
index 00000000..c6f77b87
--- /dev/null
+++ b/ocr-service/preprocessing.py
@@ -0,0 +1,65 @@
+"""Image preprocessing pipeline for aged document OCR quality improvement."""
+
+import logging
+import os
+
+import cv2
+import numpy as np
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+# CLAHE tuning knobs, read from the environment once at import time.
+CLAHE_CLIP_LIMIT = float(os.environ.get("OCR_CLAHE_CLIP_LIMIT", "2.0"))
+CLAHE_TILE_SIZE = int(os.environ.get("OCR_CLAHE_TILE_SIZE", "8"))
+
+
+def preprocess_page(image: Image.Image) -> Image.Image:
+    """Apply CLAHE + grayscale + Gaussian blur to improve OCR quality on aged documents.
+
+    CLAHE runs on the lightness (L) channel of the page's LAB representation;
+    that channel alone, smoothed with a 3x3 Gaussian blur, becomes the output.
+
+    Args:
+        image: Page image. Non-RGB modes are converted to RGB first so that
+            grayscale, bilevel and palette scans are enhanced as well.
+
+    Returns:
+        A new single-channel (mode "L") image of the same size, or the
+        original ``image`` object, untouched, if any step fails. Failures are
+        logged as a warning and never raised to the caller.
+    """
+    try:
+        # COLOR_RGB2LAB rejects the 2-D array that a grayscale ("L"), bilevel
+        # ("1") or palette ("P") page yields. Without the conversion such
+        # scans would raise here and be returned unprocessed.
+        rgb_image = image if image.mode == "RGB" else image.convert("RGB")
+        img_array = np.array(rgb_image)
+        del rgb_image
+        lab = cv2.cvtColor(img_array, cv2.COLOR_RGB2LAB)
+        del img_array
+
+        # Copy the L plane so the full 3-channel LAB buffer can be released.
+        l_channel = lab[:, :, 0].copy()
+        del lab
+
+        clahe = cv2.createCLAHE(
+            clipLimit=CLAHE_CLIP_LIMIT,
+            tileGridSize=(CLAHE_TILE_SIZE, CLAHE_TILE_SIZE),
+        )
+        l_clahe = clahe.apply(l_channel)
+        del l_channel
+
+        blurred = cv2.GaussianBlur(l_clahe, (3, 3), 0)
+        del l_clahe
+
+        # A 2-D uint8 array maps to mode "L" without an explicit mode argument.
+        return Image.fromarray(blurred)
+    except Exception as e:
+        # Deliberate best-effort boundary: the caller gets the raw page back.
+        logger.warning(
+            "preprocess_page failed (falling back to original): %s: %s",
+            type(e).__name__,
+            e,
+        )
+        return image
diff --git a/ocr-service/test_preprocessing.py b/ocr-service/test_preprocessing.py
new file mode 100644
index 00000000..41ec09bd
--- /dev/null
+++ b/ocr-service/test_preprocessing.py
@@ -0,0 +1,80 @@
+"""Tests for the image preprocessing pipeline."""
+
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+from PIL import Image
+
+
+def _make_yellowed_image(width=100, height=100):
+    """Build a dark, faded page: a narrow band of low L values with spatial noise.
+
+    Pixels are centred on a very dark (R≈30, G≈20, B≈10) colour with small
+    seeded per-channel noise, so the fixture is identical on every run.
+    The per-pixel noise gives each CLAHE tile a non-trivial histogram to equalize,
+    which stretches the narrow dark range toward [0-255] and reliably increases mean L.
+    """
+    rng = np.random.default_rng(42)
+    arr = np.zeros((height, width, 3), dtype=np.uint8)
+    arr[:, :, 0] = np.clip(30 + rng.integers(-8, 9, (height, width)), 0, 255)
+    arr[:, :, 1] = np.clip(20 + rng.integers(-5, 6, (height, width)), 0, 255)
+    arr[:, :, 2] = np.clip(10 + rng.integers(-3, 4, (height, width)), 0, 255)
+    # A (height, width, 3) uint8 array maps to mode "RGB" automatically.
+    return Image.fromarray(arr)
+
+
+class TestPreprocessPage:
+    def test_output_has_same_dimensions_as_input(self):
+        from preprocessing import preprocess_page
+
+        img = Image.new("RGB", (150, 200))
+        result = preprocess_page(img)
+
+        assert result.size == img.size
+        assert result.mode == "L"
+
+    def test_grayscale_input_is_enhanced_not_returned_as_is(self):
+        """A mode "L" scan must go through the pipeline instead of the fallback."""
+        from preprocessing import preprocess_page
+
+        img = _make_yellowed_image().convert("L")
+
+        result = preprocess_page(img)
+
+        assert result is not img
+        assert result.mode == "L"
+        assert result.size == img.size
+
+    def test_l_channel_mean_increases_on_yellowed_image(self):
+        """CLAHE equalizes the dark narrow-range histogram toward [0-255], raising mean L."""
+        from preprocessing import preprocess_page
+        import cv2
+
+        img = _make_yellowed_image()
+
+        arr_before = np.array(img)
+        lab_before = cv2.cvtColor(arr_before, cv2.COLOR_RGB2LAB)
+        l_mean_before = float(lab_before[:, :, 0].mean())
+
+        result = preprocess_page(img)
+
+        # Output is grayscale (mode "L"); its values are the CLAHE-enhanced, blurred L channel
+        l_mean_after = float(np.array(result).mean())
+
+        assert l_mean_after > l_mean_before
+
+    def test_falls_back_to_pixel_identical_original_on_cv2_error(self):
+        """When cv2 raises, preprocess_page must return the unmodified original image."""
+        from preprocessing import preprocess_page
+
+        img = Image.new("RGB", (80, 60), color=(123, 45, 67))
+        original_pixels = list(img.getdata())
+
+        with patch("preprocessing.cv2") as mock_cv2:
+            mock_cv2.cvtColor.side_effect = RuntimeError("cv2 exploded")
+
+            result = preprocess_page(img)
+
+        result_pixels = list(result.getdata())
+        assert result_pixels == original_pixels