feat(ocr): add image preprocessing module with CLAHE + grayscale + blur
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
49
ocr-service/preprocessing.py
Normal file
49
ocr-service/preprocessing.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
"""Image preprocessing pipeline for aged document OCR quality improvement."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# CLAHE tuning knobs, read once from the environment when the module is imported.
#   OCR_CLAHE_CLIP_LIMIT -> contrast clip limit handed to cv2.createCLAHE (default "2.0")
#   OCR_CLAHE_TILE_SIZE  -> side length of the square tile grid (default "8")
# A value that float()/int() cannot parse raises ValueError at import time.
CLAHE_CLIP_LIMIT = float(os.getenv("OCR_CLAHE_CLIP_LIMIT", "2.0"))
CLAHE_TILE_SIZE = int(os.getenv("OCR_CLAHE_TILE_SIZE", "8"))
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_page(image: Image.Image) -> Image.Image:
    """Enhance a scanned page for OCR: CLAHE on lightness, then a light blur.

    The page is converted to OpenCV's LAB colour space and only the L
    (lightness) channel is kept. Its local contrast is equalised with CLAHE
    (configured by ``CLAHE_CLIP_LIMIT`` and ``CLAHE_TILE_SIZE``) and the
    result is smoothed with a 3x3 Gaussian blur.

    Args:
        image: Source page. Pages that are not already in "RGB" mode
            (grayscale, palette, RGBA, ...) are converted to RGB first,
            because the RGB->LAB conversion rejects single-channel arrays
            and such pages would otherwise always take the fallback path.

    Returns:
        A new single-channel (mode "L") image holding the enhanced lightness
        channel. If any step raises, the failure is logged at WARNING level
        and the original ``image`` object is returned unchanged, so callers
        never see an exception from this function.
    """
    try:
        rgb_image = image if image.mode == "RGB" else image.convert("RGB")
        img_array = np.array(rgb_image)
        del rgb_image

        # Intermediates are dropped as soon as the next stage has its own
        # buffer, to keep peak memory down on large scans.
        lab = cv2.cvtColor(img_array, cv2.COLOR_RGB2LAB)
        del img_array

        # Copy so the lightness plane does not keep the full LAB array alive.
        l_channel = lab[:, :, 0].copy()
        del lab

        clahe = cv2.createCLAHE(
            clipLimit=CLAHE_CLIP_LIMIT,
            tileGridSize=(CLAHE_TILE_SIZE, CLAHE_TILE_SIZE),
        )
        l_clahe = clahe.apply(l_channel)
        del l_channel

        # sigma=0 lets OpenCV derive the sigma from the 3x3 kernel size.
        blurred = cv2.GaussianBlur(l_clahe, (3, 3), 0)
        del l_clahe

        # A 2-D uint8 array is read as mode "L" automatically; the explicit
        # ``mode=`` argument is deprecated in recent Pillow releases.
        result = Image.fromarray(blurred)
        del blurred

        return result
    except Exception as e:
        # Deliberate best-effort boundary: OCR on the raw page is better than
        # failing the whole document because enhancement broke.
        logger.warning(
            "preprocess_page failed (falling back to original): %s: %s",
            type(e).__name__,
            e,
        )
        return image
|
||||||
64
ocr-service/test_preprocessing.py
Normal file
64
ocr-service/test_preprocessing.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
"""Tests for the image preprocessing pipeline."""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
from PIL import Image
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
|
||||||
|
def _make_yellowed_image(width=100, height=100):
    """Build a deterministic dark, low-contrast RGB page with per-pixel noise.

    The channels are centred on R=30, G=20, B=10 and perturbed with uniform
    integer noise of +/-8, +/-5 and +/-3 respectively. The generator is seeded
    with 42, so every call with the same size yields the same pixels.

    Such dark pixels sit in a narrow band at the low end of the lightness
    scale (roughly 10-30 on OpenCV's 0-255 L channel; an estimate, not a
    measured value). The noise gives each CLAHE tile a non-trivial histogram
    to equalise.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        An "RGB" mode PIL image of size ``(width, height)``.
    """
    rng = np.random.default_rng(42)
    arr = np.zeros((height, width, 3), dtype=np.uint8)

    # (centre, noise half-width) per channel in R, G, B order. Noise is drawn
    # channel by channel so the random sequence stays reproducible.
    channel_specs = ((30, 8), (20, 5), (10, 3))
    for channel, (centre, spread) in enumerate(channel_specs):
        noise = rng.integers(-spread, spread + 1, (height, width))
        arr[:, :, channel] = np.clip(centre + noise, 0, 255)

    # ``arr`` is already uint8 with three channels, so Pillow infers "RGB";
    # no extra astype copy or deprecated ``mode=`` argument is needed.
    return Image.fromarray(arr)
|
||||||
|
|
||||||
|
|
||||||
|
class TestPreprocessPage:
    """Behavioural tests for ``preprocessing.preprocess_page``."""

    def test_output_has_same_dimensions_as_input(self):
        """A successful run keeps the page size and yields a grayscale image."""
        from preprocessing import preprocess_page

        img = Image.new("RGB", (150, 200))
        result = preprocess_page(img)

        # The fallback path returns the RGB original, which would satisfy the
        # size check on its own; the mode check proves the pipeline really ran.
        assert result.mode == "L"
        assert result.size == img.size

    def test_l_channel_mean_increases_on_yellowed_image(self):
        """CLAHE equalizes the dark narrow-range histogram toward [0-255], raising mean L."""
        from preprocessing import preprocess_page
        import cv2

        img = _make_yellowed_image()

        arr_before = np.array(img)
        lab_before = cv2.cvtColor(arr_before, cv2.COLOR_RGB2LAB)
        l_mean_before = float(lab_before[:, :, 0].mean())

        result = preprocess_page(img)

        # Output is grayscale (mode "L"); its values ARE the CLAHE-enhanced L channel.
        # Checking the mode first rules out a silent fallback to the RGB input.
        assert result.mode == "L"
        l_mean_after = float(np.array(result).mean())

        assert l_mean_after > l_mean_before

    def test_falls_back_to_pixel_identical_original_on_cv2_error(self):
        """When cv2 raises, preprocess_page must return the unmodified original image."""
        from preprocessing import preprocess_page

        img = Image.new("RGB", (80, 60), color=(123, 45, 67))
        original_pixels = np.array(img)

        # Replace the cv2 module as seen by preprocessing so the very first
        # OpenCV call fails and the except branch is exercised.
        with patch("preprocessing.cv2") as mock_cv2:
            mock_cv2.cvtColor.side_effect = RuntimeError("cv2 exploded")

            result = preprocess_page(img)

        assert result.mode == img.mode
        assert result.size == img.size
        assert np.array_equal(np.array(result), original_pixels)
|
||||||
Reference in New Issue
Block a user