"""Image preprocessing pipeline for aged document OCR quality improvement.""" import logging import os import cv2 import numpy as np from PIL import Image logger = logging.getLogger(__name__) CLAHE_CLIP_LIMIT = float(os.environ.get("OCR_CLAHE_CLIP_LIMIT", "2.0")) CLAHE_TILE_SIZE = int(os.environ.get("OCR_CLAHE_TILE_SIZE", "8")) def preprocess_page(image: Image.Image) -> Image.Image: """Apply CLAHE + grayscale + Gaussian blur to improve OCR quality on aged documents. Falls back silently to the original image if cv2, numpy, or memory errors occur. Unexpected exceptions (programming errors) are allowed to propagate. """ try: img_array = np.array(image) lab = cv2.cvtColor(img_array, cv2.COLOR_RGB2LAB) del img_array l_channel = lab[:, :, 0].copy() del lab clahe = cv2.createCLAHE( clipLimit=CLAHE_CLIP_LIMIT, tileGridSize=(CLAHE_TILE_SIZE, CLAHE_TILE_SIZE), ) l_clahe = clahe.apply(l_channel) del l_channel blurred = cv2.GaussianBlur(l_clahe, (3, 3), 0) del l_clahe result = Image.fromarray(blurred, mode="L") del blurred return result except (cv2.error, ValueError, MemoryError) as e: logger.warning( "preprocess_page failed (falling back to original): %s: %s", type(e).__name__, e, ) return image