"""Image preprocessing pipeline for aged document OCR quality improvement.""" import logging import os import cv2 import numpy as np from PIL import Image logger = logging.getLogger(__name__) CLAHE_CLIP_LIMIT = float(os.environ.get("OCR_CLAHE_CLIP_LIMIT", "2.0")) CLAHE_TILE_SIZE = int(os.environ.get("OCR_CLAHE_TILE_SIZE", "8")) def preprocess_page(image: Image.Image) -> Image.Image: """Apply CLAHE + grayscale + Gaussian blur to improve OCR quality on aged documents. Falls back silently to the original image if any step fails. """ try: img_array = np.array(image) lab = cv2.cvtColor(img_array, cv2.COLOR_RGB2LAB) del img_array l_channel = lab[:, :, 0].copy() del lab clahe = cv2.createCLAHE( clipLimit=CLAHE_CLIP_LIMIT, tileGridSize=(CLAHE_TILE_SIZE, CLAHE_TILE_SIZE), ) l_clahe = clahe.apply(l_channel) del l_channel blurred = cv2.GaussianBlur(l_clahe, (3, 3), 0) del l_clahe result = Image.fromarray(blurred, mode="L") del blurred return result except Exception as e: logger.warning( "preprocess_page failed (falling back to original): %s: %s", type(e).__name__, e, ) return image