Add test for 1×1 image (sub-tile-size) resilience and narrow preprocess_page fallback from except Exception to (cv2.error, ValueError, MemoryError) so programming errors propagate instead of being silently swallowed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
51 lines
1.4 KiB
Python
51 lines
1.4 KiB
Python
"""Image preprocessing pipeline for aged document OCR quality improvement."""
|
|
|
|
import logging
|
|
import os
|
|
|
|
import cv2
|
|
import numpy as np
|
|
from PIL import Image
|
|
|
|
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# CLAHE tuning knobs, read once from the environment when the module is
# imported. The defaults apply when a variable is unset; a value that
# float()/int() cannot parse raises ValueError at import time.
CLAHE_CLIP_LIMIT = float(os.getenv("OCR_CLAHE_CLIP_LIMIT", "2.0"))
CLAHE_TILE_SIZE = int(os.getenv("OCR_CLAHE_TILE_SIZE", "8"))
|
|
|
|
|
|
def preprocess_page(image: Image.Image) -> Image.Image:
    """Enhance a page image for OCR using its lightness channel, CLAHE and a blur.

    The image is normalized to RGB, converted to the LAB color space, and only
    the L (lightness) channel is kept. CLAHE is applied to that channel using
    ``CLAHE_CLIP_LIMIT`` and a square ``CLAHE_TILE_SIZE`` grid, followed by a
    3x3 Gaussian blur.

    Args:
        image: Source page as a PIL image. Modes other than ``"RGB"`` are
            converted to RGB before processing.

    Returns:
        A new single-channel (``"L"``) image on success. If processing fails
        with ``cv2.error``, ``ValueError`` or ``MemoryError``, a warning is
        logged and the original ``image`` object is returned unchanged.

    Raises:
        Any exception other than the three listed above propagates to the
        caller, so programming errors are not hidden by the fallback.
    """
    try:
        # COLOR_RGB2LAB expects RGB channel data. Grayscale, palette, 1-bit or
        # CMYK images do not yield such an array from np.array(), so without
        # this step they would either fail (and skip enhancement) or have
        # their channels misread. Skip the conversion when already RGB to
        # avoid an extra full-size copy.
        rgb_image = image if image.mode == "RGB" else image.convert("RGB")
        img_array = np.array(rgb_image)
        del rgb_image

        lab = cv2.cvtColor(img_array, cv2.COLOR_RGB2LAB)
        # Intermediate buffers are dropped as soon as they are no longer
        # needed to keep peak memory down on large pages.
        del img_array

        # Copy the L plane so the full 3-channel LAB array can be released.
        l_channel = lab[:, :, 0].copy()
        del lab

        clahe = cv2.createCLAHE(
            clipLimit=CLAHE_CLIP_LIMIT,
            tileGridSize=(CLAHE_TILE_SIZE, CLAHE_TILE_SIZE),
        )
        l_clahe = clahe.apply(l_channel)
        del l_channel

        # sigmaX=0 lets OpenCV derive the sigma from the 3x3 kernel size.
        blurred = cv2.GaussianBlur(l_clahe, (3, 3), 0)
        del l_clahe

        return Image.fromarray(blurred, mode="L")
    except (cv2.error, ValueError, MemoryError) as e:
        # Best-effort step: OCR can still run on the unprocessed page.
        logger.warning(
            "preprocess_page failed (falling back to original): %s: %s",
            type(e).__name__,
            e,
        )
        return image
|