Files
familienarchiv/ocr-service/preprocessing.py
Marcel 4cb7c975f5
Some checks failed
CI / Unit & Component Tests (pull_request) Failing after 2m27s
CI / Backend Unit Tests (pull_request) Failing after 2m37s
CI / Unit & Component Tests (push) Failing after 3m14s
CI / Backend Unit Tests (push) Has been cancelled
test(ocr): add resilience tests for tiny image and unexpected exception propagation
Add test for 1×1 image (sub-tile-size) resilience and narrow preprocess_page
fallback from except Exception to (cv2.error, ValueError, MemoryError) so
programming errors propagate instead of being silently swallowed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-17 15:16:17 +02:00

51 lines
1.4 KiB
Python

"""Image preprocessing pipeline for aged document OCR quality improvement."""
import logging
import os
import cv2
import numpy as np
from PIL import Image
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# CLAHE tuning parameters, read once from the environment at import time.
# Defaults: clip limit 2.0, tile grid 8 (used for both grid dimensions).
CLAHE_CLIP_LIMIT = float(os.getenv("OCR_CLAHE_CLIP_LIMIT", "2.0"))
CLAHE_TILE_SIZE = int(os.getenv("OCR_CLAHE_TILE_SIZE", "8"))
def preprocess_page(image: Image.Image) -> Image.Image:
    """Enhance a page image for OCR with CLAHE and a light Gaussian blur.

    The image is converted to the LAB colour space, its lightness channel is
    contrast-equalised with CLAHE (configured by ``CLAHE_CLIP_LIMIT`` and
    ``CLAHE_TILE_SIZE``) and then smoothed with a 3x3 Gaussian kernel.

    Images that are not already in ``RGB`` mode (for example grayscale or
    palette scans) are converted to RGB first. Without that step the
    RGB-to-LAB conversion rejects them and preprocessing would be skipped.

    Args:
        image: The page as a PIL image.

    Returns:
        A single-channel (mode ``"L"``) image on success. If a ``cv2.error``,
        ``ValueError`` or ``MemoryError`` is raised along the way, a warning
        is logged and the original ``image`` is returned unchanged.

    Raises:
        Any exception other than the three listed above propagates to the
        caller, so programming errors are not hidden by the fallback.
    """
    try:
        # COLOR_RGB2LAB needs a 3-channel array, so normalise the mode first.
        # NOTE(review): for images with transparency, confirm that a plain
        # convert("RGB") gives an acceptable background for OCR.
        rgb_image = image if image.mode == "RGB" else image.convert("RGB")
        img_array = np.array(rgb_image)
        del rgb_image

        # Intermediate buffers are released as soon as they are no longer
        # needed to keep peak memory down on large scans.
        lab = cv2.cvtColor(img_array, cv2.COLOR_RGB2LAB)
        del img_array

        # Copy the lightness plane so the full LAB array can be freed.
        l_channel = lab[:, :, 0].copy()
        del lab

        clahe = cv2.createCLAHE(
            clipLimit=CLAHE_CLIP_LIMIT,
            tileGridSize=(CLAHE_TILE_SIZE, CLAHE_TILE_SIZE),
        )
        l_clahe = clahe.apply(l_channel)
        del l_channel

        blurred = cv2.GaussianBlur(l_clahe, (3, 3), 0)
        del l_clahe

        result = Image.fromarray(blurred, mode="L")
        del blurred
        return result
    except (cv2.error, ValueError, MemoryError) as e:
        # Best-effort step: OCR can still run on the unprocessed page.
        logger.warning(
            "preprocess_page failed (falling back to original): %s: %s",
            type(e).__name__,
            e,
        )
        return image