feat(ocr): add Python OCR microservice, RestClientOcrClient, Docker Compose
Python microservice (ocr-service/):
- FastAPI app with /ocr and /health endpoints
- Surya engine: transformer-based OCR for typewritten/modern handwriting
- Kraken engine: historical HTR for Kurrent/Suetterlin with pure-Python polygon-to-quad approximation (gift wrapping + rotating calipers)
- Eager model loading at startup via lifespan context manager
- PDF download via httpx, page rendering via pypdfium2 at 300 DPI

Java RestClientOcrClient:
- Implements OcrClient + OcrHealthClient interfaces
- Calls Python service via Spring RestClient
- Health check with graceful fallback

Docker Compose:
- New ocr-service container (mem_limit 6g, no host ports)
- Health check with start_period 60s for model loading
- ocr_models volume for Kraken model files
- Backend depends on ocr-service health

Refs #226, #227

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,73 @@
|
|||||||
|
package org.raddatz.familienarchiv.service;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.raddatz.familienarchiv.model.ScriptType;
|
||||||
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
|
import org.springframework.core.ParameterizedTypeReference;
|
||||||
|
import org.springframework.http.MediaType;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.web.client.RestClient;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@Slf4j
|
||||||
|
public class RestClientOcrClient implements OcrClient, OcrHealthClient {
|
||||||
|
|
||||||
|
private final RestClient restClient;
|
||||||
|
|
||||||
|
public RestClientOcrClient(@Value("${app.ocr.base-url:http://ocr-service:8000}") String baseUrl) {
|
||||||
|
this.restClient = RestClient.builder().baseUrl(baseUrl).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType) {
|
||||||
|
Map<String, String> body = Map.of(
|
||||||
|
"pdfUrl", pdfUrl,
|
||||||
|
"scriptType", scriptType.name(),
|
||||||
|
"language", "de");
|
||||||
|
|
||||||
|
List<OcrBlockJson> response = restClient.post()
|
||||||
|
.uri("/ocr")
|
||||||
|
.contentType(MediaType.APPLICATION_JSON)
|
||||||
|
.body(body)
|
||||||
|
.retrieve()
|
||||||
|
.body(new ParameterizedTypeReference<>() {});
|
||||||
|
|
||||||
|
if (response == null) return List.of();
|
||||||
|
|
||||||
|
return response.stream()
|
||||||
|
.map(OcrBlockJson::toResult)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isHealthy() {
|
||||||
|
try {
|
||||||
|
restClient.get()
|
||||||
|
.uri("/health")
|
||||||
|
.retrieve()
|
||||||
|
.toBodilessEntity();
|
||||||
|
return true;
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("OCR service health check failed: {}", e.getMessage());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
record OcrBlockJson(
|
||||||
|
@JsonProperty("pageNumber") int pageNumber,
|
||||||
|
double x,
|
||||||
|
double y,
|
||||||
|
double width,
|
||||||
|
double height,
|
||||||
|
List<List<Double>> polygon,
|
||||||
|
String text
|
||||||
|
) {
|
||||||
|
OcrBlockResult toResult() {
|
||||||
|
return new OcrBlockResult(pageNumber, x, y, width, height, polygon, text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -71,6 +71,28 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- archive-net
|
- archive-net
|
||||||
|
|
||||||
|
# --- OCR: Python microservice (Surya + Kraken) ---
|
||||||
|
ocr-service:
|
||||||
|
build:
|
||||||
|
context: ./ocr-service
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
container_name: archive-ocr
|
||||||
|
restart: unless-stopped
|
||||||
|
mem_limit: 6g
|
||||||
|
memswap_limit: 6g
|
||||||
|
volumes:
|
||||||
|
- ocr_models:/app/models
|
||||||
|
environment:
|
||||||
|
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
||||||
|
networks:
|
||||||
|
- archive-net
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 12
|
||||||
|
start_period: 60s
|
||||||
|
|
||||||
# --- Backend: Spring Boot ---
|
# --- Backend: Spring Boot ---
|
||||||
backend:
|
backend:
|
||||||
build:
|
build:
|
||||||
@@ -89,6 +111,8 @@ services:
|
|||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
mailpit:
|
mailpit:
|
||||||
condition: service_started
|
condition: service_started
|
||||||
|
ocr-service:
|
||||||
|
condition: service_healthy
|
||||||
environment:
|
environment:
|
||||||
SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/${POSTGRES_DB}
|
SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/${POSTGRES_DB}
|
||||||
SPRING_DATASOURCE_USERNAME: ${POSTGRES_USER}
|
SPRING_DATASOURCE_USERNAME: ${POSTGRES_USER}
|
||||||
@@ -109,6 +133,8 @@ services:
|
|||||||
# Mailpit needs no auth or STARTTLS; production SMTP overrides these via .env
|
# Mailpit needs no auth or STARTTLS; production SMTP overrides these via .env
|
||||||
SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false}
|
SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false}
|
||||||
SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false}
|
SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false}
|
||||||
|
APP_OCR_BASE_URL: http://ocr-service:8000
|
||||||
|
APP_S3_INTERNAL_URL: http://minio:9000
|
||||||
ports:
|
ports:
|
||||||
- "${PORT_BACKEND}:8080"
|
- "${PORT_BACKEND}:8080"
|
||||||
networks:
|
networks:
|
||||||
@@ -155,3 +181,4 @@ networks:
|
|||||||
volumes:
|
volumes:
|
||||||
frontend_node_modules:
|
frontend_node_modules:
|
||||||
maven_cache:
|
maven_cache:
|
||||||
|
ocr_models:
|
||||||
|
|||||||
23
ocr-service/Dockerfile
Normal file
23
ocr-service/Dockerfile
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
# Image for the OCR microservice: Python 3.11 (slim base) serving main:app with uvicorn.
FROM python:3.11-slim

WORKDIR /app

# curl for healthcheck; libgomp1 for PyTorch CPU threading
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved)
RUN pip install --no-cache-dir \
    torch==2.5.1 \
    --index-url https://download.pytorch.org/whl/cpu

# Remaining Python dependencies are installed before the application source is copied
# (presumably so that source-only changes do not invalidate this layer).
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application source
COPY . .

# Port uvicorn listens on (see CMD)
EXPOSE 8000

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
0
ocr-service/engines/__init__.py
Normal file
0
ocr-service/engines/__init__.py
Normal file
192
ocr-service/engines/kraken.py
Normal file
192
ocr-service/engines/kraken.py
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
"""Kraken OCR engine wrapper — historical HTR model support for Kurrent/Suetterlin."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_model = None
|
||||||
|
_model_path = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
|
||||||
|
|
||||||
|
|
||||||
|
def load_models():
    """Populate the module-level Kraken model from ``_model_path``.

    When no file exists at ``_model_path`` a warning is logged and the model
    stays unset, so ``is_available()`` keeps reporting ``False``.
    """
    global _model

    model_file_present = os.path.exists(_model_path)
    if model_file_present:
        logger.info("Loading Kraken model from %s...", _model_path)

        # Imported here rather than at module level (presumably to keep the
        # heavy Kraken import off the module-import path).
        from kraken.lib import models as kraken_models

        _model = kraken_models.load_any(_model_path)
        logger.info("Kraken model loaded successfully")
    else:
        logger.warning("Kraken model not found at %s — Kurrent OCR will not be available", _model_path)
|
||||||
|
|
||||||
|
|
||||||
|
def is_available() -> bool:
    """Tell whether ``load_models()`` has stored a Kraken model in this process."""
    model_loaded = _model is not None
    return model_loaded
|
||||||
|
|
||||||
|
|
||||||
|
def _line_outline(record) -> list[tuple]:
    """Return the polygon enclosing a recognised line as a flat list of ``(x, y)`` tuples.

    The line's ``boundary`` polygon is preferred. If the record has none, its
    ``cuts`` are used instead: entries that are themselves polygons (one per
    recognised code point) are flattened, entries that are plain points are
    taken as-is. An empty list is returned when neither attribute yields points.
    """
    boundary = getattr(record, "boundary", None)
    if boundary:
        return [(pt[0], pt[1]) for pt in boundary]

    outline = []
    for cut in getattr(record, "cuts", None) or []:
        if not cut:
            continue
        if isinstance(cut[0], (list, tuple)):
            # ``cut`` is a polygon of its own — merge its vertices into the outline
            outline.extend((pt[0], pt[1]) for pt in cut)
        else:
            outline.append((cut[0], cut[1]))
    return outline


def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Kraken segmentation + recognition on a list of PIL images.

    Args:
        images: One PIL image per page, in page order.
        language: Accepted for signature parity with the other engine; the
            Kraken pipeline here does not use it.

    Returns:
        A flat list of block dicts with ``pageNumber`` (index of the image in
        ``images``), ``x``, ``y``, ``width``, ``height`` (axis-aligned bounding
        box), ``polygon`` (4-point quadrilateral approximation of the line
        outline, or ``None``) and ``text``. All coordinates are divided by the
        page dimensions, i.e. normalised to [0, 1].

    Raises:
        RuntimeError: If no Kraken model has been loaded.
    """
    from kraken import blla, rpred

    if _model is None:
        raise RuntimeError("Kraken model is not loaded")

    # Vertical padding (pixels) applied when only a baseline is known
    baseline_pad = 5

    all_blocks = []

    for page_idx, image in enumerate(images):
        page_w, page_h = image.size

        baseline_seg = blla.segment(image)

        for record in rpred.rpred(_model, image, baseline_seg):
            outline = _line_outline(record)

            if outline:
                # Axis-aligned bounding box of the line outline
                xs = [pt[0] for pt in outline]
                ys = [pt[1] for pt in outline]
                x1, y1 = min(xs), min(ys)
                x2, y2 = max(xs), max(ys)
            else:
                # No outline: fall back to the baseline, padded vertically
                baseline = getattr(record, "baseline", None) or getattr(record, "line", None)
                if not baseline:
                    logger.warning("Skipping Kraken line without geometry on page %d", page_idx)
                    continue
                xs = [pt[0] for pt in baseline]
                ys = [pt[1] for pt in baseline]
                x1, y1 = min(xs), min(ys) - baseline_pad
                x2, y2 = max(xs), max(ys) + baseline_pad

            quad = _approximate_to_quad(outline, page_w, page_h) if outline else None

            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
                "y": y1 / page_h,
                "width": (x2 - x1) / page_w,
                "height": (y2 - y1) / page_h,
                "polygon": quad,
                "text": record.prediction,
            })

    return all_blocks
|
||||||
|
|
||||||
|
|
||||||
|
def _approximate_to_quad(points: list[tuple], page_w: float, page_h: float) -> list[list[float]] | None:
    """Reduce a polygon to the four corners of a tight bounding rectangle.

    The convex hull of ``points`` is computed first, then the smallest
    enclosing rectangle of that hull. Corner coordinates are divided by the
    page size before being returned.

    Returns ``None`` when fewer than three points are given, when the hull has
    fewer than three vertices, or when any step raises (a debug message is
    logged in that case).
    """
    if len(points) >= 3:
        try:
            hull_pts = _convex_hull(points)
            if len(hull_pts) >= 3:
                corners = _min_bounding_rect(hull_pts)
                # Scale pixel coordinates down to the [0, 1] page range
                return [[corner[0] / page_w, corner[1] / page_h] for corner in corners]
        except Exception:
            logger.debug("Failed to approximate polygon to quad, returning None")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _convex_hull(points: list[tuple]) -> list[tuple]:
    """Compute a 2D convex hull with the Jarvis march (gift wrapping) algorithm.

    Args:
        points: 2D points; each may be any indexable pair (tuple or list).
            Duplicates are ignored.

    Returns:
        The hull vertices as ``(x, y)`` tuples in the order the march visits
        them, starting at the point with the smallest x (ties broken by
        smallest y). With fewer than three distinct points, those points are
        returned unchanged.
    """
    # Lists are unhashable, so every point is coerced to a tuple before de-duplication
    pts = list({(p[0], p[1]) for p in points})
    if len(pts) < 3:
        return pts

    start = min(pts, key=lambda p: (p[0], p[1]))
    hull = []
    current = start

    # A hull never has more vertices than there are points; bounding the walk
    # prevents endless cycling should the orientation tests ever disagree.
    for _ in range(len(pts)):
        hull.append(current)

        candidate = None
        for p in pts:
            if p == current:
                continue
            if candidate is None:
                candidate = p
                continue
            turn = _cross(current, candidate, p)
            if turn < 0:
                candidate = p
            elif turn == 0 and _dist_sq(current, p) > _dist_sq(current, candidate):
                # Collinear — keep the farther point so intermediate points are skipped
                candidate = p

        current = candidate
        if current == start:
            break

    return hull
|
||||||
|
|
||||||
|
|
||||||
|
def _min_bounding_rect(hull: list[tuple]) -> list[tuple]:
|
||||||
|
"""Find the minimum area bounding rectangle of a convex hull using rotating calipers."""
|
||||||
|
n = len(hull)
|
||||||
|
if n < 2:
|
||||||
|
return hull
|
||||||
|
|
||||||
|
min_area = float("inf")
|
||||||
|
best_rect = None
|
||||||
|
|
||||||
|
for i in range(n):
|
||||||
|
# Edge vector
|
||||||
|
edge_x = hull[(i + 1) % n][0] - hull[i][0]
|
||||||
|
edge_y = hull[(i + 1) % n][1] - hull[i][1]
|
||||||
|
edge_len = (edge_x ** 2 + edge_y ** 2) ** 0.5
|
||||||
|
if edge_len == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Unit vectors along and perpendicular to the edge
|
||||||
|
ux, uy = edge_x / edge_len, edge_y / edge_len
|
||||||
|
vx, vy = -uy, ux
|
||||||
|
|
||||||
|
# Project all hull points onto the edge coordinate system
|
||||||
|
projs_u = [p[0] * ux + p[1] * uy for p in hull]
|
||||||
|
projs_v = [p[0] * vx + p[1] * vy for p in hull]
|
||||||
|
|
||||||
|
min_u, max_u = min(projs_u), max(projs_u)
|
||||||
|
min_v, max_v = min(projs_v), max(projs_v)
|
||||||
|
|
||||||
|
area = (max_u - min_u) * (max_v - min_v)
|
||||||
|
if area < min_area:
|
||||||
|
min_area = area
|
||||||
|
# Reconstruct 4 corners in original coordinates
|
||||||
|
best_rect = [
|
||||||
|
(min_u * ux + min_v * vx, min_u * uy + min_v * vy),
|
||||||
|
(max_u * ux + min_v * vx, max_u * uy + min_v * vy),
|
||||||
|
(max_u * ux + max_v * vx, max_u * uy + max_v * vy),
|
||||||
|
(min_u * ux + max_v * vx, min_u * uy + max_v * vy),
|
||||||
|
]
|
||||||
|
|
||||||
|
return best_rect if best_rect else hull[:4]
|
||||||
|
|
||||||
|
|
||||||
|
def _cross(o: tuple, a: tuple, b: tuple) -> float:
|
||||||
|
return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
|
||||||
|
|
||||||
|
|
||||||
|
def _dist_sq(a: tuple, b: tuple) -> float:
|
||||||
|
return (a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2
|
||||||
66
ocr-service/engines/surya.py
Normal file
66
ocr-service/engines/surya.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Lazy-loaded at startup via load_models()
|
||||||
|
_recognition_model = None
|
||||||
|
_recognition_processor = None
|
||||||
|
_detection_model = None
|
||||||
|
_detection_processor = None
|
||||||
|
|
||||||
|
|
||||||
|
def load_models():
    """Load the Surya detection and recognition models plus their processors.

    The four loaded objects are stored in the module-level ``_detection_*`` and
    ``_recognition_*`` variables. Progress is logged before and after loading.
    """
    global _recognition_model, _recognition_processor, _detection_model, _detection_processor

    logger.info("Loading Surya models...")

    # Imported inside the function rather than at module level (presumably to
    # keep the heavy Surya imports off the module-import path).
    from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
    from surya.model.recognition.model import load_model as load_rec_model
    from surya.model.recognition.processor import load_processor as load_rec_processor

    # Detection first, then recognition
    _detection_model, _detection_processor = load_det_model(), load_det_processor()
    _recognition_model, _recognition_processor = load_rec_model(), load_rec_processor()

    logger.info("Surya models loaded successfully")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).

    Detection and recognition are performed together through
    ``surya.ocr.run_ocr``, which returns one result per input image whose
    ``text_lines`` carry ``bbox`` and ``text``. Pages are processed one at a
    time so only a single page is in flight during inference.

    Args:
        images: One PIL image per page, in page order.
        language: Language code handed to the recogniser for every page.

    Returns:
        A flat list of block dicts with ``pageNumber`` (index of the image in
        ``images``), ``x``, ``y``, ``width``, ``height``, ``polygon`` (always
        ``None`` for this engine) and ``text``. Coordinates are divided by the
        page dimensions, i.e. normalised to [0, 1].

    Raises:
        RuntimeError: If ``load_models()`` has not populated the Surya models.
    """
    from surya.ocr import run_ocr

    required = (_detection_model, _detection_processor, _recognition_model, _recognition_processor)
    if any(component is None for component in required):
        raise RuntimeError("Surya models are not loaded")

    all_blocks = []

    for page_idx, image in enumerate(images):
        page_w, page_h = image.size

        # run_ocr takes parallel lists: one image and one language list per page
        page_result = run_ocr(
            [image],
            [[language]],
            _detection_model,
            _detection_processor,
            _recognition_model,
            _recognition_processor,
        )[0]

        for line in page_result.text_lines:
            # bbox is [x1, y1, x2, y2] in pixel coordinates
            x1, y1, x2, y2 = line.bbox

            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
                "y": y1 / page_h,
                "width": (x2 - x1) / page_w,
                "height": (y2 - y1) / page_h,
                "polygon": None,
                "text": line.text,
            })

    return all_blocks
|
||||||
93
ocr-service/main.py
Normal file
93
ocr-service/main.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
"""OCR microservice — FastAPI app with Surya and Kraken engine support."""
|
||||||
|
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import pypdfium2 as pdfium
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from engines import kraken as kraken_engine
|
||||||
|
from engines import surya as surya_engine
|
||||||
|
from models import OcrBlock, OcrRequest
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_models_ready = False
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: load every OCR engine, then serve.

    Sets the module-level ``_models_ready`` flag once both engines have run
    their ``load_models()``; the code after ``yield`` runs on shutdown.
    """
    global _models_ready

    logger.info("Loading OCR models at startup...")
    # Surya first, Kraken second
    for engine in (surya_engine, kraken_engine):
        engine.load_models()
    _models_ready = True
    logger.info("All OCR models loaded — ready to accept requests")

    yield

    logger.info("Shutting down OCR service")
|
||||||
|
|
||||||
|
|
||||||
|
app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health():
    """Readiness probe.

    Responds with a status document once the startup hook has flagged the
    models as ready; until then the request is rejected with HTTP 503.
    """
    if _models_ready:
        return {"status": "ok", "surya": True, "kraken": kraken_engine.is_available()}
    raise HTTPException(status_code=503, detail="Models not loaded yet")
|
||||||
|
|
||||||
|
|
||||||
|
import asyncio  # noqa: E402 — kept next to its only user, the /ocr handler below

# Inference is moved off the event loop (see run_ocr). This lock keeps jobs
# strictly one-at-a-time, as they were when inference ran on the loop itself.
_ocr_lock = asyncio.Lock()


@app.post("/ocr", response_model=list[OcrBlock])
async def run_ocr(request: OcrRequest):
    """Run OCR on a PDF document.

    Downloads the PDF from ``request.pdf_url``, converts its pages to images
    and runs the engine selected by ``request.script_type``
    (``HANDWRITING_KURRENT`` → Kraken, anything else → Surya).

    The engine call is blocking and CPU-bound, so it is executed in a worker
    thread; running it directly in this coroutine would stall the event loop
    and make ``/health`` unresponsive for the duration of the job.

    Raises:
        HTTPException: 503 while models are still loading; 400 when Kurrent
            is requested but no Kraken model is available.
    """
    if not _models_ready:
        raise HTTPException(status_code=503, detail="Models not loaded yet")

    images = await _download_and_convert_pdf(request.pdf_url)

    script_type = request.script_type.upper()

    if script_type == "HANDWRITING_KURRENT":
        if not kraken_engine.is_available():
            raise HTTPException(
                status_code=400,
                detail="Kraken model not available — cannot process Kurrent script",
            )
        extract = kraken_engine.extract_blocks
    else:
        # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
        extract = surya_engine.extract_blocks

    async with _ocr_lock:
        blocks = await asyncio.to_thread(extract, images, request.language)

    return [OcrBlock(**b) for b in blocks]
|
||||||
|
|
||||||
|
|
||||||
|
async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
    """Download a PDF from ``url`` and render each page to a PIL image.

    Args:
        url: Location of the PDF; fetched with a 300-second timeout.

    Returns:
        One PIL image per page, in page order, rendered at 300 DPI.

    Raises:
        httpx.HTTPStatusError: If the download answers with an error status.
    """
    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
        response = await client.get(url)
        response.raise_for_status()

    pdf = pdfium.PdfDocument(io.BytesIO(response.content))
    try:
        images = []
        for page_idx in range(len(pdf)):
            page = pdf[page_idx]
            # PDF user space is 72 units per inch, so 300/72 yields 300 DPI
            bitmap = page.render(scale=300 / 72)
            images.append(bitmap.to_pil())
        return images
    finally:
        # Release the native document handle deterministically instead of
        # leaving it to garbage collection
        pdf.close()
|
||||||
20
ocr-service/models.py
Normal file
20
ocr-service/models.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
# Request body of POST /ocr. Fields with an alias are read from the
# camelCase key named by that alias.
class OcrRequest(BaseModel):
    # URL the PDF is downloaded from (JSON key: "pdfUrl"); required
    pdf_url: str = Field(..., alias="pdfUrl")
    # Script classification used to pick the engine (JSON key: "scriptType");
    # defaults to "UNKNOWN"
    script_type: str = Field("UNKNOWN", alias="scriptType")
    # Language code passed on to the OCR engine; defaults to "de"
    language: str = "de"
|
||||||
|
|
||||||
|
|
||||||
|
# One recognised text block as returned by POST /ocr.
class OcrBlock(BaseModel):
    # Page the block belongs to (JSON key: "pageNumber"); required
    page_number: int = Field(..., alias="pageNumber")
    # Bounding box of the block: origin (x, y) plus width and height
    x: float
    y: float
    width: float
    height: float
    # Optional outline as a list of [x, y] pairs; None when not provided
    polygon: list[list[float]] | None = None
    # Recognised text
    text: str

    class Config:
        # Allow construction by field name ("page_number") as well as by alias
        populate_by_name = True
|
||||||
6
ocr-service/requirements.txt
Normal file
6
ocr-service/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
fastapi[standard]==0.115.6
|
||||||
|
surya-ocr==0.6.3
|
||||||
|
kraken==5.2.9
|
||||||
|
pillow==11.1.0
|
||||||
|
pypdfium2==4.30.0
|
||||||
|
httpx==0.28.1
|
||||||
Reference in New Issue
Block a user