feat(ocr): integrate preprocessing into stream and batch endpoints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,7 @@ from confidence import apply_confidence_markers, get_threshold
|
||||
from engines import kraken as kraken_engine
|
||||
from engines import surya as surya_engine
|
||||
from models import OcrBlock, OcrRequest
|
||||
from preprocessing import preprocess_page
|
||||
|
||||
TRAINING_TOKEN = os.environ.get("TRAINING_TOKEN", "")
|
||||
KRAKEN_MODEL_PATH = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
|
||||
@@ -86,6 +87,10 @@ async def run_ocr(request: OcrRequest):
|
||||
|
||||
images = await _download_and_convert_pdf(request.pdfUrl)
|
||||
|
||||
for i, img in enumerate(images):
|
||||
images[i] = await asyncio.to_thread(preprocess_page, img)
|
||||
del img
|
||||
|
||||
script_type = request.scriptType.upper()
|
||||
|
||||
if script_type == "HANDWRITING_KURRENT":
|
||||
@@ -157,6 +162,8 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
continue
|
||||
|
||||
try:
|
||||
yield json.dumps({"type": "preprocessing", "pageNumber": page_idx}) + "\n"
|
||||
image = await asyncio.to_thread(preprocess_page, image)
|
||||
blocks = []
|
||||
for region in page_regions:
|
||||
text = await asyncio.to_thread(
|
||||
@@ -214,6 +221,8 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
|
||||
for page_idx, image in enumerate(images, start=1):
|
||||
try:
|
||||
yield json.dumps({"type": "preprocessing", "pageNumber": page_idx}) + "\n"
|
||||
image = await asyncio.to_thread(preprocess_page, image)
|
||||
blocks = await asyncio.to_thread(
|
||||
engine.extract_page_blocks, image, page_idx, request.language
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user