fix(ocr): extract PDF pages as PNGs before running kraken OCR
Kraken's `-f pdf` mode tries to write its output next to the input file, which fails when the input directory is mounted read-only. Instead, extract the PDF pages as PNGs via pypdfium2 (already installed), then run kraken on each page image. Both models now run in a single container per PDF, avoiding the overhead of starting a separate container for each model.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -99,21 +99,44 @@ run_evaluation() {
|
||||
|
||||
echo "──── $pdf ────"
|
||||
|
||||
echo " Model 1: $MODEL_1_NAME ..."
|
||||
# Run both models inside a single container run:
|
||||
# 1. Extract PDF pages as PNGs (pypdfium2 is already installed)
|
||||
# 2. Run kraken on each page image for both models
|
||||
# 3. Concatenate per-page output into one file per model
|
||||
docker compose run --rm \
|
||||
-v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
|
||||
-v "$(cd "$dir_1" && pwd):/eval-output" \
|
||||
-v "$(cd "$dir_1" && pwd):/eval-out-1" \
|
||||
-v "$(cd "$dir_2" && pwd):/eval-out-2" \
|
||||
"$COMPOSE_SERVICE" \
|
||||
kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \
|
||||
2>/dev/null || echo " ⚠ Model 1 failed on $pdf"
|
||||
python3 -c "
|
||||
import pypdfium2 as pdfium, subprocess, sys, os
|
||||
|
||||
echo " Model 2: $MODEL_2_NAME ..."
|
||||
docker compose run --rm \
|
||||
-v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
|
||||
-v "$(cd "$dir_2" && pwd):/eval-output" \
|
||||
"$COMPOSE_SERVICE" \
|
||||
kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \
|
||||
2>/dev/null || echo " ⚠ Model 2 failed on $pdf"
|
||||
pdf = pdfium.PdfDocument('/eval-input/$pdf')
|
||||
pages = []
|
||||
for i in range(len(pdf)):
|
||||
bmp = pdf[i].render(scale=300/72)
|
||||
path = f'/tmp/page_{i:04d}.png'
|
||||
bmp.to_pil().save(path)
|
||||
pages.append(path)
|
||||
print(f'Extracted {len(pages)} pages')
|
||||
|
||||
for label, model, outdir in [
|
||||
('Model 1', '$MODEL_1_PATH', '/eval-out-1'),
|
||||
('Model 2', '$MODEL_2_PATH', '/eval-out-2'),
|
||||
]:
|
||||
print(f' {label}...')
|
||||
combined = ''
|
||||
for p in pages:
|
||||
args = ['kraken', '-i', p, '/dev/stdout', 'segment', '-bl', 'ocr', '-m', model]
|
||||
r = subprocess.run(args, capture_output=True, text=True)
|
||||
combined += r.stdout
|
||||
if r.returncode != 0:
|
||||
print(f' ⚠ failed on {os.path.basename(p)}: {r.stderr[:200]}', file=sys.stderr)
|
||||
with open(f'{outdir}/${basename}.txt', 'w') as f:
|
||||
f.write(combined)
|
||||
lines = combined.count(chr(10))
|
||||
print(f' → {lines} lines')
|
||||
" || echo " ⚠ Failed on $pdf"
|
||||
|
||||
echo ""
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user