fix(ocr): extract PDF pages as PNGs before running kraken OCR
Kraken's `-f pdf` mode tries to write its output next to the input file, which fails on read-only mounts. Instead, extract the pages as PNGs via pypdfium2 (already installed), then run kraken on each page image. Both models run in a single container per PDF to avoid the overhead of starting a separate container for each model.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -99,21 +99,44 @@ run_evaluation() {
|
|||||||
|
|
||||||
echo "──── $pdf ────"
|
echo "──── $pdf ────"
|
||||||
|
|
||||||
echo " Model 1: $MODEL_1_NAME ..."
|
# Run both models inside a single container run:
|
||||||
|
# 1. Extract PDF pages as PNGs (pypdfium2 is already installed)
|
||||||
|
# 2. Run kraken on each page image for both models
|
||||||
|
# 3. Concatenate per-page output into one file per model
|
||||||
docker compose run --rm \
|
docker compose run --rm \
|
||||||
-v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
|
-v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
|
||||||
-v "$(cd "$dir_1" && pwd):/eval-output" \
|
-v "$(cd "$dir_1" && pwd):/eval-out-1" \
|
||||||
|
-v "$(cd "$dir_2" && pwd):/eval-out-2" \
|
||||||
"$COMPOSE_SERVICE" \
|
"$COMPOSE_SERVICE" \
|
||||||
kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \
|
python3 -c "
|
||||||
2>/dev/null || echo " ⚠ Model 1 failed on $pdf"
|
import pypdfium2 as pdfium, subprocess, sys, os
|
||||||
|
|
||||||
echo " Model 2: $MODEL_2_NAME ..."
|
pdf = pdfium.PdfDocument('/eval-input/$pdf')
|
||||||
docker compose run --rm \
|
pages = []
|
||||||
-v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
|
for i in range(len(pdf)):
|
||||||
-v "$(cd "$dir_2" && pwd):/eval-output" \
|
bmp = pdf[i].render(scale=300/72)
|
||||||
"$COMPOSE_SERVICE" \
|
path = f'/tmp/page_{i:04d}.png'
|
||||||
kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \
|
bmp.to_pil().save(path)
|
||||||
2>/dev/null || echo " ⚠ Model 2 failed on $pdf"
|
pages.append(path)
|
||||||
|
print(f'Extracted {len(pages)} pages')
|
||||||
|
|
||||||
|
for label, model, outdir in [
|
||||||
|
('Model 1', '$MODEL_1_PATH', '/eval-out-1'),
|
||||||
|
('Model 2', '$MODEL_2_PATH', '/eval-out-2'),
|
||||||
|
]:
|
||||||
|
print(f' {label}...')
|
||||||
|
combined = ''
|
||||||
|
for p in pages:
|
||||||
|
args = ['kraken', '-i', p, '/dev/stdout', 'segment', '-bl', 'ocr', '-m', model]
|
||||||
|
r = subprocess.run(args, capture_output=True, text=True)
|
||||||
|
combined += r.stdout
|
||||||
|
if r.returncode != 0:
|
||||||
|
print(f' ⚠ failed on {os.path.basename(p)}: {r.stderr[:200]}', file=sys.stderr)
|
||||||
|
with open(f'{outdir}/${basename}.txt', 'w') as f:
|
||||||
|
f.write(combined)
|
||||||
|
lines = combined.count(chr(10))
|
||||||
|
print(f' → {lines} lines')
|
||||||
|
" || echo " ⚠ Failed on $pdf"
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
done
|
done
|
||||||
|
|||||||
Reference in New Issue
Block a user