fix(ocr): extract PDF pages as PNGs before running kraken OCR

Kraken's `-f pdf` mode tries to write its output next to the input file, which fails because the input directory is mounted read-only. Instead, extract the PDF pages as PNGs via pypdfium2 (already installed), then run kraken on each page image. Both models now run in a single container per PDF to avoid the overhead of starting a second container.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 20:37:29 +02:00
parent 31519af1a4
commit dd078d50da
1 changed file with 34 additions and 11 deletions
--- a/scripts/download-kraken-models.sh
+++ b/scripts/download-kraken-models.sh
@@ -99,21 +99,44 @@ run_evaluation() {
        echo "──── $pdf ────"
-        echo "  Model 1: $MODEL_1_NAME ..."
+        # Run both models inside a single container run:
        # 1. Extract PDF pages as PNGs (pypdfium2 is already installed)
        # 2. Run kraken on each page image for both models
        # 3. Concatenate per-page output into one file per model
        docker compose run --rm \
            -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
-            -v "$(cd "$dir_1" && pwd):/eval-output" \
+            -v "$(cd "$dir_1" && pwd):/eval-out-1" \
            -v "$(cd "$dir_2" && pwd):/eval-out-2" \
            "$COMPOSE_SERVICE" \
-            kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \
+            python3 -c "
-            2>/dev/null || echo "  ⚠ Model 1 failed on $pdf"
+import pypdfium2 as pdfium, subprocess, sys, os
-        echo "  Model 2: $MODEL_2_NAME ..."
+pdf = pdfium.PdfDocument('/eval-input/$pdf')
-        docker compose run --rm \
+pages = []
-            -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
+for i in range(len(pdf)):
-            -v "$(cd "$dir_2" && pwd):/eval-output" \
+    bmp = pdf[i].render(scale=300/72)
-            "$COMPOSE_SERVICE" \
+    path = f'/tmp/page_{i:04d}.png'
-            kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \
+    bmp.to_pil().save(path)
-            2>/dev/null || echo "  ⚠ Model 2 failed on $pdf"
+    pages.append(path)
 print(f'Extracted {len(pages)} pages')
 for label, model, outdir in [
    ('Model 1', '$MODEL_1_PATH', '/eval-out-1'),
    ('Model 2', '$MODEL_2_PATH', '/eval-out-2'),
 ]:
    print(f'  {label}...')
    combined = ''
    for p in pages:
        args = ['kraken', '-i', p, '/dev/stdout', 'segment', '-bl', 'ocr', '-m', model]
        r = subprocess.run(args, capture_output=True, text=True)
        combined += r.stdout
        if r.returncode != 0:
            print(f'    ⚠ failed on {os.path.basename(p)}: {r.stderr[:200]}', file=sys.stderr)
    with open(f'{outdir}/${basename}.txt', 'w') as f:
        f.write(combined)
    lines = combined.count(chr(10))
    print(f'    → {lines} lines')
 " || echo "  ⚠ Failed on $pdf"
        echo ""
    done