diff --git a/scripts/download-kraken-models.sh b/scripts/download-kraken-models.sh index 6e3d5a47..c7387322 100755 --- a/scripts/download-kraken-models.sh +++ b/scripts/download-kraken-models.sh @@ -99,21 +99,44 @@ run_evaluation() { echo "──── $pdf ────" - echo " Model 1: $MODEL_1_NAME ..." + # Run both models inside a single container run: + # 1. Extract PDF pages as PNGs (pypdfium2 is already installed) + # 2. Run kraken on each page image for both models + # 3. Concatenate per-page output into one file per model docker compose run --rm \ -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ - -v "$(cd "$dir_1" && pwd):/eval-output" \ + -v "$(cd "$dir_1" && pwd):/eval-out-1" \ + -v "$(cd "$dir_2" && pwd):/eval-out-2" \ "$COMPOSE_SERVICE" \ - kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \ - 2>/dev/null || echo " ⚠ Model 1 failed on $pdf" + python3 -c " +import pypdfium2 as pdfium, subprocess, sys, os - echo " Model 2: $MODEL_2_NAME ..." - docker compose run --rm \ - -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ - -v "$(cd "$dir_2" && pwd):/eval-output" \ - "$COMPOSE_SERVICE" \ - kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \ - 2>/dev/null || echo " ⚠ Model 2 failed on $pdf" +pdf = pdfium.PdfDocument('/eval-input/$pdf') +pages = [] +for i in range(len(pdf)): + bmp = pdf[i].render(scale=300/72) + path = f'/tmp/page_{i:04d}.png' + bmp.to_pil().save(path) + pages.append(path) +print(f'Extracted {len(pages)} pages') + +for label, model, outdir in [ + ('Model 1', '$MODEL_1_PATH', '/eval-out-1'), + ('Model 2', '$MODEL_2_PATH', '/eval-out-2'), +]: + print(f' {label}...') + combined = '' + for p in pages: + args = ['kraken', '-i', p, '/dev/stdout', 'segment', '-bl', 'ocr', '-m', model] + r = subprocess.run(args, capture_output=True, text=True) + combined += r.stdout + if r.returncode != 0: + print(f' ⚠ failed on {os.path.basename(p)}: {r.stderr[:200]}', file=sys.stderr) + with open(f'{outdir}/${basename}.txt', 'w') as f: + f.write(combined) + lines = combined.count(chr(10)) + print(f' → {lines} lines') +" || echo " ⚠ Failed on $pdf" echo "" done