From 0af474967743a936dafcbd253898d25219cabdbc Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 19:41:59 +0200 Subject: [PATCH] feat(ocr): extend model script with automatic OCR evaluation Downloads both Kraken models, then runs each against 4 sample PDFs from the import folder (Eu-0693, Eu-0692, W-0150, W-0575). Output goes to ocr-model-evaluation/{model-name}/{pdf-name}.txt for side-by-side comparison. Usage: ./scripts/download-kraken-models.sh # download + evaluate ./scripts/download-kraken-models.sh --eval-only # re-run evaluation ./scripts/download-kraken-models.sh --activate 1 # pick winner Co-Authored-By: Claude Sonnet 4.6 --- scripts/download-kraken-models.sh | 117 ++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 29 deletions(-) diff --git a/scripts/download-kraken-models.sh b/scripts/download-kraken-models.sh index 1486e66b..18d2cdc7 100755 --- a/scripts/download-kraken-models.sh +++ b/scripts/download-kraken-models.sh @@ -1,13 +1,13 @@ #!/bin/bash set -euo pipefail -# Downloads Kraken HTR models for German Kurrent/Suetterlin into the ocr_models volume. -# Run this once after first deployment, or whenever you want to switch models. +# Downloads Kraken HTR models, runs OCR evaluation on sample PDFs, and activates the winner. 
# # Usage: -# ./scripts/download-kraken-models.sh # download both candidates -# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9) -# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de) +# ./scripts/download-kraken-models.sh # download models + run evaluation +# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9) +# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de) +# ./scripts/download-kraken-models.sh --eval-only # re-run evaluation (models already downloaded) COMPOSE_SERVICE="ocr-service" MODEL_DIR="/app/models" @@ -21,6 +21,10 @@ MODEL_2_NAME="kurrent-de" MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)" MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel" +EVAL_PDFS=("Eu-0693.pdf" "Eu-0692.pdf" "W-0150.pdf" "W-0575.pdf") +IMPORT_DIR="./import" +OUTPUT_BASE="./ocr-model-evaluation" + # ─── Functions ──────────────────────────────────────────────────────────────── download_models() { @@ -29,31 +33,81 @@ download_models() { echo "Model 1: $MODEL_1_NAME" echo " $MODEL_1_DESC" - docker compose run --rm "$COMPOSE_SERVICE" \ - kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH" + docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH" echo "" echo "Model 2: $MODEL_2_NAME" echo " $MODEL_2_DESC" - docker compose run --rm "$COMPOSE_SERVICE" \ - kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH" + docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH" echo "" - echo "Both models downloaded. To test them against a sample document:" + echo "Both models downloaded." 
+} + +run_evaluation() { echo "" - echo " # Copy a sample Kurrent scan into the container:" - echo " docker cp sample-kurrent.png archive-ocr:/tmp/sample.png" + echo "═══════════════════════════════════════════════════════" + echo " Running OCR evaluation on ${#EVAL_PDFS[@]} documents" + echo "═══════════════════════════════════════════════════════" echo "" - echo " # Test model 1:" - echo " docker compose exec ocr-service kraken -i /tmp/sample.png /tmp/out1.txt segment -bl ocr -m $MODEL_1_PATH" - echo " docker compose exec ocr-service cat /tmp/out1.txt" + + # Create output directories on the host + local dir_1="$OUTPUT_BASE/$MODEL_1_NAME" + local dir_2="$OUTPUT_BASE/$MODEL_2_NAME" + mkdir -p "$dir_1" "$dir_2" + + for pdf in "${EVAL_PDFS[@]}"; do + local src="$IMPORT_DIR/$pdf" + local basename="${pdf%.pdf}" + + if [[ ! -f "$src" ]]; then + echo "SKIP: $src not found" + continue + fi + + echo "──── $pdf ────" + + # Model 1 + echo " Model 1: $MODEL_1_NAME ..." + docker compose run --rm \ + -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ + -v "$(cd "$dir_1" && pwd):/eval-output" \ + "$COMPOSE_SERVICE" \ + kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \ + 2>/dev/null || echo " ⚠ Model 1 failed on $pdf" + + # Model 2 + echo " Model 2: $MODEL_2_NAME ..." + docker compose run --rm \ + -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ + -v "$(cd "$dir_2" && pwd):/eval-output" \ + "$COMPOSE_SERVICE" \ + kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \ + 2>/dev/null || echo " ⚠ Model 2 failed on $pdf" + + echo "" + done + + echo "═══════════════════════════════════════════════════════" + echo " Evaluation complete. 
Results:" echo "" - echo " # Test model 2:" - echo " docker compose exec ocr-service kraken -i /tmp/sample.png /tmp/out2.txt segment -bl ocr -m $MODEL_2_PATH" - echo " docker compose exec ocr-service cat /tmp/out2.txt" + echo " Model 1 ($MODEL_1_NAME):" + for f in "$dir_1"/*.txt; do + [[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes" + done echo "" - echo "Then activate the better model:" - echo " ./scripts/download-kraken-models.sh --activate 1 # or 2" + echo " Model 2 ($MODEL_2_NAME):" + for f in "$dir_2"/*.txt; do + [[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes" + done + echo "" + echo " Compare the outputs side by side:" + echo " diff $dir_1/Eu-0693.txt $dir_2/Eu-0693.txt" + echo "" + echo " Then activate the better model:" + echo " ./scripts/download-kraken-models.sh --activate 1 # $MODEL_1_NAME" + echo " ./scripts/download-kraken-models.sh --activate 2 # $MODEL_2_NAME" + echo "═══════════════════════════════════════════════════════" } activate_model() { @@ -61,13 +115,11 @@ activate_model() { case "$choice" in 1) echo "Activating model 1: $MODEL_1_NAME" - docker compose run --rm "$COMPOSE_SERVICE" \ - cp "$MODEL_1_PATH" "$ACTIVE_MODEL" + docker compose run --rm "$COMPOSE_SERVICE" cp "$MODEL_1_PATH" "$ACTIVE_MODEL" ;; 2) echo "Activating model 2: $MODEL_2_NAME" - docker compose run --rm "$COMPOSE_SERVICE" \ - cp "$MODEL_2_PATH" "$ACTIVE_MODEL" + docker compose run --rm "$COMPOSE_SERVICE" cp "$MODEL_2_PATH" "$ACTIVE_MODEL" ;; *) echo "Error: --activate expects 1 or 2" @@ -82,8 +134,15 @@ activate_model() { # ─── Main ───────────────────────────────────────────────────────────────────── -if [[ "${1:-}" == "--activate" ]]; then - activate_model "${2:-}" -else - download_models -fi +case "${1:-}" in + --activate) + activate_model "${2:-}" + ;; + --eval-only) + run_evaluation + ;; + *) + download_models + run_evaluation + ;; +esac