From f12b41161ecf088a6f5a954ce74466b53a07178f Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 20:05:29 +0200 Subject: [PATCH] fix(ocr): update model script for kraken 7 DOI-based downloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kraken 7 uses DOIs (not short names) to identify models from Zenodo. Updated to use actual DOIs: - 10.5281/zenodo.7933463 — German handwriting HTR - 10.5281/zenodo.13788177 — McCATMuS generic handwritten/printed/typed Added -f pdf flag for PDF input, volume mounts for import dir, and post-download copy from htrmopo cache to the models volume. Co-Authored-By: Claude Sonnet 4.6 --- scripts/download-kraken-models.sh | 62 ++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/scripts/download-kraken-models.sh b/scripts/download-kraken-models.sh index 18d2cdc7..8c9898b1 100755 --- a/scripts/download-kraken-models.sh +++ b/scripts/download-kraken-models.sh @@ -5,20 +5,23 @@ set -euo pipefail # # Usage: # ./scripts/download-kraken-models.sh # download models + run evaluation -# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9) -# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de) +# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 +# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 # ./scripts/download-kraken-models.sh --eval-only # re-run evaluation (models already downloaded) COMPOSE_SERVICE="ocr-service" MODEL_DIR="/app/models" ACTIVE_MODEL="$MODEL_DIR/german_kurrent.mlmodel" -MODEL_1_NAME="german_kurrent_manu_9" -MODEL_1_DESC="19th-century German administrative Kurrent (HTR-United)" +# Kraken 7 uses DOIs to identify models from HTR-United / Zenodo +MODEL_1_DOI="10.5281/zenodo.7933463" +MODEL_1_NAME="german_handwriting" +MODEL_1_DESC="HTR model for German manuscripts (handwritten text recognition)" 
MODEL_1_PATH="$MODEL_DIR/$MODEL_1_NAME.mlmodel" -MODEL_2_NAME="kurrent-de" -MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)" +MODEL_2_DOI="10.5281/zenodo.13788177" +MODEL_2_NAME="mccatmus" +MODEL_2_DESC="McCATMuS — generic model for handwritten, printed & typewritten (16th c. onward)" MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel" EVAL_PDFS=("Eu-0693.pdf" "Eu-0692.pdf" "W-0150.pdf" "W-0575.pdf") @@ -27,18 +30,44 @@ OUTPUT_BASE="./ocr-model-evaluation" # ─── Functions ──────────────────────────────────────────────────────────────── +download_model() { + local doi="$1" + local dest="$2" + local name="$3" + + echo " Downloading $name ($doi)..." + + # kraken get downloads into a subdirectory of /root/.local/share/htrmopo/ + # A marker is touched in the same container first, so find -newer only matches files written by this download + docker compose run --rm "$COMPOSE_SERVICE" sh -c " + touch /tmp/.download-marker && kraken get $doi 2>&1 || exit 1 + # Copy the first .mlmodel written after the marker to the target + FOUND=\$(find /root/.local/share/htrmopo -name '*.mlmodel' -newer /tmp/.download-marker 2>/dev/null | head -1) + if [ -n \"\$FOUND\" ]; then + cp \"\$FOUND\" \"$dest\" + echo \"Saved to $dest\" + else + echo 'ERROR: No .mlmodel file found after download' + exit 1 + fi + " +} + download_models() { echo "Downloading Kraken HTR models into the ocr_models volume..." echo "" + # NOTE(review): this marker is touched in its own --rm container; download_model touches its own marker, so confirm this call is still needed + docker compose run --rm "$COMPOSE_SERVICE" touch /tmp/.download-marker + echo "Model 1: $MODEL_1_NAME" echo " $MODEL_1_DESC" - docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH" + download_model "$MODEL_1_DOI" "$MODEL_1_PATH" "$MODEL_1_NAME" echo "" echo "Model 2: $MODEL_2_NAME" echo " $MODEL_2_DESC" - docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH" + download_model "$MODEL_2_DOI" "$MODEL_2_PATH" "$MODEL_2_NAME" echo "" echo "Both models downloaded." 
@@ -51,7 +80,6 @@ run_evaluation() { echo "═══════════════════════════════════════════════════════" echo "" - # Create output directories on the host local dir_1="$OUTPUT_BASE/$MODEL_1_NAME" local dir_2="$OUTPUT_BASE/$MODEL_2_NAME" mkdir -p "$dir_1" "$dir_2" @@ -67,22 +95,20 @@ run_evaluation() { echo "──── $pdf ────" - # Model 1 echo " Model 1: $MODEL_1_NAME ..." docker compose run --rm \ -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ -v "$(cd "$dir_1" && pwd):/eval-output" \ "$COMPOSE_SERVICE" \ - kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \ + kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \ 2>/dev/null || echo " ⚠ Model 1 failed on $pdf" - # Model 2 echo " Model 2: $MODEL_2_NAME ..." docker compose run --rm \ -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ -v "$(cd "$dir_2" && pwd):/eval-output" \ "$COMPOSE_SERVICE" \ - kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \ + kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \ 2>/dev/null || echo " ⚠ Model 2 failed on $pdf" echo "" @@ -91,20 +117,22 @@ run_evaluation() { echo "═══════════════════════════════════════════════════════" echo " Evaluation complete. 
Results:" echo "" - echo " Model 1 ($MODEL_1_NAME):" + echo " Model 1 — $MODEL_1_NAME ($MODEL_1_DOI):" for f in "$dir_1"/*.txt; do [[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes" done echo "" - echo " Model 2 ($MODEL_2_NAME):" + echo " Model 2 — $MODEL_2_NAME ($MODEL_2_DOI):" for f in "$dir_2"/*.txt; do [[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes" done echo "" - echo " Compare the outputs side by side:" + echo " Compare outputs:" echo " diff $dir_1/Eu-0693.txt $dir_2/Eu-0693.txt" + echo " # or view individually:" + echo " cat $dir_1/Eu-0693.txt" echo "" - echo " Then activate the better model:" + echo " Activate the better model:" echo " ./scripts/download-kraken-models.sh --activate 1 # $MODEL_1_NAME" echo " ./scripts/download-kraken-models.sh --activate 2 # $MODEL_2_NAME" echo "═══════════════════════════════════════════════════════"