Files
familienarchiv/scripts/download-kraken-models.sh
Marcel c0004f5e6f
Some checks failed
CI / Unit & Component Tests (push) Failing after 1s
CI / Backend Unit Tests (push) Failing after 0s
CI / Unit & Component Tests (pull_request) Failing after 1s
CI / Backend Unit Tests (pull_request) Failing after 0s
fix(ocr): parse kraken 'Model dir' output to locate downloaded model
The previous approach used find across the htrmopo cache which failed
because -newer /tmp ran in a separate container. Now parses the
'Model dir: <path>' line from kraken get output directly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 20:09:23 +02:00

181 lines
6.6 KiB
Bash
Executable File

#!/bin/bash
set -euo pipefail
# Downloads Kraken HTR models, runs OCR evaluation on sample PDFs, and activates the winner.
#
# Usage:
#   ./scripts/download-kraken-models.sh               # download models + run evaluation
#   ./scripts/download-kraken-models.sh --activate 1  # activate model 1
#   ./scripts/download-kraken-models.sh --activate 2  # activate model 2
#   ./scripts/download-kraken-models.sh --eval-only   # re-run evaluation (models already downloaded)
#
# Requirements:
#   - `docker compose` on PATH; every kraken/cp command is executed through
#     `docker compose run --rm <COMPOSE_SERVICE>`.
#   - The sample PDFs listed in EVAL_PDFS inside IMPORT_DIR (missing ones are skipped).
#
# All values below are constants; they are marked readonly so that a function
# cannot overwrite them by accident.

# Compose service in which kraken is executed.
readonly COMPOSE_SERVICE="ocr-service"

# Model directory as seen from inside the service container, and the file name
# that --activate overwrites with the chosen model.
readonly MODEL_DIR="/app/models"
readonly ACTIVE_MODEL="$MODEL_DIR/german_kurrent.mlmodel"

# Kraken 7 uses DOIs to identify models from HTR-United / Zenodo
readonly MODEL_1_DOI="10.5281/zenodo.7933463"
readonly MODEL_1_NAME="german_handwriting"
readonly MODEL_1_DESC="HTR model for German manuscripts (handwritten text recognition)"
readonly MODEL_1_PATH="$MODEL_DIR/$MODEL_1_NAME.mlmodel"

readonly MODEL_2_DOI="10.5281/zenodo.13788177"
readonly MODEL_2_NAME="mccatmus"
readonly MODEL_2_DESC="McCATMuS — generic model for handwritten, printed & typewritten (16th c. onward)"
readonly MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel"

# Sample documents used for the side-by-side evaluation (looked up in IMPORT_DIR).
readonly -a EVAL_PDFS=("Eu-0693.pdf" "Eu-0692.pdf" "W-0150.pdf" "W-0575.pdf")

# Host-side directories, relative to the directory the script is started from.
readonly IMPORT_DIR="./import"
readonly OUTPUT_BASE="./ocr-model-evaluation"
# ─── Functions ────────────────────────────────────────────────────────────────
#######################################
# Downloads one Kraken model inside the OCR service container and copies the
# resulting .mlmodel file to a fixed destination path.
# Globals:
#   COMPOSE_SERVICE (read) - Compose service the commands run in
# Arguments:
#   $1 - DOI passed to `kraken get`
#   $2 - destination path (inside the container) for the .mlmodel file
#   $3 - human-readable model name, used only for the progress message
# Outputs:
#   Progress and the raw kraken output on stdout; error messages on stderr.
# Returns:
#   Exit status of `docker compose run`; the in-container script exits 1 when
#   the model directory cannot be determined, contains no .mlmodel file, or
#   the copy fails.
#######################################
download_model() {
  local doi="$1"
  local dest="$2"
  local name="$3"

  echo " Downloading $name ($doi)..."

  # kraken get downloads to /root/.local/share/htrmopo/<uuid>/<name>.mlmodel
  # Parse the "Model dir: <path>" line from kraken output to locate the file.
  #
  # The DOI and destination are handed to the inner shell as positional
  # parameters ($1, $2) instead of being pasted into the script text, so
  # values containing spaces or shell metacharacters cannot break or alter
  # the script.
  docker compose run --rm "$COMPOSE_SERVICE" sh -c '
    doi=$1
    dest=$2

    output=$(kraken get "$doi" 2>&1)
    printf "%s\n" "$output"

    # First "Model dir: <path>" occurrence; <path> ends at the first space.
    model_dir=$(printf "%s\n" "$output" \
      | sed -n "s/.*Model dir: \([^ ][^ ]*\).*/\1/p" \
      | head -n 1)
    if [ -z "$model_dir" ] || [ ! -d "$model_dir" ]; then
      echo "ERROR: Could not parse model directory from kraken output" >&2
      exit 1
    fi

    found=$(find "$model_dir" -name "*.mlmodel" | head -n 1)
    if [ -z "$found" ]; then
      echo "ERROR: No .mlmodel file in $model_dir" >&2
      ls -la "$model_dir" >&2
      exit 1
    fi

    # Report success only if the copy really succeeded.
    if ! cp -- "$found" "$dest"; then
      echo "ERROR: Could not copy $found to $dest" >&2
      exit 1
    fi
    echo "Saved to $dest"
  ' sh "$doi" "$dest"
}
#######################################
# Fetches both candidate models, one after the other, via download_model.
# Globals:
#   MODEL_<n>_NAME, MODEL_<n>_DESC, MODEL_<n>_DOI, MODEL_<n>_PATH (read), n = 1, 2
# Arguments:
#   None
# Outputs:
#   A short banner per model plus whatever download_model prints.
#######################################
download_models() {
  local idx name_var desc_var doi_var path_var

  printf '%s\n\n' "Downloading Kraken HTR models into the ocr_models volume..."

  for idx in 1 2; do
    # Resolve the MODEL_<idx>_* settings through indirect expansion.
    name_var="MODEL_${idx}_NAME"
    desc_var="MODEL_${idx}_DESC"
    doi_var="MODEL_${idx}_DOI"
    path_var="MODEL_${idx}_PATH"

    printf 'Model %s: %s\n' "$idx" "${!name_var}"
    printf ' %s\n' "${!desc_var}"
    download_model "${!doi_var}" "${!path_var}" "${!name_var}"
    printf '\n'
  done

  printf '%s\n' "Both models downloaded."
}
#######################################
# Runs both models over every sample PDF and prints a size summary of the
# resulting text files.
#
# For each PDF in EVAL_PDFS that exists in IMPORT_DIR, kraken is started once
# per model through `docker compose run`, with IMPORT_DIR mounted read-only at
# /eval-input and the per-model output directory mounted at /eval-output.
# A failing kraken run is reported and does not abort the evaluation.
#
# Globals:
#   COMPOSE_SERVICE, EVAL_PDFS, IMPORT_DIR, OUTPUT_BASE,
#   MODEL_<n>_NAME, MODEL_<n>_DOI, MODEL_<n>_PATH (all read), n = 1, 2
# Arguments:
#   None
# Outputs:
#   Progress, warnings and the summary on stdout; creates
#   $OUTPUT_BASE/<model name>/ on the host.
#######################################
run_evaluation() {
  local rule="═══════════════════════════════════════════════════════"
  # Indexed by model number (1 and 2) so both models share one code path.
  local -a names=([1]="$MODEL_1_NAME" [2]="$MODEL_2_NAME")
  local -a dois=([1]="$MODEL_1_DOI" [2]="$MODEL_2_DOI")
  local -a paths=([1]="$MODEL_1_PATH" [2]="$MODEL_2_PATH")
  local -a out_dirs=([1]="$OUTPUT_BASE/$MODEL_1_NAME" [2]="$OUTPUT_BASE/$MODEL_2_NAME")
  # Loop variables are local so they do not leak into the global scope.
  local pdf src stem idx f

  echo ""
  echo "$rule"
  echo " Running OCR evaluation on ${#EVAL_PDFS[@]} documents"
  echo "$rule"
  echo ""

  mkdir -p "${out_dirs[@]}"

  for pdf in "${EVAL_PDFS[@]}"; do
    src="$IMPORT_DIR/$pdf"
    stem="${pdf%.pdf}"
    if [[ ! -f "$src" ]]; then
      echo "SKIP: $src not found"
      continue
    fi

    echo "──── $pdf ────"
    for idx in 1 2; do
      echo " Model $idx: ${names[idx]} ..."
      # Bind mounts need absolute host paths, hence the cd/pwd round trip.
      # stderr of the run is discarded on purpose; a failure is reported
      # through the warning below and must not stop the remaining runs.
      docker compose run --rm \
        -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
        -v "$(cd "${out_dirs[idx]}" && pwd):/eval-output" \
        "$COMPOSE_SERVICE" \
        kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${stem}.txt" segment -bl ocr -m "${paths[idx]}" \
        2>/dev/null || echo " ⚠ Model $idx failed on $pdf"
    done
    echo ""
  done

  echo "$rule"
  echo " Evaluation complete. Results:"
  echo ""
  for idx in 1 2; do
    echo " Model $idx — ${names[idx]} (${dois[idx]}):"
    for f in "${out_dirs[idx]}"/*.txt; do
      # An unmatched glob stays literal; skip anything that is not a file.
      [[ -f "$f" ]] || continue
      echo " ${f##*/}: $(wc -l < "$f") lines, $(wc -c < "$f") bytes"
    done
    echo ""
  done

  echo " Compare outputs:"
  echo " diff ${out_dirs[1]}/Eu-0693.txt ${out_dirs[2]}/Eu-0693.txt"
  echo " # or view individually:"
  echo " cat ${out_dirs[1]}/Eu-0693.txt"
  echo ""
  echo " Activate the better model:"
  echo " ./scripts/download-kraken-models.sh --activate 1 # ${names[1]}"
  echo " ./scripts/download-kraken-models.sh --activate 2 # ${names[2]}"
  echo "$rule"
}
#######################################
# Makes one of the two downloaded models the active one by copying it over
# ACTIVE_MODEL inside the OCR service container.
# Globals:
#   COMPOSE_SERVICE, ACTIVE_MODEL, MODEL_<n>_NAME, MODEL_<n>_PATH (read), n = 1, 2
# Arguments:
#   $1 - model number, "1" or "2"
# Outputs:
#   Progress and a restart hint on stdout; the usage error on stderr.
# Exits:
#   1 if $1 is neither 1 nor 2; the copy command's status if the copy fails.
#######################################
activate_model() {
  local choice="$1"
  local model_name model_path

  case "$choice" in
    1)
      model_name="$MODEL_1_NAME"
      model_path="$MODEL_1_PATH"
      ;;
    2)
      model_name="$MODEL_2_NAME"
      model_path="$MODEL_2_PATH"
      ;;
    *)
      echo "Error: --activate expects 1 or 2" >&2
      exit 1
      ;;
  esac

  echo "Activating model $choice: $model_name"
  # Stop here on failure even when the caller has errexit disabled, so the
  # success message below is never printed for a copy that did not happen.
  docker compose run --rm "$COMPOSE_SERVICE" cp "$model_path" "$ACTIVE_MODEL" || exit

  echo "Active model is now: $ACTIVE_MODEL"
  echo "Restart the OCR service to load the new model:"
  echo " docker compose restart $COMPOSE_SERVICE"
}
# ─── Main ─────────────────────────────────────────────────────────────────────
#######################################
# Entry point: dispatches on the first command-line argument.
#   (none)            download both models, then evaluate them
#   --eval-only       evaluate only
#   --activate <1|2>  activate the given model
#   -h, --help        print usage
# Any other argument is rejected with exit status 2 instead of silently
# starting a full download and evaluation.
#######################################
main() {
  case "${1:-}" in
    --activate)
      activate_model "${2:-}"
      ;;
    --eval-only)
      run_evaluation
      ;;
    "")
      download_models
      run_evaluation
      ;;
    -h | --help)
      print_usage
      ;;
    *)
      echo "Error: unknown option: $1" >&2
      print_usage >&2
      exit 2
      ;;
  esac
}

# Prints the supported invocations.
print_usage() {
  printf '%s\n' \
    "Usage:" \
    "  $0                  download models + run evaluation" \
    "  $0 --eval-only      re-run evaluation (models already downloaded)" \
    "  $0 --activate 1|2   activate model 1 or 2" \
    "  $0 --help           show this help"
}

main "$@"