feat(ocr): add Kraken model download and evaluation script
Runbook script to download both HTR-United Kurrent model candidates (german_kurrent_manu_9, kurrent-de) into the ocr_models Docker volume, test them against sample documents, and activate the winner.

Usage:
  ./scripts/download-kraken-models.sh               # download both
  ./scripts/download-kraken-models.sh --activate 1  # pick model 1

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
89
scripts/download-kraken-models.sh
Executable file
89
scripts/download-kraken-models.sh
Executable file
@@ -0,0 +1,89 @@
|
||||
#!/bin/bash
set -euo pipefail

# Downloads Kraken HTR models for German Kurrent/Suetterlin into the ocr_models volume.
# Run this once after first deployment, or whenever you want to switch models.
#
# Usage:
#   ./scripts/download-kraken-models.sh               # download both candidates
#   ./scripts/download-kraken-models.sh --activate 1  # activate model 1 (german_kurrent_manu_9)
#   ./scripts/download-kraken-models.sh --activate 2  # activate model 2 (kurrent-de)

# ─── Configuration ────────────────────────────────────────────────────────────
# All values below are constants; they are marked readonly so that an
# accidental reassignment later in the script fails loudly.

# Compose service whose container is used to run kraken / cp.
readonly COMPOSE_SERVICE="ocr-service"

# Directory inside the container where model files are stored.
readonly MODEL_DIR="/app/models"

# Path the selected candidate is copied to by --activate.
readonly ACTIVE_MODEL="${MODEL_DIR}/german_kurrent.mlmodel"

# Candidate 1
readonly MODEL_1_NAME="german_kurrent_manu_9"
readonly MODEL_1_DESC="19th-century German administrative Kurrent (HTR-United)"
readonly MODEL_1_PATH="${MODEL_DIR}/${MODEL_1_NAME}.mlmodel"

# Candidate 2
readonly MODEL_2_NAME="kurrent-de"
readonly MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)"
readonly MODEL_2_PATH="${MODEL_DIR}/${MODEL_2_NAME}.mlmodel"
|
||||
|
||||
# ─── Functions ────────────────────────────────────────────────────────────────
|
||||
|
||||
#######################################
# Downloads one Kraken model via a throw-away container of the OCR service.
# Globals:   COMPOSE_SERVICE (read)
# Arguments: $1 - ordinal shown to the user
#            $2 - model name passed to `kraken get`
#            $3 - human-readable description
#            $4 - destination path inside the container
# Returns:   status of the `docker compose run` call
#######################################
_download_model() {
  local index="$1" name="$2" desc="$3" dest="$4"

  echo "Model $index: $name"
  echo " $desc"
  # NOTE(review): confirm that the installed kraken version's `get` accepts a
  # model name and an `-o` option; the invocation is kept exactly as before.
  # Explicit `|| return`: this helper is called on the left of `||`, where
  # `set -e` is not in effect.
  docker compose run --rm "$COMPOSE_SERVICE" \
    kraken get "$name" -o "$dest" || return
  echo ""
}

#######################################
# Downloads both model candidates and prints the commands for comparing them
# on a sample document and for activating the better one.
# Globals:   COMPOSE_SERVICE, MODEL_1_NAME, MODEL_1_DESC, MODEL_1_PATH,
#            MODEL_2_NAME, MODEL_2_DESC, MODEL_2_PATH (all read)
# Arguments: none
# Outputs:   progress and follow-up instructions on stdout
# Returns:   0 on success, otherwise the status of the failed download
#######################################
download_models() {
  echo "Downloading Kraken HTR models into the ocr_models volume..."
  echo ""

  _download_model 1 "$MODEL_1_NAME" "$MODEL_1_DESC" "$MODEL_1_PATH" || return
  _download_model 2 "$MODEL_2_NAME" "$MODEL_2_DESC" "$MODEL_2_PATH" || return

  # The hints address the compose *service* (not a fixed container name) so
  # they stay valid whatever container name compose assigns.
  printf '%s\n' \
    "Both models downloaded. To test them against a sample document:" \
    "" \
    " # Copy a sample Kurrent scan into the container:" \
    " docker compose cp sample-kurrent.png ${COMPOSE_SERVICE}:/tmp/sample.png" \
    "" \
    " # Test model 1:" \
    " docker compose exec ${COMPOSE_SERVICE} kraken -i /tmp/sample.png /tmp/out1.txt segment -bl ocr -m ${MODEL_1_PATH}" \
    " docker compose exec ${COMPOSE_SERVICE} cat /tmp/out1.txt" \
    "" \
    " # Test model 2:" \
    " docker compose exec ${COMPOSE_SERVICE} kraken -i /tmp/sample.png /tmp/out2.txt segment -bl ocr -m ${MODEL_2_PATH}" \
    " docker compose exec ${COMPOSE_SERVICE} cat /tmp/out2.txt" \
    "" \
    "Then activate the better model:" \
    " ./scripts/download-kraken-models.sh --activate 1 # or 2"
}
|
||||
|
||||
#######################################
# Copies the chosen candidate model over the active model path, using a
# throw-away container of the OCR service.
# Globals:   COMPOSE_SERVICE, ACTIVE_MODEL, MODEL_1_NAME, MODEL_1_PATH,
#            MODEL_2_NAME, MODEL_2_PATH (all read)
# Arguments: $1 - candidate to activate: "1" or "2"
# Outputs:   progress on stdout, errors on stderr
# Exits:     1 if $1 is not 1 or 2; the copy's status if the copy fails
#######################################
activate_model() {
  # Default to empty so a missing argument reaches the error arm below
  # instead of tripping `set -u`.
  local choice="${1:-}"
  local name source_path

  case "$choice" in
    1)
      name="$MODEL_1_NAME"
      source_path="$MODEL_1_PATH"
      ;;
    2)
      name="$MODEL_2_NAME"
      source_path="$MODEL_2_PATH"
      ;;
    *)
      echo "Error: --activate expects 1 or 2" >&2
      exit 1
      ;;
  esac

  echo "Activating model $choice: $name"

  # Capture the status explicitly rather than relying on `set -e`, so the
  # user gets a hint about the most likely cause before the script stops.
  local status=0
  docker compose run --rm "$COMPOSE_SERVICE" \
    cp "$source_path" "$ACTIVE_MODEL" || status=$?
  if (( status != 0 )); then
    echo "Error: could not copy $source_path to $ACTIVE_MODEL (has the model been downloaded?)" >&2
    exit "$status"
  fi

  echo "Active model is now: $ACTIVE_MODEL"
  echo "Restart the OCR service to load the new model:"
  echo " docker compose restart $COMPOSE_SERVICE"
}
|
||||
|
||||
# ─── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Prints the command-line synopsis to stdout.
print_usage() {
  printf '%s\n' \
    "Usage:" \
    "  ./scripts/download-kraken-models.sh               # download both candidates" \
    "  ./scripts/download-kraken-models.sh --activate N  # activate model N (1 or 2)" \
    "  ./scripts/download-kraken-models.sh --help        # show this help"
}

#######################################
# Entry point: dispatches on the first command-line argument.
# Arguments: none              - download both candidates
#            --activate N      - activate candidate N
#            -h | --help       - print usage
# Exits:     2 on an unrecognised argument, so that a typo does not silently
#            start a full model download.
#######################################
main() {
  case "${1:-}" in
    "")
      download_models
      ;;
    --activate)
      activate_model "${2:-}"
      ;;
    -h | --help)
      print_usage
      ;;
    *)
      echo "Error: unknown argument: $1" >&2
      print_usage >&2
      exit 2
      ;;
  esac
}

main "$@"
|
||||
Reference in New Issue
Block a user