feat(ocr): add Kraken model download and evaluation script
Some checks failed
CI / Unit & Component Tests (push) Failing after 2s
CI / Backend Unit Tests (push) Failing after 2s
CI / Unit & Component Tests (pull_request) Failing after 2s
CI / Backend Unit Tests (pull_request) Failing after 2s

Runbook script to download both HTR-United Kurrent model candidates
(german_kurrent_manu_9, kurrent-de) into the ocr_models Docker volume,
test them against sample documents, and activate the winner.

Usage:
  ./scripts/download-kraken-models.sh              # download both
  ./scripts/download-kraken-models.sh --activate 1  # pick model 1

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-12 19:19:39 +02:00
parent c74539b04b
commit 41f9262238

View File

@@ -0,0 +1,89 @@
#!/bin/bash
set -euo pipefail

# Downloads Kraken HTR models for German Kurrent/Suetterlin into the
# ocr_models volume. Run this once after first deployment, or whenever you
# want to switch models.
#
# Usage:
#   ./scripts/download-kraken-models.sh               # download both candidates
#   ./scripts/download-kraken-models.sh --activate 1  # activate model 1 (german_kurrent_manu_9)
#   ./scripts/download-kraken-models.sh --activate 2  # activate model 2 (kurrent-de)
#
# All values below are fixed configuration, so they are marked readonly to
# guard against accidental reassignment later in the script.

# Compose service whose one-off containers run the kraken and cp commands.
readonly COMPOSE_SERVICE="ocr-service"

# Directory, as seen from inside the service container, holding model files.
# Presumably the mount point of the ocr_models volume; verify against the
# compose file.
readonly MODEL_DIR="/app/models"

# Destination the activation step copies the chosen candidate to.
readonly ACTIVE_MODEL="${MODEL_DIR}/german_kurrent.mlmodel"

# Candidate 1.
readonly MODEL_1_NAME="german_kurrent_manu_9"
readonly MODEL_1_DESC="19th-century German administrative Kurrent (HTR-United)"
readonly MODEL_1_PATH="${MODEL_DIR}/${MODEL_1_NAME}.mlmodel"

# Candidate 2.
readonly MODEL_2_NAME="kurrent-de"
readonly MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)"
readonly MODEL_2_PATH="${MODEL_DIR}/${MODEL_2_NAME}.mlmodel"
# ─── Functions ────────────────────────────────────────────────────────────────
#######################################
# Fetch both candidate Kraken HTR models and print evaluation instructions.
# Each candidate is requested with `kraken get` inside a one-off container
# of the OCR compose service.
# Globals:   COMPOSE_SERVICE, MODEL_1_NAME, MODEL_1_DESC, MODEL_1_PATH,
#            MODEL_2_NAME, MODEL_2_DESC, MODEL_2_PATH (all read)
# Arguments: none
# Outputs:   progress and follow-up instructions on stdout
# Returns:   the failing command's status as soon as a download fails,
#            0 otherwise
#######################################
download_models() {
  local -a names=("$MODEL_1_NAME" "$MODEL_2_NAME")
  local -a descs=("$MODEL_1_DESC" "$MODEL_2_DESC")
  local -a paths=("$MODEL_1_PATH" "$MODEL_2_PATH")
  local i n

  echo "Downloading Kraken HTR models into the ocr_models volume..."

  for i in 0 1; do
    n=$((i + 1))
    echo ""
    printf '%s\n' "Model $n: ${names[i]}"
    printf '%s\n' " ${descs[i]}"
    # NOTE(review): confirm that the kraken version in the image accepts
    # `-o` on `get` and resolves these names as repository model IDs.
    # Stop explicitly on failure so the success message below can never be
    # printed after a failed download, even where `set -e` is not in effect.
    docker compose run --rm "$COMPOSE_SERVICE" \
      kraken get "${names[i]}" -o "${paths[i]}" || return
  done

  echo ""
  echo "Both models downloaded. To test them against a sample document:"
  echo ""
  echo " # Copy a sample Kurrent scan into the container:"
  # Address the container through the compose service, like every other
  # command here, instead of through a hard-coded container name.
  printf '%s\n' " docker compose cp sample-kurrent.png $COMPOSE_SERVICE:/tmp/sample.png"

  for i in 0 1; do
    n=$((i + 1))
    echo ""
    printf '%s\n' " # Test model $n:"
    printf '%s\n' " docker compose exec $COMPOSE_SERVICE kraken -i /tmp/sample.png /tmp/out$n.txt segment -bl ocr -m ${paths[i]}"
    printf '%s\n' " docker compose exec $COMPOSE_SERVICE cat /tmp/out$n.txt"
  done

  echo ""
  echo "Then activate the better model:"
  echo " ./scripts/download-kraken-models.sh --activate 1 # or 2"
}
#######################################
# Make one of the candidate models the active model.
# Copies the chosen candidate file over ACTIVE_MODEL inside a one-off
# container of the OCR compose service.
# Globals:   COMPOSE_SERVICE, ACTIVE_MODEL, MODEL_1_NAME, MODEL_1_PATH,
#            MODEL_2_NAME, MODEL_2_PATH (all read)
# Arguments: $1 - candidate number, "1" or "2"
# Outputs:   progress and the restart hint on stdout; the error for an
#            invalid choice on stderr
# Returns:   exits the shell with status 1 for any other (or missing)
#            argument; returns the copy command's status if it fails
#######################################
activate_model() {
  # Default to empty so a missing argument reaches the error branch below
  # instead of aborting with an "unbound variable" message under `set -u`.
  local choice="${1:-}"
  local name path

  case "$choice" in
    1)
      name="$MODEL_1_NAME"
      path="$MODEL_1_PATH"
      ;;
    2)
      name="$MODEL_2_NAME"
      path="$MODEL_2_PATH"
      ;;
    *)
      echo "Error: --activate expects 1 or 2" >&2
      exit 1
      ;;
  esac

  printf '%s\n' "Activating model $choice: $name"
  # Bail out before the success message if the copy fails (for example when
  # the candidate has not been downloaded yet).
  docker compose run --rm "$COMPOSE_SERVICE" \
    cp "$path" "$ACTIVE_MODEL" || return

  printf '%s\n' "Active model is now: $ACTIVE_MODEL"
  echo "Restart the OCR service to load the new model:"
  echo " docker compose restart ocr-service"
}
# ─── Main ─────────────────────────────────────────────────────────────────────
#######################################
# Print the command-line synopsis on stdout.
#######################################
usage() {
  printf 'Usage: %s [--activate 1|2]\n' "${0##*/}"
}

#######################################
# Dispatch on the first command-line argument.
# Arguments: none            - download both candidate models
#            --activate N    - activate candidate N (validated by
#                              activate_model)
#            -h | --help     - print the synopsis
# Outputs:   an error plus the synopsis on stderr for anything else
# Returns:   2 for an unrecognised argument, otherwise the status of the
#            selected action
#######################################
main() {
  case "${1:-}" in
    "")
      download_models
      ;;
    --activate)
      activate_model "${2:-}"
      ;;
    -h | --help)
      usage
      ;;
    *)
      # Refuse unknown arguments: a typo such as `--activat 1` must not
      # silently fall through to the download path.
      printf 'Error: unknown argument: %s\n' "$1" >&2
      usage >&2
      return 2
      ;;
  esac
}

main "$@"