#!/bin/bash
#
# Provenance (recovered from the git patch that introduced this file):
#   Commit:  41f92622382fa019d89fb1cd0a0b720bb84d233d
#   Author:  Marcel
#   Date:    Sun, 12 Apr 2026 19:19:39 +0200
#   Subject: feat(ocr): add Kraken model download and evaluation script
#   Path:    scripts/download-kraken-models.sh (new file, mode 100755)
#
#   Runbook script to download both HTR-United Kurrent model candidates
#   (german_kurrent_manu_9, kurrent-de) into the ocr_models Docker volume,
#   test them against sample documents, and activate the winner.
#
#   Co-Authored-By: Claude Sonnet 4.6
#
# Downloads Kraken HTR models for German Kurrent/Suetterlin into the ocr_models volume.
# Run this once after first deployment, or whenever you want to switch models.
#
# Usage:
#   ./scripts/download-kraken-models.sh               # download both candidates
#   ./scripts/download-kraken-models.sh --activate 1  # activate model 1 (german_kurrent_manu_9)
#   ./scripts/download-kraken-models.sh --activate 2  # activate model 2 (kurrent-de)
#   ./scripts/download-kraken-models.sh --help        # print usage and exit
#
# Every docker command is issued through `docker compose`, so the script must
# be run from a directory in which Compose can resolve the project.

set -euo pipefail

readonly COMPOSE_SERVICE="ocr-service"
readonly MODEL_DIR="/app/models"
# Path the downloaded candidate is copied to on activation.
readonly ACTIVE_MODEL="$MODEL_DIR/german_kurrent.mlmodel"

readonly MODEL_1_NAME="german_kurrent_manu_9"
readonly MODEL_1_DESC="19th-century German administrative Kurrent (HTR-United)"
readonly MODEL_1_PATH="$MODEL_DIR/$MODEL_1_NAME.mlmodel"

readonly MODEL_2_NAME="kurrent-de"
readonly MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)"
readonly MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel"

# ─── Helpers ──────────────────────────────────────────────────────────────────

# Prints "Error: <message>" on stderr and terminates the script with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Prints the supported invocations on stdout.
usage() {
  cat <<EOF
Usage:
  ./scripts/download-kraken-models.sh               # download both candidates
  ./scripts/download-kraken-models.sh --activate 1  # activate model 1 ($MODEL_1_NAME)
  ./scripts/download-kraken-models.sh --activate 2  # activate model 2 ($MODEL_2_NAME)
EOF
}

# Aborts early with a clear message when the docker CLI is not on PATH.
require_docker() {
  command -v docker >/dev/null 2>&1 || die "docker not found in PATH"
}

# Runs the given command in a throw-away container of $COMPOSE_SERVICE.
# Arguments: the command and its arguments, passed through unchanged.
run_in_service() {
  docker compose run --rm "$COMPOSE_SERVICE" "$@"
}

# ─── Functions ────────────────────────────────────────────────────────────────

# Downloads a single model candidate.
# Arguments: $1 - ordinal shown to the user, $2 - model name,
#            $3 - human-readable description, $4 - destination path.
download_model() {
  local -r label="$1" name="$2" desc="$3" path="$4"

  echo "Model $label: $name"
  echo "  $desc"
  # NOTE(review): upstream Kraken documents `kraken get` as taking a model
  # repository identifier; confirm that the Kraken version in the image accepts
  # these model names and the `-o` option before relying on this step.
  run_in_service kraken get "$name" -o "$path"
  echo ""
}

# Downloads both model candidates and prints the manual evaluation steps.
download_models() {
  require_docker

  echo "Downloading Kraken HTR models into the ocr_models volume..."
  echo ""

  download_model 1 "$MODEL_1_NAME" "$MODEL_1_DESC" "$MODEL_1_PATH"
  download_model 2 "$MODEL_2_NAME" "$MODEL_2_DESC" "$MODEL_2_PATH"

  # The printed commands address the service via Compose (not a fixed container
  # name) so they match how this script itself reaches the service.
  cat <<EOF
Both models downloaded. To test them against a sample document:

  # Copy a sample Kurrent scan into the container:
  docker compose cp sample-kurrent.png $COMPOSE_SERVICE:/tmp/sample.png

  # Test model 1:
  docker compose exec $COMPOSE_SERVICE kraken -i /tmp/sample.png /tmp/out1.txt segment -bl ocr -m $MODEL_1_PATH
  docker compose exec $COMPOSE_SERVICE cat /tmp/out1.txt

  # Test model 2:
  docker compose exec $COMPOSE_SERVICE kraken -i /tmp/sample.png /tmp/out2.txt segment -bl ocr -m $MODEL_2_PATH
  docker compose exec $COMPOSE_SERVICE cat /tmp/out2.txt

Then activate the better model:
  ./scripts/download-kraken-models.sh --activate 1  # or 2
EOF
}

# Copies the chosen candidate over $ACTIVE_MODEL.
# Arguments: $1 - "1" or "2"; anything else is a usage error (exit status 1).
activate_model() {
  local -r choice="${1:-}"
  local name path

  case "$choice" in
    1)
      name="$MODEL_1_NAME"
      path="$MODEL_1_PATH"
      ;;
    2)
      name="$MODEL_2_NAME"
      path="$MODEL_2_PATH"
      ;;
    *)
      die "--activate expects 1 or 2"
      ;;
  esac

  # Validate the choice first so a bad argument is reported even without docker.
  require_docker

  echo "Activating model $choice: $name"
  run_in_service cp "$path" "$ACTIVE_MODEL"

  echo "Active model is now: $ACTIVE_MODEL"
  echo "Restart the OCR service to load the new model:"
  echo "  docker compose restart $COMPOSE_SERVICE"
}

# ─── Main ─────────────────────────────────────────────────────────────────────

# Dispatches on the first argument. Unknown or surplus arguments are rejected
# so that a mistyped flag cannot silently start a model download.
main() {
  case "${1:-}" in
    "")
      (( $# == 0 )) || die "unexpected empty argument"
      download_models
      ;;
    --activate)
      (( $# <= 2 )) || die "unexpected extra arguments after --activate ${2:-}"
      activate_model "${2:-}"
      ;;
    -h | --help)
      usage
      ;;
    *)
      usage >&2
      die "unknown argument: $1"
      ;;
  esac
}

main "$@"