feat(ocr): extend model script with automatic OCR evaluation
Downloads both Kraken models, then runs each against four sample PDFs from the import folder (Eu-0693, Eu-0692, W-0150, W-0575). Output goes to ocr-model-evaluation/<model-name>/<doc>.txt for side-by-side comparison.

Usage:
  ./scripts/download-kraken-models.sh               # download + evaluate
  ./scripts/download-kraken-models.sh --eval-only   # re-run evaluation
  ./scripts/download-kraken-models.sh --activate 1  # pick winner

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,13 +1,13 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# Downloads Kraken HTR models for German Kurrent/Suetterlin into the ocr_models volume.
|
||||
# Run this once after first deployment, or whenever you want to switch models.
|
||||
# Downloads Kraken HTR models, runs OCR evaluation on sample PDFs, and activates the winner.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/download-kraken-models.sh # download both candidates
|
||||
# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9)
|
||||
# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de)
|
||||
# ./scripts/download-kraken-models.sh # download models + run evaluation
|
||||
# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9)
|
||||
# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de)
|
||||
# ./scripts/download-kraken-models.sh --eval-only # re-run evaluation (models already downloaded)
|
||||
|
||||
COMPOSE_SERVICE="ocr-service"
|
||||
MODEL_DIR="/app/models"
|
||||
@@ -21,6 +21,10 @@ MODEL_2_NAME="kurrent-de"
|
||||
MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)"
|
||||
MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel"
|
||||
|
||||
EVAL_PDFS=("Eu-0693.pdf" "Eu-0692.pdf" "W-0150.pdf" "W-0575.pdf")
|
||||
IMPORT_DIR="./import"
|
||||
OUTPUT_BASE="./ocr-model-evaluation"
|
||||
|
||||
# ─── Functions ────────────────────────────────────────────────────────────────
|
||||
|
||||
download_models() {
|
||||
@@ -29,31 +33,81 @@ download_models() {
|
||||
|
||||
echo "Model 1: $MODEL_1_NAME"
|
||||
echo " $MODEL_1_DESC"
|
||||
docker compose run --rm "$COMPOSE_SERVICE" \
|
||||
kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH"
|
||||
docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH"
|
||||
echo ""
|
||||
|
||||
echo "Model 2: $MODEL_2_NAME"
|
||||
echo " $MODEL_2_DESC"
|
||||
docker compose run --rm "$COMPOSE_SERVICE" \
|
||||
kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH"
|
||||
docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH"
|
||||
echo ""
|
||||
|
||||
echo "Both models downloaded. To test them against a sample document:"
|
||||
echo "Both models downloaded."
|
||||
}
|
||||
|
||||
#######################################
# Runs both candidate Kraken models over every sample PDF and prints a
# summary of the transcripts that were produced, so they can be compared
# by hand before one model is activated.
# Globals:
#   EVAL_PDFS, IMPORT_DIR, OUTPUT_BASE, COMPOSE_SERVICE (read)
#   MODEL_1_NAME, MODEL_1_PATH, MODEL_2_NAME, MODEL_2_PATH (read)
# Arguments:
#   None
# Outputs:
#   Progress, warnings and the summary on stdout. Transcripts are written to
#   $OUTPUT_BASE/<model-name>/<doc>.txt; when a run fails, its stderr is kept
#   in <doc>.log next to where the transcript would have been.
# Returns:
#   0, even when individual documents are skipped or a model fails on them.
#######################################
run_evaluation() {
  local -a model_names=("$MODEL_1_NAME" "$MODEL_2_NAME")
  local -a model_paths=("$MODEL_1_PATH" "$MODEL_2_PATH")
  local -a out_dirs=() out_dirs_abs=()
  local rule="═══════════════════════════════════════════════════════"
  local input_abs="" abs_path pdf src stem log_file result i

  echo ""
  echo "$rule"
  echo " Running OCR evaluation on ${#EVAL_PDFS[@]} documents"
  echo "$rule"
  echo ""

  # Create the per-model output directories on the host and resolve their
  # absolute paths once, instead of once per document.
  for i in 0 1; do
    out_dirs[i]="$OUTPUT_BASE/${model_names[i]}"
    mkdir -p "${out_dirs[i]}"
    abs_path=$(cd "${out_dirs[i]}" && pwd)
    out_dirs_abs[i]=$abs_path
  done

  # A missing import directory is not fatal: every document is then reported
  # as skipped below, so the path is only resolved when the directory exists.
  if [[ -d "$IMPORT_DIR" ]]; then
    input_abs=$(cd "$IMPORT_DIR" && pwd)
  fi

  for pdf in "${EVAL_PDFS[@]}"; do
    src="$IMPORT_DIR/$pdf"
    stem="${pdf%.pdf}"

    if [[ ! -f "$src" ]]; then
      echo "SKIP: $src not found"
      continue
    fi

    echo "──── $pdf ────"

    for i in 0 1; do
      echo " Model $((i + 1)): ${model_names[i]} ..."
      log_file="${out_dirs[i]}/${stem}.log"

      # Remove the transcript of an earlier run first; otherwise a failure now
      # would leave the old file behind and the summary would present it as a
      # result of this run.
      rm -f -- "${out_dirs[i]}/${stem}.txt"

      # NOTE(review): kraken is handed a PDF here without `-f pdf` — confirm
      # against the kraken CLI docs that PDF input works this way.
      if docker compose run --rm \
        -v "$input_abs:/eval-input:ro" \
        -v "${out_dirs_abs[i]}:/eval-output" \
        "$COMPOSE_SERVICE" \
        kraken -i "/eval-input/$pdf" "/eval-output/${stem}.txt" \
        segment -bl ocr -m "${model_paths[i]}" \
        2>"$log_file"; then
        # stderr of a successful run is only noise; keep the directory clean.
        rm -f -- "$log_file"
      else
        echo " ⚠ Model $((i + 1)) failed on $pdf (see $log_file)"
      fi
    done

    echo ""
  done

  echo "$rule"
  echo " Evaluation complete. Results:"
  echo ""
  for i in 0 1; do
    echo " Model $((i + 1)) (${model_names[i]}):"
    for result in "${out_dirs[i]}"/*.txt; do
      # An unmatched glob stays literal, so skip anything that is not a file.
      [[ -f "$result" ]] || continue
      # $(( ... )) strips the padding some wc implementations emit.
      echo " ${result##*/}: $(($(wc -l <"$result"))) lines, $(($(wc -c <"$result"))) bytes"
    done
    echo ""
  done

  # Point the diff hint at the first configured document rather than at a
  # hard-coded file name.
  if ((${#EVAL_PDFS[@]} > 0)); then
    stem="${EVAL_PDFS[0]%.pdf}"
    echo " Compare the outputs side by side:"
    echo " diff ${out_dirs[0]}/${stem}.txt ${out_dirs[1]}/${stem}.txt"
    echo ""
  fi
  echo " Then activate the better model:"
  echo " ./scripts/download-kraken-models.sh --activate 1 # $MODEL_1_NAME"
  echo " ./scripts/download-kraken-models.sh --activate 2 # $MODEL_2_NAME"
  echo "$rule"
}
|
||||
|
||||
activate_model() {
|
||||
@@ -61,13 +115,11 @@ activate_model() {
|
||||
case "$choice" in
|
||||
1)
|
||||
echo "Activating model 1: $MODEL_1_NAME"
|
||||
docker compose run --rm "$COMPOSE_SERVICE" \
|
||||
cp "$MODEL_1_PATH" "$ACTIVE_MODEL"
|
||||
docker compose run --rm "$COMPOSE_SERVICE" cp "$MODEL_1_PATH" "$ACTIVE_MODEL"
|
||||
;;
|
||||
2)
|
||||
echo "Activating model 2: $MODEL_2_NAME"
|
||||
docker compose run --rm "$COMPOSE_SERVICE" \
|
||||
cp "$MODEL_2_PATH" "$ACTIVE_MODEL"
|
||||
docker compose run --rm "$COMPOSE_SERVICE" cp "$MODEL_2_PATH" "$ACTIVE_MODEL"
|
||||
;;
|
||||
*)
|
||||
echo "Error: --activate expects 1 or 2"
|
||||
@@ -82,8 +134,15 @@ activate_model() {
|
||||
|
||||
# ─── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Entry point: dispatch on the first command-line argument.
#   (none)        download both models, then evaluate them
#   --activate N  activate model N (validated by activate_model)
#   --eval-only   evaluate models that were downloaded earlier
# Anything else is rejected, so a mistyped flag cannot silently trigger a
# full download and evaluation run.
case "${1:-}" in
  --activate)
    activate_model "${2:-}"
    ;;
  --eval-only)
    run_evaluation
    ;;
  "")
    download_models
    run_evaluation
    ;;
  *)
    echo "Error: unknown option '$1'" >&2
    echo "Usage: $0 [--activate 1|2 | --eval-only]" >&2
    exit 2
    ;;
esac
|
||||
|
||||
Reference in New Issue
Block a user