fix(ocr): update model script for kraken 7 DOI-based downloads
Kraken 7 identifies models on Zenodo by DOI rather than by short name. The script now uses the actual DOIs: 10.5281/zenodo.7933463 (German handwriting HTR) and 10.5281/zenodo.13788177 (McCATMuS, a generic model for handwritten, printed and typed text). It also adds the -f pdf flag for PDF input, volume mounts for the import directory, and a post-download copy from the htrmopo cache to the models volume. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,20 +5,23 @@ set -euo pipefail
|
|||||||
#
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
# ./scripts/download-kraken-models.sh # download models + run evaluation
|
# ./scripts/download-kraken-models.sh # download models + run evaluation
|
||||||
# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9)
|
# ./scripts/download-kraken-models.sh --activate 1 # activate model 1
|
||||||
# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de)
|
# ./scripts/download-kraken-models.sh --activate 2 # activate model 2
|
||||||
# ./scripts/download-kraken-models.sh --eval-only # re-run evaluation (models already downloaded)
|
# ./scripts/download-kraken-models.sh --eval-only # re-run evaluation (models already downloaded)
|
||||||
|
|
||||||
COMPOSE_SERVICE="ocr-service"
|
COMPOSE_SERVICE="ocr-service"
|
||||||
MODEL_DIR="/app/models"
|
MODEL_DIR="/app/models"
|
||||||
ACTIVE_MODEL="$MODEL_DIR/german_kurrent.mlmodel"
|
ACTIVE_MODEL="$MODEL_DIR/german_kurrent.mlmodel"
|
||||||
|
|
||||||
MODEL_1_NAME="german_kurrent_manu_9"
|
# Kraken 7 uses DOIs to identify models from HTR-United / Zenodo
|
||||||
MODEL_1_DESC="19th-century German administrative Kurrent (HTR-United)"
|
MODEL_1_DOI="10.5281/zenodo.7933463"
|
||||||
|
MODEL_1_NAME="german_handwriting"
|
||||||
|
MODEL_1_DESC="HTR model for German manuscripts (handwritten text recognition)"
|
||||||
MODEL_1_PATH="$MODEL_DIR/$MODEL_1_NAME.mlmodel"
|
MODEL_1_PATH="$MODEL_DIR/$MODEL_1_NAME.mlmodel"
|
||||||
|
|
||||||
MODEL_2_NAME="kurrent-de"
|
MODEL_2_DOI="10.5281/zenodo.13788177"
|
||||||
MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)"
|
MODEL_2_NAME="mccatmus"
|
||||||
|
MODEL_2_DESC="McCATMuS — generic model for handwritten, printed & typewritten (16th c. onward)"
|
||||||
MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel"
|
MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel"
|
||||||
|
|
||||||
EVAL_PDFS=("Eu-0693.pdf" "Eu-0692.pdf" "W-0150.pdf" "W-0575.pdf")
|
EVAL_PDFS=("Eu-0693.pdf" "Eu-0692.pdf" "W-0150.pdf" "W-0575.pdf")
|
||||||
@@ -27,18 +30,44 @@ OUTPUT_BASE="./ocr-model-evaluation"
|
|||||||
|
|
||||||
# ─── Functions ────────────────────────────────────────────────────────────────
|
# ─── Functions ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#######################################
# Download one Kraken model by DOI and copy it to a fixed path.
#
# Runs a single throwaway container of $COMPOSE_SERVICE that:
#   1. creates a timestamp marker,
#   2. runs `kraken get <doi>`,
#   3. copies a .mlmodel file from the htrmopo cache that is newer than
#      the marker to <dest>.
#
# Globals:   COMPOSE_SERVICE (read)
# Arguments: $1 - model DOI (e.g. 10.5281/zenodo.7933463)
#            $2 - destination path of the .mlmodel inside the container
#            $3 - human-readable model name (used for the progress line only)
# Outputs:   progress and kraken output on stdout, errors on stderr
# Returns:   non-zero if the download fails or no new .mlmodel is found
#######################################
download_model() {
  local doi="$1"
  local dest="$2"
  local name="$3"

  echo " Downloading $name ($doi)..."

  # The inner script is single-quoted and receives the DOI and destination as
  # positional parameters, so neither value is re-parsed by the container shell
  # (no word splitting, no command injection through odd characters).
  docker compose run --rm "$COMPOSE_SERVICE" sh -c '
    set -eu
    doi=$1
    dest=$2

    # kraken get downloads to /root/.local/share/htrmopo/<uuid>/
    cache=/root/.local/share/htrmopo

    # The marker must be created in this container, right before the
    # download, so that -newer only matches files written by this download.
    marker=$(mktemp)

    # With set -e a failed download aborts here instead of copying a stale file.
    kraken get "$doi" 2>&1

    found=$(find "$cache" -type f -name "*.mlmodel" -newer "$marker" 2>/dev/null | head -n 1)
    rm -f -- "$marker"

    if [ -n "$found" ]; then
      cp -- "$found" "$dest"
      echo "Saved to $dest"
    else
      # NOTE(review): if kraken reuses an already cached model without
      # rewriting it, nothing is newer than the marker and we end up here.
      echo "ERROR: No .mlmodel file found after download" >&2
      exit 1
    fi
  ' sh "$doi" "$dest"
}
|
||||||
|
|
||||||
download_models() {
|
download_models() {
|
||||||
echo "Downloading Kraken HTR models into the ocr_models volume..."
|
echo "Downloading Kraken HTR models into the ocr_models volume..."
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
|
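# Create a timestamp marker intended to help find newly downloaded files.
# NOTE(review): this runs in its own `--rm` container, so the marker is gone
# before the download containers start unless /tmp is a mounted volume, and
# download_model does not reference /tmp/.download-marker — confirm whether
# this step is still needed.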
|
||||||
|
docker compose run --rm "$COMPOSE_SERVICE" touch /tmp/.download-marker
|
||||||
|
|
||||||
echo "Model 1: $MODEL_1_NAME"
|
echo "Model 1: $MODEL_1_NAME"
|
||||||
echo " $MODEL_1_DESC"
|
echo " $MODEL_1_DESC"
|
||||||
docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH"
|
download_model "$MODEL_1_DOI" "$MODEL_1_PATH" "$MODEL_1_NAME"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
echo "Model 2: $MODEL_2_NAME"
|
echo "Model 2: $MODEL_2_NAME"
|
||||||
echo " $MODEL_2_DESC"
|
echo " $MODEL_2_DESC"
|
||||||
docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH"
|
download_model "$MODEL_2_DOI" "$MODEL_2_PATH" "$MODEL_2_NAME"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
echo "Both models downloaded."
|
echo "Both models downloaded."
|
||||||
@@ -51,7 +80,6 @@ run_evaluation() {
|
|||||||
echo "═══════════════════════════════════════════════════════"
|
echo "═══════════════════════════════════════════════════════"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
# Create output directories on the host
|
|
||||||
local dir_1="$OUTPUT_BASE/$MODEL_1_NAME"
|
local dir_1="$OUTPUT_BASE/$MODEL_1_NAME"
|
||||||
local dir_2="$OUTPUT_BASE/$MODEL_2_NAME"
|
local dir_2="$OUTPUT_BASE/$MODEL_2_NAME"
|
||||||
mkdir -p "$dir_1" "$dir_2"
|
mkdir -p "$dir_1" "$dir_2"
|
||||||
@@ -67,22 +95,20 @@ run_evaluation() {
|
|||||||
|
|
||||||
echo "──── $pdf ────"
|
echo "──── $pdf ────"
|
||||||
|
|
||||||
# Model 1
|
|
||||||
echo " Model 1: $MODEL_1_NAME ..."
|
echo " Model 1: $MODEL_1_NAME ..."
|
||||||
docker compose run --rm \
|
docker compose run --rm \
|
||||||
-v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
|
-v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
|
||||||
-v "$(cd "$dir_1" && pwd):/eval-output" \
|
-v "$(cd "$dir_1" && pwd):/eval-output" \
|
||||||
"$COMPOSE_SERVICE" \
|
"$COMPOSE_SERVICE" \
|
||||||
kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \
|
kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \
|
||||||
2>/dev/null || echo " ⚠ Model 1 failed on $pdf"
|
2>/dev/null || echo " ⚠ Model 1 failed on $pdf"
|
||||||
|
|
||||||
# Model 2
|
|
||||||
echo " Model 2: $MODEL_2_NAME ..."
|
echo " Model 2: $MODEL_2_NAME ..."
|
||||||
docker compose run --rm \
|
docker compose run --rm \
|
||||||
-v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
|
-v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
|
||||||
-v "$(cd "$dir_2" && pwd):/eval-output" \
|
-v "$(cd "$dir_2" && pwd):/eval-output" \
|
||||||
"$COMPOSE_SERVICE" \
|
"$COMPOSE_SERVICE" \
|
||||||
kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \
|
kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \
|
||||||
2>/dev/null || echo " ⚠ Model 2 failed on $pdf"
|
2>/dev/null || echo " ⚠ Model 2 failed on $pdf"
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
@@ -91,20 +117,22 @@ run_evaluation() {
|
|||||||
echo "═══════════════════════════════════════════════════════"
|
echo "═══════════════════════════════════════════════════════"
|
||||||
echo " Evaluation complete. Results:"
|
echo " Evaluation complete. Results:"
|
||||||
echo ""
|
echo ""
|
||||||
echo " Model 1 ($MODEL_1_NAME):"
|
echo " Model 1 — $MODEL_1_NAME ($MODEL_1_DOI):"
|
||||||
for f in "$dir_1"/*.txt; do
|
for f in "$dir_1"/*.txt; do
|
||||||
[[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes"
|
[[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes"
|
||||||
done
|
done
|
||||||
echo ""
|
echo ""
|
||||||
echo " Model 2 ($MODEL_2_NAME):"
|
echo " Model 2 — $MODEL_2_NAME ($MODEL_2_DOI):"
|
||||||
for f in "$dir_2"/*.txt; do
|
for f in "$dir_2"/*.txt; do
|
||||||
[[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes"
|
[[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes"
|
||||||
done
|
done
|
||||||
echo ""
|
echo ""
|
||||||
echo " Compare the outputs side by side:"
|
echo " Compare outputs:"
|
||||||
echo " diff $dir_1/Eu-0693.txt $dir_2/Eu-0693.txt"
|
echo " diff $dir_1/Eu-0693.txt $dir_2/Eu-0693.txt"
|
||||||
|
echo " # or view individually:"
|
||||||
|
echo " cat $dir_1/Eu-0693.txt"
|
||||||
echo ""
|
echo ""
|
||||||
echo " Then activate the better model:"
|
echo " Activate the better model:"
|
||||||
echo " ./scripts/download-kraken-models.sh --activate 1 # $MODEL_1_NAME"
|
echo " ./scripts/download-kraken-models.sh --activate 1 # $MODEL_1_NAME"
|
||||||
echo " ./scripts/download-kraken-models.sh --activate 2 # $MODEL_2_NAME"
|
echo " ./scripts/download-kraken-models.sh --activate 2 # $MODEL_2_NAME"
|
||||||
echo "═══════════════════════════════════════════════════════"
|
echo "═══════════════════════════════════════════════════════"
|
||||||
|
|||||||
Reference in New Issue
Block a user