Kraken's -f pdf mode tries to write output next to the input file, which fails on read-only mounts. Instead, extract pages as PNGs via pypdfium2 (already installed), then run kraken on each image. Both models run in a single container per PDF to avoid overhead. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
204 lines
7.3 KiB
Bash
Executable File
204 lines
7.3 KiB
Bash
Executable File
#!/bin/bash
set -euo pipefail

# Downloads Kraken HTR models, runs OCR evaluation on sample PDFs, and activates the winner.
#
# Usage:
#   ./scripts/download-kraken-models.sh               # download models + run evaluation
#   ./scripts/download-kraken-models.sh --activate 1  # activate model 1
#   ./scripts/download-kraken-models.sh --activate 2  # activate model 2
#   ./scripts/download-kraken-models.sh --eval-only   # re-run evaluation (models already downloaded)
#
# All settings below are constants; they are marked readonly so that a stray
# assignment later in the script fails loudly instead of silently redirecting
# a download or a copy.

# Compose service in which every kraken / python command is executed.
readonly COMPOSE_SERVICE="ocr-service"

# Directory (inside the container) where model files are stored.
readonly MODEL_DIR="/app/models"

# Path (inside the container) that --activate copies the chosen model to.
readonly ACTIVE_MODEL="$MODEL_DIR/german_kurrent.mlmodel"

# Kraken 7 uses DOIs to identify models from HTR-United / Zenodo
readonly MODEL_1_DOI="10.5281/zenodo.7933463"
readonly MODEL_1_NAME="german_handwriting"
readonly MODEL_1_DESC="HTR model for German manuscripts (handwritten text recognition)"
readonly MODEL_1_PATH="$MODEL_DIR/$MODEL_1_NAME.mlmodel"

readonly MODEL_2_DOI="10.5281/zenodo.13788177"
readonly MODEL_2_NAME="mccatmus"
readonly MODEL_2_DESC="McCATMuS — generic model for handwritten, printed & typewritten (16th c. onward)"
readonly MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel"

# Sample documents (file names relative to IMPORT_DIR) used for the evaluation.
readonly EVAL_PDFS=("Eu-0693.pdf" "Eu-0692.pdf" "W-0150.pdf" "W-0575.pdf")

# Host directory holding the sample PDFs; relative to the current working directory.
readonly IMPORT_DIR="./import"

# Host directory receiving one sub-directory of OCR text per model.
readonly OUTPUT_BASE="./ocr-model-evaluation"
|
|
|
|
# ─── Functions ────────────────────────────────────────────────────────────────
|
|
|
|
#######################################
# Downloads one Kraken model by DOI and copies it to a fixed destination path
# inside the container.
# Globals:
#   COMPOSE_SERVICE (read) - compose service the download runs in
# Arguments:
#   $1 - DOI handed to `kraken get`
#   $2 - destination path of the .mlmodel file (inside the container)
#   $3 - human-readable model name, used only in the progress message
# Outputs:
#   Progress, kraken's own output and "Saved to ..." on stdout; errors on stderr
# Returns:
#   Exit status of the container command; the inner script exits 1 when the
#   model directory or model file cannot be found, or when the copy fails.
#######################################
download_model() {
  local doi="$1"
  local dest="$2"
  local name="$3"

  echo " Downloading $name ($doi)..."

  # kraken get downloads to /root/.local/share/htrmopo/<uuid>/<name>.mlmodel
  # Parse the "Model dir: <path>" line from kraken output to locate the file.
  #
  # The DOI and destination are handed to the inner shell as positional
  # parameters ($1, $2) rather than spliced into the program text, so spaces or
  # shell metacharacters in either value cannot change what the script executes.
  docker compose run --rm "$COMPOSE_SERVICE" sh -c '
    doi=$1
    dest=$2

    # stderr is merged in because the "Model dir:" line is parsed from this text.
    output=$(kraken get "$doi" 2>&1)
    printf "%s\n" "$output"

    model_dir=$(printf "%s\n" "$output" | grep -oP "Model dir: \K[^ ]+")
    if [ -z "$model_dir" ] || [ ! -d "$model_dir" ]; then
      echo "ERROR: Could not parse model directory from kraken output" >&2
      exit 1
    fi

    found=$(find "$model_dir" -name "*.mlmodel" | head -n 1)
    if [ -z "$found" ]; then
      echo "ERROR: No .mlmodel file in $model_dir" >&2
      ls -la "$model_dir" >&2
      exit 1
    fi

    # Only report success when the copy really happened.
    cp -- "$found" "$dest" || exit 1
    echo "Saved to $dest"
  ' sh "$doi" "$dest"
}
|
|
|
|
#######################################
# Downloads both candidate models, announcing each one (name, description)
# before handing it to download_model.
# Globals:
#   MODEL_1_NAME, MODEL_1_DESC, MODEL_1_DOI, MODEL_1_PATH (read)
#   MODEL_2_NAME, MODEL_2_DESC, MODEL_2_DOI, MODEL_2_PATH (read)
# Outputs:
#   Progress messages on stdout
#######################################
download_models() {
  local idx name_ref desc_ref doi_ref path_ref

  printf '%s\n\n' "Downloading Kraken HTR models into the ocr_models volume..."

  for idx in 1 2; do
    # Resolve MODEL_<idx>_* through indirect expansion.
    name_ref="MODEL_${idx}_NAME"
    desc_ref="MODEL_${idx}_DESC"
    doi_ref="MODEL_${idx}_DOI"
    path_ref="MODEL_${idx}_PATH"

    printf '%s\n' "Model ${idx}: ${!name_ref}" " ${!desc_ref}"
    download_model "${!doi_ref}" "${!path_ref}" "${!name_ref}"
    printf '\n'
  done

  printf '%s\n' "Both models downloaded."
}
|
|
|
|
#######################################
# Prints one summary line (line and byte count) per .txt file in a directory.
# Arguments:
#   $1 - directory to scan
#######################################
_print_eval_summary() {
  local dir="$1"
  local f

  for f in "$dir"/*.txt; do
    # An unmatched glob stays literal; skip it.
    [[ -f "$f" ]] || continue
    echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes"
  done
}

#######################################
# Runs both models over every sample PDF and prints a summary of the results.
# Globals:
#   EVAL_PDFS, IMPORT_DIR, OUTPUT_BASE, COMPOSE_SERVICE (read)
#   MODEL_1_NAME, MODEL_1_DOI, MODEL_1_PATH (read)
#   MODEL_2_NAME, MODEL_2_DOI, MODEL_2_PATH (read)
# Outputs:
#   Progress and summary on stdout; writes
#   $OUTPUT_BASE/<model name>/<pdf name without .pdf>.txt for each model.
# Notes:
#   A missing sample PDF is skipped; a failing container run is reported and
#   the loop continues with the next PDF.
#######################################
run_evaluation() {
  local dir_1="$OUTPUT_BASE/$MODEL_1_NAME"
  local dir_2="$OUTPUT_BASE/$MODEL_2_NAME"
  local pdf src stem
  local out_abs_1 out_abs_2

  # Document used in the "Compare outputs" hint: the first configured sample.
  local sample="${EVAL_PDFS[0]:-Eu-0693.pdf}"
  sample="${sample%.pdf}"

  # The Python program is a fixed, single-quoted text. Everything that varies
  # (PDF name, output stem, model paths) arrives through sys.argv, so quotes or
  # other special characters in those values cannot alter the program.
  local py_script
  py_script='
import os
import subprocess
import sys

import pypdfium2 as pdfium

pdf_name, stem, model_1, model_2 = sys.argv[1:5]

# 1. Extract PDF pages as PNGs (scale 300/72: 300 px per inch at 72 pt/in).
pdf = pdfium.PdfDocument(f"/eval-input/{pdf_name}")
pages = []
for i in range(len(pdf)):
    bmp = pdf[i].render(scale=300/72)
    path = f"/tmp/page_{i:04d}.png"
    bmp.to_pil().save(path)
    pages.append(path)
print(f"Extracted {len(pages)} pages")

# 2. Run kraken on each page image for both models.
# 3. Concatenate per-page output into one file per model.
for label, model, outdir in [
    ("Model 1", model_1, "/eval-out-1"),
    ("Model 2", model_2, "/eval-out-2"),
]:
    print(f" {label}...")
    combined = ""
    for p in pages:
        args = ["kraken", "-i", p, "/dev/stdout", "segment", "-bl", "ocr", "-m", model]
        r = subprocess.run(args, capture_output=True, text=True)
        combined += r.stdout
        if r.returncode != 0:
            print(f" ⚠ failed on {os.path.basename(p)}: {r.stderr[:200]}", file=sys.stderr)
    with open(f"{outdir}/{stem}.txt", "w") as f:
        f.write(combined)
    lines = combined.count(chr(10))
    print(f" → {lines} lines")
'

  echo ""
  echo "═══════════════════════════════════════════════════════"
  echo " Running OCR evaluation on ${#EVAL_PDFS[@]} documents"
  echo "═══════════════════════════════════════════════════════"
  echo ""

  mkdir -p "$dir_1" "$dir_2"
  # Bind mounts need absolute host paths; both directories exist at this point.
  out_abs_1="$(cd "$dir_1" && pwd)"
  out_abs_2="$(cd "$dir_2" && pwd)"

  for pdf in "${EVAL_PDFS[@]}"; do
    src="$IMPORT_DIR/$pdf"
    stem="${pdf%.pdf}"

    if [[ ! -f "$src" ]]; then
      echo "SKIP: $src not found"
      continue
    fi

    echo "──── $pdf ────"

    # Both models run inside a single container run per PDF. The input mount
    # is read-only, which is why pages are rendered to /tmp in the container.
    docker compose run --rm \
      -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \
      -v "$out_abs_1:/eval-out-1" \
      -v "$out_abs_2:/eval-out-2" \
      "$COMPOSE_SERVICE" \
      python3 -c "$py_script" "$pdf" "$stem" "$MODEL_1_PATH" "$MODEL_2_PATH" \
      || echo " ⚠ Failed on $pdf"

    echo ""
  done

  echo "═══════════════════════════════════════════════════════"
  echo " Evaluation complete. Results:"
  echo ""
  echo " Model 1 — $MODEL_1_NAME ($MODEL_1_DOI):"
  _print_eval_summary "$dir_1"
  echo ""
  echo " Model 2 — $MODEL_2_NAME ($MODEL_2_DOI):"
  _print_eval_summary "$dir_2"
  echo ""
  echo " Compare outputs:"
  echo " diff $dir_1/$sample.txt $dir_2/$sample.txt"
  echo " # or view individually:"
  echo " cat $dir_1/$sample.txt"
  echo ""
  echo " Activate the better model:"
  echo " ./scripts/download-kraken-models.sh --activate 1 # $MODEL_1_NAME"
  echo " ./scripts/download-kraken-models.sh --activate 2 # $MODEL_2_NAME"
  echo "═══════════════════════════════════════════════════════"
}
|
|
|
|
#######################################
# Makes one of the two downloaded models the active one by copying it over
# the active-model path inside the container.
# Globals:
#   COMPOSE_SERVICE, ACTIVE_MODEL (read)
#   MODEL_1_NAME, MODEL_1_PATH, MODEL_2_NAME, MODEL_2_PATH (read)
# Arguments:
#   $1 - "1" or "2"
# Outputs:
#   Progress and the restart hint on stdout; errors on stderr
# Exits:
#   1 when the argument is not 1 or 2; the copy's own status when it fails
#######################################
activate_model() {
  local choice="${1:-}"
  local chosen_name chosen_path

  # Validate first so that nothing is touched on a bad argument.
  case "$choice" in
    1)
      chosen_name="$MODEL_1_NAME"
      chosen_path="$MODEL_1_PATH"
      ;;
    2)
      chosen_name="$MODEL_2_NAME"
      chosen_path="$MODEL_2_PATH"
      ;;
    *)
      echo "Error: --activate expects 1 or 2" >&2
      exit 1
      ;;
  esac

  echo "Activating model $choice: $chosen_name"
  # Stop here on failure so the success message below is never printed for a
  # copy that did not happen, even where `set -e` is not in effect.
  docker compose run --rm "$COMPOSE_SERVICE" cp "$chosen_path" "$ACTIVE_MODEL" || exit "$?"

  echo "Active model is now: $ACTIVE_MODEL"
  echo "Restart the OCR service to load the new model:"
  echo " docker compose restart $COMPOSE_SERVICE"
}
|
|
|
|
# ─── Main ─────────────────────────────────────────────────────────────────────
|
|
|
|
#######################################
# Prints the supported invocations.
#######################################
_usage() {
  cat <<'EOF'
Usage:
  ./scripts/download-kraken-models.sh               # download models + run evaluation
  ./scripts/download-kraken-models.sh --activate 1  # activate model 1
  ./scripts/download-kraken-models.sh --activate 2  # activate model 2
  ./scripts/download-kraken-models.sh --eval-only   # re-run evaluation (models already downloaded)
EOF
}

#######################################
# Entry point: dispatches on the first command-line argument.
# Arguments:
#   (none)          download both models, then run the evaluation
#   --activate N    activate model N (validated by activate_model)
#   --eval-only     run the evaluation only
#   -h, --help      print usage
# Exits:
#   2 on an unrecognised argument, so that a typo does not silently start the
#   full download + evaluation.
#######################################
main() {
  case "${1:-}" in
    "")
      download_models
      run_evaluation
      ;;
    --activate)
      activate_model "${2:-}"
      ;;
    --eval-only)
      run_evaluation
      ;;
    -h | --help)
      _usage
      ;;
    *)
      echo "Error: unknown option: $1" >&2
      _usage >&2
      exit 2
      ;;
  esac
}

main "$@"
|