fix(ocr): parse kraken 'Model dir' output to locate downloaded model
Some checks failed
CI / Unit & Component Tests (push) Failing after 1s
CI / Backend Unit Tests (push) Failing after 0s
CI / Unit & Component Tests (pull_request) Failing after 1s
CI / Backend Unit Tests (pull_request) Failing after 0s

The previous approach searched the htrmopo cache with find, which failed
because the -newer /tmp check ran in a separate container. The script now
parses the 'Model dir: <path>' line from the kraken get output directly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-12 20:09:23 +02:00
parent f12b41161e
commit c0004f5e6f

View File

@@ -37,29 +37,33 @@ download_model() {
echo " Downloading $name ($doi)..." echo " Downloading $name ($doi)..."
# kraken get downloads to /root/.local/share/htrmopo/<uuid>/ # kraken get downloads to /root/.local/share/htrmopo/<uuid>/<name>.mlmodel
# We find the .mlmodel file after download and copy it to our volume # Parse the "Model dir: <path>" line from kraken output to locate the file
docker compose run --rm "$COMPOSE_SERVICE" sh -c " docker compose run --rm "$COMPOSE_SERVICE" sh -c '
kraken get $doi 2>&1 OUTPUT=$(kraken get '"$doi"' 2>&1)
# Find the most recently downloaded .mlmodel and copy to target echo "$OUTPUT"
FOUND=\$(find /root/.local/share/htrmopo -name '*.mlmodel' -newer /tmp 2>/dev/null | head -1) MODEL_DIR=$(echo "$OUTPUT" | grep -oP "Model dir: \K[^ ]+")
if [ -n \"\$FOUND\" ]; then if [ -n "$MODEL_DIR" ] && [ -d "$MODEL_DIR" ]; then
cp \"\$FOUND\" $dest FOUND=$(find "$MODEL_DIR" -name "*.mlmodel" | head -1)
echo \"Saved to $dest\" if [ -n "$FOUND" ]; then
cp "$FOUND" '"$dest"'
echo "Saved to '"$dest"'"
else
echo "ERROR: No .mlmodel file in $MODEL_DIR"
ls -la "$MODEL_DIR"
exit 1
fi
else else
echo 'ERROR: No .mlmodel file found after download' echo "ERROR: Could not parse model directory from kraken output"
exit 1 exit 1
fi fi
" '
} }
download_models() { download_models() {
echo "Downloading Kraken HTR models into the ocr_models volume..." echo "Downloading Kraken HTR models into the ocr_models volume..."
echo "" echo ""
# Create a timestamp marker so we can find newly downloaded files
docker compose run --rm "$COMPOSE_SERVICE" touch /tmp/.download-marker
echo "Model 1: $MODEL_1_NAME" echo "Model 1: $MODEL_1_NAME"
echo " $MODEL_1_DESC" echo " $MODEL_1_DESC"
download_model "$MODEL_1_DOI" "$MODEL_1_PATH" "$MODEL_1_NAME" download_model "$MODEL_1_DOI" "$MODEL_1_PATH" "$MODEL_1_NAME"