feat: OCR pipeline with NDJSON streaming and real-time progress (#226, #227, #231) #229

Merged
marcel merged 74 commits from feat/issue-226-227-ocr-pipeline-polygon into main 2026-04-13 12:39:04 +02:00
Showing only changes of commit c0004f5e6f - Show all commits

View File

@@ -37,29 +37,33 @@ download_model() {
echo " Downloading $name ($doi)..."
# kraken get downloads to /root/.local/share/htrmopo/<uuid>/
# We find the .mlmodel file after download and copy it to our volume
docker compose run --rm "$COMPOSE_SERVICE" sh -c "
kraken get $doi 2>&1
# Find the most recently downloaded .mlmodel and copy to target
FOUND=\$(find /root/.local/share/htrmopo -name '*.mlmodel' -newer /tmp 2>/dev/null | head -1)
if [ -n \"\$FOUND\" ]; then
cp \"\$FOUND\" $dest
echo \"Saved to $dest\"
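# kraken get downloads to /root/.local/share/htrmopo/<uuid>/<name>.mlmodel.
# Parse the "Model dir: <path>" line from the kraken output to locate the file.
# NOTE(review): the parse uses `grep -oP` with `\K` (PCRE), which is a GNU grep
# extension - confirm the service image ships GNU grep with PCRE support.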
docker compose run --rm "$COMPOSE_SERVICE" sh -c '
OUTPUT=$(kraken get '"$doi"' 2>&1)
echo "$OUTPUT"
MODEL_DIR=$(echo "$OUTPUT" | grep -oP "Model dir: \K[^ ]+")
if [ -n "$MODEL_DIR" ] && [ -d "$MODEL_DIR" ]; then
FOUND=$(find "$MODEL_DIR" -name "*.mlmodel" | head -1)
if [ -n "$FOUND" ]; then
cp "$FOUND" '"$dest"'
echo "Saved to '"$dest"'"
else
echo "ERROR: No .mlmodel file in $MODEL_DIR"
ls -la "$MODEL_DIR"
exit 1
fi
else
echo 'ERROR: No .mlmodel file found after download'
echo "ERROR: Could not parse model directory from kraken output"
exit 1
fi
"
'
}
download_models() {
echo "Downloading Kraken HTR models into the ocr_models volume..."
echo ""
# Create a timestamp marker so we can find newly downloaded files
docker compose run --rm "$COMPOSE_SERVICE" touch /tmp/.download-marker
echo "Model 1: $MODEL_1_NAME"
echo " $MODEL_1_DESC"
download_model "$MODEL_1_DOI" "$MODEL_1_PATH" "$MODEL_1_NAME"