Files
familienarchiv/scripts/download-paperless.sh
2026-05-05 12:39:20 +02:00

126 lines
3.9 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Bulk-download all documents from the server at BASE_URL: authenticate,
# list every document ID, then fetch and unpack the originals in zip batches.
#
# Credentials are never stored in this file. Provide them through the
# PAPERLESS_USERNAME / PAPERLESS_PASSWORD environment variables, or enter
# them at the interactive prompt.
#
# Requires: curl, jq, unzip.
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────
BASE_URL="https://dg-familienarchiv.de"
BATCH_SIZE=10
OUTPUT_DIR="./paperless-download"
MAX_RETRIES=3
RETRY_DELAY=10 # seconds between retries
# ─────────────────────────────────────────────────────────────────────────────

# Fail early with a clear message instead of a "command not found" mid-run.
for tool in curl jq unzip; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Required tool not found: $tool" >&2
    exit 1
  fi
done

# Prompt for credentials (environment variables take precedence so the script
# can also run unattended). The ':-' defaults keep 'set -u' from aborting when
# the variables are unset.
USERNAME="${PAPERLESS_USERNAME:-}"
PASSWORD="${PAPERLESS_PASSWORD:-}"
if [ -z "$USERNAME" ]; then
  read -r -p "Username: " USERNAME
fi
if [ -z "$PASSWORD" ]; then
  # -s: do not echo the password while it is typed.
  read -r -s -p "Password: " PASSWORD
  echo >&2
fi
if [ -z "$USERNAME" ] || [ -z "$PASSWORD" ]; then
  echo "Username and password are required." >&2
  exit 1
fi

echo "Authenticating..."
# Build the JSON body with jq so special characters are escaped correctly.
# The values are handed over via the environment and the body is sent on
# stdin, so the password never appears in a process argument list.
AUTH_BODY=$(AUTH_USER="$USERNAME" AUTH_PASS="$PASSWORD" \
  jq -cn '{username: env.AUTH_USER, password: env.AUTH_PASS}')
if ! AUTH_RESP=$(curl -sS -X POST \
  -H "Content-Type: application/json" \
  --data-binary @- \
  "$BASE_URL/api/token/" <<<"$AUTH_BODY"); then
  echo "Authentication request to $BASE_URL failed." >&2
  exit 1
fi
# The secrets are no longer needed once the request has been sent.
unset PASSWORD AUTH_BODY

# A non-JSON reply (e.g. an HTML error page) must not abort the script here;
# treat it as "no token" so the server response is printed below.
TOKEN=$(jq -r '.token // empty' <<<"$AUTH_RESP" 2>/dev/null || true)
if [ -z "$TOKEN" ]; then
  echo "Authentication failed. Server responded:" >&2
  echo "$AUTH_RESP" >&2
  exit 1
fi
echo "Authenticated."
# ── Collect all document IDs ───────────────────────────────────────────────
# Walks the paginated /api/documents/ listing (100 entries per page) and
# appends every numeric document ID to ALL_IDS. Stops when a page yields no
# IDs or the response has no "next" link. Sets TOTAL to the number collected.
echo "Fetching document list..."
ALL_IDS=()
PAGE=1
while true; do
  # -f turns HTTP error statuses into a curl failure, so an error page is
  # reported here rather than being fed to jq as if it were a listing.
  if ! RESP=$(curl -fsS \
    -H "Authorization: Token $TOKEN" \
    "$BASE_URL/api/documents/?page=$PAGE&page_size=100"); then
    echo "Failed to fetch document list page $PAGE." >&2
    exit 1
  fi

  # Extract IDs without a grep stage: under 'set -euo pipefail' a grep that
  # matches nothing would abort the script instead of ending the loop.
  if ! IDS=$(jq -r '.results[]? | .id? | numbers' <<<"$RESP"); then
    echo "Could not parse document list page $PAGE. Server responded:" >&2
    printf '%s\n' "$RESP" >&2
    exit 1
  fi

  PAGE_COUNT=0
  while IFS= read -r id; do
    id=${id%$'\r'} # tolerate CRLF output from jq
    if [[ "$id" =~ ^[0-9]+$ ]]; then
      ALL_IDS+=("$id")
      PAGE_COUNT=$((PAGE_COUNT + 1))
    fi
  done <<<"$IDS"
  if [ "$PAGE_COUNT" -eq 0 ]; then
    break
  fi

  # A null or absent "next" link marks the last page.
  NEXT=$(jq -r '.next // empty' <<<"$RESP")
  if [ -z "$NEXT" ]; then
    break
  fi
  PAGE=$((PAGE + 1))
done
TOTAL=${#ALL_IDS[@]}
echo "Found $TOTAL documents."
# ── Download in batches ────────────────────────────────────────────────────
# Splits ALL_IDS into groups of BATCH_SIZE, requests each group from the
# bulk_download endpoint as a zip of originals, and unpacks it into
# $OUTPUT_DIR/batch_NNN. A batch whose directory already exists is skipped,
# which lets an interrupted run be resumed. Each download is attempted up to
# MAX_RETRIES times; batches that still fail are counted and make the script
# exit non-zero at the end.
mkdir -p "$OUTPUT_DIR"
BATCH_NUM=0
START=0
FAILED_BATCHES=0
while ((START < TOTAL)); do
  END=$((START + BATCH_SIZE))
  if ((END > TOTAL)); then
    END=$TOTAL
  fi
  BATCH_IDS=("${ALL_IDS[@]:$START:$((END - START))}")
  BATCH_NUM=$((BATCH_NUM + 1))
  PADDED=$(printf "%03d" "$BATCH_NUM")
  ZIP_FILE="$OUTPUT_DIR/batch_$PADDED.zip"
  BATCH_DIR="$OUTPUT_DIR/batch_$PADDED"
  # Extraction happens here first; BATCH_DIR only appears once it succeeded,
  # so the resume check below never mistakes a half-extracted batch for a
  # finished one.
  PARTIAL_DIR="$BATCH_DIR.partial"

  # Skip already-extracted batches (resume support)
  if [ -d "$BATCH_DIR" ]; then
    echo "Skipping batch $BATCH_NUM (already extracted)."
    START=$END
    continue
  fi

  echo "Downloading batch $BATCH_NUM (docs $((START + 1))-$END of $TOTAL)..."
  # Let jq assemble the request body so the JSON is always well-formed.
  REQUEST_BODY=$(printf '%s\n' "${BATCH_IDS[@]}" \
    | jq -cs '{documents: map(tonumber), content: "originals"}')

  ATTEMPT=0
  SUCCESS=false
  while ((ATTEMPT < MAX_RETRIES)); do
    ATTEMPT=$((ATTEMPT + 1))
    CURL_STATUS=0
    # -w prints the final response's Content-Type on stdout while the body
    # goes to ZIP_FILE. '|| CURL_STATUS=$?' keeps a timeout or network error
    # from tripping 'set -e', so the retry loop can actually retry.
    CONTENT_TYPE=$(curl -s --max-time 300 -X POST \
      -H "Authorization: Token $TOKEN" \
      -H "Content-Type: application/json" \
      -d "$REQUEST_BODY" \
      -o "$ZIP_FILE" \
      -w '%{content_type}' \
      "$BASE_URL/api/documents/bulk_download/") || CURL_STATUS=$?

    if ((CURL_STATUS != 0)); then
      echo "  Attempt $ATTEMPT: curl failed (exit code $CURL_STATUS)." >&2
    elif [ ! -s "$ZIP_FILE" ]; then
      echo "  Attempt $ATTEMPT: empty response." >&2
    elif grep -qiE 'zip|octet-stream' <<<"$CONTENT_TYPE"; then
      SUCCESS=true
      break
    else
      echo "  Attempt $ATTEMPT: unexpected response (content-type: ${CONTENT_TYPE:-unknown}):" >&2
      # Show only the start of the body; it may be large or binary.
      head -c 2000 "$ZIP_FILE" >&2
      echo >&2
    fi

    rm -f -- "$ZIP_FILE"
    # No point in waiting after the final attempt.
    if ((ATTEMPT < MAX_RETRIES)); then
      echo "  Retrying in ${RETRY_DELAY}s..." >&2
      sleep "$RETRY_DELAY"
    fi
  done

  if [ "$SUCCESS" = false ]; then
    echo "  Batch $BATCH_NUM failed after $MAX_RETRIES attempts, skipping." >&2
    FAILED_BATCHES=$((FAILED_BATCHES + 1))
    START=$END
    continue
  fi

  # Unpack into a scratch directory and rename it only on success.
  rm -rf -- "${PARTIAL_DIR:?}"
  mkdir -p "$PARTIAL_DIR"
  if unzip -q "$ZIP_FILE" -d "$PARTIAL_DIR"; then
    mv -- "$PARTIAL_DIR" "$BATCH_DIR"
    echo "  Extracted to $BATCH_DIR/"
  else
    echo "  Batch $BATCH_NUM: extraction failed, skipping." >&2
    rm -rf -- "${PARTIAL_DIR:?}"
    FAILED_BATCHES=$((FAILED_BATCHES + 1))
  fi
  rm -f -- "$ZIP_FILE"
  START=$END
done

if ((FAILED_BATCHES > 0)); then
  echo "Done, but $FAILED_BATCHES batch(es) failed. Re-run the script to retry them." >&2
  exit 1
fi
echo "Done. All files saved to $OUTPUT_DIR/"