126 lines
3.9 KiB
Bash
Executable File
126 lines
3.9 KiB
Bash
Executable File
#!/bin/bash
#
# Bulk-download every document from the server at BASE_URL in batches.
# (The endpoint paths used here look like the Paperless-ngx REST API —
# confirm against the server you are targeting.)
#
# Steps:
#   1. Authenticate against /api/token/ to obtain an API token.
#   2. Page through /api/documents/ and collect all numeric document IDs.
#   3. POST the IDs in groups of BATCH_SIZE to /api/documents/bulk_download/,
#      save the returned zip, and extract it to OUTPUT_DIR/batch_NNN/.
#
# Resume support: a batch whose directory already exists is skipped. A batch
# directory only appears after a complete, successful extraction, so a failed
# or interrupted run never marks a batch as done.
#
# Credentials: taken from PAPERLESS_USERNAME / PAPERLESS_PASSWORD if set,
# otherwise prompted for interactively. They are never stored in this file.
#
# Requires: curl, jq, unzip.
# Exit status: 0 if every batch was downloaded or already present,
#              1 on authentication/listing failure or if any batch failed.
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────
BASE_URL="https://dg-familienarchiv.de"
BATCH_SIZE=10
OUTPUT_DIR="./paperless-download"
MAX_RETRIES=3
RETRY_DELAY=10 # seconds between retries
# ─────────────────────────────────────────────────────────────────────────────

# Fail early with a clear message if a required tool is missing.
for TOOL in curl jq unzip; do
  if ! command -v "$TOOL" >/dev/null 2>&1; then
    echo "Required tool not found: $TOOL" >&2
    exit 1
  fi
done

# Prompt for credentials (environment variables take precedence so the script
# can also run unattended without secrets being committed to the file).
USERNAME="${PAPERLESS_USERNAME:-}"
PASSWORD="${PAPERLESS_PASSWORD:-}"
if [[ -z "$USERNAME" ]]; then
  read -r -p "Username: " USERNAME
fi
if [[ -z "$PASSWORD" ]]; then
  read -r -s -p "Password: " PASSWORD
  echo
fi

echo "Authenticating..."
AUTH_BODY=$(jq -cn --arg u "$USERNAME" --arg p "$PASSWORD" '{username: $u, password: $p}')
# The body is fed to curl on stdin so the password never appears in the
# process argument list. A transport failure yields an empty response, which
# is reported by the check below instead of aborting silently under `set -e`.
AUTH_RESP=$(printf '%s' "$AUTH_BODY" | curl -s -X POST \
  -H "Content-Type: application/json" \
  --data-binary @- \
  "$BASE_URL/api/token/") || AUTH_RESP=""
# A non-JSON response (e.g. an HTML error page) makes jq fail; treat that as
# "no token" so the diagnostic below is still printed.
TOKEN=$(printf '%s' "$AUTH_RESP" | jq -r '.token // empty' 2>/dev/null) || TOKEN=""

if [[ -z "$TOKEN" || "$TOKEN" == "null" ]]; then
  echo "Authentication failed. Server responded:" >&2
  echo "$AUTH_RESP" >&2
  exit 1
fi
echo "Authenticated."

# ── Collect all document IDs ───────────────────────────────────────────────
echo "Fetching document list..."
ALL_IDS=()
PAGE=1
while true; do
  if ! RESP=$(curl -sS --fail \
    -H "Authorization: Token $TOKEN" \
    "$BASE_URL/api/documents/?page=$PAGE&page_size=100"); then
    echo "Failed to fetch document list (page $PAGE)." >&2
    exit 1
  fi

  if ! IDS=$(printf '%s' "$RESP" | jq -r '.results[].id'); then
    echo "Unexpected document list response on page $PAGE:" >&2
    printf '%s\n' "$RESP" | head -c 2000 >&2
    exit 1
  fi

  # Keep only purely numeric IDs. Filtering happens in the shell rather than
  # with grep, because grep's "no match" exit status would abort the script
  # under `set -e -o pipefail` before the empty-page check could run.
  PAGE_COUNT=0
  while IFS= read -r id; do
    id=${id//$'\r'/}
    [[ "$id" =~ ^[0-9]+$ ]] || continue
    ALL_IDS+=("$id")
    PAGE_COUNT=$((PAGE_COUNT + 1))
  done <<< "$IDS"
  if [[ "$PAGE_COUNT" -eq 0 ]]; then
    break
  fi

  # `.next` is null on the last page.
  NEXT=$(printf '%s' "$RESP" | jq -r '.next // empty')
  if [[ -z "$NEXT" ]]; then
    break
  fi
  PAGE=$((PAGE + 1))
done

TOTAL=${#ALL_IDS[@]}
echo "Found $TOTAL documents."

# ── Download in batches ────────────────────────────────────────────────────
mkdir -p "$OUTPUT_DIR"

BATCH_NUM=0
START=0
FAILED_BATCHES=0
while [[ "$START" -lt "$TOTAL" ]]; do
  END=$((START + BATCH_SIZE))
  if [[ "$END" -gt "$TOTAL" ]]; then
    END=$TOTAL
  fi
  BATCH_IDS=("${ALL_IDS[@]:$START:$((END - START))}")
  BATCH_NUM=$((BATCH_NUM + 1))

  PADDED=$(printf "%03d" "$BATCH_NUM")
  ZIP_FILE="$OUTPUT_DIR/batch_$PADDED.zip"
  HEADER_FILE="$ZIP_FILE.headers"
  BATCH_DIR="$OUTPUT_DIR/batch_$PADDED"
  PARTIAL_DIR="$BATCH_DIR.partial"

  # Skip already-extracted batches (resume support)
  if [[ -d "$BATCH_DIR" ]]; then
    echo "Skipping batch $BATCH_NUM (already extracted)."
    START=$END
    continue
  fi

  echo "Downloading batch $BATCH_NUM (docs $((START + 1))–$END of $TOTAL)..."

  # Build the request body with jq instead of string interpolation.
  REQUEST_BODY=$(printf '%s\n' "${BATCH_IDS[@]}" \
    | jq -cs '{documents: ., content: "originals"}')

  ATTEMPT=0
  SUCCESS=false
  while [[ "$ATTEMPT" -lt "$MAX_RETRIES" ]]; do
    ATTEMPT=$((ATTEMPT + 1))
    rm -f -- "$ZIP_FILE" "$HEADER_FILE"

    # Headers go to a file (not a pipeline) so curl's own exit status can be
    # inspected; `|| CURL_RC=$?` keeps a timeout or network error from
    # terminating the script and lets the retry loop handle it.
    CURL_RC=0
    curl -s --max-time 300 -X POST \
      -H "Authorization: Token $TOKEN" \
      -H "Content-Type: application/json" \
      -d "$REQUEST_BODY" \
      -D "$HEADER_FILE" \
      --output "$ZIP_FILE" \
      "$BASE_URL/api/documents/bulk_download/" || CURL_RC=$?

    CONTENT_TYPE=""
    if [[ -f "$HEADER_FILE" ]]; then
      CONTENT_TYPE=$(grep -i '^content-type:' "$HEADER_FILE" | tr -d '\r' || true)
    fi
    rm -f -- "$HEADER_FILE"

    if [[ "$CURL_RC" -ne 0 ]]; then
      echo " Attempt $ATTEMPT: curl failed (exit $CURL_RC)." >&2
    elif [[ ! -s "$ZIP_FILE" ]]; then
      echo " Attempt $ATTEMPT: empty response, retrying in ${RETRY_DELAY}s..." >&2
    elif grep -qiE 'zip|octet-stream' <<< "$CONTENT_TYPE"; then
      SUCCESS=true
      break
    else
      echo " Attempt $ATTEMPT: unexpected response (${CONTENT_TYPE}):" >&2
      # Show only the start of the body; it may be large or binary.
      head -c 2000 "$ZIP_FILE" >&2 || true
      echo >&2
    fi

    rm -f -- "$ZIP_FILE"
    # No point sleeping after the final attempt.
    if [[ "$ATTEMPT" -lt "$MAX_RETRIES" ]]; then
      sleep "$RETRY_DELAY"
    fi
  done

  if [[ "$SUCCESS" == false ]]; then
    echo " Batch $BATCH_NUM failed after $MAX_RETRIES attempts, skipping." >&2
    FAILED_BATCHES=$((FAILED_BATCHES + 1))
    START=$END
    continue
  fi

  # Extract into a scratch directory and rename only on success, so that the
  # resume check above never mistakes a half-extracted batch for a finished one.
  rm -rf -- "${PARTIAL_DIR:?}"
  mkdir -p "$PARTIAL_DIR"
  if unzip -q "$ZIP_FILE" -d "$PARTIAL_DIR"; then
    mv -- "$PARTIAL_DIR" "$BATCH_DIR"
    rm -f -- "$ZIP_FILE"
    echo " Extracted to $BATCH_DIR/"
  else
    echo " Batch $BATCH_NUM: extraction failed, skipping." >&2
    rm -rf -- "${PARTIAL_DIR:?}"
    rm -f -- "$ZIP_FILE"
    FAILED_BATCHES=$((FAILED_BATCHES + 1))
  fi

  START=$END
done

if [[ "$FAILED_BATCHES" -gt 0 ]]; then
  echo "Finished with $FAILED_BATCHES failed batch(es). Re-run to retry them." >&2
  exit 1
fi

echo "Done. All files saved to $OUTPUT_DIR/"