#!/bin/bash
#
# Bulk-download every document from a Paperless-ngx instance in batches of
# BATCH_SIZE, with retry and resume support (already-extracted batches are
# skipped on re-run).
#
# Usage:
#   PAPERLESS_USER=alice PAPERLESS_PASSWORD=... ./download.sh
# Missing credentials are prompted for interactively.
#
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────
BASE_URL="${PAPERLESS_URL:-https://dg-familienarchiv.de}"
BATCH_SIZE=10
OUTPUT_DIR="./paperless-download"
MAX_RETRIES=3
RETRY_DELAY=10   # seconds between retries
# ──────────────────────────────────────────────────────────────────────────────

# Prompt for credentials.
# SECURITY: never hardcode secrets in the script (they end up in VCS and
# backups).  Read from the environment, fall back to an interactive prompt
# (-s keeps the password off the terminal).
if [ -z "${PAPERLESS_USER:-}" ]; then
  read -r -p "Username: " PAPERLESS_USER
fi
if [ -z "${PAPERLESS_PASSWORD:-}" ]; then
  read -r -s -p "Password: " PAPERLESS_PASSWORD
  echo
fi
export PAPERLESS_USER PAPERLESS_PASSWORD

echo "Authenticating..."
# jq reads the secrets from the environment so they never appear in argv
# (command-line arguments are visible to other users via `ps`).
AUTH_BODY=$(jq -n '{username: env.PAPERLESS_USER, password: env.PAPERLESS_PASSWORD}')
AUTH_RESP=$(curl -s -X POST \
  -H "Content-Type: application/json" \
  -d "$AUTH_BODY" \
  "$BASE_URL/api/token/")
TOKEN=$(printf '%s' "$AUTH_RESP" | jq -r '.token')

if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
  echo "Authentication failed. Server responded:" >&2
  echo "$AUTH_RESP" >&2
  exit 1
fi
echo "Authenticated."

# ── Collect all document IDs ──────────────────────────────────────────────────
echo "Fetching document list..."
ALL_IDS=()
PAGE=1
while true; do
  RESP=$(curl -s \
    -H "Authorization: Token $TOKEN" \
    "$BASE_URL/api/documents/?page=$PAGE&page_size=100")
  # '|| true' is required: grep exits 1 when nothing matches, which under
  # 'set -e -o pipefail' would kill the script before the empty-page check
  # below ever runs.
  IDS=$(printf '%s' "$RESP" | jq -r '.results[].id' | tr -d '\r' | grep -E '^[0-9]+$' || true)
  [ -z "$IDS" ] && break
  while IFS= read -r id; do
    ALL_IDS+=("$id")
  done <<< "$IDS"
  NEXT=$(printf '%s' "$RESP" | jq -r '.next')
  [ "$NEXT" = "null" ] && break
  PAGE=$((PAGE + 1))
done

TOTAL=${#ALL_IDS[@]}
echo "Found $TOTAL documents."

# ── Download in batches ───────────────────────────────────────────────────────
mkdir -p "$OUTPUT_DIR"
BATCH_NUM=0
START=0

while [ "$START" -lt "$TOTAL" ]; do
  END=$((START + BATCH_SIZE))
  [ "$END" -gt "$TOTAL" ] && END=$TOTAL
  BATCH_IDS=("${ALL_IDS[@]:START:END - START}")
  BATCH_NUM=$((BATCH_NUM + 1))
  PADDED=$(printf '%03d' "$BATCH_NUM")
  ZIP_FILE="$OUTPUT_DIR/batch_$PADDED.zip"
  BATCH_DIR="$OUTPUT_DIR/batch_$PADDED"

  # Skip already-extracted batches (resume support)
  if [ -d "$BATCH_DIR" ]; then
    echo "Skipping batch $BATCH_NUM (already extracted)."
    START=$END
    continue
  fi

  echo "Downloading batch $BATCH_NUM (docs $((START + 1))–$END of $TOTAL)..."
  # IDs were already validated as numeric during collection above, so no
  # re-filtering is needed here (a grep that matched nothing would also trip
  # 'set -e -o pipefail').
  JSON_IDS=$(printf '%s\n' "${BATCH_IDS[@]}" | jq -s '[.[] | tonumber]')

  ATTEMPT=0
  SUCCESS=false
  while [ "$ATTEMPT" -lt "$MAX_RETRIES" ]; do
    ATTEMPT=$((ATTEMPT + 1))
    # Body is streamed to $ZIP_FILE; '-D -' dumps the response headers to
    # stdout so we can inspect the Content-Type.  '|| true' because grep
    # exits 1 when the header is absent, which must not abort the retry
    # loop under 'set -e -o pipefail'.
    CONTENT_TYPE=$(curl -s --max-time 300 -X POST \
      -H "Authorization: Token $TOKEN" \
      -H "Content-Type: application/json" \
      -d "{\"documents\": $JSON_IDS, \"content\": \"originals\"}" \
      "$BASE_URL/api/documents/bulk_download/" \
      -D - \
      --output "$ZIP_FILE" | grep -i '^content-type:' | tr -d '\r' || true)

    if [ ! -s "$ZIP_FILE" ]; then
      echo " Attempt $ATTEMPT: empty response, retrying in ${RETRY_DELAY}s..." >&2
      rm -f "$ZIP_FILE"
      sleep "$RETRY_DELAY"
      continue
    fi

    if echo "$CONTENT_TYPE" | grep -qi 'zip\|octet-stream'; then
      SUCCESS=true
      break
    else
      # Non-archive response: the body is an error payload — show it.
      echo " Attempt $ATTEMPT: unexpected response (${CONTENT_TYPE}):" >&2
      cat "$ZIP_FILE" >&2
      rm -f "$ZIP_FILE"
      sleep "$RETRY_DELAY"
    fi
  done

  if [ "$SUCCESS" = false ]; then
    echo " Batch $BATCH_NUM failed after $MAX_RETRIES attempts, skipping." >&2
    START=$END
    continue
  fi

  mkdir -p "$BATCH_DIR"
  # On a corrupt archive, remove the (partial) batch dir so the resume
  # check above does not mistake it for a completed batch on the next run.
  if ! unzip -q "$ZIP_FILE" -d "$BATCH_DIR"; then
    echo " Batch $BATCH_NUM: archive extraction failed, will retry on next run." >&2
    rm -rf "$BATCH_DIR"
    rm -f "$ZIP_FILE"
    START=$END
    continue
  fi
  rm "$ZIP_FILE"
  echo " Extracted to $BATCH_DIR/"
  START=$END
done

echo "Done. All files saved to $OUTPUT_DIR/"