refactor(document): move document domain core to document/ package
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
125
scripts/download-paperless.sh
Executable file
125
scripts/download-paperless.sh
Executable file
@@ -0,0 +1,125 @@
|
||||
#!/bin/bash
#
# Bulk-download the original files of every document from a Paperless-style
# REST API: authenticate, list all document IDs, then fetch and unpack them in
# batches.
#
# Requires: bash, curl, jq, unzip.
#
# Every setting below can be overridden through the environment without
# editing this file, e.g.:
#   PAPERLESS_BATCH_SIZE=25 PAPERLESS_OUTPUT_DIR=/data/export ./download-paperless.sh
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────
BASE_URL="${PAPERLESS_BASE_URL:-https://dg-familienarchiv.de}"
BATCH_SIZE="${PAPERLESS_BATCH_SIZE:-10}"
OUTPUT_DIR="${PAPERLESS_OUTPUT_DIR:-./paperless-download}"
MAX_RETRIES="${PAPERLESS_MAX_RETRIES:-3}"
RETRY_DELAY="${PAPERLESS_RETRY_DELAY:-10}"  # seconds between retries
# ─────────────────────────────────────────────────────────────────────────────

# Reject bad overrides early: a BATCH_SIZE of 0 would make the batch loop spin
# forever, and a non-numeric value would break the arithmetic further down.
for _cfg in BATCH_SIZE MAX_RETRIES; do
  if ! [[ "${!_cfg}" =~ ^[1-9][0-9]*$ ]]; then
    echo "$_cfg must be a positive integer (got '${!_cfg}')." >&2
    exit 2
  fi
done
unset _cfg

if ! [[ "$RETRY_DELAY" =~ ^[0-9]+$ ]]; then
  echo "RETRY_DELAY must be a non-negative integer (got '$RETRY_DELAY')." >&2
  exit 2
fi

# ── Authenticate ──────────────────────────────────────────────────────────────
# Credentials are never stored in this file. They are read from the
# PAPERLESS_USERNAME / PAPERLESS_PASSWORD environment variables when set;
# anything missing is prompted for (password input is not echoed).
#
# NOTE(review): an earlier revision hard-coded a real username and password
# here. That password remains in version-control history and should be rotated.
USERNAME="${PAPERLESS_USERNAME:-}"
PASSWORD="${PAPERLESS_PASSWORD:-}"

if [[ -z "$USERNAME" ]]; then
  # `|| true`: on EOF (no terminal) fall through to the explicit check below
  # instead of letting `set -e` abort without a message.
  read -r -p "Username: " USERNAME || true
fi
if [[ -z "$PASSWORD" ]]; then
  read -r -s -p "Password: " PASSWORD || true
  echo >&2  # terminate the prompt line; the silent read echoes no newline
fi
if [[ -z "$USERNAME" || -z "$PASSWORD" ]]; then
  echo "Username and password are required." >&2
  exit 1
fi

echo "Authenticating..."

# Hand the secrets to jq through its environment and to curl through stdin so
# they never show up in a process argument list (visible via `ps`).
AUTH_BODY=$(AUTH_USER="$USERNAME" AUTH_PASS="$PASSWORD" \
  jq -n '{username: env.AUTH_USER, password: env.AUTH_PASS}')

if ! AUTH_RESP=$(curl -sS -X POST \
    -H "Content-Type: application/json" \
    --data-binary @- \
    "$BASE_URL/api/token/" <<<"$AUTH_BODY"); then
  echo "Authentication request to $BASE_URL failed." >&2
  exit 1
fi

# A non-JSON reply (e.g. an HTML error page) makes jq fail; treat that as
# "no token" so the server response is printed below instead of the script
# dying silently under `set -e`.
TOKEN=$(jq -r '.token // empty' <<<"$AUTH_RESP" 2>/dev/null) || TOKEN=""

if [[ -z "$TOKEN" || "$TOKEN" == "null" ]]; then
  echo "Authentication failed. Server responded:" >&2
  echo "$AUTH_RESP" >&2
  exit 1
fi

# The token is all that is needed from here on; drop the secrets.
unset PASSWORD AUTH_BODY
echo "Authenticated."

# ── Collect all document IDs ───────────────────────────────────────────────
# Walk the paginated /api/documents/ listing (100 per page) and gather every
# numeric document id into ALL_IDS. Stops when a page has no ids or when the
# response's `.next` field is null.
echo "Fetching document list..."
ALL_IDS=()
PAGE=1
while true; do
  if ! RESP=$(curl -sS \
      -H "Authorization: Token $TOKEN" \
      "$BASE_URL/api/documents/?page=$PAGE&page_size=100"); then
    echo "Failed to fetch document list (page $PAGE)." >&2
    exit 1
  fi

  # Fail loudly on anything that is not a listing (error JSON, HTML, ...)
  # rather than letting a jq error abort the script without explanation.
  if ! jq -e '.results | type == "array"' <<<"$RESP" >/dev/null 2>&1; then
    echo "Unexpected response for document list page $PAGE:" >&2
    echo "$RESP" >&2
    exit 1
  fi

  # grep exits 1 when nothing matches; with `pipefail` + `set -e` that would
  # kill the script before the empty-page check, hence the `|| true`.
  IDS=$(jq -r '.results[].id' <<<"$RESP" | tr -d '\r' | grep -E '^[0-9]+$' || true)
  if [[ -z "$IDS" ]]; then
    break
  fi

  while IFS= read -r id; do
    ALL_IDS+=("$id")
  done <<<"$IDS"

  NEXT=$(jq -r '.next' <<<"$RESP")
  if [[ "$NEXT" == "null" ]]; then
    break
  fi
  PAGE=$((PAGE + 1))
done

TOTAL=${#ALL_IDS[@]}
echo "Found $TOTAL documents."

# ── Download in batches ────────────────────────────────────────────────────
# Split ALL_IDS into groups of BATCH_SIZE, request each group from
# /api/documents/bulk_download/ as a zip of originals, and unpack it into
# $OUTPUT_DIR/batch_NNN/. A batch whose directory already exists is skipped,
# which lets an interrupted run be resumed.
#
# NOTE(review): batch numbers are derived from the position in the current
# document listing, so resume presumably assumes the listing order is stable
# between runs — confirm.
mkdir -p "$OUTPUT_DIR"

BATCH_NUM=0
FAILED_BATCHES=0
START=0
while (( START < TOTAL )); do
  END=$(( START + BATCH_SIZE ))
  if (( END > TOTAL )); then
    END=$TOTAL
  fi
  BATCH_IDS=("${ALL_IDS[@]:$START:$((END - START))}")
  BATCH_NUM=$((BATCH_NUM + 1))

  printf -v PADDED '%03d' "$BATCH_NUM"
  ZIP_FILE="$OUTPUT_DIR/batch_$PADDED.zip"
  BATCH_DIR="$OUTPUT_DIR/batch_$PADDED"
  # Extraction happens here first and is renamed to BATCH_DIR only on success,
  # so a half-unpacked batch can never be mistaken for a finished one.
  PARTIAL_DIR="$BATCH_DIR.partial"

  # Skip already-extracted batches (resume support)
  if [[ -d "$BATCH_DIR" ]]; then
    echo "Skipping batch $BATCH_NUM (already extracted)."
    START=$END
    continue
  fi

  echo "Downloading batch $BATCH_NUM (docs $((START+1))–$END of $TOTAL)..."

  # Let jq build the whole request body so it is always well-formed JSON.
  REQUEST_BODY=$(printf '%s\n' "${BATCH_IDS[@]}" \
    | grep -E '^[0-9]+$' \
    | jq -cs '{documents: ., content: "originals"}')

  ATTEMPT=0
  SUCCESS=false
  while (( ATTEMPT < MAX_RETRIES )); do
    ATTEMPT=$((ATTEMPT + 1))

    # `-w '%{content_type}'` prints the response's Content-Type without
    # parsing headers through a grep pipeline, and `|| CURL_RC=$?` keeps a
    # curl error or timeout from aborting the script under `set -e`, so the
    # retry loop actually gets to retry.
    CURL_RC=0
    CONTENT_TYPE=$(curl -sS --max-time 300 -X POST \
      -H "Authorization: Token $TOKEN" \
      -H "Content-Type: application/json" \
      -d "$REQUEST_BODY" \
      -o "$ZIP_FILE" \
      -w '%{content_type}' \
      "$BASE_URL/api/documents/bulk_download/") || CURL_RC=$?

    if (( CURL_RC != 0 )); then
      echo "  Attempt $ATTEMPT: request failed (curl exit $CURL_RC), retrying in ${RETRY_DELAY}s..." >&2
      rm -f -- "$ZIP_FILE"
      sleep "$RETRY_DELAY"
      continue
    fi

    if [[ ! -s "$ZIP_FILE" ]]; then
      echo "  Attempt $ATTEMPT: empty response, retrying in ${RETRY_DELAY}s..." >&2
      rm -f -- "$ZIP_FILE"
      sleep "$RETRY_DELAY"
      continue
    fi

    if grep -qiE 'zip|octet-stream' <<<"$CONTENT_TYPE"; then
      SUCCESS=true
      break
    fi

    # Not an archive: show what the server sent (usually an error body).
    echo "  Attempt $ATTEMPT: unexpected response (${CONTENT_TYPE}):" >&2
    cat "$ZIP_FILE" >&2
    rm -f -- "$ZIP_FILE"
    sleep "$RETRY_DELAY"
  done

  if [[ "$SUCCESS" == false ]]; then
    echo "  Batch $BATCH_NUM failed after $MAX_RETRIES attempts, skipping." >&2
    FAILED_BATCHES=$((FAILED_BATCHES + 1))
    START=$END
    continue
  fi

  rm -rf -- "$PARTIAL_DIR"  # leftover from an earlier interrupted run
  mkdir -p "$PARTIAL_DIR"
  if ! unzip -q "$ZIP_FILE" -d "$PARTIAL_DIR"; then
    echo "  Batch $BATCH_NUM: extracting $ZIP_FILE failed." >&2
    rm -rf -- "$PARTIAL_DIR"
    rm -f -- "$ZIP_FILE"
    exit 1
  fi
  mv -- "$PARTIAL_DIR" "$BATCH_DIR"
  rm -f -- "$ZIP_FILE"
  echo "  Extracted to $BATCH_DIR/"

  START=$END
done

if (( FAILED_BATCHES > 0 )); then
  echo "Warning: $FAILED_BATCHES batch(es) failed and were skipped; re-run the script to retry them." >&2
fi
echo "Done. All files saved to $OUTPUT_DIR/"
Reference in New Issue
Block a user