refactor(document): move document domain core to document/ package

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-05 12:39:20 +02:00
parent bb7d872a61
commit e85057bed2
2371 changed files with 385726 additions and 1971 deletions

125
scripts/download-paperless.sh Executable file
View File

@@ -0,0 +1,125 @@
#!/usr/bin/env bash
#
# Downloads every document from a Paperless-style REST API in batches
# (via /api/documents/bulk_download/) and extracts each batch into its own
# directory below OUTPUT_DIR.
#
# Optional environment overrides (defaults shown in the Config section):
#   PAPERLESS_BASE_URL, PAPERLESS_BATCH_SIZE, PAPERLESS_OUTPUT_DIR,
#   PAPERLESS_MAX_RETRIES, PAPERLESS_RETRY_DELAY
set -euo pipefail
# ── Config ────────────────────────────────────────────────────────────────────
BASE_URL="${PAPERLESS_BASE_URL:-https://dg-familienarchiv.de}"
BATCH_SIZE="${PAPERLESS_BATCH_SIZE:-10}"
OUTPUT_DIR="${PAPERLESS_OUTPUT_DIR:-./paperless-download}"
MAX_RETRIES="${PAPERLESS_MAX_RETRIES:-3}"
RETRY_DELAY="${PAPERLESS_RETRY_DELAY:-10}" # seconds between retries
# A trailing slash would produce "//api/..." in the request URLs built later.
BASE_URL="${BASE_URL%/}"
# BATCH_SIZE and MAX_RETRIES drive loop bounds: a batch size of 0 would never
# advance through the document list, and 0 retries would never attempt a
# download, so both must be strictly positive integers.
for _cfg_name in BATCH_SIZE MAX_RETRIES; do
  if ! [[ "${!_cfg_name}" =~ ^[1-9][0-9]*$ ]]; then
    echo "$_cfg_name must be a positive integer (got '${!_cfg_name}')." >&2
    exit 1
  fi
done
if ! [[ "$RETRY_DELAY" =~ ^[0-9]+$ ]]; then
  echo "RETRY_DELAY must be a non-negative integer (got '$RETRY_DELAY')." >&2
  exit 1
fi
unset _cfg_name
# ─────────────────────────────────────────────────────────────────────────────
# ── Authentication ────────────────────────────────────────────────────────────
# Credentials are deliberately NOT stored in this file. They are taken from the
# PAPERLESS_USERNAME / PAPERLESS_PASSWORD environment variables when set, and
# prompted for otherwise (the password prompt does not echo).
# NOTE(review): an earlier revision of this script hard-coded a real username
# and password here; that password is in version-control history and should be
# rotated.
USERNAME="${PAPERLESS_USERNAME:-}"
PASSWORD="${PAPERLESS_PASSWORD:-}"
if [ -z "$USERNAME" ]; then
  # `|| USERNAME=""` keeps set -e from aborting on EOF; the check below reports it.
  read -r -p "Username: " USERNAME || USERNAME=""
fi
if [ -z "$PASSWORD" ]; then
  read -r -s -p "Password: " PASSWORD || PASSWORD=""
  echo >&2 # read -s does not echo the newline the user typed
fi
if [ -z "$USERNAME" ] || [ -z "$PASSWORD" ]; then
  echo "Username and password are required." >&2
  exit 1
fi
echo "Authenticating..."
# The password is fed to jq on stdin (-Rs reads all of stdin as one raw string)
# and the resulting JSON is fed to curl on stdin (--data-binary @-), so the
# password never appears in any process's argument list.
AUTH_BODY=$(printf '%s' "$PASSWORD" \
  | jq -Rs -c --arg u "$USERNAME" '{username: $u, password: .}')
AUTH_RESP=$(curl -s -X POST \
  -H "Content-Type: application/json" \
  --data-binary @- \
  "$BASE_URL/api/token/" <<< "$AUTH_BODY") || {
  echo "Authentication request to $BASE_URL failed." >&2
  exit 1
}
# The secret is no longer needed once the request has been sent.
unset PASSWORD AUTH_BODY
# `.token // empty` yields "" for a missing or null token. If the reply is not
# JSON at all (e.g. an HTML error page) jq fails; that is treated the same way
# so the server's reply is still shown below instead of the script aborting.
TOKEN=$(jq -r '.token // empty' <<< "$AUTH_RESP" 2> /dev/null) || TOKEN=""
if [ -z "$TOKEN" ]; then
  echo "Authentication failed. Server responded:" >&2
  echo "$AUTH_RESP" >&2
  exit 1
fi
echo "Authenticated."
# ── Collect all document IDs ───────────────────────────────────────────────
# Walks the paginated /api/documents/ listing (100 per page) and appends every
# numeric document id to ALL_IDS. Stops when a page yields no ids or when the
# response's "next" field is null. TOTAL is the number of ids collected.
echo "Fetching document list..."
ALL_IDS=()
PAGE=1
while true; do
  # Every step is checked explicitly: under `set -euo pipefail` a bare
  # `VAR=$(curl | jq | grep)` aborts the whole script without any message as
  # soon as one stage exits non-zero — including grep simply finding no ids.
  if ! RESP=$(curl -s \
    -H "Authorization: Token $TOKEN" \
    "$BASE_URL/api/documents/?page=$PAGE&page_size=100"); then
    echo "Failed to fetch document list (page $PAGE)." >&2
    exit 1
  fi
  # Require a "results" array so an error payload is reported as an error
  # rather than being mistaken for an empty page.
  if [ -z "$RESP" ] || ! PAGE_IDS=$(jq -r '
      if (.results | type) == "array"
      then .results[].id
      else error("missing results array")
      end' <<< "$RESP" 2> /dev/null); then
    echo "Unexpected response for page $PAGE. Server responded:" >&2
    echo "$RESP" >&2
    exit 1
  fi
  PAGE_COUNT=0
  while IFS= read -r id; do
    id=${id%$'\r'} # tolerate CRLF line endings in jq's output
    if [[ "$id" =~ ^[0-9]+$ ]]; then
      ALL_IDS+=("$id")
      PAGE_COUNT=$((PAGE_COUNT + 1))
    fi
  done <<< "$PAGE_IDS"
  if [ "$PAGE_COUNT" -eq 0 ]; then
    break
  fi
  NEXT=$(jq -r '.next' <<< "$RESP")
  if [ "$NEXT" = "null" ]; then
    break
  fi
  PAGE=$((PAGE + 1))
done
TOTAL=${#ALL_IDS[@]}
echo "Found $TOTAL documents."
# ── Download in batches ────────────────────────────────────────────────────
# Splits ALL_IDS into consecutive groups of BATCH_SIZE, requests each group as
# one zip from /api/documents/bulk_download/ and unpacks it into
# $OUTPUT_DIR/batch_NNN. A batch whose directory already exists is skipped, so
# the script can be re-run to resume.
# NOTE: batch numbers are derived from a document's position in ALL_IDS, so
# resuming assumes the listing comes back in the same order as the last run.
if ! command -v unzip > /dev/null; then
  echo "unzip is required but was not found in PATH." >&2
  exit 1
fi
mkdir -p "$OUTPUT_DIR"
BATCH_NUM=0
START=0
FAILED_COUNT=0
FAILED_LIST=""
while [ "$START" -lt "$TOTAL" ]; do
  END=$((START + BATCH_SIZE))
  if [ "$END" -gt "$TOTAL" ]; then
    END=$TOTAL
  fi
  BATCH_IDS=("${ALL_IDS[@]:$START:$((END - START))}")
  BATCH_NUM=$((BATCH_NUM + 1))
  PADDED=$(printf "%03d" "$BATCH_NUM")
  ZIP_FILE="$OUTPUT_DIR/batch_$PADDED.zip"
  BATCH_DIR="$OUTPUT_DIR/batch_$PADDED"
  PARTIAL_DIR="$BATCH_DIR.partial"
  # Advance the cursor now so that every `continue` below moves to the next
  # batch without having to repeat this bookkeeping.
  FIRST_DOC=$((START + 1))
  START=$END
  # Skip already-extracted batches (resume support)
  if [ -d "$BATCH_DIR" ]; then
    echo "Skipping batch $BATCH_NUM (already extracted)."
    continue
  fi
  echo "Downloading batch $BATCH_NUM (docs $FIRST_DOC-$END of $TOTAL)..."
  JSON_IDS=$(printf '%s\n' "${BATCH_IDS[@]}" | jq -s -c 'map(tonumber)')
  ATTEMPT=0
  SUCCESS=false
  while [ "$ATTEMPT" -lt "$MAX_RETRIES" ]; do
    ATTEMPT=$((ATTEMPT + 1))
    # `|| CURL_STATUS=$?` keeps set -e from aborting the script when curl
    # fails (timeout, connection reset, ...), so the retry loop can actually
    # retry. --write-out prints the response's Content-Type on stdout while
    # the body goes to ZIP_FILE.
    CURL_STATUS=0
    CONTENT_TYPE=$(curl -s --max-time 300 -X POST \
      -H "Authorization: Token $TOKEN" \
      -H "Content-Type: application/json" \
      -d "{\"documents\": $JSON_IDS, \"content\": \"originals\"}" \
      --output "$ZIP_FILE" \
      --write-out '%{content_type}' \
      "$BASE_URL/api/documents/bulk_download/") || CURL_STATUS=$?
    SHOW_BODY=false
    if [ "$CURL_STATUS" -ne 0 ]; then
      PROBLEM="curl failed (exit $CURL_STATUS)"
    elif [ ! -s "$ZIP_FILE" ]; then
      PROBLEM="empty response"
    elif grep -qiE 'zip|octet-stream' <<< "$CONTENT_TYPE"; then
      SUCCESS=true
      break
    else
      PROBLEM="unexpected response (${CONTENT_TYPE}):"
      SHOW_BODY=true
    fi
    echo " Attempt $ATTEMPT: $PROBLEM" >&2
    if [ "$SHOW_BODY" = true ]; then
      # Only the start of the body: it is normally a short error message but
      # could be arbitrarily large.
      head -c 2048 "$ZIP_FILE" >&2
      echo >&2
    fi
    rm -f -- "$ZIP_FILE"
    # No point sleeping after the final attempt.
    if [ "$ATTEMPT" -lt "$MAX_RETRIES" ]; then
      echo " Retrying in ${RETRY_DELAY}s..." >&2
      sleep "$RETRY_DELAY"
    fi
  done
  if [ "$SUCCESS" = false ]; then
    echo " Batch $BATCH_NUM failed after $MAX_RETRIES attempts, skipping." >&2
    FAILED_COUNT=$((FAILED_COUNT + 1))
    FAILED_LIST="$FAILED_LIST $BATCH_NUM"
    continue
  fi
  # Extract into a scratch directory and rename it only on success. The
  # resume check above treats the existence of BATCH_DIR as "done", so
  # BATCH_DIR must never exist in a half-extracted state.
  rm -rf -- "${PARTIAL_DIR:?}"
  mkdir -p "$PARTIAL_DIR"
  UNZIP_STATUS=0
  unzip -q "$ZIP_FILE" -d "$PARTIAL_DIR" || UNZIP_STATUS=$?
  # unzip exits 1 for warnings after completing the extraction; anything
  # higher is a real failure.
  if [ "$UNZIP_STATUS" -gt 1 ]; then
    echo " Batch $BATCH_NUM: extraction failed (unzip exit $UNZIP_STATUS); kept $ZIP_FILE." >&2
    rm -rf -- "${PARTIAL_DIR:?}"
    FAILED_COUNT=$((FAILED_COUNT + 1))
    FAILED_LIST="$FAILED_LIST $BATCH_NUM"
    continue
  fi
  if [ "$UNZIP_STATUS" -eq 1 ]; then
    echo " Batch $BATCH_NUM: unzip reported warnings." >&2
  fi
  mv -- "$PARTIAL_DIR" "$BATCH_DIR"
  rm -f -- "$ZIP_FILE"
  echo " Extracted to $BATCH_DIR/"
done
if [ "$FAILED_COUNT" -gt 0 ]; then
  echo "Done with errors: $FAILED_COUNT batch(es) failed (batch:$FAILED_LIST). Re-run to retry them." >&2
  exit 1
fi
echo "Done. All files saved to $OUTPUT_DIR/"