refactor(document): move document domain core to document/ package

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-05 12:39:20 +02:00
parent bb7d872a61
commit e85057bed2
2371 changed files with 385726 additions and 1971 deletions
--- a/scripts/download-paperless.sh
+++ b/scripts/download-paperless.sh
@@ -0,0 +1,215 @@
+#!/bin/bash
+#
+# Download every document from the Paperless server at BASE_URL in batches.
+#
+# Flow: authenticate against /api/token/, page through /api/documents/ to
+# collect all document IDs, then request the originals BATCH_SIZE documents at
+# a time from /api/documents/bulk_download/ and unpack each ZIP into
+# OUTPUT_DIR/batch_NNN/. Batches whose directory already exists are skipped,
+# so an interrupted run can simply be started again.
+#
+# Credentials: PAPERLESS_USERNAME / PAPERLESS_PASSWORD from the environment,
+# otherwise prompted for interactively. Requires bash, curl, jq and unzip.
+# Exit status: 0 when every batch was extracted, 1 otherwise.
+set -euo pipefail
+
+# ── Config ────────────────────────────────────────────────────────────────────
+BASE_URL="https://dg-familienarchiv.de"
+BATCH_SIZE=10
+OUTPUT_DIR="./paperless-download"
+MAX_RETRIES=3
+RETRY_DELAY=10   # seconds between retries
+# ─────────────────────────────────────────────────────────────────────────────
+
+# Fail early with a clear message if an external tool is missing.
+for TOOL in curl jq unzip; do
+  if ! command -v "$TOOL" >/dev/null 2>&1; then
+    echo "Required tool not found: $TOOL" >&2
+    exit 1
+  fi
+done
+
+# ── Credentials ────────────────────────────────────────────────────────────
+# Credentials must never be stored in this file. They come from the
+# environment or are asked for interactively (the password is not echoed).
+USERNAME="${PAPERLESS_USERNAME:-}"
+PASSWORD="${PAPERLESS_PASSWORD:-}"
+if [ -z "$USERNAME" ]; then
+  read -r -p "Username: " USERNAME || true
+fi
+if [ -z "$PASSWORD" ]; then
+  read -r -s -p "Password: " PASSWORD || true
+  echo >&2
+fi
+if [ -z "$USERNAME" ] || [ -z "$PASSWORD" ]; then
+  echo "Username and password are required." >&2
+  exit 1
+fi
+
+echo "Authenticating..."
+# The JSON body is built from environment variables and piped to curl on stdin
+# so that the password never appears in a process argument list.
+AUTH_RESP=$(
+  PL_USER="$USERNAME" PL_PASS="$PASSWORD" \
+    jq -n -c '{username: env.PL_USER, password: env.PL_PASS}' |
+    curl -sS -X POST \
+      -H "Content-Type: application/json" \
+      --data-binary @- \
+      "$BASE_URL/api/token/"
+) || {
+  echo "Authentication request failed." >&2
+  exit 1
+}
+unset PASSWORD
+# A non-JSON reply (e.g. an HTML error page) must reach the diagnostic below
+# instead of aborting the script on jq's exit status.
+TOKEN=$(jq -r '.token // empty' <<< "$AUTH_RESP" 2>/dev/null) || TOKEN=""
+
+if [ -z "$TOKEN" ]; then
+  echo "Authentication failed. Server responded:" >&2
+  echo "$AUTH_RESP" >&2
+  exit 1
+fi
+echo "Authenticated."
+
+# ── Collect all document IDs ───────────────────────────────────────────────
+echo "Fetching document list..."
+ALL_IDS=()
+PAGE=1
+while true; do
+  RESP=$(curl -sS \
+    -H "Authorization: Token $TOKEN" \
+    "$BASE_URL/api/documents/?page=$PAGE&page_size=100") || {
+    echo "Request for page $PAGE of the document list failed." >&2
+    exit 1
+  }
+
+  # Anything without a "results" array (error payload, HTML page) is reported
+  # instead of being mistaken for the end of the list.
+  if ! jq -e '(.results | type) == "array"' <<< "$RESP" >/dev/null 2>&1; then
+    echo "Unexpected response for page $PAGE of the document list:" >&2
+    echo "$RESP" >&2
+    exit 1
+  fi
+
+  # grep exits 1 when nothing matches; with errexit and pipefail that would
+  # abort the script on an empty page, so a no-match is tolerated here.
+  IDS=$(jq -r '.results[].id' <<< "$RESP" | tr -d '\r' | { grep -E '^[0-9]+$' || true; })
+  if [ -z "$IDS" ]; then
+    break
+  fi
+  while IFS= read -r id; do
+    ALL_IDS+=("$id")
+  done <<< "$IDS"
+
+  NEXT=$(jq -r '.next' <<< "$RESP")
+  if [ "$NEXT" = "null" ]; then
+    break
+  fi
+  PAGE=$((PAGE + 1))
+done
+
+TOTAL=${#ALL_IDS[@]}
+echo "Found $TOTAL documents."
+
+# ── Download in batches ────────────────────────────────────────────────────
+mkdir -p "$OUTPUT_DIR"
+
+# NOTE: batch numbers are positions in the fetched list, so resuming assumes
+# the server lists documents in the same order on every run.
+BATCH_NUM=0
+FAILED_BATCHES=0
+START=0
+while [ "$START" -lt "$TOTAL" ]; do
+  END=$(( START + BATCH_SIZE ))
+  if [ "$END" -gt "$TOTAL" ]; then
+    END=$TOTAL
+  fi
+  BATCH_IDS=("${ALL_IDS[@]:$START:$((END - START))}")
+  BATCH_NUM=$((BATCH_NUM + 1))
+
+  PADDED=$(printf "%03d" "$BATCH_NUM")
+  ZIP_FILE="$OUTPUT_DIR/batch_$PADDED.zip"
+  BATCH_DIR="$OUTPUT_DIR/batch_$PADDED"
+  PARTIAL_DIR="$BATCH_DIR.partial"
+
+  # Skip already-extracted batches (resume support)
+  if [ -d "$BATCH_DIR" ]; then
+    echo "Skipping batch $BATCH_NUM (already extracted)."
+    START=$END
+    continue
+  fi
+
+  echo "Downloading batch $BATCH_NUM (docs $((START+1))–$END of $TOTAL)..."
+
+  # The IDs are digit-only (filtered above), so jq slurps them as numbers.
+  PAYLOAD=$(printf '%s\n' "${BATCH_IDS[@]}" | jq -s -c '{documents: ., content: "originals"}')
+
+  ATTEMPT=0
+  SUCCESS=false
+  while [ "$ATTEMPT" -lt "$MAX_RETRIES" ]; do
+    ATTEMPT=$((ATTEMPT + 1))
+
+    # --write-out prints the response Content-Type on stdout while the body is
+    # written to ZIP_FILE. curl's exit status is captured rather than left to
+    # errexit, so timeouts and connection errors are retried as well.
+    CURL_RC=0
+    CONTENT_TYPE=$(curl -sS --max-time 300 -X POST \
+      -H "Authorization: Token $TOKEN" \
+      -H "Content-Type: application/json" \
+      -d "$PAYLOAD" \
+      --output "$ZIP_FILE" \
+      --write-out '%{content_type}' \
+      "$BASE_URL/api/documents/bulk_download/") || CURL_RC=$?
+
+    if [ "$CURL_RC" -ne 0 ]; then
+      echo "  Attempt $ATTEMPT: curl failed (exit code $CURL_RC)." >&2
+    elif [ ! -s "$ZIP_FILE" ]; then
+      echo "  Attempt $ATTEMPT: empty response." >&2
+    elif grep -qiE 'zip|octet-stream' <<< "$CONTENT_TYPE"; then
+      SUCCESS=true
+      break
+    else
+      echo "  Attempt $ATTEMPT: unexpected response (${CONTENT_TYPE}):" >&2
+      head -c 2000 "$ZIP_FILE" >&2
+      echo >&2
+    fi
+
+    rm -f -- "$ZIP_FILE"
+    if [ "$ATTEMPT" -lt "$MAX_RETRIES" ]; then
+      echo "  Retrying in ${RETRY_DELAY}s..." >&2
+      sleep "$RETRY_DELAY"
+    fi
+  done
+
+  if [ "$SUCCESS" = false ]; then
+    echo "  Batch $BATCH_NUM failed after $MAX_RETRIES attempts, skipping." >&2
+    FAILED_BATCHES=$((FAILED_BATCHES + 1))
+    START=$END
+    continue
+  fi
+
+  # Unpack into a scratch directory and rename it only on success: the
+  # existence of BATCH_DIR marks a batch as done, so it must never hold a
+  # partial extraction.
+  rm -rf -- "${PARTIAL_DIR:?}"
+  mkdir -p "$PARTIAL_DIR"
+  if unzip -q "$ZIP_FILE" -d "$PARTIAL_DIR"; then
+    mv -- "$PARTIAL_DIR" "$BATCH_DIR"
+    rm -f -- "$ZIP_FILE"
+    echo "  Extracted to $BATCH_DIR/"
+  else
+    echo "  Batch $BATCH_NUM: extraction failed, skipping." >&2
+    rm -rf -- "${PARTIAL_DIR:?}"
+    rm -f -- "$ZIP_FILE"
+    FAILED_BATCHES=$((FAILED_BATCHES + 1))
+  fi
+
+  START=$END
+done
+
+if [ "$FAILED_BATCHES" -gt 0 ]; then
+  echo "Done with errors: $FAILED_BATCHES batch(es) failed; run again to retry them." >&2
+  exit 1
+fi
+echo "Done. All files saved to $OUTPUT_DIR/"