refactor(document): move document domain core to document/ package

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-05 12:39:20 +02:00
parent bb7d872a61
commit e85057bed2
2371 changed files with 385726 additions and 1971 deletions

144
scripts/CLAUDE.md Normal file
View File

@@ -0,0 +1,144 @@
# Scripts — Familienarchiv
## Overview
Utility scripts for development, data management, model downloads, and database operations. These are standalone shell and Python scripts used outside the normal application runtime.
## Scripts
### `reset-db.sh`
**Purpose**: Hard-reset the development database, wiping all documents, persons, tags, and related data.
**Usage:**
```bash
./scripts/reset-db.sh
# Type 'yes' to confirm
```
**What it truncates:**
- `transcription_block_versions`
- `transcription_blocks`
- `comment_mentions`
- `document_comments`
- `document_annotations`
- `document_versions`
- `notifications`
- `documents`
- `person_name_aliases`
- `persons`
- `tag`
> ⚠️ **Destructive operation** — only for development!
---
### `rebuild-frontend.sh`
**Purpose**: Force a clean rebuild of the frontend Docker container.
**Usage:**
```bash
./scripts/rebuild-frontend.sh
```
---
### `download-kraken-models.sh`
**Purpose**: Download Kraken HTR models for German Kurrent and Sütterlin scripts.
**Usage:**
```bash
./scripts/download-kraken-models.sh
```
Downloads models into `./ocr-service/models/` or the `ocr_models` Docker volume. Models are ~100-500 MB each.
---
### `download-paperless.sh`
**Purpose**: Download exported documents from a Paperless-ngx instance.
**Usage:**
```bash
./scripts/download-paperless.sh
```
Authenticates against the Paperless instance with a username and password (`PAPERLESS_USERNAME` / `PAPERLESS_PASSWORD`, or an interactive prompt) and obtains an API token automatically.
---
### `flatten-paperless.sh`
**Purpose**: Flatten nested Paperless export directories into a single import-ready structure.
**Usage:**
```bash
./scripts/flatten-paperless.sh
```
---
### `generate_data.py`
**Purpose**: Generate synthetic test data for development.
**Usage:**
```bash
python scripts/generate_data.py
```
Generates fake documents, persons, and tags suitable for load testing or UI development.
---
### `prepare_historical_dict.py`
**Purpose**: Build a historical German word dictionary for the OCR spell-checker.
**Usage:**
```bash
python scripts/prepare_historical_dict.py
```
Processes raw word lists into the format expected by `ocr-service/spell_check.py`.
---
### `schema.sql`
**Purpose**: Complete database schema dump for reference.
**Note**: Flyway migrations in `backend/src/main/resources/db/migration/` are the source of truth for schema evolution. `schema.sql` is a snapshot for quick reference only.
---
### `large-data.sql`
**Purpose**: Pre-seeded dataset with a large number of documents for performance testing.
**Usage:**
```bash
# Import into PostgreSQL
docker exec -i archive-db psql -U archive_user -d family_archive_db < scripts/large-data.sql
```
## How to Use
Most scripts should be run from the **repository root**:
```bash
# Database reset
./scripts/reset-db.sh
# Model download
./scripts/download-kraken-models.sh
# Data generation
python scripts/generate_data.py
```
Ensure scripts are executable:
```bash
chmod +x scripts/*.sh
```
## Adding New Scripts
1. Place the script in `scripts/`
2. Add a header comment describing purpose and usage
3. Make it executable (`chmod +x`)
4. Document it in this `CLAUDE.md`

111
scripts/clean-e2e-data.sh Executable file
View File

@@ -0,0 +1,111 @@
#!/bin/bash
# Removes E2E test data that leaked into the database.
# Documents with title LIKE 'E2E%' and persons whose last_name ends with a
# timestamp suffix (Sender-<ts> / Receiver-<ts> from bilateral fixtures), plus
# the manually-created E2E Testperson from persons.spec.ts.
#
# Usage:
#   ./scripts/clean-e2e-data.sh            # report counts, confirm, delete
#   ./scripts/clean-e2e-data.sh --dry-run  # report counts only
set -euo pipefail
readonly CONTAINER="archive-db"
readonly DB_USER="archive_user"
readonly DB_NAME="family_archive_db"
DRY_RUN=false
if [[ "${1:-}" == "--dry-run" ]]; then
  DRY_RUN=true
fi
# Run a SQL statement inside the database container.
#   $1  - SQL text
#   $2+ - additional psql flags (e.g. --quiet), forwarded verbatim
# (The original dropped everything after $1, so the --quiet flag on the
# cleanup statement was silently ignored.)
run_sql() {
  local sql=$1
  shift
  docker exec "$CONTAINER" psql -U "$DB_USER" -d "$DB_NAME" "$@" -c "$sql"
}
# ── Dry-run summary ──────────────────────────────────────────────────────────
echo ""
echo "E2E test data found in the database:"
echo ""
run_sql "
SELECT count(*) AS e2e_documents
FROM documents
WHERE title LIKE 'E2E%';
"
run_sql "
SELECT count(*) AS e2e_persons
FROM persons
WHERE
-- bilateral fixture: Visual Sender-<ts>, A11y Receiver-<ts>, etc.
last_name ~ '^(Sender|Receiver)-[0-9]{10,}$'
-- persons.spec.ts: E2E Testperson
OR (first_name = 'E2E' AND last_name = 'Testperson');
"
run_sql "
SELECT count(*) AS e2e_users
FROM users
WHERE first_name = 'E2E' AND last_name = 'Testuser';
"
if $DRY_RUN; then
  echo "Dry run — no changes made."
  exit 0
fi
# ── Confirmation ─────────────────────────────────────────────────────────────
echo ""
echo "This will permanently delete all E2E test documents, persons, and users."
read -rp "Type 'yes' to continue: " CONFIRM
if [[ "$CONFIRM" != "yes" ]]; then
  echo "Aborted."
  exit 0
fi
# ── Cleanup ──────────────────────────────────────────────────────────────────
echo ""
echo "Deleting E2E test data..."
run_sql "
-- Notifications have no FK on document_id, must be cleaned manually.
DELETE FROM notifications
WHERE document_id IN (SELECT id FROM documents WHERE title LIKE 'E2E%');
-- Delete E2E documents. All dependent tables cascade:
-- document_receivers, document_tags, document_training_labels,
-- document_versions, document_annotations, document_comments,
-- comment_mentions, transcription_blocks, transcription_block_versions,
-- block_mentioned_persons
DELETE FROM documents WHERE title LIKE 'E2E%';
-- Match all test persons:
-- bilateral fixtures → last_name = 'Sender-<timestamp>' or 'Receiver-<timestamp>'
-- persons.spec.ts → first_name = 'E2E', last_name = 'Testperson'
-- Nullify sender FK on any non-E2E documents first (prevents FK violation).
UPDATE documents
SET sender_id = NULL
WHERE sender_id IN (
SELECT id FROM persons
WHERE last_name ~ '^(Sender|Receiver)-[0-9]{10,}$'
OR (first_name = 'E2E' AND last_name = 'Testperson')
);
-- Remove receiver links from non-E2E documents (same edge case).
DELETE FROM document_receivers
WHERE person_id IN (
SELECT id FROM persons
WHERE last_name ~ '^(Sender|Receiver)-[0-9]{10,}$'
OR (first_name = 'E2E' AND last_name = 'Testperson')
);
-- Delete test persons. person_name_aliases cascade automatically.
DELETE FROM persons
WHERE last_name ~ '^(Sender|Receiver)-[0-9]{10,}$'
OR (first_name = 'E2E' AND last_name = 'Testperson');
-- Delete leaked E2E users (created but not cleaned up in admin.spec.ts).
DELETE FROM users_groups
WHERE user_id IN (SELECT id FROM users WHERE first_name = 'E2E' AND last_name = 'Testuser');
DELETE FROM users
WHERE first_name = 'E2E' AND last_name = 'Testuser';
" --quiet
echo "Done."

125
scripts/download-paperless.sh Executable file
View File

@@ -0,0 +1,125 @@
#!/bin/bash
# Download all documents from a Paperless-ngx instance as zip batches and
# extract them into $OUTPUT_DIR/batch_NNN/. Supports resume: already-extracted
# batches are skipped on re-run.
#
# Credentials come from $PAPERLESS_USERNAME / $PAPERLESS_PASSWORD when set,
# otherwise the script prompts interactively (password input hidden).
# Never hardcode secrets in the script.
set -euo pipefail
# ── Config ────────────────────────────────────────────────────────────────────
BASE_URL="https://dg-familienarchiv.de"
BATCH_SIZE=10
OUTPUT_DIR="./paperless-download"
MAX_RETRIES=3
RETRY_DELAY=10 # seconds between retries
# ─────────────────────────────────────────────────────────────────────────────
# Prompt for credentials (fall back to env vars for non-interactive use).
USERNAME="${PAPERLESS_USERNAME:-}"
PASSWORD="${PAPERLESS_PASSWORD:-}"
if [ -z "$USERNAME" ]; then
  read -rp "Paperless username: " USERNAME
fi
if [ -z "$PASSWORD" ]; then
  read -rsp "Paperless password: " PASSWORD
  echo ""
fi
echo "Authenticating..."
# Build the JSON body with jq so special characters in the password are safe.
AUTH_BODY=$(jq -n --arg u "$USERNAME" --arg p "$PASSWORD" '{username: $u, password: $p}')
AUTH_RESP=$(curl -s -X POST \
  -H "Content-Type: application/json" \
  -d "$AUTH_BODY" \
  "$BASE_URL/api/token/")
TOKEN=$(echo "$AUTH_RESP" | jq -r '.token')
if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
  echo "Authentication failed. Server responded:" >&2
  echo "$AUTH_RESP" >&2
  exit 1
fi
echo "Authenticated."
# ── Collect all document IDs ───────────────────────────────────────────────
echo "Fetching document list..."
ALL_IDS=()
PAGE=1
while true; do
  RESP=$(curl -s \
    -H "Authorization: Token $TOKEN" \
    "$BASE_URL/api/documents/?page=$PAGE&page_size=100")
  # '|| true' keeps pipefail from aborting the script when a page has no
  # results (grep exits 1 on zero matches).
  IDS=$(echo "$RESP" | jq -r '.results[].id' | tr -d '\r' | grep -E '^[0-9]+$' || true)
  [ -z "$IDS" ] && break
  while IFS= read -r id; do
    ALL_IDS+=("$id")
  done <<< "$IDS"
  NEXT=$(echo "$RESP" | jq -r '.next')
  [ "$NEXT" = "null" ] && break
  PAGE=$((PAGE + 1))
done
TOTAL=${#ALL_IDS[@]}
echo "Found $TOTAL documents."
# ── Download in batches ────────────────────────────────────────────────────
mkdir -p "$OUTPUT_DIR"
BATCH_NUM=0
START=0
while [ $START -lt $TOTAL ]; do
  END=$(( START + BATCH_SIZE ))
  [ $END -gt $TOTAL ] && END=$TOTAL
  BATCH_IDS=("${ALL_IDS[@]:$START:$((END - START))}")
  BATCH_NUM=$((BATCH_NUM + 1))
  PADDED=$(printf "%03d" "$BATCH_NUM")
  ZIP_FILE="$OUTPUT_DIR/batch_$PADDED.zip"
  BATCH_DIR="$OUTPUT_DIR/batch_$PADDED"
  # Skip already-extracted batches (resume support)
  if [ -d "$BATCH_DIR" ]; then
    echo "Skipping batch $BATCH_NUM (already extracted)."
    START=$END
    continue
  fi
  echo "Downloading batch $BATCH_NUM (docs $((START+1))-$END of $TOTAL)..."
  JSON_IDS=$(printf '%s\n' "${BATCH_IDS[@]}" | grep -E '^[0-9]+$' | jq -s '[.[] | tonumber]')
  ATTEMPT=0
  SUCCESS=false
  while [ $ATTEMPT -lt $MAX_RETRIES ]; do
    ATTEMPT=$((ATTEMPT + 1))
    # Headers go to stdout (-D -), the zip body to $ZIP_FILE. '|| true'
    # tolerates curl timeouts and a missing Content-Type header; both cases
    # are caught by the empty-file / content-type checks below.
    CONTENT_TYPE=$(curl -s --max-time 300 -X POST \
      -H "Authorization: Token $TOKEN" \
      -H "Content-Type: application/json" \
      -d "{\"documents\": $JSON_IDS, \"content\": \"originals\"}" \
      "$BASE_URL/api/documents/bulk_download/" \
      -D - \
      --output "$ZIP_FILE" | grep -i '^content-type:' | tr -d '\r' || true)
    if [ ! -s "$ZIP_FILE" ]; then
      echo "  Attempt $ATTEMPT: empty response, retrying in ${RETRY_DELAY}s..." >&2
      rm -f "$ZIP_FILE"
      sleep "$RETRY_DELAY"
      continue
    fi
    if echo "$CONTENT_TYPE" | grep -qi 'zip\|octet-stream'; then
      SUCCESS=true
      break
    else
      echo "  Attempt $ATTEMPT: unexpected response (${CONTENT_TYPE}):" >&2
      cat "$ZIP_FILE" >&2
      rm -f "$ZIP_FILE"
      sleep "$RETRY_DELAY"
    fi
  done
  if [ "$SUCCESS" = false ]; then
    echo "  Batch $BATCH_NUM failed after $MAX_RETRIES attempts, skipping." >&2
    START=$END
    continue
  fi
  mkdir -p "$BATCH_DIR"
  unzip -q "$ZIP_FILE" -d "$BATCH_DIR"
  rm "$ZIP_FILE"
  echo "  Extracted to $BATCH_DIR/"
  START=$END
done
echo "Done. All files saved to $OUTPUT_DIR/"

32
scripts/flatten-paperless.sh Executable file
View File

@@ -0,0 +1,32 @@
#!/bin/bash
# Flatten nested Paperless export directories into a single import-ready
# directory, stripping the "YYYY-MM-DD " date prefix Paperless puts on
# exported filenames. Existing destination files are never overwritten.
set -euo pipefail
# ── Config ────────────────────────────────────────────────────────────────────
INPUT_DIR="./paperless-download"
OUTPUT_DIR="./paperless-flat"
# ─────────────────────────────────────────────────────────────────────────────
# Strip a leading "YYYY-MM-DD " date prefix from a filename, if present.
# Pure bash (no per-file sed fork); prints the result on stdout.
strip_date_prefix() {
  local name=$1
  local re='^[0-9]{4}-[0-9]{2}-[0-9]{2} (.*)$'
  if [[ "$name" =~ $re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  else
    printf '%s\n' "$name"
  fi
}
mkdir -p "$OUTPUT_DIR"
COPIED=0
SKIPPED=0
# NUL-delimited find handles filenames with spaces/newlines safely.
while IFS= read -r -d '' FILE; do
  BASENAME=$(basename "$FILE")
  NEWNAME=$(strip_date_prefix "$BASENAME")
  DEST="$OUTPUT_DIR/$NEWNAME"
  # Never overwrite: duplicates across batches are counted and skipped.
  if [ -e "$DEST" ]; then
    echo "Skipping (already exists): $NEWNAME" >&2
    SKIPPED=$((SKIPPED + 1))
    continue
  fi
  cp "$FILE" "$DEST"
  COPIED=$((COPIED + 1))
done < <(find "$INPUT_DIR" -mindepth 2 -name "*.pdf" -print0)
echo "Done. Copied $COPIED files to $OUTPUT_DIR/ ($SKIPPED skipped as duplicates)."

26
scripts/reset-db.sh Executable file
View File

@@ -0,0 +1,26 @@
#!/bin/bash
# Hard-reset the development database: wipes all documents, persons, tags
# and their dependent tables. Destructive — development use only.
set -euo pipefail
readonly CONTAINER="archive-db"
readonly DB_USER="archive_user"
readonly DB_NAME="family_archive_db"
echo "This will delete ALL documents, persons, and tags from the database."
read -rp "Type 'yes' to continue: " CONFIRM
if [ "$CONFIRM" != "yes" ]; then
  echo "Aborted."
  exit 0
fi
# Child tables first, then parents; CASCADE clears any remaining FK links.
RESET_SQL="
TRUNCATE transcription_block_versions CASCADE;
TRUNCATE transcription_blocks CASCADE;
TRUNCATE comment_mentions CASCADE;
TRUNCATE document_comments CASCADE;
TRUNCATE document_annotations CASCADE;
TRUNCATE document_versions CASCADE;
TRUNCATE notifications CASCADE;
TRUNCATE documents CASCADE;
TRUNCATE person_name_aliases CASCADE;
TRUNCATE persons CASCADE;
TRUNCATE tag CASCADE;
"
docker exec "$CONTAINER" psql -U "$DB_USER" -d "$DB_NAME" -c "$RESET_SQL" --quiet
echo "Done. Database is clean."