refactor(document): move document domain core to document/ package
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
144
scripts/CLAUDE.md
Normal file
144
scripts/CLAUDE.md
Normal file
@@ -0,0 +1,144 @@
|
||||
# Scripts — Familienarchiv
|
||||
|
||||
## Overview
|
||||
|
||||
Utility scripts for development, data management, model downloads, and database operations. These are standalone shell and Python scripts used outside the normal application runtime.
|
||||
|
||||
## Scripts
|
||||
|
||||
### `reset-db.sh`
|
||||
**Purpose**: Hard-reset the development database, wiping all documents, persons, tags, and related data.
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
./scripts/reset-db.sh
|
||||
# Type 'yes' to confirm
|
||||
```
|
||||
|
||||
**What it truncates:**
|
||||
- `transcription_block_versions`
|
||||
- `transcription_blocks`
|
||||
- `comment_mentions`
|
||||
- `document_comments`
|
||||
- `document_annotations`
|
||||
- `document_versions`
|
||||
- `notifications`
|
||||
- `documents`
|
||||
- `person_name_aliases`
|
||||
- `persons`
|
||||
- `tag`
|
||||
|
||||
> ⚠️ **Destructive operation** — only for development!
|
||||
|
||||
---
|
||||
|
||||
### `rebuild-frontend.sh`
|
||||
**Purpose**: Force a clean rebuild of the frontend Docker container.
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
./scripts/rebuild-frontend.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `download-kraken-models.sh`
|
||||
**Purpose**: Download Kraken HTR models for German Kurrent and Sütterlin scripts.
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
./scripts/download-kraken-models.sh
|
||||
```
|
||||
|
||||
Downloads models into `./ocr-service/models/` or the `ocr_models` Docker volume. Models are ~100-500 MB each.
|
||||
|
||||
---
|
||||
|
||||
### `download-paperless.sh`
|
||||
**Purpose**: Download exported documents from a Paperless-ngx instance.
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
./scripts/download-paperless.sh
|
||||
```
|
||||
|
||||
Requires environment variables or config for the Paperless API endpoint and token.
|
||||
|
||||
---
|
||||
|
||||
### `flatten-paperless.sh`
|
||||
**Purpose**: Flatten nested Paperless export directories into a single import-ready structure.
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
./scripts/flatten-paperless.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `generate_data.py`
|
||||
**Purpose**: Generate synthetic test data for development.
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
python scripts/generate_data.py
|
||||
```
|
||||
|
||||
Generates fake documents, persons, and tags suitable for load testing or UI development.
|
||||
|
||||
---
|
||||
|
||||
### `prepare_historical_dict.py`
|
||||
**Purpose**: Build a historical German word dictionary for the OCR spell-checker.
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
python scripts/prepare_historical_dict.py
|
||||
```
|
||||
|
||||
Processes raw word lists into the format expected by `ocr-service/spell_check.py`.
|
||||
|
||||
---
|
||||
|
||||
### `schema.sql`
|
||||
**Purpose**: Complete database schema dump for reference.
|
||||
|
||||
**Note**: Flyway migrations in `backend/src/main/resources/db/migration/` are the source of truth for schema evolution. `schema.sql` is a snapshot for quick reference only.
|
||||
|
||||
---
|
||||
|
||||
### `large-data.sql`
|
||||
**Purpose**: Pre-seeded dataset with a large number of documents for performance testing.
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Import into PostgreSQL
|
||||
docker exec -i archive-db psql -U archive_user -d family_archive_db < scripts/large-data.sql
|
||||
```
|
||||
|
||||
## How to Use
|
||||
|
||||
Most scripts should be run from the **repository root**:
|
||||
|
||||
```bash
|
||||
# Database reset
|
||||
./scripts/reset-db.sh
|
||||
|
||||
# Model download
|
||||
./scripts/download-kraken-models.sh
|
||||
|
||||
# Data generation
|
||||
python scripts/generate_data.py
|
||||
```
|
||||
|
||||
Ensure scripts are executable:
|
||||
```bash
|
||||
chmod +x scripts/*.sh
|
||||
```
|
||||
|
||||
## Adding New Scripts
|
||||
|
||||
1. Place the script in `scripts/`
|
||||
2. Add a header comment describing purpose and usage
|
||||
3. Make it executable (`chmod +x`)
|
||||
4. Document it in this `CLAUDE.md`
|
||||
111
scripts/clean-e2e-data.sh
Executable file
111
scripts/clean-e2e-data.sh
Executable file
@@ -0,0 +1,111 @@
|
||||
#!/bin/bash
# Removes E2E test data that leaked into the database.
# Documents with title LIKE 'E2E%' and persons whose last_name ends with a
# timestamp suffix (Sender-<ts> / Receiver-<ts> from bilateral fixtures), plus
# the manually-created E2E Testperson from persons.spec.ts.
#
# Usage: ./scripts/clean-e2e-data.sh [--dry-run]
set -euo pipefail

readonly CONTAINER="archive-db"
readonly DB_USER="archive_user"
readonly DB_NAME="family_archive_db"

DRY_RUN=false
if [[ "${1:-}" == "--dry-run" ]]; then
  DRY_RUN=true
fi

# run_sql SQL [extra psql flags...]
# Runs SQL inside the database container. Any extra arguments are forwarded
# to psql — the previous version read only $1, so the trailing `--quiet`
# on the cleanup call below was silently ignored.
run_sql() {
  local sql=$1
  shift
  docker exec "$CONTAINER" psql -U "$DB_USER" -d "$DB_NAME" "$@" -c "$sql"
}

# ── Summary (always printed; doubles as the --dry-run report) ────────────────

echo ""
echo "E2E test data found in the database:"
echo ""

run_sql "
SELECT count(*) AS e2e_documents
FROM documents
WHERE title LIKE 'E2E%';
"

run_sql "
SELECT count(*) AS e2e_persons
FROM persons
WHERE
  -- bilateral fixture: Visual Sender-<ts>, A11y Receiver-<ts>, etc.
  last_name ~ '^(Sender|Receiver)-[0-9]{10,}$'
  -- persons.spec.ts: E2E Testperson
  OR (first_name = 'E2E' AND last_name = 'Testperson');
"

run_sql "
SELECT count(*) AS e2e_users
FROM users
WHERE first_name = 'E2E' AND last_name = 'Testuser';
"

if $DRY_RUN; then
  echo "Dry run — no changes made."
  exit 0
fi

# ── Confirmation ─────────────────────────────────────────────────────────────

echo ""
echo "This will permanently delete all E2E test documents, persons, and users."
read -rp "Type 'yes' to continue: " CONFIRM
if [[ "$CONFIRM" != "yes" ]]; then
  echo "Aborted."
  exit 0
fi

# ── Cleanup ──────────────────────────────────────────────────────────────────

echo ""
echo "Deleting E2E test data..."

run_sql "
-- Notifications have no FK on document_id, must be cleaned manually.
DELETE FROM notifications
WHERE document_id IN (SELECT id FROM documents WHERE title LIKE 'E2E%');

-- Delete E2E documents. All dependent tables cascade:
--   document_receivers, document_tags, document_training_labels,
--   document_versions, document_annotations, document_comments,
--   comment_mentions, transcription_blocks, transcription_block_versions,
--   block_mentioned_persons
DELETE FROM documents WHERE title LIKE 'E2E%';

-- Match all test persons:
--   bilateral fixtures → last_name = 'Sender-<timestamp>' or 'Receiver-<timestamp>'
--   persons.spec.ts    → first_name = 'E2E', last_name = 'Testperson'
-- Nullify sender FK on any non-E2E documents first (prevents FK violation).
UPDATE documents
SET sender_id = NULL
WHERE sender_id IN (
  SELECT id FROM persons
  WHERE last_name ~ '^(Sender|Receiver)-[0-9]{10,}$'
     OR (first_name = 'E2E' AND last_name = 'Testperson')
);

-- Remove receiver links from non-E2E documents (same edge case).
DELETE FROM document_receivers
WHERE person_id IN (
  SELECT id FROM persons
  WHERE last_name ~ '^(Sender|Receiver)-[0-9]{10,}$'
     OR (first_name = 'E2E' AND last_name = 'Testperson')
);

-- Delete test persons. person_name_aliases cascade automatically.
DELETE FROM persons
WHERE last_name ~ '^(Sender|Receiver)-[0-9]{10,}$'
   OR (first_name = 'E2E' AND last_name = 'Testperson');

-- Delete leaked E2E users (created but not cleaned up in admin.spec.ts).
DELETE FROM users_groups
WHERE user_id IN (SELECT id FROM users WHERE first_name = 'E2E' AND last_name = 'Testuser');

DELETE FROM users
WHERE first_name = 'E2E' AND last_name = 'Testuser';
" --quiet

echo "Done."
|
||||
125
scripts/download-paperless.sh
Executable file
125
scripts/download-paperless.sh
Executable file
@@ -0,0 +1,125 @@
|
||||
#!/bin/bash
# Download all documents (originals) from a Paperless-ngx instance in
# zip batches, with retry and resume support.
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────
BASE_URL="https://dg-familienarchiv.de"
BATCH_SIZE=10
OUTPUT_DIR="./paperless-download"
MAX_RETRIES=3
RETRY_DELAY=10 # seconds between retries
# ─────────────────────────────────────────────────────────────────────────────

# Prompt for credentials. SECURITY: a previous revision hardcoded a real
# username/password here — that password must be rotated. Never commit
# credentials; -s keeps the password off the terminal and out of history.
read -rp  "Paperless username: " USERNAME
read -rsp "Paperless password: " PASSWORD
echo ""

echo "Authenticating..."
AUTH_BODY=$(jq -n --arg u "$USERNAME" --arg p "$PASSWORD" '{username: $u, password: $p}')
AUTH_RESP=$(curl -s -X POST \
  -H "Content-Type: application/json" \
  -d "$AUTH_BODY" \
  "$BASE_URL/api/token/")
TOKEN=$(echo "$AUTH_RESP" | jq -r '.token')

if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
  echo "Authentication failed. Server responded:" >&2
  echo "$AUTH_RESP" >&2
  exit 1
fi
echo "Authenticated."

# ── Collect all document IDs ───────────────────────────────────────────────
echo "Fetching document list..."
ALL_IDS=()
PAGE=1
while true; do
  RESP=$(curl -s \
    -H "Authorization: Token $TOKEN" \
    "$BASE_URL/api/documents/?page=$PAGE&page_size=100")

  # `|| true` guards the empty-page case: under `set -o pipefail` a no-match
  # grep returns 1 and would abort the script instead of ending the loop.
  IDS=$(echo "$RESP" | jq -r '.results[].id' | tr -d '\r' | grep -E '^[0-9]+$' || true)
  [ -z "$IDS" ] && break
  while IFS= read -r id; do
    ALL_IDS+=("$id")
  done <<< "$IDS"

  NEXT=$(echo "$RESP" | jq -r '.next')
  [ "$NEXT" = "null" ] && break
  PAGE=$((PAGE + 1))
done

TOTAL=${#ALL_IDS[@]}
echo "Found $TOTAL documents."

# ── Download in batches ────────────────────────────────────────────────────
mkdir -p "$OUTPUT_DIR"

BATCH_NUM=0
START=0
while [ "$START" -lt "$TOTAL" ]; do
  END=$(( START + BATCH_SIZE ))
  [ "$END" -gt "$TOTAL" ] && END=$TOTAL
  BATCH_IDS=("${ALL_IDS[@]:$START:$((END - START))}")
  BATCH_NUM=$((BATCH_NUM + 1))

  PADDED=$(printf "%03d" "$BATCH_NUM")
  ZIP_FILE="$OUTPUT_DIR/batch_$PADDED.zip"
  BATCH_DIR="$OUTPUT_DIR/batch_$PADDED"

  # Skip already-extracted batches (resume support)
  if [ -d "$BATCH_DIR" ]; then
    echo "Skipping batch $BATCH_NUM (already extracted)."
    START=$END
    continue
  fi

  echo "Downloading batch $BATCH_NUM (docs $((START+1))–$END of $TOTAL)..."

  JSON_IDS=$(printf '%s\n' "${BATCH_IDS[@]}" | grep -E '^[0-9]+$' | jq -s '[.[] | tonumber]')

  ATTEMPT=0
  SUCCESS=false
  while [ "$ATTEMPT" -lt "$MAX_RETRIES" ]; do
    ATTEMPT=$((ATTEMPT + 1))

    # -D - dumps response headers to stdout while the body goes to $ZIP_FILE;
    # we grep the Content-Type header to tell a real zip from an error page.
    CONTENT_TYPE=$(curl -s --max-time 300 -X POST \
      -H "Authorization: Token $TOKEN" \
      -H "Content-Type: application/json" \
      -d "{\"documents\": $JSON_IDS, \"content\": \"originals\"}" \
      "$BASE_URL/api/documents/bulk_download/" \
      -D - \
      --output "$ZIP_FILE" | grep -i '^content-type:' | tr -d '\r')

    if [ ! -s "$ZIP_FILE" ]; then
      echo "  Attempt $ATTEMPT: empty response, retrying in ${RETRY_DELAY}s..." >&2
      rm -f "$ZIP_FILE"
      sleep "$RETRY_DELAY"
      continue
    fi

    if echo "$CONTENT_TYPE" | grep -qi 'zip\|octet-stream'; then
      SUCCESS=true
      break
    else
      echo "  Attempt $ATTEMPT: unexpected response (${CONTENT_TYPE}):" >&2
      cat "$ZIP_FILE" >&2
      rm -f "$ZIP_FILE"
      sleep "$RETRY_DELAY"
    fi
  done

  if [ "$SUCCESS" = false ]; then
    echo "  Batch $BATCH_NUM failed after $MAX_RETRIES attempts, skipping." >&2
    START=$END
    continue
  fi

  mkdir -p "$BATCH_DIR"
  unzip -q "$ZIP_FILE" -d "$BATCH_DIR"
  rm "$ZIP_FILE"
  echo "  Extracted to $BATCH_DIR/"

  START=$END
done

echo "Done. All files saved to $OUTPUT_DIR/"
|
||||
32
scripts/flatten-paperless.sh
Executable file
32
scripts/flatten-paperless.sh
Executable file
@@ -0,0 +1,32 @@
|
||||
#!/bin/bash
# Flatten the nested Paperless export tree into a single flat directory,
# dropping the "YYYY-MM-DD " date prefix Paperless puts on file names.
# Files are copied (originals stay in place); name collisions are skipped.
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────
INPUT_DIR="./paperless-download"
OUTPUT_DIR="./paperless-flat"
# ─────────────────────────────────────────────────────────────────────────────

mkdir -p "$OUTPUT_DIR"

copied=0
skipped=0

# NUL-delimited find loop: safe for file names containing spaces/newlines.
while IFS= read -r -d '' src; do
  name=${src##*/}

  # Strip a leading "YYYY-MM-DD " date prefix, if present.
  if [[ "$name" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}\ (.*)$ ]]; then
    name=${BASH_REMATCH[1]}
  fi

  dest="$OUTPUT_DIR/$name"

  if [ -e "$dest" ]; then
    echo "Skipping (already exists): $name" >&2
    skipped=$((skipped + 1))
    continue
  fi

  cp "$src" "$dest"
  copied=$((copied + 1))
done < <(find "$INPUT_DIR" -mindepth 2 -name "*.pdf" -print0)

echo "Done. Copied $copied files to $OUTPUT_DIR/ ($skipped skipped as duplicates)."
|
||||
26
scripts/reset-db.sh
Executable file
26
scripts/reset-db.sh
Executable file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
# Hard-reset the development database: wipes ALL documents, persons, tags,
# and related data in the archive-db container.
# ⚠️ Destructive — development use only.
set -euo pipefail

readonly CONTAINER="archive-db"
readonly DB_USER="archive_user"
readonly DB_NAME="family_archive_db"

echo "This will delete ALL documents, persons, and tags from the database."
read -rp "Type 'yes' to continue: " CONFIRM
# Explicit if instead of `[ … ] && … && exit 0`: the &&-chain leaves a
# non-zero status when the user confirms, which is fragile under errexit.
if [ "$CONFIRM" != "yes" ]; then
  echo "Aborted."
  exit 0
fi

# One multi-table TRUNCATE is atomic and avoids the redundant re-truncation
# that eleven separate `TRUNCATE … CASCADE` statements caused; CASCADE still
# clears any dependent tables not listed explicitly.
docker exec "$CONTAINER" psql -U "$DB_USER" -d "$DB_NAME" --quiet -c "
  TRUNCATE
    transcription_block_versions,
    transcription_blocks,
    comment_mentions,
    document_comments,
    document_annotations,
    document_versions,
    notifications,
    documents,
    person_name_aliases,
    persons,
    tag
  CASCADE;
"

echo "Done. Database is clean."
|
||||
Reference in New Issue
Block a user