Compare commits
34 Commits
13955a5459
...
305f95a572
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
305f95a572 | ||
|
|
43595aeb8a | ||
|
|
947d8aeb6c | ||
|
|
7ec3e6170d | ||
|
|
7d456d8e8b | ||
|
|
24530cf85b | ||
|
|
57c44cf02f | ||
|
|
48223d5a3d | ||
|
|
04069c0286 | ||
|
|
3c46d820ad | ||
|
|
38d558182a | ||
|
|
25aa05411f | ||
|
|
f522ab633c | ||
|
|
593a6c8a38 | ||
|
|
67c03dab8c | ||
|
|
e302d3d689 | ||
|
|
a9aa1ec924 | ||
|
|
ce2bbf4230 | ||
|
|
69bcb3f8b2 | ||
|
|
34a97cbfa2 | ||
|
|
3d3d4b8616 | ||
|
|
e4719b9487 | ||
|
|
7562a400c0 | ||
|
|
2073a4b64a | ||
|
|
5c7efef307 | ||
|
|
74c9046745 | ||
|
|
81da127381 | ||
|
|
f206c0b9e9 | ||
|
|
15e532eb96 | ||
|
|
f241a71733 | ||
|
|
b83465020a | ||
|
|
f08897b801 | ||
|
|
a5979c4069 | ||
|
|
e8375d6c72 |
@@ -21,9 +21,10 @@ PORT_FRONTEND=5173
|
|||||||
PORT_MAILPIT_UI=8100
|
PORT_MAILPIT_UI=8100
|
||||||
PORT_MAILPIT_SMTP=1025
|
PORT_MAILPIT_SMTP=1025
|
||||||
|
|
||||||
# OCR Training — set a secret token to protect the /train and /segtrain endpoints on the
|
# OCR Training — secret token required to call /train and /segtrain on the OCR service.
|
||||||
# Python OCR microservice. Leave empty to disable token authentication (development only).
|
# The same value is passed to the backend (as APP_OCR_TRAINING_TOKEN) so it can send the token with its training requests. Must not be empty in production.
|
||||||
# OCR_TRAINING_TOKEN=change-me-in-production
|
# Generate with: python3 -c "import secrets; print(secrets.token_hex(32))"
|
||||||
|
OCR_TRAINING_TOKEN=change-me-in-production
|
||||||
|
|
||||||
# Production SMTP — uncomment and fill in to send real emails instead of catching them
|
# Production SMTP — uncomment and fill in to send real emails instead of catching them
|
||||||
# APP_BASE_URL=https://your-domain.example.com
|
# APP_BASE_URL=https://your-domain.example.com
|
||||||
|
|||||||
4
backend/.dockerignore
Normal file
4
backend/.dockerignore
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
target/
|
||||||
|
.git/
|
||||||
|
*.md
|
||||||
|
api_tests/
|
||||||
@@ -1,9 +1,18 @@
|
|||||||
FROM eclipse-temurin:21-jdk
|
FROM eclipse-temurin:21.0.10_7-jdk-noble AS builder
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
EXPOSE 8080
|
# Copy wrapper and POM first — dependency layer is cached separately from source
|
||||||
|
COPY .mvn .mvn
|
||||||
|
COPY mvnw pom.xml ./
|
||||||
|
RUN --mount=type=cache,target=/root/.m2 ./mvnw dependency:go-offline -q
|
||||||
|
|
||||||
# Source code and mvnw are mounted via docker-compose volume at runtime.
|
COPY src ./src
|
||||||
# Maven dependencies are cached in a named volume (~/.m2).
|
# -Dmaven.test.skip=true skips test compilation entirely (not just execution)
|
||||||
CMD ["./mvnw", "spring-boot:run"]
|
RUN --mount=type=cache,target=/root/.m2 ./mvnw clean package -Dmaven.test.skip=true -q
|
||||||
|
|
||||||
|
FROM eclipse-temurin:21.0.10_7-jre-noble
|
||||||
|
WORKDIR /app
|
||||||
|
# Spring Boot's repackage step writes the executable archive as *.jar and renames the pre-repackage artifact to *.jar.original, so the *.jar glob below does not pick up the original
|
||||||
|
COPY --from=builder /app/target/*.jar app.jar
|
||||||
|
EXPOSE 8080
|
||||||
|
CMD ["java", "-jar", "app.jar"]
|
||||||
|
|||||||
@@ -12,5 +12,5 @@ public interface OcrTrainingRunRepository extends JpaRepository<OcrTrainingRun,
|
|||||||
|
|
||||||
Optional<OcrTrainingRun> findFirstByStatus(TrainingStatus status);
|
Optional<OcrTrainingRun> findFirstByStatus(TrainingStatus status);
|
||||||
|
|
||||||
List<OcrTrainingRun> findTop5ByOrderByCreatedAtDesc();
|
List<OcrTrainingRun> findTop10ByOrderByCreatedAtDesc();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -45,6 +45,13 @@ public class OcrTrainingService {
|
|||||||
List<OcrTrainingRun> runs
|
List<OcrTrainingRun> runs
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
|
private void assertNoRunningTraining() {
|
||||||
|
if (trainingRunRepository.findFirstByStatus(TrainingStatus.RUNNING).isPresent()) {
|
||||||
|
throw DomainException.conflict(ErrorCode.TRAINING_ALREADY_RUNNING,
|
||||||
|
"A training run is already in progress");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Not safe for horizontal scaling: training reloads the Kraken model in-process on the
|
// Not safe for horizontal scaling: training reloads the Kraken model in-process on the
|
||||||
// Python OCR service after each run. The DB-level RUNNING constraint (V30 partial unique
|
// Python OCR service after each run. The DB-level RUNNING constraint (V30 partial unique
|
||||||
// index) prevents concurrent training API calls, but cannot prevent two OCR service replicas
|
// index) prevents concurrent training API calls, but cannot prevent two OCR service replicas
|
||||||
@@ -53,10 +60,7 @@ public class OcrTrainingService {
|
|||||||
// Short transaction: guard check + create RUNNING row, then commit immediately.
|
// Short transaction: guard check + create RUNNING row, then commit immediately.
|
||||||
// The DB connection is released before the OCR HTTP call, which can take several minutes.
|
// The DB connection is released before the OCR HTTP call, which can take several minutes.
|
||||||
OcrTrainingRun run = Objects.requireNonNull(txTemplate.execute(status -> {
|
OcrTrainingRun run = Objects.requireNonNull(txTemplate.execute(status -> {
|
||||||
if (trainingRunRepository.findFirstByStatus(TrainingStatus.RUNNING).isPresent()) {
|
assertNoRunningTraining();
|
||||||
throw DomainException.conflict(ErrorCode.TRAINING_ALREADY_RUNNING,
|
|
||||||
"A training run is already in progress");
|
|
||||||
}
|
|
||||||
|
|
||||||
var eligibleBlocks = trainingDataExportService.queryEligibleBlocks();
|
var eligibleBlocks = trainingDataExportService.queryEligibleBlocks();
|
||||||
if (eligibleBlocks.size() < 5) {
|
if (eligibleBlocks.size() < 5) {
|
||||||
@@ -120,10 +124,7 @@ public class OcrTrainingService {
|
|||||||
public OcrTrainingRun triggerSegTraining(UUID triggeredBy) {
|
public OcrTrainingRun triggerSegTraining(UUID triggeredBy) {
|
||||||
// Same pattern as triggerTraining: narrow transactions around DB writes only.
|
// Same pattern as triggerTraining: narrow transactions around DB writes only.
|
||||||
OcrTrainingRun run = Objects.requireNonNull(txTemplate.execute(status -> {
|
OcrTrainingRun run = Objects.requireNonNull(txTemplate.execute(status -> {
|
||||||
if (trainingRunRepository.findFirstByStatus(TrainingStatus.RUNNING).isPresent()) {
|
assertNoRunningTraining();
|
||||||
throw DomainException.conflict(ErrorCode.TRAINING_ALREADY_RUNNING,
|
|
||||||
"A training run is already in progress");
|
|
||||||
}
|
|
||||||
|
|
||||||
var segBlocks = segmentationTrainingExportService.querySegmentationBlocks();
|
var segBlocks = segmentationTrainingExportService.querySegmentationBlocks();
|
||||||
if (segBlocks.size() < 5) {
|
if (segBlocks.size() < 5) {
|
||||||
@@ -162,11 +163,12 @@ public class OcrTrainingService {
|
|||||||
return Objects.requireNonNull(txTemplate.execute(status -> {
|
return Objects.requireNonNull(txTemplate.execute(status -> {
|
||||||
run.setStatus(TrainingStatus.DONE);
|
run.setStatus(TrainingStatus.DONE);
|
||||||
run.setCompletedAt(Instant.now());
|
run.setCompletedAt(Instant.now());
|
||||||
|
run.setCer(result.cer());
|
||||||
run.setLoss(result.loss());
|
run.setLoss(result.loss());
|
||||||
run.setAccuracy(result.accuracy());
|
run.setAccuracy(result.accuracy());
|
||||||
run.setEpochs(result.epochs());
|
run.setEpochs(result.epochs());
|
||||||
OcrTrainingRun updated = trainingRunRepository.save(run);
|
OcrTrainingRun updated = trainingRunRepository.save(run);
|
||||||
log.info("[trainingRun={}] Segmentation training completed — epochs={}", runId, result.epochs());
|
log.info("[trainingRun={}] Segmentation training completed — cer={} epochs={}", runId, result.cer(), result.epochs());
|
||||||
return updated;
|
return updated;
|
||||||
}));
|
}));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
@@ -193,7 +195,7 @@ public class OcrTrainingService {
|
|||||||
int totalOcrBlocks = (int) blockRepository.count();
|
int totalOcrBlocks = (int) blockRepository.count();
|
||||||
int availableSegBlocks = segmentationTrainingExportService.querySegmentationBlocks().size();
|
int availableSegBlocks = segmentationTrainingExportService.querySegmentationBlocks().size();
|
||||||
|
|
||||||
List<OcrTrainingRun> recentRuns = trainingRunRepository.findTop5ByOrderByCreatedAtDesc();
|
List<OcrTrainingRun> recentRuns = trainingRunRepository.findTop10ByOrderByCreatedAtDesc();
|
||||||
OcrTrainingRun lastRun = recentRuns.isEmpty() ? null : recentRuns.get(0);
|
OcrTrainingRun lastRun = recentRuns.isEmpty() ? null : recentRuns.get(0);
|
||||||
|
|
||||||
return new TrainingInfoResponse(
|
return new TrainingInfoResponse(
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ class OcrTrainingServiceTest {
|
|||||||
service = new OcrTrainingService(runRepository, exportService, segExportService, ocrClient, healthClient, blockRepository, txTemplate);
|
service = new OcrTrainingService(runRepository, exportService, segExportService, ocrClient, healthClient, blockRepository, txTemplate);
|
||||||
|
|
||||||
when(blockRepository.count()).thenReturn(0L);
|
when(blockRepository.count()).thenReturn(0L);
|
||||||
when(runRepository.findTop5ByOrderByCreatedAtDesc()).thenReturn(List.of());
|
when(runRepository.findTop10ByOrderByCreatedAtDesc()).thenReturn(List.of());
|
||||||
when(segExportService.querySegmentationBlocks()).thenReturn(List.of());
|
when(segExportService.querySegmentationBlocks()).thenReturn(List.of());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -146,6 +146,90 @@ class OcrTrainingServiceTest {
|
|||||||
run.getStatus() == TrainingStatus.FAILED && run.getErrorMessage() != null));
|
run.getStatus() == TrainingStatus.FAILED && run.getErrorMessage() != null));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── triggerSegTraining ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void triggerSegTraining_throws409_whenRunningRunExists() {
|
||||||
|
when(runRepository.findFirstByStatus(TrainingStatus.RUNNING))
|
||||||
|
.thenReturn(Optional.of(OcrTrainingRun.builder()
|
||||||
|
.id(UUID.randomUUID()).status(TrainingStatus.RUNNING)
|
||||||
|
.blockCount(5).documentCount(2).modelName("blla").build()));
|
||||||
|
|
||||||
|
assertThatThrownBy(() -> service.triggerSegTraining(null))
|
||||||
|
.isInstanceOf(DomainException.class)
|
||||||
|
.extracting("status")
|
||||||
|
.satisfies(s -> assertThat(s.toString()).contains("409"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void triggerSegTraining_throws422_whenFewerThan5Segments() {
|
||||||
|
when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)).thenReturn(Optional.empty());
|
||||||
|
when(segExportService.querySegmentationBlocks()).thenReturn(List.of(
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(UUID.randomUUID()).build(),
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(UUID.randomUUID()).build()
|
||||||
|
));
|
||||||
|
|
||||||
|
assertThatThrownBy(() -> service.triggerSegTraining(null))
|
||||||
|
.isInstanceOf(DomainException.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void triggerSegTraining_createsRunWithBlla_andMarksDoneWithCer() throws Exception {
|
||||||
|
when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)).thenReturn(Optional.empty());
|
||||||
|
|
||||||
|
UUID docA = UUID.randomUUID();
|
||||||
|
UUID docB = UUID.randomUUID();
|
||||||
|
List<TranscriptionBlock> segs = List.of(
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docB).build()
|
||||||
|
);
|
||||||
|
when(segExportService.querySegmentationBlocks()).thenReturn(segs);
|
||||||
|
when(segExportService.exportToZip()).thenReturn(out -> {});
|
||||||
|
when(ocrClient.segtrainModel(any())).thenReturn(new OcrClient.TrainingResult(null, 0.92, 0.08, 5));
|
||||||
|
|
||||||
|
OcrTrainingRun saved = OcrTrainingRun.builder()
|
||||||
|
.id(UUID.randomUUID()).status(TrainingStatus.RUNNING)
|
||||||
|
.blockCount(5).documentCount(2).modelName("blla").build();
|
||||||
|
when(runRepository.save(any())).thenReturn(saved);
|
||||||
|
|
||||||
|
service.triggerSegTraining(null);
|
||||||
|
|
||||||
|
verify(runRepository, atLeastOnce()).save(argThat(run ->
|
||||||
|
run.getStatus() == TrainingStatus.DONE
|
||||||
|
&& "blla".equals(run.getModelName())
|
||||||
|
&& run.getCer() != null));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void triggerSegTraining_marksRunFailed_whenOcrClientThrows() throws Exception {
|
||||||
|
when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)).thenReturn(Optional.empty());
|
||||||
|
|
||||||
|
UUID docA = UUID.randomUUID();
|
||||||
|
List<TranscriptionBlock> segs = List.of(
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
|
||||||
|
TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build()
|
||||||
|
);
|
||||||
|
when(segExportService.querySegmentationBlocks()).thenReturn(segs);
|
||||||
|
when(segExportService.exportToZip()).thenReturn(out -> {});
|
||||||
|
when(ocrClient.segtrainModel(any())).thenThrow(new RuntimeException("seg timeout"));
|
||||||
|
|
||||||
|
OcrTrainingRun saved = OcrTrainingRun.builder()
|
||||||
|
.id(UUID.randomUUID()).status(TrainingStatus.RUNNING)
|
||||||
|
.blockCount(5).documentCount(1).modelName("blla").build();
|
||||||
|
when(runRepository.save(any())).thenReturn(saved);
|
||||||
|
|
||||||
|
service.triggerSegTraining(null);
|
||||||
|
|
||||||
|
verify(runRepository, atLeastOnce()).save(argThat(run ->
|
||||||
|
run.getStatus() == TrainingStatus.FAILED && run.getErrorMessage() != null));
|
||||||
|
}
|
||||||
|
|
||||||
// ─── Orphan recovery ──────────────────────────────────────────────────────
|
// ─── Orphan recovery ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|||||||
@@ -83,11 +83,11 @@ services:
|
|||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
expose:
|
expose:
|
||||||
- "8000"
|
- "8000"
|
||||||
mem_limit: 8g
|
mem_limit: 12g
|
||||||
memswap_limit: 8g
|
memswap_limit: 12g
|
||||||
volumes:
|
volumes:
|
||||||
- ocr_models:/app/models
|
- ocr_models:/app/models
|
||||||
- ocr_cache:/root/.cache
|
- ocr_cache:/root/.cache # Hugging Face / ketos model download cache — prevents re-downloads on container recreate
|
||||||
environment:
|
environment:
|
||||||
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
|
||||||
TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
|
TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
|
||||||
@@ -102,7 +102,7 @@ services:
|
|||||||
interval: 10s
|
interval: 10s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 12
|
retries: 12
|
||||||
start_period: 60s
|
start_period: 120s
|
||||||
|
|
||||||
# --- Backend: Spring Boot ---
|
# --- Backend: Spring Boot ---
|
||||||
backend:
|
backend:
|
||||||
@@ -112,9 +112,7 @@ services:
|
|||||||
container_name: archive-backend
|
container_name: archive-backend
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
volumes:
|
volumes:
|
||||||
- ./backend:/app
|
|
||||||
- ./import:/import
|
- ./import:/import
|
||||||
- maven_cache:/root/.m2
|
|
||||||
depends_on:
|
depends_on:
|
||||||
db:
|
db:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
@@ -145,6 +143,7 @@ services:
|
|||||||
SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false}
|
SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false}
|
||||||
SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false}
|
SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false}
|
||||||
APP_OCR_BASE_URL: http://ocr-service:8000
|
APP_OCR_BASE_URL: http://ocr-service:8000
|
||||||
|
APP_OCR_TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
|
||||||
ports:
|
ports:
|
||||||
- "${PORT_BACKEND}:8080"
|
- "${PORT_BACKEND}:8080"
|
||||||
networks:
|
networks:
|
||||||
@@ -154,7 +153,7 @@ services:
|
|||||||
interval: 15s
|
interval: 15s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 10
|
retries: 10
|
||||||
start_period: 60s
|
start_period: 30s # JAR starts in ~15s; was 60s when compilation happened at startup
|
||||||
|
|
||||||
# --- Frontend: SvelteKit (Dev Server) ---
|
# --- Frontend: SvelteKit (Dev Server) ---
|
||||||
frontend:
|
frontend:
|
||||||
@@ -190,6 +189,5 @@ networks:
|
|||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
frontend_node_modules:
|
frontend_node_modules:
|
||||||
maven_cache:
|
|
||||||
ocr_models:
|
ocr_models:
|
||||||
ocr_cache:
|
ocr_cache:
|
||||||
|
|||||||
@@ -79,6 +79,8 @@
|
|||||||
"docs_list_from": "Von",
|
"docs_list_from": "Von",
|
||||||
"docs_list_to": "An",
|
"docs_list_to": "An",
|
||||||
"docs_list_unknown": "Unbekannt",
|
"docs_list_unknown": "Unbekannt",
|
||||||
|
"docs_group_undated": "Undatiert",
|
||||||
|
"docs_group_unknown": "Unbekannt",
|
||||||
"doc_section_who_when": "Wer & Wann",
|
"doc_section_who_when": "Wer & Wann",
|
||||||
"doc_section_description": "Beschreibung",
|
"doc_section_description": "Beschreibung",
|
||||||
"doc_section_file": "Datei",
|
"doc_section_file": "Datei",
|
||||||
@@ -558,6 +560,7 @@
|
|||||||
"training_history_col_cer": "Fehlerrate",
|
"training_history_col_cer": "Fehlerrate",
|
||||||
"training_status_done": "Fertig",
|
"training_status_done": "Fertig",
|
||||||
"training_status_failed": "Fehler",
|
"training_status_failed": "Fehler",
|
||||||
|
"training_error_detail_label": "Fehlerdetails",
|
||||||
"training_status_running": "Läuft…",
|
"training_status_running": "Läuft…",
|
||||||
"training_seg_heading": "Segmentierung trainieren",
|
"training_seg_heading": "Segmentierung trainieren",
|
||||||
"training_seg_description": "Starte ein neues Training mit annotierten Segmentierungsbereichen, um die Texterkennung zu verbessern.",
|
"training_seg_description": "Starte ein neues Training mit annotierten Segmentierungsbereichen, um die Texterkennung zu verbessern.",
|
||||||
|
|||||||
@@ -79,6 +79,8 @@
|
|||||||
"docs_list_from": "From",
|
"docs_list_from": "From",
|
||||||
"docs_list_to": "To",
|
"docs_list_to": "To",
|
||||||
"docs_list_unknown": "Unknown",
|
"docs_list_unknown": "Unknown",
|
||||||
|
"docs_group_undated": "Undated",
|
||||||
|
"docs_group_unknown": "Unknown",
|
||||||
"doc_section_who_when": "Who & When",
|
"doc_section_who_when": "Who & When",
|
||||||
"doc_section_description": "Description",
|
"doc_section_description": "Description",
|
||||||
"doc_section_file": "File",
|
"doc_section_file": "File",
|
||||||
@@ -558,6 +560,7 @@
|
|||||||
"training_history_col_cer": "Error Rate",
|
"training_history_col_cer": "Error Rate",
|
||||||
"training_status_done": "Done",
|
"training_status_done": "Done",
|
||||||
"training_status_failed": "Failed",
|
"training_status_failed": "Failed",
|
||||||
|
"training_error_detail_label": "Error details",
|
||||||
"training_status_running": "Running…",
|
"training_status_running": "Running…",
|
||||||
"training_seg_heading": "Train segmentation",
|
"training_seg_heading": "Train segmentation",
|
||||||
"training_seg_description": "Start a new training run using annotated segmentation regions to improve text detection.",
|
"training_seg_description": "Start a new training run using annotated segmentation regions to improve text detection.",
|
||||||
|
|||||||
@@ -79,6 +79,8 @@
|
|||||||
"docs_list_from": "De",
|
"docs_list_from": "De",
|
||||||
"docs_list_to": "Para",
|
"docs_list_to": "Para",
|
||||||
"docs_list_unknown": "Desconocido",
|
"docs_list_unknown": "Desconocido",
|
||||||
|
"docs_group_undated": "Sin fecha",
|
||||||
|
"docs_group_unknown": "Desconocido",
|
||||||
"doc_section_who_when": "Quién & Cuándo",
|
"doc_section_who_when": "Quién & Cuándo",
|
||||||
"doc_section_description": "Descripción",
|
"doc_section_description": "Descripción",
|
||||||
"doc_section_file": "Archivo",
|
"doc_section_file": "Archivo",
|
||||||
@@ -558,6 +560,7 @@
|
|||||||
"training_history_col_cer": "Tasa de error",
|
"training_history_col_cer": "Tasa de error",
|
||||||
"training_status_done": "Listo",
|
"training_status_done": "Listo",
|
||||||
"training_status_failed": "Error",
|
"training_status_failed": "Error",
|
||||||
|
"training_error_detail_label": "Detalles del error",
|
||||||
"training_status_running": "Ejecutando…",
|
"training_status_running": "Ejecutando…",
|
||||||
"training_seg_heading": "Entrenar segmentación",
|
"training_seg_heading": "Entrenar segmentación",
|
||||||
"training_seg_description": "Inicia un nuevo entrenamiento con regiones de segmentación anotadas para mejorar la detección de texto.",
|
"training_seg_description": "Inicia un nuevo entrenamiento con regiones de segmentación anotadas para mejorar la detección de texto.",
|
||||||
|
|||||||
15
frontend/src/lib/components/GroupDivider.svelte
Normal file
15
frontend/src/lib/components/GroupDivider.svelte
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
<script lang="ts">
|
||||||
|
let { label }: { label: string } = $props();
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<div
|
||||||
|
data-testid="group-divider"
|
||||||
|
role="separator"
|
||||||
|
aria-label={label}
|
||||||
|
class="relative flex items-center py-2 text-center"
|
||||||
|
>
|
||||||
|
<div class="flex-grow border-t border-line"></div>
|
||||||
|
<span class="mx-4 font-sans text-sm font-bold tracking-widest text-ink/60 uppercase">{label}</span
|
||||||
|
>
|
||||||
|
<div class="flex-grow border-t border-line"></div>
|
||||||
|
</div>
|
||||||
23
frontend/src/lib/components/GroupDivider.svelte.spec.ts
Normal file
23
frontend/src/lib/components/GroupDivider.svelte.spec.ts
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
import { describe, expect, it, afterEach } from 'vitest';
|
||||||
|
import { cleanup, render } from 'vitest-browser-svelte';
|
||||||
|
import { page } from 'vitest/browser';
|
||||||
|
import GroupDivider from './GroupDivider.svelte';
|
||||||
|
|
||||||
|
afterEach(() => cleanup());
|
||||||
|
|
||||||
|
describe('GroupDivider', () => {
|
||||||
|
it('renders the label text', async () => {
|
||||||
|
render(GroupDivider, { label: '1938' });
|
||||||
|
await expect.element(page.getByText('1938')).toBeInTheDocument();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('has data-testid="group-divider" on the root element', async () => {
|
||||||
|
render(GroupDivider, { label: 'Test' });
|
||||||
|
await expect.element(page.getByTestId('group-divider')).toBeInTheDocument();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('renders a person name label', async () => {
|
||||||
|
render(GroupDivider, { label: 'Anna Müller' });
|
||||||
|
await expect.element(page.getByText('Anna Müller')).toBeInTheDocument();
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -20,6 +20,12 @@ interface Props {
|
|||||||
|
|
||||||
let { runs }: Props = $props();
|
let { runs }: Props = $props();
|
||||||
|
|
||||||
|
const COLLAPSED_COUNT = 3;
|
||||||
|
let expanded = $state(false);
|
||||||
|
|
||||||
|
const visibleRuns = $derived(expanded ? runs : runs.slice(0, COLLAPSED_COUNT));
|
||||||
|
const hasMore = $derived(runs.length > COLLAPSED_COUNT);
|
||||||
|
|
||||||
const dateFormatter = new Intl.DateTimeFormat('de-DE', {
|
const dateFormatter = new Intl.DateTimeFormat('de-DE', {
|
||||||
day: 'numeric',
|
day: 'numeric',
|
||||||
month: 'short',
|
month: 'short',
|
||||||
@@ -46,7 +52,7 @@ function formatCer(cer: number | undefined | null): string {
|
|||||||
<th class="hidden pb-2 text-right md:table-cell">{m.training_history_col_cer()}</th>
|
<th class="hidden pb-2 text-right md:table-cell">{m.training_history_col_cer()}</th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody id="training-history-rows">
|
||||||
{#if runs.length === 0}
|
{#if runs.length === 0}
|
||||||
<tr>
|
<tr>
|
||||||
<td colspan="5" class="py-4 text-center text-sm text-ink-2">
|
<td colspan="5" class="py-4 text-center text-sm text-ink-2">
|
||||||
@@ -54,7 +60,7 @@ function formatCer(cer: number | undefined | null): string {
|
|||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
{:else}
|
{:else}
|
||||||
{#each runs as run (run.id)}
|
{#each visibleRuns as run (run.id)}
|
||||||
<tr class="border-b border-line/50 last:border-0">
|
<tr class="border-b border-line/50 last:border-0">
|
||||||
<td class="py-2 text-ink-2">{formatDate(run.createdAt)}</td>
|
<td class="py-2 text-ink-2">{formatDate(run.createdAt)}</td>
|
||||||
<td class="py-2">
|
<td class="py-2">
|
||||||
@@ -79,7 +85,6 @@ function formatCer(cer: number | undefined | null): string {
|
|||||||
{:else if run.status === 'FAILED'}
|
{:else if run.status === 'FAILED'}
|
||||||
<span
|
<span
|
||||||
class="inline-flex items-center gap-1 rounded-sm bg-red-100 px-1.5 py-0.5 text-xs font-medium text-red-700"
|
class="inline-flex items-center gap-1 rounded-sm bg-red-100 px-1.5 py-0.5 text-xs font-medium text-red-700"
|
||||||
title={run.errorMessage}
|
|
||||||
>
|
>
|
||||||
<svg
|
<svg
|
||||||
aria-hidden="true"
|
aria-hidden="true"
|
||||||
@@ -95,13 +100,21 @@ function formatCer(cer: number | undefined | null): string {
|
|||||||
</svg>
|
</svg>
|
||||||
{m.training_status_failed()}
|
{m.training_status_failed()}
|
||||||
</span>
|
</span>
|
||||||
|
{#if run.errorMessage}
|
||||||
|
<details class="mt-0.5">
|
||||||
|
<summary class="cursor-pointer text-xs text-red-700 underline">
|
||||||
|
{m.training_error_detail_label()}
|
||||||
|
</summary>
|
||||||
|
<p class="mt-1 text-xs text-red-600">{run.errorMessage}</p>
|
||||||
|
</details>
|
||||||
|
{/if}
|
||||||
{:else}
|
{:else}
|
||||||
<span
|
<span
|
||||||
class="inline-flex items-center gap-1 rounded-sm bg-yellow-100 px-1.5 py-0.5 text-xs font-medium text-yellow-700"
|
class="inline-flex items-center gap-1 rounded-sm bg-yellow-100 px-1.5 py-0.5 text-xs font-medium text-yellow-700"
|
||||||
>
|
>
|
||||||
<span
|
<span
|
||||||
aria-hidden="true"
|
aria-hidden="true"
|
||||||
class="h-1.5 w-1.5 animate-pulse rounded-full bg-yellow-500"
|
class="h-1.5 w-1.5 rounded-full bg-yellow-500 motion-safe:animate-pulse"
|
||||||
></span>
|
></span>
|
||||||
{m.training_status_running()}
|
{m.training_status_running()}
|
||||||
</span>
|
</span>
|
||||||
@@ -117,3 +130,17 @@ function formatCer(cer: number | undefined | null): string {
|
|||||||
{/if}
|
{/if}
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
|
{#if hasMore}
|
||||||
|
<div class="mt-2 text-center">
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
aria-expanded={expanded}
|
||||||
|
aria-controls="training-history-rows"
|
||||||
|
class="text-xs font-medium text-ink-3 transition-colors hover:text-ink"
|
||||||
|
onclick={() => (expanded = !expanded)}
|
||||||
|
>
|
||||||
|
{expanded ? m.comp_expandable_show_less() : m.comp_expandable_show_more()}
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
{/if}
|
||||||
|
|||||||
52
frontend/src/lib/components/TrainingHistory.svelte.spec.ts
Normal file
52
frontend/src/lib/components/TrainingHistory.svelte.spec.ts
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
import { afterEach, describe, expect, it } from 'vitest';
|
||||||
|
import { cleanup, render } from 'vitest-browser-svelte';
|
||||||
|
import { page } from 'vitest/browser';
|
||||||
|
import TrainingHistory from './TrainingHistory.svelte';
|
||||||
|
|
||||||
|
afterEach(cleanup);
|
||||||
|
|
||||||
|
function makeRun(i: number) {
|
||||||
|
return {
|
||||||
|
id: `run-${i}`,
|
||||||
|
status: 'DONE' as const,
|
||||||
|
blockCount: 10,
|
||||||
|
documentCount: 2,
|
||||||
|
modelName: 'german_kurrent',
|
||||||
|
createdAt: `2026-01-0${i + 1}T12:00:00Z`,
|
||||||
|
completedAt: `2026-01-0${i + 1}T12:05:00Z`
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
const fiveRuns = Array.from({ length: 5 }, (_, i) => makeRun(i));
|
||||||
|
const twoRuns = Array.from({ length: 2 }, (_, i) => makeRun(i));
|
||||||
|
|
||||||
|
describe('TrainingHistory — expand/collapse', () => {
|
||||||
|
it('shows only 3 runs initially when more than 3 exist', async () => {
|
||||||
|
render(TrainingHistory, { runs: fiveRuns });
|
||||||
|
|
||||||
|
const rows = page.getByRole('row');
|
||||||
|
// 1 header row + 3 data rows = 4 total
|
||||||
|
await expect.element(rows.nth(3)).toBeInTheDocument();
|
||||||
|
await expect.element(rows.nth(4)).not.toBeInTheDocument();
|
||||||
|
|
||||||
|
await expect.element(page.getByRole('button', { name: /Mehr anzeigen/i })).toBeInTheDocument();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('shows all runs after clicking the expand button', async () => {
|
||||||
|
render(TrainingHistory, { runs: fiveRuns });
|
||||||
|
|
||||||
|
await page.getByRole('button', { name: /Mehr anzeigen/i }).click();
|
||||||
|
|
||||||
|
const rows = page.getByRole('row');
|
||||||
|
// 1 header row + 5 data rows = 6 total
|
||||||
|
await expect.element(rows.nth(5)).toBeInTheDocument();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('hides the toggle button when 3 or fewer runs exist', async () => {
|
||||||
|
render(TrainingHistory, { runs: twoRuns });
|
||||||
|
|
||||||
|
await expect
|
||||||
|
.element(page.getByRole('button', { name: /Mehr anzeigen/i }))
|
||||||
|
.not.toBeInTheDocument();
|
||||||
|
});
|
||||||
|
});
|
||||||
165
frontend/src/lib/utils/groupDocuments.spec.ts
Normal file
165
frontend/src/lib/utils/groupDocuments.spec.ts
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
import { describe, expect, it } from 'vitest';
|
||||||
|
import { groupDocuments } from './groupDocuments';
|
||||||
|
|
||||||
|
type Doc = {
|
||||||
|
id: string;
|
||||||
|
documentDate?: string | null;
|
||||||
|
sender?: { displayName: string } | null;
|
||||||
|
receivers?: { displayName: string }[];
|
||||||
|
};
|
||||||
|
|
||||||
|
const doc = (overrides: Partial<Doc> & { id: string }): Doc => ({
|
||||||
|
documentDate: null,
|
||||||
|
sender: null,
|
||||||
|
receivers: [],
|
||||||
|
...overrides
|
||||||
|
});
|
||||||
|
|
||||||
|
// ─── DATE sort ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// DATE sort: one group per calendar year of `documentDate`; documents without
// a date are collected under the fallback label, which comes last.
describe('groupDocuments — DATE sort', () => {
	/** Shorthand for a document that only carries an id and a date. */
	const dated = (id: string, documentDate: string | null) => doc({ id, documentDate });

	it('produces one group per distinct year', () => {
		const input = [dated('a', '1923-04-12'), dated('b', '1938-01-01'), dated('c', '1965-08-03')];
		const result = groupDocuments(input, 'DATE', 'Undatiert');
		const labels = result.map(({ label }) => label);
		expect(labels).toEqual(['1923', '1938', '1965']);
		expect(result.every(({ documents }) => documents.length === 1)).toBe(true);
	});

	it('puts multiple docs from the same year into one group', () => {
		const input = [dated('a', '1938-03-01'), dated('b', '1938-11-15')];
		const result = groupDocuments(input, 'DATE', 'Undatiert');
		expect(result).toHaveLength(1);
		const [only] = result;
		expect(only.label).toBe('1938');
		expect(only.documents).toHaveLength(2);
	});

	it('places undated docs in the fallback group at the bottom', () => {
		const input = [dated('a', '1938-01-01'), dated('b', null), dated('c', null)];
		const result = groupDocuments(input, 'DATE', 'Undatiert');
		expect(result).toHaveLength(2);
		const [yearGroup, undatedGroup] = result;
		expect(yearGroup.label).toBe('1938');
		expect(undatedGroup.label).toBe('Undatiert');
		const undatedIds = undatedGroup.documents.map(({ id }) => id);
		expect(undatedIds).toEqual(['b', 'c']);
	});

	it('returns one group with fallback label when all docs are undated', () => {
		const input = ['a', 'b'].map((id) => doc({ id }));
		const result = groupDocuments(input, 'DATE', 'Undatiert');
		expect(result).toHaveLength(1);
		expect(result[0].label).toBe('Undatiert');
	});

	it('returns one group when all docs are from the same year', () => {
		const input = [dated('a', '1938-01-01'), dated('b', '1938-06-15')];
		const result = groupDocuments(input, 'DATE', 'Undatiert');
		expect(result).toHaveLength(1);
	});
});
|
||||||
|
|
||||||
|
// ─── SENDER sort ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// SENDER sort: one group per sender display name; documents without a sender
// are collected under the fallback label, which comes last.
describe('groupDocuments — SENDER sort', () => {
	/** Shorthand for a document sent by the person with the given display name. */
	const sentBy = (id: string, displayName: string) => doc({ id, sender: { displayName } });

	it('produces one group per distinct sender', () => {
		const input = [
			sentBy('a', 'Anna Müller'),
			sentBy('b', 'Karl Bauer'),
			sentBy('c', 'Anna Müller')
		];
		const result = groupDocuments(input, 'SENDER', 'Unbekannt');
		const labels = result.map(({ label }) => label);
		expect(labels).toEqual(['Anna Müller', 'Karl Bauer']);
		expect(result[0].documents).toHaveLength(2);
		expect(result[1].documents).toHaveLength(1);
	});

	it('places docs with no sender in the fallback group at the bottom', () => {
		const input = [sentBy('a', 'Anna Müller'), doc({ id: 'b', sender: null })];
		const result = groupDocuments(input, 'SENDER', 'Unbekannt');
		expect(result).toHaveLength(2);
		const [named, unknown] = result;
		expect(named.label).toBe('Anna Müller');
		expect(unknown.label).toBe('Unbekannt');
		expect(unknown.documents[0].id).toBe('b');
	});
});
|
||||||
|
|
||||||
|
// ─── RECEIVER sort ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// RECEIVER sort: a document is listed once under each of its receivers, so the
// same document may show up in several groups; documents without receivers are
// collected under the fallback label, which comes last.
describe('groupDocuments — RECEIVER sort', () => {
	/** Shorthand for a document addressed to the given display names. */
	const receivedBy = (id: string, ...names: string[]) =>
		doc({ id, receivers: names.map((displayName) => ({ displayName })) });

	it('a doc with two receivers appears in both receiver groups', () => {
		const input = [receivedBy('a', 'Anna', 'Karl')];
		const result = groupDocuments(input, 'RECEIVER', 'Unbekannt');
		const labels = result.map(({ label }) => label);
		expect(labels).toEqual(['Anna', 'Karl']);
		expect(result[0].documents[0].id).toBe('a');
		expect(result[1].documents[0].id).toBe('a');
	});

	it('places docs with no receivers in the fallback group at the bottom', () => {
		const input = [receivedBy('a', 'Anna'), receivedBy('b')];
		const result = groupDocuments(input, 'RECEIVER', 'Unbekannt');
		expect(result).toHaveLength(2);
		const [named, unknown] = result;
		expect(named.label).toBe('Anna');
		expect(unknown.label).toBe('Unbekannt');
		expect(unknown.documents[0].id).toBe('b');
	});

	it('composite keys are unique: groupLabel + doc.id identifies each item', () => {
		const input = [receivedBy('a', 'Anna', 'Karl'), receivedBy('b', 'Anna')];
		const result = groupDocuments(input, 'RECEIVER', 'Unbekannt');
		// Build the "<label>-<id>" key for every rendered item across all groups.
		const compositeKeys: string[] = [];
		for (const { label, documents } of result) {
			for (const { id } of documents) compositeKeys.push(`${label}-${id}`);
		}
		const distinct = new Set(compositeKeys);
		expect(distinct.size).toBe(compositeKeys.length);
	});
});
|
||||||
|
|
||||||
|
// ─── Non-groupable sorts ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Sort modes that have no grouping semantics must return every document in a
// single group.
describe('groupDocuments — non-groupable sorts', () => {
	for (const sortMode of ['TITLE', 'UPLOAD_DATE']) {
		it(`${sortMode} sort returns one group containing all documents`, () => {
			const input = ['a', 'b'].map((id) => doc({ id }));
			const result = groupDocuments(input, sortMode, 'Undatiert');
			expect(result).toHaveLength(1);
			expect(result[0].documents).toHaveLength(2);
		});
	}
});
|
||||||
|
|
||||||
|
// ─── Edge cases ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Degenerate inputs.
describe('groupDocuments — edge cases', () => {
	it('returns empty array for an empty document list', () => {
		const result = groupDocuments([], 'DATE', 'Undatiert');
		expect(result).toEqual([]);
	});
});
|
||||||
56
frontend/src/lib/utils/groupDocuments.ts
Normal file
56
frontend/src/lib/utils/groupDocuments.ts
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
/** Minimal document shape that {@link groupDocuments} needs in order to group. */
export type GroupableDoc = {
	id: string;
	/** Date string; the grouping code appends `T12:00:00` before parsing it. */
	documentDate?: string | null;
	sender?: { displayName: string } | null;
	receivers?: { displayName: string }[];
};

/** A labelled bucket of documents produced by {@link groupDocuments}. */
export type DocumentGroup<T extends GroupableDoc> = {
	label: string;
	documents: T[];
};

/** Sort modes for which grouping is defined; every other mode yields one flat group. */
const GROUPABLE_SORTS = ['DATE', 'SENDER', 'RECEIVER'] as const;
type GroupableSort = (typeof GROUPABLE_SORTS)[number];

/**
 * Splits `docs` into labelled groups according to the active sort mode.
 *
 * - `DATE`: one group per year of `documentDate`.
 * - `SENDER`: one group per sender display name.
 * - `RECEIVER`: one group per receiver display name; a document with several
 *   receivers is listed in each of their groups, but at most once per group.
 * - any other value: a single group with an empty label holding a copy of `docs`.
 *
 * Groups appear in the order their key is first encountered in `docs`, and
 * documents keep their input order within a group. Documents for which no key
 * can be derived (no/invalid date, no sender, no receivers) are collected in a
 * final group labelled `fallbackLabel`.
 *
 * @param docs documents to group, already sorted by the caller
 * @param sort sort mode name
 * @param fallbackLabel label for the group of documents without a key
 * @returns the groups, or an empty array when `docs` is empty
 */
export function groupDocuments<T extends GroupableDoc>(
	docs: T[],
	sort: string,
	fallbackLabel: string
): DocumentGroup<T>[] {
	if (docs.length === 0) return [];
	if (!GROUPABLE_SORTS.includes(sort as GroupableSort)) {
		return [{ label: '', documents: [...docs] }];
	}

	// Map iteration follows insertion order, which preserves first-seen group order.
	const groupMap = new Map<string, T[]>();
	const fallbackDocs: T[] = [];

	for (const doc of docs) {
		const keys = extractGroupKeys(doc, sort as GroupableSort);
		if (keys.length === 0) {
			fallbackDocs.push(doc);
			continue;
		}
		for (const key of keys) {
			const bucket = groupMap.get(key);
			if (bucket) bucket.push(doc);
			else groupMap.set(key, [doc]);
		}
	}

	const groups = [...groupMap.entries()].map(([label, documents]) => ({ label, documents }));
	if (fallbackDocs.length > 0) groups.push({ label: fallbackLabel, documents: fallbackDocs });
	return groups;
}

/**
 * Returns the distinct group keys a document belongs to for the given sort
 * mode, or an empty array when it belongs in the fallback group.
 */
function extractGroupKeys<T extends GroupableDoc>(doc: T, sort: GroupableSort): string[] {
	if (sort === 'DATE') {
		if (!doc.documentDate) return [];
		const year = new Date(doc.documentDate + 'T12:00:00').getFullYear();
		// An unparseable date gives NaN; without this guard it would become a "NaN" group.
		return Number.isNaN(year) ? [] : [String(year)];
	}
	if (sort === 'SENDER') return doc.sender ? [doc.sender.displayName] : [];
	if (sort === 'RECEIVER') {
		// De-duplicate so a document whose receivers share a display name is not
		// added twice to the same group (which would break id-keyed rendering).
		return Array.from(new Set((doc.receivers ?? []).map((r) => r.displayName)));
	}
	return [];
}
|
||||||
@@ -13,8 +13,18 @@ export async function load({ url, fetch }) {
|
|||||||
const senderId = url.searchParams.get('senderId') || '';
|
const senderId = url.searchParams.get('senderId') || '';
|
||||||
const receiverId = url.searchParams.get('receiverId') || '';
|
const receiverId = url.searchParams.get('receiverId') || '';
|
||||||
const tags = url.searchParams.getAll('tag');
|
const tags = url.searchParams.getAll('tag');
|
||||||
const sort = url.searchParams.get('sort') || 'DATE';
|
const VALID_SORTS = ['DATE', 'TITLE', 'SENDER', 'RECEIVER', 'UPLOAD_DATE'] as const;
|
||||||
const dir = url.searchParams.get('dir') || 'desc';
|
type ValidSort = (typeof VALID_SORTS)[number];
|
||||||
|
const rawSort = url.searchParams.get('sort') ?? 'DATE';
|
||||||
|
const sort: ValidSort = (VALID_SORTS as readonly string[]).includes(rawSort)
|
||||||
|
? (rawSort as ValidSort)
|
||||||
|
: 'DATE';
|
||||||
|
const VALID_DIRS = ['asc', 'desc'] as const;
|
||||||
|
type ValidDir = (typeof VALID_DIRS)[number];
|
||||||
|
const rawDir = url.searchParams.get('dir') ?? 'desc';
|
||||||
|
const dir: ValidDir = (VALID_DIRS as readonly string[]).includes(rawDir)
|
||||||
|
? (rawDir as ValidDir)
|
||||||
|
: 'desc';
|
||||||
const tagQ = url.searchParams.get('tagQ') || '';
|
const tagQ = url.searchParams.get('tagQ') || '';
|
||||||
|
|
||||||
const isDashboard = !q && !from && !to && !senderId && !receiverId && !tags.length && !tagQ;
|
const isDashboard = !q && !from && !to && !senderId && !receiverId && !tags.length && !tagQ;
|
||||||
@@ -35,7 +45,7 @@ export async function load({ url, fetch }) {
|
|||||||
receiverId: receiverId || undefined,
|
receiverId: receiverId || undefined,
|
||||||
tag: tags.length ? tags : undefined,
|
tag: tags.length ? tags : undefined,
|
||||||
tagQ: tagQ || undefined,
|
tagQ: tagQ || undefined,
|
||||||
sort: sort as 'DATE' | 'TITLE' | 'SENDER' | 'RECEIVER' | 'UPLOAD_DATE',
|
sort,
|
||||||
dir: dir || undefined
|
dir: dir || undefined
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -139,6 +139,7 @@ const showRightColumn = $derived(data.canWrite || (data.incompleteDocs?.length ?
|
|||||||
error={data.error}
|
error={data.error}
|
||||||
total={data.total ?? 0}
|
total={data.total ?? 0}
|
||||||
q={q}
|
q={q}
|
||||||
|
sort={sort}
|
||||||
/>
|
/>
|
||||||
{/if}
|
{/if}
|
||||||
</main>
|
</main>
|
||||||
|
|||||||
@@ -30,7 +30,9 @@ let {
|
|||||||
sort?: string;
|
sort?: string;
|
||||||
} = $props();
|
} = $props();
|
||||||
|
|
||||||
const fallbackLabel = $derived(sort === 'DATE' ? m.docs_group_undated() : m.docs_group_unknown());
|
const fallbackLabel = $derived(
|
||||||
|
(sort ?? 'DATE') === 'DATE' ? m.docs_group_undated() : m.docs_group_unknown()
|
||||||
|
);
|
||||||
const groupedDocuments = $derived.by(() =>
|
const groupedDocuments = $derived.by(() =>
|
||||||
groupDocuments(documents, sort ?? 'DATE', fallbackLabel)
|
groupDocuments(documents, sort ?? 'DATE', fallbackLabel)
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
import { describe, expect, it, vi } from 'vitest';
|
import { describe, expect, it, vi, afterEach } from 'vitest';
|
||||||
import { render } from 'vitest-browser-svelte';
|
import { cleanup, render } from 'vitest-browser-svelte';
|
||||||
import { page } from 'vitest/browser';
|
import { page } from 'vitest/browser';
|
||||||
import DocumentList from './DocumentList.svelte';
|
import DocumentList from './DocumentList.svelte';
|
||||||
|
|
||||||
vi.mock('$app/navigation', () => ({ goto: vi.fn() }));
|
vi.mock('$app/navigation', () => ({ goto: vi.fn() }));
|
||||||
|
|
||||||
|
afterEach(() => cleanup());
|
||||||
|
|
||||||
const baseProps = {
|
const baseProps = {
|
||||||
documents: [],
|
documents: [],
|
||||||
canWrite: false,
|
canWrite: false,
|
||||||
@@ -13,7 +15,14 @@ const baseProps = {
|
|||||||
q: ''
|
q: ''
|
||||||
};
|
};
|
||||||
|
|
||||||
const makeDoc = () => ({
|
type DocOverrides = {
|
||||||
|
id?: string;
|
||||||
|
documentDate?: string | null;
|
||||||
|
sender?: { firstName?: string | null; lastName: string; displayName: string } | null;
|
||||||
|
receivers?: { firstName?: string | null; lastName: string; displayName: string }[];
|
||||||
|
};
|
||||||
|
|
||||||
|
const makeDoc = (overrides: DocOverrides = {}) => ({
|
||||||
id: '1',
|
id: '1',
|
||||||
title: 'Testbrief',
|
title: 'Testbrief',
|
||||||
originalFilename: 'testbrief.pdf',
|
originalFilename: 'testbrief.pdf',
|
||||||
@@ -21,8 +30,9 @@ const makeDoc = () => ({
|
|||||||
documentDate: '2024-03-15',
|
documentDate: '2024-03-15',
|
||||||
location: null,
|
location: null,
|
||||||
sender: null,
|
sender: null,
|
||||||
receivers: [],
|
receivers: [] as { firstName?: string | null; lastName: string; displayName: string }[],
|
||||||
tags: []
|
tags: [],
|
||||||
|
...overrides
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('DocumentList – result count', () => {
|
describe('DocumentList – result count', () => {
|
||||||
@@ -49,3 +59,59 @@ describe('DocumentList – empty state with search term', () => {
|
|||||||
await expect.element(page.getByText(/"Urlaub"/)).toBeInTheDocument();
|
await expect.element(page.getByText(/"Urlaub"/)).toBeInTheDocument();
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ─── Group headers ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
describe('DocumentList – group headers', () => {
|
||||||
|
it('renders group-divider elements when DATE sort spans multiple years', async () => {
|
||||||
|
const documents = [
|
||||||
|
makeDoc({ id: '1', documentDate: '1923-04-12' }),
|
||||||
|
makeDoc({ id: '2', documentDate: '1965-08-03' })
|
||||||
|
];
|
||||||
|
render(DocumentList, { ...baseProps, documents, total: 2, sort: 'DATE' });
|
||||||
|
await expect.element(page.getByTestId('group-divider').first()).toBeInTheDocument();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('does not render group-divider when DATE sort has only one distinct year', async () => {
|
||||||
|
const documents = [
|
||||||
|
makeDoc({ id: '1', documentDate: '1938-01-01' }),
|
||||||
|
makeDoc({ id: '2', documentDate: '1938-06-15' })
|
||||||
|
];
|
||||||
|
render(DocumentList, { ...baseProps, documents, total: 2, sort: 'DATE' });
|
||||||
|
await expect.element(page.getByTestId('group-divider')).not.toBeInTheDocument();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('does not render group-divider for TITLE sort', async () => {
|
||||||
|
const documents = [
|
||||||
|
makeDoc({ id: '1', documentDate: '1923-04-12' }),
|
||||||
|
makeDoc({ id: '2', documentDate: '1965-08-03' })
|
||||||
|
];
|
||||||
|
render(DocumentList, { ...baseProps, documents, total: 2, sort: 'TITLE' });
|
||||||
|
await expect.element(page.getByTestId('group-divider')).not.toBeInTheDocument();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('shows Undatiert fallback label when sort is undefined and doc has no date', async () => {
|
||||||
|
const documents = [
|
||||||
|
makeDoc({ id: '1', documentDate: '1938-01-01' }),
|
||||||
|
makeDoc({ id: '2', documentDate: null })
|
||||||
|
];
|
||||||
|
render(DocumentList, { ...baseProps, documents, total: 2 }); // sort omitted — defaults to DATE grouping
|
||||||
|
await expect.element(page.getByText(/UNDATIERT/i)).toBeInTheDocument();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('a doc with two receivers appears in both receiver groups', async () => {
|
||||||
|
const documents = [
|
||||||
|
makeDoc({
|
||||||
|
id: '1',
|
||||||
|
receivers: [
|
||||||
|
{ firstName: null, lastName: 'Müller', displayName: 'Anna Müller' },
|
||||||
|
{ firstName: null, lastName: 'Bauer', displayName: 'Karl Bauer' }
|
||||||
|
]
|
||||||
|
})
|
||||||
|
];
|
||||||
|
render(DocumentList, { ...baseProps, documents, total: 1, sort: 'RECEIVER' });
|
||||||
|
const links = page.getByRole('link', { name: /Testbrief/ });
|
||||||
|
await expect.element(links.first()).toBeInTheDocument();
|
||||||
|
await expect.element(links.nth(1)).toBeInTheDocument();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
<script lang="ts">
|
<script lang="ts">
|
||||||
import { m } from '$lib/paraglide/messages.js';
|
import { m } from '$lib/paraglide/messages.js';
|
||||||
import { formatDate } from '$lib/utils/date';
|
import { formatDate } from '$lib/utils/date';
|
||||||
|
import GroupDivider from '$lib/components/GroupDivider.svelte';
|
||||||
|
import { groupDocuments } from '$lib/utils/groupDocuments';
|
||||||
|
|
||||||
let {
|
let {
|
||||||
documents,
|
documents,
|
||||||
@@ -29,22 +31,15 @@ let {
|
|||||||
|
|
||||||
const documentYears = $derived(
|
const documentYears = $derived(
|
||||||
documents
|
documents
|
||||||
.map((doc) => (doc.documentDate ? new Date(doc.documentDate).getFullYear() : null))
|
.map((doc) =>
|
||||||
|
doc.documentDate ? new Date(doc.documentDate + 'T12:00:00').getFullYear() : null
|
||||||
|
)
|
||||||
.filter((y): y is number => y !== null)
|
.filter((y): y is number => y !== null)
|
||||||
);
|
);
|
||||||
const yearFrom = $derived(documentYears.length > 0 ? Math.min(...documentYears) : null);
|
const yearFrom = $derived(documentYears.length > 0 ? Math.min(...documentYears) : null);
|
||||||
const yearTo = $derived(documentYears.length > 0 ? Math.max(...documentYears) : null);
|
const yearTo = $derived(documentYears.length > 0 ? Math.max(...documentYears) : null);
|
||||||
|
|
||||||
const enrichedDocuments = $derived(
|
const documentGroups = $derived.by(() => groupDocuments(documents, 'DATE', ''));
|
||||||
documents.map((doc, i) => {
|
|
||||||
const year = doc.documentDate ? new Date(doc.documentDate).getFullYear() : null;
|
|
||||||
const prevYear =
|
|
||||||
i > 0 && documents[i - 1].documentDate
|
|
||||||
? new Date(documents[i - 1].documentDate!).getFullYear()
|
|
||||||
: null;
|
|
||||||
return { doc, year, showYearDivider: year !== null && year !== prevYear };
|
|
||||||
})
|
|
||||||
);
|
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<!-- Summary bar -->
|
<!-- Summary bar -->
|
||||||
@@ -82,87 +77,83 @@ const enrichedDocuments = $derived(
|
|||||||
|
|
||||||
<div class="p-6 md:p-8">
|
<div class="p-6 md:p-8">
|
||||||
<div class="relative z-10 flex flex-col gap-4">
|
<div class="relative z-10 flex flex-col gap-4">
|
||||||
{#each enrichedDocuments as { doc, year, showYearDivider } (doc.id)}
|
{#each documentGroups as group (group.label)}
|
||||||
{#if showYearDivider}
|
{#if group.label}
|
||||||
<div data-testid="year-divider" class="relative flex items-center py-2 text-center">
|
<GroupDivider label={group.label} />
|
||||||
<div class="flex-grow border-t border-line"></div>
|
|
||||||
<span class="mx-4 font-sans text-xs font-bold tracking-widest text-ink/40 uppercase"
|
|
||||||
>{year}</span
|
|
||||||
>
|
|
||||||
<div class="flex-grow border-t border-line"></div>
|
|
||||||
</div>
|
|
||||||
{/if}
|
{/if}
|
||||||
{@const isRight = doc.sender?.id === senderId}
|
{#each group.documents as doc (doc.id)}
|
||||||
|
{@const isRight = doc.sender?.id === senderId}
|
||||||
|
|
||||||
<!-- Message Row -->
|
<!-- Message Row -->
|
||||||
<div class="flex w-full {isRight ? 'justify-end' : 'justify-start'}">
|
<div class="flex w-full {isRight ? 'justify-end' : 'justify-start'}">
|
||||||
<!-- Bubble Group -->
|
<!-- Bubble Group -->
|
||||||
<div
|
<div
|
||||||
class="flex max-w-[90%] gap-3 md:max-w-[70%] {isRight
|
class="flex max-w-[90%] gap-3 md:max-w-[70%] {isRight
|
||||||
? 'flex-row-reverse'
|
? 'flex-row-reverse'
|
||||||
: 'flex-row'}"
|
: 'flex-row'}"
|
||||||
>
|
>
|
||||||
<!-- AVATAR -->
|
<!-- AVATAR -->
|
||||||
<div class="mt-auto mb-1 hidden flex-shrink-0 sm:block">
|
<div class="mt-auto mb-1 hidden flex-shrink-0 sm:block">
|
||||||
<div
|
<div
|
||||||
class="flex h-8 w-8 items-center justify-center rounded-full border font-serif text-xs shadow-sm
|
class="flex h-8 w-8 items-center justify-center rounded-full border font-serif text-xs shadow-sm
|
||||||
{isRight
|
{isRight
|
||||||
? 'border-primary bg-primary text-primary-fg'
|
? 'border-primary bg-primary text-primary-fg'
|
||||||
: 'border-line bg-surface text-ink'}"
|
: 'border-line bg-surface text-ink'}"
|
||||||
>
|
>
|
||||||
{#if doc.sender}
|
{#if doc.sender}
|
||||||
{doc.sender.firstName ? doc.sender.firstName[0] : doc.sender.lastName[0]}{doc.sender.lastName[0]}
|
{doc.sender.firstName ? doc.sender.firstName[0] : doc.sender.lastName[0]}{doc.sender.lastName[0]}
|
||||||
{:else}
|
{:else}
|
||||||
?
|
?
|
||||||
{/if}
|
{/if}
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- BUBBLE CARD -->
|
<!-- BUBBLE CARD -->
|
||||||
<a
|
<a
|
||||||
href="/documents/{doc.id}"
|
href="/documents/{doc.id}"
|
||||||
class="group block transform rounded border p-4 shadow-sm transition-all duration-200 hover:-translate-y-0.5 hover:shadow-md
|
class="group block transform rounded border p-4 shadow-sm transition-all duration-200 hover:-translate-y-0.5 hover:shadow-md
|
||||||
{isRight
|
{isRight
|
||||||
? 'rounded-br-none border-primary bg-primary text-primary-fg'
|
? 'rounded-br-none border-primary bg-primary text-primary-fg'
|
||||||
: 'rounded-bl-none border-line bg-muted/50 text-ink'}"
|
: 'rounded-bl-none border-line bg-muted/50 text-ink'}"
|
||||||
>
|
>
|
||||||
<!-- Header -->
|
<!-- Header -->
|
||||||
<div class="mb-2 flex items-start justify-between gap-4">
|
<div class="mb-2 flex items-start justify-between gap-4">
|
||||||
<h3
|
<h3
|
||||||
class="font-serif text-sm leading-snug font-medium {isRight
|
class="font-serif text-sm leading-snug font-medium {isRight
|
||||||
? 'text-primary-fg'
|
? 'text-primary-fg'
|
||||||
: 'text-ink'}"
|
: 'text-ink'}"
|
||||||
>
|
>
|
||||||
{doc.title || doc.originalFilename}
|
{doc.title || doc.originalFilename}
|
||||||
</h3>
|
</h3>
|
||||||
|
|
||||||
<!-- Status Dot -->
|
<!-- Status Dot -->
|
||||||
<span
|
<span
|
||||||
class="mt-1.5 h-1.5 w-1.5 flex-shrink-0 rounded-full
|
class="mt-1.5 h-1.5 w-1.5 flex-shrink-0 rounded-full
|
||||||
{doc.status === 'UPLOADED' ? 'bg-accent' : 'bg-yellow-400'}"
|
{doc.status === 'UPLOADED' ? 'bg-accent' : 'bg-yellow-400'}"
|
||||||
title={doc.status}
|
title={doc.status}
|
||||||
>
|
>
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Metadata -->
|
<!-- Metadata -->
|
||||||
<div
|
<div
|
||||||
class="flex flex-wrap gap-3 font-sans text-[10px] tracking-wider uppercase opacity-80 {isRight
|
class="flex flex-wrap gap-3 font-sans text-[10px] tracking-wider uppercase opacity-80 {isRight
|
||||||
? 'text-primary-fg/70'
|
? 'text-primary-fg/70'
|
||||||
: 'text-ink-2'}"
|
: 'text-ink-2'}"
|
||||||
>
|
>
|
||||||
<span class="flex items-center">
|
|
||||||
{doc.documentDate ? formatDate(doc.documentDate) : '—'}
|
|
||||||
</span>
|
|
||||||
{#if doc.location}
|
|
||||||
<span class="flex items-center">
|
<span class="flex items-center">
|
||||||
• {doc.location}
|
{doc.documentDate ? formatDate(doc.documentDate) : '—'}
|
||||||
</span>
|
</span>
|
||||||
{/if}
|
{#if doc.location}
|
||||||
</div>
|
<span class="flex items-center">
|
||||||
</a>
|
• {doc.location}
|
||||||
|
</span>
|
||||||
|
{/if}
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
{/each}
|
||||||
{/each}
|
{/each}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -116,7 +116,7 @@ describe('Conversations page – summary', () => {
|
|||||||
describe('Conversations page – year dividers', () => {
|
describe('Conversations page – year dividers', () => {
|
||||||
it('renders a year divider for the first document', async () => {
|
it('renders a year divider for the first document', async () => {
|
||||||
render(Page, { data: withDocs });
|
render(Page, { data: withDocs });
|
||||||
await expect.element(page.getByTestId('year-divider').first()).toHaveTextContent('1923');
|
await expect.element(page.getByTestId('group-divider').first()).toHaveTextContent('1923');
|
||||||
});
|
});
|
||||||
|
|
||||||
it('renders a divider for each new year in the document list', async () => {
|
it('renders a divider for each new year in the document list', async () => {
|
||||||
@@ -128,8 +128,8 @@ describe('Conversations page – year dividers', () => {
|
|||||||
]
|
]
|
||||||
};
|
};
|
||||||
render(Page, { data });
|
render(Page, { data });
|
||||||
await expect.element(page.getByTestId('year-divider').first()).toHaveTextContent('1923');
|
await expect.element(page.getByTestId('group-divider').first()).toHaveTextContent('1923');
|
||||||
await expect.element(page.getByTestId('year-divider').nth(1)).toHaveTextContent('1965');
|
await expect.element(page.getByTestId('group-divider').nth(1)).toHaveTextContent('1965');
|
||||||
});
|
});
|
||||||
|
|
||||||
it('does not render a second divider for documents from the same year', async () => {
|
it('does not render a second divider for documents from the same year', async () => {
|
||||||
@@ -142,8 +142,8 @@ describe('Conversations page – year dividers', () => {
|
|||||||
};
|
};
|
||||||
render(Page, { data });
|
render(Page, { data });
|
||||||
// Only one divider for 1923; 1965 divider should not appear
|
// Only one divider for 1923; 1965 divider should not appear
|
||||||
await expect.element(page.getByTestId('year-divider').first()).toHaveTextContent('1923');
|
await expect.element(page.getByTestId('group-divider').first()).toHaveTextContent('1923');
|
||||||
await expect.element(page.getByTestId('year-divider').nth(1)).not.toBeInTheDocument();
|
await expect.element(page.getByTestId('group-divider').nth(1)).not.toBeInTheDocument();
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
FROM python:3.11-slim
|
FROM python:3.11.9-slim
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
@@ -21,6 +21,8 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
|
RUN chmod +x /app/entrypoint.sh
|
||||||
|
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
|
CMD ["/app/entrypoint.sh"]
|
||||||
|
|||||||
80
ocr-service/ensure_blla_model.py
Normal file
80
ocr-service/ensure_blla_model.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
"""Validates the blla segmentation base model and downloads it if needed.
|
||||||
|
|
||||||
|
Run at container startup before uvicorn. ketos 7 requires the model in
|
||||||
|
CoreML protobuf or safetensors format — legacy PyTorch ZIP archives
|
||||||
|
(torch.save output from kraken <4) are not loadable and will be replaced.
|
||||||
|
|
||||||
|
Exits non-zero on failure so Docker marks the container unhealthy rather
|
||||||
|
than silently starting with a broken model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(levelname)s:ensure_blla_model:%(message)s",
|
||||||
|
)
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
BLLA_MODEL_PATH = os.environ.get("BLLA_MODEL_PATH", "/app/models/blla.mlmodel")
|
||||||
|
# DOI for "General segmentation model for print and handwriting" — ketos 7 compatible.
|
||||||
|
BLLA_MODEL_DOI = "10.5281/zenodo.14602569"
|
||||||
|
HTRMOPO_DIR = os.path.expanduser("~/.local/share/htrmopo")
|
||||||
|
|
||||||
|
|
||||||
|
def _model_is_loadable(path: str) -> bool:
    """Return True when kraken can load the model file at ``path``.

    Any exception raised while importing kraken or loading the model is
    caught and reported as "not loadable" (False); nothing propagates to
    the caller.
    """
    try:
        # The import sits inside the try block, so an import failure is also
        # handled below rather than raised.
        from kraken.lib import vgsl

        vgsl.TorchVGSLModel.load_model(path)
    except (RuntimeError, OSError, ValueError) as e:
        log.warning("Model at %s failed to load: %s", path, e)
        return False
    except Exception:
        # Logged at WARNING with the traceback: this module configures logging
        # at INFO, so a DEBUG record would be dropped and the real cause of
        # the failure would never be visible.
        log.warning("Unexpected error loading model at %s", path, exc_info=True)
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def _download_blla() -> str:
    """Run ``kraken get`` for BLLA_MODEL_DOI and return the downloaded model path.

    The path returned is the last entry, in sorted order, of the
    ``*/blla.mlmodel`` matches under HTRMOPO_DIR. The process exits with
    status 1 if the command fails or no such file is found.
    """
    log.info("Downloading blla model (DOI %s) ...", BLLA_MODEL_DOI)
    command = ["kraken", "get", BLLA_MODEL_DOI]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        log.error("kraken get failed: %s", completed.stderr)
        sys.exit(1)

    pattern = os.path.join(HTRMOPO_DIR, "*/blla.mlmodel")
    matches = sorted(glob.glob(pattern))
    if not matches:
        log.error("Downloaded blla.mlmodel not found under %s", HTRMOPO_DIR)
        sys.exit(1)

    return matches[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Ensure a loadable blla model exists at BLLA_MODEL_PATH.

    If a file is already there and loads, nothing is changed. If it exists
    but does not load, it is renamed to ``<path>.incompatible``. In that case,
    and when no file exists at all, the model is downloaded and copied to
    BLLA_MODEL_PATH.
    """
    if os.path.exists(BLLA_MODEL_PATH):
        if _model_is_loadable(BLLA_MODEL_PATH):
            log.info("blla model OK: %s", BLLA_MODEL_PATH)
            return
        log.warning(
            "blla model at %s is in an incompatible format — replacing", BLLA_MODEL_PATH
        )
        os.rename(BLLA_MODEL_PATH, BLLA_MODEL_PATH + ".incompatible")

    # BLLA_MODEL_PATH comes from the environment and may be a bare filename;
    # os.makedirs("") raises FileNotFoundError, so only create a real directory.
    model_dir = os.path.dirname(BLLA_MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    downloaded = _download_blla()
    shutil.copy2(downloaded, BLLA_MODEL_PATH)
    log.info("Installed blla model at %s", BLLA_MODEL_PATH)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
9
ocr-service/entrypoint.sh
Normal file
9
ocr-service/entrypoint.sh
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/bash
set -euo pipefail

# Step 1: check the blla segmentation base model before serving. The helper
# validates the model and downloads it when it is missing or incompatible
# (ketos 7 dropped support for legacy PyTorch ZIP archives), so the volume
# always holds a loadable CoreML protobuf model. With `set -e`, a non-zero
# exit from the helper stops this script here.
python3 /app/ensure_blla_model.py

# Step 2: replace this shell with the uvicorn server process.
exec uvicorn main:app \
	--host 0.0.0.0 \
	--port 8000 \
	--workers 1
|
||||||
@@ -472,16 +472,35 @@ async def segtrain_model(
|
|||||||
"-q", "fixed",
|
"-q", "fixed",
|
||||||
"-N", "10",
|
"-N", "10",
|
||||||
]
|
]
|
||||||
|
# Train at 800px height. The default blla model uses 1800px, which peaks at
|
||||||
|
# ~7+ GB on CPU and kills the host (ketos ignores -s when -i is present, so
|
||||||
|
# we cannot override the height of an existing model).
|
||||||
|
# Strategy: only use the base model if it is already at 800px (i.e. was
|
||||||
|
# produced by a previous fine-tuning run here). Otherwise train from scratch —
|
||||||
|
# the first run bootstraps a 800px model; all subsequent runs fine-tune it.
|
||||||
|
seg_spec = (
|
||||||
|
"[1,800,0,3 Cr7,7,64,2,2 Gn32 Cr3,3,128,2,2 Gn32 Cr3,3,128 Gn32 "
|
||||||
|
"Cr3,3,256 Gn32 Cr3,3,256 Gn32 Lbx32 Lby32 Cr1,1,32 Gn32 Lby32 Lbx32]"
|
||||||
|
)
|
||||||
|
use_base_model = False
|
||||||
if os.path.exists(blla_model_path):
|
if os.path.exists(blla_model_path):
|
||||||
cmd += ["-i", blla_model_path, "--resize", "both"]
|
try:
|
||||||
|
from kraken.lib import vgsl as _vgsl
|
||||||
|
_m = _vgsl.TorchVGSLModel.load_model(blla_model_path)
|
||||||
|
use_base_model = _m.input[2] == 800 # input is (batch, channels, H, W)
|
||||||
|
if not use_base_model:
|
||||||
|
log.info(
|
||||||
|
"Base model height is %dpx — skipping -i to avoid OOM; "
|
||||||
|
"will train from scratch at 800px",
|
||||||
|
_m.input[2],
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("Could not inspect base model height, training from scratch: %s", exc)
|
||||||
|
|
||||||
|
if use_base_model:
|
||||||
|
cmd += ["-i", blla_model_path, "--resize", "union", "-s", seg_spec]
|
||||||
else:
|
else:
|
||||||
# No pretrained model — train from scratch with reduced height (800px)
|
cmd += ["-s", seg_spec]
|
||||||
# to keep peak RAM under ~200 MB on CPU (default 1800px uses ~500 MB+)
|
|
||||||
cmd += [
|
|
||||||
"-s",
|
|
||||||
"[1,800,0,3 Cr7,7,64,2,2 Gn32 Cr3,3,128,2,2 Gn32 Cr3,3,128 Gn32 "
|
|
||||||
"Cr3,3,256 Gn32 Cr3,3,256 Gn32 Lbx32 Lby32 Cr1,1,32 Gn32 Lby32 Lbx32]",
|
|
||||||
]
|
|
||||||
cmd += xml_files
|
cmd += xml_files
|
||||||
|
|
||||||
log.info("Running: %s", " ".join(cmd[:5]) + " ...")
|
log.info("Running: %s", " ".join(cmd[:5]) + " ...")
|
||||||
@@ -493,7 +512,8 @@ async def segtrain_model(
|
|||||||
raise RuntimeError(f"ketos segtrain failed (exit {proc.returncode}): {proc.stderr[-500:]}")
|
raise RuntimeError(f"ketos segtrain failed (exit {proc.returncode}): {proc.stderr[-500:]}")
|
||||||
|
|
||||||
accuracy, epochs = _parse_best_checkpoint(checkpoint_dir)
|
accuracy, epochs = _parse_best_checkpoint(checkpoint_dir)
|
||||||
log.info("Segmentation training complete — epochs=%s accuracy=%s", epochs, accuracy)
|
cer = round(1.0 - accuracy, 4) if accuracy is not None else None
|
||||||
|
log.info("Segmentation training complete — epochs=%s accuracy=%s cer=%s", epochs, accuracy, cer)
|
||||||
|
|
||||||
best_model = _find_best_model(checkpoint_dir)
|
best_model = _find_best_model(checkpoint_dir)
|
||||||
if best_model is None:
|
if best_model is None:
|
||||||
@@ -508,7 +528,7 @@ async def segtrain_model(
|
|||||||
shutil.copy2(best_model, blla_model_path)
|
shutil.copy2(best_model, blla_model_path)
|
||||||
log.info("Replaced blla model at %s", blla_model_path)
|
log.info("Replaced blla model at %s", blla_model_path)
|
||||||
|
|
||||||
return {"loss": None, "accuracy": accuracy, "cer": None, "epochs": epochs}
|
return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs}
|
||||||
|
|
||||||
result = await asyncio.to_thread(_run_segtrain)
|
result = await asyncio.to_thread(_run_segtrain)
|
||||||
return result
|
return result
|
||||||
|
|||||||
69
ocr-service/test_ensure_blla_model.py
Normal file
69
ocr-service/test_ensure_blla_model.py
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
"""Unit tests for ensure_blla_model.main()."""
|
||||||
|
|
||||||
|
from unittest.mock import MagicMock, call, patch
|
||||||
|
|
||||||
|
import ensure_blla_model
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Model already loadable ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_main_returns_early_when_model_is_loadable():
|
||||||
|
"""When the model exists and loads cleanly, no download or rename occurs."""
|
||||||
|
with (
|
||||||
|
patch("os.path.exists", return_value=True),
|
||||||
|
patch.object(ensure_blla_model, "_model_is_loadable", return_value=True),
|
||||||
|
patch.object(ensure_blla_model, "_download_blla") as mock_download,
|
||||||
|
patch("os.rename") as mock_rename,
|
||||||
|
):
|
||||||
|
ensure_blla_model.main()
|
||||||
|
|
||||||
|
mock_download.assert_not_called()
|
||||||
|
mock_rename.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Model exists but is incompatible ─────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_main_replaces_incompatible_model():
|
||||||
|
"""An incompatible model is renamed and replaced with a fresh download."""
|
||||||
|
fake_path = "/app/models/blla.mlmodel"
|
||||||
|
downloaded_path = "/tmp/downloaded.mlmodel"
|
||||||
|
|
||||||
|
with (
|
||||||
|
patch.object(ensure_blla_model, "BLLA_MODEL_PATH", fake_path),
|
||||||
|
patch("os.path.exists", return_value=True),
|
||||||
|
patch.object(ensure_blla_model, "_model_is_loadable", return_value=False),
|
||||||
|
patch.object(ensure_blla_model, "_download_blla", return_value=downloaded_path),
|
||||||
|
patch("os.rename") as mock_rename,
|
||||||
|
patch("shutil.copy2") as mock_copy,
|
||||||
|
patch("os.makedirs"),
|
||||||
|
):
|
||||||
|
ensure_blla_model.main()
|
||||||
|
|
||||||
|
mock_rename.assert_called_once_with(fake_path, fake_path + ".incompatible")
|
||||||
|
mock_copy.assert_called_once_with(downloaded_path, fake_path)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Model missing ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_main_downloads_when_model_missing():
|
||||||
|
"""When the model file doesn't exist at all, it is downloaded without rename."""
|
||||||
|
fake_path = "/app/models/blla.mlmodel"
|
||||||
|
downloaded_path = "/tmp/downloaded.mlmodel"
|
||||||
|
|
||||||
|
with (
|
||||||
|
patch.object(ensure_blla_model, "BLLA_MODEL_PATH", fake_path),
|
||||||
|
patch("os.path.exists", return_value=False),
|
||||||
|
patch.object(ensure_blla_model, "_model_is_loadable") as mock_loadable,
|
||||||
|
patch.object(ensure_blla_model, "_download_blla", return_value=downloaded_path),
|
||||||
|
patch("os.rename") as mock_rename,
|
||||||
|
patch("shutil.copy2") as mock_copy,
|
||||||
|
patch("os.makedirs"),
|
||||||
|
):
|
||||||
|
ensure_blla_model.main()
|
||||||
|
|
||||||
|
mock_loadable.assert_not_called()
|
||||||
|
mock_rename.assert_not_called()
|
||||||
|
mock_copy.assert_called_once_with(downloaded_path, fake_path)
|
||||||
Reference in New Issue
Block a user