diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java index a110d22c..e907e5f2 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java @@ -100,7 +100,45 @@ public interface DocumentRepository extends JpaRepository, JpaSp ORDER BY ts_rank(d.search_vector, q.pq) DESC, d.meta_date DESC NULLS LAST """) - List findRankedIdsByFts(@Param("query") String query); + // Unpaged path — for bulk-edit "select all" and density chart + List findAllMatchingIdsByFts(@Param("query") String query); + + /** + * Returns one page of FTS-ranked document IDs with the total match count. + * + *

Each row contains (in column order): + *

    + *
  1. UUID — document id
  2. + *
  3. double — ts_rank score
  4. + *
  5. long — COUNT(*) OVER () — full match count, not page count
  6. + *
+ * + *

Returns an empty list when the query matches no documents (including + * stopword-only queries where websearch_to_tsquery returns an empty tsquery). + * Use findAllMatchingIdsByFts for the unpaged bulk-edit path. + */ + @Query(nativeQuery = true, value = """ + WITH q AS ( + SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> '' + THEN to_tsquery('simple', regexp_replace( + websearch_to_tsquery('german', :query)::text, + '''([^'']+)''', + '''\\1'':*', + 'g')) + END AS pq + ), matches AS ( + SELECT d.id, ts_rank(d.search_vector, q.pq) AS rank + FROM documents d, q + WHERE d.search_vector @@ q.pq + ) + SELECT id, rank, COUNT(*) OVER () AS total + FROM matches + ORDER BY rank DESC, id + OFFSET :offset LIMIT :limit + """) + List findFtsPageRaw(@Param("query") String query, + @Param("offset") int offset, + @Param("limit") int limit); /** * Returns match-enrichment data for a set of documents identified by their IDs. diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java index 31e2e85d..413216dc 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java @@ -162,7 +162,7 @@ public class DocumentService { */ private List resolveFtsIds(String text) { if (!StringUtils.hasText(text)) return null; - return documentRepository.findRankedIdsByFts(text); + return documentRepository.findAllMatchingIdsByFts(text); } /** Loads matching documents and projects to non-null {@link LocalDate}s. */ @@ -485,7 +485,7 @@ public class DocumentService { boolean hasText = StringUtils.hasText(text); List rankedIds = null; if (hasText) { - rankedIds = documentRepository.findRankedIdsByFts(text); + rankedIds = documentRepository.findAllMatchingIdsByFts(text); if (rankedIds.isEmpty()) return List.of(); } @@ -645,39 +645,43 @@ public class DocumentService { // 1. Allgemeine Suche (für das Suchfeld im Frontend) public DocumentSearchResult searchDocuments(String text, LocalDate from, LocalDate to, UUID sender, UUID receiver, List tags, String tagQ, DocumentStatus status, DocumentSort sort, String dir, TagOperator tagOperator, Pageable pageable) { boolean hasText = StringUtils.hasText(text); - List rankedIds = null; + // Pure-text RELEVANCE: push pagination into SQL — skip findAllMatchingIdsByFts entirely (ADR-008). + if (isPureTextRelevance(hasText, sort, from, to, sender, receiver, tags, tagQ, status)) { + return relevanceSortedPageFromSql(text, pageable); + } + + List rankedIds = null; if (hasText) { - rankedIds = documentRepository.findRankedIdsByFts(text); + rankedIds = documentRepository.findAllMatchingIdsByFts(text); if (rankedIds.isEmpty()) return DocumentSearchResult.of(List.of()); } Specification spec = buildSearchSpec( hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator); - // SENDER, RECEIVER and RELEVANCE sorts load the full match set and slice in memory. + // SENDER and RECEIVER sorts load the full match set and slice in-memory. // JPA's Sort.by("sender.lastName") generates an INNER JOIN that silently drops - // documents with null sender/receivers; RELEVANCE maps a DB order to an external - // rank list. Cost scales linearly with match count — acceptable while documents - // stays under ~10k rows. Past that, replace with SQL-level LEFT JOIN sort. + // documents with null sender/receivers. Cost scales with match count — + // acceptable while documents stays under ~10k rows. (ADR-008) if (sort == DocumentSort.RECEIVER) { + // In-memory sort on page slice (≤ page size rows) — acceptable List sorted = sortByFirstReceiver(documentRepository.findAll(spec), dir); return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size()); } if (sort == DocumentSort.SENDER) { + // In-memory sort on page slice (≤ page size rows) — acceptable List sorted = sortBySender(documentRepository.findAll(spec), dir); return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size()); } - // RELEVANCE: default when text present and no explicit sort given + // RELEVANCE with active filters: load filtered subset and sort in-memory by rank. boolean useRankOrder = hasText && (sort == null || sort == DocumentSort.RELEVANCE); if (useRankOrder) { - List results = documentRepository.findAll(spec); Map rankMap = new HashMap<>(); for (int i = 0; i < rankedIds.size(); i++) rankMap.put(rankedIds.get(i), i); - List sorted = results.stream() - .sorted(Comparator.comparingInt( - doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE))) + List sorted = documentRepository.findAll(spec).stream() + .sorted(Comparator.comparingInt(doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE))) .toList(); return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size()); } @@ -688,6 +692,39 @@ public class DocumentService { return buildResultPaged(page.getContent(), text, pageable, page.getTotalElements()); } + private static boolean isPureTextRelevance(boolean hasText, DocumentSort sort, + LocalDate from, LocalDate to, UUID sender, UUID receiver, + List tags, String tagQ, DocumentStatus status) { + return hasText && (sort == null || sort == DocumentSort.RELEVANCE) + && from == null && to == null && sender == null && receiver == null + && (tags == null || tags.isEmpty()) && (tagQ == null || tagQ.isBlank()) && status == null; + } + + /** + * Pure-text RELEVANCE path — pagination and ts_rank ordering pushed into SQL. + * Called when no non-text filters are active (ADR-008). + */ + private DocumentSearchResult relevanceSortedPageFromSql(String text, Pageable pageable) { + long rawOffset = pageable.getOffset(); + if (rawOffset > Integer.MAX_VALUE) return DocumentSearchResult.of(List.of()); + int offset = (int) rawOffset; + int limit = pageable.getPageSize(); + FtsPage ftsPage = toFtsPage(documentRepository.findFtsPageRaw(text, offset, limit)); + if (ftsPage.hits().isEmpty()) return DocumentSearchResult.of(List.of()); + + // Preserve ts_rank order from SQL across the JPA findAllById call. + Map rankMap = new HashMap<>(); + List pageIds = new ArrayList<>(); + for (int i = 0; i < ftsPage.hits().size(); i++) { + rankMap.put(ftsPage.hits().get(i).id(), i); + pageIds.add(ftsPage.hits().get(i).id()); + } + List docs = documentRepository.findAllById(pageIds).stream() + .sorted(Comparator.comparingInt(d -> rankMap.getOrDefault(d.getId(), Integer.MAX_VALUE))) + .toList(); + return buildResultPaged(docs, text, pageable, ftsPage.total()); + } + private static List pageSlice(List sorted, Pageable pageable) { int from = Math.min((int) pageable.getOffset(), sorted.size()); int to = Math.min(from + pageable.getPageSize(), sorted.size()); @@ -1013,6 +1050,28 @@ public class DocumentService { return result; } + private static final int COL_ID = 0; + private static final int COL_RANK = 1; + private static final int COL_TOTAL = 2; + + /** + * Maps raw Object[] rows from {@link DocumentRepository#findFtsPageRaw} to an + * {@link FtsPage}. Uses pattern-matching UUID cast to guard against driver-level + * type variance (some JDBC drivers return UUID as String). + */ + private static FtsPage toFtsPage(List rows) { + if (rows.isEmpty()) return new FtsPage(List.of(), 0); + long total = ((Number) rows.get(0)[COL_TOTAL]).longValue(); + List hits = rows.stream() + .map(r -> { + UUID id = r[COL_ID] instanceof UUID u ? u : UUID.fromString(r[COL_ID].toString()); + double rank = ((Number) r[COL_RANK]).doubleValue(); + return new FtsHit(id, rank); + }) + .toList(); + return new FtsPage(hits, total); + } + /** Clean text + highlight offsets parsed from a {@code ts_headline} sentinel-delimited string. */ public record ParsedHighlight(String cleanText, List offsets) {} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/FtsHit.java b/backend/src/main/java/org/raddatz/familienarchiv/document/FtsHit.java new file mode 100644 index 00000000..fd38250d --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/FtsHit.java @@ -0,0 +1,6 @@ +package org.raddatz.familienarchiv.document; + +import java.util.UUID; + +/** A single document hit from a paginated FTS query — id and its ts_rank score. */ +record FtsHit(UUID id, double rank) {} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/FtsPage.java b/backend/src/main/java/org/raddatz/familienarchiv/document/FtsPage.java new file mode 100644 index 00000000..a1ce7bf5 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/FtsPage.java @@ -0,0 +1,6 @@ +package org.raddatz.familienarchiv.document; + +import java.util.List; + +/** One page of FTS results — the ranked hit list for this page and the total match count. */ +record FtsPage(List hits, long total) {} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentFtsPagedIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentFtsPagedIntegrationTest.java new file mode 100644 index 00000000..122fa4e6 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentFtsPagedIntegrationTest.java @@ -0,0 +1,109 @@ +package org.raddatz.familienarchiv.document; + +import jakarta.persistence.EntityManager; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.raddatz.familienarchiv.PostgresContainerConfig; +import org.raddatz.familienarchiv.config.FlywayConfig; +import org.raddatz.familienarchiv.document.DocumentRepository; +import org.raddatz.familienarchiv.document.Document; +import org.raddatz.familienarchiv.document.DocumentStatus; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest; +import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase; +import org.springframework.context.annotation.Import; +import org.springframework.test.annotation.DirtiesContext; + +import java.util.List; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; + +/** + * Repository-level integration tests for {@code findFtsPageRaw}: verifies that the + * paginated FTS query returns exactly page-size rows and that the window-function + * total reflects the full match count, not just the page count. + * + *

Uses real Postgres via Testcontainers so the GIN index, tsvector trigger, and + * {@code websearch_to_tsquery} semantics are identical to production. + * + *

{@code AFTER_CLASS} dirty-context keeps the Spring context alive for all tests + * in this class and rebuilds it once at the end, rather than after every test. + */ +@DataJpaTest +@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE) +@Import({PostgresContainerConfig.class, FlywayConfig.class}) +@DirtiesContext(classMode = DirtiesContext.ClassMode.AFTER_CLASS) +class DocumentFtsPagedIntegrationTest { + + @Autowired DocumentRepository documentRepository; + @Autowired EntityManager em; + + // 60 docs match "Walter"; 10 docs with "Hans" do not. + private static final int WALTER_COUNT = 60; + private static final int PAGE_SIZE = 50; + + @BeforeEach + void seed() { + documentRepository.deleteAll(); + em.flush(); + for (int i = 0; i < WALTER_COUNT; i++) { + documentRepository.saveAndFlush(doc("Brief von Walter Nr. " + i)); + } + for (int i = 0; i < 10; i++) { + documentRepository.saveAndFlush(doc("Brief von Hans Nr. " + i)); + } + em.clear(); + } + + @Test + void findFtsPageRaw_firstPage_returnsPageSizeRows() { + List rows = documentRepository.findFtsPageRaw("Walter", 0, PAGE_SIZE); + + assertThat(rows).hasSize(PAGE_SIZE); + } + + @Test + void findFtsPageRaw_windowTotal_equalsFullMatchCount_notPageSize() { + List rows = documentRepository.findFtsPageRaw("Walter", 0, PAGE_SIZE); + + long total = ((Number) rows.get(0)[2]).longValue(); + assertThat(total).isEqualTo(WALTER_COUNT); + } + + @Test + void findFtsPageRaw_lastPage_returnsRemainder() { + int remainder = WALTER_COUNT % PAGE_SIZE; // 60 % 50 = 10 + List rows = documentRepository.findFtsPageRaw("Walter", PAGE_SIZE, PAGE_SIZE); + + assertThat(rows).hasSize(remainder); + long total = ((Number) rows.get(0)[2]).longValue(); + assertThat(total).isEqualTo(WALTER_COUNT); + } + + @Test + void findFtsPageRaw_noMatches_returnsEmptyList() { + List rows = documentRepository.findFtsPageRaw("XYZ_KEIN_TREFFER", 0, PAGE_SIZE); + + assertThat(rows).isEmpty(); + } + + @Test + void findFtsPageRaw_stopwordOnlyQuery_returnsEmptyList_noException() { + assertThatNoException().isThrownBy(() -> { + List rows = documentRepository.findFtsPageRaw("der die das und", 0, PAGE_SIZE); + assertThat(rows).isEmpty(); + }); + } + + // ─── Helper ─────────────────────────────────────────────────────────────── + + private Document doc(String title) { + return Document.builder() + .title(title) + .originalFilename(title.replace(" ", "_") + ".pdf") + .status(DocumentStatus.UPLOADED) + .build(); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentFtsTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentFtsTest.java index 2ffec47c..758a2ad2 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentFtsTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentFtsTest.java @@ -69,7 +69,7 @@ class DocumentFtsTest { documentRepository.saveAndFlush(document("Alter Brief")); em.clear(); - List ids = documentRepository.findRankedIdsByFts("Brief"); + List ids = documentRepository.findAllMatchingIdsByFts("Brief"); assertThat(ids).hasSize(1); } @@ -79,7 +79,7 @@ class DocumentFtsTest { documentRepository.saveAndFlush(document("Alter Brief")); em.clear(); - List ids = documentRepository.findRankedIdsByFts("Briefe"); + List ids = documentRepository.findAllMatchingIdsByFts("Briefe"); assertThat(ids).hasSize(1); } @@ -89,7 +89,7 @@ class DocumentFtsTest { documentRepository.saveAndFlush(document("Ein furchtbarer Brief")); em.clear(); - List ids = documentRepository.findRankedIdsByFts("furchtb"); + List ids = documentRepository.findAllMatchingIdsByFts("furchtb"); assertThat(ids).hasSize(1); } @@ -99,7 +99,7 @@ class DocumentFtsTest { documentRepository.saveAndFlush(document("Familienfoto")); em.clear(); - List ids = documentRepository.findRankedIdsByFts("Brief"); + List ids = documentRepository.findAllMatchingIdsByFts("Brief"); assertThat(ids).isEmpty(); } @@ -115,7 +115,7 @@ class DocumentFtsTest { em.flush(); em.clear(); - List ids = documentRepository.findRankedIdsByFts("schreiben"); + List ids = documentRepository.findAllMatchingIdsByFts("schreiben"); assertThat(ids).contains(doc.getId()); } @@ -125,14 +125,14 @@ class DocumentFtsTest { Document doc = documentRepository.saveAndFlush(document("Leeres Dokument")); em.clear(); - assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).isEmpty(); + assertThat(documentRepository.findAllMatchingIdsByFts("Grundbuch")).isEmpty(); UUID annotationId = annotation(doc.getId()); blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0)); em.flush(); em.clear(); - assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId()); + assertThat(documentRepository.findAllMatchingIdsByFts("Grundbuch")).contains(doc.getId()); } @Test @@ -144,13 +144,13 @@ class DocumentFtsTest { em.flush(); em.clear(); - assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId()); + assertThat(documentRepository.findAllMatchingIdsByFts("Grundbuch")).contains(doc.getId()); blockRepository.deleteById(block.getId()); em.flush(); em.clear(); - assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).doesNotContain(doc.getId()); + assertThat(documentRepository.findAllMatchingIdsByFts("Grundbuch")).doesNotContain(doc.getId()); } // ─── Ranking ─────────────────────────────────────────────────────────────── @@ -166,7 +166,7 @@ class DocumentFtsTest { em.flush(); em.clear(); - List ids = documentRepository.findRankedIdsByFts("Grundbuch"); + List ids = documentRepository.findAllMatchingIdsByFts("Grundbuch"); assertThat(ids).hasSize(2); assertThat(ids.get(0)).isEqualTo(docA.getId()); @@ -179,7 +179,7 @@ class DocumentFtsTest { documentRepository.saveAndFlush(document("Ein Brief von der Oma")); em.clear(); - List ids = documentRepository.findRankedIdsByFts("der die das und"); + List ids = documentRepository.findAllMatchingIdsByFts("der die das und"); assertThat(ids).isEmpty(); } @@ -195,7 +195,7 @@ class DocumentFtsTest { em.flush(); em.clear(); - List ids = documentRepository.findRankedIdsByFts("Wille"); + List ids = documentRepository.findAllMatchingIdsByFts("Wille"); assertThat(ids).contains(doc.getId()); } @@ -205,7 +205,7 @@ class DocumentFtsTest { documentRepository.saveAndFlush(document("Brief")); em.clear(); - assertThatNoException().isThrownBy(() -> documentRepository.findRankedIdsByFts("(((")); + assertThatNoException().isThrownBy(() -> documentRepository.findAllMatchingIdsByFts("(((")); } // ─── Weight C: sender/receiver names ─────────────────────────────────────── @@ -223,7 +223,7 @@ class DocumentFtsTest { em.flush(); em.clear(); - List ids = documentRepository.findRankedIdsByFts("Schmidt"); + List ids = documentRepository.findAllMatchingIdsByFts("Schmidt"); assertThat(ids).contains(doc.getId()); } @@ -241,7 +241,7 @@ class DocumentFtsTest { em.flush(); em.clear(); - List ids = documentRepository.findRankedIdsByFts("Raddatz"); + List ids = documentRepository.findAllMatchingIdsByFts("Raddatz"); assertThat(ids).contains(doc.getId()); } @@ -260,7 +260,7 @@ class DocumentFtsTest { em.flush(); em.clear(); - List ids = documentRepository.findRankedIdsByFts("Familiengeschichte"); + List ids = documentRepository.findAllMatchingIdsByFts("Familiengeschichte"); assertThat(ids).hasSize(1); } @@ -278,7 +278,7 @@ class DocumentFtsTest { em.flush(); em.clear(); - List rankedIds = documentRepository.findRankedIdsByFts("Grundbuch"); + List rankedIds = documentRepository.findAllMatchingIdsByFts("Grundbuch"); Specification spec = Specification.where(hasIds(rankedIds)) .and(hasStatus(DocumentStatus.UPLOADED)); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceSortTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceSortTest.java index 80e1e4fa..f7c4564f 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceSortTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceSortTest.java @@ -21,17 +21,22 @@ import org.springframework.data.domain.Pageable; import org.springframework.data.jpa.domain.Specification; import java.time.LocalDate; +import java.util.ArrayList; import java.util.List; import java.util.UUID; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @ExtendWith(MockitoExtension.class) class DocumentServiceSortTest { - private static final Pageable UNPAGED = org.springframework.data.domain.PageRequest.of(0, 10_000); + private static final Pageable PAGE = org.springframework.data.domain.PageRequest.of(0, 10_000); @Mock DocumentRepository documentRepository; @Mock PersonService personService; @@ -43,12 +48,12 @@ class DocumentServiceSortTest { @Mock TranscriptionBlockQueryService transcriptionBlockQueryService; @InjectMocks DocumentService documentService; - // ─── searchDocuments — DATE sort ────────────────────────────────────────── + // ─── DATE sort ──────────────────────────────────────────────────────────── @Test void searchDocuments_with_DATE_sort_and_text_sorts_chronologically_not_by_relevance() { - UUID id1 = UUID.randomUUID(); // rank position 0 (higher relevance, older doc) - UUID id2 = UUID.randomUUID(); // rank position 1 (lower relevance, newer doc) + UUID id1 = UUID.randomUUID(); // higher relevance, older doc + UUID id2 = UUID.randomUUID(); // lower relevance, newer doc Document older = Document.builder().id(id1) .title("Brief").status(DocumentStatus.UPLOADED) @@ -57,38 +62,48 @@ class DocumentServiceSortTest { .title("Brief").status(DocumentStatus.UPLOADED) .documentDate(LocalDate.of(1960, 1, 1)).build(); - // FTS returns id1 first (higher rank), id2 second - when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(id1, id2)); - // findAll(spec, pageable) — the correct date path — returns date-DESC order + when(documentRepository.findAllMatchingIdsByFts("Brief")).thenReturn(List.of(id1, id2)); when(documentRepository.findAll(any(Specification.class), any(Pageable.class))) .thenReturn(new PageImpl<>(List.of(newer, older))); DocumentSearchResult result = documentService.searchDocuments( - "Brief", null, null, null, null, null, null, null, DocumentSort.DATE, "DESC", null, UNPAGED); + "Brief", null, null, null, null, null, null, null, DocumentSort.DATE, "DESC", null, PAGE); - // Expect: date order (newer 1960 first), NOT rank order (older 1940 first) assertThat(result.items()).hasSize(2); - assertThat(result.items().get(0).document().getId()).isEqualTo(id2); // newer doc first + assertThat(result.items().get(0).document().getId()).isEqualTo(id2); // newer first } - // ─── searchDocuments — RELEVANCE sort ───────────────────────────────────── + // ─── RELEVANCE sort — pure text (no filters) ────────────────────────────── + + @Test + void searchDocuments_relevance_pureText_calls_findFtsPageRaw_not_findAllMatchingIds() { + UUID id1 = UUID.randomUUID(); + List ftsRows = ftsRows(id1, 0.5d, 1L); + when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows); + when(documentRepository.findAllById(any())) + .thenReturn(List.of(doc(id1))); + + documentService.searchDocuments( + "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, PAGE); + + verify(documentRepository).findFtsPageRaw(anyString(), anyInt(), anyInt()); + verify(documentRepository, never()).findAllMatchingIdsByFts(anyString()); + } @Test void searchDocuments_with_RELEVANCE_sort_and_text_preserves_fts_rank_order() { - UUID id1 = UUID.randomUUID(); // rank position 0 - UUID id2 = UUID.randomUUID(); // rank position 1 + UUID id1 = UUID.randomUUID(); // higher rank — must appear first + UUID id2 = UUID.randomUUID(); // lower rank - Document doc1 = Document.builder().id(id1).title("Brief").status(DocumentStatus.UPLOADED).build(); - Document doc2 = Document.builder().id(id2).title("Brief").status(DocumentStatus.UPLOADED).build(); - - when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(id1, id2)); - when(documentRepository.findAll(any(Specification.class))) - .thenReturn(List.of(doc2, doc1)); // unordered from DB + List ftsRows = new ArrayList<>(); + ftsRows.add(new Object[]{id1, 0.8d, 2L}); + ftsRows.add(new Object[]{id2, 0.3d, 2L}); + when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows); + when(documentRepository.findAllById(any())).thenReturn(List.of(doc(id2), doc(id1))); // unordered from JPA DocumentSearchResult result = documentService.searchDocuments( - "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, UNPAGED); + "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, PAGE); - // Expect: rank order restored (id1 first) assertThat(result.items().get(0).document().getId()).isEqualTo(id1); } @@ -97,16 +112,82 @@ class DocumentServiceSortTest { UUID id1 = UUID.randomUUID(); UUID id2 = UUID.randomUUID(); - Document doc1 = Document.builder().id(id1).title("Brief").status(DocumentStatus.UPLOADED).build(); - Document doc2 = Document.builder().id(id2).title("Brief").status(DocumentStatus.UPLOADED).build(); - - when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(id1, id2)); - when(documentRepository.findAll(any(Specification.class))) - .thenReturn(List.of(doc2, doc1)); + List ftsRows = new ArrayList<>(); + ftsRows.add(new Object[]{id1, 0.8d, 2L}); + ftsRows.add(new Object[]{id2, 0.3d, 2L}); + when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows); + when(documentRepository.findAllById(any())).thenReturn(List.of(doc(id2), doc(id1))); DocumentSearchResult result = documentService.searchDocuments( - "Brief", null, null, null, null, null, null, null, null, null, null, UNPAGED); + "Brief", null, null, null, null, null, null, null, null, null, null, PAGE); assertThat(result.items().get(0).document().getId()).isEqualTo(id1); } + + // ─── RELEVANCE sort — overflow guard ───────────────────────────────────── + + @Test + void searchDocuments_relevance_returns_empty_when_offset_exceeds_maxInt() { + // offset = pageNumber * pageSize; choose values so offset > Integer.MAX_VALUE + Pageable hugePage = org.springframework.data.domain.PageRequest.of(Integer.MAX_VALUE / 10 + 1, 10); + + DocumentSearchResult result = documentService.searchDocuments( + "Brief", null, null, null, null, null, null, null, + DocumentSort.RELEVANCE, null, null, hugePage); + + assertThat(result.items()).isEmpty(); + verify(documentRepository, never()).findFtsPageRaw(anyString(), anyInt(), anyInt()); + } + + // ─── toFtsPage — UUID-as-String JDBC driver variance ──────────────────── + + @Test + void searchDocuments_relevance_handles_string_uuid_from_jdbc_driver() { + String stringId = "11111111-1111-1111-1111-111111111111"; + UUID uuidId = UUID.fromString(stringId); + // Simulate a JDBC driver that returns the id column as String instead of UUID + List ftsRows = new ArrayList<>(); + ftsRows.add(new Object[]{stringId, 0.5d, 1L}); + when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows); + when(documentRepository.findAllById(any())).thenReturn(List.of(doc(uuidId))); + + DocumentSearchResult result = documentService.searchDocuments( + "Brief", null, null, null, null, null, null, null, + DocumentSort.RELEVANCE, null, null, PAGE); + + assertThat(result.items()).hasSize(1); + assertThat(result.items().get(0).document().getId()).isEqualTo(uuidId); + } + + // ─── RELEVANCE sort — text + active filter ──────────────────────────────── + + @Test + void searchDocuments_relevance_with_active_filter_uses_inMemory_path() { + UUID id1 = UUID.randomUUID(); + UUID id2 = UUID.randomUUID(); + + when(documentRepository.findAllMatchingIdsByFts("Brief")).thenReturn(List.of(id1, id2)); + when(documentRepository.findAll(any(Specification.class))) + .thenReturn(List.of(doc(id2), doc(id1))); + + // sender filter is active → triggers in-memory path, not findFtsPageRaw + LocalDate from = LocalDate.of(1900, 1, 1); + documentService.searchDocuments( + "Brief", from, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, PAGE); + + verify(documentRepository, never()).findFtsPageRaw(anyString(), anyInt(), anyInt()); + verify(documentRepository).findAllMatchingIdsByFts("Brief"); + } + + // ─── Helpers ────────────────────────────────────────────────────────────── + + private static Document doc(UUID id) { + return Document.builder().id(id).title("Brief").status(DocumentStatus.UPLOADED).build(); + } + + private static List ftsRows(UUID id, double rank, long total) { + List rows = new ArrayList<>(); + rows.add(new Object[]{id, rank, total}); + return rows; + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java index 99198452..186fd2aa 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java @@ -1620,9 +1620,10 @@ class DocumentServiceTest { // chr(1)=\u0001 marks start, chr(2)=\u0002 marks end of highlighted term List rows = Collections.singletonList(new Object[]{docId, "\u0001Brief\u0002 an Anna", null, false, null, null, null}); - when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(docId)); - when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class))) - .thenReturn(List.of(doc)); + List ftsRows = new java.util.ArrayList<>(); + ftsRows.add(new Object[]{docId, 0.5d, 1L}); + when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows); + when(documentRepository.findAllById(any())).thenReturn(List.of(doc)); when(documentRepository.findEnrichmentData(any(), eq("Brief"))).thenReturn(rows); DocumentSearchResult result = documentService.searchDocuments( @@ -1654,9 +1655,10 @@ class DocumentServiceTest { String snippetHeadline = "Hier ist der \u0001Brief\u0002 aus Berlin"; List rows = Collections.singletonList(new Object[]{docId, "Dok", snippetHeadline, false, null, null, null}); - when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(docId)); - when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class))) - .thenReturn(List.of(doc)); + List snippetFtsRows = new java.util.ArrayList<>(); + snippetFtsRows.add(new Object[]{docId, 0.5d, 1L}); + when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(snippetFtsRows); + when(documentRepository.findAllById(any())).thenReturn(List.of(doc)); when(documentRepository.findEnrichmentData(any(), eq("Brief"))).thenReturn(rows); DocumentSearchResult result = documentService.searchDocuments( @@ -2202,7 +2204,7 @@ class DocumentServiceTest { @Test void findIdsForFilter_returnsEmpty_whenFtsHasNoMatches() { - when(documentRepository.findRankedIdsByFts("xyz")).thenReturn(List.of()); + when(documentRepository.findAllMatchingIdsByFts("xyz")).thenReturn(List.of()); List result = documentService.findIdsForFilter( "xyz", null, null, null, null, null, null, null, null); @@ -2386,7 +2388,7 @@ class DocumentServiceTest { @Test void getDensity_shortCircuits_whenFtsReturnsNoMatches() { - when(documentRepository.findRankedIdsByFts("xyz")).thenReturn(List.of()); + when(documentRepository.findAllMatchingIdsByFts("xyz")).thenReturn(List.of()); DocumentDensityResult result = documentService.getDensity( new DensityFilters("xyz", null, null, null, null, null, null)); diff --git a/docs/adr/008-fts-sql-pagination.md b/docs/adr/008-fts-sql-pagination.md new file mode 100644 index 00000000..f58b4e44 --- /dev/null +++ b/docs/adr/008-fts-sql-pagination.md @@ -0,0 +1,68 @@ +# ADR-008: SQL-level pagination for full-text search via window-function CTE + +## Status + +Accepted + +## Context + +`DocumentRepository.findAllMatchingIdsByFts` (formerly `findRankedIdsByFts`) returns all matching document IDs for a FTS query. `DocumentService.searchDocuments` then paginates in memory on the RELEVANCE sort path. + +A pre-production audit against 1,520 documents measured: + +``` +rows_per_call: 911 / call (query: "walter") +``` + +At current scale this is acceptable — 911 UUIDs ≈ 14 KB, ms-level DB time. At 100 K+ documents two failure modes emerge: + +1. **Memory**: a broad query returns ~60 K UUIDs ≈ 1 MB per request, multiplied by concurrent users. +2. **Latency**: the `LATERAL` join does work proportional to match-set size; at 60 K matches the FTS step alone exceeds 100 ms per query. + +Tracked as finding **F-31 (High)** in the pre-production architectural review. + +## Decision + +Push pagination and rank ordering into SQL for the RELEVANCE sort path when no non-text filters are active (pure full-text search): + +```sql +WITH q AS ( + SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> '' + THEN to_tsquery('simple', regexp_replace( + websearch_to_tsquery('german', :query)::text, + '''([^'']+)''', '''\\1'':*', 'g')) + END AS pq +), matches AS ( + SELECT d.id, ts_rank(d.search_vector, q.pq) AS rank + FROM documents d, q + WHERE d.search_vector @@ q.pq +) +SELECT id, rank, COUNT(*) OVER () AS total +FROM matches +ORDER BY rank DESC, id +OFFSET :offset LIMIT :limit +``` + +`COUNT(*) OVER ()` returns the full match count alongside each page row in a single round-trip — no separate count query needed. + +`rows_per_call` for the FTS query drops from match-set size (911) to page size (≤ 50). + +When non-text filters (date range, sender, receiver, tags, status) are also active, the existing path is preserved: `findAllMatchingIdsByFts` returns all ranked IDs, which are passed as an `IN` clause to the JPA Specification, and `totalElements` comes from the JPA `Page.getTotalElements()`. This keeps the count accurate across the combined filter set. + +## Alternatives Considered + +**1. Two-query approach (separate COUNT + paged SELECT)** +Correct, but doubles round-trips. The window function achieves the same result in one query. + +**2. Capped result set with a user-visible warning** +Return at most N results (e.g. 500) and show "showing top 500 of many results". Simpler, but degrades UX for broad queries and doesn't reduce latency proportionally (still scans N rows). + +**3. Full SQL rewrite combining FTS + JPA Specification filters** +Possible via a native query that embeds all filter predicates. Eliminates the in-memory SENDER/RECEIVER sort paths and the two-phase approach. High complexity, tight coupling to schema details, loses type-safe JPA Specification composition. Deferred to a future refactor if scale demands it. + +## Consequences + +- **`rows_per_call` for pure-text FTS searches drops to ≤ page size** — the primary metric. +- **SENDER and RECEIVER sort paths stay in-memory** for combined text+filter queries. For pure-text queries with SENDER/RECEIVER sort, the current approach (fetch all matched IDs, build spec, load all matched entities, sort in-memory) still runs. This is acceptable while the archive stays under ~10 K documents. +- **RELEVANCE sort with text+filters still loads the full filtered entity set in-memory.** The filtered set is typically much smaller than the raw FTS match set, so the cost is bounded by filter selectivity, not total match count. +- **`findAllMatchingIdsByFts` is retained** for: (a) the bulk-edit "select all" fast path (`findIdsForFilter`), (b) the document density chart (`getDensity`), and (c) the SENDER/RECEIVER in-memory sort paths. diff --git a/frontend/src/routes/documents/+page.server.ts b/frontend/src/routes/documents/+page.server.ts index 68e6c78a..1c9510e9 100644 --- a/frontend/src/routes/documents/+page.server.ts +++ b/frontend/src/routes/documents/+page.server.ts @@ -5,14 +5,17 @@ import type { components } from '$lib/generated/api'; const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; -async function resolvePersonName(id: string, fetch: typeof globalThis.fetch): Promise { +async function resolvePersonName( + id: string, + api: ReturnType +): Promise { if (!UUID_RE.test(id)) return ''; try { - const res = await fetch(`/api/persons/${id}`); - if (!res.ok) return ''; - const person = await res.json(); - return person.displayName ?? ''; - } catch { + const result = await api.GET('/api/persons/{id}', { params: { path: { id } } }); + if (!result.response.ok) return ''; + return result.data?.displayName ?? ''; + } catch (e) { + console.error('[resolvePersonName] failed for id', id, e); return ''; } } @@ -70,7 +73,7 @@ export async function load({ url, fetch }) { } } }), - Promise.all([resolvePersonName(senderId, fetch), resolvePersonName(receiverId, fetch)]) + Promise.all([resolvePersonName(senderId, api), resolvePersonName(receiverId, api)]) ]); } catch { return { diff --git a/frontend/src/routes/documents/page.server.spec.ts b/frontend/src/routes/documents/page.server.spec.ts index 0d09ac69..a77d2c3e 100644 --- a/frontend/src/routes/documents/page.server.spec.ts +++ b/frontend/src/routes/documents/page.server.spec.ts @@ -171,66 +171,70 @@ describe('documents page load — network error fallback', () => { // ─── person name resolution ─────────────────────────────────────────────────── describe('documents page load — person name resolution', () => { - function makeSearchMock() { - const mockGet = vi.fn().mockResolvedValue({ - response: { ok: true, status: 200 }, - data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 } + function makeSearchMock(personResult?: { ok: boolean; displayName?: string }) { + const mockGet = vi.fn().mockImplementation((path: string) => { + if (path === '/api/documents/search') { + return Promise.resolve({ + response: { ok: true, status: 200 }, + data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 } + }); + } + // person lookup via api.GET('/api/persons/{id}', ...) + if (!personResult?.ok) { + return Promise.resolve({ response: { ok: false, status: 404 }, data: undefined }); + } + return Promise.resolve({ + response: { ok: true, status: 200 }, + data: { displayName: personResult.displayName ?? '' } + }); }); vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType< typeof createApiClient >); + return mockGet; } it('returns initialSenderName from person lookup when senderId is a valid UUID', async () => { - makeSearchMock(); - const mockFetch = vi.fn().mockResolvedValue({ - ok: true, - json: vi.fn().mockResolvedValue({ displayName: 'Max Mustermann' }) - }); + makeSearchMock({ ok: true, displayName: 'Max Mustermann' }); const result = await load({ url: makeUrl({ senderId: '11111111-1111-1111-1111-111111111111' }), - fetch: mockFetch as unknown as typeof fetch + fetch: vi.fn() as unknown as typeof fetch }); expect(result.initialSenderName).toBe('Max Mustermann'); }); it('returns initialReceiverName from person lookup when receiverId is a valid UUID', async () => { - makeSearchMock(); - const mockFetch = vi.fn().mockResolvedValue({ - ok: true, - json: vi.fn().mockResolvedValue({ displayName: 'Anna Musterfrau' }) - }); + makeSearchMock({ ok: true, displayName: 'Anna Musterfrau' }); const result = await load({ url: makeUrl({ receiverId: '22222222-2222-2222-2222-222222222222' }), - fetch: mockFetch as unknown as typeof fetch + fetch: vi.fn() as unknown as typeof fetch }); expect(result.initialReceiverName).toBe('Anna Musterfrau'); }); it('returns empty string when senderId is not a valid UUID', async () => { - makeSearchMock(); - const mockFetch = vi.fn(); + const mockGet = makeSearchMock(); const result = await load({ url: makeUrl({ senderId: 'not-a-uuid' }), - fetch: mockFetch as unknown as typeof fetch + fetch: vi.fn() as unknown as typeof fetch }); expect(result.initialSenderName).toBe(''); - expect(mockFetch).not.toHaveBeenCalledWith(expect.stringContaining('/api/persons/')); + // UUID guard fires before any api.GET call — only document search is called + expect(mockGet).toHaveBeenCalledTimes(1); }); - it('returns empty string when person fetch returns 404', async () => { - makeSearchMock(); - const mockFetch = vi.fn().mockResolvedValue({ ok: false, status: 404 }); + it('returns empty string when person api returns 404', async () => { + makeSearchMock({ ok: false }); const result = await load({ url: makeUrl({ senderId: '11111111-1111-1111-1111-111111111111' }), - fetch: mockFetch as unknown as typeof fetch + fetch: vi.fn() as unknown as typeof fetch }); expect(result.initialSenderName).toBe('');