From b017da22c32f6df69c6bf7b0b42f7142e98ed85d Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 9 May 2026 14:20:25 +0200 Subject: [PATCH] feat(fts): push FTS pagination into SQL via CTE window function Pure-text RELEVANCE queries now use findFtsPageRaw (CTE + COUNT(*) OVER()) instead of loading all matching IDs into memory and sorting in-process. Non-text paths (filters active, DATE sort) still use the in-memory path. Co-Authored-By: Claude Sonnet 4.6 --- .../document/DocumentRepository.java | 39 +++++++++- .../document/DocumentService.java | 72 ++++++++++++++++--- 2 files changed, 100 insertions(+), 11 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java index 66ed5d4a..e907e5f2 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java @@ -100,9 +100,46 @@ public interface DocumentRepository extends JpaRepository, JpaSp ORDER BY ts_rank(d.search_vector, q.pq) DESC, d.meta_date DESC NULLS LAST """) - // Unpaged path — use findFtsPageRaw for paginated search results + // Unpaged path — for bulk-edit "select all" and density chart List findAllMatchingIdsByFts(@Param("query") String query); + /** + * Returns one page of FTS-ranked document IDs with the total match count. + * + *

<p>Each row contains (in column order): + * <ol> + * <li>UUID — document id</li> + * <li>double — ts_rank score</li> + * <li>long — COUNT(*) OVER () — full match count, not page count</li> + * </ol> + * + * <p>
Returns an empty list when the query matches no documents (including + * stopword-only queries where websearch_to_tsquery returns an empty tsquery). + * Use findAllMatchingIdsByFts for the unpaged bulk-edit path. + */ + @Query(nativeQuery = true, value = """ + WITH q AS ( + SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> '' + THEN to_tsquery('simple', regexp_replace( + websearch_to_tsquery('german', :query)::text, + '''([^'']+)''', + '''\\1'':*', + 'g')) + END AS pq + ), matches AS ( + SELECT d.id, ts_rank(d.search_vector, q.pq) AS rank + FROM documents d, q + WHERE d.search_vector @@ q.pq + ) + SELECT id, rank, COUNT(*) OVER () AS total + FROM matches + ORDER BY rank DESC, id + OFFSET :offset LIMIT :limit + """) + List findFtsPageRaw(@Param("query") String query, + @Param("offset") int offset, + @Param("limit") int limit); + /** * Returns match-enrichment data for a set of documents identified by their IDs. * Each row contains (in column order): diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java index cc126b37..0a056b7b 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java @@ -645,8 +645,16 @@ public class DocumentService { // 1. Allgemeine Suche (für das Suchfeld im Frontend) public DocumentSearchResult searchDocuments(String text, LocalDate from, LocalDate to, UUID sender, UUID receiver, List tags, String tagQ, DocumentStatus status, DocumentSort sort, String dir, TagOperator tagOperator, Pageable pageable) { boolean hasText = StringUtils.hasText(text); - List rankedIds = null; + // Pure-text RELEVANCE: push pagination into SQL — skip findAllMatchingIdsByFts entirely (ADR-008). 
+ boolean pureTextRelevance = hasText && (sort == null || sort == DocumentSort.RELEVANCE) + && from == null && to == null && sender == null && receiver == null + && (tags == null || tags.isEmpty()) && (tagQ == null || tagQ.isBlank()) && status == null; + if (pureTextRelevance) { + return relevanceSortedPageFromSql(text, pageable); + } + + List rankedIds = null; if (hasText) { rankedIds = documentRepository.findAllMatchingIdsByFts(text); if (rankedIds.isEmpty()) return DocumentSearchResult.of(List.of()); @@ -655,29 +663,28 @@ public class DocumentService { Specification spec = buildSearchSpec( hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator); - // SENDER, RECEIVER and RELEVANCE sorts load the full match set and slice in memory. + // SENDER and RECEIVER sorts load the full match set and slice in-memory. // JPA's Sort.by("sender.lastName") generates an INNER JOIN that silently drops - // documents with null sender/receivers; RELEVANCE maps a DB order to an external - // rank list. Cost scales linearly with match count — acceptable while documents - // stays under ~10k rows. Past that, replace with SQL-level LEFT JOIN sort. + // documents with null sender/receivers. Cost scales with match count — + // acceptable while documents stays under ~10k rows. (ADR-008) if (sort == DocumentSort.RECEIVER) { + // Full match set is sorted in memory; pageSlice then cuts out the requested page List sorted = sortByFirstReceiver(documentRepository.findAll(spec), dir); return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size()); } if (sort == DocumentSort.SENDER) { + // Full match set is sorted in memory; pageSlice then cuts out the requested page List sorted = sortBySender(documentRepository.findAll(spec), dir); return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size()); } - // RELEVANCE: default when text present and no explicit sort given + // RELEVANCE with active filters: load filtered subset and sort in-memory by rank. 
boolean useRankOrder = hasText && (sort == null || sort == DocumentSort.RELEVANCE); if (useRankOrder) { - List results = documentRepository.findAll(spec); Map rankMap = new HashMap<>(); for (int i = 0; i < rankedIds.size(); i++) rankMap.put(rankedIds.get(i), i); - List sorted = results.stream() - .sorted(Comparator.comparingInt( - doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE))) + List sorted = documentRepository.findAll(spec).stream() + .sorted(Comparator.comparingInt(doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE))) .toList(); return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size()); } @@ -688,6 +695,29 @@ public class DocumentService { return buildResultPaged(page.getContent(), text, pageable, page.getTotalElements()); } + /** + * Pure-text RELEVANCE path — pagination and ts_rank ordering pushed into SQL. + * Called when no non-text filters are active (ADR-008). + */ + private DocumentSearchResult relevanceSortedPageFromSql(String text, Pageable pageable) { + int offset = (int) pageable.getOffset(); + int limit = pageable.getPageSize(); + FtsPage ftsPage = toFtsPage(documentRepository.findFtsPageRaw(text, offset, limit)); + if (ftsPage.hits().isEmpty()) return DocumentSearchResult.of(List.of()); + + // Preserve ts_rank order from SQL across the JPA findAllById call. 
+ Map rankMap = new HashMap<>(); + List pageIds = new ArrayList<>(); + for (int i = 0; i < ftsPage.hits().size(); i++) { + rankMap.put(ftsPage.hits().get(i).id(), i); + pageIds.add(ftsPage.hits().get(i).id()); + } + List docs = documentRepository.findAllById(pageIds).stream() + .sorted(Comparator.comparingInt(d -> rankMap.getOrDefault(d.getId(), Integer.MAX_VALUE))) + .toList(); + return buildResultPaged(docs, text, pageable, ftsPage.total()); + } + private static List pageSlice(List sorted, Pageable pageable) { int from = Math.min((int) pageable.getOffset(), sorted.size()); int to = Math.min(from + pageable.getPageSize(), sorted.size()); @@ -1013,6 +1043,28 @@ public class DocumentService { return result; } + private static final int COL_ID = 0; + private static final int COL_RANK = 1; + private static final int COL_TOTAL = 2; + + /** + * Maps raw Object[] rows from {@link DocumentRepository#findFtsPageRaw} to an + * {@link FtsPage}. Uses pattern-matching UUID cast to guard against driver-level + * type variance (some JDBC drivers return UUID as String). + */ + private static FtsPage toFtsPage(List rows) { + if (rows.isEmpty()) return new FtsPage(List.of(), 0); + long total = ((Number) rows.get(0)[COL_TOTAL]).longValue(); + List hits = rows.stream() + .map(r -> { + UUID id = r[COL_ID] instanceof UUID u ? u : UUID.fromString(r[COL_ID].toString()); + double rank = ((Number) r[COL_RANK]).doubleValue(); + return new FtsHit(id, rank); + }) + .toList(); + return new FtsPage(hits, total); + } + /** Clean text + highlight offsets parsed from a {@code ts_headline} sentinel-delimited string. */ public record ParsedHighlight(String cleanText, List offsets) {}