feat(fts): push FTS pagination into SQL via CTE window function

Pure-text RELEVANCE queries now use findFtsPageRaw (CTE + COUNT(*) OVER()) instead of loading all matching IDs into memory and sorting in-process. Non-text paths (filters active, DATE sort) still use the in-memory path. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-09 14:20:25 +02:00
parent fea837b345
commit b017da22c3
2 changed files with 100 additions and 11 deletions
--- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentRepository.java
@@ -100,9 +100,46 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
            ORDER BY ts_rank(d.search_vector, q.pq) DESC,
                     d.meta_date DESC NULLS LAST
            """)
-    // Unpaged path — use findFtsPageRaw for paginated search results
+    // Unpaged path — for bulk-edit "select all" and density chart
    List<UUID> findAllMatchingIdsByFts(@Param("query") String query);

+    /**
+     * Returns one page of FTS-ranked document IDs together with the total match count.
+     *
+     * <p>Each row contains (in column order):
+     * <ol>
+     *   <li>UUID   — document id</li>
+     *   <li>Number — ts_rank score (PostgreSQL {@code real}; read it via {@code Number})</li>
+     *   <li>long   — COUNT(*) OVER () — full match count, not page count</li>
+     * </ol>
+     *
+     * <p>Rows are ordered by rank (descending), then by meta_date (descending, NULLs
+     * last) — the same tie-break findAllMatchingIdsByFts uses — and finally by id so
+     * that page boundaries stay deterministic when rank and date are equal.
+     *
+     * <p>Returns an empty list when the query matches no documents (including
+     * stopword-only queries where websearch_to_tsquery returns an empty tsquery),
+     * and also when offset lies past the last match: the total travels on the
+     * result rows, so no row means no total.
+     * Use findAllMatchingIdsByFts for the unpaged bulk-edit path.
+     *
+     * @param query  raw search text, parsed with websearch_to_tsquery('german', ...)
+     * @param offset number of ranked matches to skip
+     * @param limit  maximum number of rows to return
+     * @return one Object[] per hit, in the column order listed above
+     */
+    @Query(nativeQuery = true, value = """
+            WITH q AS (
+                SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
+                            THEN to_tsquery('simple', regexp_replace(
+                                     websearch_to_tsquery('german', :query)::text,
+                                     '''([^'']+)''',
+                                     '''\\1'':*',
+                                     'g'))
+                       END AS pq
+            ), matches AS (
+                SELECT d.id, d.meta_date, ts_rank(d.search_vector, q.pq) AS rank
+                FROM documents d, q
+                WHERE d.search_vector @@ q.pq
+            )
+            SELECT id, rank, COUNT(*) OVER () AS total
+            FROM matches
+            ORDER BY rank DESC, meta_date DESC NULLS LAST, id
+            OFFSET :offset LIMIT :limit
+            """)
+    List<Object[]> findFtsPageRaw(@Param("query") String query,
+                                  @Param("offset") int offset,
+                                  @Param("limit") int limit);
+
    /**
     * Returns match-enrichment data for a set of documents identified by their IDs.
     * Each row contains (in column order):
--- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java
@@ -645,8 +645,16 @@ public class DocumentService {
    // 1. Allgemeine Suche (für das Suchfeld im Frontend)
    public DocumentSearchResult searchDocuments(String text, LocalDate from, LocalDate to, UUID sender, UUID receiver, List<String> tags, String tagQ, DocumentStatus status, DocumentSort sort, String dir, TagOperator tagOperator, Pageable pageable) {
        boolean hasText = StringUtils.hasText(text);
-        List<UUID> rankedIds = null;

+        // Pure-text RELEVANCE: push pagination into SQL — skip findAllMatchingIdsByFts entirely (ADR-008).
+        boolean pureTextRelevance = hasText && (sort == null || sort == DocumentSort.RELEVANCE)
+                && from == null && to == null && sender == null && receiver == null
+                && (tags == null || tags.isEmpty()) && (tagQ == null || tagQ.isBlank()) && status == null;
+        if (pureTextRelevance) {
+            return relevanceSortedPageFromSql(text, pageable);
+        }
+
+        List<UUID> rankedIds = null;
        if (hasText) {
            rankedIds = documentRepository.findAllMatchingIdsByFts(text);
            if (rankedIds.isEmpty()) return DocumentSearchResult.of(List.of());
@@ -655,29 +663,28 @@ public class DocumentService {
        Specification<Document> spec = buildSearchSpec(
                hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator);

-        // SENDER, RECEIVER and RELEVANCE sorts load the full match set and slice in memory.
+        // SENDER and RECEIVER sorts load the full match set and slice in-memory.
        // JPA's Sort.by("sender.lastName") generates an INNER JOIN that silently drops
-        // documents with null sender/receivers; RELEVANCE maps a DB order to an external
-        // rank list. Cost scales linearly with match count — acceptable while documents
-        // stays under ~10k rows. Past that, replace with SQL-level LEFT JOIN sort.
+        // documents with null sender/receivers. Cost scales with match count —
+        // acceptable while documents stays under ~10k rows. (ADR-008)
        if (sort == DocumentSort.RECEIVER) {
+            // In-memory sort over the full match set, then page slice — acceptable at current scale
            List<Document> sorted = sortByFirstReceiver(documentRepository.findAll(spec), dir);
            return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size());
        }
        if (sort == DocumentSort.SENDER) {
+            // In-memory sort over the full match set, then page slice — acceptable at current scale
            List<Document> sorted = sortBySender(documentRepository.findAll(spec), dir);
            return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size());
        }

-        // RELEVANCE: default when text present and no explicit sort given
+        // RELEVANCE with active filters: load filtered subset and sort in-memory by rank.
        boolean useRankOrder = hasText && (sort == null || sort == DocumentSort.RELEVANCE);
        if (useRankOrder) {
-            List<Document> results = documentRepository.findAll(spec);
            Map<UUID, Integer> rankMap = new HashMap<>();
            for (int i = 0; i < rankedIds.size(); i++) rankMap.put(rankedIds.get(i), i);
-            List<Document> sorted = results.stream()
-                    .sorted(Comparator.comparingInt(
-                            doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE)))
+            List<Document> sorted = documentRepository.findAll(spec).stream()
+                    .sorted(Comparator.comparingInt(doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE)))
                    .toList();
            return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size());
        }
@@ -688,6 +695,29 @@ public class DocumentService {
        return buildResultPaged(page.getContent(), text, pageable, page.getTotalElements());
    }

+    /**
+     * Serves a pure-text RELEVANCE search with ts_rank ordering and paging done in SQL.
+     * Used by searchDocuments when no non-text filter is active (ADR-008).
+     */
+    private DocumentSearchResult relevanceSortedPageFromSql(String text, Pageable pageable) {
+        List<Object[]> rows = documentRepository.findFtsPageRaw(
+                text, (int) pageable.getOffset(), pageable.getPageSize());
+        FtsPage ftsPage = toFtsPage(rows);
+        List<FtsHit> hits = ftsPage.hits();
+        if (hits.isEmpty()) {
+            return DocumentSearchResult.of(List.of());
+        }
+
+        // Remember each id's position in the SQL result so the rank order can be
+        // re-applied to the entities returned by findAllById.
+        List<UUID> pageIds = new ArrayList<>();
+        Map<UUID, Integer> positionById = new HashMap<>();
+        for (FtsHit hit : hits) {
+            positionById.put(hit.id(), pageIds.size());
+            pageIds.add(hit.id());
+        }
+        Comparator<Document> bySqlPosition = Comparator.comparingInt(
+                doc -> positionById.getOrDefault(doc.getId(), Integer.MAX_VALUE));
+        List<Document> docs = documentRepository.findAllById(pageIds).stream()
+                .sorted(bySqlPosition)
+                .toList();
+        return buildResultPaged(docs, text, pageable, ftsPage.total());
+    }
+
    private static <T> List<T> pageSlice(List<T> sorted, Pageable pageable) {
        int from = Math.min((int) pageable.getOffset(), sorted.size());
        int to = Math.min(from + pageable.getPageSize(), sorted.size());
@@ -1013,6 +1043,28 @@ public class DocumentService {
        return result;
    }

+    private static final int COL_ID = 0;
+    private static final int COL_RANK = 1;
+    private static final int COL_TOTAL = 2;
+
+    /**
+     * Converts the raw rows of {@link DocumentRepository#findFtsPageRaw} into an
+     * {@link FtsPage}. An empty row list yields an empty page with total 0;
+     * otherwise the total is taken from the first row's total column.
+     */
+    private static FtsPage toFtsPage(List<Object[]> rows) {
+        if (rows.isEmpty()) {
+            return new FtsPage(List.of(), 0);
+        }
+        long total = ((Number) rows.get(0)[COL_TOTAL]).longValue();
+        List<FtsHit> hits = rows.stream().map(DocumentService::rowToFtsHit).toList();
+        return new FtsPage(hits, total);
+    }
+
+    /**
+     * Builds one {@link FtsHit} from a single result row. The id column is used
+     * directly when it already is a UUID; otherwise its string form is parsed
+     * (some JDBC drivers return UUID as String).
+     */
+    private static FtsHit rowToFtsHit(Object[] row) {
+        Object rawId = row[COL_ID];
+        UUID id;
+        if (rawId instanceof UUID uuid) {
+            id = uuid;
+        } else {
+            id = UUID.fromString(rawId.toString());
+        }
+        double rank = ((Number) row[COL_RANK]).doubleValue();
+        return new FtsHit(id, rank);
+    }
+
    /** Clean text + highlight offsets parsed from a {@code ts_headline} sentinel-delimited string. */
    public record ParsedHighlight(String cleanText, List<MatchOffset> offsets) {}