feat(fts): push FTS pagination into SQL via CTE window function

Pure-text RELEVANCE queries now use findFtsPageRaw (CTE + COUNT(*) OVER())
instead of loading all matching IDs into memory and sorting in-process.
Non-text paths (filters active, DATE sort) still use the in-memory path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-09 14:20:25 +02:00
committed by marcel
parent fea837b345
commit b017da22c3
2 changed files with 100 additions and 11 deletions

View File

@@ -100,9 +100,46 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
ORDER BY ts_rank(d.search_vector, q.pq) DESC,
d.meta_date DESC NULLS LAST
""")
// Unpaged path — use findFtsPageRaw for paginated search results
// Unpaged path — for bulk-edit "select all" and density chart
List<UUID> findAllMatchingIdsByFts(@Param("query") String query);
/**
 * Returns one page of FTS-ranked document IDs together with the total match count.
 *
 * <p>How the native query works:
 * <ul>
 * <li>CTE {@code q} parses {@code :query} with {@code websearch_to_tsquery('german', ...)},
 * appends {@code :*} to every quoted lexeme of its text form and re-parses the result
 * with {@code to_tsquery('simple', ...)}. When the parsed query's text form is empty the
 * {@code CASE} has no matching branch, so {@code pq} is {@code NULL}.</li>
 * <li>CTE {@code matches} selects the id and {@code ts_rank} of every document whose
 * {@code search_vector} matches {@code pq}.</li>
 * <li>The outer select orders by rank descending with {@code id} as tie-breaker and then
 * applies {@code OFFSET}/{@code LIMIT}.</li>
 * </ul>
 *
 * <p>Each row contains (in column order):
 * <ol>
 * <li>UUID — document id</li>
 * <li>double — ts_rank score</li>
 * <li>long — COUNT(*) OVER () — full match count, not page count</li>
 * </ol>
 * The Java types actually delivered for these columns depend on the JDBC driver.
 *
 * <p>Returns an empty list when the query matches no documents (including
 * stopword-only queries where websearch_to_tsquery returns an empty tsquery).
 * An offset at or beyond the number of matches also produces an empty list; because
 * the total is carried on the result rows, it cannot be read from such a page.
 * Use findAllMatchingIdsByFts for the unpaged bulk-edit path.
 *
 * @param query  raw user search text, bound to {@code :query}
 * @param offset number of ranked matches to skip, bound to {@code OFFSET}
 * @param limit  maximum number of rows to return, bound to {@code LIMIT}
 * @return one {@code Object[]} per hit on the requested page, in rank order
 */
@Query(nativeQuery = true, value = """
WITH q AS (
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
THEN to_tsquery('simple', regexp_replace(
websearch_to_tsquery('german', :query)::text,
'''([^'']+)''',
'''\\1'':*',
'g'))
END AS pq
), matches AS (
SELECT d.id, ts_rank(d.search_vector, q.pq) AS rank
FROM documents d, q
WHERE d.search_vector @@ q.pq
)
SELECT id, rank, COUNT(*) OVER () AS total
FROM matches
ORDER BY rank DESC, id
OFFSET :offset LIMIT :limit
""")
List<Object[]> findFtsPageRaw(@Param("query") String query,
@Param("offset") int offset,
@Param("limit") int limit);
/**
* Returns match-enrichment data for a set of documents identified by their IDs.
* Each row contains (in column order):

View File

@@ -645,8 +645,16 @@ public class DocumentService {
// 1. Allgemeine Suche (für das Suchfeld im Frontend)
public DocumentSearchResult searchDocuments(String text, LocalDate from, LocalDate to, UUID sender, UUID receiver, List<String> tags, String tagQ, DocumentStatus status, DocumentSort sort, String dir, TagOperator tagOperator, Pageable pageable) {
boolean hasText = StringUtils.hasText(text);
List<UUID> rankedIds = null;
// Pure-text RELEVANCE: push pagination into SQL — skip findAllMatchingIdsByFts entirely (ADR-008).
boolean pureTextRelevance = hasText && (sort == null || sort == DocumentSort.RELEVANCE)
&& from == null && to == null && sender == null && receiver == null
&& (tags == null || tags.isEmpty()) && (tagQ == null || tagQ.isBlank()) && status == null;
if (pureTextRelevance) {
return relevanceSortedPageFromSql(text, pageable);
}
List<UUID> rankedIds = null;
if (hasText) {
rankedIds = documentRepository.findAllMatchingIdsByFts(text);
if (rankedIds.isEmpty()) return DocumentSearchResult.of(List.of());
@@ -655,29 +663,28 @@ public class DocumentService {
Specification<Document> spec = buildSearchSpec(
hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator);
// SENDER, RECEIVER and RELEVANCE sorts load the full match set and slice in memory.
// SENDER and RECEIVER sorts load the full match set and slice in-memory.
// JPA's Sort.by("sender.lastName") generates an INNER JOIN that silently drops
// documents with null sender/receivers; RELEVANCE maps a DB order to an external
// rank list. Cost scales linearly with match count — acceptable while documents
// stays under ~10k rows. Past that, replace with SQL-level LEFT JOIN sort.
// documents with null sender/receivers. Cost scales with match count —
// acceptable while documents stays under ~10k rows. (ADR-008)
if (sort == DocumentSort.RECEIVER) {
// In-memory sort of the full match set; the page is sliced out afterwards — acceptable
List<Document> sorted = sortByFirstReceiver(documentRepository.findAll(spec), dir);
return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size());
}
if (sort == DocumentSort.SENDER) {
// In-memory sort of the full match set; the page is sliced out afterwards — acceptable
List<Document> sorted = sortBySender(documentRepository.findAll(spec), dir);
return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size());
}
// RELEVANCE: default when text present and no explicit sort given
// RELEVANCE with active filters: load filtered subset and sort in-memory by rank.
boolean useRankOrder = hasText && (sort == null || sort == DocumentSort.RELEVANCE);
if (useRankOrder) {
List<Document> results = documentRepository.findAll(spec);
Map<UUID, Integer> rankMap = new HashMap<>();
for (int i = 0; i < rankedIds.size(); i++) rankMap.put(rankedIds.get(i), i);
List<Document> sorted = results.stream()
.sorted(Comparator.comparingInt(
doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE)))
List<Document> sorted = documentRepository.findAll(spec).stream()
.sorted(Comparator.comparingInt(doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE)))
.toList();
return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size());
}
@@ -688,6 +695,29 @@ public class DocumentService {
return buildResultPaged(page.getContent(), text, pageable, page.getTotalElements());
}
/**
 * Serves a pure-text RELEVANCE search whose ranking and paging are done by the database
 * (ADR-008).
 *
 * <p>Asks {@code findFtsPageRaw} for the hits of the requested page, loads the matching
 * entities with {@code findAllById} and puts them back into the order the SQL query
 * returned. The total handed to {@code buildResultPaged} is the one reported by the query.
 *
 * @param text     search text forwarded to the FTS query
 * @param pageable supplies the offset and the page size
 * @return the page of results; an empty result when the FTS page contains no hits
 */
private DocumentSearchResult relevanceSortedPageFromSql(String text, Pageable pageable) {
    List<Object[]> rawRows = documentRepository.findFtsPageRaw(
            text, (int) pageable.getOffset(), pageable.getPageSize());
    FtsPage page = toFtsPage(rawRows);
    if (page.hits().isEmpty()) {
        return DocumentSearchResult.of(List.of());
    }

    // IDs in the order delivered by SQL (ts_rank DESC, id).
    List<UUID> orderedIds = page.hits().stream()
            .map(hit -> hit.id())
            .toList();

    // Remember each ID's position so the SQL order can be restored after findAllById.
    Map<UUID, Integer> positionById = new HashMap<>();
    for (int pos = 0; pos < orderedIds.size(); pos++) {
        positionById.put(orderedIds.get(pos), pos);
    }

    Comparator<Document> bySqlPosition = Comparator.comparingInt(
            doc -> positionById.getOrDefault(doc.getId(), Integer.MAX_VALUE));
    List<Document> pageDocs = documentRepository.findAllById(orderedIds).stream()
            .sorted(bySqlPosition)
            .toList();

    return buildResultPaged(pageDocs, text, pageable, page.total());
}
private static <T> List<T> pageSlice(List<T> sorted, Pageable pageable) {
int from = Math.min((int) pageable.getOffset(), sorted.size());
int to = Math.min(from + pageable.getPageSize(), sorted.size());
@@ -1013,6 +1043,28 @@ public class DocumentService {
return result;
}
// Column positions inside each Object[] row returned by DocumentRepository#findFtsPageRaw;
// they follow that query's select list (id, rank, total).
private static final int COL_ID = 0; // document id
private static final int COL_RANK = 1; // ts_rank score
private static final int COL_TOTAL = 2; // COUNT(*) OVER () — full match count
/**
 * Converts the raw rows of {@link DocumentRepository#findFtsPageRaw} into an
 * {@link FtsPage}.
 *
 * <p>The total is read from the first row only. The id column is accepted either as a
 * {@link UUID} instance or as any object whose {@code toString()} is a UUID string,
 * because some JDBC drivers return UUID as String.
 *
 * @param rows raw result rows, indexed by {@code COL_ID}, {@code COL_RANK}, {@code COL_TOTAL}
 * @return the hits in row order plus the total; an empty page with total 0 for no rows
 */
private static FtsPage toFtsPage(List<Object[]> rows) {
    if (rows.isEmpty()) {
        return new FtsPage(List.of(), 0);
    }

    Number totalColumn = (Number) rows.get(0)[COL_TOTAL];
    long total = totalColumn.longValue();

    List<FtsHit> collected = new ArrayList<>(rows.size());
    for (Object[] row : rows) {
        Object rawId = row[COL_ID];
        UUID id;
        if (rawId instanceof UUID uuid) {
            id = uuid;
        } else {
            id = UUID.fromString(rawId.toString());
        }
        Number score = (Number) row[COL_RANK];
        collected.add(new FtsHit(id, score.doubleValue()));
    }
    // Hand out an unmodifiable list, as the stream-based version did.
    return new FtsPage(List.copyOf(collected), total);
}
/** Clean text + highlight offsets parsed from a {@code ts_headline} sentinel-delimited string. */
public record ParsedHighlight(String cleanText, List<MatchOffset> offsets) {}