fix(fts): paginate FTS match-set in SQL instead of loading all matching IDs #488

Merged
marcel merged 6 commits from fix/issue-468-fts-sql-pagination into main 2026-05-09 16:35:15 +02:00
11 changed files with 474 additions and 98 deletions

View File

@@ -100,7 +100,45 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
ORDER BY ts_rank(d.search_vector, q.pq) DESC,
d.meta_date DESC NULLS LAST
""")
List<UUID> findRankedIdsByFts(@Param("query") String query);
// Unpaged path — for bulk-edit "select all" and density chart
List<UUID> findAllMatchingIdsByFts(@Param("query") String query);
/**
* Returns one page of FTS-ranked document IDs with the total match count.
*
* <p>Each row contains (in column order):
* <ol>
* <li>UUID — document id</li>
* <li>double — ts_rank score</li>
* <li>long — COUNT(*) OVER () — full match count, not page count</li>
* </ol>
*
* <p>Returns an empty list when the query matches no documents (including
* stopword-only queries where websearch_to_tsquery returns an empty tsquery).
* Use findAllMatchingIdsByFts for the unpaged bulk-edit path.
*/
@Query(nativeQuery = true, value = """
WITH q AS (
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
THEN to_tsquery('simple', regexp_replace(
websearch_to_tsquery('german', :query)::text,
'''([^'']+)''',
'''\\1'':*',
'g'))
END AS pq
), matches AS (
SELECT d.id, ts_rank(d.search_vector, q.pq) AS rank
FROM documents d, q
WHERE d.search_vector @@ q.pq
)
SELECT id, rank, COUNT(*) OVER () AS total
FROM matches
ORDER BY rank DESC, id
OFFSET :offset LIMIT :limit
""")
List<Object[]> findFtsPageRaw(@Param("query") String query,
@Param("offset") int offset,
@Param("limit") int limit);
/**
* Returns match-enrichment data for a set of documents identified by their IDs.

View File

@@ -162,7 +162,7 @@ public class DocumentService {
*/
private List<UUID> resolveFtsIds(String text) {
if (!StringUtils.hasText(text)) return null;
return documentRepository.findRankedIdsByFts(text);
return documentRepository.findAllMatchingIdsByFts(text);
}
/** Loads matching documents and projects to non-null {@link LocalDate}s. */
@@ -485,7 +485,7 @@ public class DocumentService {
boolean hasText = StringUtils.hasText(text);
List<UUID> rankedIds = null;
if (hasText) {
rankedIds = documentRepository.findRankedIdsByFts(text);
rankedIds = documentRepository.findAllMatchingIdsByFts(text);
if (rankedIds.isEmpty()) return List.of();
}
@@ -645,39 +645,43 @@ public class DocumentService {
// 1. Allgemeine Suche (für das Suchfeld im Frontend)
public DocumentSearchResult searchDocuments(String text, LocalDate from, LocalDate to, UUID sender, UUID receiver, List<String> tags, String tagQ, DocumentStatus status, DocumentSort sort, String dir, TagOperator tagOperator, Pageable pageable) {
boolean hasText = StringUtils.hasText(text);
List<UUID> rankedIds = null;
// Pure-text RELEVANCE: push pagination into SQL — skip findAllMatchingIdsByFts entirely (ADR-008).
if (isPureTextRelevance(hasText, sort, from, to, sender, receiver, tags, tagQ, status)) {
return relevanceSortedPageFromSql(text, pageable);
}
List<UUID> rankedIds = null;
if (hasText) {
rankedIds = documentRepository.findRankedIdsByFts(text);
rankedIds = documentRepository.findAllMatchingIdsByFts(text);
if (rankedIds.isEmpty()) return DocumentSearchResult.of(List.of());
}
Specification<Document> spec = buildSearchSpec(
hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator);
// SENDER, RECEIVER and RELEVANCE sorts load the full match set and slice in memory.
// SENDER and RECEIVER sorts load the full match set and slice in-memory.
// JPA's Sort.by("sender.lastName") generates an INNER JOIN that silently drops
// documents with null sender/receivers; RELEVANCE maps a DB order to an external
// rank list. Cost scales linearly with match count — acceptable while documents
// stays under ~10k rows. Past that, replace with SQL-level LEFT JOIN sort.
// documents with null sender/receivers. Cost scales with match count —
// acceptable while documents stays under ~10k rows. (ADR-008)
if (sort == DocumentSort.RECEIVER) {
// In-memory sort on page slice (≤ page size rows) — acceptable
List<Document> sorted = sortByFirstReceiver(documentRepository.findAll(spec), dir);
return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size());
}
if (sort == DocumentSort.SENDER) {
// In-memory sort on page slice (≤ page size rows) — acceptable
List<Document> sorted = sortBySender(documentRepository.findAll(spec), dir);
return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size());
}
// RELEVANCE: default when text present and no explicit sort given
// RELEVANCE with active filters: load filtered subset and sort in-memory by rank.
boolean useRankOrder = hasText && (sort == null || sort == DocumentSort.RELEVANCE);
if (useRankOrder) {
List<Document> results = documentRepository.findAll(spec);
Map<UUID, Integer> rankMap = new HashMap<>();
for (int i = 0; i < rankedIds.size(); i++) rankMap.put(rankedIds.get(i), i);
List<Document> sorted = results.stream()
.sorted(Comparator.comparingInt(
doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE)))
List<Document> sorted = documentRepository.findAll(spec).stream()
.sorted(Comparator.comparingInt(doc -> rankMap.getOrDefault(doc.getId(), Integer.MAX_VALUE)))
.toList();
return buildResultPaged(pageSlice(sorted, pageable), text, pageable, sorted.size());
}
@@ -688,6 +692,39 @@ public class DocumentService {
return buildResultPaged(page.getContent(), text, pageable, page.getTotalElements());
}
private static boolean isPureTextRelevance(boolean hasText, DocumentSort sort,
LocalDate from, LocalDate to, UUID sender, UUID receiver,
List<String> tags, String tagQ, DocumentStatus status) {
return hasText && (sort == null || sort == DocumentSort.RELEVANCE)
&& from == null && to == null && sender == null && receiver == null
&& (tags == null || tags.isEmpty()) && (tagQ == null || tagQ.isBlank()) && status == null;
}
/**
* Pure-text RELEVANCE path — pagination and ts_rank ordering pushed into SQL.
* Called when no non-text filters are active (ADR-008).
*/
private DocumentSearchResult relevanceSortedPageFromSql(String text, Pageable pageable) {
long rawOffset = pageable.getOffset();
if (rawOffset > Integer.MAX_VALUE) return DocumentSearchResult.of(List.of());
int offset = (int) rawOffset;
int limit = pageable.getPageSize();
FtsPage ftsPage = toFtsPage(documentRepository.findFtsPageRaw(text, offset, limit));
if (ftsPage.hits().isEmpty()) return DocumentSearchResult.of(List.of());
// Preserve ts_rank order from SQL across the JPA findAllById call.
Map<UUID, Integer> rankMap = new HashMap<>();
List<UUID> pageIds = new ArrayList<>();
for (int i = 0; i < ftsPage.hits().size(); i++) {
rankMap.put(ftsPage.hits().get(i).id(), i);
pageIds.add(ftsPage.hits().get(i).id());
}
List<Document> docs = documentRepository.findAllById(pageIds).stream()
.sorted(Comparator.comparingInt(d -> rankMap.getOrDefault(d.getId(), Integer.MAX_VALUE)))
.toList();
return buildResultPaged(docs, text, pageable, ftsPage.total());
}
private static <T> List<T> pageSlice(List<T> sorted, Pageable pageable) {
int from = Math.min((int) pageable.getOffset(), sorted.size());
int to = Math.min(from + pageable.getPageSize(), sorted.size());
@@ -1013,6 +1050,28 @@ public class DocumentService {
return result;
}
private static final int COL_ID = 0;
private static final int COL_RANK = 1;
private static final int COL_TOTAL = 2;
/**
* Maps raw Object[] rows from {@link DocumentRepository#findFtsPageRaw} to an
* {@link FtsPage}. Uses pattern-matching UUID cast to guard against driver-level
* type variance (some JDBC drivers return UUID as String).
*/
private static FtsPage toFtsPage(List<Object[]> rows) {
if (rows.isEmpty()) return new FtsPage(List.of(), 0);
long total = ((Number) rows.get(0)[COL_TOTAL]).longValue();
List<FtsHit> hits = rows.stream()
.map(r -> {
UUID id = r[COL_ID] instanceof UUID u ? u : UUID.fromString(r[COL_ID].toString());
double rank = ((Number) r[COL_RANK]).doubleValue();
return new FtsHit(id, rank);
})
.toList();
return new FtsPage(hits, total);
}
/** Clean text + highlight offsets parsed from a {@code ts_headline} sentinel-delimited string. */
public record ParsedHighlight(String cleanText, List<MatchOffset> offsets) {}

View File

@@ -0,0 +1,6 @@
package org.raddatz.familienarchiv.document;
import java.util.UUID;
/** A single document hit from a paginated FTS query — id and its ts_rank score. */
record FtsHit(UUID id, double rank) {}

View File

@@ -0,0 +1,6 @@
package org.raddatz.familienarchiv.document;
import java.util.List;
/** One page of FTS results — the ranked hit list for this page and the total match count. */
record FtsPage(List<FtsHit> hits, long total) {}

View File

@@ -0,0 +1,109 @@
package org.raddatz.familienarchiv.document;
import jakarta.persistence.EntityManager;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.raddatz.familienarchiv.PostgresContainerConfig;
import org.raddatz.familienarchiv.config.FlywayConfig;
import org.raddatz.familienarchiv.document.DocumentRepository;
import org.raddatz.familienarchiv.document.Document;
import org.raddatz.familienarchiv.document.DocumentStatus;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
import org.springframework.context.annotation.Import;
import org.springframework.test.annotation.DirtiesContext;
import java.util.List;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatNoException;
/**
* Repository-level integration tests for {@code findFtsPageRaw}: verifies that the
* paginated FTS query returns exactly page-size rows and that the window-function
* total reflects the full match count, not just the page count.
*
* <p>Uses real Postgres via Testcontainers so the GIN index, tsvector trigger, and
* {@code websearch_to_tsquery} semantics are identical to production.
*
* <p>{@code AFTER_CLASS} dirty-context keeps the Spring context alive for all tests
* in this class and rebuilds it once at the end, rather than after every test.
*/
@DataJpaTest
@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
@Import({PostgresContainerConfig.class, FlywayConfig.class})
@DirtiesContext(classMode = DirtiesContext.ClassMode.AFTER_CLASS)
class DocumentFtsPagedIntegrationTest {
@Autowired DocumentRepository documentRepository;
@Autowired EntityManager em;
// 60 docs match "Walter"; 10 docs with "Hans" do not.
private static final int WALTER_COUNT = 60;
private static final int PAGE_SIZE = 50;
@BeforeEach
void seed() {
documentRepository.deleteAll();
em.flush();
for (int i = 0; i < WALTER_COUNT; i++) {
documentRepository.saveAndFlush(doc("Brief von Walter Nr. " + i));
}
for (int i = 0; i < 10; i++) {
documentRepository.saveAndFlush(doc("Brief von Hans Nr. " + i));
}
em.clear();
}
@Test
void findFtsPageRaw_firstPage_returnsPageSizeRows() {
List<Object[]> rows = documentRepository.findFtsPageRaw("Walter", 0, PAGE_SIZE);
assertThat(rows).hasSize(PAGE_SIZE);
}
@Test
void findFtsPageRaw_windowTotal_equalsFullMatchCount_notPageSize() {
List<Object[]> rows = documentRepository.findFtsPageRaw("Walter", 0, PAGE_SIZE);
long total = ((Number) rows.get(0)[2]).longValue();
assertThat(total).isEqualTo(WALTER_COUNT);
}
@Test
void findFtsPageRaw_lastPage_returnsRemainder() {
int remainder = WALTER_COUNT % PAGE_SIZE; // 60 % 50 = 10
List<Object[]> rows = documentRepository.findFtsPageRaw("Walter", PAGE_SIZE, PAGE_SIZE);
assertThat(rows).hasSize(remainder);
long total = ((Number) rows.get(0)[2]).longValue();
assertThat(total).isEqualTo(WALTER_COUNT);
}
@Test
void findFtsPageRaw_noMatches_returnsEmptyList() {
List<Object[]> rows = documentRepository.findFtsPageRaw("XYZ_KEIN_TREFFER", 0, PAGE_SIZE);
assertThat(rows).isEmpty();
}
@Test
void findFtsPageRaw_stopwordOnlyQuery_returnsEmptyList_noException() {
assertThatNoException().isThrownBy(() -> {
List<Object[]> rows = documentRepository.findFtsPageRaw("der die das und", 0, PAGE_SIZE);
assertThat(rows).isEmpty();
});
}
// ─── Helper ───────────────────────────────────────────────────────────────
private Document doc(String title) {
return Document.builder()
.title(title)
.originalFilename(title.replace(" ", "_") + ".pdf")
.status(DocumentStatus.UPLOADED)
.build();
}
}

View File

@@ -69,7 +69,7 @@ class DocumentFtsTest {
documentRepository.saveAndFlush(document("Alter Brief"));
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("Brief");
assertThat(ids).hasSize(1);
}
@@ -79,7 +79,7 @@ class DocumentFtsTest {
documentRepository.saveAndFlush(document("Alter Brief"));
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("Briefe");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("Briefe");
assertThat(ids).hasSize(1);
}
@@ -89,7 +89,7 @@ class DocumentFtsTest {
documentRepository.saveAndFlush(document("Ein furchtbarer Brief"));
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("furchtb");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("furchtb");
assertThat(ids).hasSize(1);
}
@@ -99,7 +99,7 @@ class DocumentFtsTest {
documentRepository.saveAndFlush(document("Familienfoto"));
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("Brief");
assertThat(ids).isEmpty();
}
@@ -115,7 +115,7 @@ class DocumentFtsTest {
em.flush();
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("schreiben");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("schreiben");
assertThat(ids).contains(doc.getId());
}
@@ -125,14 +125,14 @@ class DocumentFtsTest {
Document doc = documentRepository.saveAndFlush(document("Leeres Dokument"));
em.clear();
assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).isEmpty();
assertThat(documentRepository.findAllMatchingIdsByFts("Grundbuch")).isEmpty();
UUID annotationId = annotation(doc.getId());
blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
em.flush();
em.clear();
assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
assertThat(documentRepository.findAllMatchingIdsByFts("Grundbuch")).contains(doc.getId());
}
@Test
@@ -144,13 +144,13 @@ class DocumentFtsTest {
em.flush();
em.clear();
assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
assertThat(documentRepository.findAllMatchingIdsByFts("Grundbuch")).contains(doc.getId());
blockRepository.deleteById(block.getId());
em.flush();
em.clear();
assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).doesNotContain(doc.getId());
assertThat(documentRepository.findAllMatchingIdsByFts("Grundbuch")).doesNotContain(doc.getId());
}
// ─── Ranking ───────────────────────────────────────────────────────────────
@@ -166,7 +166,7 @@ class DocumentFtsTest {
em.flush();
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("Grundbuch");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("Grundbuch");
assertThat(ids).hasSize(2);
assertThat(ids.get(0)).isEqualTo(docA.getId());
@@ -179,7 +179,7 @@ class DocumentFtsTest {
documentRepository.saveAndFlush(document("Ein Brief von der Oma"));
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("der die das und");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("der die das und");
assertThat(ids).isEmpty();
}
@@ -195,7 +195,7 @@ class DocumentFtsTest {
em.flush();
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("Wille");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("Wille");
assertThat(ids).contains(doc.getId());
}
@@ -205,7 +205,7 @@ class DocumentFtsTest {
documentRepository.saveAndFlush(document("Brief"));
em.clear();
assertThatNoException().isThrownBy(() -> documentRepository.findRankedIdsByFts("((("));
assertThatNoException().isThrownBy(() -> documentRepository.findAllMatchingIdsByFts("((("));
}
// ─── Weight C: sender/receiver names ───────────────────────────────────────
@@ -223,7 +223,7 @@ class DocumentFtsTest {
em.flush();
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("Schmidt");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("Schmidt");
assertThat(ids).contains(doc.getId());
}
@@ -241,7 +241,7 @@ class DocumentFtsTest {
em.flush();
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("Raddatz");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("Raddatz");
assertThat(ids).contains(doc.getId());
}
@@ -260,7 +260,7 @@ class DocumentFtsTest {
em.flush();
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("Familiengeschichte");
List<UUID> ids = documentRepository.findAllMatchingIdsByFts("Familiengeschichte");
assertThat(ids).hasSize(1);
}
@@ -278,7 +278,7 @@ class DocumentFtsTest {
em.flush();
em.clear();
List<UUID> rankedIds = documentRepository.findRankedIdsByFts("Grundbuch");
List<UUID> rankedIds = documentRepository.findAllMatchingIdsByFts("Grundbuch");
Specification<Document> spec = Specification.where(hasIds(rankedIds))
.and(hasStatus(DocumentStatus.UPLOADED));

View File

@@ -21,17 +21,22 @@ import org.springframework.data.domain.Pageable;
import org.springframework.data.jpa.domain.Specification;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
@ExtendWith(MockitoExtension.class)
class DocumentServiceSortTest {
private static final Pageable UNPAGED = org.springframework.data.domain.PageRequest.of(0, 10_000);
private static final Pageable PAGE = org.springframework.data.domain.PageRequest.of(0, 10_000);
@Mock DocumentRepository documentRepository;
@Mock PersonService personService;
@@ -43,12 +48,12 @@ class DocumentServiceSortTest {
@Mock TranscriptionBlockQueryService transcriptionBlockQueryService;
@InjectMocks DocumentService documentService;
// ─── searchDocuments — DATE sort ──────────────────────────────────────────
// ─── DATE sort ────────────────────────────────────────────────────────────
@Test
void searchDocuments_with_DATE_sort_and_text_sorts_chronologically_not_by_relevance() {
UUID id1 = UUID.randomUUID(); // rank position 0 (higher relevance, older doc)
UUID id2 = UUID.randomUUID(); // rank position 1 (lower relevance, newer doc)
UUID id1 = UUID.randomUUID(); // higher relevance, older doc
UUID id2 = UUID.randomUUID(); // lower relevance, newer doc
Document older = Document.builder().id(id1)
.title("Brief").status(DocumentStatus.UPLOADED)
@@ -57,38 +62,48 @@ class DocumentServiceSortTest {
.title("Brief").status(DocumentStatus.UPLOADED)
.documentDate(LocalDate.of(1960, 1, 1)).build();
// FTS returns id1 first (higher rank), id2 second
when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(id1, id2));
// findAll(spec, pageable) — the correct date path — returns date-DESC order
when(documentRepository.findAllMatchingIdsByFts("Brief")).thenReturn(List.of(id1, id2));
when(documentRepository.findAll(any(Specification.class), any(Pageable.class)))
.thenReturn(new PageImpl<>(List.of(newer, older)));
DocumentSearchResult result = documentService.searchDocuments(
"Brief", null, null, null, null, null, null, null, DocumentSort.DATE, "DESC", null, UNPAGED);
"Brief", null, null, null, null, null, null, null, DocumentSort.DATE, "DESC", null, PAGE);
// Expect: date order (newer 1960 first), NOT rank order (older 1940 first)
assertThat(result.items()).hasSize(2);
assertThat(result.items().get(0).document().getId()).isEqualTo(id2); // newer doc first
assertThat(result.items().get(0).document().getId()).isEqualTo(id2); // newer first
}
// ─── searchDocuments — RELEVANCE sort ─────────────────────────────────────
// ─── RELEVANCE sort — pure text (no filters) ──────────────────────────────
@Test
void searchDocuments_relevance_pureText_calls_findFtsPageRaw_not_findAllMatchingIds() {
UUID id1 = UUID.randomUUID();
List<Object[]> ftsRows = ftsRows(id1, 0.5d, 1L);
when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows);
when(documentRepository.findAllById(any()))
.thenReturn(List.of(doc(id1)));
documentService.searchDocuments(
"Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, PAGE);
verify(documentRepository).findFtsPageRaw(anyString(), anyInt(), anyInt());
verify(documentRepository, never()).findAllMatchingIdsByFts(anyString());
}
@Test
void searchDocuments_with_RELEVANCE_sort_and_text_preserves_fts_rank_order() {
UUID id1 = UUID.randomUUID(); // rank position 0
UUID id2 = UUID.randomUUID(); // rank position 1
UUID id1 = UUID.randomUUID(); // higher rank — must appear first
UUID id2 = UUID.randomUUID(); // lower rank
Document doc1 = Document.builder().id(id1).title("Brief").status(DocumentStatus.UPLOADED).build();
Document doc2 = Document.builder().id(id2).title("Brief").status(DocumentStatus.UPLOADED).build();
when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(id1, id2));
when(documentRepository.findAll(any(Specification.class)))
.thenReturn(List.of(doc2, doc1)); // unordered from DB
List<Object[]> ftsRows = new ArrayList<>();
ftsRows.add(new Object[]{id1, 0.8d, 2L});
ftsRows.add(new Object[]{id2, 0.3d, 2L});
when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows);
when(documentRepository.findAllById(any())).thenReturn(List.of(doc(id2), doc(id1))); // unordered from JPA
DocumentSearchResult result = documentService.searchDocuments(
"Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, UNPAGED);
"Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, PAGE);
// Expect: rank order restored (id1 first)
assertThat(result.items().get(0).document().getId()).isEqualTo(id1);
}
@@ -97,16 +112,82 @@ class DocumentServiceSortTest {
UUID id1 = UUID.randomUUID();
UUID id2 = UUID.randomUUID();
Document doc1 = Document.builder().id(id1).title("Brief").status(DocumentStatus.UPLOADED).build();
Document doc2 = Document.builder().id(id2).title("Brief").status(DocumentStatus.UPLOADED).build();
when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(id1, id2));
when(documentRepository.findAll(any(Specification.class)))
.thenReturn(List.of(doc2, doc1));
List<Object[]> ftsRows = new ArrayList<>();
ftsRows.add(new Object[]{id1, 0.8d, 2L});
ftsRows.add(new Object[]{id2, 0.3d, 2L});
when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows);
when(documentRepository.findAllById(any())).thenReturn(List.of(doc(id2), doc(id1)));
DocumentSearchResult result = documentService.searchDocuments(
"Brief", null, null, null, null, null, null, null, null, null, null, UNPAGED);
"Brief", null, null, null, null, null, null, null, null, null, null, PAGE);
assertThat(result.items().get(0).document().getId()).isEqualTo(id1);
}
// ─── RELEVANCE sort — overflow guard ─────────────────────────────────────
@Test
void searchDocuments_relevance_returns_empty_when_offset_exceeds_maxInt() {
// offset = pageNumber * pageSize; choose values so offset > Integer.MAX_VALUE
Pageable hugePage = org.springframework.data.domain.PageRequest.of(Integer.MAX_VALUE / 10 + 1, 10);
DocumentSearchResult result = documentService.searchDocuments(
"Brief", null, null, null, null, null, null, null,
DocumentSort.RELEVANCE, null, null, hugePage);
assertThat(result.items()).isEmpty();
verify(documentRepository, never()).findFtsPageRaw(anyString(), anyInt(), anyInt());
}
// ─── toFtsPage — UUID-as-String JDBC driver variance ────────────────────
@Test
void searchDocuments_relevance_handles_string_uuid_from_jdbc_driver() {
String stringId = "11111111-1111-1111-1111-111111111111";
UUID uuidId = UUID.fromString(stringId);
// Simulate a JDBC driver that returns the id column as String instead of UUID
List<Object[]> ftsRows = new ArrayList<>();
ftsRows.add(new Object[]{stringId, 0.5d, 1L});
when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows);
when(documentRepository.findAllById(any())).thenReturn(List.of(doc(uuidId)));
DocumentSearchResult result = documentService.searchDocuments(
"Brief", null, null, null, null, null, null, null,
DocumentSort.RELEVANCE, null, null, PAGE);
assertThat(result.items()).hasSize(1);
assertThat(result.items().get(0).document().getId()).isEqualTo(uuidId);
}
// ─── RELEVANCE sort — text + active filter ────────────────────────────────
@Test
void searchDocuments_relevance_with_active_filter_uses_inMemory_path() {
UUID id1 = UUID.randomUUID();
UUID id2 = UUID.randomUUID();
when(documentRepository.findAllMatchingIdsByFts("Brief")).thenReturn(List.of(id1, id2));
when(documentRepository.findAll(any(Specification.class)))
.thenReturn(List.of(doc(id2), doc(id1)));
// sender filter is active → triggers in-memory path, not findFtsPageRaw
LocalDate from = LocalDate.of(1900, 1, 1);
documentService.searchDocuments(
"Brief", from, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, PAGE);
verify(documentRepository, never()).findFtsPageRaw(anyString(), anyInt(), anyInt());
verify(documentRepository).findAllMatchingIdsByFts("Brief");
}
// ─── Helpers ──────────────────────────────────────────────────────────────
private static Document doc(UUID id) {
return Document.builder().id(id).title("Brief").status(DocumentStatus.UPLOADED).build();
}
private static List<Object[]> ftsRows(UUID id, double rank, long total) {
List<Object[]> rows = new ArrayList<>();
rows.add(new Object[]{id, rank, total});
return rows;
}
}

View File

@@ -1620,9 +1620,10 @@ class DocumentServiceTest {
// chr(1)=\u0001 marks start, chr(2)=\u0002 marks end of highlighted term
List<Object[]> rows = Collections.singletonList(new Object[]{docId, "\u0001Brief\u0002 an Anna", null, false, null, null, null});
when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(docId));
when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class)))
.thenReturn(List.of(doc));
List<Object[]> ftsRows = new java.util.ArrayList<>();
ftsRows.add(new Object[]{docId, 0.5d, 1L});
when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(ftsRows);
when(documentRepository.findAllById(any())).thenReturn(List.of(doc));
when(documentRepository.findEnrichmentData(any(), eq("Brief"))).thenReturn(rows);
DocumentSearchResult result = documentService.searchDocuments(
@@ -1654,9 +1655,10 @@ class DocumentServiceTest {
String snippetHeadline = "Hier ist der \u0001Brief\u0002 aus Berlin";
List<Object[]> rows = Collections.singletonList(new Object[]{docId, "Dok", snippetHeadline, false, null, null, null});
when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(docId));
when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class)))
.thenReturn(List.of(doc));
List<Object[]> snippetFtsRows = new java.util.ArrayList<>();
snippetFtsRows.add(new Object[]{docId, 0.5d, 1L});
when(documentRepository.findFtsPageRaw(anyString(), anyInt(), anyInt())).thenReturn(snippetFtsRows);
when(documentRepository.findAllById(any())).thenReturn(List.of(doc));
when(documentRepository.findEnrichmentData(any(), eq("Brief"))).thenReturn(rows);
DocumentSearchResult result = documentService.searchDocuments(
@@ -2202,7 +2204,7 @@ class DocumentServiceTest {
@Test
void findIdsForFilter_returnsEmpty_whenFtsHasNoMatches() {
when(documentRepository.findRankedIdsByFts("xyz")).thenReturn(List.of());
when(documentRepository.findAllMatchingIdsByFts("xyz")).thenReturn(List.of());
List<UUID> result = documentService.findIdsForFilter(
"xyz", null, null, null, null, null, null, null, null);
@@ -2386,7 +2388,7 @@ class DocumentServiceTest {
@Test
void getDensity_shortCircuits_whenFtsReturnsNoMatches() {
when(documentRepository.findRankedIdsByFts("xyz")).thenReturn(List.of());
when(documentRepository.findAllMatchingIdsByFts("xyz")).thenReturn(List.of());
DocumentDensityResult result = documentService.getDensity(
new DensityFilters("xyz", null, null, null, null, null, null));

View File

@@ -0,0 +1,68 @@
# ADR-008: SQL-level pagination for full-text search via window-function CTE
## Status
Accepted
## Context
`DocumentRepository.findAllMatchingIdsByFts` (formerly `findRankedIdsByFts`) returns all matching document IDs for a FTS query. `DocumentService.searchDocuments` then paginates in memory on the RELEVANCE sort path.
A pre-production audit against 1,520 documents measured:
```
rows_per_call: 911 / call (query: "walter")
```
At current scale this is acceptable — 911 UUIDs ≈ 14 KB, ms-level DB time. At 100 K+ documents two failure modes emerge:
1. **Memory**: a broad query returns ~60 K UUIDs ≈ 1 MB per request, multiplied by concurrent users.
2. **Latency**: the `LATERAL` join does work proportional to match-set size; at 60 K matches the FTS step alone exceeds 100 ms per query.
Tracked as finding **F-31 (High)** in the pre-production architectural review.
## Decision
Push pagination and rank ordering into SQL for the RELEVANCE sort path when no non-text filters are active (pure full-text search):
```sql
WITH q AS (
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
THEN to_tsquery('simple', regexp_replace(
websearch_to_tsquery('german', :query)::text,
'''([^'']+)''', '''\\1'':*', 'g'))
END AS pq
), matches AS (
SELECT d.id, ts_rank(d.search_vector, q.pq) AS rank
FROM documents d, q
WHERE d.search_vector @@ q.pq
)
SELECT id, rank, COUNT(*) OVER () AS total
FROM matches
ORDER BY rank DESC, id
OFFSET :offset LIMIT :limit
```
`COUNT(*) OVER ()` returns the full match count alongside each page row in a single round-trip — no separate count query needed.
`rows_per_call` for the FTS query drops from match-set size (911) to page size (≤ 50).
When non-text filters (date range, sender, receiver, tags, status) are also active, the existing path is preserved: `findAllMatchingIdsByFts` returns all ranked IDs, which are passed as an `IN` clause to the JPA Specification, and `totalElements` comes from the JPA `Page.getTotalElements()`. This keeps the count accurate across the combined filter set.
## Alternatives Considered
**1. Two-query approach (separate COUNT + paged SELECT)**
Correct, but doubles round-trips. The window function achieves the same result in one query.
**2. Capped result set with a user-visible warning**
Return at most N results (e.g. 500) and show "showing top 500 of many results". Simpler, but degrades UX for broad queries and doesn't reduce latency proportionally (still scans N rows).
**3. Full SQL rewrite combining FTS + JPA Specification filters**
Possible via a native query that embeds all filter predicates. Eliminates the in-memory SENDER/RECEIVER sort paths and the two-phase approach. High complexity, tight coupling to schema details, loses type-safe JPA Specification composition. Deferred to a future refactor if scale demands it.
## Consequences
- **`rows_per_call` for pure-text FTS searches drops to ≤ page size** — the primary metric.
- **SENDER and RECEIVER sort paths stay in-memory** for combined text+filter queries. For pure-text queries with SENDER/RECEIVER sort, the current approach (fetch all matched IDs, build spec, load all matched entities, sort in-memory) still runs. This is acceptable while the archive stays under ~10 K documents.
- **RELEVANCE sort with text+filters still loads the full filtered entity set in-memory.** The filtered set is typically much smaller than the raw FTS match set, so the cost is bounded by filter selectivity, not total match count.
- **`findAllMatchingIdsByFts` is retained** for: (a) the bulk-edit "select all" fast path (`findIdsForFilter`), (b) the document density chart (`getDensity`), and (c) the SENDER/RECEIVER in-memory sort paths.

View File

@@ -5,14 +5,17 @@ import type { components } from '$lib/generated/api';
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
async function resolvePersonName(id: string, fetch: typeof globalThis.fetch): Promise<string> {
async function resolvePersonName(
id: string,
api: ReturnType<typeof createApiClient>
): Promise<string> {
if (!UUID_RE.test(id)) return '';
try {
const res = await fetch(`/api/persons/${id}`);
if (!res.ok) return '';
const person = await res.json();
return person.displayName ?? '';
} catch {
const result = await api.GET('/api/persons/{id}', { params: { path: { id } } });
if (!result.response.ok) return '';
return result.data?.displayName ?? '';
} catch (e) {
console.error('[resolvePersonName] failed for id', id, e);
return '';
}
}
@@ -70,7 +73,7 @@ export async function load({ url, fetch }) {
}
}
}),
Promise.all([resolvePersonName(senderId, fetch), resolvePersonName(receiverId, fetch)])
Promise.all([resolvePersonName(senderId, api), resolvePersonName(receiverId, api)])
]);
} catch {
return {

View File

@@ -171,66 +171,70 @@ describe('documents page load — network error fallback', () => {
// ─── person name resolution ───────────────────────────────────────────────────
describe('documents page load — person name resolution', () => {
function makeSearchMock() {
const mockGet = vi.fn().mockResolvedValue({
response: { ok: true, status: 200 },
data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 }
function makeSearchMock(personResult?: { ok: boolean; displayName?: string }) {
const mockGet = vi.fn().mockImplementation((path: string) => {
if (path === '/api/documents/search') {
return Promise.resolve({
response: { ok: true, status: 200 },
data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 }
});
}
// person lookup via api.GET('/api/persons/{id}', ...)
if (!personResult?.ok) {
return Promise.resolve({ response: { ok: false, status: 404 }, data: undefined });
}
return Promise.resolve({
response: { ok: true, status: 200 },
data: { displayName: personResult.displayName ?? '' }
});
});
vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
typeof createApiClient
>);
return mockGet;
}
it('returns initialSenderName from person lookup when senderId is a valid UUID', async () => {
makeSearchMock();
const mockFetch = vi.fn().mockResolvedValue({
ok: true,
json: vi.fn().mockResolvedValue({ displayName: 'Max Mustermann' })
});
makeSearchMock({ ok: true, displayName: 'Max Mustermann' });
const result = await load({
url: makeUrl({ senderId: '11111111-1111-1111-1111-111111111111' }),
fetch: mockFetch as unknown as typeof fetch
fetch: vi.fn() as unknown as typeof fetch
});
expect(result.initialSenderName).toBe('Max Mustermann');
});
it('returns initialReceiverName from person lookup when receiverId is a valid UUID', async () => {
makeSearchMock();
const mockFetch = vi.fn().mockResolvedValue({
ok: true,
json: vi.fn().mockResolvedValue({ displayName: 'Anna Musterfrau' })
});
makeSearchMock({ ok: true, displayName: 'Anna Musterfrau' });
const result = await load({
url: makeUrl({ receiverId: '22222222-2222-2222-2222-222222222222' }),
fetch: mockFetch as unknown as typeof fetch
fetch: vi.fn() as unknown as typeof fetch
});
expect(result.initialReceiverName).toBe('Anna Musterfrau');
});
it('returns empty string when senderId is not a valid UUID', async () => {
makeSearchMock();
const mockFetch = vi.fn();
const mockGet = makeSearchMock();
const result = await load({
url: makeUrl({ senderId: 'not-a-uuid' }),
fetch: mockFetch as unknown as typeof fetch
fetch: vi.fn() as unknown as typeof fetch
});
expect(result.initialSenderName).toBe('');
expect(mockFetch).not.toHaveBeenCalledWith(expect.stringContaining('/api/persons/'));
// UUID guard fires before any api.GET call — only document search is called
expect(mockGet).toHaveBeenCalledTimes(1);
});
it('returns empty string when person fetch returns 404', async () => {
makeSearchMock();
const mockFetch = vi.fn().mockResolvedValue({ ok: false, status: 404 });
it('returns empty string when person api returns 404', async () => {
makeSearchMock({ ok: false });
const result = await load({
url: makeUrl({ senderId: '11111111-1111-1111-1111-111111111111' }),
fetch: mockFetch as unknown as typeof fetch
fetch: vi.fn() as unknown as typeof fetch
});
expect(result.initialSenderName).toBe('');