diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java index 9db6e5d2..44b0a0d9 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java @@ -1058,6 +1058,43 @@ public class DocumentService { tagService.delete(tagId); } + /** + * One-time cleanup of already-stale auto-titles (#726, FR-003). For every document whose + * stored title passes the {@link DocumentTitleBackfillMatcher} overwrite heuristic, rebuilds + * the title from the row's current state and persists it only when it actually changed. + * Idempotent: a second run rebuilds the same value and saves nothing. Hand-written prose is + * left untouched. + * + *

Saves via {@code documentRepository.save} directly — it must NOT route through + * {@link #updateDocument} (which versions every write), following the {@link #backfillFileHashes} + * precedent: a mechanical rename must not snapshot the whole corpus into {@code document_versions}. + * + * @return the number of documents whose title was rewritten + */ + @Transactional + public int backfillTitles() { + List docs = documentRepository.findAll(); + int updated = 0; + int skipped = 0; + for (Document doc : docs) { + if (!DocumentTitleBackfillMatcher.isOverwritable( + doc.getTitle(), doc.getOriginalFilename(), doc.getLocation())) { + skipped++; + continue; + } + String rebuilt = documentTitleFactory.build(doc); + if (rebuilt.equals(doc.getTitle())) { + skipped++; // already correct — keep idempotent, no write + continue; + } + doc.setTitle(rebuilt); + documentRepository.save(doc); // direct save, no recordVersion (mechanical rename) + updated++; + } + log.info("Title backfill complete: scanned={} updated={} skipped={}", docs.size(), updated, skipped); + return updated; + } + @Transactional public int backfillFileHashes() { List docs = documentRepository.findByFileHashIsNullAndFilePathIsNotNull(); diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentTitleBackfillMatcher.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentTitleBackfillMatcher.java new file mode 100644 index 00000000..ee6efc2c --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentTitleBackfillMatcher.java @@ -0,0 +1,101 @@ +package org.raddatz.familienarchiv.document; + +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.LinkedHashSet; +import java.util.Locale; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Heuristic overwrite test for the one-time title backfill (#726, FR-004): decides whether a + * STORED title is a machine-generated auto-title (and so may be rebuilt from the row's current + * state) versus hand-written prose (left untouched). Used ONLY by the backfill — save-time + * regeneration uses an exact old-vs-new comparison instead, with no heuristic. + * + *

A stored title is overwritable iff, after stripping the literal {@code index} prefix: + *

    + *
  1. it is exactly {@code {index}}, or
  2. + *
  3. {@code {index} – {dateLabel}} with an optional trailing {@code – {location}} segment + * (any location — a present, valid date label is itself strong evidence of a machine + * title), or
  4. + *
  5. {@code {index} – {location}} where the segment equals the document's current location + * (no date label, so the segment must match the known location to be distinguished from + * prose).
  6. + *
+ * + *

Security: the {@code index} is compared literally via {@link String#startsWith} + * (never compiled into a regex) because {@code originalFilename} is user-controlled and may carry + * regex metacharacters — an unquoted pattern would be a ReDoS / regex-injection vector + * (CWE-1333 / CWE-625). The date-label sub-patterns use only bounded, non-nested quantifiers over + * short tokens, so there is no catastrophic backtracking. Fail-closed: any null/blank index or + * structural surprise returns {@code false}. + */ +final class DocumentTitleBackfillMatcher { + + private static final String SEPARATOR = " – "; + + // German month tokens derived from the SAME Locale.GERMAN formatters DocumentTitleFormatter + // uses, so the matcher's accepted spellings cannot drift from what the factory emits (full + // names "Januar"…"Dezember"; abbreviations "Jan."…"Dez." — note May/June/July/März carry no + // period). Pattern.quote each so a "." in an abbreviation is literal, never a wildcard. + private static final String FULL_MONTH = monthAlternation("MMMM"); + private static final String ABBR_MONTH = monthAlternation("MMM"); + private static final String SEASON = "(?:Frühling|Sommer|Herbst|Winter)"; + private static final String YEAR = "\\d{1,4}"; + private static final String DAY_NUM = "\\d{1,2}"; + + // One complete date label, anchored, optionally followed by a free-form trailing location + // segment. Only bounded/non-nested quantifiers over short tokens plus a single trailing + // ".+" → linear, no catastrophic backtracking (FR-004 ReDoS guard). + private static final Pattern DATE_LABEL_WITH_OPTIONAL_LOCATION = Pattern.compile( + "^(?:" + String.join("|", + YEAR, // 1916 + "ca\\. " + YEAR, // ca. 1920 + FULL_MONTH + " " + YEAR, // Juni 1916 + DAY_NUM + "\\. " + FULL_MONTH + " " + YEAR, // 24. Dezember 1943 + SEASON + " " + YEAR, // Sommer 1916 + "Datum unbekannt", + DAY_NUM + "\\.–" + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 10.–11. Jan. 1917 + DAY_NUM + "\\. " + ABBR_MONTH + " – " + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 30. Jan. – 2. Feb. 1917 + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR + " – " + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 30. Dez. 1916 – 2. Jan. 1917 + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 10. Jan. 1917 (range end == start) + "ab " + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR) // ab 10. Jan. 1917 + + ")(?: – .+)?$"); + + private DocumentTitleBackfillMatcher() { + } + + static boolean isOverwritable(String title, String index, String location) { + if (title == null || index == null || index.isBlank()) { + return false; // fail closed + } + if (!title.startsWith(index)) { + return false; // index is matched LITERALLY, never as a regex + } + String tail = title.substring(index.length()); + if (tail.isEmpty()) { + return true; // exactly {index} + } + if (!tail.startsWith(SEPARATOR)) { + return false; + } + String body = tail.substring(SEPARATOR.length()); + if (DATE_LABEL_WITH_OPTIONAL_LOCATION.matcher(body).matches()) { + return true; // {dateLabel} (+ optional trailing location) + } + // No date label: the lone segment must equal the document's current location to be + // distinguished from hand-written prose. + return location != null && !location.isBlank() && body.equals(location); + } + + private static String monthAlternation(String pattern) { + DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern, Locale.GERMAN); + Set tokens = new LinkedHashSet<>(); + for (int month = 1; month <= 12; month++) { + tokens.add(formatter.format(LocalDate.of(2000, month, 15))); + } + return tokens.stream().map(Pattern::quote).collect(Collectors.joining("|", "(?:", ")")); + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java b/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java index 74b5d643..1c5f66b0 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java @@ -51,6 +51,12 @@ public class AdminController { return ResponseEntity.ok(new BackfillResult(count)); } + @PostMapping("/backfill-titles") + public ResponseEntity backfillTitles() { + int count = documentService.backfillTitles(); + return ResponseEntity.ok(new BackfillResult(count)); + } + @PostMapping("/generate-thumbnails") public ResponseEntity generateThumbnails() { thumbnailBackfillService.runBackfillAsync(); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java index 76c95dca..fced773a 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java @@ -658,6 +658,59 @@ class DocumentServiceTest { verify(documentVersionService).recordVersion(any(Document.class)); } + // ─── backfillTitles — one-time stale-title cleanup (#726, FR-003) ───────── + + @Test + void backfillTitles_rewritesStaleAutoTitle_andCountsIt() { + Document stale = makeStored("C-0029", LocalDate.of(1928, 1, 1), DatePrecision.YEAR, "Berlin"); + stale.setTitle("C-0029 – 2028 – Berlin"); // stale stored title (date typo never fixed) + when(documentRepository.findAll()).thenReturn(List.of(stale)); + when(documentRepository.save(any())).thenReturn(stale); + + int count = documentService.backfillTitles(); + + assertThat(count).isEqualTo(1); + assertThat(stale.getTitle()).isEqualTo("C-0029 – 1928 – Berlin"); + verify(documentRepository).save(stale); + } + + @Test + void backfillTitles_skipsProse() { + Document prose = makeStored("C-0030", LocalDate.of(1928, 1, 1), DatePrecision.YEAR, null); + prose.setTitle("C-0030 – Brief an Mutter"); + when(documentRepository.findAll()).thenReturn(List.of(prose)); + + int count = documentService.backfillTitles(); + + assertThat(count).isZero(); + assertThat(prose.getTitle()).isEqualTo("C-0030 – Brief an Mutter"); + verify(documentRepository, never()).save(any()); + } + + @Test + void backfillTitles_isIdempotent_forAlreadyCorrectTitle() { + Document fresh = makeStored("C-0031", LocalDate.of(1940, 1, 1), DatePrecision.YEAR, null); + // title already equals build(current state) → nothing to do + when(documentRepository.findAll()).thenReturn(List.of(fresh)); + + int count = documentService.backfillTitles(); + + assertThat(count).isZero(); + verify(documentRepository, never()).save(any()); + } + + @Test + void backfillTitles_neverRecordsVersions() { + Document stale = makeStored("C-0029", LocalDate.of(1928, 1, 1), DatePrecision.YEAR, "Berlin"); + stale.setTitle("C-0029 – 2028 – Berlin"); + when(documentRepository.findAll()).thenReturn(List.of(stale)); + when(documentRepository.save(any())).thenReturn(stale); + + documentService.backfillTitles(); + + verify(documentVersionService, never()).recordVersion(any()); + } + // ─── thumbnail dispatch ─────────────────────────────────────────────────── @Test diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentTitleBackfillMatcherTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentTitleBackfillMatcherTest.java new file mode 100644 index 00000000..be5e02be --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentTitleBackfillMatcherTest.java @@ -0,0 +1,151 @@ +package org.raddatz.familienarchiv.document; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; + +import java.util.concurrent.TimeUnit; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * The backfill overwrite heuristic (FR-004) in isolation — every emittable date-label form is + * recognised, prose is left alone, and a regex-metacharacter index is matched literally without + * hanging. The exact label spellings mirror {@code docs/date-label-fixtures.json}. + */ +class DocumentTitleBackfillMatcherTest { + + private static boolean overwritable(String title, String location) { + return DocumentTitleBackfillMatcher.isOverwritable(title, "C-0029", location); + } + + // ─── each date-label form (index + form) is overwritable ────────────────── + + @Test + void year_form() { + assertThat(overwritable("C-0029 – 1916", null)).isTrue(); + } + + @Test + void approx_form() { + assertThat(overwritable("C-0029 – ca. 1920", null)).isTrue(); + } + + @Test + void month_form() { + assertThat(overwritable("C-0029 – Juni 1916", null)).isTrue(); + } + + @Test + void day_form() { + assertThat(overwritable("C-0029 – 24. Dezember 1943", null)).isTrue(); + } + + @Test + void season_form() { + assertThat(overwritable("C-0029 – Sommer 1916", null)).isTrue(); + } + + @Test + void unknown_label_form() { + assertThat(overwritable("C-0029 – Datum unbekannt", null)).isTrue(); + } + + @Test + void range_same_month_form() { + assertThat(overwritable("C-0029 – 10.–11. Jan. 1917", null)).isTrue(); + } + + @Test + void range_cross_month_form() { + assertThat(overwritable("C-0029 – 30. Jan. – 2. Feb. 1917", null)).isTrue(); + } + + @Test + void range_cross_year_form() { + assertThat(overwritable("C-0029 – 30. Dez. 1916 – 2. Jan. 1917", null)).isTrue(); + } + + @Test + void range_single_day_form() { + assertThat(overwritable("C-0029 – 10. Jan. 1917", null)).isTrue(); + } + + @Test + void range_open_form() { + assertThat(overwritable("C-0029 – ab 10. Jan. 1917", null)).isTrue(); + } + + // ─── date label + trailing location (any location) ──────────────────────── + + @Test + void date_form_with_trailing_location() { + assertThat(overwritable("C-0029 – 1916 – Berlin", null)).isTrue(); + } + + @Test + void range_with_internal_separator_plus_trailing_location() { + // The range label itself contains " – "; the trailing " – Berlin" must still be peeled. + assertThat(overwritable("C-0029 – 30. Jan. – 2. Feb. 1917 – Berlin", null)).isTrue(); + } + + // ─── index-only and index+location cases ────────────────────────────────── + + @Test + void exactly_index() { + assertThat(overwritable("C-0029", null)).isTrue(); + } + + @Test + void index_plus_location_equal_to_current() { + assertThat(overwritable("C-0029 – Berlin", "Berlin")).isTrue(); + } + + // ─── prose is left untouched ────────────────────────────────────────────── + + @Test + void prose_segment_not_matching_location_is_skipped() { + assertThat(overwritable("C-0029 – Brief an Mutter", "Berlin")).isFalse(); + } + + @Test + void location_only_segment_is_skipped_when_no_current_location() { + // No date label, and the doc has no location to compare against → cannot prove machine. + assertThat(overwritable("C-0029 – Berlin", null)).isFalse(); + } + + @Test + void title_not_starting_with_index_is_skipped() { + assertThat(overwritable("Ganz anderer Titel", null)).isFalse(); + } + + // ─── fail-closed guards ─────────────────────────────────────────────────── + + @Test + void null_title_is_not_overwritable() { + assertThat(DocumentTitleBackfillMatcher.isOverwritable(null, "C-0029", null)).isFalse(); + } + + @Test + void null_index_is_not_overwritable() { + assertThat(DocumentTitleBackfillMatcher.isOverwritable("C-0029 – 1916", null, null)).isFalse(); + } + + @Test + void blank_index_is_not_overwritable() { + assertThat(DocumentTitleBackfillMatcher.isOverwritable(" – 1916", " ", null)).isFalse(); + } + + // ─── ReDoS / regex-metacharacter index is matched literally and terminates ─ + + @Test + @Timeout(value = 5, unit = TimeUnit.SECONDS) + void index_with_regex_metacharacters_is_matched_literally_and_terminates() { + String hostileIndex = "C-0029(.*).pdf"; + // Literal prefix → matches; trailing date label → overwritable. Must not hang. + assertThat(DocumentTitleBackfillMatcher.isOverwritable( + hostileIndex + " – 1916", hostileIndex, null)).isTrue(); + // A title that does NOT start with the literal hostile index is skipped, also fast. + assertThat(DocumentTitleBackfillMatcher.isOverwritable( + "C-0029 – 1916", hostileIndex, null)).isFalse(); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java index 8e51fad7..6429d3dc 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java @@ -132,6 +132,31 @@ class AdminControllerTest { .andExpect(jsonPath("$.count").value(3)); } + // ─── POST /api/admin/backfill-titles (#726) ──────────────────────────────── + + @Test + void backfillTitles_returns401_whenUnauthenticated() throws Exception { + mockMvc.perform(post("/api/admin/backfill-titles").with(csrf())) + .andExpect(status().isUnauthorized()); + } + + @Test + @WithMockUser(roles = "USER") + void backfillTitles_returns403_whenNotAdmin() throws Exception { + mockMvc.perform(post("/api/admin/backfill-titles").with(csrf())) + .andExpect(status().isForbidden()); + } + + @Test + @WithMockUser(authorities = "ADMIN") + void backfillTitles_returns200_withCount_whenAdmin() throws Exception { + when(documentService.backfillTitles()).thenReturn(7); + + mockMvc.perform(post("/api/admin/backfill-titles").with(csrf())) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.count").value(7)); + } + // ─── POST /api/admin/generate-thumbnails ─────────────────────────────────── @Test