feat(document): one-time backfill endpoint for stale auto-titles (#726)

Adds POST /api/admin/backfill-titles (ADMIN-only, synchronous) which rebuilds every
machine-generated title from the row's current state. A grammar heuristic
(DocumentTitleBackfillMatcher) decides overwritability: index matched literally via
startsWith (originalFilename is user-controlled — no regex injection / ReDoS, CWE-1333),
date-label forms derived from the same Locale.GERMAN formatters as the factory so they
cannot drift, prose left untouched, fail-closed on any surprise. Saves via the repository
directly (no recordVersion — follows backfillFileHashes), so the mechanical rename never
version-spams document_versions. Idempotent: a second run rewrites nothing. Emits one
SLF4J-parameterized scanned/updated/skipped line.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-06-04 16:29:57 +02:00
parent e6ce00035e
commit 26b45f1c78
6 changed files with 373 additions and 0 deletions

View File

@@ -1058,6 +1058,43 @@ public class DocumentService {
tagService.delete(tagId);
}
/**
* One-time cleanup of already-stale auto-titles (#726, FR-003). For every document whose
* stored title passes the {@link DocumentTitleBackfillMatcher} overwrite heuristic, rebuilds
* the title from the row's current state and persists it only when it actually changed.
* Idempotent: a second run rebuilds the same value and saves nothing. Hand-written prose is
* left untouched.
*
* <p>Saves via {@code documentRepository.save} directly — it must NOT route through
* {@link #updateDocument} (which versions every write), following the {@link #backfillFileHashes}
* precedent: a mechanical rename must not snapshot the whole corpus into {@code document_versions}.
*
* @return the number of documents whose title was rewritten
*/
@Transactional
public int backfillTitles() {
List<Document> docs = documentRepository.findAll();
int updated = 0;
int skipped = 0;
for (Document doc : docs) {
if (!DocumentTitleBackfillMatcher.isOverwritable(
doc.getTitle(), doc.getOriginalFilename(), doc.getLocation())) {
skipped++;
continue;
}
String rebuilt = documentTitleFactory.build(doc);
if (rebuilt.equals(doc.getTitle())) {
skipped++; // already correct — keep idempotent, no write
continue;
}
doc.setTitle(rebuilt);
documentRepository.save(doc); // direct save, no recordVersion (mechanical rename)
updated++;
}
log.info("Title backfill complete: scanned={} updated={} skipped={}", docs.size(), updated, skipped);
return updated;
}
@Transactional
public int backfillFileHashes() {
List<Document> docs = documentRepository.findByFileHashIsNullAndFilePathIsNotNull();

View File

@@ -0,0 +1,101 @@
package org.raddatz.familienarchiv.document;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Heuristic overwrite test for the one-time title backfill (#726, FR-004): decides whether a
* STORED title is a machine-generated auto-title (and so may be rebuilt from the row's current
* state) versus hand-written prose (left untouched). Used ONLY by the backfill — save-time
* regeneration uses an exact old-vs-new comparison instead, with no heuristic.
*
* <p>A stored title is overwritable iff, after stripping the literal {@code index} prefix:
* <ol>
* <li>it is exactly {@code {index}}, or</li>
* <li>{@code {index} {dateLabel}} with an optional trailing {@code {location}} segment
* (any location — a present, valid date label is itself strong evidence of a machine
* title), or</li>
* <li>{@code {index} {location}} where the segment equals the document's current location
* (no date label, so the segment must match the known location to be distinguished from
* prose).</li>
* </ol>
*
* <p>Security: the {@code index} is compared <em>literally</em> via {@link String#startsWith}
* (never compiled into a regex) because {@code originalFilename} is user-controlled and may carry
* regex metacharacters — an unquoted pattern would be a ReDoS / regex-injection vector
* (CWE-1333 / CWE-625). The date-label sub-patterns use only bounded, non-nested quantifiers over
* short tokens, so there is no catastrophic backtracking. Fail-closed: any null/blank index or
* structural surprise returns {@code false}.
*/
final class DocumentTitleBackfillMatcher {
private static final String SEPARATOR = " ";
// German month tokens derived from the SAME Locale.GERMAN formatters DocumentTitleFormatter
// uses, so the matcher's accepted spellings cannot drift from what the factory emits (full
// names "Januar"…"Dezember"; abbreviations "Jan."…"Dez." — note May/June/July/März carry no
// period). Pattern.quote each so a "." in an abbreviation is literal, never a wildcard.
private static final String FULL_MONTH = monthAlternation("MMMM");
private static final String ABBR_MONTH = monthAlternation("MMM");
private static final String SEASON = "(?:Frühling|Sommer|Herbst|Winter)";
private static final String YEAR = "\\d{1,4}";
private static final String DAY_NUM = "\\d{1,2}";
// One complete date label, anchored, optionally followed by a free-form trailing location
// segment. Only bounded/non-nested quantifiers over short tokens plus a single trailing
// ".+" → linear, no catastrophic backtracking (FR-004 ReDoS guard).
private static final Pattern DATE_LABEL_WITH_OPTIONAL_LOCATION = Pattern.compile(
"^(?:" + String.join("|",
YEAR, // 1916
"ca\\. " + YEAR, // ca. 1920
FULL_MONTH + " " + YEAR, // Juni 1916
DAY_NUM + "\\. " + FULL_MONTH + " " + YEAR, // 24. Dezember 1943
SEASON + " " + YEAR, // Sommer 1916
"Datum unbekannt",
DAY_NUM + "\\." + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 10.11. Jan. 1917
DAY_NUM + "\\. " + ABBR_MONTH + " " + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 30. Jan. 2. Feb. 1917
DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR + " " + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 30. Dez. 1916 2. Jan. 1917
DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 10. Jan. 1917 (range end == start)
"ab " + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR) // ab 10. Jan. 1917
+ ")(?: .+)?$");
private DocumentTitleBackfillMatcher() {
}
static boolean isOverwritable(String title, String index, String location) {
if (title == null || index == null || index.isBlank()) {
return false; // fail closed
}
if (!title.startsWith(index)) {
return false; // index is matched LITERALLY, never as a regex
}
String tail = title.substring(index.length());
if (tail.isEmpty()) {
return true; // exactly {index}
}
if (!tail.startsWith(SEPARATOR)) {
return false;
}
String body = tail.substring(SEPARATOR.length());
if (DATE_LABEL_WITH_OPTIONAL_LOCATION.matcher(body).matches()) {
return true; // {dateLabel} (+ optional trailing location)
}
// No date label: the lone segment must equal the document's current location to be
// distinguished from hand-written prose.
return location != null && !location.isBlank() && body.equals(location);
}
private static String monthAlternation(String pattern) {
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern, Locale.GERMAN);
Set<String> tokens = new LinkedHashSet<>();
for (int month = 1; month <= 12; month++) {
tokens.add(formatter.format(LocalDate.of(2000, month, 15)));
}
return tokens.stream().map(Pattern::quote).collect(Collectors.joining("|", "(?:", ")"));
}
}

View File

@@ -51,6 +51,12 @@ public class AdminController {
return ResponseEntity.ok(new BackfillResult(count));
}
@PostMapping("/backfill-titles")
public ResponseEntity<BackfillResult> backfillTitles() {
int count = documentService.backfillTitles();
return ResponseEntity.ok(new BackfillResult(count));
}
@PostMapping("/generate-thumbnails")
public ResponseEntity<ThumbnailBackfillService.BackfillStatus> generateThumbnails() {
thumbnailBackfillService.runBackfillAsync();

View File

@@ -658,6 +658,59 @@ class DocumentServiceTest {
verify(documentVersionService).recordVersion(any(Document.class));
}
// ─── backfillTitles — one-time stale-title cleanup (#726, FR-003) ─────────
@Test
void backfillTitles_rewritesStaleAutoTitle_andCountsIt() {
Document stale = makeStored("C-0029", LocalDate.of(1928, 1, 1), DatePrecision.YEAR, "Berlin");
stale.setTitle("C-0029 2028 Berlin"); // stale stored title (date typo never fixed)
when(documentRepository.findAll()).thenReturn(List.of(stale));
when(documentRepository.save(any())).thenReturn(stale);
int count = documentService.backfillTitles();
assertThat(count).isEqualTo(1);
assertThat(stale.getTitle()).isEqualTo("C-0029 1928 Berlin");
verify(documentRepository).save(stale);
}
@Test
void backfillTitles_skipsProse() {
Document prose = makeStored("C-0030", LocalDate.of(1928, 1, 1), DatePrecision.YEAR, null);
prose.setTitle("C-0030 Brief an Mutter");
when(documentRepository.findAll()).thenReturn(List.of(prose));
int count = documentService.backfillTitles();
assertThat(count).isZero();
assertThat(prose.getTitle()).isEqualTo("C-0030 Brief an Mutter");
verify(documentRepository, never()).save(any());
}
@Test
void backfillTitles_isIdempotent_forAlreadyCorrectTitle() {
Document fresh = makeStored("C-0031", LocalDate.of(1940, 1, 1), DatePrecision.YEAR, null);
// title already equals build(current state) → nothing to do
when(documentRepository.findAll()).thenReturn(List.of(fresh));
int count = documentService.backfillTitles();
assertThat(count).isZero();
verify(documentRepository, never()).save(any());
}
@Test
void backfillTitles_neverRecordsVersions() {
Document stale = makeStored("C-0029", LocalDate.of(1928, 1, 1), DatePrecision.YEAR, "Berlin");
stale.setTitle("C-0029 2028 Berlin");
when(documentRepository.findAll()).thenReturn(List.of(stale));
when(documentRepository.save(any())).thenReturn(stale);
documentService.backfillTitles();
verify(documentVersionService, never()).recordVersion(any());
}
// ─── thumbnail dispatch ───────────────────────────────────────────────────
@Test

View File

@@ -0,0 +1,151 @@
package org.raddatz.familienarchiv.document;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Timeout;
import java.util.concurrent.TimeUnit;
import static org.assertj.core.api.Assertions.assertThat;
/**
* The backfill overwrite heuristic (FR-004) in isolation — every emittable date-label form is
* recognised, prose is left alone, and a regex-metacharacter index is matched literally without
* hanging. The exact label spellings mirror {@code docs/date-label-fixtures.json}.
*/
class DocumentTitleBackfillMatcherTest {
private static boolean overwritable(String title, String location) {
return DocumentTitleBackfillMatcher.isOverwritable(title, "C-0029", location);
}
// ─── each date-label form (index + form) is overwritable ──────────────────
@Test
void year_form() {
assertThat(overwritable("C-0029 1916", null)).isTrue();
}
@Test
void approx_form() {
assertThat(overwritable("C-0029 ca. 1920", null)).isTrue();
}
@Test
void month_form() {
assertThat(overwritable("C-0029 Juni 1916", null)).isTrue();
}
@Test
void day_form() {
assertThat(overwritable("C-0029 24. Dezember 1943", null)).isTrue();
}
@Test
void season_form() {
assertThat(overwritable("C-0029 Sommer 1916", null)).isTrue();
}
@Test
void unknown_label_form() {
assertThat(overwritable("C-0029 Datum unbekannt", null)).isTrue();
}
@Test
void range_same_month_form() {
assertThat(overwritable("C-0029 10.11. Jan. 1917", null)).isTrue();
}
@Test
void range_cross_month_form() {
assertThat(overwritable("C-0029 30. Jan. 2. Feb. 1917", null)).isTrue();
}
@Test
void range_cross_year_form() {
assertThat(overwritable("C-0029 30. Dez. 1916 2. Jan. 1917", null)).isTrue();
}
@Test
void range_single_day_form() {
assertThat(overwritable("C-0029 10. Jan. 1917", null)).isTrue();
}
@Test
void range_open_form() {
assertThat(overwritable("C-0029 ab 10. Jan. 1917", null)).isTrue();
}
// ─── date label + trailing location (any location) ────────────────────────
@Test
void date_form_with_trailing_location() {
assertThat(overwritable("C-0029 1916 Berlin", null)).isTrue();
}
@Test
void range_with_internal_separator_plus_trailing_location() {
// The range label itself contains " "; the trailing " Berlin" must still be peeled.
assertThat(overwritable("C-0029 30. Jan. 2. Feb. 1917 Berlin", null)).isTrue();
}
// ─── index-only and index+location cases ──────────────────────────────────
@Test
void exactly_index() {
assertThat(overwritable("C-0029", null)).isTrue();
}
@Test
void index_plus_location_equal_to_current() {
assertThat(overwritable("C-0029 Berlin", "Berlin")).isTrue();
}
// ─── prose is left untouched ──────────────────────────────────────────────
@Test
void prose_segment_not_matching_location_is_skipped() {
assertThat(overwritable("C-0029 Brief an Mutter", "Berlin")).isFalse();
}
@Test
void location_only_segment_is_skipped_when_no_current_location() {
// No date label, and the doc has no location to compare against → cannot prove machine.
assertThat(overwritable("C-0029 Berlin", null)).isFalse();
}
@Test
void title_not_starting_with_index_is_skipped() {
assertThat(overwritable("Ganz anderer Titel", null)).isFalse();
}
// ─── fail-closed guards ───────────────────────────────────────────────────
@Test
void null_title_is_not_overwritable() {
assertThat(DocumentTitleBackfillMatcher.isOverwritable(null, "C-0029", null)).isFalse();
}
@Test
void null_index_is_not_overwritable() {
assertThat(DocumentTitleBackfillMatcher.isOverwritable("C-0029 1916", null, null)).isFalse();
}
@Test
void blank_index_is_not_overwritable() {
assertThat(DocumentTitleBackfillMatcher.isOverwritable(" 1916", " ", null)).isFalse();
}
// ─── ReDoS / regex-metacharacter index is matched literally and terminates ─
@Test
@Timeout(value = 5, unit = TimeUnit.SECONDS)
void index_with_regex_metacharacters_is_matched_literally_and_terminates() {
String hostileIndex = "C-0029(.*).pdf";
// Literal prefix → matches; trailing date label → overwritable. Must not hang.
assertThat(DocumentTitleBackfillMatcher.isOverwritable(
hostileIndex + " 1916", hostileIndex, null)).isTrue();
// A title that does NOT start with the literal hostile index is skipped, also fast.
assertThat(DocumentTitleBackfillMatcher.isOverwritable(
"C-0029 1916", hostileIndex, null)).isFalse();
}
}

View File

@@ -132,6 +132,31 @@ class AdminControllerTest {
.andExpect(jsonPath("$.count").value(3));
}
// ─── POST /api/admin/backfill-titles (#726) ────────────────────────────────
@Test
void backfillTitles_returns401_whenUnauthenticated() throws Exception {
mockMvc.perform(post("/api/admin/backfill-titles").with(csrf()))
.andExpect(status().isUnauthorized());
}
@Test
@WithMockUser(roles = "USER")
void backfillTitles_returns403_whenNotAdmin() throws Exception {
mockMvc.perform(post("/api/admin/backfill-titles").with(csrf()))
.andExpect(status().isForbidden());
}
@Test
@WithMockUser(authorities = "ADMIN")
void backfillTitles_returns200_withCount_whenAdmin() throws Exception {
when(documentService.backfillTitles()).thenReturn(7);
mockMvc.perform(post("/api/admin/backfill-titles").with(csrf()))
.andExpect(status().isOk())
.andExpect(jsonPath("$.count").value(7));
}
// ─── POST /api/admin/generate-thumbnails ───────────────────────────────────
@Test