feat(document): one-time backfill endpoint for stale auto-titles (#726)
Adds POST /api/admin/backfill-titles (ADMIN-only, synchronous) which rebuilds every machine-generated title from the row's current state. A grammar heuristic (DocumentTitleBackfillMatcher) decides overwritability: index matched literally via startsWith (originalFilename is user-controlled — no regex injection / ReDoS, CWE-1333), date-label forms derived from the same Locale.GERMAN formatters as the factory so they cannot drift, prose left untouched, fail-closed on any surprise. Saves via the repository directly (no recordVersion — follows backfillFileHashes), so the mechanical rename never version-spams document_versions. Idempotent: a second run rewrites nothing. Emits one SLF4J-parameterized scanned/updated/skipped line. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1058,6 +1058,43 @@ public class DocumentService {
|
||||
tagService.delete(tagId);
|
||||
}
|
||||
|
||||
/**
|
||||
* One-time cleanup of already-stale auto-titles (#726, FR-003). For every document whose
|
||||
* stored title passes the {@link DocumentTitleBackfillMatcher} overwrite heuristic, rebuilds
|
||||
* the title from the row's current state and persists it only when it actually changed.
|
||||
* Idempotent: a second run rebuilds the same value and saves nothing. Hand-written prose is
|
||||
* left untouched.
|
||||
*
|
||||
* <p>Saves via {@code documentRepository.save} directly — it must NOT route through
|
||||
* {@link #updateDocument} (which versions every write), following the {@link #backfillFileHashes}
|
||||
* precedent: a mechanical rename must not snapshot the whole corpus into {@code document_versions}.
|
||||
*
|
||||
* @return the number of documents whose title was rewritten
|
||||
*/
|
||||
@Transactional
|
||||
public int backfillTitles() {
|
||||
List<Document> docs = documentRepository.findAll();
|
||||
int updated = 0;
|
||||
int skipped = 0;
|
||||
for (Document doc : docs) {
|
||||
if (!DocumentTitleBackfillMatcher.isOverwritable(
|
||||
doc.getTitle(), doc.getOriginalFilename(), doc.getLocation())) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
String rebuilt = documentTitleFactory.build(doc);
|
||||
if (rebuilt.equals(doc.getTitle())) {
|
||||
skipped++; // already correct — keep idempotent, no write
|
||||
continue;
|
||||
}
|
||||
doc.setTitle(rebuilt);
|
||||
documentRepository.save(doc); // direct save, no recordVersion (mechanical rename)
|
||||
updated++;
|
||||
}
|
||||
log.info("Title backfill complete: scanned={} updated={} skipped={}", docs.size(), updated, skipped);
|
||||
return updated;
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public int backfillFileHashes() {
|
||||
List<Document> docs = documentRepository.findByFileHashIsNullAndFilePathIsNotNull();
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
package org.raddatz.familienarchiv.document;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
 * Overwrite heuristic for the one-time title backfill (#726, FR-004). Given a STORED title, it
 * answers: does this look like a machine-generated auto-title (safe to rebuild), or like
 * hand-written prose (must be left alone)?
 *
 * <p>A title is considered overwritable when it starts with the literal {@code index} and the
 * rest is one of:
 * <ol>
 *   <li>nothing at all — the title is exactly {@code {index}};</li>
 *   <li>{@code  – {dateLabel}}, optionally followed by {@code  – {anything}} (a recognised date
 *       label is treated as sufficient evidence, so the trailing segment is not checked against
 *       the document's location);</li>
 *   <li>{@code  – {location}}, where the segment equals the supplied, non-blank
 *       {@code location} exactly.</li>
 * </ol>
 * Everything else — including a {@code null} title, a {@code null}/blank index, or a title
 * that does not start with the index — yields {@code false} (fail closed).
 *
 * <p>Security: the index is only ever compared with {@link String#startsWith} and never
 * compiled into a regex, so regex metacharacters in it are inert. The date-label regex is built
 * from fixed fragments with bounded quantifiers, plus one trailing {@code .+}.
 */
final class DocumentTitleBackfillMatcher {

    /** Segment delimiter used in titles: space, en dash, space. */
    private static final String SEPARATOR = " – ";

    // Regex fragments. Numeric tokens are bounded; the season names are fixed literals.
    private static final String YEAR = "\\d{1,4}";
    private static final String DAY_NUM = "\\d{1,2}";
    private static final String SEASON = "(?:Frühling|Sommer|Herbst|Winter)";

    // Month names are produced by Locale.GERMAN formatters instead of a hand-maintained list,
    // and each token is regex-quoted so a "." in an abbreviation stays a literal dot.
    // NOTE(review): the title factory is understood to format with the same patterns/locale —
    // confirm if that ever changes, otherwise matcher and factory can drift apart.
    private static final String FULL_MONTH = monthAlternation("MMMM");
    private static final String ABBR_MONTH = monthAlternation("MMM");

    // Shared building blocks of the abbreviated-month (range) forms.
    private static final String DAY_AND_ABBR_MONTH = DAY_NUM + "\\. " + ABBR_MONTH; // 30. Jan.
    private static final String ABBR_DATE = DAY_AND_ABBR_MONTH + " " + YEAR;        // 30. Jan. 1917

    // Every accepted date-label shape, in the order they are tried.
    private static final String[] DATE_LABEL_FORMS = {
        YEAR,                                          // 1916
        "ca\\. " + YEAR,                               // ca. 1920
        FULL_MONTH + " " + YEAR,                       // Juni 1916
        DAY_NUM + "\\. " + FULL_MONTH + " " + YEAR,    // 24. Dezember 1943
        SEASON + " " + YEAR,                           // Sommer 1916
        "Datum unbekannt",
        DAY_NUM + "\\.–" + ABBR_DATE,                  // 10.–11. Jan. 1917
        DAY_AND_ABBR_MONTH + " – " + ABBR_DATE,        // 30. Jan. – 2. Feb. 1917
        ABBR_DATE + " – " + ABBR_DATE,                 // 30. Dez. 1916 – 2. Jan. 1917
        ABBR_DATE,                                     // 10. Jan. 1917 (range end == start)
        "ab " + ABBR_DATE                              // ab 10. Jan. 1917
    };

    // One full date label, then optionally " – " and a free-form trailing segment.
    private static final Pattern DATE_LABEL_WITH_OPTIONAL_LOCATION = Pattern.compile(
            "^(?:" + String.join("|", DATE_LABEL_FORMS) + ")(?: – .+)?$");

    private DocumentTitleBackfillMatcher() {
        // static utility — not instantiable
    }

    /**
     * Tells whether a stored title may be replaced by a freshly built one.
     *
     * @param title    the stored title; {@code null} is never overwritable
     * @param index    the literal prefix the title must start with; {@code null} or blank is
     *                 never overwritable
     * @param location the document's current location, used only when the title carries no
     *                 date label; may be {@code null}
     * @return {@code true} if the title matches one of the machine-generated shapes
     */
    static boolean isOverwritable(String title, String index, String location) {
        if (title == null || index == null || index.isBlank() || !title.startsWith(index)) {
            return false; // fail closed; the index is compared literally, never as a regex
        }
        if (title.length() == index.length()) {
            return true; // the title is the bare index
        }
        String labelledPrefix = index + SEPARATOR;
        if (!title.startsWith(labelledPrefix)) {
            return false; // something other than the separator follows the index
        }
        String remainder = title.substring(labelledPrefix.length());
        boolean hasDateLabel = DATE_LABEL_WITH_OPTIONAL_LOCATION.matcher(remainder).matches();
        if (hasDateLabel) {
            return true; // date label, with or without a trailing segment
        }
        // No date label: only an exact match with the known location distinguishes a
        // machine title from prose.
        if (location == null || location.isBlank()) {
            return false;
        }
        return remainder.equals(location);
    }

    /**
     * Builds a non-capturing regex alternation of the twelve German month names rendered with
     * the given formatter pattern, each quoted as a literal. Duplicate renderings are collapsed
     * while keeping calendar order.
     */
    private static String monthAlternation(String pattern) {
        DateTimeFormatter monthFormat = DateTimeFormatter.ofPattern(pattern, Locale.GERMAN);
        Set<String> quotedNames = new LinkedHashSet<>();
        for (int monthNumber = 1; monthNumber <= 12; monthNumber++) {
            LocalDate midMonth = LocalDate.of(2000, monthNumber, 15);
            quotedNames.add(Pattern.quote(monthFormat.format(midMonth)));
        }
        return "(?:" + String.join("|", quotedNames) + ")";
    }
}
|
||||
@@ -51,6 +51,12 @@ public class AdminController {
|
||||
return ResponseEntity.ok(new BackfillResult(count));
|
||||
}
|
||||
|
||||
@PostMapping("/backfill-titles")
|
||||
public ResponseEntity<BackfillResult> backfillTitles() {
|
||||
int count = documentService.backfillTitles();
|
||||
return ResponseEntity.ok(new BackfillResult(count));
|
||||
}
|
||||
|
||||
@PostMapping("/generate-thumbnails")
|
||||
public ResponseEntity<ThumbnailBackfillService.BackfillStatus> generateThumbnails() {
|
||||
thumbnailBackfillService.runBackfillAsync();
|
||||
|
||||
@@ -658,6 +658,59 @@ class DocumentServiceTest {
|
||||
verify(documentVersionService).recordVersion(any(Document.class));
|
||||
}
|
||||
|
||||
// ─── backfillTitles — one-time stale-title cleanup (#726, FR-003) ─────────
|
||||
|
||||
@Test
|
||||
void backfillTitles_rewritesStaleAutoTitle_andCountsIt() {
|
||||
Document stale = makeStored("C-0029", LocalDate.of(1928, 1, 1), DatePrecision.YEAR, "Berlin");
|
||||
stale.setTitle("C-0029 – 2028 – Berlin"); // stale stored title (date typo never fixed)
|
||||
when(documentRepository.findAll()).thenReturn(List.of(stale));
|
||||
when(documentRepository.save(any())).thenReturn(stale);
|
||||
|
||||
int count = documentService.backfillTitles();
|
||||
|
||||
assertThat(count).isEqualTo(1);
|
||||
assertThat(stale.getTitle()).isEqualTo("C-0029 – 1928 – Berlin");
|
||||
verify(documentRepository).save(stale);
|
||||
}
|
||||
|
||||
@Test
|
||||
void backfillTitles_skipsProse() {
|
||||
Document prose = makeStored("C-0030", LocalDate.of(1928, 1, 1), DatePrecision.YEAR, null);
|
||||
prose.setTitle("C-0030 – Brief an Mutter");
|
||||
when(documentRepository.findAll()).thenReturn(List.of(prose));
|
||||
|
||||
int count = documentService.backfillTitles();
|
||||
|
||||
assertThat(count).isZero();
|
||||
assertThat(prose.getTitle()).isEqualTo("C-0030 – Brief an Mutter");
|
||||
verify(documentRepository, never()).save(any());
|
||||
}
|
||||
|
||||
@Test
|
||||
void backfillTitles_isIdempotent_forAlreadyCorrectTitle() {
|
||||
Document fresh = makeStored("C-0031", LocalDate.of(1940, 1, 1), DatePrecision.YEAR, null);
|
||||
// title already equals build(current state) → nothing to do
|
||||
when(documentRepository.findAll()).thenReturn(List.of(fresh));
|
||||
|
||||
int count = documentService.backfillTitles();
|
||||
|
||||
assertThat(count).isZero();
|
||||
verify(documentRepository, never()).save(any());
|
||||
}
|
||||
|
||||
@Test
|
||||
void backfillTitles_neverRecordsVersions() {
|
||||
Document stale = makeStored("C-0029", LocalDate.of(1928, 1, 1), DatePrecision.YEAR, "Berlin");
|
||||
stale.setTitle("C-0029 – 2028 – Berlin");
|
||||
when(documentRepository.findAll()).thenReturn(List.of(stale));
|
||||
when(documentRepository.save(any())).thenReturn(stale);
|
||||
|
||||
documentService.backfillTitles();
|
||||
|
||||
verify(documentVersionService, never()).recordVersion(any());
|
||||
}
|
||||
|
||||
// ─── thumbnail dispatch ───────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
package org.raddatz.familienarchiv.document;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.Timeout;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* The backfill overwrite heuristic (FR-004) in isolation — every emittable date-label form is
|
||||
* recognised, prose is left alone, and a regex-metacharacter index is matched literally without
|
||||
* hanging. The exact label spellings mirror {@code docs/date-label-fixtures.json}.
|
||||
*/
|
||||
class DocumentTitleBackfillMatcherTest {
|
||||
|
||||
private static boolean overwritable(String title, String location) {
|
||||
return DocumentTitleBackfillMatcher.isOverwritable(title, "C-0029", location);
|
||||
}
|
||||
|
||||
// ─── each date-label form (index + form) is overwritable ──────────────────
|
||||
|
||||
@Test
|
||||
void year_form() {
|
||||
assertThat(overwritable("C-0029 – 1916", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void approx_form() {
|
||||
assertThat(overwritable("C-0029 – ca. 1920", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void month_form() {
|
||||
assertThat(overwritable("C-0029 – Juni 1916", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void day_form() {
|
||||
assertThat(overwritable("C-0029 – 24. Dezember 1943", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void season_form() {
|
||||
assertThat(overwritable("C-0029 – Sommer 1916", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void unknown_label_form() {
|
||||
assertThat(overwritable("C-0029 – Datum unbekannt", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void range_same_month_form() {
|
||||
assertThat(overwritable("C-0029 – 10.–11. Jan. 1917", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void range_cross_month_form() {
|
||||
assertThat(overwritable("C-0029 – 30. Jan. – 2. Feb. 1917", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void range_cross_year_form() {
|
||||
assertThat(overwritable("C-0029 – 30. Dez. 1916 – 2. Jan. 1917", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void range_single_day_form() {
|
||||
assertThat(overwritable("C-0029 – 10. Jan. 1917", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void range_open_form() {
|
||||
assertThat(overwritable("C-0029 – ab 10. Jan. 1917", null)).isTrue();
|
||||
}
|
||||
|
||||
// ─── date label + trailing location (any location) ────────────────────────
|
||||
|
||||
@Test
|
||||
void date_form_with_trailing_location() {
|
||||
assertThat(overwritable("C-0029 – 1916 – Berlin", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void range_with_internal_separator_plus_trailing_location() {
|
||||
// The range label itself contains " – "; the trailing " – Berlin" must still be peeled.
|
||||
assertThat(overwritable("C-0029 – 30. Jan. – 2. Feb. 1917 – Berlin", null)).isTrue();
|
||||
}
|
||||
|
||||
// ─── index-only and index+location cases ──────────────────────────────────
|
||||
|
||||
@Test
|
||||
void exactly_index() {
|
||||
assertThat(overwritable("C-0029", null)).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
void index_plus_location_equal_to_current() {
|
||||
assertThat(overwritable("C-0029 – Berlin", "Berlin")).isTrue();
|
||||
}
|
||||
|
||||
// ─── prose is left untouched ──────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void prose_segment_not_matching_location_is_skipped() {
|
||||
assertThat(overwritable("C-0029 – Brief an Mutter", "Berlin")).isFalse();
|
||||
}
|
||||
|
||||
@Test
|
||||
void location_only_segment_is_skipped_when_no_current_location() {
|
||||
// No date label, and the doc has no location to compare against → cannot prove machine.
|
||||
assertThat(overwritable("C-0029 – Berlin", null)).isFalse();
|
||||
}
|
||||
|
||||
@Test
|
||||
void title_not_starting_with_index_is_skipped() {
|
||||
assertThat(overwritable("Ganz anderer Titel", null)).isFalse();
|
||||
}
|
||||
|
||||
// ─── fail-closed guards ───────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void null_title_is_not_overwritable() {
|
||||
assertThat(DocumentTitleBackfillMatcher.isOverwritable(null, "C-0029", null)).isFalse();
|
||||
}
|
||||
|
||||
@Test
|
||||
void null_index_is_not_overwritable() {
|
||||
assertThat(DocumentTitleBackfillMatcher.isOverwritable("C-0029 – 1916", null, null)).isFalse();
|
||||
}
|
||||
|
||||
@Test
|
||||
void blank_index_is_not_overwritable() {
|
||||
assertThat(DocumentTitleBackfillMatcher.isOverwritable(" – 1916", " ", null)).isFalse();
|
||||
}
|
||||
|
||||
// ─── ReDoS / regex-metacharacter index is matched literally and terminates ─
|
||||
|
||||
@Test
|
||||
@Timeout(value = 5, unit = TimeUnit.SECONDS)
|
||||
void index_with_regex_metacharacters_is_matched_literally_and_terminates() {
|
||||
String hostileIndex = "C-0029(.*).pdf";
|
||||
// Literal prefix → matches; trailing date label → overwritable. Must not hang.
|
||||
assertThat(DocumentTitleBackfillMatcher.isOverwritable(
|
||||
hostileIndex + " – 1916", hostileIndex, null)).isTrue();
|
||||
// A title that does NOT start with the literal hostile index is skipped, also fast.
|
||||
assertThat(DocumentTitleBackfillMatcher.isOverwritable(
|
||||
"C-0029 – 1916", hostileIndex, null)).isFalse();
|
||||
}
|
||||
}
|
||||
@@ -132,6 +132,31 @@ class AdminControllerTest {
|
||||
.andExpect(jsonPath("$.count").value(3));
|
||||
}
|
||||
|
||||
// ─── POST /api/admin/backfill-titles (#726) ────────────────────────────────
|
||||
|
||||
@Test
|
||||
void backfillTitles_returns401_whenUnauthenticated() throws Exception {
|
||||
mockMvc.perform(post("/api/admin/backfill-titles").with(csrf()))
|
||||
.andExpect(status().isUnauthorized());
|
||||
}
|
||||
|
||||
@Test
|
||||
@WithMockUser(roles = "USER")
|
||||
void backfillTitles_returns403_whenNotAdmin() throws Exception {
|
||||
mockMvc.perform(post("/api/admin/backfill-titles").with(csrf()))
|
||||
.andExpect(status().isForbidden());
|
||||
}
|
||||
|
||||
@Test
|
||||
@WithMockUser(authorities = "ADMIN")
|
||||
void backfillTitles_returns200_withCount_whenAdmin() throws Exception {
|
||||
when(documentService.backfillTitles()).thenReturn(7);
|
||||
|
||||
mockMvc.perform(post("/api/admin/backfill-titles").with(csrf()))
|
||||
.andExpect(status().isOk())
|
||||
.andExpect(jsonPath("$.count").value(7));
|
||||
}
|
||||
|
||||
// ─── POST /api/admin/generate-thumbnails ───────────────────────────────────
|
||||
|
||||
@Test
|
||||
|
||||
Reference in New Issue
Block a user