feat(document): one-time backfill endpoint for stale auto-titles (#726)
Adds POST /api/admin/backfill-titles (ADMIN-only, synchronous) which rebuilds every machine-generated title from the row's current state. A grammar heuristic (DocumentTitleBackfillMatcher) decides overwritability: index matched literally via startsWith (originalFilename is user-controlled — no regex injection / ReDoS, CWE-1333), date-label forms derived from the same Locale.GERMAN formatters as the factory so they cannot drift, prose left untouched, fail-closed on any surprise. Saves via the repository directly (no recordVersion — follows backfillFileHashes), so the mechanical rename never version-spams document_versions. Idempotent: a second run rewrites nothing. Emits one SLF4J-parameterized scanned/updated/skipped line. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1058,6 +1058,43 @@ public class DocumentService {
|
||||
tagService.delete(tagId);
|
||||
}
|
||||
|
||||
/**
|
||||
* One-time cleanup of already-stale auto-titles (#726, FR-003). For every document whose
|
||||
* stored title passes the {@link DocumentTitleBackfillMatcher} overwrite heuristic, rebuilds
|
||||
* the title from the row's current state and persists it only when it actually changed.
|
||||
* Idempotent: a second run rebuilds the same value and saves nothing. Hand-written prose is
|
||||
* left untouched.
|
||||
*
|
||||
* <p>Saves via {@code documentRepository.save} directly — it must NOT route through
|
||||
* {@link #updateDocument} (which versions every write), following the {@link #backfillFileHashes}
|
||||
* precedent: a mechanical rename must not snapshot the whole corpus into {@code document_versions}.
|
||||
*
|
||||
* @return the number of documents whose title was rewritten
|
||||
*/
|
||||
@Transactional
|
||||
public int backfillTitles() {
|
||||
List<Document> docs = documentRepository.findAll();
|
||||
int updated = 0;
|
||||
int skipped = 0;
|
||||
for (Document doc : docs) {
|
||||
if (!DocumentTitleBackfillMatcher.isOverwritable(
|
||||
doc.getTitle(), doc.getOriginalFilename(), doc.getLocation())) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
String rebuilt = documentTitleFactory.build(doc);
|
||||
if (rebuilt.equals(doc.getTitle())) {
|
||||
skipped++; // already correct — keep idempotent, no write
|
||||
continue;
|
||||
}
|
||||
doc.setTitle(rebuilt);
|
||||
documentRepository.save(doc); // direct save, no recordVersion (mechanical rename)
|
||||
updated++;
|
||||
}
|
||||
log.info("Title backfill complete: scanned={} updated={} skipped={}", docs.size(), updated, skipped);
|
||||
return updated;
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public int backfillFileHashes() {
|
||||
List<Document> docs = documentRepository.findByFileHashIsNullAndFilePathIsNotNull();
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
package org.raddatz.familienarchiv.document;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Heuristic overwrite test for the one-time title backfill (#726, FR-004): decides whether a
|
||||
* STORED title is a machine-generated auto-title (and so may be rebuilt from the row's current
|
||||
* state) versus hand-written prose (left untouched). Used ONLY by the backfill — save-time
|
||||
* regeneration uses an exact old-vs-new comparison instead, with no heuristic.
|
||||
*
|
||||
* <p>A stored title is overwritable iff, after stripping the literal {@code index} prefix:
|
||||
* <ol>
|
||||
* <li>it is exactly {@code {index}}, or</li>
|
||||
* <li>{@code {index} – {dateLabel}} with an optional trailing {@code – {location}} segment
|
||||
* (any location — a present, valid date label is itself strong evidence of a machine
|
||||
* title), or</li>
|
||||
* <li>{@code {index} – {location}} where the segment equals the document's current location
|
||||
* (no date label, so the segment must match the known location to be distinguished from
|
||||
* prose).</li>
|
||||
* </ol>
|
||||
*
|
||||
* <p>Security: the {@code index} is compared <em>literally</em> via {@link String#startsWith}
|
||||
* (never compiled into a regex) because {@code originalFilename} is user-controlled and may carry
|
||||
* regex metacharacters — an unquoted pattern would be a ReDoS / regex-injection vector
|
||||
* (CWE-1333 / CWE-625). The date-label sub-patterns use only bounded, non-nested quantifiers over
|
||||
* short tokens, so there is no catastrophic backtracking. Fail-closed: any null/blank index or
|
||||
* structural surprise returns {@code false}.
|
||||
*/
|
||||
final class DocumentTitleBackfillMatcher {
|
||||
|
||||
private static final String SEPARATOR = " – ";
|
||||
|
||||
// German month tokens derived from the SAME Locale.GERMAN formatters DocumentTitleFormatter
|
||||
// uses, so the matcher's accepted spellings cannot drift from what the factory emits (full
|
||||
// names "Januar"…"Dezember"; abbreviations "Jan."…"Dez." — note May/June/July/März carry no
|
||||
// period). Pattern.quote each so a "." in an abbreviation is literal, never a wildcard.
|
||||
private static final String FULL_MONTH = monthAlternation("MMMM");
|
||||
private static final String ABBR_MONTH = monthAlternation("MMM");
|
||||
private static final String SEASON = "(?:Frühling|Sommer|Herbst|Winter)";
|
||||
private static final String YEAR = "\\d{1,4}";
|
||||
private static final String DAY_NUM = "\\d{1,2}";
|
||||
|
||||
// One complete date label, anchored, optionally followed by a free-form trailing location
|
||||
// segment. Only bounded/non-nested quantifiers over short tokens plus a single trailing
|
||||
// ".+" → linear, no catastrophic backtracking (FR-004 ReDoS guard).
|
||||
private static final Pattern DATE_LABEL_WITH_OPTIONAL_LOCATION = Pattern.compile(
|
||||
"^(?:" + String.join("|",
|
||||
YEAR, // 1916
|
||||
"ca\\. " + YEAR, // ca. 1920
|
||||
FULL_MONTH + " " + YEAR, // Juni 1916
|
||||
DAY_NUM + "\\. " + FULL_MONTH + " " + YEAR, // 24. Dezember 1943
|
||||
SEASON + " " + YEAR, // Sommer 1916
|
||||
"Datum unbekannt",
|
||||
DAY_NUM + "\\.–" + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 10.–11. Jan. 1917
|
||||
DAY_NUM + "\\. " + ABBR_MONTH + " – " + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 30. Jan. – 2. Feb. 1917
|
||||
DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR + " – " + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 30. Dez. 1916 – 2. Jan. 1917
|
||||
DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR, // 10. Jan. 1917 (range end == start)
|
||||
"ab " + DAY_NUM + "\\. " + ABBR_MONTH + " " + YEAR) // ab 10. Jan. 1917
|
||||
+ ")(?: – .+)?$");
|
||||
|
||||
private DocumentTitleBackfillMatcher() {
|
||||
}
|
||||
|
||||
static boolean isOverwritable(String title, String index, String location) {
|
||||
if (title == null || index == null || index.isBlank()) {
|
||||
return false; // fail closed
|
||||
}
|
||||
if (!title.startsWith(index)) {
|
||||
return false; // index is matched LITERALLY, never as a regex
|
||||
}
|
||||
String tail = title.substring(index.length());
|
||||
if (tail.isEmpty()) {
|
||||
return true; // exactly {index}
|
||||
}
|
||||
if (!tail.startsWith(SEPARATOR)) {
|
||||
return false;
|
||||
}
|
||||
String body = tail.substring(SEPARATOR.length());
|
||||
if (DATE_LABEL_WITH_OPTIONAL_LOCATION.matcher(body).matches()) {
|
||||
return true; // {dateLabel} (+ optional trailing location)
|
||||
}
|
||||
// No date label: the lone segment must equal the document's current location to be
|
||||
// distinguished from hand-written prose.
|
||||
return location != null && !location.isBlank() && body.equals(location);
|
||||
}
|
||||
|
||||
private static String monthAlternation(String pattern) {
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern, Locale.GERMAN);
|
||||
Set<String> tokens = new LinkedHashSet<>();
|
||||
for (int month = 1; month <= 12; month++) {
|
||||
tokens.add(formatter.format(LocalDate.of(2000, month, 15)));
|
||||
}
|
||||
return tokens.stream().map(Pattern::quote).collect(Collectors.joining("|", "(?:", ")"));
|
||||
}
|
||||
}
|
||||
@@ -51,6 +51,12 @@ public class AdminController {
|
||||
return ResponseEntity.ok(new BackfillResult(count));
|
||||
}
|
||||
|
||||
@PostMapping("/backfill-titles")
|
||||
public ResponseEntity<BackfillResult> backfillTitles() {
|
||||
int count = documentService.backfillTitles();
|
||||
return ResponseEntity.ok(new BackfillResult(count));
|
||||
}
|
||||
|
||||
@PostMapping("/generate-thumbnails")
|
||||
public ResponseEntity<ThumbnailBackfillService.BackfillStatus> generateThumbnails() {
|
||||
thumbnailBackfillService.runBackfillAsync();
|
||||
|
||||
Reference in New Issue
Block a user