feat(filename): support compound last names like de Gruyter
Some checks failed
CI / Unit & Component Tests (push) Has been cancelled
CI / Backend Unit Tests (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
CI / Unit & Component Tests (pull_request) Successful in 2m17s
CI / Backend Unit Tests (pull_request) Successful in 2m13s
CI / E2E Tests (pull_request) Failing after 25m0s

Replace the four fixed regexes with a split-based algorithm:
- first segment = date → last segment = firstName, rest = lastName parts
- last segment = date → second-to-last = firstName, rest = lastName parts

18881025_de_Gruyter_Walter.pdf now correctly yields "Walter de Gruyter (25.10.1888)".
Simple two-segment names behave identically to before.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-03-26 15:33:21 +01:00
parent a302f96560
commit f0940524e7
4 changed files with 157 additions and 87 deletions

View File

@@ -358,55 +358,70 @@ public class DocumentService {
/**
* Derives a human-readable title from a structured filename.
* Supports patterns (full match only):
* YYYY-MM-DD_Lastname_Firstname.ext
* YYYYMMDD_Lastname_Firstname.ext
* Lastname_Firstname_YYYY-MM-DD.ext
* Lastname_Firstname_YYYYMMDD.ext
* Falls back to stripExtension for unrecognised names.
*
* Algorithm: split stem on "_", identify the date token (first or last segment),
* treat the outermost remaining segment as firstName, rest as lastName parts.
* Compound last names (e.g. "de_Gruyter") are handled naturally.
* Falls back to stripExtension for unrecognised filenames.
*
* Examples:
* 18881025_de_Gruyter_Walter.pdf → "Walter de Gruyter (25.10.1888)"
* 1965-03-12_Mueller_Hans.pdf → "Hans Mueller (12.03.1965)"
* Mueller_Hans_19650312.pdf → "Hans Mueller (12.03.1965)"
*/
private static final java.util.regex.Pattern FN_DATE_ISO_NAME =
java.util.regex.Pattern.compile("^(\\d{4}-\\d{2}-\\d{2})_(\\p{L}+)_(\\p{L}+)\\.[^.]+$");
private static final java.util.regex.Pattern FN_DATE_COMPACT_NAME =
java.util.regex.Pattern.compile("^(\\d{8})_(\\p{L}+)_(\\p{L}+)\\.[^.]+$");
private static final java.util.regex.Pattern FN_NAME_DATE_ISO =
java.util.regex.Pattern.compile("^(\\p{L}+)_(\\p{L}+)_(\\d{4}-\\d{2}-\\d{2})\\.[^.]+$");
private static final java.util.regex.Pattern FN_NAME_DATE_COMPACT =
java.util.regex.Pattern.compile("^(\\p{L}+)_(\\p{L}+)_(\\d{8})\\.[^.]+$");
static String titleFromFilename(String filename) {
if (filename == null) return null;
java.util.regex.Matcher m;
String dateIso, lastName, firstName;
if ((m = FN_DATE_ISO_NAME.matcher(filename)).matches()) {
dateIso = m.group(1);
lastName = m.group(2);
firstName = m.group(3);
} else if ((m = FN_DATE_COMPACT_NAME.matcher(filename)).matches()) {
String compact = m.group(1);
dateIso = compact.substring(0, 4) + "-" + compact.substring(4, 6) + "-" + compact.substring(6, 8);
lastName = m.group(2);
firstName = m.group(3);
} else if ((m = FN_NAME_DATE_ISO.matcher(filename)).matches()) {
lastName = m.group(1);
firstName = m.group(2);
dateIso = m.group(3);
} else if ((m = FN_NAME_DATE_COMPACT.matcher(filename)).matches()) {
lastName = m.group(1);
firstName = m.group(2);
String compact = m.group(3);
dateIso = compact.substring(0, 4) + "-" + compact.substring(4, 6) + "-" + compact.substring(6, 8);
int dot = filename.lastIndexOf('.');
if (dot < 0) return stripExtension(filename);
String stem = filename.substring(0, dot);
String[] parts = stem.split("_", -1);
// Minimum: date + at least one lastName segment + firstName
if (parts.length < 3) return stripExtension(filename);
String dateIso;
String[] nameParts;
String dateFromFirst = tryParseDate(parts[0]);
if (dateFromFirst != null) {
dateIso = dateFromFirst;
nameParts = Arrays.copyOfRange(parts, 1, parts.length);
} else {
return stripExtension(filename);
String dateFromLast = tryParseDate(parts[parts.length - 1]);
if (dateFromLast == null) return stripExtension(filename);
dateIso = dateFromLast;
nameParts = Arrays.copyOfRange(parts, 0, parts.length - 1);
}
// Format date as DD.MM.YYYY for the title
if (nameParts.length < 2) return stripExtension(filename);
for (String p : nameParts) {
if (!p.matches("\\p{L}+")) return stripExtension(filename);
}
String firstName = nameParts[nameParts.length - 1];
String lastName = String.join(" ", Arrays.copyOfRange(nameParts, 0, nameParts.length - 1));
LocalDate date = LocalDate.parse(dateIso);
String dateDisplay = String.format("%02d.%02d.%d", date.getDayOfMonth(), date.getMonthValue(), date.getYear());
return firstName + " " + lastName + " (" + dateDisplay + ")";
}
/**
 * Parses a filename date token and normalises it to ISO form.
 *
 * <p>Accepted inputs are {@code YYYY-MM-DD} and {@code YYYYMMDD}. The token must
 * denote a real calendar date: the day is checked against the actual length of
 * the month (leap years included), so tokens such as {@code 20230231} are rejected.
 *
 * @param s the candidate token; may be {@code null}
 * @return the date as {@code YYYY-MM-DD}, or {@code null} if {@code s} is
 *         {@code null}, has a different shape, or is not a valid calendar date
 */
private static String tryParseDate(String s) {
    if (s == null) return null;

    boolean isoShape = s.matches("\\d{4}-\\d{2}-\\d{2}");
    if (!isoShape && !s.matches("\\d{8}")) return null;

    // Work on the 8-digit form so both shapes share one set of offsets.
    String digits = isoShape ? s.replace("-", "") : s;
    int year = Integer.parseInt(digits.substring(0, 4));
    int month = Integer.parseInt(digits.substring(4, 6));
    int day = Integer.parseInt(digits.substring(6, 8));

    if (month < 1 || month > 12) return null;
    // A fixed "day <= 31" bound would let through dates like 31 February.
    if (day < 1 || day > java.time.YearMonth.of(year, month).lengthOfMonth()) return null;

    if (isoShape) return s;
    return digits.substring(0, 4) + "-" + digits.substring(4, 6) + "-" + digits.substring(6, 8);
}
private static String sha256Hex(byte[] bytes) {
try {
MessageDigest digest = MessageDigest.getInstance("SHA-256");

View File

@@ -545,6 +545,18 @@ class DocumentServiceTest {
.isEqualTo("Hans Mueller (12.03.1965)");
}
@Test
void titleFromFilename_compound_lastName_dateFirst() {
    // Date-first layout with a two-segment last name ("de" + "Gruyter").
    String title = DocumentService.titleFromFilename("18881025_de_Gruyter_Walter.pdf");

    assertThat(title).isEqualTo("Walter de Gruyter (25.10.1888)");
}
@Test
void titleFromFilename_compound_lastName_dateLast() {
    // Date-last layout with a two-segment last name ("de" + "Gruyter").
    String title = DocumentService.titleFromFilename("de_Gruyter_Walter_18881025.pdf");

    assertThat(title).isEqualTo("Walter de Gruyter (25.10.1888)");
}
@Test
void titleFromFilename_fallsBackToStripExtension() {
assertThat(DocumentService.titleFromFilename("scan_001.pdf")).isEqualTo("scan_001");