feat(filename): support compound last names like de Gruyter
Some checks failed
CI / Unit & Component Tests (push) Has been cancelled
CI / Backend Unit Tests (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
CI / Unit & Component Tests (pull_request) Successful in 2m17s
CI / Backend Unit Tests (pull_request) Successful in 2m13s
CI / E2E Tests (pull_request) Failing after 25m0s
Some checks failed
CI / Unit & Component Tests (push) Has been cancelled
CI / Backend Unit Tests (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
CI / Unit & Component Tests (pull_request) Successful in 2m17s
CI / Backend Unit Tests (pull_request) Successful in 2m13s
CI / E2E Tests (pull_request) Failing after 25m0s
Replace the four fixed regexes with a split-based algorithm:
- first segment = date → last segment = firstName, rest = lastName parts
- last segment = date → second-to-last segment = firstName, rest = lastName parts

18881025_de_Gruyter_Walter.pdf now correctly yields "Walter de Gruyter".
Simple two-segment names behave identically to before.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -358,55 +358,70 @@ public class DocumentService {
|
||||
|
||||
/**
|
||||
* Derives a human-readable title from a structured filename.
|
||||
* Supports patterns (full match only):
|
||||
* YYYY-MM-DD_Lastname_Firstname.ext
|
||||
* YYYYMMDD_Lastname_Firstname.ext
|
||||
* Lastname_Firstname_YYYY-MM-DD.ext
|
||||
* Lastname_Firstname_YYYYMMDD.ext
|
||||
* Falls back to stripExtension for unrecognised names.
|
||||
*
|
||||
* Algorithm: split stem on "_", identify the date token (first or last segment),
|
||||
* treat the last remaining segment as firstName, all preceding segments as lastName parts.
|
||||
* Compound last names (e.g. "de_Gruyter") are handled naturally.
|
||||
* Falls back to stripExtension for unrecognised filenames.
|
||||
*
|
||||
* Examples:
|
||||
* 18881025_de_Gruyter_Walter.pdf → "Walter de Gruyter (25.10.1888)"
|
||||
* 1965-03-12_Mueller_Hans.pdf → "Hans Mueller (12.03.1965)"
|
||||
* Mueller_Hans_19650312.pdf → "Hans Mueller (12.03.1965)"
|
||||
*/
|
||||
private static final java.util.regex.Pattern FN_DATE_ISO_NAME =
|
||||
java.util.regex.Pattern.compile("^(\\d{4}-\\d{2}-\\d{2})_(\\p{L}+)_(\\p{L}+)\\.[^.]+$");
|
||||
private static final java.util.regex.Pattern FN_DATE_COMPACT_NAME =
|
||||
java.util.regex.Pattern.compile("^(\\d{8})_(\\p{L}+)_(\\p{L}+)\\.[^.]+$");
|
||||
private static final java.util.regex.Pattern FN_NAME_DATE_ISO =
|
||||
java.util.regex.Pattern.compile("^(\\p{L}+)_(\\p{L}+)_(\\d{4}-\\d{2}-\\d{2})\\.[^.]+$");
|
||||
private static final java.util.regex.Pattern FN_NAME_DATE_COMPACT =
|
||||
java.util.regex.Pattern.compile("^(\\p{L}+)_(\\p{L}+)_(\\d{8})\\.[^.]+$");
|
||||
|
||||
static String titleFromFilename(String filename) {
|
||||
if (filename == null) return null;
|
||||
java.util.regex.Matcher m;
|
||||
String dateIso, lastName, firstName;
|
||||
|
||||
if ((m = FN_DATE_ISO_NAME.matcher(filename)).matches()) {
|
||||
dateIso = m.group(1);
|
||||
lastName = m.group(2);
|
||||
firstName = m.group(3);
|
||||
} else if ((m = FN_DATE_COMPACT_NAME.matcher(filename)).matches()) {
|
||||
String compact = m.group(1);
|
||||
dateIso = compact.substring(0, 4) + "-" + compact.substring(4, 6) + "-" + compact.substring(6, 8);
|
||||
lastName = m.group(2);
|
||||
firstName = m.group(3);
|
||||
} else if ((m = FN_NAME_DATE_ISO.matcher(filename)).matches()) {
|
||||
lastName = m.group(1);
|
||||
firstName = m.group(2);
|
||||
dateIso = m.group(3);
|
||||
} else if ((m = FN_NAME_DATE_COMPACT.matcher(filename)).matches()) {
|
||||
lastName = m.group(1);
|
||||
firstName = m.group(2);
|
||||
String compact = m.group(3);
|
||||
dateIso = compact.substring(0, 4) + "-" + compact.substring(4, 6) + "-" + compact.substring(6, 8);
|
||||
int dot = filename.lastIndexOf('.');
|
||||
if (dot < 0) return stripExtension(filename);
|
||||
String stem = filename.substring(0, dot);
|
||||
|
||||
String[] parts = stem.split("_", -1);
|
||||
// Minimum: date + at least one lastName segment + firstName
|
||||
if (parts.length < 3) return stripExtension(filename);
|
||||
|
||||
String dateIso;
|
||||
String[] nameParts;
|
||||
|
||||
String dateFromFirst = tryParseDate(parts[0]);
|
||||
if (dateFromFirst != null) {
|
||||
dateIso = dateFromFirst;
|
||||
nameParts = Arrays.copyOfRange(parts, 1, parts.length);
|
||||
} else {
|
||||
return stripExtension(filename);
|
||||
String dateFromLast = tryParseDate(parts[parts.length - 1]);
|
||||
if (dateFromLast == null) return stripExtension(filename);
|
||||
dateIso = dateFromLast;
|
||||
nameParts = Arrays.copyOfRange(parts, 0, parts.length - 1);
|
||||
}
|
||||
|
||||
// Validate the name segments first, then format the date as DD.MM.YYYY for the title
|
||||
if (nameParts.length < 2) return stripExtension(filename);
|
||||
|
||||
for (String p : nameParts) {
|
||||
if (!p.matches("\\p{L}+")) return stripExtension(filename);
|
||||
}
|
||||
|
||||
String firstName = nameParts[nameParts.length - 1];
|
||||
String lastName = String.join(" ", Arrays.copyOfRange(nameParts, 0, nameParts.length - 1));
|
||||
|
||||
LocalDate date = LocalDate.parse(dateIso);
|
||||
String dateDisplay = String.format("%02d.%02d.%d", date.getDayOfMonth(), date.getMonthValue(), date.getYear());
|
||||
return firstName + " " + lastName + " (" + dateDisplay + ")";
|
||||
}
|
||||
|
||||
private static String tryParseDate(String s) {
|
||||
if (s.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
||||
int m = Integer.parseInt(s.substring(5, 7));
|
||||
int d = Integer.parseInt(s.substring(8, 10));
|
||||
if (m >= 1 && m <= 12 && d >= 1 && d <= 31) return s;
|
||||
} else if (s.matches("\\d{8}")) {
|
||||
int m = Integer.parseInt(s.substring(4, 6));
|
||||
int d = Integer.parseInt(s.substring(6, 8));
|
||||
if (m >= 1 && m <= 12 && d >= 1 && d <= 31)
|
||||
return s.substring(0, 4) + "-" + s.substring(4, 6) + "-" + s.substring(6, 8);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static String sha256Hex(byte[] bytes) {
|
||||
try {
|
||||
MessageDigest digest = MessageDigest.getInstance("SHA-256");
|
||||
|
||||
@@ -545,6 +545,18 @@ class DocumentServiceTest {
|
||||
.isEqualTo("Hans Mueller (12.03.1965)");
|
||||
}
|
||||
|
||||
@Test
|
||||
void titleFromFilename_compound_lastName_dateFirst() {
|
||||
assertThat(DocumentService.titleFromFilename("18881025_de_Gruyter_Walter.pdf"))
|
||||
.isEqualTo("Walter de Gruyter (25.10.1888)");
|
||||
}
|
||||
|
||||
@Test
|
||||
void titleFromFilename_compound_lastName_dateLast() {
|
||||
assertThat(DocumentService.titleFromFilename("de_Gruyter_Walter_18881025.pdf"))
|
||||
.isEqualTo("Walter de Gruyter (25.10.1888)");
|
||||
}
|
||||
|
||||
@Test
|
||||
void titleFromFilename_fallsBackToStripExtension() {
|
||||
assertThat(DocumentService.titleFromFilename("scan_001.pdf")).isEqualTo("scan_001");
|
||||
|
||||
@@ -2,8 +2,8 @@ import { describe, it, expect } from 'vitest';
|
||||
import { parseFilename, stripExtension } from './filename';
|
||||
|
||||
describe('parseFilename', () => {
|
||||
describe('YYYY-MM-DD_Lastname_Firstname pattern', () => {
|
||||
it('extracts date and name', () => {
|
||||
describe('date-first patterns', () => {
|
||||
it('YYYY-MM-DD_Lastname_Firstname', () => {
|
||||
expect(parseFilename('1965-03-12_Mueller_Hans.pdf')).toEqual({
|
||||
dateIso: '1965-03-12',
|
||||
personName: 'Hans Mueller',
|
||||
@@ -11,40 +11,52 @@ describe('parseFilename', () => {
|
||||
});
|
||||
});
|
||||
|
||||
it('YYYYMMDD_Lastname_Firstname', () => {
|
||||
expect(parseFilename('19650312_Mueller_Hans.pdf')).toEqual({
|
||||
dateIso: '1965-03-12',
|
||||
personName: 'Hans Mueller',
|
||||
suggestedTitle: 'Hans Mueller (12.03.1965)'
|
||||
});
|
||||
});
|
||||
|
||||
it('YYYYMMDD_compound_lastname_Firstname', () => {
|
||||
expect(parseFilename('18881025_de_Gruyter_Walter.pdf')).toEqual({
|
||||
dateIso: '1888-10-25',
|
||||
personName: 'Walter de Gruyter',
|
||||
suggestedTitle: 'Walter de Gruyter (25.10.1888)'
|
||||
});
|
||||
});
|
||||
|
||||
it('handles umlauts in names', () => {
|
||||
const result = parseFilename('2024-01-15_Müller_Jürgen.pdf');
|
||||
expect(result.personName).toBe('Jürgen Müller');
|
||||
});
|
||||
});
|
||||
|
||||
describe('YYYYMMDD_Lastname_Firstname pattern', () => {
|
||||
it('extracts date and name', () => {
|
||||
expect(parseFilename('19650312_Mueller_Hans.pdf')).toEqual({
|
||||
dateIso: '1965-03-12',
|
||||
personName: 'Hans Mueller',
|
||||
suggestedTitle: 'Hans Mueller (12.03.1965)'
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('Lastname_Firstname_YYYY-MM-DD pattern', () => {
|
||||
it('extracts date and name', () => {
|
||||
describe('date-last patterns', () => {
|
||||
it('Lastname_Firstname_YYYY-MM-DD', () => {
|
||||
expect(parseFilename('Mueller_Hans_1965-03-12.pdf')).toEqual({
|
||||
dateIso: '1965-03-12',
|
||||
personName: 'Hans Mueller',
|
||||
suggestedTitle: 'Hans Mueller (12.03.1965)'
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('Lastname_Firstname_YYYYMMDD pattern', () => {
|
||||
it('extracts date and name', () => {
|
||||
it('Lastname_Firstname_YYYYMMDD', () => {
|
||||
expect(parseFilename('Mueller_Hans_19650312.pdf')).toEqual({
|
||||
dateIso: '1965-03-12',
|
||||
personName: 'Hans Mueller',
|
||||
suggestedTitle: 'Hans Mueller (12.03.1965)'
|
||||
});
|
||||
});
|
||||
|
||||
it('compound_lastname_Firstname_YYYYMMDD', () => {
|
||||
expect(parseFilename('de_Gruyter_Walter_18881025.pdf')).toEqual({
|
||||
dateIso: '1888-10-25',
|
||||
personName: 'Walter de Gruyter',
|
||||
suggestedTitle: 'Walter de Gruyter (25.10.1888)'
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('non-matching filenames', () => {
|
||||
@@ -52,7 +64,7 @@ describe('parseFilename', () => {
|
||||
expect(parseFilename('1965-03-12.pdf')).toEqual({});
|
||||
});
|
||||
|
||||
it('returns empty for name-only filename', () => {
|
||||
it('returns empty for two segments with no date', () => {
|
||||
expect(parseFilename('Mueller_Hans.pdf')).toEqual({});
|
||||
});
|
||||
|
||||
@@ -60,13 +72,17 @@ describe('parseFilename', () => {
|
||||
expect(parseFilename('scan_001.pdf')).toEqual({});
|
||||
});
|
||||
|
||||
it('returns empty for three name segments without date', () => {
|
||||
it('returns empty for three name segments with no date', () => {
|
||||
expect(parseFilename('Mueller_Hans_Juergen.pdf')).toEqual({});
|
||||
});
|
||||
|
||||
it('returns empty for filename without extension', () => {
|
||||
expect(parseFilename('1965-03-12_Mueller_Hans')).toEqual({});
|
||||
});
|
||||
|
||||
it('rejects implausible date (month 13)', () => {
|
||||
expect(parseFilename('19651345_Mueller_Hans.pdf')).toEqual({});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -9,46 +9,73 @@ export interface FilenameParseResult {
|
||||
suggestedTitle?: string;
|
||||
}
|
||||
|
||||
// Full-match patterns only. Name segments use Unicode letters (\p{L}) to cover umlauts etc.
|
||||
// Order: date_lastname_firstname
|
||||
const P_DATE_ISO_NAME = /^(\d{4}-\d{2}-\d{2})_(\p{L}+)_(\p{L}+)\.[^.]+$/u;
|
||||
const P_DATE_COMPACT_NAME = /^(\d{8})_(\p{L}+)_(\p{L}+)\.[^.]+$/u;
|
||||
// Order: lastname_firstname_date
|
||||
const P_NAME_DATE_ISO = /^(\p{L}+)_(\p{L}+)_(\d{4}-\d{2}-\d{2})\.[^.]+$/u;
|
||||
const P_NAME_DATE_COMPACT = /^(\p{L}+)_(\p{L}+)_(\d{8})\.[^.]+$/u;
|
||||
|
||||
function compactToIso(compact: string): string {
|
||||
return `${compact.slice(0, 4)}-${compact.slice(4, 6)}-${compact.slice(6, 8)}`;
|
||||
/**
 * Normalises a filename date token to ISO form (YYYY-MM-DD).
 *
 * Accepts `YYYY-MM-DD` and the compact `YYYYMMDD`. The token must name a day
 * that actually exists: the day is checked against the real length of the
 * month (leap years included), so tokens such as `19650231` are rejected
 * rather than producing an invalid ISO date.
 *
 * @param s - a single filename segment, possibly not a date at all
 * @returns the date as `YYYY-MM-DD`, or `undefined` if `s` has a different
 *   shape or does not denote an existing calendar day
 */
function tryParseDate(s: string): string | undefined {
  let iso: string;
  if (/^\d{4}-\d{2}-\d{2}$/.test(s)) {
    iso = s;
  } else if (/^\d{8}$/.test(s)) {
    iso = `${s.slice(0, 4)}-${s.slice(4, 6)}-${s.slice(6, 8)}`;
  } else {
    return undefined;
  }

  const year = Number.parseInt(iso.slice(0, 4), 10);
  const month = Number.parseInt(iso.slice(5, 7), 10);
  const day = Number.parseInt(iso.slice(8, 10), 10);

  // A lookup table is used instead of Date, which remaps two-digit years
  // and silently rolls over out-of-range days.
  const isLeapYear = (year % 4 === 0 && year % 100 !== 0) || year % 400 === 0;
  const daysInMonth = [31, isLeapYear ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];

  // An out-of-range month yields undefined here and is rejected below.
  const maxDay = daysInMonth[month - 1];
  if (maxDay === undefined || day < 1 || day > maxDay) return undefined;

  return iso;
}
|
||||
|
||||
const NAME_PART = /^\p{L}+$/u;
|
||||
|
||||
/**
|
||||
* Parses a structured filename and extracts a date and person name.
|
||||
*
|
||||
* Supported conventions (date-first or date-last, compound last names supported):
|
||||
* YYYY-MM-DD_Lastname_Firstname.ext
|
||||
* YYYYMMDD_Lastname_Firstname.ext
|
||||
* YYYYMMDD_de_Gruyter_Walter.ext ← compound last name: lastName="de Gruyter"
|
||||
* Lastname_Firstname_YYYY-MM-DD.ext
|
||||
* Lastname_Firstname_YYYYMMDD.ext
|
||||
* de_Gruyter_Walter_YYYYMMDD.ext ← compound last name: lastName="de Gruyter"
|
||||
*
|
||||
* Algorithm: split on "_", identify the date token (first or last segment),
|
||||
* treat the last remaining segment as firstName, all preceding segments as lastName parts.
|
||||
* Returns {} for anything that doesn't match cleanly.
|
||||
*/
|
||||
export function parseFilename(filename: string): FilenameParseResult {
|
||||
const dot = filename.lastIndexOf('.');
|
||||
if (dot < 0) return {}; // no extension — not a real file
|
||||
const stem = filename.slice(0, dot);
|
||||
const parts = stem.split('_');
|
||||
|
||||
// Minimum: date + at least one lastName segment + firstName = 3 parts
|
||||
if (parts.length < 3) return {};
|
||||
|
||||
let dateIso: string;
|
||||
let lastName: string;
|
||||
let firstName: string;
|
||||
let nameParts: string[];
|
||||
|
||||
let m: RegExpMatchArray | null;
|
||||
|
||||
if ((m = P_DATE_ISO_NAME.exec(filename))) {
|
||||
[, dateIso, lastName, firstName] = m;
|
||||
} else if ((m = P_DATE_COMPACT_NAME.exec(filename))) {
|
||||
dateIso = compactToIso(m[1]);
|
||||
lastName = m[2];
|
||||
firstName = m[3];
|
||||
} else if ((m = P_NAME_DATE_ISO.exec(filename))) {
|
||||
lastName = m[1];
|
||||
firstName = m[2];
|
||||
dateIso = m[3];
|
||||
} else if ((m = P_NAME_DATE_COMPACT.exec(filename))) {
|
||||
lastName = m[1];
|
||||
firstName = m[2];
|
||||
dateIso = compactToIso(m[3]);
|
||||
const dateFromFirst = tryParseDate(parts[0]);
|
||||
if (dateFromFirst) {
|
||||
dateIso = dateFromFirst;
|
||||
nameParts = parts.slice(1);
|
||||
} else {
|
||||
return {};
|
||||
const dateFromLast = tryParseDate(parts[parts.length - 1]);
|
||||
if (!dateFromLast) return {};
|
||||
dateIso = dateFromLast;
|
||||
nameParts = parts.slice(0, -1);
|
||||
}
|
||||
|
||||
// Need at least lastName + firstName after removing the date
|
||||
if (nameParts.length < 2) return {};
|
||||
|
||||
// All name segments must be pure letters (covers umlauts via \p{L})
|
||||
if (!nameParts.every((p) => NAME_PART.test(p))) return {};
|
||||
|
||||
const firstName = nameParts[nameParts.length - 1];
|
||||
const lastName = nameParts.slice(0, -1).join(' ');
|
||||
const personName = `${firstName} ${lastName}`;
|
||||
const suggestedTitle = `${personName} (${isoToGerman(dateIso!)})`;
|
||||
return { dateIso: dateIso!, personName, suggestedTitle };
|
||||
const suggestedTitle = `${personName} (${isoToGerman(dateIso)})`;
|
||||
|
||||
return { dateIso, personName, suggestedTitle };
|
||||
}
|
||||
|
||||
export function stripExtension(filename: string): string {
|
||||
|
||||
Reference in New Issue
Block a user