feat(filename): support compound last names like de Gruyter
Some checks failed
CI / Unit & Component Tests (push) Has been cancelled
CI / Backend Unit Tests (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
CI / Unit & Component Tests (pull_request) Successful in 2m17s
CI / Backend Unit Tests (pull_request) Successful in 2m13s
CI / E2E Tests (pull_request) Failing after 25m0s

Replace the four fixed regexes with a split-based algorithm:
- first segment = date → last segment = firstName, rest = lastName parts
- last segment = date → second-to-last = firstName, rest = lastName parts

18881025_de_Gruyter_Walter.pdf now correctly yields "Walter de Gruyter".
Simple two-segment names behave identically to before.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-03-26 15:33:21 +01:00
parent a302f96560
commit f0940524e7
4 changed files with 157 additions and 87 deletions

View File

@@ -9,46 +9,73 @@ export interface FilenameParseResult {
suggestedTitle?: string;
}
// Full-match patterns only. Name segments use Unicode letters (\p{L}) to cover umlauts etc.
// Order: date_lastname_firstname
const P_DATE_ISO_NAME = /^(\d{4}-\d{2}-\d{2})_(\p{L}+)_(\p{L}+)\.[^.]+$/u;
const P_DATE_COMPACT_NAME = /^(\d{8})_(\p{L}+)_(\p{L}+)\.[^.]+$/u;
// Order: lastname_firstname_date
const P_NAME_DATE_ISO = /^(\p{L}+)_(\p{L}+)_(\d{4}-\d{2}-\d{2})\.[^.]+$/u;
const P_NAME_DATE_COMPACT = /^(\p{L}+)_(\p{L}+)_(\d{8})\.[^.]+$/u;
function compactToIso(compact: string): string {
return `${compact.slice(0, 4)}-${compact.slice(4, 6)}-${compact.slice(6, 8)}`;
// A date token is either YYYY-MM-DD or YYYYMMDD with a plausible month/day range.
function tryParseDate(s: string): string | undefined {
if (/^\d{4}-\d{2}-\d{2}$/.test(s)) {
const m = parseInt(s.slice(5, 7));
const d = parseInt(s.slice(8, 10));
if (m >= 1 && m <= 12 && d >= 1 && d <= 31) return s;
} else if (/^\d{8}$/.test(s)) {
const m = parseInt(s.slice(4, 6));
const d = parseInt(s.slice(6, 8));
if (m >= 1 && m <= 12 && d >= 1 && d <= 31)
return `${s.slice(0, 4)}-${s.slice(4, 6)}-${s.slice(6, 8)}`;
}
return undefined;
}
const NAME_PART = /^\p{L}+$/u;
/**
* Parses a structured filename and extracts a date and person name.
*
* Supported conventions (date-first or date-last, compound last names supported):
* YYYY-MM-DD_Lastname_Firstname.ext
* YYYYMMDD_Lastname_Firstname.ext
* YYYYMMDD_de_Gruyter_Walter.ext ← compound last name: lastName="de Gruyter"
* Lastname_Firstname_YYYY-MM-DD.ext
* Lastname_Firstname_YYYYMMDD.ext
* de_Gruyter_Walter_YYYYMMDD.ext ← compound last name: lastName="de Gruyter"
*
* Algorithm: split on "_", identify the date token (first or last segment),
* treat the outermost remaining segment as firstName, rest as lastName parts.
* Returns {} for anything that doesn't match cleanly.
*/
export function parseFilename(filename: string): FilenameParseResult {
const dot = filename.lastIndexOf('.');
if (dot < 0) return {}; // no extension — not a real file
const stem = filename.slice(0, dot);
const parts = stem.split('_');
// Minimum: date + at least one lastName segment + firstName = 3 parts
if (parts.length < 3) return {};
let dateIso: string;
let lastName: string;
let firstName: string;
let nameParts: string[];
let m: RegExpMatchArray | null;
if ((m = P_DATE_ISO_NAME.exec(filename))) {
[, dateIso, lastName, firstName] = m;
} else if ((m = P_DATE_COMPACT_NAME.exec(filename))) {
dateIso = compactToIso(m[1]);
lastName = m[2];
firstName = m[3];
} else if ((m = P_NAME_DATE_ISO.exec(filename))) {
lastName = m[1];
firstName = m[2];
dateIso = m[3];
} else if ((m = P_NAME_DATE_COMPACT.exec(filename))) {
lastName = m[1];
firstName = m[2];
dateIso = compactToIso(m[3]);
const dateFromFirst = tryParseDate(parts[0]);
if (dateFromFirst) {
dateIso = dateFromFirst;
nameParts = parts.slice(1);
} else {
return {};
const dateFromLast = tryParseDate(parts[parts.length - 1]);
if (!dateFromLast) return {};
dateIso = dateFromLast;
nameParts = parts.slice(0, -1);
}
// Need at least lastName + firstName after removing the date
if (nameParts.length < 2) return {};
// All name segments must be pure letters (covers umlauts via \p{L})
if (!nameParts.every((p) => NAME_PART.test(p))) return {};
const firstName = nameParts[nameParts.length - 1];
const lastName = nameParts.slice(0, -1).join(' ');
const personName = `${firstName} ${lastName}`;
const suggestedTitle = `${personName} (${isoToGerman(dateIso!)})`;
return { dateIso: dateIso!, personName, suggestedTitle };
const suggestedTitle = `${personName} (${isoToGerman(dateIso)})`;
return { dateIso, personName, suggestedTitle };
}
export function stripExtension(filename: string): string {