From f0940524e7d0ac1fd0d5fb522c3ab9a60b29f163 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 26 Mar 2026 15:33:21 +0100 Subject: [PATCH] feat(filename): support compound last names like de Gruyter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the four fixed regexes with a split-based algorithm: - first segment = date → last segment = firstName, rest = lastName parts - last segment = date → second-to-last = firstName, rest = lastName parts 18881025_de_Gruyter_Walter.pdf now correctly yields "Walter de Gruyter". Simple two-segment names behave identically to before. Co-Authored-By: Claude Sonnet 4.6 --- .../service/DocumentService.java | 89 +++++++++++-------- .../service/DocumentServiceTest.java | 12 +++ frontend/src/lib/utils/filename.spec.ts | 54 +++++++---- frontend/src/lib/utils/filename.ts | 89 ++++++++++++------- 4 files changed, 157 insertions(+), 87 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/DocumentService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/DocumentService.java index 7d08444a..dab03a48 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/DocumentService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/DocumentService.java @@ -358,55 +358,70 @@ public class DocumentService { /** * Derives a human-readable title from a structured filename. - * Supports patterns (full match only): - * YYYY-MM-DD_Lastname_Firstname.ext - * YYYYMMDD_Lastname_Firstname.ext - * Lastname_Firstname_YYYY-MM-DD.ext - * Lastname_Firstname_YYYYMMDD.ext - * Falls back to stripExtension for unrecognised names. + * + * Algorithm: split stem on "_", identify the date token (first or last segment), + * treat the last remaining segment as firstName, rest as lastName parts. + * Compound last names (e.g. "de_Gruyter") are handled naturally. + * Falls back to stripExtension for unrecognised filenames. 
+ * + * Examples: + * 18881025_de_Gruyter_Walter.pdf → "Walter de Gruyter (25.10.1888)" + * 1965-03-12_Mueller_Hans.pdf → "Hans Mueller (12.03.1965)" + * Mueller_Hans_19650312.pdf → "Hans Mueller (12.03.1965)" */ - private static final java.util.regex.Pattern FN_DATE_ISO_NAME = - java.util.regex.Pattern.compile("^(\\d{4}-\\d{2}-\\d{2})_(\\p{L}+)_(\\p{L}+)\\.[^.]+$"); - private static final java.util.regex.Pattern FN_DATE_COMPACT_NAME = - java.util.regex.Pattern.compile("^(\\d{8})_(\\p{L}+)_(\\p{L}+)\\.[^.]+$"); - private static final java.util.regex.Pattern FN_NAME_DATE_ISO = - java.util.regex.Pattern.compile("^(\\p{L}+)_(\\p{L}+)_(\\d{4}-\\d{2}-\\d{2})\\.[^.]+$"); - private static final java.util.regex.Pattern FN_NAME_DATE_COMPACT = - java.util.regex.Pattern.compile("^(\\p{L}+)_(\\p{L}+)_(\\d{8})\\.[^.]+$"); - static String titleFromFilename(String filename) { if (filename == null) return null; - java.util.regex.Matcher m; - String dateIso, lastName, firstName; - if ((m = FN_DATE_ISO_NAME.matcher(filename)).matches()) { - dateIso = m.group(1); - lastName = m.group(2); - firstName = m.group(3); - } else if ((m = FN_DATE_COMPACT_NAME.matcher(filename)).matches()) { - String compact = m.group(1); - dateIso = compact.substring(0, 4) + "-" + compact.substring(4, 6) + "-" + compact.substring(6, 8); - lastName = m.group(2); - firstName = m.group(3); - } else if ((m = FN_NAME_DATE_ISO.matcher(filename)).matches()) { - lastName = m.group(1); - firstName = m.group(2); - dateIso = m.group(3); - } else if ((m = FN_NAME_DATE_COMPACT.matcher(filename)).matches()) { - lastName = m.group(1); - firstName = m.group(2); - String compact = m.group(3); - dateIso = compact.substring(0, 4) + "-" + compact.substring(4, 6) + "-" + compact.substring(6, 8); + int dot = filename.lastIndexOf('.'); + if (dot < 0) return stripExtension(filename); + String stem = filename.substring(0, dot); + + String[] parts = stem.split("_", -1); + // Minimum: date + at least one lastName segment + firstName + 
if (parts.length < 3) return stripExtension(filename); + + String dateIso; + String[] nameParts; + + String dateFromFirst = tryParseDate(parts[0]); + if (dateFromFirst != null) { + dateIso = dateFromFirst; + nameParts = Arrays.copyOfRange(parts, 1, parts.length); } else { - return stripExtension(filename); + String dateFromLast = tryParseDate(parts[parts.length - 1]); + if (dateFromLast == null) return stripExtension(filename); + dateIso = dateFromLast; + nameParts = Arrays.copyOfRange(parts, 0, parts.length - 1); } - // Format date as DD.MM.YYYY for the title + if (nameParts.length < 2) return stripExtension(filename); + + for (String p : nameParts) { + if (!p.matches("\\p{L}+")) return stripExtension(filename); + } + + String firstName = nameParts[nameParts.length - 1]; + String lastName = String.join(" ", Arrays.copyOfRange(nameParts, 0, nameParts.length - 1)); + LocalDate date = LocalDate.parse(dateIso); String dateDisplay = String.format("%02d.%02d.%d", date.getDayOfMonth(), date.getMonthValue(), date.getYear()); return firstName + " " + lastName + " (" + dateDisplay + ")"; } + private static String tryParseDate(String s) { + if (s.matches("\\d{4}-\\d{2}-\\d{2}")) { + int m = Integer.parseInt(s.substring(5, 7)); + int d = Integer.parseInt(s.substring(8, 10)); + if (m >= 1 && m <= 12 && d >= 1 && d <= 31) return s; + } else if (s.matches("\\d{8}")) { + int m = Integer.parseInt(s.substring(4, 6)); + int d = Integer.parseInt(s.substring(6, 8)); + if (m >= 1 && m <= 12 && d >= 1 && d <= 31) + return s.substring(0, 4) + "-" + s.substring(4, 6) + "-" + s.substring(6, 8); + } + return null; + } + private static String sha256Hex(byte[] bytes) { try { MessageDigest digest = MessageDigest.getInstance("SHA-256"); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/DocumentServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/DocumentServiceTest.java index ff87981d..5e94216a 100644 --- 
a/backend/src/test/java/org/raddatz/familienarchiv/service/DocumentServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/DocumentServiceTest.java @@ -545,6 +545,18 @@ class DocumentServiceTest { .isEqualTo("Hans Mueller (12.03.1965)"); } + @Test + void titleFromFilename_compound_lastName_dateFirst() { + assertThat(DocumentService.titleFromFilename("18881025_de_Gruyter_Walter.pdf")) + .isEqualTo("Walter de Gruyter (25.10.1888)"); + } + + @Test + void titleFromFilename_compound_lastName_dateLast() { + assertThat(DocumentService.titleFromFilename("de_Gruyter_Walter_18881025.pdf")) + .isEqualTo("Walter de Gruyter (25.10.1888)"); + } + @Test void titleFromFilename_fallsBackToStripExtension() { assertThat(DocumentService.titleFromFilename("scan_001.pdf")).isEqualTo("scan_001"); diff --git a/frontend/src/lib/utils/filename.spec.ts b/frontend/src/lib/utils/filename.spec.ts index 77c28d6b..297c8a1e 100644 --- a/frontend/src/lib/utils/filename.spec.ts +++ b/frontend/src/lib/utils/filename.spec.ts @@ -2,8 +2,8 @@ import { describe, it, expect } from 'vitest'; import { parseFilename, stripExtension } from './filename'; describe('parseFilename', () => { - describe('YYYY-MM-DD_Lastname_Firstname pattern', () => { - it('extracts date and name', () => { + describe('date-first patterns', () => { + it('YYYY-MM-DD_Lastname_Firstname', () => { expect(parseFilename('1965-03-12_Mueller_Hans.pdf')).toEqual({ dateIso: '1965-03-12', personName: 'Hans Mueller', @@ -11,40 +11,52 @@ describe('parseFilename', () => { }); }); + it('YYYYMMDD_Lastname_Firstname', () => { + expect(parseFilename('19650312_Mueller_Hans.pdf')).toEqual({ + dateIso: '1965-03-12', + personName: 'Hans Mueller', + suggestedTitle: 'Hans Mueller (12.03.1965)' + }); + }); + + it('YYYYMMDD_compound_lastname_Firstname', () => { + expect(parseFilename('18881025_de_Gruyter_Walter.pdf')).toEqual({ + dateIso: '1888-10-25', + personName: 'Walter de Gruyter', + suggestedTitle: 'Walter de Gruyter 
(25.10.1888)' + }); + }); + it('handles umlauts in names', () => { const result = parseFilename('2024-01-15_Müller_Jürgen.pdf'); expect(result.personName).toBe('Jürgen Müller'); }); }); - describe('YYYYMMDD_Lastname_Firstname pattern', () => { - it('extracts date and name', () => { - expect(parseFilename('19650312_Mueller_Hans.pdf')).toEqual({ - dateIso: '1965-03-12', - personName: 'Hans Mueller', - suggestedTitle: 'Hans Mueller (12.03.1965)' - }); - }); - }); - - describe('Lastname_Firstname_YYYY-MM-DD pattern', () => { - it('extracts date and name', () => { + describe('date-last patterns', () => { + it('Lastname_Firstname_YYYY-MM-DD', () => { expect(parseFilename('Mueller_Hans_1965-03-12.pdf')).toEqual({ dateIso: '1965-03-12', personName: 'Hans Mueller', suggestedTitle: 'Hans Mueller (12.03.1965)' }); }); - }); - describe('Lastname_Firstname_YYYYMMDD pattern', () => { - it('extracts date and name', () => { + it('Lastname_Firstname_YYYYMMDD', () => { expect(parseFilename('Mueller_Hans_19650312.pdf')).toEqual({ dateIso: '1965-03-12', personName: 'Hans Mueller', suggestedTitle: 'Hans Mueller (12.03.1965)' }); }); + + it('compound_lastname_Firstname_YYYYMMDD', () => { + expect(parseFilename('de_Gruyter_Walter_18881025.pdf')).toEqual({ + dateIso: '1888-10-25', + personName: 'Walter de Gruyter', + suggestedTitle: 'Walter de Gruyter (25.10.1888)' + }); + }); }); describe('non-matching filenames', () => { @@ -52,7 +64,7 @@ describe('parseFilename', () => { expect(parseFilename('1965-03-12.pdf')).toEqual({}); }); - it('returns empty for name-only filename', () => { + it('returns empty for two segments with no date', () => { expect(parseFilename('Mueller_Hans.pdf')).toEqual({}); }); @@ -60,13 +72,17 @@ describe('parseFilename', () => { expect(parseFilename('scan_001.pdf')).toEqual({}); }); - it('returns empty for three name segments without date', () => { + it('returns empty for three name segments with no date', () => { 
expect(parseFilename('Mueller_Hans_Juergen.pdf')).toEqual({}); }); it('returns empty for filename without extension', () => { expect(parseFilename('1965-03-12_Mueller_Hans')).toEqual({}); }); + + it('rejects implausible date (month 13)', () => { + expect(parseFilename('19651345_Mueller_Hans.pdf')).toEqual({}); + }); }); }); diff --git a/frontend/src/lib/utils/filename.ts b/frontend/src/lib/utils/filename.ts index 7df5213c..101fdd1c 100644 --- a/frontend/src/lib/utils/filename.ts +++ b/frontend/src/lib/utils/filename.ts @@ -9,46 +9,73 @@ export interface FilenameParseResult { suggestedTitle?: string; } -// Full-match patterns only. Name segments use Unicode letters (\p{L}) to cover umlauts etc. -// Order: date_lastname_firstname -const P_DATE_ISO_NAME = /^(\d{4}-\d{2}-\d{2})_(\p{L}+)_(\p{L}+)\.[^.]+$/u; -const P_DATE_COMPACT_NAME = /^(\d{8})_(\p{L}+)_(\p{L}+)\.[^.]+$/u; -// Order: lastname_firstname_date -const P_NAME_DATE_ISO = /^(\p{L}+)_(\p{L}+)_(\d{4}-\d{2}-\d{2})\.[^.]+$/u; -const P_NAME_DATE_COMPACT = /^(\p{L}+)_(\p{L}+)_(\d{8})\.[^.]+$/u; - -function compactToIso(compact: string): string { - return `${compact.slice(0, 4)}-${compact.slice(4, 6)}-${compact.slice(6, 8)}`; +// A date token is either YYYY-MM-DD or YYYYMMDD with a plausible month/day range. +function tryParseDate(s: string): string | undefined { + if (/^\d{4}-\d{2}-\d{2}$/.test(s)) { + const m = parseInt(s.slice(5, 7)); + const d = parseInt(s.slice(8, 10)); + if (m >= 1 && m <= 12 && d >= 1 && d <= 31) return s; + } else if (/^\d{8}$/.test(s)) { + const m = parseInt(s.slice(4, 6)); + const d = parseInt(s.slice(6, 8)); + if (m >= 1 && m <= 12 && d >= 1 && d <= 31) + return `${s.slice(0, 4)}-${s.slice(4, 6)}-${s.slice(6, 8)}`; + } + return undefined; } +const NAME_PART = /^\p{L}+$/u; + +/** + * Parses a structured filename and extracts a date and person name. 
+ * + * Supported conventions (date-first or date-last, compound last names supported): + * YYYY-MM-DD_Lastname_Firstname.ext + * YYYYMMDD_Lastname_Firstname.ext + * YYYYMMDD_de_Gruyter_Walter.ext ← compound last name: lastName="de Gruyter" + * Lastname_Firstname_YYYY-MM-DD.ext + * Lastname_Firstname_YYYYMMDD.ext + * de_Gruyter_Walter_YYYYMMDD.ext ← compound last name: lastName="de Gruyter" + * + * Algorithm: split the stem (filename without extension) on "_", identify the date token (first or last segment), + * treat the last remaining segment as firstName, rest as lastName parts. + * Returns {} for anything that doesn't match cleanly. + */ export function parseFilename(filename: string): FilenameParseResult { + const dot = filename.lastIndexOf('.'); + if (dot < 0) return {}; // no extension — not a real file + const stem = filename.slice(0, dot); + const parts = stem.split('_'); + + // Minimum: date + at least one lastName segment + firstName = 3 parts + if (parts.length < 3) return {}; + let dateIso: string; - let lastName: string; - let firstName: string; + let nameParts: string[]; - let m: RegExpMatchArray | null; - - if ((m = P_DATE_ISO_NAME.exec(filename))) { - [, dateIso, lastName, firstName] = m; - } else if ((m = P_DATE_COMPACT_NAME.exec(filename))) { - dateIso = compactToIso(m[1]); - lastName = m[2]; - firstName = m[3]; - } else if ((m = P_NAME_DATE_ISO.exec(filename))) { - lastName = m[1]; - firstName = m[2]; - dateIso = m[3]; - } else if ((m = P_NAME_DATE_COMPACT.exec(filename))) { - lastName = m[1]; - firstName = m[2]; - dateIso = compactToIso(m[3]); + const dateFromFirst = tryParseDate(parts[0]); + if (dateFromFirst) { + dateIso = dateFromFirst; + nameParts = parts.slice(1); } else { - return {}; + const dateFromLast = tryParseDate(parts[parts.length - 1]); + if (!dateFromLast) return {}; + dateIso = dateFromLast; + nameParts = parts.slice(0, -1); } + // Need at least lastName + firstName after removing the date + if (nameParts.length < 2) return {}; + + // All name segments must be 
pure letters (covers umlauts via \p{L}) + if (!nameParts.every((p) => NAME_PART.test(p))) return {}; + + const firstName = nameParts[nameParts.length - 1]; + const lastName = nameParts.slice(0, -1).join(' '); const personName = `${firstName} ${lastName}`; - const suggestedTitle = `${personName} (${isoToGerman(dateIso!)})`; - return { dateIso: dateIso!, personName, suggestedTitle }; + const suggestedTitle = `${personName} (${isoToGerman(dateIso)})`; + + return { dateIso, personName, suggestedTitle }; } export function stripExtension(filename: string): string {