From 73640ef5b6f8cafc168438427e727cc849254e8d Mon Sep 17 00:00:00 2001 From: Marcel Date: Wed, 8 Apr 2026 13:15:18 +0200 Subject: [PATCH] feat(parser): implement stripTitle for known prefixes Two-pass title stripping with loop for stacked titles: - Dot-prefixes (Dr., Prof.) matched without trailing space - Word-prefixes (Tante, Frau, Schwester, etc.) matched at word boundary - Stacked titles like "Prof. Dr. Muller" handled correctly - Single token after title strip goes to lastName (not firstName) Add 5 "von" last names to KNOWN_LAST_NAMES for correct splitting of entries like "Freifrau von Massenbach". 15 new test cases + updated 3 existing tests for title behavior. Co-Authored-By: Claude Sonnet 4.6 --- .../service/PersonNameParser.java | 62 +++++++++- .../service/PersonNameParserTest.java | 111 +++++++++++++++++- 2 files changed, 166 insertions(+), 7 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/PersonNameParser.java b/backend/src/main/java/org/raddatz/familienarchiv/service/PersonNameParser.java index f06f4f61..ebd914dd 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/PersonNameParser.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/PersonNameParser.java @@ -16,6 +16,7 @@ public class PersonNameParser { // Known last names in this archive, longest first to avoid partial matches // (e.g. "de Gruyter" must be checked before any single-word name) static final List KNOWN_LAST_NAMES = List.of( + "von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa", "de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram"); private static final Pattern GEB_PATTERN = Pattern.compile(",?\\s*geb\\.?\\s+(.+)$"); @@ -149,8 +150,22 @@ public class PersonNameParser { NameParts parts = splitByKnownLastNameOrFallback(cleaned); + String firstName = parts.firstName(); + String lastName = parts.lastName(); + + // When a title was stripped and no first name could be extracted, the + // remaining text is the lastName. "Tante Molly" -> title=Tante, lastName=Molly. + if (title.title() != null) { + if ("?".equals(lastName) && !cleaned.contains(" ")) { + lastName = firstName; + firstName = null; + } else if (firstName.equals(lastName)) { + firstName = null; + } + } + return new SplitName( - title.title(), parts.firstName(), parts.lastName(), + title.title(), firstName, lastName, maiden.maidenName(), paren.annotation() ); } @@ -196,9 +211,50 @@ public class PersonNameParser { return new AnnotationResult(cleaned, rawAnnotation); } - /** Strips title prefixes. Pass-through until #212. */ + private static final List DOT_PREFIXES = List.of("Dr.", "Prof."); + + private static final List WORD_PREFIXES = List.of( + "Frau", "Herr", "Freifrau", "Freiherr", + "Tante", "Onkel", "Schwester", "Bruder", + "Cousine", "Cousin", "Freundin", "Freund", + "Mutter", "Vater", "Pastor", "Architekt"); + + /** Strips known title/relationship prefixes, looping for stacked titles. */ public static TitleResult stripTitle(String input) { - return new TitleResult(input, null); + String remaining = input; + StringBuilder titleBuilder = new StringBuilder(); + boolean found = true; + + while (found) { + found = false; + + for (String prefix : DOT_PREFIXES) { + if (remaining.toLowerCase().startsWith(prefix.toLowerCase())) { + titleBuilder.append(titleBuilder.isEmpty() ? "" : " ").append(prefix); + remaining = remaining.substring(prefix.length()).trim(); + found = true; + break; + } + } + if (found) continue; + + for (String prefix : WORD_PREFIXES) { + String lower = remaining.toLowerCase(); + if (lower.startsWith(prefix.toLowerCase() + " ") || lower.equals(prefix.toLowerCase())) { + titleBuilder.append(titleBuilder.isEmpty() ? "" : " ").append(prefix); + remaining = remaining.length() > prefix.length() + ? remaining.substring(prefix.length() + 1).trim() + : ""; + found = true; + break; + } + } + } + + if (titleBuilder.isEmpty()) { + return new TitleResult(input, null); + } + return new TitleResult(remaining, titleBuilder.toString()); } /** Splits a cleaned name into firstName/lastName using known last names or last-space fallback. */ diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/PersonNameParserTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/PersonNameParserTest.java index 2158032a..1ea4a22e 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/PersonNameParserTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/PersonNameParserTest.java @@ -190,14 +190,16 @@ class PersonNameParserTest { @Test void split_dotCompressed_titleFirstNameLastName() { PersonNameParser.SplitName result = PersonNameParser.split("Dr.Fr.Zarncke"); - assertThat(result.firstName()).isEqualTo("Dr. Fr."); + assertThat(result.title()).isEqualTo("Dr."); + assertThat(result.firstName()).isEqualTo("Fr."); assertThat(result.lastName()).isEqualTo("Zarncke"); } @Test void split_dotCompressed_titleAndLastName() { PersonNameParser.SplitName result = PersonNameParser.split("Dr.Zarnke"); - assertThat(result.firstName()).isEqualTo("Dr."); + assertThat(result.title()).isEqualTo("Dr."); + assertThat(result.firstName()).isNull(); assertThat(result.lastName()).isEqualTo("Zarnke"); } @@ -210,7 +212,8 @@ class PersonNameParserTest { @Test void split_alreadySpacedDotName_noDoubleSpacing() { PersonNameParser.SplitName result = PersonNameParser.split("Dr. Fr. Zarncke"); - assertThat(result.firstName()).isEqualTo("Dr. Fr."); + assertThat(result.title()).isEqualTo("Dr."); + assertThat(result.firstName()).isEqualTo("Fr."); assertThat(result.lastName()).isEqualTo("Zarncke"); } @@ -353,12 +356,112 @@ class PersonNameParserTest { } @Test - void stripTitle_isPassthrough() { + void stripTitle_noPrefix_returnsNull() { PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Walter de Gruyter"); assertThat(result.cleaned()).isEqualTo("Walter de Gruyter"); assertThat(result.title()).isNull(); } + @Test + void stripTitle_tante() { + PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Tante Molly"); + assertThat(result.cleaned()).isEqualTo("Molly"); + assertThat(result.title()).isEqualTo("Tante"); + } + + @Test + void stripTitle_schwester() { + PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Schwester Hanni"); + assertThat(result.cleaned()).isEqualTo("Hanni"); + assertThat(result.title()).isEqualTo("Schwester"); + } + + @Test + void stripTitle_frau() { + PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Frau Bakker"); + assertThat(result.cleaned()).isEqualTo("Bakker"); + assertThat(result.title()).isEqualTo("Frau"); + } + + @Test + void stripTitle_cousine_withFullName() { + PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Cousine Emmy Haniel"); + assertThat(result.cleaned()).isEqualTo("Emmy Haniel"); + assertThat(result.title()).isEqualTo("Cousine"); + } + + @Test + void stripTitle_freifrau() { + PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Freifrau von Massenbach"); + assertThat(result.cleaned()).isEqualTo("von Massenbach"); + assertThat(result.title()).isEqualTo("Freifrau"); + } + + @Test + void stripTitle_dotPrefix_withSpace() { + PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Dr. Sattelmacher"); + assertThat(result.cleaned()).isEqualTo("Sattelmacher"); + assertThat(result.title()).isEqualTo("Dr."); + } + + @Test + void stripTitle_dotPrefix_noSpace() { + PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Dr.von Gelden"); + assertThat(result.cleaned()).isEqualTo("von Gelden"); + assertThat(result.title()).isEqualTo("Dr."); + } + + @Test + void stripTitle_stacked_profDr() { + PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Prof. Dr. Muller"); + assertThat(result.cleaned()).isEqualTo("Muller"); + assertThat(result.title()).isEqualTo("Prof. Dr."); + } + + // --- split — title extraction end-to-end --- + + @Test + void split_tante_setsTitle_firstNameNull() { + PersonNameParser.SplitName result = PersonNameParser.split("Tante Molly"); + assertThat(result.title()).isEqualTo("Tante"); + assertThat(result.firstName()).isNull(); + assertThat(result.lastName()).isEqualTo("Molly"); + } + + @Test + void split_dotTitle_afterDotNorm() { + PersonNameParser.SplitName result = PersonNameParser.split("Dr.Fr.Zarncke"); + assertThat(result.title()).isEqualTo("Dr."); + assertThat(result.firstName()).isEqualTo("Fr."); + assertThat(result.lastName()).isEqualTo("Zarncke"); + } + + @Test + void split_dotTitle_noSpace_vonLastName() { + PersonNameParser.SplitName result = PersonNameParser.split("Dr.von Gelden"); + assertThat(result.title()).isEqualTo("Dr."); + assertThat(result.firstName()).isNull(); + assertThat(result.lastName()).isEqualTo("von Gelden"); + } + + // --- regression: non-prefixes not stripped --- + + @Test + void split_walter_noTitleStrip() { + PersonNameParser.SplitName result = PersonNameParser.split("Walter de Gruyter"); + assertThat(result.title()).isNull(); + assertThat(result.firstName()).isEqualTo("Walter"); + assertThat(result.lastName()).isEqualTo("de Gruyter"); + } + + @Test + void split_conrad_vonGeldern_noTitleStrip() { + PersonNameParser.SplitName result = PersonNameParser.split("Conrad von Geldern"); + assertThat(result.title()).isNull(); + assertThat(result.firstName()).isEqualTo("Conrad"); + assertThat(result.lastName()).isEqualTo("von Geldern"); + } + // --- stripMaidenName — maiden name extraction --- @Test