From c49cb345ca02323d9552cb88f49d107fb7607291 Mon Sep 17 00:00:00 2001 From: Marcel Date: Wed, 8 Apr 2026 12:51:32 +0200 Subject: [PATCH] feat(parser): widen GEB_PATTERN and extract maiden name in stripMaidenName Widen pattern from `\s+geb\.\s+\S+` to `,?\s*geb\.?\s+(.+)$` to handle: optional comma, optional dot, multi-word maiden names. stripMaidenName() now captures the maiden name instead of discarding it. Handles all 5 input variants from the ODS data. Co-Authored-By: Claude Sonnet 4.6 --- .../service/PersonNameParser.java | 13 ++-- .../service/PersonNameParserTest.java | 64 ++++++++++++++++++- 2 files changed, 72 insertions(+), 5 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/PersonNameParser.java b/backend/src/main/java/org/raddatz/familienarchiv/service/PersonNameParser.java index d66db4fd..d33cd75f 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/PersonNameParser.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/PersonNameParser.java @@ -18,7 +18,7 @@ public class PersonNameParser { static final List KNOWN_LAST_NAMES = List.of( "de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram"); - private static final Pattern GEB_PATTERN = Pattern.compile("\\s+geb\\.\\s+\\S+"); + private static final Pattern GEB_PATTERN = Pattern.compile(",?\\s*geb\\.?\\s+(.+)$"); private static final Pattern PAREN_LAST_NAME = Pattern.compile("\\(([^)]+)\\)\\s*$"); private static final Pattern MULTI_SEPARATOR = Pattern.compile("\\s+(?:und|u)\\s+"); private static final Pattern SLASH_SEPARATOR = Pattern.compile("//"); @@ -153,10 +153,15 @@ public class PersonNameParser { ); } - /** Strips "geb. Xxx" maiden-name annotations. Pass-through until #209. */ + /** Strips a "geb" or "geb." maiden-name annotation (optionally preceded by a comma) and extracts the maiden name. On the first GEB_PATTERN match, returns the trimmed text before the match as the cleaned name and the trimmed capture group 1 (the text after the marker, which may span several words) as the maiden name. When nothing matches, returns the input as-is (not trimmed) with a null maiden name. NOTE(review): the pattern allows zero whitespace before "geb", so a word that merely ends in "geb" and is followed by whitespace would also match - confirm this is intended. 
*/ public static MaidenNameResult stripMaidenName(String input) { - String cleaned = GEB_PATTERN.matcher(input).replaceAll("").trim(); - return new MaidenNameResult(cleaned, null); + Matcher m = GEB_PATTERN.matcher(input); + if (m.find()) { + String cleaned = input.substring(0, m.start()).trim(); + String maidenName = m.group(1).trim(); + return new MaidenNameResult(cleaned, maidenName); + } + return new MaidenNameResult(input, null); } /** Normalizes dot-compressed names: "Dr.Fr.Zarncke" → "Dr. Fr. Zarncke" */ diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/PersonNameParserTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/PersonNameParserTest.java index a1148fa7..650e9dc9 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/PersonNameParserTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/PersonNameParserTest.java @@ -115,7 +115,7 @@ class PersonNameParserTest { assertThat(result.title()).isNull(); assertThat(result.firstName()).isEqualTo("Eugenie"); assertThat(result.lastName()).isEqualTo("de Gruyter"); - assertThat(result.maidenName()).isNull(); + assertThat(result.maidenName()).isEqualTo("Müller"); assertThat(result.annotation()).isNull(); } @@ -282,6 +282,68 @@ class PersonNameParserTest { assertThat(result.title()).isNull(); } + // --- stripMaidenName — maiden name extraction --- + + @Test + void stripMaidenName_standardDot_singleWord() { + PersonNameParser.MaidenNameResult result = PersonNameParser.stripMaidenName("Eugenie de Gruyter geb. Muller"); + assertThat(result.cleaned()).isEqualTo("Eugenie de Gruyter"); + assertThat(result.maidenName()).isEqualTo("Muller"); + } + + @Test + void stripMaidenName_dot_multiWordMaidenName() { + PersonNameParser.MaidenNameResult result = PersonNameParser.stripMaidenName("Clara Cram geb. 
de Gruyter"); + assertThat(result.cleaned()).isEqualTo("Clara Cram"); + assertThat(result.maidenName()).isEqualTo("de Gruyter"); + } + + @Test + void stripMaidenName_commaPrefix_noDot_multiWord() { + PersonNameParser.MaidenNameResult result = PersonNameParser.stripMaidenName("Ella Dieckmann, geb de Gruyter"); + assertThat(result.cleaned()).isEqualTo("Ella Dieckmann"); + assertThat(result.maidenName()).isEqualTo("de Gruyter"); + } + + @Test + void stripMaidenName_noDot_singleWord() { + PersonNameParser.MaidenNameResult result = PersonNameParser.stripMaidenName("Elise Rockstroh geb Sintenis"); + assertThat(result.cleaned()).isEqualTo("Elise Rockstroh"); + assertThat(result.maidenName()).isEqualTo("Sintenis"); + } + + @Test + void stripMaidenName_noDot_noMarriedLastName() { + PersonNameParser.MaidenNameResult result = PersonNameParser.stripMaidenName("Elisabeth geb Fernow"); + assertThat(result.cleaned()).isEqualTo("Elisabeth"); + assertThat(result.maidenName()).isEqualTo("Fernow"); + } + + @Test + void stripMaidenName_noGeb_returnsNullMaidenName() { + PersonNameParser.MaidenNameResult result = PersonNameParser.stripMaidenName("Walter de Gruyter"); + assertThat(result.cleaned()).isEqualTo("Walter de Gruyter"); + assertThat(result.maidenName()).isNull(); + } + + // --- split — maiden name extraction end-to-end --- + + @Test + void split_gebDot_extractsMaidenName() { + PersonNameParser.SplitName result = PersonNameParser.split("Eugenie de Gruyter geb. Muller"); + assertThat(result.firstName()).isEqualTo("Eugenie"); + assertThat(result.lastName()).isEqualTo("de Gruyter"); + assertThat(result.maidenName()).isEqualTo("Muller"); + } + + @Test + void split_gebNoDot_multiWordMaidenName() { + PersonNameParser.SplitName result = PersonNameParser.split("Clara Cram geb. de Gruyter"); + assertThat(result.firstName()).isEqualTo("Clara"); + assertThat(result.lastName()).isEqualTo("Cram"); + assertThat(result.maidenName()).isEqualTo("de Gruyter"); + } + // --- enum values --- @Test