// Source file: backend/src/main/java/org/raddatz/familienarchiv/service/PersonNameParser.java
// NOTE(review): restore "package org.raddatz.familienarchiv.service;" as the first
// statement when this file is placed at the path above.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parses raw name strings from the ODS import into structured person data.
 *
 * <p>Handles multi-receiver patterns such as {@code "Walter und Eugenie de Gruyter"},
 * parenthesised shared last names, "geb." maiden-name annotations and the
 * non-person token "Familie".
 *
 * <p>All methods are pure and static so they can be tested without a Spring context.
 * The source is kept ASCII-only (non-ASCII letters are written as Unicode escapes or
 * HTML entities) so that it compiles identically regardless of the platform charset.
 */
public final class PersonNameParser {

    /**
     * Known last names in this archive, longest first so that multi-word names
     * (e.g. "de Gruyter") are checked before any single-word name.
     */
    static final List<String> KNOWN_LAST_NAMES = List.of(
            // "M\u00fcller" is "Mueller" spelled with u-umlaut.
            "de Gruyter", "Dieckmann", "Gruber", "M\u00fcller", "Wolff", "Cram");

    /**
     * Matches a " geb. Xxx" maiden-name annotation. A known last name is preferred
     * (so a multi-word maiden name such as "de Gruyter" is removed completely);
     * otherwise exactly one whitespace-free token is consumed.
     */
    private static final Pattern GEB_PATTERN = Pattern.compile(
            "\\s+geb\\.\\s+(?:(?iu:" + knownLastNameAlternation() + ")(?!\\S)|\\S+)");

    /** Matches a trailing parenthesised last name, e.g. "(Gruber)". */
    private static final Pattern PAREN_LAST_NAME = Pattern.compile("\\(([^)]+)\\)\\s*$");

    /** Matches the receiver separators " und ", " u " and " u. ". */
    private static final Pattern MULTI_SEPARATOR = Pattern.compile("\\s+(?:und|u\\.?)\\s+");

    /** Placeholder used when a name component cannot be determined. */
    private static final String UNKNOWN = "?";

    /** A full name split into its first-name and last-name components. */
    public record SplitName(String firstName, String lastName) {}

    private PersonNameParser() {
        // Static utility class; not meant to be instantiated.
    }

    /**
     * Parses the "An" (receiver) field from the ODS into individual normalised names.
     *
     * <ul>
     *   <li>{@code "Walter und Eugenie de Gruyter"} yields
     *       {@code ["Walter de Gruyter", "Eugenie de Gruyter"]}</li>
     *   <li>{@code "Herbert und Clara Cram"} yields {@code ["Herbert Cram", "Clara Cram"]}</li>
     *   <li>{@code "Hedi und Tutu (Gruber)"} yields {@code ["Hedi Gruber", "Tutu Gruber"]}</li>
     *   <li>{@code "Hedi (Gruber)"} yields {@code ["Hedi Gruber"]}</li>
     *   <li>{@code "Clara Cram u Ellen B-M"} yields {@code ["Clara Cram", "Ellen B-M"]}</li>
     *   <li>{@code "Clara u Familie"} yields {@code ["Clara"]}</li>
     *   <li>{@code "Walter und Eugenie"} yields {@code ["Walter", "Eugenie"]}</li>
     *   <li>{@code "Clara Cram geb. de Gruyter"} yields {@code ["Clara Cram"]}</li>
     * </ul>
     *
     * @param raw the raw cell content; may be {@code null} or blank
     * @return an unmodifiable list of names, empty when no name can be extracted
     */
    public static List<String> parseReceivers(String raw) {
        if (raw == null || raw.isBlank()) {
            return List.of();
        }

        // 1. Strip "geb. Xxx" maiden-name annotations.
        String cleaned = GEB_PATTERN.matcher(raw).replaceAll("").trim();

        // 2. Extract a trailing parenthesised last name, e.g. "(Gruber)".
        String sharedLastName = null;
        Matcher parenMatcher = PAREN_LAST_NAME.matcher(cleaned);
        if (parenMatcher.find()) {
            String candidate = parenMatcher.group(1).trim();
            if (!candidate.isEmpty()) {
                sharedLastName = candidate;
            }
            cleaned = cleaned.substring(0, parenMatcher.start()).trim();
        }

        // 3. Without a separator this is a single person. The parenthesised last
        //    name still applies, and an empty remainder never becomes an empty name.
        if (!MULTI_SEPARATOR.matcher(cleaned).find()) {
            if (cleaned.isEmpty()) {
                return sharedLastName == null ? List.of() : List.of(sharedLastName);
            }
            return List.of(withLastName(cleaned, sharedLastName));
        }

        // 4. Split on the separators and drop "Familie", which is not a person.
        List<String> nameParts = new ArrayList<>();
        for (String part : MULTI_SEPARATOR.split(cleaned)) {
            String name = part.trim();
            if (!name.isEmpty() && !name.equalsIgnoreCase("Familie")) {
                nameParts.add(name);
            }
        }
        if (nameParts.isEmpty()) {
            return List.of();
        }

        // 5. A parenthesised last name applies to every single-token part, even if
        //    only one part survived the "Familie" filter.
        if (sharedLastName != null) {
            String lastName = sharedLastName;
            return nameParts.stream().map(p -> withLastName(p, lastName)).toList();
        }
        if (nameParts.size() == 1) {
            return List.copyOf(nameParts);
        }

        // 6. Otherwise try to detect a known last name on the final segment and
        //    distribute it to earlier segments that are bare first names.
        String lastSegment = nameParts.get(nameParts.size() - 1);
        String detectedLastName = findKnownLastName(lastSegment);
        if (detectedLastName == null) {
            return List.copyOf(nameParts);
        }

        List<String> result = new ArrayList<>(nameParts.size());
        for (String part : nameParts.subList(0, nameParts.size() - 1)) {
            boolean bareFirstName = !part.contains(" ") && findKnownLastName(part) == null;
            result.add(bareFirstName ? part + " " + detectedLastName : part);
        }
        result.add(lastSegment);
        return List.copyOf(result);
    }

    /**
     * Splits a single full name into first name and last name.
     *
     * <p>Known last names are tried first; otherwise the name is split at its last
     * space. A single unknown token becomes the first name with {@code "?"} as last
     * name. If the input consists only of a known last name, that text is used as
     * first name as well, so the first name is never blank.
     *
     * @param rawName the full name, optionally with a "geb." annotation; may be {@code null}
     * @return the split name; both components are {@code "?"} for null, blank or
     *         annotation-only input
     */
    public static SplitName split(String rawName) {
        if (rawName == null || rawName.isBlank()) {
            return new SplitName(UNKNOWN, UNKNOWN);
        }

        String cleaned = GEB_PATTERN.matcher(rawName).replaceAll("").trim();
        if (cleaned.isEmpty()) {
            return new SplitName(UNKNOWN, UNKNOWN);
        }

        String lastName = findKnownLastName(cleaned);
        if (lastName != null) {
            String firstName = cleaned.substring(0, cleaned.length() - lastName.length()).trim();
            if (firstName.isBlank()) {
                firstName = cleaned;
            }
            return new SplitName(firstName, lastName);
        }

        int lastSpace = cleaned.lastIndexOf(' ');
        if (lastSpace > 0) {
            return new SplitName(
                    cleaned.substring(0, lastSpace).trim(),
                    cleaned.substring(lastSpace + 1).trim());
        }

        return new SplitName(cleaned, UNKNOWN);
    }

    /**
     * Returns the known last name that the given string ends with, or {@code null}.
     *
     * <p>The comparison ignores case without depending on the default locale, and the
     * match must start at the beginning of the string or directly after whitespace,
     * so "Oberm&uuml;ller" or "Schmidt-M&uuml;ller" are not mistaken for "M&uuml;ller".
     *
     * @param name the name to inspect; {@code null} yields {@code null}
     * @return the matching entry of {@link #KNOWN_LAST_NAMES} in its canonical spelling
     */
    static String findKnownLastName(String name) {
        if (name == null) {
            return null;
        }
        for (String lastName : KNOWN_LAST_NAMES) {
            int start = name.length() - lastName.length();
            if (start < 0) {
                continue;
            }
            boolean atTokenStart = start == 0 || Character.isWhitespace(name.charAt(start - 1));
            if (atTokenStart && name.regionMatches(true, start, lastName, 0, lastName.length())) {
                return lastName;
            }
        }
        return null;
    }

    /** Appends {@code lastName} to a single-token name; multi-token names are kept as-is. */
    private static String withLastName(String name, String lastName) {
        if (lastName == null || name.contains(" ")) {
            return name;
        }
        return name + " " + lastName;
    }

    /** Builds a regex alternation of all known last names, each quoted literally. */
    private static String knownLastNameAlternation() {
        return String.join("|", KNOWN_LAST_NAMES.stream().map(Pattern::quote).toList());
    }
}