feat: add PersonNameParser utility for ODS name normalisation
Pure static utility that parses raw name strings from the ODS into structured Person data. Handles multi-receiver patterns like "Walter und Eugenie de Gruyter" → [Walter de Gruyter, Eugenie de Gruyter], parenthesised last names, "geb." maiden-name stripping, and "Familie" filtering. Includes unit tests for all patterns found in the data. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,139 @@
|
|||||||
|
package org.raddatz.familienarchiv.service;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
 * Parses name strings from the ODS import into structured Person data.
 *
 * <p>All methods are pure/static so they can be tested without a Spring context.
 * Every list returned by this class is unmodifiable.
 */
public final class PersonNameParser {

    /**
     * Known last names in this archive, longest first so that a multi-word name
     * (e.g. "de Gruyter") is checked before any single-word name.
     */
    static final List<String> KNOWN_LAST_NAMES = List.of(
            // \u00fc is the u-umlaut; escaped so the literal survives any javac source encoding.
            "de Gruyter", "Dieckmann", "Gruber", "M\u00fcller", "Wolff", "Cram");

    /**
     * Matches a "geb. Xxx" maiden-name annotation. Lower-case name particles in
     * front of the maiden name ("geb. de Gruyter", "geb. von Xxx") are consumed
     * as well, so no half of a multi-word maiden name is left behind.
     */
    private static final Pattern GEB_PATTERN = Pattern.compile(
            "\\s+geb\\.\\s+(?:(?:von|van|vom|zu|zur|de|der|den|du)\\s+)*\\S+");

    /** Matches a trailing parenthesised last name such as "(Gruber)". */
    private static final Pattern PAREN_LAST_NAME = Pattern.compile("\\(([^)]+)\\)\\s*$");

    /** Matches the separators " und ", " u " and " u. " between two receivers. */
    private static final Pattern MULTI_SEPARATOR = Pattern.compile("\\s+(?:und|u\\.?)\\s+");

    /** Placeholder used by {@link #split(String)} for a name part that is unknown. */
    private static final String PLACEHOLDER = "?";

    /** Segment that denotes a whole family rather than a person. */
    private static final String FAMILY_TOKEN = "Familie";

    private PersonNameParser() {
        // Static utility class; not meant to be instantiated.
    }

    /** First-name / last-name pair produced by {@link #split(String)}. */
    public record SplitName(String firstName, String lastName) {}

    /**
     * Parses the "An" field from the ODS into individual normalised name strings.
     *
     * <p>Handles:
     * <ul>
     *   <li>{@code "Walter und Eugenie de Gruyter"} &rarr; {@code ["Walter de Gruyter", "Eugenie de Gruyter"]}</li>
     *   <li>{@code "Herbert und Clara Cram"} &rarr; {@code ["Herbert Cram", "Clara Cram"]}</li>
     *   <li>{@code "Hedi und Tutu (Gruber)"} &rarr; {@code ["Hedi Gruber", "Tutu Gruber"]}</li>
     *   <li>{@code "Hedi (Gruber)"} &rarr; {@code ["Hedi Gruber"]}</li>
     *   <li>{@code "Clara Cram u Ellen B-M"} &rarr; {@code ["Clara Cram", "Ellen B-M"]}</li>
     *   <li>{@code "Clara u Familie"} &rarr; {@code ["Clara"]}</li>
     *   <li>{@code "Walter und Eugenie"} &rarr; {@code ["Walter", "Eugenie"]}</li>
     *   <li>{@code "Eugenie de Gruyter geb. M&uuml;ller"} &rarr; {@code ["Eugenie de Gruyter"]}</li>
     * </ul>
     *
     * @param raw the raw cell content; may be {@code null} or blank
     * @return an unmodifiable list of names in input order; empty when {@code raw}
     *         is {@code null}/blank or nothing is left after normalisation. The list
     *         never contains an empty string.
     */
    public static List<String> parseReceivers(String raw) {
        if (raw == null || raw.isBlank()) {
            return List.of();
        }

        // 1. Strip "geb. Xxx" maiden-name annotations.
        String cleaned = GEB_PATTERN.matcher(raw).replaceAll("").strip();

        // 2. Extract a trailing parenthesised last name, e.g. "(Gruber)".
        String sharedLastName = null;
        Matcher parenMatcher = PAREN_LAST_NAME.matcher(cleaned);
        if (parenMatcher.find()) {
            String inParens = parenMatcher.group(1).strip();
            if (!inParens.isEmpty()) {
                sharedLastName = inParens;
            }
            cleaned = cleaned.substring(0, parenMatcher.start()).strip();
        }

        // 3. Without a separator the field names a single person.
        if (!MULTI_SEPARATOR.matcher(cleaned).find()) {
            if (!cleaned.isEmpty()) {
                return List.of(withLastName(cleaned, sharedLastName));
            }
            // Nothing but an annotation was given: keep the parenthesised name if
            // there is one, but never hand back an empty name.
            if (sharedLastName != null) {
                return List.of(sharedLastName);
            }
            return List.of();
        }

        // 4./5. Split on the separator and drop "Familie" (not a person).
        List<String> nameParts = new ArrayList<>();
        for (String part : MULTI_SEPARATOR.split(cleaned)) {
            String name = part.strip();
            if (!name.isEmpty() && !name.equalsIgnoreCase(FAMILY_TOKEN)) {
                nameParts.add(name);
            }
        }
        if (nameParts.isEmpty()) {
            return List.of();
        }

        // 6. A parenthesised last name applies to every part that has no last name yet.
        if (sharedLastName != null) {
            String lastName = sharedLastName;
            return nameParts.stream()
                    .map(part -> withLastName(part, lastName))
                    .toList();
        }
        if (nameParts.size() == 1) {
            return List.copyOf(nameParts);
        }

        // 7. Try to detect a shared last name in the final segment and distribute it
        //    to earlier segments that are single first-name tokens.
        String lastSegment = nameParts.get(nameParts.size() - 1);
        String detectedLastName = findKnownLastName(lastSegment);
        if (detectedLastName == null) {
            // 8. No shared last name found: return the parts as they are.
            return List.copyOf(nameParts);
        }

        List<String> result = new ArrayList<>(nameParts.size());
        for (String part : nameParts.subList(0, nameParts.size() - 1)) {
            // Distribute only if the part has no last name of its own.
            boolean bareFirstName = !part.contains(" ") && findKnownLastName(part) == null;
            result.add(bareFirstName ? part + " " + detectedLastName : part);
        }
        result.add(lastSegment);
        return List.copyOf(result);
    }

    /**
     * Splits a single full name string into firstName and lastName.
     *
     * <p>A "geb. Xxx" annotation is removed first. If the remainder ends with one of
     * the {@link #KNOWN_LAST_NAMES}, that name becomes the last name and everything
     * in front of it the first name; otherwise the name is split on its last space.
     *
     * <p>Special cases:
     * <ul>
     *   <li>{@code null}, blank, or nothing left after stripping &rarr; {@code ("?", "?")}</li>
     *   <li>a single token that is not a known last name &rarr; {@code (token, "?")}</li>
     *   <li>a name consisting only of a known last name &rarr; that text is used as the
     *       first name as well, e.g. {@code ("Gruber", "Gruber")}</li>
     * </ul>
     *
     * @param rawName the full name; may be {@code null}
     * @return the split name; neither component is ever {@code null} or empty
     */
    public static SplitName split(String rawName) {
        if (rawName == null || rawName.isBlank()) {
            return new SplitName(PLACEHOLDER, PLACEHOLDER);
        }

        String cleaned = GEB_PATTERN.matcher(rawName).replaceAll("").strip();
        if (cleaned.isEmpty()) {
            // The input consisted of a maiden-name annotation only.
            return new SplitName(PLACEHOLDER, PLACEHOLDER);
        }

        String lastName = findKnownLastName(cleaned);
        if (lastName != null) {
            String firstName = cleaned.substring(0, cleaned.length() - lastName.length()).strip();
            if (firstName.isEmpty()) {
                firstName = cleaned;
            }
            return new SplitName(firstName, lastName);
        }

        int lastSpace = cleaned.lastIndexOf(' ');
        if (lastSpace > 0) {
            return new SplitName(
                    cleaned.substring(0, lastSpace).strip(),
                    cleaned.substring(lastSpace + 1).strip());
        }

        return new SplitName(cleaned, PLACEHOLDER);
    }

    /**
     * Returns the known last name that the given string ends with, or {@code null}.
     *
     * <p>The comparison ignores case. The match must cover whole words: it either
     * spans the entire string or is preceded by whitespace, so "Niemueller"-style
     * names that merely end in the letters of a known name are not matched.
     *
     * @param name the name to inspect; {@code null} yields {@code null}
     * @return the matching entry of {@link #KNOWN_LAST_NAMES} in its canonical
     *         spelling, or {@code null} if none matches
     */
    static String findKnownLastName(String name) {
        if (name == null) {
            return null;
        }
        for (String lastName : KNOWN_LAST_NAMES) {
            int start = name.length() - lastName.length();
            if (start < 0 || !name.regionMatches(true, start, lastName, 0, lastName.length())) {
                continue;
            }
            if (start == 0 || Character.isWhitespace(name.charAt(start - 1))) {
                return lastName;
            }
        }
        return null;
    }

    /** Appends {@code lastName} to a single-token {@code name}; multi-token names are kept as-is. */
    private static String withLastName(String name, String lastName) {
        if (lastName == null || name.contains(" ")) {
            return name;
        }
        return name + " " + lastName;
    }
}
|
||||||
@@ -0,0 +1,120 @@
|
|||||||
|
package org.raddatz.familienarchiv.service;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
class PersonNameParserTest {
|
||||||
|
|
||||||
|
// --- parseReceivers ---
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void singlePerson_noChange() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers("Walter de Gruyter"))
|
||||||
|
.containsExactly("Walter de Gruyter");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void gebAnnotation_stripped() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers("Eugenie de Gruyter geb. Müller"))
|
||||||
|
.containsExactly("Eugenie de Gruyter");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void twoFirstNames_sharedKnownLastName_und() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers("Walter und Eugenie de Gruyter"))
|
||||||
|
.containsExactly("Walter de Gruyter", "Eugenie de Gruyter");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void twoFirstNames_sharedKnownLastName_u() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers("Herbert und Clara Cram"))
|
||||||
|
.containsExactly("Herbert Cram", "Clara Cram");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void twoFirstNames_sharedKnownLastName_u_short() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers("Ella u Walter Dieckmann"))
|
||||||
|
.containsExactly("Ella Dieckmann", "Walter Dieckmann");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void twoFirstNames_parenthesisedLastName() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers("Hedi und Tutu (Gruber)"))
|
||||||
|
.containsExactly("Hedi Gruber", "Tutu Gruber");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void twoPersons_differentLastNames() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers("Clara Cram u Ellen B-M"))
|
||||||
|
.containsExactly("Clara Cram", "Ellen B-M");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void familie_filtered_out() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers("Clara u Familie"))
|
||||||
|
.containsExactly("Clara");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void twoFirstNames_noLastName() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers("Walter und Eugenie"))
|
||||||
|
.containsExactly("Walter", "Eugenie");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void nullInput_returnsEmpty() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers(null)).isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void blankInput_returnsEmpty() {
|
||||||
|
assertThat(PersonNameParser.parseReceivers(" ")).isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- split ---
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void split_knownMultiWordLastName() {
|
||||||
|
PersonNameParser.SplitName result = PersonNameParser.split("Walter de Gruyter");
|
||||||
|
assertThat(result.firstName()).isEqualTo("Walter");
|
||||||
|
assertThat(result.lastName()).isEqualTo("de Gruyter");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void split_knownSingleWordLastName() {
|
||||||
|
PersonNameParser.SplitName result = PersonNameParser.split("Clara Cram");
|
||||||
|
assertThat(result.firstName()).isEqualTo("Clara");
|
||||||
|
assertThat(result.lastName()).isEqualTo("Cram");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void split_unknownLastName_fallsBackToLastSpace() {
|
||||||
|
PersonNameParser.SplitName result = PersonNameParser.split("Ellen Burkhard-Meier");
|
||||||
|
assertThat(result.firstName()).isEqualTo("Ellen");
|
||||||
|
assertThat(result.lastName()).isEqualTo("Burkhard-Meier");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void split_singleToken_lastNameIsPlaceholder() {
|
||||||
|
PersonNameParser.SplitName result = PersonNameParser.split("Clara");
|
||||||
|
assertThat(result.firstName()).isEqualTo("Clara");
|
||||||
|
assertThat(result.lastName()).isEqualTo("?");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void split_gebAnnotation_stripped() {
|
||||||
|
PersonNameParser.SplitName result = PersonNameParser.split("Eugenie de Gruyter geb. Müller");
|
||||||
|
assertThat(result.firstName()).isEqualTo("Eugenie");
|
||||||
|
assertThat(result.lastName()).isEqualTo("de Gruyter");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void split_null_returnsPlaceholder() {
|
||||||
|
PersonNameParser.SplitName result = PersonNameParser.split(null);
|
||||||
|
assertThat(result.firstName()).isEqualTo("?");
|
||||||
|
assertThat(result.lastName()).isEqualTo("?");
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user