feat(parser): support // separator and dot-compressed names #208
@@ -1,6 +1,7 @@
|
||||
package org.raddatz.familienarchiv.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
@@ -20,6 +21,7 @@ public class PersonNameParser {
|
||||
// Matches a maiden-name annotation: leading whitespace, the literal "geb.",
// whitespace, then one run of non-whitespace characters.
// NOTE(review): \S+ consumes a single token only, so a multi-word maiden name
// would be removed only partially — confirm this is intended.
private static final Pattern GEB_PATTERN = Pattern.compile("\\s+geb\\.\\s+\\S+");
|
||||
// Matches a parenthesized group at the very end of the input (optional trailing
// whitespace allowed); group 1 is the text between the parentheses.
private static final Pattern PAREN_LAST_NAME = Pattern.compile("\\(([^)]+)\\)\\s*$");
|
||||
// Matches the word "und" or the bare letter "u" when surrounded by whitespace on both sides.
private static final Pattern MULTI_SEPARATOR = Pattern.compile("\\s+(?:und|u)\\s+");
|
||||
// Matches the literal two-character sequence "//"; parseReceivers splits its input on it
// before any other processing.
private static final Pattern SLASH_SEPARATOR = Pattern.compile("//");
|
||||
|
||||
/**
 * Immutable pair holding the two components of a person name.
 *
 * @param firstName the first-name component
 * @param lastName  the last-name component
 */
public record SplitName(
        String firstName,
        String lastName) {
}
|
||||
|
||||
@@ -38,6 +40,16 @@ public class PersonNameParser {
|
||||
public static List<String> parseReceivers(String raw) {
|
||||
if (raw == null || raw.isBlank()) return List.of();
|
||||
|
||||
// 0. Pre-split on "//" — each segment is an independent name entry
|
||||
String[] slashParts = SLASH_SEPARATOR.split(raw, -1);
|
||||
if (slashParts.length > 1) {
|
||||
return Arrays.stream(slashParts)
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isBlank())
|
||||
.flatMap(segment -> parseReceivers(segment).stream())
|
||||
.toList();
|
||||
}
|
||||
|
||||
// 1. Strip "geb. Xxx" maiden-name annotations
|
||||
String cleaned = GEB_PATTERN.matcher(raw).replaceAll("").trim();
|
||||
|
||||
@@ -111,6 +123,11 @@ public class PersonNameParser {
|
||||
|
||||
String cleaned = GEB_PATTERN.matcher(rawName).replaceAll("").trim();
|
||||
|
||||
// Normalize dot-compressed names: "Dr.Fr.Zarncke" -> "Dr. Fr. Zarncke"
|
||||
if (!cleaned.contains(" ") && cleaned.contains(".")) {
|
||||
cleaned = cleaned.replace(".", ". ").trim();
|
||||
}
|
||||
|
||||
String lastName = findKnownLastName(cleaned);
|
||||
if (lastName != null) {
|
||||
String firstName = cleaned.substring(0, cleaned.length() - lastName.length()).trim();
|
||||
|
||||
@@ -133,6 +133,55 @@ class PersonNameParserTest {
|
||||
assertThat(result.lastName()).isEqualTo("de Gruyter");
|
||||
}
|
||||
|
||||
// --- split — dot-compressed names ---
|
||||
|
||||
@Test
|
||||
void split_dotCompressed_initialAndLastName() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("E.Rockstroh");
|
||||
assertThat(result.firstName()).isEqualTo("E.");
|
||||
assertThat(result.lastName()).isEqualTo("Rockstroh");
|
||||
}
|
||||
|
||||
@Test
|
||||
void split_dotCompressed_twoInitials() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("E.M.");
|
||||
assertThat(result.firstName()).isEqualTo("E.");
|
||||
assertThat(result.lastName()).isEqualTo("M.");
|
||||
}
|
||||
|
||||
@Test
|
||||
void split_dotCompressed_titleFirstNameLastName() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Dr.Fr.Zarncke");
|
||||
assertThat(result.firstName()).isEqualTo("Dr. Fr.");
|
||||
assertThat(result.lastName()).isEqualTo("Zarncke");
|
||||
}
|
||||
|
||||
@Test
|
||||
void split_dotCompressed_titleAndLastName() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Dr.Zarnke");
|
||||
assertThat(result.firstName()).isEqualTo("Dr.");
|
||||
assertThat(result.lastName()).isEqualTo("Zarnke");
|
||||
}
|
||||
|
||||
@Test
|
||||
void parseReceivers_dotCompressedName_passthrough() {
|
||||
assertThat(PersonNameParser.parseReceivers("Dr.Fr.Zarncke"))
|
||||
.containsExactly("Dr.Fr.Zarncke");
|
||||
}
|
||||
|
||||
@Test
|
||||
void split_alreadySpacedDotName_noDoubleSpacing() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Dr. Fr. Zarncke");
|
||||
assertThat(result.firstName()).isEqualTo("Dr. Fr.");
|
||||
assertThat(result.lastName()).isEqualTo("Zarncke");
|
||||
}
|
||||
|
||||
@Test
|
||||
void slashSeparator_combinedWithDotCompressed() {
|
||||
assertThat(PersonNameParser.parseReceivers("E.Rockstroh//Dr.Fr.Zarncke"))
|
||||
.containsExactly("E.Rockstroh", "Dr.Fr.Zarncke");
|
||||
}
|
||||
|
||||
// --- parseReceivers — shared last name with full-name part ─────────────────
|
||||
|
||||
@Test
|
||||
@@ -149,6 +198,38 @@ class PersonNameParserTest {
|
||||
assertThat(result).containsExactlyInAnyOrder("Clara Cram", "Eugenie de Gruyter");
|
||||
}
|
||||
|
||||
// --- parseReceivers — // separator ---
|
||||
|
||||
@Test
|
||||
void slashSeparator_twoIndependentFullNames() {
|
||||
assertThat(PersonNameParser.parseReceivers("Charl.Blomquist//Tante Lolly"))
|
||||
.containsExactly("Charl.Blomquist", "Tante Lolly");
|
||||
}
|
||||
|
||||
@Test
|
||||
void slashSeparator_abbreviatedFirstName() {
|
||||
assertThat(PersonNameParser.parseReceivers("Walter de Gruyter//Eugenie de Gruyter"))
|
||||
.containsExactly("Walter de Gruyter", "Eugenie de Gruyter");
|
||||
}
|
||||
|
||||
@Test
|
||||
void slashSeparator_withSpacesAroundSlashes() {
|
||||
assertThat(PersonNameParser.parseReceivers(" Herbert Cram // Eugenie de Gruyter "))
|
||||
.containsExactly("Herbert Cram", "Eugenie de Gruyter");
|
||||
}
|
||||
|
||||
@Test
|
||||
void slashSeparator_segmentContainsUnd() {
|
||||
assertThat(PersonNameParser.parseReceivers("Herbert und Clara Cram//Eugenie de Gruyter"))
|
||||
.containsExactly("Herbert Cram", "Clara Cram", "Eugenie de Gruyter");
|
||||
}
|
||||
|
||||
@Test
|
||||
void slashSeparator_trailingSlash() {
|
||||
assertThat(PersonNameParser.parseReceivers("Herbert Cram//"))
|
||||
.containsExactly("Herbert Cram");
|
||||
}
|
||||
|
||||
@Test
|
||||
void parseReceivers_returnsEmpty_whenAllPartsAreFamilie() {
|
||||
// All parts filtered out → nameParts.isEmpty() = true → return List.of()
|
||||
|
||||
Reference in New Issue
Block a user