feat(parser): implement stripTitle for known prefixes
Two-pass title stripping with a loop for stacked titles:
- Dot-prefixes (Dr., Prof.) are matched without requiring a trailing space
- Word-prefixes (Tante, Frau, Schwester, etc.) are matched at a word boundary
- Stacked titles like "Prof. Dr. Muller" are handled correctly
- A single token left after title stripping goes to lastName (not firstName)

Add 5 "von" last names to KNOWN_LAST_NAMES for correct splitting of entries like "Freifrau von Massenbach".

15 new test cases + 3 existing tests updated for title behavior.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ public class PersonNameParser {
|
||||
// Known last names in this archive. Multi-word names come first so that they are
|
||||
// tried before any single-word name (e.g. "de Gruyter" must be checked first).
// NOTE(review): the list is not strictly sorted by length ("von der Heide" precedes
// the longer "von Massenbach", "von Staa" precedes "de Gruyter") — confirm that the
// matching code only relies on multi-word entries preceding single-word ones.
|
||||
static final List<String> KNOWN_LAST_NAMES = List.of(
|
||||
"von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa",
|
||||
"de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram");
|
||||
|
||||
private static final Pattern GEB_PATTERN = Pattern.compile(",?\\s*geb\\.?\\s+(.+)$");
|
||||
@@ -149,8 +150,22 @@ public class PersonNameParser {
|
||||
|
||||
NameParts parts = splitByKnownLastNameOrFallback(cleaned);
|
||||
|
||||
String firstName = parts.firstName();
|
||||
String lastName = parts.lastName();
|
||||
|
||||
// When a title was stripped and no first name could be extracted, the
|
||||
// remaining text is the lastName. "Tante Molly" -> title=Tante, lastName=Molly.
|
||||
if (title.title() != null) {
|
||||
if ("?".equals(lastName) && !cleaned.contains(" ")) {
|
||||
lastName = firstName;
|
||||
firstName = null;
|
||||
} else if (firstName.equals(lastName)) {
|
||||
firstName = null;
|
||||
}
|
||||
}
|
||||
|
||||
return new SplitName(
|
||||
title.title(), parts.firstName(), parts.lastName(),
|
||||
title.title(), firstName, lastName,
|
||||
maiden.maidenName(), paren.annotation()
|
||||
);
|
||||
}
|
||||
@@ -196,9 +211,50 @@ public class PersonNameParser {
|
||||
return new AnnotationResult(cleaned, rawAnnotation);
|
||||
}
|
||||
|
||||
/** Strips title prefixes. Pass-through until #212. */
|
||||
private static final List<String> DOT_PREFIXES = List.of("Dr.", "Prof.");
|
||||
|
||||
private static final List<String> WORD_PREFIXES = List.of(
|
||||
"Frau", "Herr", "Freifrau", "Freiherr",
|
||||
"Tante", "Onkel", "Schwester", "Bruder",
|
||||
"Cousine", "Cousin", "Freundin", "Freund",
|
||||
"Mutter", "Vater", "Pastor", "Architekt");
|
||||
|
||||
/** Strips known title/relationship prefixes, looping for stacked titles. */
|
||||
public static TitleResult stripTitle(String input) {
|
||||
return new TitleResult(input, null);
|
||||
String remaining = input;
|
||||
StringBuilder titleBuilder = new StringBuilder();
|
||||
boolean found = true;
|
||||
|
||||
while (found) {
|
||||
found = false;
|
||||
|
||||
for (String prefix : DOT_PREFIXES) {
|
||||
if (remaining.toLowerCase().startsWith(prefix.toLowerCase())) {
|
||||
titleBuilder.append(titleBuilder.isEmpty() ? "" : " ").append(prefix);
|
||||
remaining = remaining.substring(prefix.length()).trim();
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (found) continue;
|
||||
|
||||
for (String prefix : WORD_PREFIXES) {
|
||||
String lower = remaining.toLowerCase();
|
||||
if (lower.startsWith(prefix.toLowerCase() + " ") || lower.equals(prefix.toLowerCase())) {
|
||||
titleBuilder.append(titleBuilder.isEmpty() ? "" : " ").append(prefix);
|
||||
remaining = remaining.length() > prefix.length()
|
||||
? remaining.substring(prefix.length() + 1).trim()
|
||||
: "";
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (titleBuilder.isEmpty()) {
|
||||
return new TitleResult(input, null);
|
||||
}
|
||||
return new TitleResult(remaining, titleBuilder.toString());
|
||||
}
|
||||
|
||||
/** Splits a cleaned name into firstName/lastName using known last names or last-space fallback. */
|
||||
|
||||
@@ -190,14 +190,16 @@ class PersonNameParserTest {
|
||||
@Test
|
||||
void split_dotCompressed_titleFirstNameLastName() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Dr.Fr.Zarncke");
|
||||
assertThat(result.firstName()).isEqualTo("Dr. Fr.");
|
||||
assertThat(result.title()).isEqualTo("Dr.");
|
||||
assertThat(result.firstName()).isEqualTo("Fr.");
|
||||
assertThat(result.lastName()).isEqualTo("Zarncke");
|
||||
}
|
||||
|
||||
@Test
|
||||
void split_dotCompressed_titleAndLastName() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Dr.Zarnke");
|
||||
assertThat(result.firstName()).isEqualTo("Dr.");
|
||||
assertThat(result.title()).isEqualTo("Dr.");
|
||||
assertThat(result.firstName()).isNull();
|
||||
assertThat(result.lastName()).isEqualTo("Zarnke");
|
||||
}
|
||||
|
||||
@@ -210,7 +212,8 @@ class PersonNameParserTest {
|
||||
@Test
|
||||
void split_alreadySpacedDotName_noDoubleSpacing() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Dr. Fr. Zarncke");
|
||||
assertThat(result.firstName()).isEqualTo("Dr. Fr.");
|
||||
assertThat(result.title()).isEqualTo("Dr.");
|
||||
assertThat(result.firstName()).isEqualTo("Fr.");
|
||||
assertThat(result.lastName()).isEqualTo("Zarncke");
|
||||
}
|
||||
|
||||
@@ -353,12 +356,112 @@ class PersonNameParserTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_isPassthrough() {
|
||||
void stripTitle_noPrefix_returnsNull() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Walter de Gruyter");
|
||||
assertThat(result.cleaned()).isEqualTo("Walter de Gruyter");
|
||||
assertThat(result.title()).isNull();
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_tante() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Tante Molly");
|
||||
assertThat(result.cleaned()).isEqualTo("Molly");
|
||||
assertThat(result.title()).isEqualTo("Tante");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_schwester() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Schwester Hanni");
|
||||
assertThat(result.cleaned()).isEqualTo("Hanni");
|
||||
assertThat(result.title()).isEqualTo("Schwester");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_frau() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Frau Bakker");
|
||||
assertThat(result.cleaned()).isEqualTo("Bakker");
|
||||
assertThat(result.title()).isEqualTo("Frau");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_cousine_withFullName() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Cousine Emmy Haniel");
|
||||
assertThat(result.cleaned()).isEqualTo("Emmy Haniel");
|
||||
assertThat(result.title()).isEqualTo("Cousine");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_freifrau() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Freifrau von Massenbach");
|
||||
assertThat(result.cleaned()).isEqualTo("von Massenbach");
|
||||
assertThat(result.title()).isEqualTo("Freifrau");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_dotPrefix_withSpace() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Dr. Sattelmacher");
|
||||
assertThat(result.cleaned()).isEqualTo("Sattelmacher");
|
||||
assertThat(result.title()).isEqualTo("Dr.");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_dotPrefix_noSpace() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Dr.von Gelden");
|
||||
assertThat(result.cleaned()).isEqualTo("von Gelden");
|
||||
assertThat(result.title()).isEqualTo("Dr.");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_stacked_profDr() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Prof. Dr. Muller");
|
||||
assertThat(result.cleaned()).isEqualTo("Muller");
|
||||
assertThat(result.title()).isEqualTo("Prof. Dr.");
|
||||
}
|
||||
|
||||
// --- split — title extraction end-to-end ---
|
||||
|
||||
@Test
|
||||
void split_tante_setsTitle_firstNameNull() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Tante Molly");
|
||||
assertThat(result.title()).isEqualTo("Tante");
|
||||
assertThat(result.firstName()).isNull();
|
||||
assertThat(result.lastName()).isEqualTo("Molly");
|
||||
}
|
||||
|
||||
@Test
|
||||
void split_dotTitle_afterDotNorm() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Dr.Fr.Zarncke");
|
||||
assertThat(result.title()).isEqualTo("Dr.");
|
||||
assertThat(result.firstName()).isEqualTo("Fr.");
|
||||
assertThat(result.lastName()).isEqualTo("Zarncke");
|
||||
}
|
||||
|
||||
@Test
|
||||
void split_dotTitle_noSpace_vonLastName() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Dr.von Gelden");
|
||||
assertThat(result.title()).isEqualTo("Dr.");
|
||||
assertThat(result.firstName()).isNull();
|
||||
assertThat(result.lastName()).isEqualTo("von Gelden");
|
||||
}
|
||||
|
||||
// --- regression: non-prefixes not stripped ---
|
||||
|
||||
@Test
|
||||
void split_walter_noTitleStrip() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Walter de Gruyter");
|
||||
assertThat(result.title()).isNull();
|
||||
assertThat(result.firstName()).isEqualTo("Walter");
|
||||
assertThat(result.lastName()).isEqualTo("de Gruyter");
|
||||
}
|
||||
|
||||
@Test
|
||||
void split_conrad_vonGeldern_noTitleStrip() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Conrad von Geldern");
|
||||
assertThat(result.title()).isNull();
|
||||
assertThat(result.firstName()).isEqualTo("Conrad");
|
||||
assertThat(result.lastName()).isEqualTo("von Geldern");
|
||||
}
|
||||
|
||||
// --- stripMaidenName — maiden name extraction ---
|
||||
|
||||
@Test
|
||||
|
||||
Reference in New Issue
Block a user