feat(parser): implement stripTitle for known prefixes
Some checks failed
CI / Unit & Component Tests (push) Failing after 3s
CI / Backend Unit Tests (push) Failing after 1s
CI / Unit & Component Tests (pull_request) Failing after 2s
CI / Backend Unit Tests (pull_request) Failing after 1s

Two-pass title stripping with loop for stacked titles:
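- Dot-prefixes (Dr., Prof.) are matched even when no space follows
  the dot (e.g. "Dr.von Gelden")
- Word-prefixes (Tante, Frau, Schwester, etc.) are matched only when
  followed by a space or when they are the entire remaining input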
- Stacked titles like "Prof. Dr. Muller" handled correctly
- Single token after title strip goes to lastName (not firstName)

Add 5 "von" last names to KNOWN_LAST_NAMES for correct splitting
of entries like "Freifrau von Massenbach".

15 new test cases + updated 3 existing tests for title behavior.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-08 13:15:18 +02:00
parent 6ee1ef73c3
commit 73640ef5b6
2 changed files with 166 additions and 7 deletions

View File

@@ -16,6 +16,7 @@ public class PersonNameParser {
// Known last names in this archive, longest first to avoid partial matches
// (e.g. "de Gruyter" must be checked before any single-word name).
// All multi-word names ("von ...", "de ...") are listed before the single-word names.
// NOTE(review): the list is not strictly ordered by length -- "von der Heide" (13 chars)
// precedes "von Massenbach" (14), and "von Staa" (8) precedes "de Gruyter" (10).
// Confirm against the matching code whether strict length order is actually required.
static final List<String> KNOWN_LAST_NAMES = List.of(
"von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa",
"de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram");
private static final Pattern GEB_PATTERN = Pattern.compile(",?\\s*geb\\.?\\s+(.+)$");
@@ -149,8 +150,22 @@ public class PersonNameParser {
NameParts parts = splitByKnownLastNameOrFallback(cleaned);
String firstName = parts.firstName();
String lastName = parts.lastName();
// When a title was stripped and no first name could be extracted, the
// remaining text is the lastName. "Tante Molly" -> title=Tante, lastName=Molly.
if (title.title() != null) {
if ("?".equals(lastName) && !cleaned.contains(" ")) {
lastName = firstName;
firstName = null;
} else if (firstName.equals(lastName)) {
firstName = null;
}
}
return new SplitName(
title.title(), parts.firstName(), parts.lastName(),
title.title(), firstName, lastName,
maiden.maidenName(), paren.annotation()
);
}
@@ -196,9 +211,50 @@ public class PersonNameParser {
return new AnnotationResult(cleaned, rawAnnotation);
}
/**
 * Title prefixes that end in a dot. stripTitle matches them case-insensitively at the
 * start of the input and does not require a following space, so "Dr.von Gelden" matches.
 */
private static final List<String> DOT_PREFIXES = List.of("Dr.", "Prof.");
/**
 * Title and relationship words. stripTitle matches them case-insensitively, and only
 * when they are followed by a space or make up the entire remaining input.
 */
private static final List<String> WORD_PREFIXES = List.of(
"Frau", "Herr", "Freifrau", "Freiherr",
"Tante", "Onkel", "Schwester", "Bruder",
"Cousine", "Cousin", "Freundin", "Freund",
"Mutter", "Vater", "Pastor", "Architekt");
/**
 * Strips known title/relationship prefixes from the start of a name, looping so that
 * stacked titles such as "Prof. Dr. Muller" are removed completely.
 *
 * <p>Each round first tries the dot-prefixes (no following space required), then the
 * word-prefixes (must be followed by a space or be the whole remaining text). Matching
 * is case-insensitive and locale-independent; the title is reported in the canonical
 * spelling from the prefix lists, with stacked titles joined by a single space.
 *
 * <p>Input that consists only of prefix words is consumed entirely, leaving an empty
 * cleaned string.
 *
 * @param input the raw name; may be {@code null}
 * @return the remaining text (trimmed) and the collected title; when no prefix matches
 *         (or {@code input} is {@code null}) the input is returned unchanged with a
 *         {@code null} title
 */
public static TitleResult stripTitle(String input) {
    if (input == null) {
        // The former pass-through implementation tolerated null; keep doing so.
        return new TitleResult(null, null);
    }
    String remaining = input;
    StringBuilder titleBuilder = new StringBuilder();
    String prefix;
    while ((prefix = matchLeadingPrefix(remaining)) != null) {
        if (!titleBuilder.isEmpty()) {
            titleBuilder.append(' ');
        }
        titleBuilder.append(prefix);
        // Dropping the prefix and trimming also removes the separating space, if any.
        remaining = remaining.substring(prefix.length()).trim();
    }
    if (titleBuilder.isEmpty()) {
        // Nothing stripped: hand back the original text untouched (not trimmed).
        return new TitleResult(input, null);
    }
    return new TitleResult(remaining, titleBuilder.toString());
}

/** Returns the canonical prefix that {@code text} starts with, or {@code null} if none matches. */
private static String matchLeadingPrefix(String text) {
    for (String prefix : DOT_PREFIXES) {
        // regionMatches(ignoreCase=true) avoids locale-sensitive toLowerCase() copies.
        if (text.regionMatches(true, 0, prefix, 0, prefix.length())) {
            return prefix;
        }
    }
    for (String prefix : WORD_PREFIXES) {
        int length = prefix.length();
        // A word prefix only counts when it ends the text or is followed by a space,
        // so "Frau" does not match inside "Frauke".
        boolean atBoundary = text.length() == length
                || (text.length() > length && text.charAt(length) == ' ');
        if (atBoundary && text.regionMatches(true, 0, prefix, 0, length)) {
            return prefix;
        }
    }
    return null;
}
/** Splits a cleaned name into firstName/lastName using known last names or last-space fallback. */

View File

@@ -190,14 +190,16 @@ class PersonNameParserTest {
/** Dot-compressed input: the leading "Dr." becomes the title and "Fr." stays as the first name. */
@Test
void split_dotCompressed_titleFirstNameLastName() {
    PersonNameParser.SplitName result = PersonNameParser.split("Dr.Fr.Zarncke");
    assertThat(result.title()).isEqualTo("Dr.");
    assertThat(result.firstName()).isEqualTo("Fr.");
    assertThat(result.lastName()).isEqualTo("Zarncke");
}
/** Dot-compressed title directly followed by a last name: no first name is reported. */
@Test
void split_dotCompressed_titleAndLastName() {
    PersonNameParser.SplitName result = PersonNameParser.split("Dr.Zarnke");
    assertThat(result.title()).isEqualTo("Dr.");
    assertThat(result.firstName()).isNull();
    assertThat(result.lastName()).isEqualTo("Zarnke");
}
@@ -210,7 +212,8 @@ class PersonNameParserTest {
/** Already-spaced dot name: "Dr." is the title and "Fr." the first name, without doubled spaces. */
@Test
void split_alreadySpacedDotName_noDoubleSpacing() {
    PersonNameParser.SplitName result = PersonNameParser.split("Dr. Fr. Zarncke");
    assertThat(result.title()).isEqualTo("Dr.");
    assertThat(result.firstName()).isEqualTo("Fr.");
    assertThat(result.lastName()).isEqualTo("Zarncke");
}
@@ -353,12 +356,112 @@ class PersonNameParserTest {
}
@Test
void stripTitle_isPassthrough() {
void stripTitle_noPrefix_returnsNull() {
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Walter de Gruyter");
assertThat(result.cleaned()).isEqualTo("Walter de Gruyter");
assertThat(result.title()).isNull();
}
// Relationship word "Tante" is split off; only the name remains.
@Test
void stripTitle_tante() {
    var stripped = PersonNameParser.stripTitle("Tante Molly");

    assertThat(stripped.title()).isEqualTo("Tante");
    assertThat(stripped.cleaned()).isEqualTo("Molly");
}
// Relationship word "Schwester" is split off; only the name remains.
@Test
void stripTitle_schwester() {
    var stripped = PersonNameParser.stripTitle("Schwester Hanni");

    assertThat(stripped.title()).isEqualTo("Schwester");
    assertThat(stripped.cleaned()).isEqualTo("Hanni");
}
// Form of address "Frau" is split off; only the name remains.
@Test
void stripTitle_frau() {
    var stripped = PersonNameParser.stripTitle("Frau Bakker");

    assertThat(stripped.title()).isEqualTo("Frau");
    assertThat(stripped.cleaned()).isEqualTo("Bakker");
}
// "Cousine" in front of a first and last name: both name parts stay in the cleaned text.
@Test
void stripTitle_cousine_withFullName() {
    var stripped = PersonNameParser.stripTitle("Cousine Emmy Haniel");

    assertThat(stripped.title()).isEqualTo("Cousine");
    assertThat(stripped.cleaned()).isEqualTo("Emmy Haniel");
}
// "Freifrau" is split off while the "von ..." last name is kept intact.
@Test
void stripTitle_freifrau() {
    var stripped = PersonNameParser.stripTitle("Freifrau von Massenbach");

    assertThat(stripped.title()).isEqualTo("Freifrau");
    assertThat(stripped.cleaned()).isEqualTo("von Massenbach");
}
// Dot-prefix followed by a space.
@Test
void stripTitle_dotPrefix_withSpace() {
    var stripped = PersonNameParser.stripTitle("Dr. Sattelmacher");

    assertThat(stripped.title()).isEqualTo("Dr.");
    assertThat(stripped.cleaned()).isEqualTo("Sattelmacher");
}
// Dot-prefix glued directly to the following word (no space after the dot).
@Test
void stripTitle_dotPrefix_noSpace() {
    var stripped = PersonNameParser.stripTitle("Dr.von Gelden");

    assertThat(stripped.title()).isEqualTo("Dr.");
    assertThat(stripped.cleaned()).isEqualTo("von Gelden");
}
// Two stacked dot-prefixes are both removed and joined with a single space in the title.
@Test
void stripTitle_stacked_profDr() {
    var stripped = PersonNameParser.stripTitle("Prof. Dr. Muller");

    assertThat(stripped.title()).isEqualTo("Prof. Dr.");
    assertThat(stripped.cleaned()).isEqualTo("Muller");
}
// --- split — title extraction end-to-end ---
// A single token after the stripped title is reported as last name, not first name.
@Test
void split_tante_setsTitle_firstNameNull() {
    var parsed = PersonNameParser.split("Tante Molly");

    assertThat(parsed.title()).isEqualTo("Tante");
    assertThat(parsed.lastName()).isEqualTo("Molly");
    assertThat(parsed.firstName()).isNull();
}
// Dot-compressed input: title, abbreviated first name and last name are all separated.
@Test
void split_dotTitle_afterDotNorm() {
    var parsed = PersonNameParser.split("Dr.Fr.Zarncke");

    assertThat(parsed.title()).isEqualTo("Dr.");
    assertThat(parsed.lastName()).isEqualTo("Zarncke");
    assertThat(parsed.firstName()).isEqualTo("Fr.");
}
// Title glued to a known "von" last name: the whole "von Gelden" is the last name.
@Test
void split_dotTitle_noSpace_vonLastName() {
    var parsed = PersonNameParser.split("Dr.von Gelden");

    assertThat(parsed.title()).isEqualTo("Dr.");
    assertThat(parsed.lastName()).isEqualTo("von Gelden");
    assertThat(parsed.firstName()).isNull();
}
// --- regression: non-prefixes not stripped ---
// Regression guard: an ordinary first name must not be treated as a title.
@Test
void split_walter_noTitleStrip() {
    var parsed = PersonNameParser.split("Walter de Gruyter");

    assertThat(parsed.title()).isNull();
    assertThat(parsed.lastName()).isEqualTo("de Gruyter");
    assertThat(parsed.firstName()).isEqualTo("Walter");
}
// Regression guard: first name plus known "von" last name, with no title involved.
@Test
void split_conrad_vonGeldern_noTitleStrip() {
    var parsed = PersonNameParser.split("Conrad von Geldern");

    assertThat(parsed.title()).isNull();
    assertThat(parsed.lastName()).isEqualTo("von Geldern");
    assertThat(parsed.firstName()).isEqualTo("Conrad");
}
// --- stripMaidenName — maiden name extraction ---
@Test