feat(parser): implement stripTitle for known prefixes
Two-pass title stripping, with a loop to handle stacked titles:
- Dot-prefixes (Dr., Prof.) are matched without a trailing space.
- Word-prefixes (Tante, Frau, Schwester, etc.) are matched at a word boundary.
- Stacked titles like "Prof. Dr. Muller" are handled correctly.
- A single token left after stripping the title goes to lastName (not firstName).

Add 5 "von" last names to KNOWN_LAST_NAMES for correct splitting of entries like "Freifrau von Massenbach".

15 new test cases + updated 3 existing tests for title behavior.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ public class PersonNameParser {
|
||||
// Known last names in this archive, longest first to avoid partial matches
|
||||
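// (e.g. "de Gruyter" must be checked before any single-word name)
// NOTE(review): the entries are not strictly ordered by length ("von der Heide"
// precedes the longer "von Massenbach") — confirm the matcher does not rely on that.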
|
||||
static final List<String> KNOWN_LAST_NAMES = List.of(
|
||||
"von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa",
|
||||
"de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram");
|
||||
|
||||
private static final Pattern GEB_PATTERN = Pattern.compile(",?\\s*geb\\.?\\s+(.+)$");
|
||||
@@ -149,8 +150,22 @@ public class PersonNameParser {
|
||||
|
||||
NameParts parts = splitByKnownLastNameOrFallback(cleaned);
|
||||
|
||||
String firstName = parts.firstName();
|
||||
String lastName = parts.lastName();
|
||||
|
||||
// When a title was stripped and no first name could be extracted, the
|
||||
// remaining text is the lastName. "Tante Molly" -> title=Tante, lastName=Molly.
|
||||
if (title.title() != null) {
|
||||
if ("?".equals(lastName) && !cleaned.contains(" ")) {
|
||||
lastName = firstName;
|
||||
firstName = null;
|
||||
} else if (firstName.equals(lastName)) {
|
||||
firstName = null;
|
||||
}
|
||||
}
|
||||
|
||||
return new SplitName(
|
||||
title.title(), parts.firstName(), parts.lastName(),
|
||||
title.title(), firstName, lastName,
|
||||
maiden.maidenName(), paren.annotation()
|
||||
);
|
||||
}
|
||||
@@ -196,9 +211,50 @@ public class PersonNameParser {
|
||||
return new AnnotationResult(cleaned, rawAnnotation);
|
||||
}
|
||||
|
||||
/** Title prefixes that end in a dot; stripTitle matches them without requiring a trailing space. */
|
||||
private static final List<String> DOT_PREFIXES = List.of("Dr.", "Prof.");
|
||||
|
||||
private static final List<String> WORD_PREFIXES = List.of(
|
||||
"Frau", "Herr", "Freifrau", "Freiherr",
|
||||
"Tante", "Onkel", "Schwester", "Bruder",
|
||||
"Cousine", "Cousin", "Freundin", "Freund",
|
||||
"Mutter", "Vater", "Pastor", "Architekt");
|
||||
|
||||
/** Strips known title/relationship prefixes, looping for stacked titles. */
|
||||
public static TitleResult stripTitle(String input) {
|
||||
return new TitleResult(input, null);
|
||||
String remaining = input;
|
||||
StringBuilder titleBuilder = new StringBuilder();
|
||||
boolean found = true;
|
||||
|
||||
while (found) {
|
||||
found = false;
|
||||
|
||||
for (String prefix : DOT_PREFIXES) {
|
||||
if (remaining.toLowerCase().startsWith(prefix.toLowerCase())) {
|
||||
titleBuilder.append(titleBuilder.isEmpty() ? "" : " ").append(prefix);
|
||||
remaining = remaining.substring(prefix.length()).trim();
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (found) continue;
|
||||
|
||||
for (String prefix : WORD_PREFIXES) {
|
||||
String lower = remaining.toLowerCase();
|
||||
if (lower.startsWith(prefix.toLowerCase() + " ") || lower.equals(prefix.toLowerCase())) {
|
||||
titleBuilder.append(titleBuilder.isEmpty() ? "" : " ").append(prefix);
|
||||
remaining = remaining.length() > prefix.length()
|
||||
? remaining.substring(prefix.length() + 1).trim()
|
||||
: "";
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (titleBuilder.isEmpty()) {
|
||||
return new TitleResult(input, null);
|
||||
}
|
||||
return new TitleResult(remaining, titleBuilder.toString());
|
||||
}
|
||||
|
||||
/** Splits a cleaned name into firstName/lastName using known last names or last-space fallback. */
|
||||
|
||||
Reference in New Issue
Block a user