feat(parser): widen GEB_PATTERN and extract maiden name in stripMaidenName
Widen the pattern from `\s+geb\.\s+\S+` to `,?\s*geb\.?\s+(.+)$` so that it handles an optional comma, an optional dot, and multi-word maiden names. stripMaidenName() now captures the maiden name instead of discarding it. Handles all 5 input variants from the ODS data. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -18,7 +18,7 @@ public class PersonNameParser {
|
||||
static final List<String> KNOWN_LAST_NAMES = List.of(
|
||||
"de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram");
|
||||
|
||||
private static final Pattern GEB_PATTERN = Pattern.compile("\\s+geb\\.\\s+\\S+");
|
||||
private static final Pattern GEB_PATTERN = Pattern.compile(",?\\s*geb\\.?\\s+(.+)$");
|
||||
private static final Pattern PAREN_LAST_NAME = Pattern.compile("\\(([^)]+)\\)\\s*$");
|
||||
private static final Pattern MULTI_SEPARATOR = Pattern.compile("\\s+(?:und|u)\\s+");
|
||||
private static final Pattern SLASH_SEPARATOR = Pattern.compile("//");
|
||||
@@ -153,10 +153,15 @@ public class PersonNameParser {
|
||||
);
|
||||
}
|
||||
|
||||
/** Strips "geb. Xxx" maiden-name annotations. Pass-through until #209. */
|
||||
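/** Strips a trailing "geb. Xxx" annotation (everything from the "geb" marker to the end of the input) and returns the maiden name alongside the cleaned text. */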
|
||||
public static MaidenNameResult stripMaidenName(String input) {
|
||||
String cleaned = GEB_PATTERN.matcher(input).replaceAll("").trim();
|
||||
return new MaidenNameResult(cleaned, null);
|
||||
Matcher m = GEB_PATTERN.matcher(input);
|
||||
if (m.find()) {
|
||||
String cleaned = input.substring(0, m.start()).trim();
|
||||
String maidenName = m.group(1).trim();
|
||||
return new MaidenNameResult(cleaned, maidenName);
|
||||
}
|
||||
return new MaidenNameResult(input, null);
|
||||
}
|
||||
|
||||
/** Normalizes dot-compressed names: "Dr.Fr.Zarncke" → "Dr. Fr. Zarncke" */
|
||||
|
||||
Reference in New Issue
Block a user