feat(parser): implement stripAnnotation for parenthesized content

Extract trailing (...) content as annotation. Handles birth years
(*1871), nicknames (Tuttu), uncertainty markers (?), and uncertain
names (Quast ?) where the name part is extracted back into the
cleaned result. Uses [^)]* regex to prevent ReDoS.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-08 12:58:02 +02:00
parent 9f90cc1a5f
commit e696e5056d
2 changed files with 73 additions and 3 deletions

View File

@@ -172,9 +172,26 @@ public class PersonNameParser {
return input;
}
/**
 * Matches one trailing parenthesized group, including any whitespace directly before and after
 * it, anchored at the end of the input. Group 1 is the text between the parentheses; it can
 * never contain a closing parenthesis because of the {@code [^)]*} character class.
 */
private static final Pattern PAREN_ANNOTATION = Pattern.compile("\\s*\\(([^)]*)\\)\\s*$");
/**
 * Matches annotation content that consists of exactly one whitespace-free token followed by
 * whitespace and a question mark, e.g. {@code "Quast ?"}. Group 1 is that token.
 */
private static final Pattern UNCERTAIN_NAME = Pattern.compile("^(\\S+)\\s+\\?\\s*$");
/**
 * Strips a trailing parenthesized annotation from {@code input} and returns both parts.
 *
 * <p>If the input ends with {@code (...)}, the trimmed text before the parentheses becomes the
 * cleaned value and the trimmed text inside them becomes the annotation. As a special case, when
 * the parenthesized text is a single whitespace-free token followed by {@code ?} (for example
 * {@code (Quast ?)}), that token is appended to the cleaned value and the annotation is reported
 * as {@code "?"}.
 *
 * @param input the raw text to inspect; may be {@code null}
 * @return the cleaned text together with the extracted annotation; the annotation is
 *     {@code null} and the text is returned unchanged when {@code input} is {@code null} or has
 *     no trailing parenthesized group
 */
public static AnnotationResult stripAnnotation(String input) {
    // Keep the old pass-through result for null instead of failing inside Pattern.matcher.
    if (input == null) {
        return new AnnotationResult(input, null);
    }

    Matcher m = PAREN_ANNOTATION.matcher(input);
    if (!m.find()) {
        return new AnnotationResult(input, null);
    }

    // Everything before the matched "(...)" suffix is the name part.
    String cleaned = input.substring(0, m.start()).trim();
    String rawAnnotation = m.group(1).trim();

    // "(Name ?)" marks an uncertain name: move the name back into the cleaned text and keep
    // only the uncertainty marker as the annotation.
    Matcher uncertainMatcher = UNCERTAIN_NAME.matcher(rawAnnotation);
    if (uncertainMatcher.matches()) {
        String nameFromAnnotation = uncertainMatcher.group(1);
        cleaned = (cleaned + " " + nameFromAnnotation).trim();
        return new AnnotationResult(cleaned, "?");
    }

    return new AnnotationResult(cleaned, rawAnnotation);
}
/** Strips title prefixes. Pass-through until #212. */