feat(parser): implement stripAnnotation for parenthesized content
Extract trailing (...) content as the annotation. Handles birth years (*1871), nicknames (Tuttu), uncertainty markers (?), and uncertain names such as (Quast ?), where the name part is appended back onto the cleaned result and the annotation becomes "?". Uses a [^)]* character class rather than a wildcard to limit regex backtracking (ReDoS).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -172,9 +172,26 @@ public class PersonNameParser {
|
||||
return input;
|
||||
}
|
||||
|
||||
/** Strips parenthesized annotations. Pass-through until #210. */
|
||||
private static final Pattern PAREN_ANNOTATION = Pattern.compile("\\s*\\(([^)]*)\\)\\s*$");
|
||||
private static final Pattern UNCERTAIN_NAME = Pattern.compile("^(\\S+)\\s+\\?\\s*$");
|
||||
|
||||
/** Strips trailing parenthesized annotations and extracts the content. */
|
||||
public static AnnotationResult stripAnnotation(String input) {
|
||||
return new AnnotationResult(input, null);
|
||||
Matcher m = PAREN_ANNOTATION.matcher(input);
|
||||
if (!m.find()) {
|
||||
return new AnnotationResult(input, null);
|
||||
}
|
||||
String cleaned = input.substring(0, m.start()).trim();
|
||||
String rawAnnotation = m.group(1).trim();
|
||||
|
||||
Matcher uncertainMatcher = UNCERTAIN_NAME.matcher(rawAnnotation);
|
||||
if (uncertainMatcher.matches()) {
|
||||
String nameFromAnnotation = uncertainMatcher.group(1);
|
||||
cleaned = (cleaned + " " + nameFromAnnotation).trim();
|
||||
return new AnnotationResult(cleaned, "?");
|
||||
}
|
||||
|
||||
return new AnnotationResult(cleaned, rawAnnotation);
|
||||
}
|
||||
|
||||
/** Strips title prefixes. Pass-through until #212. */
|
||||
|
||||
Reference in New Issue
Block a user