feat(parser): implement stripAnnotation for parenthesized content
Extract trailing (...) content as annotation. Handles birth years (*1871), nicknames (Tuttu), uncertainty markers (?), and uncertain names (Quast ?) where the name part is extracted back into the cleaned result. Uses [^)]* regex to prevent ReDoS. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -172,9 +172,26 @@ public class PersonNameParser {
|
||||
return input;
|
||||
}
|
||||
|
||||
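/** Matches one trailing parenthesized group (optionally surrounded by whitespace) and captures its content; {@code [^)]*} never matches past a closing parenthesis. */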
|
||||
private static final Pattern PAREN_ANNOTATION = Pattern.compile("\\s*\\(([^)]*)\\)\\s*$");
|
||||
private static final Pattern UNCERTAIN_NAME = Pattern.compile("^(\\S+)\\s+\\?\\s*$");
|
||||
|
||||
/** Strips trailing parenthesized annotations and extracts the content. */
|
||||
public static AnnotationResult stripAnnotation(String input) {
|
||||
return new AnnotationResult(input, null);
|
||||
Matcher m = PAREN_ANNOTATION.matcher(input);
|
||||
if (!m.find()) {
|
||||
return new AnnotationResult(input, null);
|
||||
}
|
||||
String cleaned = input.substring(0, m.start()).trim();
|
||||
String rawAnnotation = m.group(1).trim();
|
||||
|
||||
Matcher uncertainMatcher = UNCERTAIN_NAME.matcher(rawAnnotation);
|
||||
if (uncertainMatcher.matches()) {
|
||||
String nameFromAnnotation = uncertainMatcher.group(1);
|
||||
cleaned = (cleaned + " " + nameFromAnnotation).trim();
|
||||
return new AnnotationResult(cleaned, "?");
|
||||
}
|
||||
|
||||
return new AnnotationResult(cleaned, rawAnnotation);
|
||||
}
|
||||
|
||||
/** Strips title prefixes. Pass-through until #212. */
|
||||
|
||||
@@ -281,12 +281,65 @@ class PersonNameParserTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripAnnotation_isPassthrough() {
|
||||
void stripAnnotation_noParens_returnsNull() {
|
||||
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Walter de Gruyter");
|
||||
assertThat(result.cleaned()).isEqualTo("Walter de Gruyter");
|
||||
assertThat(result.annotation()).isNull();
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripAnnotation_birthYear_noSpace() {
|
||||
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Clara de Gruyter(*1871)");
|
||||
assertThat(result.cleaned()).isEqualTo("Clara de Gruyter");
|
||||
assertThat(result.annotation()).isEqualTo("*1871");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripAnnotation_uncertainty_withSpace() {
|
||||
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Ernst Kurmany (?)");
|
||||
assertThat(result.cleaned()).isEqualTo("Ernst Kurmany");
|
||||
assertThat(result.annotation()).isEqualTo("?");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripAnnotation_nickname_noSpace() {
|
||||
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Gertrud D.(Tuttu)");
|
||||
assertThat(result.cleaned()).isEqualTo("Gertrud D.");
|
||||
assertThat(result.annotation()).isEqualTo("Tuttu");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripAnnotation_uncertainName_extractsNameBack() {
|
||||
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Richard (Quast ? )");
|
||||
assertThat(result.cleaned()).isEqualTo("Richard Quast");
|
||||
assertThat(result.annotation()).isEqualTo("?");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripAnnotation_onlyParen_returnsPlaceholder() {
|
||||
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("(OnlyParen)");
|
||||
assertThat(result.cleaned()).isEmpty();
|
||||
assertThat(result.annotation()).isEqualTo("OnlyParen");
|
||||
}
|
||||
|
||||
// --- split — annotation extraction end-to-end ---
|
||||
|
||||
@Test
|
||||
void split_birthYearAnnotation_extracted() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Clara de Gruyter(*1871)");
|
||||
assertThat(result.firstName()).isEqualTo("Clara");
|
||||
assertThat(result.lastName()).isEqualTo("de Gruyter");
|
||||
assertThat(result.annotation()).isEqualTo("*1871");
|
||||
}
|
||||
|
||||
@Test
|
||||
void split_uncertainName_extractsLastName() {
|
||||
PersonNameParser.SplitName result = PersonNameParser.split("Richard (Quast ? )");
|
||||
assertThat(result.firstName()).isEqualTo("Richard");
|
||||
assertThat(result.lastName()).isEqualTo("Quast");
|
||||
assertThat(result.annotation()).isEqualTo("?");
|
||||
}
|
||||
|
||||
@Test
|
||||
void stripTitle_isPassthrough() {
|
||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Walter de Gruyter");
|
||||
|
||||
Reference in New Issue
Block a user