feat(parser): implement stripAnnotation for parenthesized content

Extract trailing (...) content as annotation. Handles birth years
(*1871), nicknames (Tuttu), uncertainty markers (?), and uncertain
names (Quast ?) where the name part is extracted back into the
cleaned result. Uses [^)]* regex to prevent ReDoS.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-08 12:58:02 +02:00
parent 9f90cc1a5f
commit e696e5056d
2 changed files with 73 additions and 3 deletions

View File

@@ -172,9 +172,26 @@ public class PersonNameParser {
return input;
}
/**
 * Matches a parenthesized group at the end of the input: optional whitespace, "(",
 * any run of characters other than ")", ")", then optional trailing whitespace.
 * Group 1 captures the text between the parentheses.
 */
private static final Pattern PAREN_ANNOTATION = Pattern.compile("\\s*\\(([^)]*)\\)\\s*$");
/**
 * Matches text made of one whitespace-free token, then whitespace and a "?"
 * (optionally followed by more whitespace). Group 1 captures the token.
 */
private static final Pattern UNCERTAIN_NAME = Pattern.compile("^(\\S+)\\s+\\?\\s*$");
/**
 * Strips a trailing parenthesized annotation from {@code input} and returns the cleaned
 * text together with the extracted annotation.
 *
 * <p>Behavior by case:
 * <ul>
 *   <li>No trailing {@code (...)} group: {@code input} is returned unchanged and the
 *       annotation is {@code null}.</li>
 *   <li>Trailing group such as {@code "Clara de Gruyter(*1871)"}: the cleaned text is the
 *       trimmed part before the group ({@code "Clara de Gruyter"}) and the annotation is
 *       the trimmed text inside the parentheses ({@code "*1871"}).</li>
 *   <li>Group holding a single token followed by a question mark, such as
 *       {@code "Richard (Quast ? )"}: the token is appended to the cleaned text
 *       ({@code "Richard Quast"}) and the annotation is reduced to {@code "?"}.</li>
 * </ul>
 *
 * @param input the raw text to inspect; it is handed to the matcher as-is and is not
 *     null-checked here
 * @return the cleaned text and the annotation, or {@code input} with a {@code null}
 *     annotation when there is no trailing parenthesized group
 */
public static AnnotationResult stripAnnotation(String input) {
    Matcher m = PAREN_ANNOTATION.matcher(input);
    if (!m.find()) {
        // Nothing to strip: hand the input back untouched.
        return new AnnotationResult(input, null);
    }

    // Everything before the match is the name; the match itself starts at the
    // whitespace (if any) preceding the opening parenthesis.
    String cleaned = input.substring(0, m.start()).trim();
    String rawAnnotation = m.group(1).trim();

    // "(Quast ?)" means the name itself is uncertain: keep the name in the cleaned
    // text and record only the uncertainty marker as the annotation.
    Matcher uncertainMatcher = UNCERTAIN_NAME.matcher(rawAnnotation);
    if (uncertainMatcher.matches()) {
        String nameFromAnnotation = uncertainMatcher.group(1);
        // trim() covers the case where nothing precedes the parentheses.
        cleaned = (cleaned + " " + nameFromAnnotation).trim();
        return new AnnotationResult(cleaned, "?");
    }

    return new AnnotationResult(cleaned, rawAnnotation);
}
/** Strips title prefixes. Pass-through until #212. */

View File

@@ -281,12 +281,65 @@ class PersonNameParserTest {
}
@Test
void stripAnnotation_isPassthrough() {
void stripAnnotation_noParens_returnsNull() {
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Walter de Gruyter");
assertThat(result.cleaned()).isEqualTo("Walter de Gruyter");
assertThat(result.annotation()).isNull();
}
@Test
void stripAnnotation_birthYear_noSpace() {
    // The "(*1871)" group follows the name directly, with no separating space.
    String raw = "Clara de Gruyter(*1871)";
    PersonNameParser.AnnotationResult stripped = PersonNameParser.stripAnnotation(raw);
    assertThat(stripped.cleaned()).isEqualTo("Clara de Gruyter");
    assertThat(stripped.annotation()).isEqualTo("*1871");
}
@Test
void stripAnnotation_uncertainty_withSpace() {
    // A bare "(?)" separated from the name by a space is reported as the annotation "?".
    String raw = "Ernst Kurmany (?)";
    PersonNameParser.AnnotationResult stripped = PersonNameParser.stripAnnotation(raw);
    assertThat(stripped.cleaned()).isEqualTo("Ernst Kurmany");
    assertThat(stripped.annotation()).isEqualTo("?");
}
@Test
void stripAnnotation_nickname_noSpace() {
    // A parenthesized word right after an abbreviated name is extracted as the annotation.
    String raw = "Gertrud D.(Tuttu)";
    PersonNameParser.AnnotationResult stripped = PersonNameParser.stripAnnotation(raw);
    assertThat(stripped.cleaned()).isEqualTo("Gertrud D.");
    assertThat(stripped.annotation()).isEqualTo("Tuttu");
}
@Test
void stripAnnotation_uncertainName_extractsNameBack() {
    // "(Quast ? )": the token before the question mark is moved back into the cleaned
    // text and only "?" remains as the annotation.
    String raw = "Richard (Quast ? )";
    PersonNameParser.AnnotationResult stripped = PersonNameParser.stripAnnotation(raw);
    assertThat(stripped.cleaned()).isEqualTo("Richard Quast");
    assertThat(stripped.annotation()).isEqualTo("?");
}
@Test
void stripAnnotation_onlyParen_returnsPlaceholder() {
    // Input consisting solely of a parenthesized group: the cleaned text is empty and
    // the annotation carries the text from inside the parentheses.
    String raw = "(OnlyParen)";
    PersonNameParser.AnnotationResult stripped = PersonNameParser.stripAnnotation(raw);
    assertThat(stripped.cleaned()).isEmpty();
    assertThat(stripped.annotation()).isEqualTo("OnlyParen");
}
// --- split — annotation extraction end-to-end ---
@Test
void split_birthYearAnnotation_extracted() {
    // End-to-end: split() reports first name, last name and the annotation separately.
    String raw = "Clara de Gruyter(*1871)";
    PersonNameParser.SplitName name = PersonNameParser.split(raw);
    assertThat(name.firstName()).isEqualTo("Clara");
    assertThat(name.lastName()).isEqualTo("de Gruyter");
    assertThat(name.annotation()).isEqualTo("*1871");
}
@Test
void split_uncertainName_extractsLastName() {
    // End-to-end: the uncertain name inside the parentheses ends up as the last name,
    // with "?" kept as the annotation.
    String raw = "Richard (Quast ? )";
    PersonNameParser.SplitName name = PersonNameParser.split(raw);
    assertThat(name.firstName()).isEqualTo("Richard");
    assertThat(name.lastName()).isEqualTo("Quast");
    assertThat(name.annotation()).isEqualTo("?");
}
@Test
void stripTitle_isPassthrough() {
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Walter de Gruyter");