feat(parser): implement stripAnnotation for parenthesized content
Extract trailing (...) content as an annotation.

Handles:
- birth years, e.g. (*1871)
- nicknames, e.g. (Tuttu)
- uncertainty markers, e.g. (?)
- uncertain names, e.g. (Quast ?), where the name part is extracted back into the cleaned result and the annotation becomes "?"

Uses a [^)]* character class in the regex to prevent ReDoS.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -172,9 +172,26 @@ public class PersonNameParser {
|
|||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Strips parenthesized annotations. Pass-through until #210. */
|
private static final Pattern PAREN_ANNOTATION = Pattern.compile("\\s*\\(([^)]*)\\)\\s*$");
|
||||||
|
private static final Pattern UNCERTAIN_NAME = Pattern.compile("^(\\S+)\\s+\\?\\s*$");
|
||||||
|
|
||||||
|
/** Strips trailing parenthesized annotations and extracts the content. */
|
||||||
public static AnnotationResult stripAnnotation(String input) {
|
public static AnnotationResult stripAnnotation(String input) {
|
||||||
return new AnnotationResult(input, null);
|
Matcher m = PAREN_ANNOTATION.matcher(input);
|
||||||
|
if (!m.find()) {
|
||||||
|
return new AnnotationResult(input, null);
|
||||||
|
}
|
||||||
|
String cleaned = input.substring(0, m.start()).trim();
|
||||||
|
String rawAnnotation = m.group(1).trim();
|
||||||
|
|
||||||
|
Matcher uncertainMatcher = UNCERTAIN_NAME.matcher(rawAnnotation);
|
||||||
|
if (uncertainMatcher.matches()) {
|
||||||
|
String nameFromAnnotation = uncertainMatcher.group(1);
|
||||||
|
cleaned = (cleaned + " " + nameFromAnnotation).trim();
|
||||||
|
return new AnnotationResult(cleaned, "?");
|
||||||
|
}
|
||||||
|
|
||||||
|
return new AnnotationResult(cleaned, rawAnnotation);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Strips title prefixes. Pass-through until #212. */
|
/** Strips title prefixes. Pass-through until #212. */
|
||||||
|
|||||||
@@ -281,12 +281,65 @@ class PersonNameParserTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void stripAnnotation_isPassthrough() {
|
void stripAnnotation_noParens_returnsNull() {
|
||||||
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Walter de Gruyter");
|
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Walter de Gruyter");
|
||||||
assertThat(result.cleaned()).isEqualTo("Walter de Gruyter");
|
assertThat(result.cleaned()).isEqualTo("Walter de Gruyter");
|
||||||
assertThat(result.annotation()).isNull();
|
assertThat(result.annotation()).isNull();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void stripAnnotation_birthYear_noSpace() {
|
||||||
|
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Clara de Gruyter(*1871)");
|
||||||
|
assertThat(result.cleaned()).isEqualTo("Clara de Gruyter");
|
||||||
|
assertThat(result.annotation()).isEqualTo("*1871");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void stripAnnotation_uncertainty_withSpace() {
|
||||||
|
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Ernst Kurmany (?)");
|
||||||
|
assertThat(result.cleaned()).isEqualTo("Ernst Kurmany");
|
||||||
|
assertThat(result.annotation()).isEqualTo("?");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void stripAnnotation_nickname_noSpace() {
|
||||||
|
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Gertrud D.(Tuttu)");
|
||||||
|
assertThat(result.cleaned()).isEqualTo("Gertrud D.");
|
||||||
|
assertThat(result.annotation()).isEqualTo("Tuttu");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void stripAnnotation_uncertainName_extractsNameBack() {
|
||||||
|
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("Richard (Quast ? )");
|
||||||
|
assertThat(result.cleaned()).isEqualTo("Richard Quast");
|
||||||
|
assertThat(result.annotation()).isEqualTo("?");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void stripAnnotation_onlyParen_returnsPlaceholder() {
|
||||||
|
PersonNameParser.AnnotationResult result = PersonNameParser.stripAnnotation("(OnlyParen)");
|
||||||
|
assertThat(result.cleaned()).isEmpty();
|
||||||
|
assertThat(result.annotation()).isEqualTo("OnlyParen");
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- split — annotation extraction end-to-end ---
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void split_birthYearAnnotation_extracted() {
|
||||||
|
PersonNameParser.SplitName result = PersonNameParser.split("Clara de Gruyter(*1871)");
|
||||||
|
assertThat(result.firstName()).isEqualTo("Clara");
|
||||||
|
assertThat(result.lastName()).isEqualTo("de Gruyter");
|
||||||
|
assertThat(result.annotation()).isEqualTo("*1871");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void split_uncertainName_extractsLastName() {
|
||||||
|
PersonNameParser.SplitName result = PersonNameParser.split("Richard (Quast ? )");
|
||||||
|
assertThat(result.firstName()).isEqualTo("Richard");
|
||||||
|
assertThat(result.lastName()).isEqualTo("Quast");
|
||||||
|
assertThat(result.annotation()).isEqualTo("?");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void stripTitle_isPassthrough() {
|
void stripTitle_isPassthrough() {
|
||||||
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Walter de Gruyter");
|
PersonNameParser.TitleResult result = PersonNameParser.stripTitle("Walter de Gruyter");
|
||||||
|
|||||||
Reference in New Issue
Block a user