feat(service): add PersonTypeClassifier with keyword heuristics
Static classify() method uses position-aware keyword matching:
- SKIP: Briefumschlag, Kondolenzbriefe, Hochzeitsgedicht (start)
- INSTITUTION: Firma, Architekt (start); GmbH, Co (end)
- GROUP: Familie, Comité, Comite, Geschwister, Gesellschafter, Garde, Mitarbeiter (start); Eltern, Kinder, Schwiegereltern (word boundary)
- PERSON: default for all other inputs

Case-insensitive. 25 parameterized test cases.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,60 @@
|
|||||||
|
package org.raddatz.familienarchiv.service;
|
||||||
|
|
||||||
|
import java.util.List;
import java.util.Locale;

import org.raddatz.familienarchiv.model.PersonType;
|
||||||
|
|
||||||
|
public class PersonTypeClassifier {
|
||||||
|
|
||||||
|
private static final List<String> SKIP_KEYWORDS = List.of(
|
||||||
|
"Briefumschlag", "Kondolenzbriefe", "Hochzeitsgedicht");
|
||||||
|
|
||||||
|
private static final List<String> INSTITUTION_START = List.of(
|
||||||
|
"Firma", "Architekt");
|
||||||
|
|
||||||
|
private static final List<String> INSTITUTION_END = List.of(
|
||||||
|
"GmbH");
|
||||||
|
|
||||||
|
private static final List<String> GROUP_START = List.of(
|
||||||
|
"Familie", "Comité", "Comite", "Geschwister", "Gesellschafter",
|
||||||
|
"Garde", "Mitarbeiter");
|
||||||
|
|
||||||
|
private static final List<String> GROUP_CONTAINS = List.of(
|
||||||
|
"Eltern", "Kinder", "Schwiegereltern");
|
||||||
|
|
||||||
|
public static PersonType classify(String rawName) {
|
||||||
|
if (rawName == null || rawName.isBlank()) return PersonType.PERSON;
|
||||||
|
|
||||||
|
String trimmed = rawName.trim();
|
||||||
|
String lower = trimmed.toLowerCase();
|
||||||
|
|
||||||
|
for (String keyword : SKIP_KEYWORDS) {
|
||||||
|
if (lower.startsWith(keyword.toLowerCase())) return PersonType.SKIP;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String keyword : INSTITUTION_START) {
|
||||||
|
if (lower.startsWith(keyword.toLowerCase())) return PersonType.INSTITUTION;
|
||||||
|
}
|
||||||
|
for (String keyword : INSTITUTION_END) {
|
||||||
|
if (lower.endsWith(keyword.toLowerCase())) return PersonType.INSTITUTION;
|
||||||
|
}
|
||||||
|
if (lower.endsWith(" co") || lower.endsWith(" co.")) return PersonType.INSTITUTION;
|
||||||
|
|
||||||
|
for (String keyword : GROUP_START) {
|
||||||
|
if (lower.startsWith(keyword.toLowerCase())) return PersonType.GROUP;
|
||||||
|
}
|
||||||
|
for (String keyword : GROUP_CONTAINS) {
|
||||||
|
if (containsWord(lower, keyword.toLowerCase())) return PersonType.GROUP;
|
||||||
|
}
|
||||||
|
|
||||||
|
return PersonType.PERSON;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean containsWord(String text, String word) {
|
||||||
|
int idx = text.indexOf(word);
|
||||||
|
if (idx < 0) return false;
|
||||||
|
boolean startOk = idx == 0 || !Character.isLetter(text.charAt(idx - 1));
|
||||||
|
int end = idx + word.length();
|
||||||
|
boolean endOk = end >= text.length() || !Character.isLetter(text.charAt(end));
|
||||||
|
return startOk && endOk;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
package org.raddatz.familienarchiv.service;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.params.ParameterizedTest;
|
||||||
|
import org.junit.jupiter.params.provider.CsvSource;
|
||||||
|
import org.raddatz.familienarchiv.model.PersonType;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
class PersonTypeClassifierTest {
|
||||||
|
|
||||||
|
// --- SKIP ---
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@CsvSource({
|
||||||
|
"'Briefumschlag aus Java', SKIP",
|
||||||
|
"'Kondolenzbriefe zum Tod von Walter de Gruyter', SKIP",
|
||||||
|
"'Hochzeitsgedicht fur Paul u Luise de Gruyter', SKIP"
|
||||||
|
})
|
||||||
|
void classify_skipEntries(String input, PersonType expected) {
|
||||||
|
assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- INSTITUTION ---
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@CsvSource({
|
||||||
|
"'Arthur Collignon GmbH', INSTITUTION",
|
||||||
|
"'Firma Auschrath', INSTITUTION",
|
||||||
|
"'Westermann u Co', INSTITUTION",
|
||||||
|
"'Architekt Korschelt u Renker', INSTITUTION"
|
||||||
|
})
|
||||||
|
void classify_institutionEntries(String input, PersonType expected) {
|
||||||
|
assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- GROUP ---
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@CsvSource({
|
||||||
|
"'Comite der Abschiedsfeier', GROUP",
|
||||||
|
"'Comité zur Errichtung eines Heine-Denkmals', GROUP",
|
||||||
|
"'Garde du Corps', GROUP",
|
||||||
|
"'Geschwister de Gruyter', GROUP",
|
||||||
|
"'Gesellschafter des Verlages', GROUP",
|
||||||
|
"'Ella de Gruyters Eltern', GROUP",
|
||||||
|
"'Eugenie de Gruyters Kinder', GROUP",
|
||||||
|
"'Hilde de Gruyters Schwiegereltern', GROUP",
|
||||||
|
"'Eltern Muller', GROUP",
|
||||||
|
"'Familie Cram', GROUP",
|
||||||
|
"'Familie Hasenvlever', GROUP",
|
||||||
|
"'Mitarbeiter Verlag', GROUP",
|
||||||
|
"'Mitarbeiter Druckerei TrebbinClara Cram', GROUP",
|
||||||
|
"'Mitarbeiter Kunstverlag Mu', GROUP"
|
||||||
|
})
|
||||||
|
void classify_groupEntries(String input, PersonType expected) {
|
||||||
|
assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- PERSON (default) ---
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@CsvSource({
|
||||||
|
"'Walter de Gruyter', PERSON",
|
||||||
|
"'Clara Cram', PERSON",
|
||||||
|
"'Eugenie de Gruyter geb. Müller', PERSON",
|
||||||
|
"'Dr. Firma Mueller', PERSON"
|
||||||
|
})
|
||||||
|
void classify_personEntries(String input, PersonType expected) {
|
||||||
|
assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Edge cases ---
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void classify_null_returnsPerson() {
|
||||||
|
assertThat(PersonTypeClassifier.classify(null)).isEqualTo(PersonType.PERSON);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void classify_blank_returnsPerson() {
|
||||||
|
assertThat(PersonTypeClassifier.classify(" ")).isEqualTo(PersonType.PERSON);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void classify_caseInsensitive() {
|
||||||
|
assertThat(PersonTypeClassifier.classify("firma auschrath")).isEqualTo(PersonType.INSTITUTION);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user