feat(service): add PersonTypeClassifier with keyword heuristics
Static classify() method uses position-aware keyword matching: - SKIP: Briefumschlag, Kondolenzbriefe, Hochzeitsgedicht (start) - INSTITUTION: Firma, Architekt (start), GmbH, Co (end) - GROUP: Familie, Comité, Comite, Geschwister, Gesellschafter, Garde, Mitarbeiter (start), Eltern, Kinder, Schwiegereltern (word boundary) - PERSON: default for all other inputs Case-insensitive. 25 parameterized test cases. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
package org.raddatz.familienarchiv.service;
|
||||
|
||||
import java.util.List;
import java.util.Locale;

import org.raddatz.familienarchiv.model.PersonType;
|
||||
|
||||
public class PersonTypeClassifier {
|
||||
|
||||
private static final List<String> SKIP_KEYWORDS = List.of(
|
||||
"Briefumschlag", "Kondolenzbriefe", "Hochzeitsgedicht");
|
||||
|
||||
private static final List<String> INSTITUTION_START = List.of(
|
||||
"Firma", "Architekt");
|
||||
|
||||
private static final List<String> INSTITUTION_END = List.of(
|
||||
"GmbH");
|
||||
|
||||
private static final List<String> GROUP_START = List.of(
|
||||
"Familie", "Comité", "Comite", "Geschwister", "Gesellschafter",
|
||||
"Garde", "Mitarbeiter");
|
||||
|
||||
private static final List<String> GROUP_CONTAINS = List.of(
|
||||
"Eltern", "Kinder", "Schwiegereltern");
|
||||
|
||||
public static PersonType classify(String rawName) {
|
||||
if (rawName == null || rawName.isBlank()) return PersonType.PERSON;
|
||||
|
||||
String trimmed = rawName.trim();
|
||||
String lower = trimmed.toLowerCase();
|
||||
|
||||
for (String keyword : SKIP_KEYWORDS) {
|
||||
if (lower.startsWith(keyword.toLowerCase())) return PersonType.SKIP;
|
||||
}
|
||||
|
||||
for (String keyword : INSTITUTION_START) {
|
||||
if (lower.startsWith(keyword.toLowerCase())) return PersonType.INSTITUTION;
|
||||
}
|
||||
for (String keyword : INSTITUTION_END) {
|
||||
if (lower.endsWith(keyword.toLowerCase())) return PersonType.INSTITUTION;
|
||||
}
|
||||
if (lower.endsWith(" co") || lower.endsWith(" co.")) return PersonType.INSTITUTION;
|
||||
|
||||
for (String keyword : GROUP_START) {
|
||||
if (lower.startsWith(keyword.toLowerCase())) return PersonType.GROUP;
|
||||
}
|
||||
for (String keyword : GROUP_CONTAINS) {
|
||||
if (containsWord(lower, keyword.toLowerCase())) return PersonType.GROUP;
|
||||
}
|
||||
|
||||
return PersonType.PERSON;
|
||||
}
|
||||
|
||||
private static boolean containsWord(String text, String word) {
|
||||
int idx = text.indexOf(word);
|
||||
if (idx < 0) return false;
|
||||
boolean startOk = idx == 0 || !Character.isLetter(text.charAt(idx - 1));
|
||||
int end = idx + word.length();
|
||||
boolean endOk = end >= text.length() || !Character.isLetter(text.charAt(end));
|
||||
return startOk && endOk;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,89 @@
|
||||
package org.raddatz.familienarchiv.service;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.CsvSource;
|
||||
import org.raddatz.familienarchiv.model.PersonType;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
class PersonTypeClassifierTest {
|
||||
|
||||
// --- SKIP ---
|
||||
|
||||
@ParameterizedTest
|
||||
@CsvSource({
|
||||
"'Briefumschlag aus Java', SKIP",
|
||||
"'Kondolenzbriefe zum Tod von Walter de Gruyter', SKIP",
|
||||
"'Hochzeitsgedicht fur Paul u Luise de Gruyter', SKIP"
|
||||
})
|
||||
void classify_skipEntries(String input, PersonType expected) {
|
||||
assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
|
||||
}
|
||||
|
||||
// --- INSTITUTION ---
|
||||
|
||||
@ParameterizedTest
|
||||
@CsvSource({
|
||||
"'Arthur Collignon GmbH', INSTITUTION",
|
||||
"'Firma Auschrath', INSTITUTION",
|
||||
"'Westermann u Co', INSTITUTION",
|
||||
"'Architekt Korschelt u Renker', INSTITUTION"
|
||||
})
|
||||
void classify_institutionEntries(String input, PersonType expected) {
|
||||
assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
|
||||
}
|
||||
|
||||
// --- GROUP ---
|
||||
|
||||
@ParameterizedTest
|
||||
@CsvSource({
|
||||
"'Comite der Abschiedsfeier', GROUP",
|
||||
"'Comité zur Errichtung eines Heine-Denkmals', GROUP",
|
||||
"'Garde du Corps', GROUP",
|
||||
"'Geschwister de Gruyter', GROUP",
|
||||
"'Gesellschafter des Verlages', GROUP",
|
||||
"'Ella de Gruyters Eltern', GROUP",
|
||||
"'Eugenie de Gruyters Kinder', GROUP",
|
||||
"'Hilde de Gruyters Schwiegereltern', GROUP",
|
||||
"'Eltern Muller', GROUP",
|
||||
"'Familie Cram', GROUP",
|
||||
"'Familie Hasenvlever', GROUP",
|
||||
"'Mitarbeiter Verlag', GROUP",
|
||||
"'Mitarbeiter Druckerei TrebbinClara Cram', GROUP",
|
||||
"'Mitarbeiter Kunstverlag Mu', GROUP"
|
||||
})
|
||||
void classify_groupEntries(String input, PersonType expected) {
|
||||
assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
|
||||
}
|
||||
|
||||
// --- PERSON (default) ---
|
||||
|
||||
@ParameterizedTest
|
||||
@CsvSource({
|
||||
"'Walter de Gruyter', PERSON",
|
||||
"'Clara Cram', PERSON",
|
||||
"'Eugenie de Gruyter geb. Müller', PERSON",
|
||||
"'Dr. Firma Mueller', PERSON"
|
||||
})
|
||||
void classify_personEntries(String input, PersonType expected) {
|
||||
assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
|
||||
}
|
||||
|
||||
// --- Edge cases ---
|
||||
|
||||
@Test
|
||||
void classify_null_returnsPerson() {
|
||||
assertThat(PersonTypeClassifier.classify(null)).isEqualTo(PersonType.PERSON);
|
||||
}
|
||||
|
||||
@Test
|
||||
void classify_blank_returnsPerson() {
|
||||
assertThat(PersonTypeClassifier.classify(" ")).isEqualTo(PersonType.PERSON);
|
||||
}
|
||||
|
||||
@Test
|
||||
void classify_caseInsensitive() {
|
||||
assertThat(PersonTypeClassifier.classify("firma auschrath")).isEqualTo(PersonType.INSTITUTION);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user