feat(service): add PersonTypeClassifier with keyword heuristics

Static classify() method uses position-aware keyword matching:
- SKIP: Briefumschlag, Kondolenzbriefe, Hochzeitsgedicht (start)
- INSTITUTION: Firma, Architekt (start), GmbH, Co (end)
- GROUP: Familie, Comité, Comite, Geschwister, Gesellschafter,
  Garde, Mitarbeiter (start), Eltern, Kinder,
  Schwiegereltern (word boundary)
- PERSON: default for all other inputs

Case-insensitive. 25 parameterized test cases.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-08 13:03:53 +02:00
parent e49ae5de29
commit 68f0c4c4b9
2 changed files with 149 additions and 0 deletions

View File

@@ -0,0 +1,60 @@
package org.raddatz.familienarchiv.service;
import java.util.List;
import java.util.Locale;
import org.raddatz.familienarchiv.model.PersonType;
/**
 * Keyword-based heuristic that maps a raw name string to a {@link PersonType}.
 *
 * <p>Matching is case-insensitive and position-aware. The rules are evaluated in a fixed
 * order and the first one that matches wins:
 * <ol>
 *   <li>{@code SKIP} - the name starts with one of {@link #SKIP_KEYWORDS};</li>
 *   <li>{@code INSTITUTION} - the name starts with one of {@link #INSTITUTION_START},
 *       ends with one of {@link #INSTITUTION_END}, or ends with {@code " co"} / {@code " co."};</li>
 *   <li>{@code GROUP} - the name starts with one of {@link #GROUP_START} or contains one of
 *       {@link #GROUP_CONTAINS} as a standalone word;</li>
 *   <li>{@code PERSON} - everything else, including {@code null} and blank input.</li>
 * </ol>
 */
public class PersonTypeClassifier {

    /** Prefixes of entries classified as {@code SKIP}. */
    private static final List<String> SKIP_KEYWORDS = List.of(
            "Briefumschlag", "Kondolenzbriefe", "Hochzeitsgedicht");

    /** Prefixes of entries classified as {@code INSTITUTION}. */
    private static final List<String> INSTITUTION_START = List.of(
            "Firma", "Architekt");

    /** Suffixes of entries classified as {@code INSTITUTION}. */
    private static final List<String> INSTITUTION_END = List.of(
            "GmbH");

    /** Prefixes of entries classified as {@code GROUP}. */
    private static final List<String> GROUP_START = List.of(
            "Familie", "Comité", "Comite", "Geschwister", "Gesellschafter",
            "Garde", "Mitarbeiter");

    /** Words that classify an entry as {@code GROUP} when they occur delimited by non-letters. */
    private static final List<String> GROUP_CONTAINS = List.of(
            "Eltern", "Kinder", "Schwiegereltern");

    /**
     * Classifies a raw name.
     *
     * @param rawName the name to classify; may be {@code null}
     * @return the first matching type in the order SKIP, INSTITUTION, GROUP;
     *         {@code PERSON} when nothing matches or the input is {@code null} or blank
     */
    public static PersonType classify(String rawName) {
        if (rawName == null || rawName.isBlank()) {
            return PersonType.PERSON;
        }
        // strip() removes the same Unicode whitespace that isBlank() recognises (trim() only
        // handles code points <= U+0020), so exotic padding cannot defeat the prefix/suffix checks.
        String lower = lowerCase(rawName.strip());

        if (startsWithAny(lower, SKIP_KEYWORDS)) {
            return PersonType.SKIP;
        }
        if (startsWithAny(lower, INSTITUTION_START)
                || endsWithAny(lower, INSTITUTION_END)
                // "Co" is only accepted as a separate trailing token, hence the leading space.
                || lower.endsWith(" co")
                || lower.endsWith(" co.")) {
            return PersonType.INSTITUTION;
        }
        if (startsWithAny(lower, GROUP_START) || containsAnyWord(lower, GROUP_CONTAINS)) {
            return PersonType.GROUP;
        }
        return PersonType.PERSON;
    }

    /** Lower-cases with {@link Locale#ROOT} so the result does not depend on the JVM default locale. */
    private static String lowerCase(String text) {
        return text.toLowerCase(Locale.ROOT);
    }

    /** Returns whether the already lower-cased {@code lower} starts with any of the keywords. */
    private static boolean startsWithAny(String lower, List<String> keywords) {
        for (String keyword : keywords) {
            if (lower.startsWith(lowerCase(keyword))) {
                return true;
            }
        }
        return false;
    }

    /** Returns whether the already lower-cased {@code lower} ends with any of the keywords. */
    private static boolean endsWithAny(String lower, List<String> keywords) {
        for (String keyword : keywords) {
            if (lower.endsWith(lowerCase(keyword))) {
                return true;
            }
        }
        return false;
    }

    /** Returns whether the already lower-cased {@code lower} contains any keyword as a standalone word. */
    private static boolean containsAnyWord(String lower, List<String> keywords) {
        for (String keyword : keywords) {
            if (containsWord(lower, lowerCase(keyword))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns whether {@code word} occurs in {@code text} with no letter directly before or
     * after it. Every occurrence is examined, so an embedded match (e.g. "eltern" inside
     * "großeltern") does not hide a later standalone one.
     *
     * @param text the text to search, expected to be lower-cased already
     * @param word the word to look for, expected to be lower-cased already
     * @return {@code true} if at least one occurrence is delimited by non-letters or the text edges
     */
    private static boolean containsWord(String text, String word) {
        if (word.isEmpty()) {
            // An empty needle matches at every index; bail out instead of scanning.
            return false;
        }
        int idx = text.indexOf(word);
        while (idx >= 0) {
            int end = idx + word.length();
            boolean startOk = idx == 0 || !Character.isLetter(text.charAt(idx - 1));
            boolean endOk = end >= text.length() || !Character.isLetter(text.charAt(end));
            if (startOk && endOk) {
                return true;
            }
            idx = text.indexOf(word, idx + 1);
        }
        return false;
    }
}

View File

@@ -0,0 +1,89 @@
package org.raddatz.familienarchiv.service;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.raddatz.familienarchiv.model.PersonType;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Tests for {@link PersonTypeClassifier#classify(String)}: one parameterized test per
 * expected {@link PersonType}, plus edge cases for null, blank and lower-case input.
 */
class PersonTypeClassifierTest {

    // --- SKIP ---

    /** Names beginning with a skip keyword. */
    @ParameterizedTest
    @CsvSource({
        "'Briefumschlag aus Java', SKIP",
        "'Kondolenzbriefe zum Tod von Walter de Gruyter', SKIP",
        "'Hochzeitsgedicht fur Paul u Luise de Gruyter', SKIP"
    })
    void classify_skipEntries(String input, PersonType expected) {
        assertClassifiedAs(input, expected);
    }

    // --- INSTITUTION ---

    /** Names with an institution prefix (Firma, Architekt) or suffix (GmbH, Co). */
    @ParameterizedTest
    @CsvSource({
        "'Arthur Collignon GmbH', INSTITUTION",
        "'Firma Auschrath', INSTITUTION",
        "'Westermann u Co', INSTITUTION",
        "'Architekt Korschelt u Renker', INSTITUTION"
    })
    void classify_institutionEntries(String input, PersonType expected) {
        assertClassifiedAs(input, expected);
    }

    // --- GROUP ---

    /** Names with a group prefix, or containing Eltern / Kinder / Schwiegereltern as a word. */
    @ParameterizedTest
    @CsvSource({
        "'Comite der Abschiedsfeier', GROUP",
        "'Comité zur Errichtung eines Heine-Denkmals', GROUP",
        "'Garde du Corps', GROUP",
        "'Geschwister de Gruyter', GROUP",
        "'Gesellschafter des Verlages', GROUP",
        "'Ella de Gruyters Eltern', GROUP",
        "'Eugenie de Gruyters Kinder', GROUP",
        "'Hilde de Gruyters Schwiegereltern', GROUP",
        "'Eltern Muller', GROUP",
        "'Familie Cram', GROUP",
        "'Familie Hasenvlever', GROUP",
        "'Mitarbeiter Verlag', GROUP",
        "'Mitarbeiter Druckerei TrebbinClara Cram', GROUP",
        "'Mitarbeiter Kunstverlag Mu', GROUP"
    })
    void classify_groupEntries(String input, PersonType expected) {
        assertClassifiedAs(input, expected);
    }

    // --- PERSON (default) ---

    /** Names matching no rule, including a start-only keyword ("Firma") in a non-leading position. */
    @ParameterizedTest
    @CsvSource({
        "'Walter de Gruyter', PERSON",
        "'Clara Cram', PERSON",
        "'Eugenie de Gruyter geb. Müller', PERSON",
        "'Dr. Firma Mueller', PERSON"
    })
    void classify_personEntries(String input, PersonType expected) {
        assertClassifiedAs(input, expected);
    }

    // --- Edge cases ---

    /** A null name falls back to PERSON. */
    @Test
    void classify_null_returnsPerson() {
        assertClassifiedAs(null, PersonType.PERSON);
    }

    /** A whitespace-only name falls back to PERSON. */
    @Test
    void classify_blank_returnsPerson() {
        assertClassifiedAs(" ", PersonType.PERSON);
    }

    /** Keywords are matched regardless of letter case. */
    @Test
    void classify_caseInsensitive() {
        assertClassifiedAs("firma auschrath", PersonType.INSTITUTION);
    }

    /** Runs the classifier on {@code rawName} and asserts that it yields {@code expected}. */
    private static void assertClassifiedAs(String rawName, PersonType expected) {
        PersonType actual = PersonTypeClassifier.classify(rawName);
        assertThat(actual).isEqualTo(expected);
    }
}