From 68f0c4c4b9f9916ce98dc85b6dcdd6d79160168d Mon Sep 17 00:00:00 2001
From: Marcel
Date: Wed, 8 Apr 2026 13:03:53 +0200
Subject: [PATCH] feat(service): add PersonTypeClassifier with keyword heuristics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Static classify() method uses position-aware keyword matching:
- SKIP: Briefumschlag, Kondolenzbriefe, Hochzeitsgedicht (start)
- INSTITUTION: Firma, Architekt (start), GmbH, Co (end)
- GROUP: Familie, Comité, Comite, Geschwister, Gesellschafter, Garde,
  Mitarbeiter (start), Eltern, Kinder, Schwiegereltern (word boundary)
- PERSON: default for all other inputs

Case-insensitive. 26 parameterized test cases.

Co-Authored-By: Claude Sonnet 4.6
---
 .../service/PersonTypeClassifier.java         | 105 ++++++++++++++++++
 .../service/PersonTypeClassifierTest.java     |  90 +++++++++++++++
 2 files changed, 195 insertions(+)
 create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/PersonTypeClassifier.java
 create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/service/PersonTypeClassifierTest.java

diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/PersonTypeClassifier.java b/backend/src/main/java/org/raddatz/familienarchiv/service/PersonTypeClassifier.java
new file mode 100644
index 00000000..e39abb05
--- /dev/null
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/PersonTypeClassifier.java
@@ -0,0 +1,105 @@
+package org.raddatz.familienarchiv.service;
+
+import java.util.List;
+import java.util.Locale;
+import org.raddatz.familienarchiv.model.PersonType;
+
+/**
+ * Classifies raw name strings with position-aware, case-insensitive keyword matching.
+ *
+ * <p>Rules are evaluated in the order SKIP, INSTITUTION, GROUP; the first match wins and
+ * every input that matches no rule is classified as {@link PersonType#PERSON}.
+ */
+public final class PersonTypeClassifier {
+
+    /** A name starting with one of these is classified as SKIP. */
+    private static final List<String> SKIP_KEYWORDS = List.of(
+            "Briefumschlag", "Kondolenzbriefe", "Hochzeitsgedicht");
+
+    /** A name starting with one of these is classified as INSTITUTION. */
+    private static final List<String> INSTITUTION_START = List.of(
+            "Firma", "Architekt");
+
+    /** A name ending with one of these is classified as INSTITUTION. */
+    private static final List<String> INSTITUTION_END = List.of(
+            "GmbH");
+
+    /** A name starting with one of these is classified as GROUP. */
+    private static final List<String> GROUP_START = List.of(
+            "Familie", "Comité", "Comite", "Geschwister", "Gesellschafter",
+            "Garde", "Mitarbeiter");
+
+    /** A name containing one of these as a whole word is classified as GROUP. */
+    private static final List<String> GROUP_CONTAINS = List.of(
+            "Eltern", "Kinder", "Schwiegereltern");
+
+    private PersonTypeClassifier() {
+        // Static utility class; not meant to be instantiated.
+    }
+
+    /**
+     * Classifies a raw name.
+     *
+     * @param rawName the name to classify; may be {@code null} or blank
+     * @return the first matching type in the order SKIP, INSTITUTION, GROUP, or
+     *         {@link PersonType#PERSON} for {@code null}, blank and unmatched input
+     */
+    public static PersonType classify(String rawName) {
+        if (rawName == null || rawName.isBlank()) {
+            return PersonType.PERSON;
+        }
+
+        // strip() removes the same whitespace isBlank() recognises, and Locale.ROOT keeps
+        // the lower-casing independent of the JVM default locale (e.g. Turkish dotless i).
+        String lower = rawName.strip().toLowerCase(Locale.ROOT);
+
+        if (startsWithAny(lower, SKIP_KEYWORDS)) {
+            return PersonType.SKIP;
+        }
+        if (startsWithAny(lower, INSTITUTION_START)
+                || endsWithAny(lower, INSTITUTION_END)
+                || lower.endsWith(" co")
+                || lower.endsWith(" co.")) {
+            return PersonType.INSTITUTION;
+        }
+        if (startsWithAny(lower, GROUP_START) || containsAnyWord(lower, GROUP_CONTAINS)) {
+            return PersonType.GROUP;
+        }
+        return PersonType.PERSON;
+    }
+
+    /** Returns whether {@code lower} starts with any of the keywords, ignoring their case. */
+    private static boolean startsWithAny(String lower, List<String> keywords) {
+        return keywords.stream().anyMatch(k -> lower.startsWith(k.toLowerCase(Locale.ROOT)));
+    }
+
+    /** Returns whether {@code lower} ends with any of the keywords, ignoring their case. */
+    private static boolean endsWithAny(String lower, List<String> keywords) {
+        return keywords.stream().anyMatch(k -> lower.endsWith(k.toLowerCase(Locale.ROOT)));
+    }
+
+    /** Returns whether {@code lower} contains any of the keywords as a whole word. */
+    private static boolean containsAnyWord(String lower, List<String> keywords) {
+        return keywords.stream().anyMatch(k -> containsWord(lower, k.toLowerCase(Locale.ROOT)));
+    }
+
+    /**
+     * Returns whether {@code word} occurs in {@code text} with no letter directly before or
+     * after it. Every occurrence is examined, so a standalone match is still found when an
+     * earlier occurrence is embedded in a longer word.
+     */
+    private static boolean containsWord(String text, String word) {
+        if (word.isEmpty()) {
+            return false;
+        }
+        for (int idx = text.indexOf(word); idx >= 0; idx = text.indexOf(word, idx + 1)) {
+            int end = idx + word.length();
+            boolean startOk = idx == 0 || !Character.isLetter(text.charAt(idx - 1));
+            boolean endOk = end >= text.length() || !Character.isLetter(text.charAt(end));
+            if (startOk && endOk) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/PersonTypeClassifierTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/PersonTypeClassifierTest.java
new file mode 100644
index 00000000..ed9132ef
--- /dev/null
+++ b/backend/src/test/java/org/raddatz/familienarchiv/service/PersonTypeClassifierTest.java
@@ -0,0 +1,90 @@
+package org.raddatz.familienarchiv.service;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
+import org.raddatz.familienarchiv.model.PersonType;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+class PersonTypeClassifierTest {
+
+    // --- SKIP ---
+
+    @ParameterizedTest
+    @CsvSource({
+            "'Briefumschlag aus Java', SKIP",
+            "'Kondolenzbriefe zum Tod von Walter de Gruyter', SKIP",
+            "'Hochzeitsgedicht fur Paul u Luise de Gruyter', SKIP"
+    })
+    void classify_skipEntries(String input, PersonType expected) {
+        assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
+    }
+
+    // --- INSTITUTION ---
+
+    @ParameterizedTest
+    @CsvSource({
+            "'Arthur Collignon GmbH', INSTITUTION",
+            "'Firma Auschrath', INSTITUTION",
+            "'Westermann u Co', INSTITUTION",
+            "'Architekt Korschelt u Renker', INSTITUTION"
+    })
+    void classify_institutionEntries(String input, PersonType expected) {
+        assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
+    }
+
+    // --- GROUP ---
+
+    @ParameterizedTest
+    @CsvSource({
+            "'Comite der Abschiedsfeier', GROUP",
+            "'Comité zur Errichtung eines Heine-Denkmals', GROUP",
+            "'Garde du Corps', GROUP",
+            "'Geschwister de Gruyter', GROUP",
+            "'Gesellschafter des Verlages', GROUP",
+            "'Ella de Gruyters Eltern', GROUP",
+            "'Eugenie de Gruyters Kinder', GROUP",
+            "'Hilde de Gruyters Schwiegereltern', GROUP",
+            "'Eltern Muller', GROUP",
+            "'Familie Cram', GROUP",
+            "'Familie Hasenvlever', GROUP",
+            "'Mitarbeiter Verlag', GROUP",
+            "'Mitarbeiter Druckerei TrebbinClara Cram', GROUP",
+            "'Mitarbeiter Kunstverlag Mu', GROUP",
+            "'Grosseltern und Eltern', GROUP"
+    })
+    void classify_groupEntries(String input, PersonType expected) {
+        assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
+    }
+
+    // --- PERSON (default) ---
+
+    @ParameterizedTest
+    @CsvSource({
+            "'Walter de Gruyter', PERSON",
+            "'Clara Cram', PERSON",
+            "'Eugenie de Gruyter geb. Müller', PERSON",
+            "'Dr. Firma Mueller', PERSON"
+    })
+    void classify_personEntries(String input, PersonType expected) {
+        assertThat(PersonTypeClassifier.classify(input)).isEqualTo(expected);
+    }
+
+    // --- Edge cases ---
+
+    @Test
+    void classify_null_returnsPerson() {
+        assertThat(PersonTypeClassifier.classify(null)).isEqualTo(PersonType.PERSON);
+    }
+
+    @Test
+    void classify_blank_returnsPerson() {
+        assertThat(PersonTypeClassifier.classify(" ")).isEqualTo(PersonType.PERSON);
+    }
+
+    @Test
+    void classify_caseInsensitive() {
+        assertThat(PersonTypeClassifier.classify("firma auschrath")).isEqualTo(PersonType.INSTITUTION);
+    }
+}