Rule-based pipeline: persons matched via rapidfuzz against all known names loaded from DB at startup. Fixes first-name-only extraction (Eugenie, Herbert), merged-span bug (Herbert + Eugenie de Gruyter), false positives on compound nouns, and EN/ES model failures. Date extraction unchanged (regex). No spaCy models required. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
338 lines
12 KiB
Python
338 lines
12 KiB
Python
"""Tests for the rule-based extractor and PersonMatcher."""
|
|
import pytest
|
|
|
|
from extractor import extract, extract_dates, extract_keywords, set_person_matcher
|
|
from person_matcher import PersonMatcher
|
|
|
|
# ── Shared test fixture ───────────────────────────────────────────────────────
|
|
|
|
# Name pairs handed to PersonMatcher.load() by the fixture and tests in this
# file. Each tuple appears to be (first name, last name) -- confirm against
# PersonMatcher.load. Order is kept as originally written.
_TEST_PERSONS = [
    ("Clara", "Cram"),
    ("Herbert", "Cram"),
    ("Eugenie", "de Gruyter"),
    ("Walter", "de Gruyter"),
    ("Marie", "Cram"),
    ("Juan", "Cram"),
    ("Hilde", "de Gruyter"),
    ("Hans", "de Gruyter"),
    ("Albert", "de Gruyter"),
    ("Anita", "Wöhler"),
    ("Else", "Bohrmann"),
    ("Lili", "Duvenbeck"),
]
|
|
|
|
|
|
@pytest.fixture(scope="session", autouse=True)
def seeded_matcher():
    """Create the shared PersonMatcher used by the whole test session.

    The matcher is filled from ``_TEST_PERSONS`` and registered through
    ``set_person_matcher``. Because the fixture is session-scoped and
    autouse, this happens once without tests having to request it; tests
    that do request it receive the same matcher instance.
    """
    matcher = PersonMatcher()
    matcher.load(_TEST_PERSONS)
    set_person_matcher(matcher)
    return matcher
|
|
|
|
|
|
# ── PersonMatcher unit tests ──────────────────────────────────────────────────
|
|
|
|
class TestPersonMatcher:
    """Unit tests for ``PersonMatcher.find_in_query`` on the seeded matcher.

    The assertions treat each hit as an indexable pair whose first item is
    the matched name and whose second item is the preposition it followed.
    """

    # German prepositions passed to find_in_query by most tests below.
    DE_PREPS = frozenset({"von", "vom", "an", "nach", "für"})

    def test_load_populates_names(self, seeded_matcher):
        """A loaded matcher reports a length greater than zero."""
        size = len(seeded_matcher)
        assert size > 0

    def test_exact_full_name_match(self, seeded_matcher):
        """A full name after "von" yields exactly one (name, "von") hit."""
        expected = [("Clara Cram", "von")]
        found = seeded_matcher.find_in_query("Briefe von Clara Cram", self.DE_PREPS)
        assert found == expected

    def test_exact_first_name_only(self, seeded_matcher):
        """A bare first name after "von" is returned as-is."""
        expected = [("Eugenie", "von")]
        found = seeded_matcher.find_in_query("Briefe von Eugenie", self.DE_PREPS)
        assert found == expected

    def test_exact_first_name_receiver(self, seeded_matcher):
        """A bare first name after "an" is returned with that preposition."""
        expected = [("Herbert", "an")]
        found = seeded_matcher.find_in_query("Briefe an Herbert", self.DE_PREPS)
        assert found == expected

    def test_fuzzy_typo(self, seeded_matcher):
        """A misspelled name still produces a single hit tagged "von"."""
        found = seeded_matcher.find_in_query(
            "Briefe von Herrbert Cram", self.DE_PREPS
        )
        # One hit only, and its preposition is "von"; the name itself is
        # deliberately not pinned here.
        assert [hit[1] for hit in found] == ["von"]

    def test_two_persons_extracted(self, seeded_matcher):
        """Sender and receiver in one query come back in query order."""
        found = seeded_matcher.find_in_query(
            "Briefe von Clara Cram an Herbert Cram", self.DE_PREPS
        )
        assert [hit[0] for hit in found] == ["Clara Cram", "Herbert Cram"]
        assert [hit[1] for hit in found] == ["von", "an"]

    def test_no_match_for_place_name(self, seeded_matcher):
        """A place name after a preposition is not reported as a person."""
        found = seeded_matcher.find_in_query("Reise nach Mexiko", self.DE_PREPS)
        assert found == []

    def test_no_match_for_topic_word(self, seeded_matcher):
        """A topic-only query produces no hits."""
        found = seeded_matcher.find_in_query("Briefe aus dem Krieg", self.DE_PREPS)
        assert found == []

    def test_first_name_eugenie_regression(self, seeded_matcher):
        """Regression: spaCy NER missed standalone first names."""
        found = seeded_matcher.find_in_query("Briefe von Eugenie", self.DE_PREPS)
        assert len(found) == 1

    def test_merged_names_regression(self, seeded_matcher):
        """Regression: two names around "an" must stay two separate hits.

        spaCy NER merged "Herbert an Eugenie de Gruyter" into one PER span.
        """
        found = seeded_matcher.find_in_query(
            "Briefe von Herbert an Eugenie de Gruyter nach 1914", self.DE_PREPS
        )
        assert len(found) == 2
        matched_names = [hit[0] for hit in found]
        for wanted in ("Herbert", "Eugenie de Gruyter"):
            assert wanted in matched_names

    def test_english_preps(self, seeded_matcher):
        """English prepositions work when passed in place of the German set."""
        english_preps = frozenset({"from", "by", "to", "for"})
        found = seeded_matcher.find_in_query(
            "Letters from Clara Cram to Walter de Gruyter in 1920", english_preps
        )
        assert [hit[0] for hit in found] == ["Clara Cram", "Walter de Gruyter"]

    def test_double_preposition_de(self, seeded_matcher):
        """Names after "von" and after "nach" are both picked up."""
        found = seeded_matcher.find_in_query(
            "Briefe von Clara nach Herbert", self.DE_PREPS
        )
        assert len(found) == 2
        matched_names = [hit[0] for hit in found]
        for wanted in ("Clara", "Herbert"):
            assert wanted in matched_names
|
|
|
|
|
|
# ── Date extraction tests ─────────────────────────────────────────────────────
|
|
|
|
class TestExtractDates:
    """Tests for ``extract_dates``.

    Every test compares the call result with a ``(date_from, date_to)`` pair
    of ISO date strings, where ``None`` marks an open end of the range.
    """

    def test_bare_year_gives_range(self):
        """A bare year expands to the full calendar year."""
        whole_year = ("1920-01-01", "1920-12-31")
        assert extract_dates("Briefe 1920", "de") == whole_year

    def test_im_jahr(self):
        """German "im Jahr <year>" expands to the full calendar year."""
        whole_year = ("1905-01-01", "1905-12-31")
        assert extract_dates("Schriften im Jahr 1905", "de") == whole_year

    def test_vor_year(self):
        """German "vor <year>" sets only the upper bound."""
        until_1920 = (None, "1920-12-31")
        assert extract_dates("Briefe vor 1920", "de") == until_1920

    def test_nach_year(self):
        """German "nach <year>" sets only the lower bound."""
        since_1920 = ("1920-01-01", None)
        assert extract_dates("Schriften nach 1920", "de") == since_1920

    def test_zwischen(self):
        """German "zwischen A und B" spans from start of A to end of B."""
        span = ("1914-01-01", "1918-12-31")
        assert extract_dates("Dokumente zwischen 1914 und 1918", "de") == span

    def test_before_en(self):
        """English "before <year>" sets only the upper bound."""
        until_1918 = (None, "1918-12-31")
        assert extract_dates("Letters before 1918", "en") == until_1918

    def test_after_en(self):
        """English "after <year>" sets only the lower bound."""
        since_1939 = ("1939-01-01", None)
        assert extract_dates("Letters after 1939", "en") == since_1939

    def test_between_en(self):
        """English "between A and B" spans from start of A to end of B."""
        span = ("1914-01-01", "1918-12-31")
        assert extract_dates("Letters between 1914 and 1918", "en") == span

    def test_antes_de_es(self):
        """Spanish "antes de <year>" sets only the upper bound."""
        until_1900 = (None, "1900-12-31")
        assert extract_dates("Cartas antes de 1900", "es") == until_1900

    def test_entre_es(self):
        """Spanish "entre A y B" spans from start of A to end of B."""
        span = ("1915-01-01", "1920-12-31")
        assert extract_dates("entre 1915 y 1920", "es") == span

    def test_no_year(self):
        """A query without a year yields no bounds at all."""
        unbounded = (None, None)
        assert extract_dates("Briefe aus dem Krieg", "de") == unbounded

    def test_nach_before_person_then_year(self):
        """"nach" directly before a known person must not act as a date cue."""
        # "nach Marie 1920" — "nach" belongs to person, not date
        start, end = extract_dates("Briefe nach Marie 1920", "de", ["Marie"])
        assert (start, end) == ("1920-01-01", "1920-12-31")

    def test_bare_year_alone(self):
        """A query consisting only of a year expands to that full year."""
        whole_year = ("1918-01-01", "1918-12-31")
        assert extract_dates("1918", "de") == whole_year
|
|
|
|
|
|
# ── Keyword extraction tests ──────────────────────────────────────────────────
|
|
|
|
class TestExtractKeywords:
    """Tests for ``extract_keywords``.

    The calls pass the query, a language code, a list of person spans and a
    list of year strings, and inspect the returned keyword collection.
    """

    def test_basic_topic_words(self):
        """A topic noun is returned in lowercase."""
        keywords = extract_keywords("Briefe aus dem Krieg", "de", [], [])
        assert "krieg" in keywords

    def test_stopwords_excluded(self):
        """None of the German stopwords in the query are returned."""
        keywords = extract_keywords("von der nach dem aus", "de", [], [])
        stopwords = ("von", "der", "nach", "dem", "aus")
        assert all(word not in keywords for word in stopwords)

    def test_person_spans_excluded(self):
        """Tokens belonging to the supplied person spans are dropped."""
        persons = ["Clara Cram", "Herbert"]
        keywords = extract_keywords(
            "Briefe von Clara Cram nach Herbert", "de", persons, []
        )
        for name_token in ("clara", "cram", "herbert"):
            assert name_token not in keywords

    def test_years_excluded(self):
        """A year passed in the years list is not returned as a keyword."""
        keywords = extract_keywords("Schriften 1920 über Reise", "de", [], ["1920"])
        assert "1920" not in keywords

    def test_deduplication(self):
        """A word repeated in the query appears only once in the result."""
        keywords = extract_keywords("Krieg Krieg Krieg", "de", [], [])
        occurrences = keywords.count("krieg")
        assert occurrences == 1

    def test_en_stopwords(self):
        """English stopwords are removed while content words are kept."""
        keywords = extract_keywords("Letters about the war", "en", [], [])
        assert "the" not in keywords
        assert "war" in keywords

    def test_short_words_excluded(self):
        """No returned keyword is shorter than three characters."""
        keywords = extract_keywords("ab cd ef xy", "de", [], [])
        too_short = [word for word in keywords if len(word) < 3]
        assert not too_short
|
|
|
|
|
|
# ── Full pipeline integration tests ──────────────────────────────────────────
|
|
|
|
class TestExtract:
    """End-to-end tests for ``extract`` against the globally registered matcher.

    Each test passes one raw query and a language code to ``extract`` and
    checks attributes of the result: ``personNames``, ``personRole``,
    ``dateFrom``, ``dateTo``, ``keywords`` and ``rawQuery``.
    """

    def test_full_sentence_de(self):
        """Sender plus receiver plus year: both names, role "any", full year."""
        r = extract("Briefe von Clara Cram an Walter de Gruyter im Jahr 1920", "de")
        assert "Clara Cram" in r.personNames
        assert "Walter de Gruyter" in r.personNames
        assert r.personRole == "any"
        assert r.dateFrom == "1920-01-01"
        assert r.dateTo == "1920-12-31"

    def test_sender_role_de(self):
        """"von <name> vor <year>": sender role and upper date bound only."""
        r = extract("Briefe von Clara Cram vor 1910", "de")
        assert r.personNames == ["Clara Cram"]
        assert r.personRole == "sender"
        assert r.dateTo == "1910-12-31"
        assert r.dateFrom is None

    def test_receiver_role_de(self):
        """"an <name>" yields the receiver role."""
        r = extract("Briefe an Walter de Gruyter", "de")
        assert r.personNames == ["Walter de Gruyter"]
        assert r.personRole == "receiver"

    def test_first_name_only_eugenie(self):
        """A standalone first name after "von" is found as sender."""
        r = extract("Briefe von Eugenie", "de")
        assert "Eugenie" in r.personNames
        assert r.personRole == "sender"

    def test_first_name_only_herbert(self):
        """A standalone first name is found next to a compound topic noun."""
        r = extract("Kriegsbriefe von Herbert", "de")
        assert "Herbert" in r.personNames

    def test_merged_names_bug_fixed(self):
        """Two persons around "an" stay separate; "nach <year>" sets dateFrom."""
        r = extract("Briefe von Herbert an Eugenie de Gruyter nach 1914", "de")
        assert "Herbert" in r.personNames
        assert "Eugenie de Gruyter" in r.personNames
        assert r.dateFrom == "1914-01-01"

    def test_topic_only_krieg(self):
        """A topic-only query has no persons and keeps the topic keyword."""
        r = extract("Briefe aus dem Krieg", "de")
        assert r.personNames == []
        assert "krieg" in r.keywords

    def test_topic_only_single_word(self):
        """A single compound noun is not mistaken for a person."""
        r = extract("Kriegspost", "de")
        assert r.personNames == []

    def test_date_range_only(self):
        """A pure date-range query yields both bounds and no persons."""
        r = extract("Dokumente zwischen 1914 und 1918", "de")
        assert r.personNames == []
        assert r.dateFrom == "1914-01-01"
        assert r.dateTo == "1918-12-31"

    def test_colloquial_von(self):
        """A query of just "von <first name>" resolves to a sender."""
        r = extract("von Clara", "de")
        assert r.personNames == ["Clara"]
        assert r.personRole == "sender"

    def test_colloquial_an(self):
        """A query of just "an <first name>" resolves to a receiver."""
        r = extract("an Walter", "de")
        assert r.personNames == ["Walter"]
        assert r.personRole == "receiver"

    def test_bare_year_alone(self):
        """A query consisting only of a year gives that full year, no persons."""
        r = extract("1918", "de")
        assert r.dateFrom == "1918-01-01"
        assert r.dateTo == "1918-12-31"
        assert r.personNames == []

    def test_english_full_sentence(self):
        """English "from ... to ... in <year>" finds both persons and the year."""
        r = extract("Letters from Clara Cram to Walter de Gruyter in 1920", "en")
        assert "Clara Cram" in r.personNames
        assert "Walter de Gruyter" in r.personNames
        assert r.dateFrom == "1920-01-01"

    def test_english_receiver_with_date(self):
        """English "to <name> after <year>": receiver role and lower bound."""
        r = extract("Letters to Herbert Cram after 1939", "en")
        assert "Herbert Cram" in r.personNames
        assert r.personRole == "receiver"
        assert r.dateFrom == "1939-01-01"

    def test_english_birthday(self):
        """English "from <name>" with a non-ASCII surname yields a sender."""
        r = extract("Birthday greetings from Anita Wöhler", "en")
        assert "Anita Wöhler" in r.personNames
        assert r.personRole == "sender"

    def test_english_between_dates(self):
        """English "between A and B" sets both date bounds."""
        r = extract("Letters between 1914 and 1918", "en")
        assert r.dateFrom == "1914-01-01"
        assert r.dateTo == "1918-12-31"

    def test_spanish_full_sentence(self):
        """Spanish "de ... a ... en <year>" finds both persons and the year."""
        r = extract("Cartas de Clara Cram a Walter de Gruyter en 1920", "es")
        assert "Clara Cram" in r.personNames
        assert "Walter de Gruyter" in r.personNames
        assert r.dateFrom == "1920-01-01"

    def test_spanish_before(self):
        """Spanish "antes de <year>" sets only the upper bound."""
        r = extract("Cartas antes de 1900", "es")
        assert r.dateTo == "1900-12-31"
        assert r.dateFrom is None

    def test_rawquery_echoed(self):
        """The result carries the unmodified input query in ``rawQuery``."""
        q = "test query"
        r = extract(q, "de")
        assert r.rawQuery == q

    def test_false_positive_compound_noun_regression(self):
        """Regression: a long compound noun must not be reported as a person."""
        # spaCy tagged "Geburtstagsglückwünsche" as a PER entity
        r = extract("Geburtstagsglückwünsche", "de")
        assert r.personNames == []

    def test_question_phrasing(self):
        """A question-style query still yields receiver, name and year."""
        r = extract("Wer hat an Herbert Cram 1918 geschrieben?", "de")
        assert "Herbert Cram" in r.personNames
        assert r.personRole == "receiver"
        assert r.dateFrom == "1918-01-01"

    def test_lowercase_query(self):
        """An all-lowercase query still finds at least one person."""
        r = extract("briefe von clara cram an herbert 1920", "de")
        # Should still find persons despite lowercase
        assert len(r.personNames) >= 1

    def test_empty_matcher_returns_no_persons(self):
        """With an unloaded matcher registered, no persons are extracted.

        The globally registered matcher is swapped for an empty one during
        the test. The seeded matcher is restored in a ``finally`` clause so
        that a failing assertion here cannot leave the empty matcher in
        place for tests that run afterwards.
        """
        # Temporarily use an empty matcher
        set_person_matcher(PersonMatcher())
        try:
            r = extract("Briefe von Clara Cram", "de")
            assert r.personNames == []
        finally:
            # Restore a seeded matcher regardless of the test outcome
            restored = PersonMatcher()
            restored.load(_TEST_PERSONS)
            set_person_matcher(restored)
|