"""Tests for the rule-based extractor and PersonMatcher.""" import pytest from extractor import extract, extract_dates, extract_keywords, set_person_matcher from person_matcher import PersonMatcher # ── Shared test fixture ─────────────────────────────────────────────────────── _TEST_PERSONS = [ ("Clara", "Cram"), ("Herbert", "Cram"), ("Eugenie", "de Gruyter"), ("Walter", "de Gruyter"), ("Marie", "Cram"), ("Juan", "Cram"), ("Hilde", "de Gruyter"), ("Hans", "de Gruyter"), ("Albert", "de Gruyter"), ("Anita", "Wöhler"), ("Else", "Bohrmann"), ("Lili", "Duvenbeck"), ] @pytest.fixture(scope="session", autouse=True) def seeded_matcher(): """Load test persons into the global matcher before any test runs.""" m = PersonMatcher() m.load(_TEST_PERSONS) set_person_matcher(m) return m # ── PersonMatcher unit tests ────────────────────────────────────────────────── class TestPersonMatcher: DE_PREPS = frozenset({"von", "vom", "an", "nach", "für"}) def test_load_populates_names(self, seeded_matcher): assert len(seeded_matcher) > 0 def test_exact_full_name_match(self, seeded_matcher): hits = seeded_matcher.find_in_query("Briefe von Clara Cram", self.DE_PREPS) assert hits == [("Clara Cram", "von")] def test_exact_first_name_only(self, seeded_matcher): hits = seeded_matcher.find_in_query("Briefe von Eugenie", self.DE_PREPS) assert hits == [("Eugenie", "von")] def test_exact_first_name_receiver(self, seeded_matcher): hits = seeded_matcher.find_in_query("Briefe an Herbert", self.DE_PREPS) assert hits == [("Herbert", "an")] def test_fuzzy_typo(self, seeded_matcher): hits = seeded_matcher.find_in_query("Briefe von Herrbert Cram", self.DE_PREPS) assert len(hits) == 1 assert hits[0][1] == "von" def test_two_persons_extracted(self, seeded_matcher): hits = seeded_matcher.find_in_query( "Briefe von Clara Cram an Herbert Cram", self.DE_PREPS ) assert len(hits) == 2 assert hits[0][0] == "Clara Cram" assert hits[0][1] == "von" assert hits[1][0] == "Herbert Cram" assert hits[1][1] == "an" def test_no_match_for_place_name(self, seeded_matcher): hits = seeded_matcher.find_in_query("Reise nach Mexiko", self.DE_PREPS) assert hits == [] def test_no_match_for_topic_word(self, seeded_matcher): hits = seeded_matcher.find_in_query("Briefe aus dem Krieg", self.DE_PREPS) assert hits == [] def test_first_name_eugenie_regression(self, seeded_matcher): # spaCy NER missed standalone first names hits = seeded_matcher.find_in_query("Briefe von Eugenie", self.DE_PREPS) assert len(hits) == 1 def test_merged_names_regression(self, seeded_matcher): # spaCy NER merged "Herbert an Eugenie de Gruyter" into one PER span hits = seeded_matcher.find_in_query( "Briefe von Herbert an Eugenie de Gruyter nach 1914", self.DE_PREPS ) assert len(hits) == 2 names = [h[0] for h in hits] assert "Herbert" in names assert "Eugenie de Gruyter" in names def test_english_preps(self, seeded_matcher): en_preps = frozenset({"from", "by", "to", "for"}) hits = seeded_matcher.find_in_query( "Letters from Clara Cram to Walter de Gruyter in 1920", en_preps ) assert len(hits) == 2 assert hits[0][0] == "Clara Cram" assert hits[1][0] == "Walter de Gruyter" def test_double_preposition_de(self, seeded_matcher): hits = seeded_matcher.find_in_query( "Briefe von Clara nach Herbert", self.DE_PREPS ) assert len(hits) == 2 names = [h[0] for h in hits] assert "Clara" in names assert "Herbert" in names # ── Date extraction tests ───────────────────────────────────────────────────── class TestExtractDates: def test_bare_year_gives_range(self): assert extract_dates("Briefe 1920", "de") == ("1920-01-01", "1920-12-31") def test_im_jahr(self): assert extract_dates("Schriften im Jahr 1905", "de") == ( "1905-01-01", "1905-12-31" ) def test_vor_year(self): assert extract_dates("Briefe vor 1920", "de") == (None, "1920-12-31") def test_nach_year(self): assert extract_dates("Schriften nach 1920", "de") == ("1920-01-01", None) def test_zwischen(self): assert extract_dates("Dokumente zwischen 1914 und 1918", "de") == ( "1914-01-01", "1918-12-31" ) def test_before_en(self): assert extract_dates("Letters before 1918", "en") == (None, "1918-12-31") def test_after_en(self): assert extract_dates("Letters after 1939", "en") == ("1939-01-01", None) def test_between_en(self): assert extract_dates("Letters between 1914 and 1918", "en") == ( "1914-01-01", "1918-12-31" ) def test_antes_de_es(self): assert extract_dates("Cartas antes de 1900", "es") == (None, "1900-12-31") def test_entre_es(self): assert extract_dates("entre 1915 y 1920", "es") == ( "1915-01-01", "1920-12-31" ) def test_no_year(self): assert extract_dates("Briefe aus dem Krieg", "de") == (None, None) def test_nach_before_person_then_year(self): # "nach Marie 1920" — "nach" belongs to person, not date date_from, date_to = extract_dates("Briefe nach Marie 1920", "de", ["Marie"]) assert date_from == "1920-01-01" assert date_to == "1920-12-31" def test_bare_year_alone(self): assert extract_dates("1918", "de") == ("1918-01-01", "1918-12-31") # ── Keyword extraction tests ────────────────────────────────────────────────── class TestExtractKeywords: def test_basic_topic_words(self): kws = extract_keywords("Briefe aus dem Krieg", "de", [], []) assert "krieg" in kws def test_stopwords_excluded(self): kws = extract_keywords("von der nach dem aus", "de", [], []) for sw in ("von", "der", "nach", "dem", "aus"): assert sw not in kws def test_person_spans_excluded(self): kws = extract_keywords( "Briefe von Clara Cram nach Herbert", "de", ["Clara Cram", "Herbert"], [] ) assert "clara" not in kws assert "cram" not in kws assert "herbert" not in kws def test_years_excluded(self): kws = extract_keywords("Schriften 1920 über Reise", "de", [], ["1920"]) assert "1920" not in kws def test_deduplication(self): kws = extract_keywords("Krieg Krieg Krieg", "de", [], []) assert kws.count("krieg") == 1 def test_en_stopwords(self): kws = extract_keywords("Letters about the war", "en", [], []) assert "the" not in kws assert "war" in kws def test_short_words_excluded(self): kws = extract_keywords("ab cd ef xy", "de", [], []) assert all(len(k) >= 3 for k in kws) # ── Full pipeline integration tests ────────────────────────────────────────── class TestExtract: def test_full_sentence_de(self): r = extract("Briefe von Clara Cram an Walter de Gruyter im Jahr 1920", "de") assert "Clara Cram" in r.personNames assert "Walter de Gruyter" in r.personNames assert r.personRole == "any" assert r.dateFrom == "1920-01-01" assert r.dateTo == "1920-12-31" def test_sender_role_de(self): r = extract("Briefe von Clara Cram vor 1910", "de") assert r.personNames == ["Clara Cram"] assert r.personRole == "sender" assert r.dateTo == "1910-12-31" assert r.dateFrom is None def test_receiver_role_de(self): r = extract("Briefe an Walter de Gruyter", "de") assert r.personNames == ["Walter de Gruyter"] assert r.personRole == "receiver" def test_first_name_only_eugenie(self): r = extract("Briefe von Eugenie", "de") assert "Eugenie" in r.personNames assert r.personRole == "sender" def test_first_name_only_herbert(self): r = extract("Kriegsbriefe von Herbert", "de") assert "Herbert" in r.personNames def test_merged_names_bug_fixed(self): r = extract("Briefe von Herbert an Eugenie de Gruyter nach 1914", "de") assert "Herbert" in r.personNames assert "Eugenie de Gruyter" in r.personNames assert r.dateFrom == "1914-01-01" def test_topic_only_krieg(self): r = extract("Briefe aus dem Krieg", "de") assert r.personNames == [] assert "krieg" in r.keywords def test_topic_only_single_word(self): r = extract("Kriegspost", "de") assert r.personNames == [] def test_date_range_only(self): r = extract("Dokumente zwischen 1914 und 1918", "de") assert r.personNames == [] assert r.dateFrom == "1914-01-01" assert r.dateTo == "1918-12-31" def test_colloquial_von(self): r = extract("von Clara", "de") assert r.personNames == ["Clara"] assert r.personRole == "sender" def test_colloquial_an(self): r = extract("an Walter", "de") assert r.personNames == ["Walter"] assert r.personRole == "receiver" def test_bare_year_alone(self): r = extract("1918", "de") assert r.dateFrom == "1918-01-01" assert r.dateTo == "1918-12-31" assert r.personNames == [] def test_english_full_sentence(self): r = extract("Letters from Clara Cram to Walter de Gruyter in 1920", "en") assert "Clara Cram" in r.personNames assert "Walter de Gruyter" in r.personNames assert r.dateFrom == "1920-01-01" def test_english_receiver_with_date(self): r = extract("Letters to Herbert Cram after 1939", "en") assert "Herbert Cram" in r.personNames assert r.personRole == "receiver" assert r.dateFrom == "1939-01-01" def test_english_birthday(self): r = extract("Birthday greetings from Anita Wöhler", "en") assert "Anita Wöhler" in r.personNames assert r.personRole == "sender" def test_english_between_dates(self): r = extract("Letters between 1914 and 1918", "en") assert r.dateFrom == "1914-01-01" assert r.dateTo == "1918-12-31" def test_spanish_full_sentence(self): r = extract("Cartas de Clara Cram a Walter de Gruyter en 1920", "es") assert "Clara Cram" in r.personNames assert "Walter de Gruyter" in r.personNames assert r.dateFrom == "1920-01-01" def test_spanish_before(self): r = extract("Cartas antes de 1900", "es") assert r.dateTo == "1900-12-31" assert r.dateFrom is None def test_rawquery_echoed(self): q = "test query" r = extract(q, "de") assert r.rawQuery == q def test_false_positive_compound_noun_regression(self): # spaCy tagged "Geburtstagsglückwünsche" as a PER entity r = extract("Geburtstagsglückwünsche", "de") assert r.personNames == [] def test_question_phrasing(self): r = extract("Wer hat an Herbert Cram 1918 geschrieben?", "de") assert "Herbert Cram" in r.personNames assert r.personRole == "receiver" assert r.dateFrom == "1918-01-01" def test_lowercase_query(self): r = extract("briefe von clara cram an herbert 1920", "de") # Should still find persons despite lowercase assert len(r.personNames) >= 1 def test_empty_matcher_returns_no_persons(self, seeded_matcher): from extractor import get_person_matcher, set_person_matcher original = get_person_matcher() try: set_person_matcher(PersonMatcher()) r = extract("Briefe von Clara Cram", "de") assert r.personNames == [] finally: set_person_matcher(original)