Files
familienarchiv/nlp-service/test_main.py
Marcel 6c5cf8ec9b feat(nlp-service): replace spaCy NER with DB-backed PersonMatcher
Rule-based pipeline: persons matched via rapidfuzz against all known
names loaded from DB at startup. Fixes first-name-only extraction
(Eugenie, Herbert), merged-span bug (Herbert + Eugenie de Gruyter),
false positives on compound nouns, and EN/ES model failures.
Date extraction unchanged (regex). No spaCy models required.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-07 11:00:03 +02:00

82 lines
2.3 KiB
Python

"""Integration tests for the FastAPI app."""
import pytest
from fastapi.testclient import TestClient
from extractor import set_person_matcher
from person_matcher import PersonMatcher
_TEST_PERSONS = [
("Clara", "Cram"),
("Herbert", "Cram"),
("Eugenie", "de Gruyter"),
("Walter", "de Gruyter"),
("Marie", "Cram"),
("Anita", "Wöhler"),
]
@pytest.fixture(scope="session")
def client():
    """Yield a session-scoped TestClient for the app with a pre-seeded matcher.

    A PersonMatcher loaded with ``_TEST_PERSONS`` is installed via
    ``set_person_matcher`` before the application is imported and started.
    """
    seeded_matcher = PersonMatcher()
    seeded_matcher.load(_TEST_PERSONS)
    # Install the seeded matcher first so the app's lifespan does not
    # overwrite it with an empty one.
    set_person_matcher(seeded_matcher)

    # Imported only after the matcher is in place (presumably so that app
    # start-up already sees the seeded matcher -- keep this ordering).
    from main import app

    # The context manager keeps the client open for the whole test session
    # and closes it once the last test has finished.
    with TestClient(app) as test_client:
        yield test_client
def test_health(client):
    """GET /health answers 200, reports status "ok" and a positive person count."""
    response = client.get("/health")
    assert response.status_code == 200

    payload = response.json()
    assert payload["status"] == "ok"
    # The session fixture loads _TEST_PERSONS, so the count must be non-zero.
    assert payload["persons_loaded"] > 0
def test_parse_returns_200_with_all_fields(client):
    """POST /parse with a date-only German query returns every expected field."""
    query = "Briefe vor 1920"
    response = client.post("/parse", json={"query": query, "lang": "de"})
    assert response.status_code == 200

    body = response.json()
    assert "personNames" in body
    assert body["personRole"] in ("sender", "receiver", "any")
    # Remaining fields only need to be present; their values are checked below
    # where the query determines them.
    for field in ("dateFrom", "dateTo", "keywords"):
        assert field in body
    # The raw query is echoed back unchanged.
    assert body["rawQuery"] == query
    # "vor 1920" is expected to produce an upper bound at the end of 1920.
    assert body["dateTo"] == "1920-12-31"
def test_parse_person_with_date(client):
    """A query naming two known persons and a year yields both names and the full-year range."""
    payload = {
        "query": "Briefe von Clara Cram an Walter de Gruyter im Jahr 1920",
        "lang": "de",
    }
    response = client.post("/parse", json=payload)
    assert response.status_code == 200

    parsed = response.json()
    # Both persons mentioned in the query must be reported by full name.
    for expected_name in ("Clara Cram", "Walter de Gruyter"):
        assert expected_name in parsed["personNames"]
    # "im Jahr 1920" is expected to span the whole calendar year.
    assert parsed["dateFrom"] == "1920-01-01"
    assert parsed["dateTo"] == "1920-12-31"
def test_parse_unknown_lang_returns_422(client):
    """Posting the language code "fr" to /parse must be rejected with HTTP 422."""
    request_body = {"query": "test", "lang": "fr"}
    response = client.post("/parse", json=request_body)
    assert response.status_code == 422
def test_parse_missing_query_returns_422(client):
    """Posting to /parse without a "query" field must be rejected with HTTP 422."""
    request_body = {"lang": "de"}
    response = client.post("/parse", json=request_body)
    assert response.status_code == 422
def test_parse_all_languages(client):
    """Check that "before 1920" parses to the same upper date bound in de, en and es.

    Every language is exercised even if an earlier one fails: problems are
    collected per language and reported together in a single assertion, so a
    broken "de" case cannot hide the results for "en" and "es".
    """
    cases = [
        ("de", "Briefe vor 1920"),
        ("en", "letters before 1920"),
        ("es", "cartas antes de 1920"),
    ]
    failures = []
    for lang, query in cases:
        response = client.post("/parse", json={"query": query, "lang": lang})
        if response.status_code != 200:
            failures.append(
                f"Failed for lang={lang} (status {response.status_code})"
            )
            # An error response is not expected to carry the parse result,
            # so skip the body check for this language.
            continue
        # .get() keeps a missing key from aborting the remaining languages.
        date_to = response.json().get("dateTo")
        if date_to != "1920-12-31":
            failures.append(f"Wrong dateTo for lang={lang}: {date_to!r}")
    assert not failures, "; ".join(failures)