feat(nlp-service): replace spaCy NER with DB-backed PersonMatcher
Rule-based pipeline: persons matched via rapidfuzz against all known names loaded from DB at startup. Fixes first-name-only extraction (Eugenie, Herbert), merged-span bug (Herbert + Eugenie de Gruyter), false positives on compound nouns, and EN/ES model failures. Date extraction unchanged (regex). No spaCy models required. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,43 +1,72 @@
|
||||
"""Integration tests for the FastAPI app."""
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from extractor import set_person_matcher
|
||||
from person_matcher import PersonMatcher
|
||||
|
||||
_TEST_PERSONS = [
|
||||
("Clara", "Cram"),
|
||||
("Herbert", "Cram"),
|
||||
("Eugenie", "de Gruyter"),
|
||||
("Walter", "de Gruyter"),
|
||||
("Marie", "Cram"),
|
||||
("Anita", "Wöhler"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def client():
    """Yield a session-scoped TestClient for the app with a pre-seeded matcher.

    A PersonMatcher loaded with ``_TEST_PERSONS`` is registered through
    ``set_person_matcher`` before the application is imported and started,
    so the tests run against a known, non-empty set of persons.
    """
    # Pre-seed the matcher so the lifespan doesn't overwrite it with an empty one.
    matcher = PersonMatcher()
    matcher.load(_TEST_PERSONS)
    set_person_matcher(matcher)

    # Imported here rather than at module level; presumably so the seeded
    # matcher is in place before the app module is loaded (verify against main).
    from main import app

    # The context manager form runs the app's startup/shutdown (lifespan)
    # around the whole test session.
    with TestClient(app) as test_client:
        yield test_client
|
||||
|
||||
|
||||
def test_health(client):
    """GET /health returns 200, status "ok", and a positive persons_loaded count."""
    r = client.get("/health")
    assert r.status_code == 200

    # The payload carries more than {"status": "ok"} (it also reports
    # persons_loaded), so check individual keys instead of strict equality.
    body = r.json()
    assert body["status"] == "ok"
    assert body["persons_loaded"] > 0
|
||||
|
||||
|
||||
def test_parse_returns_200_with_all_fields(client):
    """POST /parse with a German date query returns 200 and every response field."""
    r = client.post("/parse", json={"query": "Briefe vor 1920", "lang": "de"})
    assert r.status_code == 200

    d = r.json()
    # Presence checks for fields whose value is not pinned by this query.
    assert "personNames" in d
    assert "dateFrom" in d
    assert "dateTo" in d
    assert "keywords" in d

    # Value checks: role is one of the allowed values, the query is echoed
    # back unchanged, and "vor 1920" yields an upper bound at the end of 1920.
    assert d["personRole"] in ("sender", "receiver", "any")
    assert d["rawQuery"] == "Briefe vor 1920"
    assert d["dateTo"] == "1920-12-31"
|
||||
|
||||
|
||||
def test_parse_person_with_date(client):
    """A query naming two known persons and a year yields both names and the year range."""
    r = client.post(
        "/parse",
        json={
            "query": "Briefe von Clara Cram an Walter de Gruyter im Jahr 1920",
            "lang": "de",
        },
    )
    assert r.status_code == 200

    d = r.json()
    # Both persons are expected to be reported by their full names.
    assert "Clara Cram" in d["personNames"]
    assert "Walter de Gruyter" in d["personNames"]
    # "im Jahr 1920" is expected to expand to the full calendar year.
    assert d["dateFrom"] == "1920-01-01"
    assert d["dateTo"] == "1920-12-31"
|
||||
|
||||
|
||||
def test_parse_unknown_lang_returns_422(client):
    """POST /parse with lang "fr" is rejected with HTTP 422."""
    r = client.post("/parse", json={"query": "test", "lang": "fr"})
    assert r.status_code == 422
|
||||
|
||||
|
||||
def test_parse_missing_query_returns_422(client):
    """POST /parse without a "query" field is rejected with HTTP 422."""
    r = client.post("/parse", json={"lang": "de"})
    assert r.status_code == 422
|
||||
|
||||
|
||||
def test_parse_all_languages(client):
|
||||
@@ -47,6 +76,6 @@ def test_parse_all_languages(client):
|
||||
("es", "cartas antes de 1920"),
|
||||
]
|
||||
for lang, query in cases:
|
||||
response = client.post("/parse", json={"query": query, "lang": lang})
|
||||
assert response.status_code == 200, f"Failed for lang={lang}"
|
||||
assert response.json()["dateTo"] == "1920-12-31", f"Wrong dateTo for lang={lang}"
|
||||
r = client.post("/parse", json={"query": query, "lang": lang})
|
||||
assert r.status_code == 200, f"Failed for lang={lang}"
|
||||
assert r.json()["dateTo"] == "1920-12-31", f"Wrong dateTo for lang={lang}"
|
||||
|
||||
Reference in New Issue
Block a user