feat(nlp-service): replace spaCy NER with DB-backed PersonMatcher

Rule-based pipeline: persons matched via rapidfuzz against all known
names loaded from DB at startup. Fixes first-name-only extraction
(Eugenie, Herbert), merged-span bug (Herbert + Eugenie de Gruyter),
false positives on compound nouns, and EN/ES model failures.
Date extraction unchanged (regex). No spaCy models required.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-06-07 11:00:03 +02:00
committed by marcel
parent aa200bf3c5
commit 03d7d44e57
8 changed files with 939 additions and 551 deletions

View File

@@ -1,19 +1,38 @@
import logging
"""FastAPI app — /parse and /health endpoints."""
from __future__ import annotations
import os
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from extractor import extract, load_all_models
from extractor import extract, get_person_matcher, set_person_matcher
from models import ParseRequest, ParseResponse
from person_matcher import PersonMatcher
logger = logging.getLogger(__name__)
def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]:
    """Fetch the name pair of every row in the ``persons`` table.

    Opens a short-lived psycopg2 connection, runs a single SELECT, and
    releases both the cursor and the connection before returning.

    Args:
        db_url: Connection string handed unchanged to ``psycopg2.connect``.

    Returns:
        One ``(first_name, last_name)`` tuple per row, in the order the
        database returns them. Per the return annotation either element
        may be ``None``.

    Raises:
        ImportError: If psycopg2 is not installed.
        psycopg2.Error: If connecting or executing the query fails
            (propagated unchanged from the driver).
    """
    import psycopg2  # deferred — not available in test environments without a DB

    conn = psycopg2.connect(db_url)
    try:
        # The cursor context manager closes the cursor as soon as the rows
        # have been fetched, even if execute()/fetchall() raises.
        with conn.cursor() as cur:
            cur.execute("SELECT first_name, last_name FROM persons")
            return cur.fetchall()
    finally:
        # psycopg2's `with conn:` only ends the transaction; it does not close
        # the connection, so the explicit close() is still required.
        conn.close()
# Lifespan handler for the FastAPI app: everything before `yield` is startup work.
# NOTE(review): this span looks like a diff rendered without +/- markers or
# indentation, so pre-change and post-change lines are interleaved. Confirm the
# real statement set and nesting against the actual file.
@asynccontextmanager
async def lifespan(app: FastAPI):
# NOTE(review): the next three lines load spaCy models via load_all_models();
# the commit message says spaCy models are no longer required, so these are
# presumably the removed (old) body — confirm.
logger.info("Loading spaCy models...")
load_all_models()
logger.info("All models ready.")
# Only initialise the matcher when nothing was pre-seeded (e.g., by tests).
if get_person_matcher() is None:
m = PersonMatcher()
# The database is only queried when DATABASE_URL is set in the environment.
db_url = os.environ.get("DATABASE_URL")
if db_url:
rows = _load_persons_from_db(db_url)
m.load(rows)
# NOTE(review): with indentation lost it is unclear whether this call sits
# inside `if db_url:` or runs whenever a new matcher was created — confirm.
set_person_matcher(m)
yield
@@ -22,7 +41,8 @@ app = FastAPI(lifespan=lifespan)
# GET /health: returns a small status dict.
@app.get("/health")
def health() -> dict:
# NOTE(review): two return statements appear back to back; the first is
# presumably the removed (old) line of the diff and the pair below its
# replacement — confirm against the actual file.
return {"status": "ok"}
# "persons_loaded" is len(m) for the current matcher, or 0 when the matcher is
# None or otherwise falsy (presumably len counts loaded persons — verify in
# PersonMatcher).
m = get_person_matcher()
return {"status": "ok", "persons_loaded": len(m) if m else 0}
# POST /parse: calls extract() with the request's query and lang and returns its
# result; any exception is converted into an HTTP 500 response.
# NOTE(review): the `def parse(...)` line is only visible inside the diff hunk
# header below, not as a source line of its own.
@app.post("/parse", response_model=ParseResponse)
@@ -30,5 +50,4 @@ def parse(request: ParseRequest) -> ParseResponse:
try:
return extract(request.query, request.lang)
except Exception as exc:
# NOTE(review): the hunk header reports 5 old lines vs 4 new ones, so this
# logging call may be the line the commit removed — confirm.
logger.exception("Extraction pipeline failed")
# The exception's text is sent to the client as the response detail; `from exc`
# keeps the original exception chained as the cause.
raise HTTPException(status_code=500, detail=str(exc)) from exc