feat(nlp-service): replace spaCy NER with DB-backed PersonMatcher
Rule-based pipeline: persons are matched via rapidfuzz against all known names, which are loaded from the DB at startup. Fixes first-name-only extraction (Eugenie, Herbert), the merged-span bug (Herbert + Eugenie de Gruyter), false positives on compound nouns, and EN/ES model failures. Date extraction is unchanged (regex). No spaCy models are required.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,19 +1,38 @@
|
||||
import logging
|
||||
"""FastAPI app — /parse and /health endpoints."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
|
||||
from extractor import extract, load_all_models
|
||||
from extractor import extract, get_person_matcher, set_person_matcher
|
||||
from models import ParseRequest, ParseResponse
|
||||
from person_matcher import PersonMatcher
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]:
    """Fetch every ``(first_name, last_name)`` pair from the ``persons`` table.

    Opens a short-lived connection, runs a single SELECT and releases both
    the cursor and the connection before returning, even if the query raises.

    Args:
        db_url: Connection string handed unchanged to ``psycopg2.connect``.

    Returns:
        The rows exactly as produced by ``cursor.fetchall()``. Per the
        return annotation, either element of a row may be ``None``.
    """
    import psycopg2  # deferred — not available in test environments without a DB

    conn = psycopg2.connect(db_url)
    try:
        # The cursor context manager closes the cursor on exit, so it is
        # released explicitly instead of relying on conn.close() alone.
        with conn.cursor() as cur:
            cur.execute("SELECT first_name, last_name FROM persons")
            return cur.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Install a ``PersonMatcher`` before the app starts serving requests.

    If no matcher has been registered yet, a new one is created. When the
    ``DATABASE_URL`` environment variable is set, the matcher is populated
    with the rows returned by ``_load_persons_from_db``; otherwise it is
    registered without any names loaded and a warning is logged.

    No spaCy models are loaded here: the commit this code belongs to
    replaces the spaCy pipeline, so the earlier ``load_all_models()`` call
    and its log lines are not part of the startup sequence.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        Nothing; control returns to FastAPI once the matcher is in place.
    """
    # Only initialise the matcher when nothing was pre-seeded (e.g., by tests).
    if get_person_matcher() is None:
        matcher = PersonMatcher()
        db_url = os.environ.get("DATABASE_URL")
        if db_url:
            matcher.load(_load_persons_from_db(db_url))
        else:
            # Make the "no names loaded" state visible instead of silent.
            logger.warning(
                "DATABASE_URL is not set; PersonMatcher starts with no names loaded"
            )
        set_person_matcher(matcher)
    yield
|
||||
|
||||
|
||||
@@ -22,7 +41,8 @@ app = FastAPI(lifespan=lifespan)
|
||||
|
||||
@app.get("/health")
def health() -> dict:
    """Report liveness and how many persons the matcher currently holds.

    Returns:
        A dict with ``"status"`` fixed to ``"ok"`` and ``"persons_loaded"``
        set to ``len()`` of the registered matcher, or ``0`` when no matcher
        has been registered.
    """
    # The stale pre-change `return {"status": "ok"}` is dropped: it made the
    # persons_loaded report below unreachable.
    matcher = get_person_matcher()
    persons_loaded = len(matcher) if matcher is not None else 0
    return {"status": "ok", "persons_loaded": persons_loaded}
|
||||
|
||||
|
||||
@app.post("/parse", response_model=ParseResponse)
|
||||
@@ -30,5 +50,4 @@ def parse(request: ParseRequest) -> ParseResponse:
|
||||
try:
|
||||
return extract(request.query, request.lang)
|
||||
except Exception as exc:
|
||||
logger.exception("Extraction pipeline failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
||||
|
||||
Reference in New Issue
Block a user