Files
familienarchiv/nlp-service/main.py
Marcel 6c5cf8ec9b feat(nlp-service): replace spaCy NER with DB-backed PersonMatcher
Rule-based pipeline: persons matched via rapidfuzz against all known
names loaded from DB at startup. Fixes first-name-only extraction
(Eugenie, Herbert), merged-span bug (Herbert + Eugenie de Gruyter),
false positives on compound nouns, and EN/ES model failures.
Date extraction unchanged (regex). No spaCy models required.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-07 11:00:03 +02:00

54 lines
1.5 KiB
Python

"""FastAPI app — /parse and /health endpoints."""
from __future__ import annotations
import os
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from extractor import extract, get_person_matcher, set_person_matcher
from models import ParseRequest, ParseResponse
from person_matcher import PersonMatcher
def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]:
    """Fetch every ``(first_name, last_name)`` pair from the ``persons`` table.

    Args:
        db_url: Connection string passed straight to ``psycopg2.connect``.

    Returns:
        The rows produced by ``cursor.fetchall()`` for the query.
    """
    # Imported lazily: psycopg2 is not available in test environments without a DB.
    import psycopg2

    connection = psycopg2.connect(db_url)
    try:
        cursor = connection.cursor()
        cursor.execute("SELECT first_name, last_name FROM persons")
        person_rows = cursor.fetchall()
    finally:
        # Close the connection whether or not the query succeeded.
        connection.close()
    return person_rows
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Prepare the person matcher before the application starts serving.

    When no matcher has been registered yet (tests may pre-seed one), a new
    ``PersonMatcher`` is created. If the ``DATABASE_URL`` environment variable
    is set to a non-empty value, the matcher is loaded with the rows read from
    that database; otherwise it is registered without loading anything.

    The database read is an ordinary synchronous call made from this coroutine.

    Args:
        app: The FastAPI application this lifespan is attached to (not used
            in the body).

    Yields:
        Nothing; control is handed back once the matcher setup is done.
    """
    # A matcher that is already present (e.g., pre-seeded by tests) is left untouched.
    if get_person_matcher() is None:
        matcher = PersonMatcher()
        database_url = os.environ.get("DATABASE_URL")
        if database_url:
            matcher.load(_load_persons_from_db(database_url))
        set_person_matcher(matcher)
    yield
# Application instance; the `lifespan` context manager from this module is
# registered as its lifespan handler.
app = FastAPI(lifespan=lifespan)
@app.get("/health")
def health() -> dict:
    """Return a status payload for the service.

    Returns:
        A dict with ``"status"`` fixed to ``"ok"`` and ``"persons_loaded"`` set
        to ``len()`` of the current person matcher, or ``0`` when the matcher
        is missing or falsy.
    """
    matcher = get_person_matcher()
    if matcher:
        loaded = len(matcher)
    else:
        loaded = 0
    return {"status": "ok", "persons_loaded": loaded}
@app.post("/parse", response_model=ParseResponse)
def parse(request: ParseRequest) -> ParseResponse:
    """Run the extractor on the request's query and language.

    Args:
        request: Parsed request body; its ``query`` and ``lang`` fields are
            forwarded to ``extract``.

    Returns:
        Whatever ``extract`` returns for the given query and language.

    Raises:
        HTTPException: With status 500 when ``extract`` raises any
            ``Exception``; the detail is the exception's string form and the
            original exception is chained as the cause.
    """
    try:
        result = extract(request.query, request.lang)
    except Exception as err:
        # NOTE(review): str(err) is returned to the client verbatim — confirm
        # it cannot expose internal details.
        raise HTTPException(status_code=500, detail=str(err)) from err
    return result