Rule-based pipeline: persons matched via rapidfuzz against all known names loaded from DB at startup. Fixes first-name-only extraction (Eugenie, Herbert), merged-span bug (Herbert + Eugenie de Gruyter), false positives on compound nouns, and EN/ES model failures. Date extraction unchanged (regex). No spaCy models required. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
54 lines
1.5 KiB
Python
54 lines
1.5 KiB
Python
"""FastAPI app — /parse and /health endpoints."""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from contextlib import asynccontextmanager
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
|
|
from extractor import extract, get_person_matcher, set_person_matcher
|
|
from models import ParseRequest, ParseResponse
|
|
from person_matcher import PersonMatcher
|
|
|
|
|
|
def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]:
    """Fetch the name pair of every row in the ``persons`` table.

    Opens a short-lived connection to ``db_url``, selects the ``first_name``
    and ``last_name`` columns, and closes both the cursor and the connection
    before returning — also when the query raises.

    Args:
        db_url: Connection string passed unchanged to ``psycopg2.connect``.

    Returns:
        The rows exactly as returned by ``cursor.fetchall()``: one
        ``(first_name, last_name)`` pair per record.

    Raises:
        ImportError: If ``psycopg2`` is not installed.
        Exception: Any error raised by psycopg2 while connecting or querying
            propagates to the caller.
    """
    import psycopg2  # deferred — not available in test environments without a DB

    conn = psycopg2.connect(db_url)
    try:
        # The cursor context manager closes the cursor on exit. The connection
        # still needs the explicit close below: psycopg2's `with conn:` only
        # ends the transaction, it does not close the connection.
        with conn.cursor() as cur:
            cur.execute("SELECT first_name, last_name FROM persons")
            return cur.fetchall()
    finally:
        conn.close()
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Make sure a person matcher is registered before the app serves requests.

    A matcher that is already registered (e.g. pre-seeded by tests) is left
    untouched. Otherwise a new ``PersonMatcher`` is registered; it is filled
    from the database only when ``DATABASE_URL`` is set to a non-empty value.
    Nothing runs after the ``yield``.
    """
    matcher_missing = get_person_matcher() is None
    if matcher_missing:
        matcher = PersonMatcher()
        database_url = os.getenv("DATABASE_URL")
        if database_url:
            # Populate the matcher with the known names from the DB.
            matcher.load(_load_persons_from_db(database_url))
        set_person_matcher(matcher)
    yield
|
|
|
|
|
|
# Application instance; `lifespan` is handed to FastAPI so that its setup code
# (everything before its `yield`) runs when the app starts.
app = FastAPI(lifespan=lifespan)
|
|
|
|
|
|
@app.get("/health")
def health() -> dict:
    """Report liveness together with the size of the registered matcher.

    ``persons_loaded`` is ``len(matcher)``; it is 0 when no matcher is
    registered or the registered matcher is falsy.
    """
    matcher = get_person_matcher()
    loaded = 0
    if matcher:
        loaded = len(matcher)
    return {"status": "ok", "persons_loaded": loaded}
|
|
|
|
|
|
@app.post("/parse", response_model=ParseResponse)
def parse(request: ParseRequest) -> ParseResponse:
    """Run extraction on the request and return its result.

    Args:
        request: Parsed request body; its ``query`` and ``lang`` fields are
            forwarded to ``extract``.

    Returns:
        Whatever ``extract(request.query, request.lang)`` returns.

    Raises:
        HTTPException: Re-raised unchanged when ``extract`` itself raises one;
            any other exception is converted into a 500 response whose detail
            is ``str(exc)``.
    """
    try:
        return extract(request.query, request.lang)
    except HTTPException:
        # A deliberate HTTP error from downstream keeps its own status code
        # and detail instead of being rewritten as a generic 500.
        raise
    except Exception as exc:
        # NOTE(review): the detail exposes the raw exception text to clients —
        # confirm this is acceptable for externally reachable deployments.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|