feat(nlp-service): wire NLP_FUZZY_THRESHOLD env var with 0-100 validation
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,9 +13,10 @@ from person_matcher import PersonMatcher
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# ── Module-level PersonMatcher (set at startup) ───────────────────────────────
|
# ── Module-level PersonMatcher and fuzzy threshold (set at startup) ──────────
|
||||||
|
|
||||||
_matcher: PersonMatcher | None = None
|
_matcher: PersonMatcher | None = None
|
||||||
|
# Fuzzy-match threshold handed to find_in_query() as `threshold=` in
# _extract_persons_and_role; overwritten through set_fuzzy_threshold().
_fuzzy_threshold: int = 80
|
||||||
|
|
||||||
|
|
||||||
def set_person_matcher(m: PersonMatcher) -> None:
|
def set_person_matcher(m: PersonMatcher) -> None:
|
||||||
@@ -27,6 +28,11 @@ def get_person_matcher() -> PersonMatcher | None:
|
|||||||
return _matcher
|
return _matcher
|
||||||
|
|
||||||
|
|
||||||
|
def set_fuzzy_threshold(threshold: int) -> None:
    """Replace the module-level fuzzy-match threshold.

    The value is stored as-is; this function performs no range validation.

    Args:
        threshold: New value for the module-level ``_fuzzy_threshold``.
    """
    global _fuzzy_threshold
    _fuzzy_threshold = threshold
|
||||||
|
|
||||||
|
|
||||||
# ── Preposition sets ──────────────────────────────────────────────────────────
|
# ── Preposition sets ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
_SENDER_PREPS: dict[str, frozenset[str]] = {
|
_SENDER_PREPS: dict[str, frozenset[str]] = {
|
||||||
@@ -155,7 +161,7 @@ def _extract_persons_and_role(
|
|||||||
|
|
||||||
preps = _ALL_PERSON_PREPS[lang]
|
preps = _ALL_PERSON_PREPS[lang]
|
||||||
stops = preps | _DATE_BEFORE[lang] | _DATE_AFTER[lang] | _DATE_BETWEEN[lang] | _EXTRA_SPAN_STOPS[lang]
|
stops = preps | _DATE_BEFORE[lang] | _DATE_AFTER[lang] | _DATE_BETWEEN[lang] | _EXTRA_SPAN_STOPS[lang]
|
||||||
matches = m.find_in_query(query, preps, stop_tokens=stops)
|
matches = m.find_in_query(query, preps, stop_tokens=stops, threshold=_fuzzy_threshold)
|
||||||
|
|
||||||
person_names = [text for text, _ in matches]
|
person_names = [text for text, _ in matches]
|
||||||
|
|
||||||
|
|||||||
@@ -9,10 +9,23 @@ from fastapi import FastAPI, HTTPException
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
from extractor import extract, get_person_matcher, set_person_matcher
|
from extractor import extract, get_person_matcher, set_fuzzy_threshold, set_person_matcher
|
||||||
from models import ParseRequest, ParseResponse
|
from models import ParseRequest, ParseResponse
|
||||||
from person_matcher import PersonMatcher
|
from person_matcher import PersonMatcher
|
||||||
|
|
||||||
|
# Fallback used when the NLP_FUZZY_THRESHOLD environment variable is unset.
_DEFAULT_FUZZY_THRESHOLD = 80
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_fuzzy_threshold(val: str) -> int:
    """Parse and validate NLP_FUZZY_THRESHOLD — must be integer in [0, 100].

    Args:
        val: Raw setting value, normally the string read from the environment.

    Returns:
        The threshold as an ``int`` in the inclusive range 0..100.

    Raises:
        ValueError: If *val* cannot be converted to an integer, or if the
            converted value lies outside [0, 100].
    """
    try:
        n = int(val)
    except (TypeError, ValueError) as exc:
        # Chain explicitly so the traceback keeps the root cause; TypeError
        # (e.g. val is None) is folded into the documented ValueError.
        raise ValueError(
            f"NLP_FUZZY_THRESHOLD must be an integer, got: {val!r}"
        ) from exc
    if not 0 <= n <= 100:
        raise ValueError(f"NLP_FUZZY_THRESHOLD must be between 0 and 100, got: {n}")
    return n
|
||||||
|
|
||||||
|
|
||||||
def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]:
|
def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]:
|
||||||
import psycopg2 # deferred — not available in test environments without a DB
|
import psycopg2 # deferred — not available in test environments without a DB
|
||||||
@@ -28,6 +41,10 @@ def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]:
|
|||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
|
threshold_raw = os.environ.get("NLP_FUZZY_THRESHOLD", str(_DEFAULT_FUZZY_THRESHOLD))
|
||||||
|
threshold = _parse_fuzzy_threshold(threshold_raw)
|
||||||
|
set_fuzzy_threshold(threshold)
|
||||||
|
|
||||||
# Only initialise the matcher when nothing was pre-seeded (e.g., by tests).
|
# Only initialise the matcher when nothing was pre-seeded (e.g., by tests).
|
||||||
if get_person_matcher() is None:
|
if get_person_matcher() is None:
|
||||||
m = PersonMatcher()
|
m = PersonMatcher()
|
||||||
|
|||||||
@@ -81,6 +81,23 @@ def test_parse_all_languages(client):
|
|||||||
assert r.json()["dateTo"] == "1920-12-31", f"Wrong dateTo for lang={lang}"
|
assert r.json()["dateTo"] == "1920-12-31", f"Wrong dateTo for lang={lang}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fuzzy_threshold_valid_range():
    """Both boundaries and a mid-range value parse to the matching integer."""
    from main import _parse_fuzzy_threshold

    for raw, expected in (("80", 80), ("0", 0), ("100", 100)):
        assert _parse_fuzzy_threshold(raw) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_fuzzy_threshold_out_of_range_raises():
    """Values above 100, below 0, or non-numeric are rejected with ValueError."""
    from main import _parse_fuzzy_threshold

    for invalid in ("101", "-1", "abc"):
        with pytest.raises(ValueError):
            _parse_fuzzy_threshold(invalid)
|
||||||
|
|
||||||
|
|
||||||
def test_parse_exceeds_max_length_returns_422(client):
|
def test_parse_exceeds_max_length_returns_422(client):
|
||||||
r = client.post("/parse", json={"query": "a" * 501, "lang": "de"})
|
r = client.post("/parse", json={"query": "a" * 501, "lang": "de"})
|
||||||
assert r.status_code == 422
|
assert r.status_code == 422
|
||||||
|
|||||||
Reference in New Issue
Block a user