From 98ee6cf5871848e460c9ad2698c6d44e708ebbfa Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 7 Jun 2026 15:48:57 +0200 Subject: [PATCH] feat(nlp-service): wire NLP_FUZZY_THRESHOLD env var with 0-100 validation Co-Authored-By: Claude Sonnet 4.6 --- nlp-service/extractor.py | 10 ++++++++-- nlp-service/main.py | 19 ++++++++++++++++++- nlp-service/test_main.py | 17 +++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/nlp-service/extractor.py b/nlp-service/extractor.py index f4b16adf..de884910 100644 --- a/nlp-service/extractor.py +++ b/nlp-service/extractor.py @@ -13,9 +13,10 @@ from person_matcher import PersonMatcher if TYPE_CHECKING: pass -# ── Module-level PersonMatcher (set at startup) ─────────────────────────────── +# ── Module-level PersonMatcher and fuzzy threshold (set at startup) ────────── _matcher: PersonMatcher | None = None +_fuzzy_threshold: int = 80 def set_person_matcher(m: PersonMatcher) -> None: @@ -27,6 +28,11 @@ def get_person_matcher() -> PersonMatcher | None: return _matcher +def set_fuzzy_threshold(threshold: int) -> None: + global _fuzzy_threshold + _fuzzy_threshold = threshold + + # ── Preposition sets ────────────────────────────────────────────────────────── _SENDER_PREPS: dict[str, frozenset[str]] = { @@ -155,7 +161,7 @@ def _extract_persons_and_role( preps = _ALL_PERSON_PREPS[lang] stops = preps | _DATE_BEFORE[lang] | _DATE_AFTER[lang] | _DATE_BETWEEN[lang] | _EXTRA_SPAN_STOPS[lang] - matches = m.find_in_query(query, preps, stop_tokens=stops) + matches = m.find_in_query(query, preps, stop_tokens=stops, threshold=_fuzzy_threshold) person_names = [text for text, _ in matches] diff --git a/nlp-service/main.py b/nlp-service/main.py index 52533188..509ae7b1 100644 --- a/nlp-service/main.py +++ b/nlp-service/main.py @@ -9,10 +9,23 @@ from fastapi import FastAPI, HTTPException logger = logging.getLogger(__name__) -from extractor import extract, get_person_matcher, set_person_matcher +from extractor import extract, get_person_matcher, set_fuzzy_threshold, set_person_matcher from models import ParseRequest, ParseResponse from person_matcher import PersonMatcher +_DEFAULT_FUZZY_THRESHOLD = 80 + + +def _parse_fuzzy_threshold(val: str) -> int: + """Parse and validate NLP_FUZZY_THRESHOLD — must be integer in [0, 100].""" + try: + n = int(val) + except ValueError: + raise ValueError(f"NLP_FUZZY_THRESHOLD must be an integer, got: {val!r}") + if not (0 <= n <= 100): + raise ValueError(f"NLP_FUZZY_THRESHOLD must be between 0 and 100, got: {n}") + return n + def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]: import psycopg2 # deferred — not available in test environments without a DB @@ -28,6 +41,10 @@ def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]: @asynccontextmanager async def lifespan(app: FastAPI): + threshold_raw = os.environ.get("NLP_FUZZY_THRESHOLD", str(_DEFAULT_FUZZY_THRESHOLD)) + threshold = _parse_fuzzy_threshold(threshold_raw) + set_fuzzy_threshold(threshold) + # Only initialise the matcher when nothing was pre-seeded (e.g., by tests). if get_person_matcher() is None: m = PersonMatcher() diff --git a/nlp-service/test_main.py b/nlp-service/test_main.py index 31ec4766..5a81156a 100644 --- a/nlp-service/test_main.py +++ b/nlp-service/test_main.py @@ -81,6 +81,23 @@ def test_parse_all_languages(client): assert r.json()["dateTo"] == "1920-12-31", f"Wrong dateTo for lang={lang}" +def test_fuzzy_threshold_valid_range(): + from main import _parse_fuzzy_threshold + assert _parse_fuzzy_threshold("80") == 80 + assert _parse_fuzzy_threshold("0") == 0 + assert _parse_fuzzy_threshold("100") == 100 + + +def test_fuzzy_threshold_out_of_range_raises(): + from main import _parse_fuzzy_threshold + with pytest.raises(ValueError): + _parse_fuzzy_threshold("101") + with pytest.raises(ValueError): + _parse_fuzzy_threshold("-1") + with pytest.raises(ValueError): + _parse_fuzzy_threshold("abc") + + def test_parse_exceeds_max_length_returns_422(client): r = client.post("/parse", json={"query": "a" * 501, "lang": "de"}) assert r.status_code == 422