feat(nlp-service): wire NLP_FUZZY_THRESHOLD env var with 0-100 validation

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-07 15:48:57 +02:00
parent 778382cd61
commit 98ee6cf587
3 changed files with 43 additions and 3 deletions
--- a/nlp-service/extractor.py
+++ b/nlp-service/extractor.py
@@ -13,9 +13,10 @@ from person_matcher import PersonMatcher
 if TYPE_CHECKING:
    pass

-# ── Module-level PersonMatcher (set at startup) ───────────────────────────────
+# ── Module-level PersonMatcher and fuzzy threshold (set at startup) ──────────

 _matcher: PersonMatcher | None = None
+_fuzzy_threshold: int = 80  # in effect until set_fuzzy_threshold() replaces it


 def set_person_matcher(m: PersonMatcher) -> None:
@@ -27,6 +28,11 @@ def get_person_matcher() -> PersonMatcher | None:
    return _matcher


+def set_fuzzy_threshold(threshold: int) -> None:  # stores the value as given; no range check is done here
+    global _fuzzy_threshold  # rebind the module-level name instead of creating a local
+    _fuzzy_threshold = threshold  # later passed as threshold= to find_in_query in _extract_persons_and_role
+
+
 # ── Preposition sets ──────────────────────────────────────────────────────────

 _SENDER_PREPS: dict[str, frozenset[str]] = {
@@ -155,7 +161,7 @@ def _extract_persons_and_role(

    preps = _ALL_PERSON_PREPS[lang]
    stops = preps | _DATE_BEFORE[lang] | _DATE_AFTER[lang] | _DATE_BETWEEN[lang] | _EXTRA_SPAN_STOPS[lang]
-    matches = m.find_in_query(query, preps, stop_tokens=stops)
+    matches = m.find_in_query(query, preps, stop_tokens=stops, threshold=_fuzzy_threshold)

    person_names = [text for text, _ in matches]

--- a/nlp-service/main.py
+++ b/nlp-service/main.py
@@ -9,10 +9,23 @@ from fastapi import FastAPI, HTTPException

 logger = logging.getLogger(__name__)

-from extractor import extract, get_person_matcher, set_person_matcher
+from extractor import extract, get_person_matcher, set_fuzzy_threshold, set_person_matcher
 from models import ParseRequest, ParseResponse
 from person_matcher import PersonMatcher

+_DEFAULT_FUZZY_THRESHOLD = 80  # fallback used by lifespan() when NLP_FUZZY_THRESHOLD is not set
+
+
+def _parse_fuzzy_threshold(val: str) -> int:
+    """Parse NLP_FUZZY_THRESHOLD; raise ValueError unless it is an integer in [0, 100]."""
+    try:
+        n = int(val)
+    except ValueError:  # our message already quotes the bad value, so drop int()'s own error
+        raise ValueError(f"NLP_FUZZY_THRESHOLD must be an integer, got: {val!r}") from None
+    if not (0 <= n <= 100):  # both bounds are inclusive
+        raise ValueError(f"NLP_FUZZY_THRESHOLD must be between 0 and 100, got: {n}")
+    return n
+

 def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]:
    import psycopg2  # deferred — not available in test environments without a DB
@@ -28,6 +41,10 @@ def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]:

@asynccontextmanager
 async def lifespan(app: FastAPI):
+    threshold_raw = os.environ.get("NLP_FUZZY_THRESHOLD", str(_DEFAULT_FUZZY_THRESHOLD))
+    threshold = _parse_fuzzy_threshold(threshold_raw)
+    set_fuzzy_threshold(threshold)
+
    # Only initialise the matcher when nothing was pre-seeded (e.g., by tests).
    if get_person_matcher() is None:
        m = PersonMatcher()
--- a/nlp-service/test_main.py
+++ b/nlp-service/test_main.py
@@ -81,6 +81,23 @@ def test_parse_all_languages(client):
        assert r.json()["dateTo"] == "1920-12-31", f"Wrong dateTo for lang={lang}"


+def test_fuzzy_threshold_valid_range():
+    from main import _parse_fuzzy_threshold
+    # The value 80 plus both inclusive ends of the accepted range.
+    for raw, expected in (("80", 80), ("0", 0), ("100", 100)):
+        assert _parse_fuzzy_threshold(raw) == expected
+
+
+def test_fuzzy_threshold_out_of_range_raises():
+    """Inputs above 100, below 0, or not an integer must raise ValueError."""
+    from main import _parse_fuzzy_threshold
+    # One step past each bound, then a string that is not a number at all.
+    rejected = ("101", "-1", "abc")
+    for raw in rejected:
+        with pytest.raises(ValueError):
+            _parse_fuzzy_threshold(raw)
+
+
 def test_parse_exceeds_max_length_returns_422(client):
    r = client.post("/parse", json={"query": "a" * 501, "lang": "de"})
    assert r.status_code == 422