fix(nlp-service): eliminate false-positive person matches from dirty DB records

- Wire _EXTRA_SPAN_STOPS into _extract_persons_and_role so German function
  words (im, seine, ihre, dem, …) terminate name spans — fixes "Clara im"
  and "seine Kinder" leaking into personNames
- Add _NON_NAME_TOKENS filter in PersonMatcher.load() to skip DB records
  whose first_name contains prepositions or possessives — filters 290 bad
  records (annotations like "an seine Eltern", "Eltern in", place references
  like "Enkel Cram aus Mexiko") that were causing exact Pass-2 matches
- Remove spaCy model downloads from Dockerfile (no longer needed after the
  DB-backed matcher rewrite)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-06-07 11:09:35 +02:00
committed by marcel
parent 03d7d44e57
commit bda7855cad
3 changed files with 37 additions and 6 deletions

View File

@@ -8,6 +8,22 @@ from rapidfuzz import fuzz, process
# Matches any single character that is not a word character, whitespace, or a
# hyphen. re.UNICODE is already the default for str patterns in Python 3, so
# the flag is redundant but harmless.
_PUNCT_RE = re.compile(r"[^\w\s\-]", re.UNICODE)
# Matches a string made up of four digits (presumably a year, going by the
# name); note that `$` also tolerates a single trailing newline.
_YEAR_PAT = re.compile(r"^\d{4}$")
# Tokens that cannot appear in a real person's first name — used to filter DB
# records that are annotations or descriptions masquerading as persons.
_NON_NAME_TOKENS: frozenset[str] = frozenset({
# German prepositions
"an", "in", "im", "am", "aus", "von", "vom", "nach", "zu", "zum", "zur",
"für", "bei", "beim", "mit", "über", "unter", "durch", "gegen", "ohne",
"bis", "seit", "des", "dem", "den",
# German possessives / pronouns
"sein", "seine", "seinen", "seiner",
"ihr", "ihre", "ihren", "ihrem",
# English prepositions
"for", "from", "by", "of",
# Spanish prepositions
"del", "por", "para",
})
class PersonMatcher:
"""Match person name fragments from free-text queries against known persons.
@@ -30,6 +46,10 @@ class PersonMatcher:
for first, last in rows:
first = (first or "").strip()
last = (last or "").strip()
# Skip records whose first_name contains function words — these are
# annotations or descriptions in the DB, not real person names.
if any(w in _NON_NAME_TOKENS for w in first.lower().split()):
continue
for variant in _name_variants(first, last):
key = variant.lower()
if key not in seen: