Files
familienarchiv/nlp-service/person_matcher.py
Marcel bda7855cad fix(nlp-service): eliminate false-positive person matches from dirty DB records
- Wire _EXTRA_SPAN_STOPS into _extract_persons_and_role so German function
  words (im, seine, ihre, dem, …) terminate name spans — fixes "Clara im"
  and "seine Kinder" leaking into personNames
- Add _NON_NAME_TOKENS filter in PersonMatcher.load() to skip DB records
  whose first_name contains prepositions or possessives — filters 290 bad
  records (annotations like "an seine Eltern", "Eltern in", place references
  like "Enkel Cram aus Mexiko") that were causing exact Pass-2 matches
- Remove spaCy model downloads from Dockerfile (no longer needed after the
  DB-backed matcher rewrite)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 10:56:32 +02:00

185 lines
7.6 KiB
Python

"""DB-backed person name matcher with fuzzy search."""
from __future__ import annotations
import re
from rapidfuzz import fuzz, process
# Removes every character that is not a word character, whitespace or a
# hyphen; applied to query tokens before they are compared against names.
_PUNCT_RE = re.compile(r"[^\w\s\-]", re.UNICODE)
# Matches a bare four-digit token, which is treated as a year and therefore
# terminates a name span.
_YEAR_PAT = re.compile(r"^\d{4}$")

# Tokens that cannot appear in a real person's first name — used to filter DB
# records that are annotations or descriptions masquerading as persons.
# The possessive groups list every case form (nominative through genitive) so
# that e.g. "mit seinem Bruder" is rejected just like "an seine Eltern".
_NON_NAME_TOKENS: frozenset[str] = frozenset({
    # German prepositions / articles fused with prepositions
    "an", "in", "im", "am", "aus", "von", "vom", "nach", "zu", "zum", "zur",
    "für", "bei", "beim", "mit", "über", "unter", "durch", "gegen", "ohne",
    "bis", "seit", "des", "dem", "den",
    # German possessives / pronouns
    "sein", "seine", "seinem", "seinen", "seiner", "seines",
    "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres",
    # English prepositions
    "for", "from", "by", "of",
    # Spanish prepositions
    "del", "por", "para",
})
class PersonMatcher:
    """Match person-name fragments in free-text queries against known persons.

    The index is built by :meth:`load` from ``(first_name, last_name)`` DB
    rows. At query time :meth:`find_in_query` scans for tokens following
    person-indicator prepositions and fuzzy-matches them against the indexed
    name variants, then looks for exact multi-word names in the rest of the
    query. Hits are reported as the original query text (not the resolved DB
    name) so the Java resolveNames() mechanism can do its own disambiguation.
    """

    # Maximum number of tokens collected after a preposition in Pass 1.
    _MAX_SPAN_TOKENS = 3

    def __init__(self) -> None:
        # Lowercase name variants in insertion order; this list is the choice
        # sequence handed to rapidfuzz.
        self._names: list[str] = []
        # Mirror of ``_names`` as a set: O(1) exact-match lookups and
        # de-duplication across repeated load() calls. Must be kept in sync
        # with ``_names``.
        self._name_set: set[str] = set()

    # ── Loading ───────────────────────────────────────────────────────────────

    def load(self, rows: list[tuple[str | None, str | None]]) -> None:
        """Add name variants from DB rows of ``(first_name, last_name)``.

        ``None`` values are treated as empty strings and surrounding
        whitespace is stripped. Variants are stored lowercased. Calling this
        method again only adds variants that are not indexed yet, so a
        repeated load never produces duplicate entries.
        """
        for first, last in rows:
            first = (first or "").strip()
            last = (last or "").strip()
            # Skip records whose first_name contains function words — these are
            # annotations or descriptions in the DB, not real person names.
            if not _NON_NAME_TOKENS.isdisjoint(first.lower().split()):
                continue
            for variant in _name_variants(first, last):
                key = variant.lower()
                if key not in self._name_set:
                    self._name_set.add(key)
                    self._names.append(key)

    def __len__(self) -> int:
        """Return the number of distinct indexed name variants."""
        return len(self._names)

    # ── Query-time matching ───────────────────────────────────────────────────

    def find_in_query(
        self,
        query: str,
        prepositions: frozenset[str],
        stop_tokens: frozenset[str] | None = None,
        threshold: int = 80,
    ) -> list[tuple[str, str | None]]:
        """Find person name spans in *query*.

        Returns a list of ``(original_query_text, anchoring_prep_or_None)``
        in left-to-right order. The text is joined from the raw
        whitespace-split query tokens, so it keeps their original case and
        punctuation.

        Parameters
        ----------
        query:
            Free-text query; tokenised on whitespace.
        prepositions:
            Person-indicator prepositions for the query language (triggers a
            scan for the tokens that follow). They always terminate a name
            span as well, even without explicit *stop_tokens*.
        stop_tokens:
            Tokens that terminate a name span (prepositions + date-direction
            words). "de" is a special exception: when it is a stop token but
            is immediately followed by a capitalised word it is treated as a
            name connector (e.g. "de Gruyter") rather than a stop.
        threshold:
            Minimum rapidfuzz token_sort_ratio score to accept a match.

        Strategy
        --------
        Pass 1 — prep-anchored: for each person-indicator preposition found in
        the token list, collect up to 3 consecutive non-stop, non-year tokens
        after it and fuzzy-match the resulting span against loaded names.
        Longest match wins.

        Pass 2 — full-name scan: scan positions not yet consumed for exact
        multi-word full-name matches (no preposition anchor required).
        Three-token windows are tried before two-token windows.
        """
        tokens = query.split()
        clean = [_PUNCT_RE.sub("", tok) for tok in tokens]
        lower = [tok.lower() for tok in clean]
        stops = (stop_tokens or frozenset()) | prepositions

        consumed: set[int] = set()  # token positions already part of a hit
        hits: list[tuple[int, str, str | None]] = []  # (position, text, prep)

        # Pass 1 — prep-anchored
        for i, prep in enumerate(lower):
            if prep not in prepositions or i + 1 >= len(tokens):
                continue
            span = self._candidate_span(i + 1, clean, lower, stops, consumed)
            # Try the longest span first, then progressively shorter prefixes.
            for span_len in range(len(span), 0, -1):
                idx = span[:span_len]
                if self._is_match(" ".join(lower[k] for k in idx), threshold):
                    hits.append((idx[0], " ".join(tokens[k] for k in idx), prep))
                    consumed.update(idx)
                    break

        # Pass 2 — full multi-word name scan (exact only, no preposition needed)
        for span_len in (3, 2):
            for i in range(len(tokens) - span_len + 1):
                window = range(i, i + span_len)
                if not consumed.isdisjoint(window):
                    continue
                if " ".join(lower[i : i + span_len]) in self._name_set:
                    hits.append((i, " ".join(tokens[i : i + span_len]), None))
                    consumed.update(window)

        hits.sort(key=lambda hit: hit[0])
        return [(text, prep) for _, text, prep in hits]

    # ── Internal helpers ──────────────────────────────────────────────────────

    @staticmethod
    def _candidate_span(
        start: int,
        clean: list[str],
        lower: list[str],
        stops: frozenset[str],
        consumed: set[int],
    ) -> list[int]:
        """Collect the token positions of a candidate name span from *start*.

        Takes at most ``_MAX_SPAN_TOKENS`` consecutive positions and stops
        early at an already consumed position, a stop token or a four-digit
        year.
        """
        span: list[int] = []
        end = min(start + PersonMatcher._MAX_SPAN_TOKENS, len(lower))
        for j in range(start, end):
            if j in consumed:
                break
            tok = lower[j]
            if tok in stops or _YEAR_PAT.match(clean[j]):
                # Allow "de" when the *next* token starts with a capital —
                # e.g. "Walter de Gruyter"; anything else ends the span.
                following = clean[j + 1] if j + 1 < len(clean) else ""
                if not (tok == "de" and following[:1].isupper()):
                    break
            span.append(j)
        return span

    def _is_match(self, text: str, threshold: int) -> bool:
        """Return True if *text* fuzzy-matches any loaded name at >= threshold.

        Always False when nothing is loaded or the stripped text is shorter
        than three characters.
        """
        stripped = text.strip()
        if not self._names or len(stripped) < 3:
            return False
        candidate = stripped.lower()
        if candidate in self._name_set:
            return True  # exact match — fast path, skips the fuzzy search
        result = process.extractOne(
            candidate,
            self._names,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=threshold,
        )
        return result is not None
# ── helpers ───────────────────────────────────────────────────────────────────
def _name_variants(first: str, last: str) -> list[str]:
    """Build the list of strings under which one person is indexed.

    The order is: full name (only when both parts are non-empty), then the
    first name, then the last name. Empty parts are left out.
    """
    full_name = [f"{first} {last}"] if first and last else []
    single_parts = [part for part in (first, last) if part]
    return full_name + single_parts