fix(nlp-service): eliminate false-positive person matches from dirty DB records
- Wire _EXTRA_SPAN_STOPS into _extract_persons_and_role so German function words (im, seine, ihre, dem, …) terminate name spans — fixes "Clara im" and "seine Kinder" leaking into personNames
- Add _NON_NAME_TOKENS filter in PersonMatcher.load() to skip DB records whose first_name contains prepositions or possessives — filters 290 bad records (annotations like "an seine Eltern", "Eltern in", place references like "Enkel Cram aus Mexiko") that were causing exact Pass-2 matches
- Remove spaCy model downloads from Dockerfile (no longer needed after the DB-backed matcher rewrite)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,11 +9,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
# Bake models into the image — no volume needed, ~350 MB total
|
|
||||||
RUN python -m spacy download de_core_news_sm \
|
|
||||||
&& python -m spacy download en_core_web_sm \
|
|
||||||
&& python -m spacy download es_core_news_sm
|
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN useradd --no-create-home --shell /usr/sbin/nologin --uid 1001 nlp \
|
RUN useradd --no-create-home --shell /usr/sbin/nologin --uid 1001 nlp \
|
||||||
|
|||||||
@@ -66,6 +66,22 @@ _DATE_BETWEEN: dict[str, frozenset[str]] = {
|
|||||||
"es": frozenset({"entre"}),
|
"es": frozenset({"entre"}),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── Extra span-termination tokens (function words that cannot be in a name) ──
|
||||||
|
|
||||||
|
_EXTRA_SPAN_STOPS: dict[str, frozenset[str]] = {
|
||||||
|
# German articles, possessives, and particles that end a name span
|
||||||
|
"de": frozenset({
|
||||||
|
"im", "am", "beim", "zum", "zur",
|
||||||
|
"dem", "den", "des",
|
||||||
|
"sein", "seine", "seinen", "seiner",
|
||||||
|
"ihr", "ihre", "ihrem", "ihren", "ihrer",
|
||||||
|
"unser", "unsere", "unseren",
|
||||||
|
"über", "auch", "oder", "und",
|
||||||
|
}),
|
||||||
|
"en": frozenset(),
|
||||||
|
"es": frozenset({"el", "la", "los", "las", "su", "sus", "mi"}),
|
||||||
|
}
|
||||||
|
|
||||||
# ── Stopword lists ────────────────────────────────────────────────────────────
|
# ── Stopword lists ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
_STOPWORDS: dict[str, frozenset[str]] = {
|
_STOPWORDS: dict[str, frozenset[str]] = {
|
||||||
@@ -138,7 +154,7 @@ def _extract_persons_and_role(
|
|||||||
return [], "any"
|
return [], "any"
|
||||||
|
|
||||||
preps = _ALL_PERSON_PREPS[lang]
|
preps = _ALL_PERSON_PREPS[lang]
|
||||||
stops = preps | _DATE_BEFORE[lang] | _DATE_AFTER[lang] | _DATE_BETWEEN[lang]
|
stops = preps | _DATE_BEFORE[lang] | _DATE_AFTER[lang] | _DATE_BETWEEN[lang] | _EXTRA_SPAN_STOPS[lang]
|
||||||
matches = m.find_in_query(query, preps, stop_tokens=stops)
|
matches = m.find_in_query(query, preps, stop_tokens=stops)
|
||||||
|
|
||||||
person_names = [text for text, _ in matches]
|
person_names = [text for text, _ in matches]
|
||||||
|
|||||||
@@ -8,6 +8,22 @@ from rapidfuzz import fuzz, process
|
|||||||
_PUNCT_RE = re.compile(r"[^\w\s\-]", re.UNICODE)
|
_PUNCT_RE = re.compile(r"[^\w\s\-]", re.UNICODE)
|
||||||
_YEAR_PAT = re.compile(r"^\d{4}$")
|
_YEAR_PAT = re.compile(r"^\d{4}$")
|
||||||
|
|
||||||
|
# Tokens that cannot appear in a real person's first name — used to filter DB
|
||||||
|
# records that are annotations or descriptions masquerading as persons.
|
||||||
|
_NON_NAME_TOKENS: frozenset[str] = frozenset({
|
||||||
|
# German prepositions
|
||||||
|
"an", "in", "im", "am", "aus", "von", "vom", "nach", "zu", "zum", "zur",
|
||||||
|
"für", "bei", "beim", "mit", "über", "unter", "durch", "gegen", "ohne",
|
||||||
|
"bis", "seit", "des", "dem", "den",
|
||||||
|
# German possessives / pronouns
|
||||||
|
"sein", "seine", "seinen", "seiner",
|
||||||
|
"ihr", "ihre", "ihren", "ihrem",
|
||||||
|
# English prepositions
|
||||||
|
"for", "from", "by", "of",
|
||||||
|
# Spanish prepositions
|
||||||
|
"del", "por", "para",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
class PersonMatcher:
|
class PersonMatcher:
|
||||||
"""Match person name fragments from free-text queries against known persons.
|
"""Match person name fragments from free-text queries against known persons.
|
||||||
@@ -30,6 +46,10 @@ class PersonMatcher:
|
|||||||
for first, last in rows:
|
for first, last in rows:
|
||||||
first = (first or "").strip()
|
first = (first or "").strip()
|
||||||
last = (last or "").strip()
|
last = (last or "").strip()
|
||||||
|
# Skip records whose first_name contains function words — these are
|
||||||
|
# annotations or descriptions in the DB, not real person names.
|
||||||
|
if any(w in _NON_NAME_TOKENS for w in first.lower().split()):
|
||||||
|
continue
|
||||||
for variant in _name_variants(first, last):
|
for variant in _name_variants(first, last):
|
||||||
key = variant.lower()
|
key = variant.lower()
|
||||||
if key not in seen:
|
if key not in seen:
|
||||||
|
|||||||
Reference in New Issue
Block a user