diff --git a/nlp-service/Dockerfile b/nlp-service/Dockerfile index b47a71be..61c723b0 100644 --- a/nlp-service/Dockerfile +++ b/nlp-service/Dockerfile @@ -9,11 +9,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -# Bake models into the image — no volume needed, ~350 MB total -RUN python -m spacy download de_core_news_sm \ - && python -m spacy download en_core_web_sm \ - && python -m spacy download es_core_news_sm - COPY . . RUN useradd --no-create-home --shell /usr/sbin/nologin --uid 1001 nlp \ diff --git a/nlp-service/extractor.py b/nlp-service/extractor.py index b23a58b0..f4b16adf 100644 --- a/nlp-service/extractor.py +++ b/nlp-service/extractor.py @@ -66,6 +66,22 @@ _DATE_BETWEEN: dict[str, frozenset[str]] = { "es": frozenset({"entre"}), } +# ── Extra span-termination tokens (function words that cannot be in a name) ── + +_EXTRA_SPAN_STOPS: dict[str, frozenset[str]] = { + # German function words (articles, contracted prepositions, possessives, conjunctions, particles) that end a name span + "de": frozenset({ + "im", "am", "beim", "zum", "zur", + "dem", "den", "des", + "sein", "seine", "seinen", "seiner", + "ihr", "ihre", "ihrem", "ihren", "ihrer", + "unser", "unsere", "unseren", + "über", "auch", "oder", "und", + }), + "en": frozenset(), + "es": frozenset({"el", "la", "los", "las", "su", "sus", "mi"}),  # NOTE(review): "la"/"los" occur inside Spanish surnames (e.g. "de la Cruz") — confirm spans are not cut short +} + # ── Stopword lists ──────────────────────────────────────────────────────────── _STOPWORDS: dict[str, frozenset[str]] = { @@ -138,7 +154,7 @@ def _extract_persons_and_role( return [], "any" preps = _ALL_PERSON_PREPS[lang] - stops = preps | _DATE_BEFORE[lang] | _DATE_AFTER[lang] | _DATE_BETWEEN[lang] + stops = preps | _DATE_BEFORE[lang] | _DATE_AFTER[lang] | _DATE_BETWEEN[lang] | _EXTRA_SPAN_STOPS[lang] matches = m.find_in_query(query, preps, stop_tokens=stops) person_names = [text for text, _ in matches] diff --git a/nlp-service/person_matcher.py b/nlp-service/person_matcher.py index 5e6f69c7..2374208c 100644 --- 
a/nlp-service/person_matcher.py +++ b/nlp-service/person_matcher.py @@ -8,6 +8,22 @@ from rapidfuzz import fuzz, process _PUNCT_RE = re.compile(r"[^\w\s\-]", re.UNICODE) _YEAR_PAT = re.compile(r"^\d{4}$") +# Tokens that cannot appear in a real person's first name — used to filter DB +# records that are annotations or descriptions masquerading as persons. +_NON_NAME_TOKENS: frozenset[str] = frozenset({ + # German prepositions and articles + "an", "in", "im", "am", "aus", "von", "vom", "nach", "zu", "zum", "zur", + "für", "bei", "beim", "mit", "über", "unter", "durch", "gegen", "ohne", + "bis", "seit", "des", "dem", "den", + # German possessives / pronouns + "sein", "seine", "seinen", "seiner", + "ihr", "ihre", "ihren", "ihrem", + # English prepositions + "for", "from", "by", "of", + # Spanish prepositions and contractions + "del", "por", "para", +}) + class PersonMatcher: """Match person name fragments from free-text queries against known persons. @@ -30,6 +46,10 @@ class PersonMatcher: for first, last in rows: first = (first or "").strip() last = (last or "").strip() + # Skip records whose first_name contains function words — these are + # annotations or descriptions in the DB, not real person names. NOTE(review): the exact token match also skips real first names equal to a listed token (e.g. "An", "Del") — confirm this is acceptable. + if any(w in _NON_NAME_TOKENS for w in first.lower().split()): + continue for variant in _name_variants(first, last): key = variant.lower() if key not in seen: