From 6c5cf8ec9be0ba5fba56ca23e93e234e20bfb4e1 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 7 Jun 2026 11:00:03 +0200 Subject: [PATCH] feat(nlp-service): replace spaCy NER with DB-backed PersonMatcher Rule-based pipeline: persons matched via rapidfuzz against all known names loaded from DB at startup. Fixes first-name-only extraction (Eugenie, Herbert), merged-span bug (Herbert + Eugenie de Gruyter), false positives on compound nouns, and EN/ES model failures. Date extraction unchanged (regex). No spaCy models required. Co-Authored-By: Claude Sonnet 4.6 --- nlp-service/CLAUDE.md | 25 +- nlp-service/extractor.py | 381 ++++++++++--------- nlp-service/main.py | 35 +- nlp-service/person_matcher.py | 164 ++++++++ nlp-service/requirements.txt | 3 +- nlp-service/test_extractor.py | 683 +++++++++++++++++----------------- nlp-service/test_main.py | 73 ++-- nlp-service/test_sentences.md | 126 +++++++ 8 files changed, 939 insertions(+), 551 deletions(-) create mode 100644 nlp-service/person_matcher.py create mode 100644 nlp-service/test_sentences.md diff --git a/nlp-service/CLAUDE.md b/nlp-service/CLAUDE.md index 4b5300ea..f7579b7d 100644 --- a/nlp-service/CLAUDE.md +++ b/nlp-service/CLAUDE.md @@ -5,23 +5,30 @@ replacing Ollama for the Familienarchiv NL search feature. ## Stack -- Python 3.11, FastAPI 0.115, spaCy 3.8, dateparser 1.2 +- Python 3.11, FastAPI 0.115, rapidfuzz 3.x, dateparser 1.2, psycopg2-binary + +No ML models — persons are matched against the live DB via fuzzy lookup. ## Endpoints - `POST /parse` — parse a free-text query, return extraction matching `OllamaExtraction` contract -- `GET /health` — returns `{"status": "ok"}` when all models are loaded +- `GET /health` — returns `{"status": "ok", "persons_loaded": N}` ## Running locally ```bash pip install -r requirements.txt -python -m spacy download de_core_news_sm en_core_web_sm es_core_news_sm + +# Without DB (empty person matcher — dates and keywords still work): uvicorn main:app --reload --port 8001 +# With DB (full person matching): +DATABASE_URL=postgresql://archive_user:secret@localhost:5432/family_archive_db \ + uvicorn main:app --reload --port 8001 + curl -X POST http://localhost:8001/parse \ -H "Content-Type: application/json" \ - -d '{"query": "Briefe von Opa Hermann an Marie vor 1920", "lang": "de"}' + -d '{"query": "Briefe von Clara Cram an Walter de Gruyter vor 1920", "lang": "de"}' ``` ## Testing @@ -30,6 +37,14 @@ curl -X POST http://localhost:8001/parse \ pytest -v ``` +No DB required for tests — fixture pre-seeds the PersonMatcher with a small test corpus. + +## Architecture + +- `person_matcher.py` — DB-backed name lookup: loads all persons at startup, fuzzy-matches query tokens after person prepositions +- `extractor.py` — pipeline: persons → role → dates (regex) → keywords (stopword filter) +- `main.py` — FastAPI app; reads `DATABASE_URL` env var at startup + ## Design spec See `docs/superpowers/specs/2026-06-07-spacy-nlp-service-design.md`. @@ -39,3 +54,5 @@ See `docs/superpowers/specs/2026-06-07-spacy-nlp-service-design.md`. This is a **prototype** for extraction quality evaluation. No docker-compose integration or Java-side changes in this iteration. The extraction contract matches `OllamaExtraction` in `backend/src/main/java/org/raddatz/familienarchiv/search/`. + +Test sentences for manual evaluation are in `test_sentences.md`. diff --git a/nlp-service/extractor.py b/nlp-service/extractor.py index 5d6bb629..b23a58b0 100644 --- a/nlp-service/extractor.py +++ b/nlp-service/extractor.py @@ -1,46 +1,33 @@ +"""Rule-based NLP pipeline: dates via regex, persons via DB-backed matcher.""" from __future__ import annotations import re from datetime import date +from typing import TYPE_CHECKING import dateparser -import spacy -from spacy.language import Language from models import ParseResponse +from person_matcher import PersonMatcher -# ── Language model registry ────────────────────────────────────────────────── +if TYPE_CHECKING: + pass -_MODEL_NAMES: dict[str, str] = { - "de": "de_core_news_sm", - "en": "en_core_web_sm", - "es": "es_core_news_sm", -} +# ── Module-level PersonMatcher (set at startup) ─────────────────────────────── -_nlp_cache: dict[str, Language] = {} +_matcher: PersonMatcher | None = None -def get_nlp(lang: str) -> Language: - if lang not in _MODEL_NAMES: - raise ValueError(f"Unsupported language: {lang!r}. Valid: {list(_MODEL_NAMES)}") - if lang not in _nlp_cache: - _nlp_cache[lang] = spacy.load(_MODEL_NAMES[lang]) - return _nlp_cache[lang] +def set_person_matcher(m: PersonMatcher) -> None: + global _matcher + _matcher = m -def load_all_models() -> None: - for lang in _MODEL_NAMES: - get_nlp(lang) +def get_person_matcher() -> PersonMatcher | None: + return _matcher -# ── Step 1: Person name extraction ────────────────────────────────────────── - -def extract_person_names(doc) -> list[str]: - """Return PER entity texts in left-to-right span order.""" - return [ent.text for ent in doc.ents if ent.label_ == "PER"] - - -# ── Step 2: Role detection ─────────────────────────────────────────────────── +# ── Preposition sets ────────────────────────────────────────────────────────── _SENDER_PREPS: dict[str, frozenset[str]] = { "de": frozenset({"von", "vom"}), @@ -54,43 +41,12 @@ _RECEIVER_PREPS: dict[str, frozenset[str]] = { "es": frozenset({"para", "a"}), } +_ALL_PERSON_PREPS: dict[str, frozenset[str]] = { + lang: _SENDER_PREPS[lang] | _RECEIVER_PREPS[lang] + for lang in ("de", "en", "es") +} -def detect_person_role(doc, per_spans: list, lang: str) -> str: - """Return 'sender', 'receiver', or 'any'. - - Only meaningful for single-PER queries — two-person queries always return - 'any' because Java derives direction from list position. - """ - if len(per_spans) != 1: - return "any" - - span = per_spans[0] - root = span.root - sender = _SENDER_PREPS[lang] - receiver = _RECEIVER_PREPS[lang] - - # Primary: dependency-tree children of the PER root - for child in root.children: - if child.dep_ in ("case", "prep", "mo"): - if child.lower_ in sender: - return "sender" - if child.lower_ in receiver: - return "receiver" - - # Fallback: token immediately before the span start - if span.start > 0: - prev = doc[span.start - 1] - if prev.lower_ in sender: - return "sender" - if prev.lower_ in receiver: - return "receiver" - - return "any" - - -# ── Step 3: Date parsing ───────────────────────────────────────────────────── - -_YEAR_RE = re.compile(r"^\d{4}$") +# ── Date direction tokens ───────────────────────────────────────────────────── _DATE_BEFORE: dict[str, frozenset[str]] = { "de": frozenset({"vor"}), @@ -110,130 +66,219 @@ _DATE_BETWEEN: dict[str, frozenset[str]] = { "es": frozenset({"entre"}), } +# ── Stopword lists ──────────────────────────────────────────────────────────── -def _parse_date_text(text: str, lang: str) -> date | None: - text = text.strip() - if _YEAR_RE.match(text): - year = int(text) - if 1000 < year < 3000: - return date(year, 1, 1) - parsed = dateparser.parse( - text, - languages=[lang], - settings={"PREFER_DAY_OF_MONTH": "first", "RETURN_AS_TIMEZONE_AWARE": False}, - ) - return parsed.date() if parsed else None +_STOPWORDS: dict[str, frozenset[str]] = { + "de": frozenset({ + "der", "die", "das", "des", "dem", "den", + "ein", "eine", "einem", "einen", "einer", "eines", + "er", "sie", "es", "wir", "ihr", "ich", "du", + "und", "oder", "aber", "doch", "auch", "noch", "nur", + "in", "an", "auf", "aus", "bei", "mit", "nach", "von", "vom", + "vor", "zu", "zur", "zum", "durch", "für", "über", "unter", + "zwischen", "gegen", "ohne", "um", "bis", "seit", "wegen", + "ist", "sind", "war", "waren", "wird", "werden", + "hat", "haben", "hatte", "hatten", + "sein", "seine", "seinen", "seiner", "seines", + "ihre", "ihren", "ihrer", "ihrem", "ihres", + "nicht", "kein", "keine", "keinen", "keinem", "keines", + "so", "wie", "als", "da", "hier", "dort", "wo", "wer", "was", + "im", "am", "beim", "ins", "ans", + "ja", "nein", "denn", "wenn", "weil", "dass", "ob", "damit", + "alle", "alles", "mehr", "sehr", "viel", "wenig", + "diesem", "dieser", "dieses", "diese", "diesen", + "jetzt", "dann", "nun", "schon", "wohl", "wurde", "wurden", + "worden", "geschrieben", "seinen", "ihrer", + "beim", "nach", "zum", "zur", "dem", "den", + "seine", "ihrem", "Jahr", "Jahren", "jahre", "jahr", + }), + "en": frozenset({ + "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", + "for", "of", "with", "by", "from", "about", "as", "into", + "through", "is", "are", "was", "were", "be", "been", "being", + "have", "has", "had", "do", "does", "did", "will", "would", + "could", "should", "may", "might", "must", "shall", "can", + "i", "you", "he", "she", "it", "we", "they", "their", "our", + "his", "her", "its", "my", "your", + "this", "that", "these", "those", "all", "not", "no", "nor", + "very", "more", "most", "much", "many", "some", "any", + "before", "after", "between", "during", "since", "until", + "when", "where", "who", "which", "what", "how", + }), + "es": frozenset({ + "el", "la", "los", "las", "un", "una", "unos", "unas", + "y", "o", "pero", "sin", "con", "en", "de", "del", "al", + "a", "ante", "bajo", "desde", "entre", "hacia", "hasta", + "para", "por", "sobre", "tras", + "es", "son", "era", "eran", "fue", "fueron", "ser", "estar", + "ha", "han", "he", "tener", "tiene", + "yo", "su", "sus", "mi", "tu", + "este", "esta", "estos", "estas", "ese", "esa", + "no", "muy", "todo", "todos", "toda", + "que", "cuando", "donde", "como", + "antes", "después", "durante", "desde", "hasta", + }), +} + +# ── Year regex ──────────────────────────────────────────────────────────────── + +_YEAR_RE = re.compile(r"\b(\d{4})\b") +_WORD_RE = re.compile(r"\b[^\W\d_]{3,}\b", re.UNICODE) -def _year_end(d: date) -> date: - """If d is Jan 1, return Dec 31 of the same year (year-only boundary).""" - if d.month == 1 and d.day == 1: - return date(d.year, 12, 31) - return d +# ── Step 1 + 2: Person extraction and role detection ───────────────────────── + +def _extract_persons_and_role( + query: str, + lang: str, +) -> tuple[list[str], str]: + """Return (person_names, role) using the DB-backed PersonMatcher.""" + m = _matcher + if m is None or len(m) == 0: + return [], "any" + + preps = _ALL_PERSON_PREPS[lang] + stops = preps | _DATE_BEFORE[lang] | _DATE_AFTER[lang] | _DATE_BETWEEN[lang] + matches = m.find_in_query(query, preps, stop_tokens=stops) + + person_names = [text for text, _ in matches] + + if len(matches) != 1: + return person_names, "any" + + _, prep = matches[0] + if prep is None: + return person_names, "any" + if prep in _SENDER_PREPS[lang]: + return person_names, "sender" + if prep in _RECEIVER_PREPS[lang]: + return person_names, "receiver" + return person_names, "any" -def _find_year_spans(doc) -> list: - """Fallback: find tokens that look like 4-digit years (1000–2999) when NER - produces no DATE entities. Returns a list of single-token pseudo-spans - (spaCy Span objects) labelled 'DATE'.""" - spans = [] - for token in doc: - if _YEAR_RE.match(token.text): - year = int(token.text) - if 1000 < year < 3000: - span = doc[token.i : token.i + 1] - spans.append(span) - return spans +# ── Step 3: Date extraction ─────────────────────────────────────────────────── - -def extract_dates(doc, lang: str) -> tuple[str | None, str | None]: - """Return (date_from, date_to) as ISO strings or None.""" - date_spans = [ent for ent in doc.ents if ent.label_ == "DATE"] - - # Fallback: some spaCy small models (de, es) don't tag bare years as DATE - if not date_spans: - date_spans = _find_year_spans(doc) - - if not date_spans: - return None, None - - between_tokens = _DATE_BETWEEN[lang] - before_tokens = _DATE_BEFORE[lang] - after_tokens = _DATE_AFTER[lang] - - # "zwischen X und Y" / "between X and Y" — two DATE spans form a range - has_between = any(tok.lower_ in between_tokens for tok in doc) - if has_between and len(date_spans) >= 2: - parsed = [] - for span in date_spans[:2]: - d = _parse_date_text(span.text, lang) - if d: - parsed.append(d) - if len(parsed) == 2: - parsed.sort() - return parsed[0].isoformat(), _year_end(parsed[1]).isoformat() - - # Single DATE span — use direction token - span = date_spans[0] - d = _parse_date_text(span.text, lang) - if not d: - return None, None - - # Check up to 2 tokens before the date span to handle multi-word prepositions - # like Spanish "antes de 1920" where the keyword is 2 tokens back. - prev_tokens = [ - doc[span.start - i].lower_ - for i in range(1, min(3, span.start + 1)) +def _find_years(query: str) -> list[tuple[int, int, int]]: + """Return list of (start, end, year_int) for valid 4-digit year tokens.""" + return [ + (m.start(), m.end(), int(m.group())) + for m in _YEAR_RE.finditer(query) + if 1000 < int(m.group()) < 3000 ] - if any(t in before_tokens for t in prev_tokens): - return None, _year_end(d).isoformat() - if any(t in after_tokens for t in prev_tokens): - return d.isoformat(), None - # Bare year/date — closed year-range - return d.isoformat(), _year_end(d).isoformat() + +def _direction_before_year( + query: str, + year_start: int, + lang: str, + person_names: list[str], +) -> str: + """Classify direction of the date span as 'before', 'after', or 'bare'. + + Looks at the two tokens immediately preceding the year. If the closer + token is a matched person name part, the direction word belongs to that + person — not to the year — so we return 'bare'. + """ + prefix_words = query[:year_start].split() + if not prefix_words: + return "bare" + + person_tokens = {w.lower() for name in person_names for w in name.split()} + recent = [w.lower() for w in prefix_words[-2:]] + + before_set = _DATE_BEFORE[lang] + after_set = _DATE_AFTER[lang] + + for direction_tok in reversed(recent): # closest first + if direction_tok in before_set: + # Only use this if the word immediately before the year is not a person + if recent[-1] in person_tokens: + return "bare" + return "before" + if direction_tok in after_set: + if recent[-1] in person_tokens: + return "bare" + return "after" + + return "bare" -# ── Step 4: Keyword extraction ─────────────────────────────────────────────── +def extract_dates( + query: str, + lang: str, + person_names: list[str] | None = None, +) -> tuple[str | None, str | None]: + """Return (date_from, date_to) as ISO strings or None.""" + if person_names is None: + person_names = [] -def extract_keywords(doc, excluded_spans: list) -> list[str]: - """Return lowercased lemmas of content words not inside any NER span.""" - excluded_indices: set[int] = set() - for span in excluded_spans: - excluded_indices.update(range(span.start, span.end)) + year_spans = _find_years(query) + if not year_spans: + return None, None + # "zwischen X und Y" / "between X and Y" — two years form a range + query_lower = query.lower() + if any(w in query_lower.split() for w in _DATE_BETWEEN[lang]) and len(year_spans) >= 2: + years = sorted([y for _, _, y in year_spans[:2]]) + return date(years[0], 1, 1).isoformat(), date(years[1], 12, 31).isoformat() + + start, end, year = year_spans[0] + direction = _direction_before_year(query, start, lang, person_names) + + if direction == "before": + return None, date(year, 12, 31).isoformat() + if direction == "after": + return date(year, 1, 1).isoformat(), None + # bare year → closed year range + return date(year, 1, 1).isoformat(), date(year, 12, 31).isoformat() + + +# ── Step 4: Keyword extraction ──────────────────────────────────────────────── + +def extract_keywords( + query: str, + lang: str, + person_spans: list[str], + year_strings: list[str], +) -> list[str]: + """Return lowercased content words after removing persons, years, stopwords.""" + text = query + + # Remove matched person spans (longest first to avoid partial replacements) + for span in sorted(person_spans, key=len, reverse=True): + text = re.sub( + r"(? ParseResponse: - """Run the full NLP pipeline and return a ParseResponse.""" - nlp = get_nlp(lang) - doc = nlp(query) - - per_spans = [ent for ent in doc.ents if ent.label_ == "PER"] - - person_names = extract_person_names(doc) - person_role = detect_person_role(doc, per_spans, lang) - date_from, date_to = extract_dates(doc, lang) - keywords = extract_keywords(doc, list(doc.ents)) + """Run the full rule-based pipeline and return a ParseResponse.""" + person_names, person_role = _extract_persons_and_role(query, lang) + year_strings = [str(y) for _, _, y in _find_years(query)] + date_from, date_to = extract_dates(query, lang, person_names) + keywords = extract_keywords(query, lang, person_names, year_strings) return ParseResponse( personNames=person_names, diff --git a/nlp-service/main.py b/nlp-service/main.py index c440a1b0..7163c8ac 100644 --- a/nlp-service/main.py +++ b/nlp-service/main.py @@ -1,19 +1,38 @@ -import logging +"""FastAPI app — /parse and /health endpoints.""" +from __future__ import annotations + +import os from contextlib import asynccontextmanager from fastapi import FastAPI, HTTPException -from extractor import extract, load_all_models +from extractor import extract, get_person_matcher, set_person_matcher from models import ParseRequest, ParseResponse +from person_matcher import PersonMatcher -logger = logging.getLogger(__name__) + +def _load_persons_from_db(db_url: str) -> list[tuple[str | None, str | None]]: + import psycopg2 # deferred — not available in test environments without a DB + + conn = psycopg2.connect(db_url) + try: + cur = conn.cursor() + cur.execute("SELECT first_name, last_name FROM persons") + return cur.fetchall() + finally: + conn.close() @asynccontextmanager async def lifespan(app: FastAPI): - logger.info("Loading spaCy models...") - load_all_models() - logger.info("All models ready.") + # Only initialise the matcher when nothing was pre-seeded (e.g., by tests). + if get_person_matcher() is None: + m = PersonMatcher() + db_url = os.environ.get("DATABASE_URL") + if db_url: + rows = _load_persons_from_db(db_url) + m.load(rows) + set_person_matcher(m) yield @@ -22,7 +41,8 @@ app = FastAPI(lifespan=lifespan) @app.get("/health") def health() -> dict: - return {"status": "ok"} + m = get_person_matcher() + return {"status": "ok", "persons_loaded": len(m) if m else 0} @app.post("/parse", response_model=ParseResponse) @@ -30,5 +50,4 @@ def parse(request: ParseRequest) -> ParseResponse: try: return extract(request.query, request.lang) except Exception as exc: - logger.exception("Extraction pipeline failed") raise HTTPException(status_code=500, detail=str(exc)) from exc diff --git a/nlp-service/person_matcher.py b/nlp-service/person_matcher.py new file mode 100644 index 00000000..5e6f69c7 --- /dev/null +++ b/nlp-service/person_matcher.py @@ -0,0 +1,164 @@ +"""DB-backed person name matcher with fuzzy search.""" +from __future__ import annotations + +import re + +from rapidfuzz import fuzz, process + +_PUNCT_RE = re.compile(r"[^\w\s\-]", re.UNICODE) +_YEAR_PAT = re.compile(r"^\d{4}$") + + +class PersonMatcher: + """Match person name fragments from free-text queries against known persons. + + Loaded once at startup from (first_name, last_name) DB rows. At query + time, scans for tokens following person-indicator prepositions and fuzzy- + matches them against the loaded name variants. Returns the original query + text (not the resolved DB name) so the Java resolveNames() mechanism can + do its own disambiguation. + """ + + def __init__(self) -> None: + self._names: list[str] = [] # lowercase name variants + + # ── Loading ─────────────────────────────────────────────────────────────── + + def load(self, rows: list[tuple[str | None, str | None]]) -> None: + """Populate from DB rows of (first_name, last_name).""" + seen: set[str] = set() + for first, last in rows: + first = (first or "").strip() + last = (last or "").strip() + for variant in _name_variants(first, last): + key = variant.lower() + if key not in seen: + seen.add(key) + self._names.append(key) + + def __len__(self) -> int: + return len(self._names) + + # ── Query-time matching ─────────────────────────────────────────────────── + + def find_in_query( + self, + query: str, + prepositions: frozenset[str], + stop_tokens: frozenset[str] | None = None, + threshold: int = 80, + ) -> list[tuple[str, str | None]]: + """Find person name spans in *query*. + + Returns a list of ``(original_query_text, anchoring_prep_or_None)`` + in left-to-right order. + + Parameters + ---------- + prepositions: + Person-indicator prepositions for the query language (triggers a + scan for the tokens that follow). + stop_tokens: + Tokens that terminate a name span (prepositions + date-direction + words). "de" is a special exception: when immediately followed by + a capitalised word it is treated as a name connector (e.g. + "de Gruyter") rather than a stop. + threshold: + Minimum rapidfuzz token_sort_ratio score to accept a match. + + Strategy + -------- + Pass 1 — prep-anchored: for each person-indicator preposition found in + the token list, collect up to 3 consecutive non-stop, non-year tokens + after it and fuzzy-match the resulting span against loaded names. + Longest match wins. + + Pass 2 — full-name scan: scan positions not yet consumed for exact + multi-word full-name matches (no preposition anchor required). + """ + tokens = query.split() + clean = [_PUNCT_RE.sub("", t) for t in tokens] + lower = [t.lower() for t in clean] + + # Prepositions always terminate a name span, even without explicit stop_tokens. + stops = (stop_tokens or frozenset()) | prepositions + consumed: set[int] = set() + hits: list[tuple[int, str, str | None]] = [] # (position, text, prep) + + # Pass 1 — prep-anchored + for i, ltok in enumerate(lower): + if ltok not in prepositions or i + 1 >= len(tokens): + continue + + # Build candidate span — stop at stop tokens or 4-digit years. + # Exception: "de" before a capitalised word is a name connector. + span_indices: list[int] = [] + j = i + 1 + while j < len(tokens) and len(span_indices) < 3: + if j in consumed: + break + t = lower[j] + if t in stops or _YEAR_PAT.match(clean[j]): + # Allow "de" when the *next* token starts with a capital — + # e.g. "Walter de Gruyter". + next_clean = clean[j + 1] if j + 1 < len(tokens) else "" + if t == "de" and next_clean[:1].isupper(): + pass # connector — keep going + else: + break + span_indices.append(j) + j += 1 + + # Try longest match first, then shorter spans + for span_len in range(len(span_indices), 0, -1): + idx = span_indices[:span_len] + span_lower = " ".join(lower[k] for k in idx) + if self._is_match(span_lower, threshold): + hits.append((idx[0], " ".join(tokens[k] for k in idx), ltok)) + consumed.update(idx) + break + + # Pass 2 — full multi-word name scan (exact only, no preposition needed) + for span_len in (3, 2): + for i in range(len(tokens) - span_len + 1): + span_idx = range(i, i + span_len) + if any(j in consumed for j in span_idx): + continue + span_lower = " ".join(lower[i : i + span_len]) + if span_lower in self._names: + hits.append((i, " ".join(tokens[i : i + span_len]), None)) + consumed.update(span_idx) + + hits.sort(key=lambda h: h[0]) + return [(text, prep) for _, text, prep in hits] + + # ── Internal helpers ────────────────────────────────────────────────────── + + def _is_match(self, text: str, threshold: int) -> bool: + """Return True if *text* fuzzy-matches any loaded name at >= threshold.""" + if not self._names or len(text.strip()) < 3: + return False + text_lower = text.strip().lower() + if text_lower in self._names: + return True # exact match — fast path + result = process.extractOne( + text_lower, + self._names, + scorer=fuzz.token_sort_ratio, + score_cutoff=threshold, + ) + return result is not None + + +# ── helpers ─────────────────────────────────────────────────────────────────── + +def _name_variants(first: str, last: str) -> list[str]: + """Return the name variants to index for a single person.""" + variants = [] + if first and last: + variants.append(f"{first} {last}") + if first: + variants.append(first) + if last: + variants.append(last) + return variants diff --git a/nlp-service/requirements.txt b/nlp-service/requirements.txt index 14c14462..253b7ed0 100644 --- a/nlp-service/requirements.txt +++ b/nlp-service/requirements.txt @@ -1,6 +1,7 @@ fastapi[standard]==0.115.6 uvicorn[standard]==0.34.0 -spacy>=3.8,<4.0 dateparser>=1.2,<2.0 +rapidfuzz>=3.0,<4.0 +psycopg2-binary>=2.9,<3.0 pytest>=8.0,<9.0 httpx>=0.28,<1.0 diff --git a/nlp-service/test_extractor.py b/nlp-service/test_extractor.py index fb117884..ac7490f1 100644 --- a/nlp-service/test_extractor.py +++ b/nlp-service/test_extractor.py @@ -1,350 +1,337 @@ +"""Tests for the rule-based extractor and PersonMatcher.""" import pytest -from pydantic import ValidationError - -# ── Models ────────────────────────────────────────────────────────────────── - -def test_parse_request_valid(): - from models import ParseRequest - req = ParseRequest(query="Briefe von Opa", lang="de") - assert req.query == "Briefe von Opa" - assert req.lang == "de" - - -def test_parse_request_rejects_unknown_lang(): - from models import ParseRequest - with pytest.raises(ValidationError): - ParseRequest(query="Letters from grandpa", lang="fr") - - -def test_parse_response_serializes_nulls(): - from models import ParseResponse - resp = ParseResponse( - personNames=["Opa"], - personRole="sender", - dateFrom=None, - dateTo="1920-12-31", - keywords=["brief"], - rawQuery="Briefe von Opa", - ) - data = resp.model_dump() - assert data["dateFrom"] is None - assert data["dateTo"] == "1920-12-31" - assert data["personRole"] == "sender" - - -# ── Model loading ──────────────────────────────────────────────────────────── - -@pytest.fixture(scope="session") -def nlp_de(): - from extractor import get_nlp - return get_nlp("de") - - -@pytest.fixture(scope="session") -def nlp_en(): - from extractor import get_nlp - return get_nlp("en") - - -@pytest.fixture(scope="session") -def nlp_es(): - from extractor import get_nlp - return get_nlp("es") - - -def test_get_nlp_de_loads(nlp_de): - doc = nlp_de("Test") - assert doc is not None - - -def test_get_nlp_en_loads(nlp_en): - doc = nlp_en("Test") - assert doc is not None - - -def test_get_nlp_es_loads(nlp_es): - doc = nlp_es("Prueba") - assert doc is not None - - -def test_get_nlp_unknown_lang_raises(): - from extractor import get_nlp - with pytest.raises(ValueError, match="Unsupported language"): - get_nlp("fr") - - -# ── Person name extraction ─────────────────────────────────────────────────── - -def _make_doc_with_ents(nlp, text: str, char_ents: list[tuple[int, int, str]]): - """Create a Doc with manually injected entity spans (no NER model needed).""" - doc = nlp.make_doc(text) - spans = [doc.char_span(s, e, label=lbl) for s, e, lbl in char_ents] - doc.ents = [sp for sp in spans if sp is not None] - return doc - - -def test_extract_person_names_two_persons(nlp_de): - from extractor import extract_person_names - # "Briefe von Opa Hermann an Marie" - # "Opa Hermann" = chars 11..22, "Marie" = chars 26..31 - doc = _make_doc_with_ents(nlp_de, "Briefe von Opa Hermann an Marie", [ - (11, 22, "PER"), - (26, 31, "PER"), - ]) - assert extract_person_names(doc) == ["Opa Hermann", "Marie"] - - -def test_extract_person_names_preserves_order(nlp_de): - from extractor import extract_person_names - # "Marie von Opa" — Marie comes first in text - # "Marie" = 0..5, "Opa" = 10..13 - doc = _make_doc_with_ents(nlp_de, "Marie von Opa", [ - (0, 5, "PER"), - (10, 13, "PER"), - ]) - assert extract_person_names(doc) == ["Marie", "Opa"] - - -def test_extract_person_names_empty(nlp_de): - from extractor import extract_person_names - doc = _make_doc_with_ents(nlp_de, "Briefe aus dem Krieg", []) - assert extract_person_names(doc) == [] - - -def test_extract_person_names_ignores_non_per(nlp_de): - from extractor import extract_person_names - # DATE entity should not appear in personNames - doc = _make_doc_with_ents(nlp_de, "Briefe 1920", [(7, 11, "DATE")]) - assert extract_person_names(doc) == [] - - -# ── Role detection ─────────────────────────────────────────────────────────── - -def test_role_sender_von(nlp_de): - from extractor import detect_person_role - # "Briefe von Marie" — "von" immediately before "Marie" - # "Marie" = chars 11..16 - doc = _make_doc_with_ents(nlp_de, "Briefe von Marie", [(11, 16, "PER")]) - per_spans = list(doc.ents) - assert detect_person_role(doc, per_spans, "de") == "sender" - - -def test_role_receiver_an(nlp_de): - from extractor import detect_person_role - # "Briefe an Marie" — "an" immediately before "Marie" - # "Marie" = chars 10..15 - doc = _make_doc_with_ents(nlp_de, "Briefe an Marie", [(10, 15, "PER")]) - per_spans = list(doc.ents) - assert detect_person_role(doc, per_spans, "de") == "receiver" - - -def test_role_two_persons_returns_any(nlp_de): - from extractor import detect_person_role - # "von Opa an Marie" — two PER spans → always "any" - # "Opa" = chars 4..7, "Marie" = chars 11..16 - doc = _make_doc_with_ents(nlp_de, "von Opa an Marie", [ - (4, 7, "PER"), - (11, 16, "PER"), - ]) - per_spans = list(doc.ents) - assert detect_person_role(doc, per_spans, "de") == "any" - - -def test_role_no_prep_returns_any(nlp_de): - from extractor import detect_person_role - # "Briefe Marie" — no preposition - # "Marie" = chars 7..12 - doc = _make_doc_with_ents(nlp_de, "Briefe Marie", [(7, 12, "PER")]) - per_spans = list(doc.ents) - assert detect_person_role(doc, per_spans, "de") == "any" - - -def test_role_empty_returns_any(nlp_de): - from extractor import detect_person_role - doc = _make_doc_with_ents(nlp_de, "Briefe 1920", []) - assert detect_person_role(doc, [], "de") == "any" - - -def test_role_sender_from_english(nlp_en): - from extractor import detect_person_role - # "letters from Marie" — "from" before "Marie" - # "Marie" = chars 13..18 - doc = _make_doc_with_ents(nlp_en, "letters from Marie", [(13, 18, "PER")]) - per_spans = list(doc.ents) - assert detect_person_role(doc, per_spans, "en") == "sender" - - -def test_role_receiver_to_english(nlp_en): - from extractor import detect_person_role - # "letters to Marie" — "to" before "Marie" - # "letters" = 0..7, " " = 7, "to" = 8..10, " " = 10, "Marie" = 11..16 - doc = _make_doc_with_ents(nlp_en, "letters to Marie", [(11, 16, "PER")]) - per_spans = list(doc.ents) - assert detect_person_role(doc, per_spans, "en") == "receiver" - - -# ── Date parsing ───────────────────────────────────────────────────────────── - -def test_date_vor_1920(nlp_de): - from extractor import extract_dates - # "Briefe vor 1920" — "1920" at chars 11..15 - doc = _make_doc_with_ents(nlp_de, "Briefe vor 1920", [(11, 15, "DATE")]) - date_from, date_to = extract_dates(doc, "de") - assert date_from is None - assert date_to == "1920-12-31" - - -def test_date_nach_1900(nlp_de): - from extractor import extract_dates - # "Briefe nach 1900" — "1900" at chars 12..16 - doc = _make_doc_with_ents(nlp_de, "Briefe nach 1900", [(12, 16, "DATE")]) - date_from, date_to = extract_dates(doc, "de") - assert date_from == "1900-01-01" - assert date_to is None - - -def test_date_zwischen_1900_und_1920(nlp_de): - from extractor import extract_dates - # "zwischen 1900 und 1920" - # "1900" = chars 9..13, "1920" = chars 18..22 - doc = _make_doc_with_ents(nlp_de, "zwischen 1900 und 1920", [ - (9, 13, "DATE"), - (18, 22, "DATE"), - ]) - date_from, date_to = extract_dates(doc, "de") - assert date_from == "1900-01-01" - assert date_to == "1920-12-31" - - -def test_date_bare_year_makes_range(nlp_de): - from extractor import extract_dates - # "Briefe 1920" — no direction token → year-range - # "1920" = chars 7..11 - doc = _make_doc_with_ents(nlp_de, "Briefe 1920", [(7, 11, "DATE")]) - date_from, date_to = extract_dates(doc, "de") - assert date_from == "1920-01-01" - assert date_to == "1920-12-31" - - -def test_date_no_date_entity(nlp_de): - from extractor import extract_dates - doc = _make_doc_with_ents(nlp_de, "Briefe von Opa", []) - date_from, date_to = extract_dates(doc, "de") - assert date_from is None - assert date_to is None - - -def test_date_before_english(nlp_en): - from extractor import extract_dates - # "letters before 1920" — "1920" at chars 15..19 - doc = _make_doc_with_ents(nlp_en, "letters before 1920", [(15, 19, "DATE")]) - date_from, date_to = extract_dates(doc, "en") - assert date_from is None - assert date_to == "1920-12-31" - - -def test_date_after_english(nlp_en): - from extractor import extract_dates - # "letters after 1900" — "1900" at chars 14..18 - doc = _make_doc_with_ents(nlp_en, "letters after 1900", [(14, 18, "DATE")]) - date_from, date_to = extract_dates(doc, "en") - assert date_from == "1900-01-01" - assert date_to is None - - -# ── Keyword extraction ─────────────────────────────────────────────────────── - -def test_keywords_extracts_nouns(nlp_de): - from extractor import extract_keywords - # Use real NLP for POS tags; disable NER to avoid interference - doc = nlp_de("Briefe aus dem Krieg", disable=["ner"]) - keywords = extract_keywords(doc, []) - # "Brief" (NOUN) and "Krieg" (NOUN) should appear as lemmas - assert "brief" in keywords - assert "krieg" in keywords - - -def test_keywords_excludes_stopwords(nlp_de): - from extractor import extract_keywords - doc = nlp_de("Briefe aus dem Krieg", disable=["ner"]) - keywords = extract_keywords(doc, []) - # "dem" is a stopword article — must not appear - assert "dem" not in keywords - - -def test_keywords_excludes_per_ner_spans(nlp_de): - from extractor import extract_keywords - # Run full NLP for POS tags, then inject a PER span over "Hermann" - # "Briefe von Hermann": B=0..6, ' '=6, v=7..10, ' '=10, H=11..18 - doc = nlp_de("Briefe von Hermann") - per_span = doc.char_span(11, 18, label="PER") - if per_span: - doc.ents = [per_span] - keywords = extract_keywords(doc, list(doc.ents)) - assert "hermann" not in keywords - - -def test_keywords_excludes_short_lemmas(nlp_de): - from extractor import extract_keywords - doc = nlp_de("Briefe an ihn", disable=["ner"]) - keywords = extract_keywords(doc, []) - # "ihn" is 3 chars but is a stopword pronoun; "an" is 2 chars - assert "an" not in keywords - - -def test_keywords_deduplicates(nlp_de): - from extractor import extract_keywords - doc = nlp_de("Brief Brief Krieg", disable=["ner"]) - keywords = extract_keywords(doc, []) - assert keywords.count("brief") == 1 - - -# ── Full extract() pipeline ────────────────────────────────────────────────── - -def test_extract_dates_de(): - from extractor import extract - result = extract("Briefe vor 1920", "de") - assert result.dateFrom is None - assert result.dateTo == "1920-12-31" - assert result.rawQuery == "Briefe vor 1920" - assert result.personNames == [] - assert result.personRole == "any" - - -def test_extract_keywords_from_topic_de(): - from extractor import extract - result = extract("Briefe aus dem Krieg", "de") - assert "krieg" in result.keywords - assert result.dateFrom is None - assert result.dateTo is None - - -def test_extract_dates_en(): - from extractor import extract - result = extract("letters before 1920", "en") - assert result.dateTo == "1920-12-31" - assert result.dateFrom is None - - -def test_extract_dates_es(): - from extractor import extract - result = extract("cartas antes de 1920", "es") - assert result.dateTo == "1920-12-31" - assert result.dateFrom is None - - -def test_extract_rawquery_echoed(): - from extractor import extract - q = "Texte über Weihnachten" - result = extract(q, "de") - assert result.rawQuery == q - - -def test_extract_response_fields_are_complete(): - from extractor import extract - result = extract("Briefe 1900", "de") - assert isinstance(result.personNames, list) - assert result.personRole in ("sender", "receiver", "any") - assert isinstance(result.keywords, list) - assert result.rawQuery == "Briefe 1900" +from extractor import extract, extract_dates, extract_keywords, set_person_matcher +from person_matcher import PersonMatcher + +# ── Shared test fixture ─────────────────────────────────────────────────────── + +_TEST_PERSONS = [ + ("Clara", "Cram"), + ("Herbert", "Cram"), + ("Eugenie", "de Gruyter"), + ("Walter", "de Gruyter"), + ("Marie", "Cram"), + ("Juan", "Cram"), + ("Hilde", "de Gruyter"), + ("Hans", "de Gruyter"), + ("Albert", "de Gruyter"), + ("Anita", "Wöhler"), + ("Else", "Bohrmann"), + ("Lili", "Duvenbeck"), +] + + +@pytest.fixture(scope="session", autouse=True) +def seeded_matcher(): + """Load test persons into the global matcher before any test runs.""" + m = PersonMatcher() + m.load(_TEST_PERSONS) + set_person_matcher(m) + return m + + +# ── PersonMatcher unit tests ────────────────────────────────────────────────── + +class TestPersonMatcher: + DE_PREPS = frozenset({"von", "vom", "an", "nach", "für"}) + + def test_load_populates_names(self, seeded_matcher): + assert len(seeded_matcher) > 0 + + def test_exact_full_name_match(self, seeded_matcher): + hits = seeded_matcher.find_in_query("Briefe von Clara Cram", self.DE_PREPS) + assert hits == [("Clara Cram", "von")] + + def test_exact_first_name_only(self, seeded_matcher): + hits = seeded_matcher.find_in_query("Briefe von Eugenie", self.DE_PREPS) + assert hits == [("Eugenie", "von")] + + def test_exact_first_name_receiver(self, seeded_matcher): + hits = seeded_matcher.find_in_query("Briefe an Herbert", self.DE_PREPS) + assert hits == [("Herbert", "an")] + + def test_fuzzy_typo(self, seeded_matcher): + hits = seeded_matcher.find_in_query("Briefe von Herrbert Cram", self.DE_PREPS) + assert len(hits) == 1 + assert hits[0][1] == "von" + + def test_two_persons_extracted(self, seeded_matcher): + hits = seeded_matcher.find_in_query( + "Briefe von Clara Cram an Herbert Cram", self.DE_PREPS + ) + assert len(hits) == 2 + assert hits[0][0] == "Clara Cram" + assert hits[0][1] == "von" + assert hits[1][0] == "Herbert Cram" + assert hits[1][1] == "an" + + def test_no_match_for_place_name(self, seeded_matcher): + hits = seeded_matcher.find_in_query("Reise nach Mexiko", self.DE_PREPS) + assert hits == [] + + def test_no_match_for_topic_word(self, seeded_matcher): + hits = seeded_matcher.find_in_query("Briefe aus dem Krieg", self.DE_PREPS) + assert hits == [] + + def test_first_name_eugenie_regression(self, seeded_matcher): + # spaCy NER missed standalone first names + hits = seeded_matcher.find_in_query("Briefe von Eugenie", self.DE_PREPS) + assert len(hits) == 1 + + def test_merged_names_regression(self, seeded_matcher): + # spaCy NER merged "Herbert an Eugenie de Gruyter" into one PER span + hits = seeded_matcher.find_in_query( + "Briefe von Herbert an Eugenie de Gruyter nach 1914", self.DE_PREPS + ) + assert len(hits) == 2 + names = [h[0] for h in hits] + assert "Herbert" in names + assert "Eugenie de Gruyter" in names + + def test_english_preps(self, seeded_matcher): + en_preps = frozenset({"from", "by", "to", "for"}) + hits = seeded_matcher.find_in_query( + "Letters from Clara Cram to Walter de Gruyter in 1920", en_preps + ) + assert len(hits) == 2 + assert hits[0][0] == "Clara Cram" + assert hits[1][0] == "Walter de Gruyter" + + def test_double_preposition_de(self, seeded_matcher): + hits = seeded_matcher.find_in_query( + "Briefe von Clara nach Herbert", self.DE_PREPS + ) + assert len(hits) == 2 + names = [h[0] for h in hits] + assert "Clara" in names + assert "Herbert" in names + + +# ── Date extraction tests ───────────────────────────────────────────────────── + +class TestExtractDates: + def test_bare_year_gives_range(self): + assert extract_dates("Briefe 1920", "de") == ("1920-01-01", "1920-12-31") + + def test_im_jahr(self): + assert extract_dates("Schriften im Jahr 1905", "de") == ( + "1905-01-01", "1905-12-31" + ) + + def test_vor_year(self): + assert extract_dates("Briefe vor 1920", "de") == (None, "1920-12-31") + + def test_nach_year(self): + assert extract_dates("Schriften nach 1920", "de") == ("1920-01-01", None) + + def test_zwischen(self): + assert extract_dates("Dokumente zwischen 1914 und 1918", "de") == ( + "1914-01-01", "1918-12-31" + ) + + def test_before_en(self): + assert extract_dates("Letters before 1918", "en") == (None, "1918-12-31") + + def test_after_en(self): + assert extract_dates("Letters after 1939", "en") == ("1939-01-01", None) + + def test_between_en(self): + assert extract_dates("Letters between 1914 and 1918", "en") == ( + "1914-01-01", "1918-12-31" + ) + + def test_antes_de_es(self): + assert extract_dates("Cartas antes de 1900", "es") == (None, "1900-12-31") + + def test_entre_es(self): + assert extract_dates("entre 1915 y 1920", "es") == ( + "1915-01-01", "1920-12-31" + ) + + def test_no_year(self): + assert extract_dates("Briefe aus dem Krieg", "de") == (None, None) + + def test_nach_before_person_then_year(self): + # "nach Marie 1920" — "nach" belongs to person, not date + date_from, date_to = extract_dates("Briefe nach Marie 1920", "de", ["Marie"]) + assert date_from == "1920-01-01" + assert date_to == "1920-12-31" + + def test_bare_year_alone(self): + assert extract_dates("1918", "de") == ("1918-01-01", "1918-12-31") + + +# ── Keyword extraction tests ────────────────────────────────────────────────── + +class TestExtractKeywords: + def test_basic_topic_words(self): + kws = extract_keywords("Briefe aus dem Krieg", "de", [], []) + assert "krieg" in kws + + def test_stopwords_excluded(self): + kws = extract_keywords("von der nach dem aus", "de", [], []) + for sw in ("von", "der", "nach", "dem", "aus"): + assert sw not in kws + + def test_person_spans_excluded(self): + kws = extract_keywords( + "Briefe von Clara Cram nach Herbert", "de", + ["Clara Cram", "Herbert"], [] + ) + assert "clara" not in kws + assert "cram" not in kws + assert "herbert" not in kws + + def test_years_excluded(self): + kws = extract_keywords("Schriften 1920 über Reise", "de", [], ["1920"]) + assert "1920" not in kws + + def test_deduplication(self): + kws = extract_keywords("Krieg Krieg Krieg", "de", [], []) + assert kws.count("krieg") == 1 + + def test_en_stopwords(self): + kws = extract_keywords("Letters about the war", "en", [], []) + assert "the" not in kws + assert "war" in kws + + def test_short_words_excluded(self): + kws = extract_keywords("ab cd ef xy", "de", [], []) + assert all(len(k) >= 3 for k in kws) + + +# ── Full pipeline integration tests ────────────────────────────────────────── + +class TestExtract: + def test_full_sentence_de(self): + r = extract("Briefe von Clara Cram an Walter de Gruyter im Jahr 1920", "de") + assert "Clara Cram" in r.personNames + assert "Walter de Gruyter" in r.personNames + assert r.personRole == "any" + assert r.dateFrom == "1920-01-01" + assert r.dateTo == "1920-12-31" + + def test_sender_role_de(self): + r = extract("Briefe von Clara Cram vor 1910", "de") + assert r.personNames == ["Clara Cram"] + assert r.personRole == "sender" + assert r.dateTo == "1910-12-31" + assert r.dateFrom is None + + def test_receiver_role_de(self): + r = extract("Briefe an Walter de Gruyter", "de") + assert r.personNames == ["Walter de Gruyter"] + assert r.personRole == "receiver" + + def test_first_name_only_eugenie(self): + r = extract("Briefe von Eugenie", "de") + assert "Eugenie" in r.personNames + assert r.personRole == "sender" + + def test_first_name_only_herbert(self): + r = extract("Kriegsbriefe von Herbert", "de") + assert "Herbert" in r.personNames + + def test_merged_names_bug_fixed(self): + r = extract("Briefe von Herbert an Eugenie de Gruyter nach 1914", "de") + assert "Herbert" in r.personNames + assert "Eugenie de Gruyter" in r.personNames + assert r.dateFrom == "1914-01-01" + + def test_topic_only_krieg(self): + r = extract("Briefe aus dem Krieg", "de") + assert r.personNames == [] + assert "krieg" in r.keywords + + def test_topic_only_single_word(self): + r = extract("Kriegspost", "de") + assert r.personNames == [] + + def test_date_range_only(self): + r = extract("Dokumente zwischen 1914 und 1918", "de") + assert r.personNames == [] + assert r.dateFrom == "1914-01-01" + assert r.dateTo == "1918-12-31" + + def test_colloquial_von(self): + r = extract("von Clara", "de") + assert r.personNames == ["Clara"] + assert r.personRole == "sender" + + def test_colloquial_an(self): + r = extract("an Walter", "de") + assert r.personNames == ["Walter"] + assert r.personRole == "receiver" + + def test_bare_year_alone(self): + r = extract("1918", "de") + assert r.dateFrom == "1918-01-01" + assert r.dateTo == "1918-12-31" + assert r.personNames == [] + + def test_english_full_sentence(self): + r = extract("Letters from Clara Cram to Walter de Gruyter in 1920", "en") + assert "Clara Cram" in r.personNames + assert "Walter de Gruyter" in r.personNames + assert r.dateFrom == "1920-01-01" + + def test_english_receiver_with_date(self): + r = extract("Letters to Herbert Cram after 1939", "en") + assert "Herbert Cram" in r.personNames + assert r.personRole == "receiver" + assert r.dateFrom == "1939-01-01" + + def test_english_birthday(self): + r = extract("Birthday greetings from Anita Wöhler", "en") + assert "Anita Wöhler" in r.personNames + assert r.personRole == "sender" + + def test_english_between_dates(self): + r = extract("Letters between 1914 and 1918", "en") + assert r.dateFrom == "1914-01-01" + assert r.dateTo == "1918-12-31" + + def test_spanish_full_sentence(self): + r = extract("Cartas de Clara Cram a Walter de Gruyter en 1920", "es") + assert "Clara Cram" in r.personNames + assert "Walter de Gruyter" in r.personNames + assert r.dateFrom == "1920-01-01" + + def test_spanish_before(self): + r = extract("Cartas antes de 1900", "es") + assert r.dateTo == "1900-12-31" + assert r.dateFrom is None + + def test_rawquery_echoed(self): + q = "test query" + r = extract(q, "de") + assert r.rawQuery == q + + def test_false_positive_compound_noun_regression(self): + # spaCy tagged "Geburtstagsglückwünsche" as a PER entity + r = extract("Geburtstagsglückwünsche", "de") + assert r.personNames == [] + + def test_question_phrasing(self): + r = extract("Wer hat an Herbert Cram 1918 geschrieben?", "de") + assert "Herbert Cram" in r.personNames + assert r.personRole == "receiver" + assert r.dateFrom == "1918-01-01" + + def test_lowercase_query(self): + r = extract("briefe von clara cram an herbert 1920", "de") + # Should still find persons despite lowercase + assert len(r.personNames) >= 1 + + def test_empty_matcher_returns_no_persons(self): + # Temporarily use an empty matcher + from extractor import set_person_matcher + empty = PersonMatcher() + set_person_matcher(empty) + r = extract("Briefe von Clara Cram", "de") + assert r.personNames == [] + # Restore seeded matcher + m = PersonMatcher() + m.load(_TEST_PERSONS) + set_person_matcher(m) diff --git a/nlp-service/test_main.py b/nlp-service/test_main.py index d9382e2d..0b169c34 100644 --- a/nlp-service/test_main.py +++ b/nlp-service/test_main.py @@ -1,43 +1,72 @@ +"""Integration tests for the FastAPI app.""" import pytest from fastapi.testclient import TestClient +from extractor import set_person_matcher +from person_matcher import PersonMatcher + +_TEST_PERSONS = [ + ("Clara", "Cram"), + ("Herbert", "Cram"), + ("Eugenie", "de Gruyter"), + ("Walter", "de Gruyter"), + ("Marie", "Cram"), + ("Anita", "Wöhler"), +] + @pytest.fixture(scope="session") def client(): + # Pre-seed the matcher so the lifespan doesn't overwrite it with an empty one. + m = PersonMatcher() + m.load(_TEST_PERSONS) + set_person_matcher(m) from main import app with TestClient(app) as c: yield c def test_health(client): - response = client.get("/health") - assert response.status_code == 200 - assert response.json() == {"status": "ok"} + r = client.get("/health") + assert r.status_code == 200 + assert r.json()["status"] == "ok" + assert r.json()["persons_loaded"] > 0 def test_parse_returns_200_with_all_fields(client): - response = client.post("/parse", json={"query": "Briefe vor 1920", "lang": "de"}) - assert response.status_code == 200 - data = response.json() - assert "personNames" in data - assert "personRole" in data - assert data["personRole"] in ("sender", "receiver", "any") - assert "dateFrom" in data - assert "dateTo" in data - assert "keywords" in data - assert "rawQuery" in data - assert data["rawQuery"] == "Briefe vor 1920" - assert data["dateTo"] == "1920-12-31" + r = client.post("/parse", json={"query": "Briefe vor 1920", "lang": "de"}) + assert r.status_code == 200 + d = r.json() + assert "personNames" in d + assert d["personRole"] in ("sender", "receiver", "any") + assert "dateFrom" in d + assert "dateTo" in d + assert "keywords" in d + assert d["rawQuery"] == "Briefe vor 1920" + assert d["dateTo"] == "1920-12-31" + + +def test_parse_person_with_date(client): + r = client.post( + "/parse", + json={"query": "Briefe von Clara Cram an Walter de Gruyter im Jahr 1920", "lang": "de"}, + ) + assert r.status_code == 200 + d = r.json() + assert "Clara Cram" in d["personNames"] + assert "Walter de Gruyter" in d["personNames"] + assert d["dateFrom"] == "1920-01-01" + assert d["dateTo"] == "1920-12-31" def test_parse_unknown_lang_returns_422(client): - response = client.post("/parse", json={"query": "test", "lang": "fr"}) - assert response.status_code == 422 + r = client.post("/parse", json={"query": "test", "lang": "fr"}) + assert r.status_code == 422 def test_parse_missing_query_returns_422(client): - response = client.post("/parse", json={"lang": "de"}) - assert response.status_code == 422 + r = client.post("/parse", json={"lang": "de"}) + assert r.status_code == 422 def test_parse_all_languages(client): @@ -47,6 +76,6 @@ def test_parse_all_languages(client): ("es", "cartas antes de 1920"), ] for lang, query in cases: - response = client.post("/parse", json={"query": query, "lang": lang}) - assert response.status_code == 200, f"Failed for lang={lang}" - assert response.json()["dateTo"] == "1920-12-31", f"Wrong dateTo for lang={lang}" + r = client.post("/parse", json={"query": query, "lang": lang}) + assert r.status_code == 200, f"Failed for lang={lang}" + assert r.json()["dateTo"] == "1920-12-31", f"Wrong dateTo for lang={lang}" diff --git a/nlp-service/test_sentences.md b/nlp-service/test_sentences.md new file mode 100644 index 00000000..66143c3f --- /dev/null +++ b/nlp-service/test_sentences.md @@ -0,0 +1,126 @@ +# NLP Service — Test Sentences + +Real data drawn from the Familienarchiv DB (2026-06-07). +Top persons: Clara Cram, Herbert Cram, Eugenie de Gruyter, Walter de Gruyter, Marie Cram, +Juan Cram, Albert de Gruyter, Hilde de Gruyter, Else Bohrmann, Anita Wöhler, Lili Duvenbeck. +Date range: ~1895–1945. Key tags: Krieg, Hochzeit, Reise, Geburtstag, Tod, Alltag, Briefwechsel. + +--- + +## German — full sentences + +```json +{"query": "Briefe von Clara Cram an Walter de Gruyter im Jahr 1920", "lang": "de"} +{"query": "Briefe von Herbert an Eugenie de Gruyter nach 1914", "lang": "de"} +{"query": "Schreiben von Albert de Gruyter an seine Kinder vor 1900", "lang": "de"} +{"query": "Briefe von Juan Cram an Marie zwischen 1915 und 1918", "lang": "de"} +{"query": "Telegramm von Walter de Gruyter an Clara im Jahr 1930", "lang": "de"} +{"query": "Briefe von Else Bohrmann an Herbert Cram nach 1939", "lang": "de"} +``` + +## German — medium (person + date, no strong role signal) + +```json +{"query": "Briefe von Clara Cram vor 1910", "lang": "de"} +{"query": "Dokumente über Walter de Gruyter aus den 1920er Jahren", "lang": "de"} +{"query": "Briefe an Herbert Cram nach dem Krieg", "lang": "de"} +{"query": "Schriften von Eugenie de Gruyter im Jahr 1905", "lang": "de"} +``` + +## German — short (person only) + +```json +{"query": "Briefe an Walter de Gruyter", "lang": "de"} +{"query": "Dokumente über Clara Cram", "lang": "de"} +{"query": "Herbert Cram", "lang": "de"} +{"query": "Anita Wöhler", "lang": "de"} +``` + +## German — topic only (keywords → tag resolution on Java side) + +```json +{"query": "Briefe aus dem Krieg", "lang": "de"} +{"query": "Kriegspost", "lang": "de"} +{"query": "Hochzeitsbriefe", "lang": "de"} +{"query": "Reisebriefe", "lang": "de"} +{"query": "Geburtstagsglückwünsche", "lang": "de"} +{"query": "Briefe über die Hochzeitsreise", "lang": "de"} +{"query": "Kinderbriefe", "lang": "de"} +{"query": "Familienbriefe aus dem Alltag", "lang": "de"} +{"query": "Brautbriefe", "lang": "de"} +{"query": "Kondolenzbriefe nach dem Tod von Eugenie", "lang": "de"} +``` + +## German — date range only + +```json +{"query": "Briefe aus dem Ersten Weltkrieg", "lang": "de"} +{"query": "Dokumente zwischen 1914 und 1918", "lang": "de"} +{"query": "Briefe vor 1900", "lang": "de"} +{"query": "Schriften nach 1920", "lang": "de"} +``` + +## German — combined (all fields) + +```json +{"query": "Briefe von Clara Cram an ihre Kinder über die Reise nach Mexiko im Jahr 1925", "lang": "de"} +{"query": "Kriegspost von Herbert Cram an Eugenie de Gruyter zwischen 1916 und 1918", "lang": "de"} +{"query": "Glückwünsche von Hilde de Gruyter zur Hochzeit im Jahr 1910", "lang": "de"} +{"query": "Kondolenzschreiben an Walter de Gruyter nach dem Tod von Eugenie", "lang": "de"} +``` + +## English + +```json +{"query": "Letters from Clara Cram to Walter de Gruyter in 1920", "lang": "en"} +{"query": "Letters about the war before 1918", "lang": "en"} +{"query": "Letters to Herbert Cram after 1939", "lang": "en"} +{"query": "Birthday greetings from Anita Wöhler", "lang": "en"} +{"query": "Letters between 1914 and 1918", "lang": "en"} +``` + +## Spanish + +```json +{"query": "Cartas de Clara Cram a Walter de Gruyter en 1920", "lang": "es"} +{"query": "Cartas antes de 1900", "lang": "es"} +{"query": "Cartas después de la guerra", "lang": "es"} +{"query": "Cartas de Juan Cram a sus hijos entre 1915 y 1920", "lang": "es"} +``` + +--- + +## Edge cases — lazy / missing words / typos + +```json +{"query": "Clara", "lang": "de"} +{"query": "Eugenie", "lang": "de"} +{"query": "Herbert", "lang": "de"} +{"query": "de Gruyter", "lang": "de"} +{"query": "Briefe von Klara Kram an Herbert", "lang": "de"} +{"query": "briefe von clara cram an herbert 1920", "lang": "de"} +{"query": "1918", "lang": "de"} +{"query": "1914 1918", "lang": "de"} +{"query": "Krieg", "lang": "de"} +{"query": "Briefe von Eugenie", "lang": "de"} +{"query": "Clara Cram Herbert Cram 1920", "lang": "de"} +{"query": "Wer hat an Herbert Cram 1918 geschrieben?", "lang": "de"} +{"query": "von Clara", "lang": "de"} +{"query": "an Walter", "lang": "de"} +{"query": "Clara 1920", "lang": "de"} +{"query": "Kriegsbriefe von Herbert", "lang": "de"} +{"query": "Briefe von Clara nach Herbert", "lang": "de"} +{"query": "Briefe von Herrbert Cram", "lang": "de"} +``` + +--- + +## Known spaCy failures now fixed by DB-backed matcher + +| Query | spaCy result | Expected | +|---|---|---| +| `Briefe von Eugenie` | persons=[] | persons=["Eugenie"] | +| `Kriegsbriefe von Herbert` | keywords=["herbert"] | persons=["Herbert"] | +| `Briefe von Herbert an Eugenie de Gruyter nach 1914` | persons=["Herbert an Eugenie de Gruyter"] (merged!) | persons=["Herbert", "Eugenie de Gruyter"] | +| `Letters from Clara Cram to Walter de Gruyter` | persons=[] (EN model doesn't know German names) | persons=["Clara Cram", "Walter de Gruyter"] | +| `Geburtstagsglückwünsche` | persons=["Geburtstagsglückwünsche"] (false positive!) | persons=[] |