diff --git a/nlp-service/CLAUDE.md b/nlp-service/CLAUDE.md new file mode 100644 index 00000000..4b5300ea --- /dev/null +++ b/nlp-service/CLAUDE.md @@ -0,0 +1,41 @@ +# NLP Service + +Lightweight FastAPI service that parses free-text search queries into structured extractions, +replacing Ollama for the Familienarchiv NL search feature. + +## Stack + +- Python 3.11, FastAPI 0.115, spaCy 3.8, dateparser 1.2 + +## Endpoints + +- `POST /parse` — parses a free-text query, returns an extraction matching the `OllamaExtraction` contract +- `GET /health` — returns `{"status": "ok"}` when all models are loaded + +## Running locally + +```bash +pip install -r requirements.txt +for model in de_core_news_sm en_core_web_sm es_core_news_sm; do python -m spacy download "$model"; done +uvicorn main:app --reload --port 8001 + +curl -X POST http://localhost:8001/parse \ + -H "Content-Type: application/json" \ + -d '{"query": "Briefe von Opa Hermann an Marie vor 1920", "lang": "de"}' +``` + +## Testing + +```bash +pytest -v +``` + +## Design spec + +See `docs/superpowers/specs/2026-06-07-spacy-nlp-service-design.md`. + +## Notes + +This is a **prototype** for extraction quality evaluation. No docker-compose integration or +Java-side changes in this iteration. The extraction contract matches `OllamaExtraction` in +`backend/src/main/java/org/raddatz/familienarchiv/search/`.
diff --git a/nlp-service/models.py b/nlp-service/models.py
new file mode 100644
index 00000000..e36fb89e
--- /dev/null
+++ b/nlp-service/models.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+from typing import Literal
+from pydantic import BaseModel
+
+
+class ParseRequest(BaseModel):  # input model: one free-text query plus its language code
+    query: str  # free-text search query, e.g. "Briefe von Opa" in the tests
+    lang: Literal["de", "en", "es"]  # closed set; any other code fails validation
+
+
+class ParseResponse(BaseModel):  # structured extraction; NOTE(review): camelCase presumably mirrors the Java-side OllamaExtraction contract - confirm
+    personNames: list[str]  # person names found in the query
+    personRole: Literal["sender", "receiver", "any"]  # role the named persons play in the search
+    dateFrom: str | None  # nullable but has no default, so it must be passed explicitly (pydantic v2 semantics)
+    dateTo: str | None  # e.g. "1920-12-31" in the tests; the string format is not validated here
+    keywords: list[str]  # remaining search terms
+    rawQuery: str  # NOTE(review): presumably the unmodified input query - confirm against the caller
diff --git a/nlp-service/requirements.txt b/nlp-service/requirements.txt
new file mode 100644
index 00000000..14c14462
--- /dev/null
+++ b/nlp-service/requirements.txt
@@ -0,0 +1,6 @@
+fastapi[standard]==0.115.6
+uvicorn[standard]==0.34.0
+spacy>=3.8,<4.0
+dateparser>=1.2,<2.0
+pytest>=8.0,<9.0  # test-only: imported by test_extractor.py
+httpx>=0.28,<1.0  # NOTE(review): not imported in this patch; presumably for FastAPI's TestClient - confirm
diff --git a/nlp-service/test_extractor.py b/nlp-service/test_extractor.py
new file mode 100644
index 00000000..0b80d0b4
--- /dev/null
+++ b/nlp-service/test_extractor.py
@@ -0,0 +1,33 @@
+import pytest
+from pydantic import ValidationError
+
+
+# ── Models ──────────────────────────────────────────────────────────────────
+
+def test_parse_request_valid():  # a supported language code is accepted and both fields round-trip unchanged
+    from models import ParseRequest
+    req = ParseRequest(query="Briefe von Opa", lang="de")
+    assert req.query == "Briefe von Opa"
+    assert req.lang == "de"
+
+
+def test_parse_request_rejects_unknown_lang():  # "fr" is outside the Literal["de", "en", "es"] declared on ParseRequest.lang
+    from models import ParseRequest
+    with pytest.raises(ValidationError):
+        ParseRequest(query="Letters from grandpa", lang="fr")
+
+
+def test_parse_response_serializes_nulls():  # a None date must survive model_dump() as None rather than being dropped
+    from models import ParseResponse
+    resp = ParseResponse(
+        personNames=["Opa"],
+        personRole="sender",
+        dateFrom=None,  # passed explicitly: the field has no default
+        dateTo="1920-12-31",
+        keywords=["brief"],
+        rawQuery="Briefe von Opa",
+    )
+    data = resp.model_dump()  # plain dict keyed by field name
+    assert data["dateFrom"] is None
+    assert data["dateTo"] == "1920-12-31"
+    assert data["personRole"] == "sender"