refactor(search): remove NLP/smart-search feature entirely #772
@@ -31,3 +31,10 @@ def get_nlp(lang: str) -> Language:
|
||||
def load_all_models() -> None:
    """Call ``get_nlp`` once for every language key in ``_MODEL_NAMES``.

    The return value of each call is discarded; the function itself
    returns ``None``.
    """
    for language_code in _MODEL_NAMES:
        get_nlp(language_code)
|
||||
|
||||
|
||||
# ── Step 1: Person name extraction ──────────────────────────────────────────
|
||||
|
||||
def extract_person_names(doc) -> list[str]:
    """Collect the text of every entity on *doc* whose label is ``"PER"``.

    Names are returned in the order in which ``doc.ents`` yields them;
    entities carrying any other label are skipped.
    """
    person_names: list[str] = []
    for entity in doc.ents:
        if entity.label_ != "PER":
            continue
        person_names.append(entity.text)
    return person_names
|
||||
|
||||
@@ -72,3 +72,48 @@ def test_get_nlp_unknown_lang_raises():
|
||||
from extractor import get_nlp
|
||||
with pytest.raises(ValueError, match="Unsupported language"):
|
||||
get_nlp("fr")
|
||||
|
||||
|
||||
# ── Person name extraction ───────────────────────────────────────────────────
|
||||
|
||||
def _make_doc_with_ents(nlp, text: str, char_ents: list[tuple[int, int, str]]):
|
||||
"""Create a Doc with manually injected entity spans (no NER model needed)."""
|
||||
doc = nlp.make_doc(text)
|
||||
spans = [doc.char_span(s, e, label=lbl) for s, e, lbl in char_ents]
|
||||
doc.ents = [sp for sp in spans if sp is not None]
|
||||
return doc
|
||||
|
||||
|
||||
def test_extract_person_names_two_persons(nlp_de):
    """Two PER spans in one sentence are both returned."""
    from extractor import extract_person_names

    sentence = "Briefe von Opa Hermann an Marie"
    # Character offsets: "Opa Hermann" -> 11..22, "Marie" -> 26..31
    person_spans = [(11, 22, "PER"), (26, 31, "PER")]
    doc = _make_doc_with_ents(nlp_de, sentence, person_spans)

    assert extract_person_names(doc) == ["Opa Hermann", "Marie"]
|
||||
|
||||
|
||||
def test_extract_person_names_preserves_order(nlp_de):
    """Names come back in the order they occur in the text."""
    from extractor import extract_person_names

    sentence = "Marie von Opa"
    # "Marie" appears first in the text.
    # Character offsets: "Marie" -> 0..5, "Opa" -> 10..13
    person_spans = [(0, 5, "PER"), (10, 13, "PER")]
    doc = _make_doc_with_ents(nlp_de, sentence, person_spans)

    assert extract_person_names(doc) == ["Marie", "Opa"]
|
||||
|
||||
|
||||
def test_extract_person_names_empty(nlp_de):
    """A doc without any entities yields an empty list."""
    from extractor import extract_person_names

    no_entities = []
    doc = _make_doc_with_ents(nlp_de, "Briefe aus dem Krieg", no_entities)

    assert extract_person_names(doc) == []
|
||||
|
||||
|
||||
def test_extract_person_names_ignores_non_per(nlp_de):
    """Entities with a label other than PER are not returned."""
    from extractor import extract_person_names

    # The only entity is a DATE ("1920" at 7..11); it must not show up
    # among the extracted person names.
    date_only = [(7, 11, "DATE")]
    doc = _make_doc_with_ents(nlp_de, "Briefe 1920", date_only)

    assert extract_person_names(doc) == []
|
||||
|
||||
Reference in New Issue
Block a user