89 lines
2.6 KiB
Python
89 lines
2.6 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from datetime import date
|
|
|
|
import dateparser
|
|
import spacy
|
|
from spacy.language import Language
|
|
|
|
from models import ParseResponse
|
|
|
|
# ── Language model registry ──────────────────────────────────────────────────
|
|
|
|
# Supported language codes mapped to the name of the spaCy pipeline package
# that is passed to spacy.load() for that language.
_MODEL_NAMES: dict[str, str] = {
    "de": "de_core_news_sm",
    "en": "en_core_web_sm",
    "es": "es_core_news_sm",
}

# Pipelines that have already been loaded, keyed by language code, so each
# model is loaded at most once per process.
_nlp_cache: dict[str, Language] = {}
|
|
|
|
|
|
def get_nlp(lang: str) -> Language:
    """Return the spaCy pipeline for ``lang``, loading it on first request.

    The loaded pipeline is stored in ``_nlp_cache`` so that subsequent calls
    for the same language return the same object without reloading.

    Raises:
        ValueError: If ``lang`` has no entry in ``_MODEL_NAMES``.
    """
    model_name = _MODEL_NAMES.get(lang)
    if model_name is None:
        raise ValueError(f"Unsupported language: {lang!r}. Valid: {list(_MODEL_NAMES)}")

    # Fast path: this language's model was loaded by an earlier call.
    if lang in _nlp_cache:
        return _nlp_cache[lang]

    pipeline = spacy.load(model_name)
    _nlp_cache[lang] = pipeline
    return pipeline
|
|
|
|
|
|
def load_all_models() -> None:
    """Load the pipeline for every language registered in ``_MODEL_NAMES``.

    Each language is passed through ``get_nlp``, which caches the result, so
    calling this up front means later ``get_nlp`` calls do not have to load.
    """
    for language_code in _MODEL_NAMES:
        get_nlp(language_code)
|
|
|
|
|
|
# ── Step 1: Person name extraction ──────────────────────────────────────────
|
|
|
|
# Entity labels that denote a person. spaCy's German and Spanish news models
# use "PER", whereas the English web model uses the OntoNotes label "PERSON";
# matching only "PER" would silently drop every English name.
_PERSON_LABELS = frozenset({"PER", "PERSON"})


def extract_person_names(doc) -> list[str]:
    """Return the text of each person entity in ``doc``.

    Args:
        doc: A parsed document exposing ``ents``; every entity must provide
            ``text`` and ``label_``.

    Returns:
        The entity texts in the order ``doc.ents`` yields them (left-to-right
        span order for spaCy documents). Entities whose label is neither
        ``"PER"`` nor ``"PERSON"`` are skipped.
    """
    return [ent.text for ent in doc.ents if ent.label_ in _PERSON_LABELS]
|
|
|
|
|
|
# ── Step 2: Role detection ───────────────────────────────────────────────────
|
|
|
|
# Per-language prepositions that mark the person they introduce as the sender.
_SENDER_PREPS: dict[str, frozenset[str]] = {
    "de": frozenset({"von", "vom"}),
    "en": frozenset({"from", "by"}),
    "es": frozenset({"de", "por"}),
}

# Per-language prepositions that mark the person they introduce as the receiver.
_RECEIVER_PREPS: dict[str, frozenset[str]] = {
    "de": frozenset({"an", "nach", "für"}),
    "en": frozenset({"to", "for"}),
    "es": frozenset({"para", "a"}),
}

# Dependency labels under which a role-marking preposition may be attached
# as a dependent of the name.
_MARKER_DEPS = frozenset({"case", "prep", "mo"})


def detect_person_role(doc, per_spans: list, lang: str) -> str:
    """Return 'sender', 'receiver', or 'any' for a single-person query.

    Only meaningful for single-PER queries — two-person queries always return
    'any' because Java derives direction from list position.

    The role is read from the preposition that introduces the name. Three
    places are checked, in order:

    1. Dependents of the name's root token that precede the name (the
       preposition hangs off the noun, e.g. a ``case`` dependent).
    2. The head of the name's root token, when it precedes the name (the
       preposition governs the noun, as in "from -> John").
    3. The token immediately before the name, as a parser-independent
       fallback.

    Args:
        doc: The parsed document; indexable by token position.
        per_spans: Person entity spans found in ``doc``. Each span must
            expose ``root`` and ``start``.
        lang: Language code used to select the preposition tables.

    Returns:
        ``"sender"``, ``"receiver"``, or ``"any"`` when no marker is found or
        when ``per_spans`` does not contain exactly one span.

    Raises:
        KeyError: If exactly one span is given and ``lang`` has no entry in
            the preposition tables.
    """
    if len(per_spans) != 1:
        return "any"

    span = per_spans[0]
    root = span.root
    sender = _SENDER_PREPS[lang]
    receiver = _RECEIVER_PREPS[lang]

    def role_of(token) -> str:
        """Map a token to the role its lower-cased text signals, else 'any'."""
        word = token.lower_
        if word in sender:
            return "sender"
        if word in receiver:
            return "receiver"
        return "any"

    # 1. Dependents of the name. Only tokens *before* the span count: a
    #    preposition that follows the name ("John from accounting") starts a
    #    modifier of the name and says nothing about the name's own role.
    for child in root.children:
        if child.i < span.start and child.dep_ in _MARKER_DEPS:
            role = role_of(child)
            if role != "any":
                return role

    # 2. Governor of the name. When the preposition is the head of its object
    #    it never shows up among the name's children, so look upwards too.
    #    Requiring the head to precede the span also excludes a root token
    #    that is its own head.
    head = root.head
    if head.i < span.start:
        role = role_of(head)
        if role != "any":
            return role

    # 3. Fallback: token immediately before the span start.
    if span.start > 0:
        role = role_of(doc[span.start - 1])
        if role != "any":
            return role

    return "any"
|