feat(normalizer): person resolution context + to_canonical

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 14:18:09 +02:00
parent 3066d3d3ff
commit 88c8063227
3 changed files with 166 additions and 0 deletions

View File

@@ -2,6 +2,8 @@
from dataclasses import dataclass, field
from enum import Enum, auto
import dates as _dates
class Triage(Enum):
OK = auto()
@@ -84,3 +86,33 @@ def index_file_mismatch(index: str, file_path: str) -> bool:
basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
stem = basename.rsplit(".", 1)[0]
return stem != index
def to_canonical(raw, ctx, date_overrides: dict) -> CanonicalDocument:
    """Convert one raw record into a ``CanonicalDocument``.

    The raw date string is parsed with ``_dates.parse_date`` (passing
    ``date_overrides`` through), the sender and receivers are resolved via
    ``ctx``, and every irregularity noticed on the way is recorded as a
    string flag in the resulting document's ``needs_review`` list.

    Flags are appended in this fixed order, each only when it applies:
    ``unmatched_sender``, ``multi_sender``, ``unmatched_receiver``,
    ``unparsed_date``, ``index_file_mismatch``.

    Args:
        raw: Source record; the attributes read here are ``date``,
            ``sender``, ``receivers``, ``source_row``, ``index``, ``file``,
            ``box``, ``folder``, ``location``, ``tags`` and ``summary``.
        ctx: Resolution context exposing ``resolve_sender`` and
            ``resolve_receivers``.
        date_overrides: Mapping forwarded unchanged to ``_dates.parse_date``.

    Returns:
        The populated ``CanonicalDocument``.
    """
    parsed_date = _dates.parse_date(raw.date, date_overrides)
    review_flags = []

    # resolve_sender yields a 4-tuple: (person id, name, matched?, multiple?).
    resolved_sender = ctx.resolve_sender(raw.sender, raw.source_row)
    sender_id, sender_name, sender_matched, sender_multi = resolved_sender
    # An empty / whitespace-only sender is not reported as unmatched.
    if raw.sender.strip():
        if not sender_matched:
            review_flags.append("unmatched_sender")
    if sender_multi:
        review_flags.append("multi_sender")

    # Each resolved receiver is a 3-item entry: (person id, name, matched?).
    resolved_receivers = ctx.resolve_receivers(raw.receivers, raw.source_row)
    if not all(is_matched for _pid, _name, is_matched in resolved_receivers):
        review_flags.append("unmatched_receiver")

    # Only a non-blank raw date that still parsed to UNKNOWN is flagged.
    if raw.date.strip():
        if parsed_date.precision == _dates.Precision.UNKNOWN:
            review_flags.append("unparsed_date")

    if index_file_mismatch(raw.index, raw.file):
        review_flags.append("index_file_mismatch")

    receiver_ids = [entry[0] for entry in resolved_receivers]
    receiver_names = [entry[1] for entry in resolved_receivers]

    # The raw tags value is wrapped as a single list element, not split.
    tag_list = []
    if raw.tags:
        tag_list.append(raw.tags)

    return CanonicalDocument(
        index=raw.index,
        box=raw.box,
        folder=raw.folder,
        sender_person_id=sender_id,
        sender_name=sender_name,
        receiver_person_ids=receiver_ids,
        receiver_names=receiver_names,
        date_iso=parsed_date.iso or "",
        date_raw=raw.date,
        # NOTE(review): if Precision is a plain Enum, str() renders as
        # "Precision.<NAME>" -- confirm that is the intended stored form.
        date_precision=str(parsed_date.precision),
        location=raw.location,
        tags=tag_list,
        summary=raw.summary,
        source_row=raw.source_row,
        needs_review=review_flags,
    )