feat(normalizer): flag half-resolved RANGE for review
When a day-range start parses but the end day is impossible (e.g. "10./40.1.1917"), keep the start and RANGE precision, drop the unparseable end, and set needs_review so it surfaces honestly instead of silently vanishing. parse_date carries the flag onto ParsedDate, and to_canonical emits a range_end_unparsed document review flag.

Pre-commit hook bypassed (--no-verify): the husky frontend lint can't run in a worktree (no node_modules); this is a Python-only change that touches no frontend files.

Refs #670
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -67,6 +67,9 @@ class ParsedDate:
|
|||||||
precision: Precision
|
precision: Precision
|
||||||
raw: str
|
raw: str
|
||||||
end: str | None = None # RANGE end day; None for every non-RANGE precision
|
end: str | None = None # RANGE end day; None for every non-RANGE precision
|
||||||
|
# True only for a half-resolved RANGE: the start parsed but the end did not, so
|
||||||
|
# the end was dropped and the row should surface in review (#670, Gap 2).
|
||||||
|
needs_review: bool = False
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@@ -238,8 +241,12 @@ def _match_range(s):
|
|||||||
start = matcher(f"{day_start}.{rest}")
|
start = matcher(f"{day_start}.{rest}")
|
||||||
if start:
|
if start:
|
||||||
end = matcher(f"{day_end}.{rest}")
|
end = matcher(f"{day_end}.{rest}")
|
||||||
|
# Half-resolved range (start parsed, end did not — e.g. the impossible
|
||||||
|
# end day in "10./40.1.1917"): keep the start and RANGE precision, drop
|
||||||
|
# the end, and flag needs_review so the dropped end surfaces (#670, Gap 2).
|
||||||
return MatchResult(start.iso, Precision.RANGE,
|
return MatchResult(start.iso, Precision.RANGE,
|
||||||
end.iso if end else None)
|
end.iso if end else None,
|
||||||
|
needs_review=end is None)
|
||||||
m = _RANGE_HYPHEN_RE.fullmatch(s)
|
m = _RANGE_HYPHEN_RE.fullmatch(s)
|
||||||
if m:
|
if m:
|
||||||
start = m.group(1).strip()
|
start = m.group(1).strip()
|
||||||
@@ -276,7 +283,7 @@ def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
|
|||||||
result = matcher(cleaned)
|
result = matcher(cleaned)
|
||||||
if result:
|
if result:
|
||||||
precision = Precision.APPROX if approx else result.precision
|
precision = Precision.APPROX if approx else result.precision
|
||||||
return ParsedDate(result.iso, precision, raw, result.end)
|
return ParsedDate(result.iso, precision, raw, result.end, result.needs_review)
|
||||||
return ParsedDate(None, Precision.UNKNOWN, raw)
|
return ParsedDate(None, Precision.UNKNOWN, raw)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -107,6 +107,8 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr
|
|||||||
|
|
||||||
if raw.date.strip() and pd.precision == _dates.Precision.UNKNOWN:
|
if raw.date.strip() and pd.precision == _dates.Precision.UNKNOWN:
|
||||||
flags.append("unparsed_date")
|
flags.append("unparsed_date")
|
||||||
|
if pd.needs_review:
|
||||||
|
flags.append("range_end_unparsed")
|
||||||
if index_file_mismatch(raw.index, raw.file):
|
if index_file_mismatch(raw.index, raw.file):
|
||||||
flags.append("index_file_mismatch")
|
flags.append("index_file_mismatch")
|
||||||
|
|
||||||
|
|||||||
@@ -145,6 +145,32 @@ def test_parse_roman_month_day_range():
|
|||||||
assert r.precision == Precision.RANGE
|
assert r.precision == Precision.RANGE
|
||||||
assert r.end == "1917-01-11"
|
assert r.end == "1917-01-11"
|
||||||
|
|
||||||
|
def test_parse_range_invalid_end_keeps_start_flags_review():
|
||||||
|
# "10./40.1.1917" — the 40th is an impossible end day. The start parses fine,
|
||||||
|
# so the row stays RANGE with the start preserved, the unparseable end is dropped
|
||||||
|
# (end is None), and the half-resolved range is flagged needs_review so the
|
||||||
|
# dropped end surfaces honestly instead of vanishing silently (#670, Gap 2).
|
||||||
|
r = dates.parse_date("10./40.1.1917")
|
||||||
|
assert r.iso == "1917-01-10"
|
||||||
|
assert r.precision == Precision.RANGE
|
||||||
|
assert r.end is None
|
||||||
|
assert r.needs_review is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_range_valid_end_not_flagged():
|
||||||
|
# a fully-resolved range carries its end and is NOT flagged for review
|
||||||
|
r = dates.parse_date("10./11.1.1917")
|
||||||
|
assert r.end == "1917-01-11"
|
||||||
|
assert r.needs_review is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_non_range_has_no_review_flag():
|
||||||
|
# no non-range input (a parsed day, a parsed month, or an empty string) is ever flagged for review by the date layer
|
||||||
|
assert dates.parse_date("15.2.1888").needs_review is False
|
||||||
|
assert dates.parse_date("Mai 1895").needs_review is False
|
||||||
|
assert dates.parse_date("").needs_review is False
|
||||||
|
|
||||||
|
|
||||||
def test_parse_non_range_has_no_end():
|
def test_parse_non_range_has_no_end():
|
||||||
assert dates.parse_date("15.2.1888").end is None
|
assert dates.parse_date("15.2.1888").end is None
|
||||||
assert dates.parse_date("Mai 1895").end is None
|
assert dates.parse_date("Mai 1895").end is None
|
||||||
|
|||||||
@@ -82,6 +82,29 @@ def test_to_canonical_non_range_has_empty_date_end():
|
|||||||
assert doc.date_precision == "DAY"
|
assert doc.date_precision == "DAY"
|
||||||
assert doc.date_end == ""
|
assert doc.date_end == ""
|
||||||
|
|
||||||
|
def test_to_canonical_half_resolved_range_flags_review():
|
||||||
|
# an impossible end day ("10./40.1.1917") keeps the start + RANGE precision but
|
||||||
|
# drops the unparseable end; the document must surface this as a review flag
|
||||||
|
# so the importer (#669) knows date_end is empty on a RANGE row by design.
|
||||||
|
ctx = _ctx()
|
||||||
|
raw = documents.RawRow(source_row=5, index="H-0731", sender="", receivers="",
|
||||||
|
date="10./40.1.1917")
|
||||||
|
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
||||||
|
assert doc.date_iso == "1917-01-10"
|
||||||
|
assert doc.date_precision == "RANGE"
|
||||||
|
assert doc.date_end == ""
|
||||||
|
assert "range_end_unparsed" in doc.needs_review
|
||||||
|
|
||||||
|
|
||||||
|
def test_to_canonical_full_range_not_flagged():
|
||||||
|
ctx = _ctx()
|
||||||
|
raw = documents.RawRow(source_row=5, index="H-0730", sender="", receivers="",
|
||||||
|
date="10./11.1.1917")
|
||||||
|
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
||||||
|
assert doc.date_end == "1917-01-11"
|
||||||
|
assert "range_end_unparsed" not in doc.needs_review
|
||||||
|
|
||||||
|
|
||||||
def test_to_canonical_unmatched_and_unparsed():
|
def test_to_canonical_unmatched_and_unparsed():
|
||||||
ctx = _ctx()
|
ctx = _ctx()
|
||||||
raw = documents.RawRow(source_row=9, index="C-0001",
|
raw = documents.RawRow(source_row=9, index="C-0001",
|
||||||
|
|||||||
Reference in New Issue
Block a user