feat(normalizer): flag half-resolved RANGE for review

When a day-range start parses but the end day is impossible (e.g.
"10./40.1.1917"), keep the start and RANGE precision, drop the
unparseable end, and set needs_review so the dropped end surfaces for
review instead of silently vanishing. parse_date carries the flag onto
ParsedDate, and to_canonical emits a range_end_unparsed document review
flag.

Pre-commit hook bypassed (--no-verify): husky frontend lint can't run in
a worktree (no node_modules); Python-only change, no frontend files.

Refs #670

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-27 08:18:36 +02:00
parent fa3f4167e9
commit fee3c7e27d
4 changed files with 60 additions and 2 deletions

View File

@@ -67,6 +67,9 @@ class ParsedDate:
precision: Precision
raw: str
# RANGE end day. None for every non-RANGE precision, and also for a
# half-resolved RANGE whose end failed to parse (needs_review is True then).
end: str | None = None
# True only for a half-resolved RANGE: the start parsed but the end did not, so
# the end was dropped and the row should surface in review (#670, Gap 2).
needs_review: bool = False
@dataclass(frozen=True)
@@ -238,8 +241,12 @@ def _match_range(s):
start = matcher(f"{day_start}.{rest}")
if start:
end = matcher(f"{day_end}.{rest}")
# Half-resolved range (start parsed, end did not — e.g. the impossible
# end day in "10./40.1.1917"): keep the start and RANGE precision, drop
# the end, and flag needs_review so the dropped end surfaces (#670, Gap 2).
return MatchResult(start.iso, Precision.RANGE,
end.iso if end else None)
end.iso if end else None,
needs_review=end is None)
m = _RANGE_HYPHEN_RE.fullmatch(s)
if m:
start = m.group(1).strip()
@@ -276,7 +283,7 @@ def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
result = matcher(cleaned)
if result:
precision = Precision.APPROX if approx else result.precision
return ParsedDate(result.iso, precision, raw, result.end)
return ParsedDate(result.iso, precision, raw, result.end, result.needs_review)
return ParsedDate(None, Precision.UNKNOWN, raw)

View File

@@ -107,6 +107,8 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr
if raw.date.strip() and pd.precision == _dates.Precision.UNKNOWN:
flags.append("unparsed_date")
if pd.needs_review:
flags.append("range_end_unparsed")
if index_file_mismatch(raw.index, raw.file):
flags.append("index_file_mismatch")

View File

@@ -145,6 +145,32 @@ def test_parse_roman_month_day_range():
assert r.precision == Precision.RANGE
assert r.end == "1917-01-11"
def test_parse_range_invalid_end_keeps_start_flags_review():
    # Half-resolved range (#670, Gap 2): in "10./40.1.1917" the start day is
    # valid but the 40th cannot exist. Expected outcome: the start survives,
    # the precision stays RANGE, the end is dropped (None), and needs_review
    # is raised so the dropped end is visible rather than silently lost.
    parsed = dates.parse_date("10./40.1.1917")

    # the parseable half is preserved
    assert parsed.iso == "1917-01-10"
    assert parsed.precision == Precision.RANGE

    # the unparseable half is dropped and flagged
    assert parsed.end is None
    assert parsed.needs_review is True
def test_parse_range_valid_end_not_flagged():
    # Both days of "10./11.1.1917" are valid, so the end is kept and the
    # review flag stays off.
    parsed = dates.parse_date("10./11.1.1917")

    assert parsed.end == "1917-01-11"
    assert parsed.needs_review is False
def test_parse_non_range_has_no_review_flag():
    # The date layer must not raise needs_review for any non-range input:
    # a full day, a month/year, or an empty string.
    day_result = dates.parse_date("15.2.1888")
    assert day_result.needs_review is False

    month_result = dates.parse_date("Mai 1895")
    assert month_result.needs_review is False

    empty_result = dates.parse_date("")
    assert empty_result.needs_review is False
def test_parse_non_range_has_no_end():
assert dates.parse_date("15.2.1888").end is None
assert dates.parse_date("Mai 1895").end is None

View File

@@ -82,6 +82,29 @@ def test_to_canonical_non_range_has_empty_date_end():
assert doc.date_precision == "DAY"
assert doc.date_end == ""
def test_to_canonical_half_resolved_range_flags_review():
    # "10./40.1.1917" has an impossible end day: the canonical document keeps
    # the start and RANGE precision with an empty date_end, and must carry the
    # range_end_unparsed review flag so the importer (#669) can tell that the
    # empty date_end on a RANGE row is intentional.
    context = _ctx()
    row = documents.RawRow(
        source_row=5,
        index="H-0731",
        sender="",
        receivers="",
        date="10./40.1.1917",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert canonical.date_iso == "1917-01-10"
    assert canonical.date_precision == "RANGE"
    assert canonical.date_end == ""
    assert "range_end_unparsed" in canonical.needs_review
def test_to_canonical_full_range_not_flagged():
    # A range whose end day parses keeps its date_end and is not given the
    # range_end_unparsed review flag.
    context = _ctx()
    row = documents.RawRow(
        source_row=5,
        index="H-0730",
        sender="",
        receivers="",
        date="10./11.1.1917",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert canonical.date_end == "1917-01-11"
    assert "range_end_unparsed" not in canonical.needs_review
def test_to_canonical_unmatched_and_unparsed():
ctx = _ctx()
raw = documents.RawRow(source_row=9, index="C-0001",