diff --git a/tools/import-normalizer/dates.py b/tools/import-normalizer/dates.py index 77245680..d1dc81aa 100644 --- a/tools/import-normalizer/dates.py +++ b/tools/import-normalizer/dates.py @@ -66,6 +66,7 @@ class ParsedDate: iso: str | None precision: Precision raw: str + end: str | None = None # ISO end day of an intra-month day range; None for every non-RANGE precision (incl. an APPROX override) and for ranges whose end day is not resolved _LEADING_MARKERS = re.compile( @@ -210,21 +211,23 @@ def _match_year_only(s): def _match_range(s): m = _RANGE_YY_RE.fullmatch(s) if m: - return datetime.date(int(m.group(1)), 1, 1).isoformat(), Precision.RANGE + return datetime.date(int(m.group(1)), 1, 1).isoformat(), Precision.RANGE, None m = _RANGE_DAY_RE.fullmatch(s) if m: - first = f"{m.group(1)}.{m.group(3)}" # "7." + "Sept.1923" -> "7.Sept.1923" - for matcher in (_match_numeric, _match_monthname_a): - r = matcher(first) - if r: - return r[0], Precision.RANGE + day_start, day_end, rest = m.group(1), m.group(2), m.group(3) + # "10." + "1.1917" -> "10.1.1917"; resolve start and end day against the shared month/year + for matcher in (_match_numeric, _match_roman, _match_monthname_a): + start = matcher(f"{day_start}.{rest}") + if start: + end = matcher(f"{day_end}.{rest}") + return start[0], Precision.RANGE, (end[0] if end else None) m = _RANGE_HYPHEN_RE.fullmatch(s) if m: start = m.group(1).strip() for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only): r = matcher(start) if r: - return r[0], Precision.RANGE + return r[0], Precision.RANGE, None return None @@ -253,10 +256,11 @@ def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate: for matcher in _MATCHERS: result = matcher(cleaned) if result: - iso, precision = result + iso, precision = result[0], result[1] + end = result[2] if len(result) > 2 else None if approx: precision = Precision.APPROX - return ParsedDate(iso, precision, raw) + return ParsedDate(iso, precision, raw, end if precision == Precision.RANGE else None) return ParsedDate(None, Precision.UNKNOWN, raw) diff --git 
a/tools/import-normalizer/documents.py b/tools/import-normalizer/documents.py index c8060719..fbd3ebdb 100644 --- a/tools/import-normalizer/documents.py +++ b/tools/import-normalizer/documents.py @@ -116,6 +116,7 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr receiver_person_ids=[r[0] for r in receivers], receiver_names=[r[1] for r in receivers], date_iso=pd.iso or "", date_raw=raw.date, date_precision=str(pd.precision), + date_end=pd.end or "", location=raw.location, tags=_tags.generate_tags(raw.tags, raw.summary, approved_themes), summary=raw.summary, source_row=raw.source_row, needs_review=flags, ) diff --git a/tools/import-normalizer/tests/test_dates.py b/tools/import-normalizer/tests/test_dates.py index 2a43ad61..b380c7c7 100644 --- a/tools/import-normalizer/tests/test_dates.py +++ b/tools/import-normalizer/tests/test_dates.py @@ -115,10 +115,29 @@ def test_parse_invalid_calendar_date_is_unknown(): assert dates.parse_date("31.4.1916").precision == Precision.UNKNOWN def test_parse_intra_month_day_range(): - # "7./8. Sept.1923" -> start day, RANGE. Must NOT be confused with slash-date "17/6. 1916". - assert dates.parse_date("7./8. Sept.1923") == dates.ParsedDate("1923-09-07", Precision.RANGE, "7./8. Sept.1923") + # "7./8. Sept.1923" -> start day, RANGE, end day 8th. Must NOT be confused with slash-date "17/6. 1916". + assert dates.parse_date("7./8. Sept.1923") == dates.ParsedDate("1923-09-07", Precision.RANGE, "7./8. Sept.1923", "1923-09-08") assert dates.parse_date("17/6. 1916") == dates.ParsedDate("1916-06-17", Precision.DAY, "17/6. 
1916") +def test_parse_intra_month_day_range_carries_end_day(): + # the intra-month day range surfaces the END day so Phase 4 can render meta_date_end + r = dates.parse_date("10./11.1.1917") + assert r.iso == "1917-01-10" + assert r.precision == Precision.RANGE + assert r.end == "1917-01-11" + +def test_parse_roman_month_day_range(): + # "10./11.I.1917" — Roman-numeral-month range; previously fell through to UNKNOWN + r = dates.parse_date("10./11.I.1917") + assert r.iso == "1917-01-10" + assert r.precision == Precision.RANGE + assert r.end == "1917-01-11" + +def test_parse_non_range_has_no_end(): + assert dates.parse_date("15.2.1888").end is None + assert dates.parse_date("Mai 1895").end is None + assert dates.parse_date("").end is None + def test_parse_trailing_note_stripped_but_raw_preserved(): r = dates.parse_date("17.Nov 1887, 2. Brief") # REQ-DATE-04 assert r.iso == "1887-11-17" diff --git a/tools/import-normalizer/tests/test_documents.py b/tools/import-normalizer/tests/test_documents.py index 3395275b..5313a632 100644 --- a/tools/import-normalizer/tests/test_documents.py +++ b/tools/import-normalizer/tests/test_documents.py @@ -63,6 +63,25 @@ def test_to_canonical_carries_file_name(): doc = documents.to_canonical(raw, ctx, date_overrides={}) assert doc.file == "H-0730.pdf" + +def test_to_canonical_range_carries_date_end(): + ctx = _ctx() + raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="", + date="10./11.1.1917") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.date_iso == "1917-01-10" + assert doc.date_precision == "RANGE" + assert doc.date_end == "1917-01-11" + + +def test_to_canonical_non_range_has_empty_date_end(): + ctx = _ctx() + raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="", + date="15.2.1888") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.date_precision == "DAY" + assert doc.date_end == "" + def test_to_canonical_unmatched_and_unparsed(): ctx = 
_ctx() raw = documents.RawRow(source_row=9, index="C-0001",