From cff486dda7352395feb3687d274f57c945c0b522 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 25 May 2026 13:35:19 +0200
Subject: [PATCH] =?UTF-8?q?fix(normalizer):=20treat=20leading=20date=20qua?=
 =?UTF-8?q?lifiers=20(nach/vor/=E2=80=A6)=20as=20APPROX?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_preprocess now sets approx=True when a leading marker is stripped; add
_match_year_only so bare years (e.g. "nach 1900" -> "1900") resolve to
1900-01-01/YEAR before being upgraded to APPROX. A bare "0000" is left
unmatched because it is below datetime.MINYEAR. Strengthen
test_parse_approx_marker_upgrades_precision and add
test_parse_leading_qualifier_is_approx (11 tests, all pass).

Co-Authored-By: Claude Opus 4.7
---
 tools/import-normalizer/dates.py            | 21 +++++++++++++++++----
 tools/import-normalizer/tests/test_dates.py |  7 ++++++-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/tools/import-normalizer/dates.py b/tools/import-normalizer/dates.py
index a34bd357..65449a52 100644
--- a/tools/import-normalizer/dates.py
+++ b/tools/import-normalizer/dates.py
@@ -72,7 +72,7 @@ _LEADING_MARKERS = re.compile(
 
 
 def _preprocess(raw: str):
-    """Return (cleaned_string, approx_flag)."""
+    """Return (cleaned_string, approx_flag). Any uncertainty/qualifier marker -> approx."""
     s = (raw or "").strip()
     if not s:
         return "", False
@@ -82,8 +82,10 @@ def _preprocess(raw: str):
     s = re.sub(r"\(\s*\?\s*\)", " ", s)  # remove "(?)"
     s = s.replace("?", " ")
     s = re.sub(r",.*$", "", s)  # drop trailing editorial note (", 2. Brief")
-    s = _LEADING_MARKERS.sub("", s)
-    s = re.sub(r"\s+", " ", s).strip(" .,")
+    stripped = _LEADING_MARKERS.sub("", s)
+    if stripped != s:  # a leading qualifier (um/ca/nach/vor/anfang/…) signals approximation
+        approx = True
+    s = re.sub(r"\s+", " ", stripped).strip(" .,")
     return s, approx
 
 
@@ -114,8 +116,19 @@ def _match_numeric(s):
     return None
 
 
+_YEAR_ONLY_RE = re.compile(r"\d{4}")
+
+
+def _match_year_only(s):
+    """Match a bare four-digit year; return (ISO date of 1 January, Precision.YEAR) or None."""
+    # "0000" passes the regex but is below datetime.MINYEAR, so datetime.date would raise ValueError.
+    if _YEAR_ONLY_RE.fullmatch(s) and int(s) >= datetime.MINYEAR:
+        return datetime.date(int(s), 1, 1).isoformat(), Precision.YEAR
+    return None
+
+
 # Matchers are tried in order. Later tasks append to this list.
-_MATCHERS = [_match_iso, _match_numeric]
+_MATCHERS = [_match_iso, _match_numeric, _match_year_only]
 
 
 def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
diff --git a/tools/import-normalizer/tests/test_dates.py b/tools/import-normalizer/tests/test_dates.py
index d8e06e22..5a2a6f5b 100644
--- a/tools/import-normalizer/tests/test_dates.py
+++ b/tools/import-normalizer/tests/test_dates.py
@@ -56,5 +56,10 @@ def test_parse_numeric_unparseable():
 
 def test_parse_approx_marker_upgrades_precision():
     r = dates.parse_date("17.Nov (?) 1887")  # month-name handled in a later task; here just the marker path
-    # after the marker is detected, a parsed date becomes APPROX (verified fully in Task 8)
     assert r.raw == "17.Nov (?) 1887"
+    assert r.precision == Precision.UNKNOWN  # no month-name matcher until Task 7; full APPROX check in Task 8
+
+def test_parse_leading_qualifier_is_approx():
+    r = dates.parse_date("nach 1900")  # "after 1900" -> year salvaged, but precision is APPROX not exact
+    assert r.iso == "1900-01-01"
+    assert r.precision == Precision.APPROX