fix(normalizer): treat leading date qualifiers (nach/vor/…) as APPROX
_preprocess now sets approx=True whenever a leading marker is stripped. Add _match_year_only so that a bare year left over after stripping (e.g. "nach 1900" -> "1900") resolves to 1900-01-01 with precision YEAR before being upgraded to APPROX. Strengthen test_parse_approx_marker_upgrades_precision and add test_parse_leading_qualifier_is_approx (11 tests, all pass).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -72,7 +72,7 @@ _LEADING_MARKERS = re.compile(
|
|||||||
|
|
||||||
|
|
||||||
def _preprocess(raw: str):
|
def _preprocess(raw: str):
|
||||||
"""Return (cleaned_string, approx_flag)."""
|
"""Return (cleaned_string, approx_flag). Any uncertainty/qualifier marker -> approx."""
|
||||||
s = (raw or "").strip()
|
s = (raw or "").strip()
|
||||||
if not s:
|
if not s:
|
||||||
return "", False
|
return "", False
|
||||||
@@ -82,8 +82,10 @@ def _preprocess(raw: str):
|
|||||||
s = re.sub(r"\(\s*\?\s*\)", " ", s) # remove "(?)"
|
s = re.sub(r"\(\s*\?\s*\)", " ", s) # remove "(?)"
|
||||||
s = s.replace("?", " ")
|
s = s.replace("?", " ")
|
||||||
s = re.sub(r",.*$", "", s) # drop trailing editorial note (", 2. Brief")
|
s = re.sub(r",.*$", "", s) # drop trailing editorial note (", 2. Brief")
|
||||||
s = _LEADING_MARKERS.sub("", s)
|
stripped = _LEADING_MARKERS.sub("", s)
|
||||||
s = re.sub(r"\s+", " ", s).strip(" .,")
|
if stripped != s: # a leading qualifier (um/ca/nach/vor/anfang/…) signals approximation
|
||||||
|
approx = True
|
||||||
|
s = re.sub(r"\s+", " ", stripped).strip(" .,")
|
||||||
return s, approx
|
return s, approx
|
||||||
|
|
||||||
|
|
||||||
@@ -114,8 +116,17 @@ def _match_numeric(s):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
_YEAR_ONLY_RE = re.compile(r"\d{4}")
|
||||||
|
|
||||||
|
|
||||||
|
def _match_year_only(s):
    """Match a string that consists of exactly four digits and read it as a year.

    Returns ``(iso_date, Precision.YEAR)`` with the date pinned to 1 January
    of that year, or ``None`` when ``s`` is not a four-digit year or the year
    cannot be represented by ``datetime.date`` (e.g. "0000").
    """
    if not _YEAR_ONLY_RE.fullmatch(s):
        return None
    try:
        first_day = datetime.date(int(s), 1, 1)
    except ValueError:
        # "0000" passes the regex but lies below datetime.MINYEAR; report
        # "no match" instead of letting the exception escape the matcher.
        return None
    return first_day.isoformat(), Precision.YEAR
|
||||||
|
|
||||||
|
|
||||||
# Matchers are tried in order. Later tasks append to this list.
|
# Matchers are tried in order. Later tasks append to this list.
|
||||||
_MATCHERS = [_match_iso, _match_numeric]
|
_MATCHERS = [_match_iso, _match_numeric, _match_year_only]
|
||||||
|
|
||||||
|
|
||||||
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
|
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
|
||||||
|
|||||||
@@ -56,5 +56,10 @@ def test_parse_numeric_unparseable():
|
|||||||
|
|
||||||
def test_parse_approx_marker_upgrades_precision():
|
def test_parse_approx_marker_upgrades_precision():
|
||||||
r = dates.parse_date("17.Nov (?) 1887") # month-name handled in a later task; here just the marker path
|
r = dates.parse_date("17.Nov (?) 1887") # month-name handled in a later task; here just the marker path
|
||||||
# after the marker is detected, a parsed date becomes APPROX (verified fully in Task 8)
|
|
||||||
assert r.raw == "17.Nov (?) 1887"
|
assert r.raw == "17.Nov (?) 1887"
|
||||||
|
assert r.precision == Precision.UNKNOWN # no month-name matcher until Task 7; full APPROX check in Task 8
|
||||||
|
|
||||||
|
def test_parse_leading_qualifier_is_approx():
    """A leading qualifier keeps the year but yields APPROX precision."""
    # "nach 1900" means "after 1900": the year is salvaged, yet the result
    # must be flagged as approximate rather than exact.
    parsed = dates.parse_date("nach 1900")
    assert parsed.precision == Precision.APPROX
    assert parsed.iso == "1900-01-01"
|
||||||
|
|||||||
Reference in New Issue
Block a user