Import normalizer: offline tool to normalize the raw archive spreadsheets #663

Merged
marcel merged 172 commits from docs/import-migration into main 2026-05-28 15:05:51 +02:00
Showing only changes of commit e93b09f1e2 - Show all commits

View File

@@ -178,39 +178,61 @@ public class DocumentImporter {
String s3Key, String contentType, DocumentStatus status) { String s3Key, String contentType, DocumentStatus status) {
Document doc = existing != null ? existing Document doc = existing != null ? existing
: Document.builder().originalFilename(index).build(); : Document.builder().originalFilename(index).build();
applyAttribution(doc, row);
applyDates(doc, row);
applyAuthoritativeAssociations(doc, row);
applyFileMetadata(doc, s3Key, contentType, status, index);
applyComputedFlags(doc);
return doc;
}
// Sender + raw sender/receiver text. The raw cells are always retained verbatim, even
// when a person is linked — the load-bearing invariant behind the merge story (ADR-025).
private void applyAttribution(Document doc, CanonicalSheetReader.Row row) {
    // Read the raw cells once; the same values feed both sender resolution and the
    // verbatim text fields below.
    String senderName = row.get("sender_name");
    String receiverNames = row.get("receiver_names");

    // resolveSender receives the person-id cell and the raw name; its result is stored
    // as-is.
    Person sender = resolveSender(row.get("sender_person_id"), senderName);
    doc.setSender(sender);

    // The raw text is written unconditionally, independent of what resolveSender
    // returned. Both cells are passed through blankToNull first.
    doc.setSenderText(blankToNull(senderName));
    doc.setReceiverText(blankToNull(receiverNames));
}
// Date triplet + raw date text, plus location and summary. Pure value parsing, no semantic logic.
private void applyDates(Document doc, CanonicalSheetReader.Row row) {
    // Each field is handled as read -> convert -> store, one cell at a time, so the
    // setters run in the same order as the cells are read.

    // Date triplet: date_iso, date_precision, date_end.
    String isoStartCell = row.get("date_iso");
    doc.setDocumentDate(parseIsoDate(isoStartCell));

    String precisionCell = row.get("date_precision");
    doc.setMetaDatePrecision(parsePrecision(precisionCell));

    String isoEndCell = row.get("date_end");
    doc.setMetaDateEnd(parseIsoDate(isoEndCell));

    // Free-text cells: stored after passing through blankToNull, no parsing.
    String rawDateCell = row.get("date_raw");
    doc.setMetaDateRaw(blankToNull(rawDateCell));

    String locationCell = row.get("location");
    doc.setLocation(blankToNull(locationCell));

    String summaryCell = row.get("summary");
    doc.setSummary(blankToNull(summaryCell));
}
// Receivers and tags are owned by the canonical row (ADR-025): clear then re-populate so a
// shrunk set on re-import prunes stale links rather than accumulating them. The
// "preserve human edits" rule does NOT extend to these collections.
private void applyAuthoritativeAssociations(Document doc, CanonicalSheetReader.Row row) {
    // Resolve first: the existing receiver links are only cleared once the replacement
    // set has been produced.
    Set<Person> receivers = resolveReceivers(row.get("receiver_person_ids"));

    // Replace rather than merge, so receivers missing from the row are dropped.
    doc.getReceivers().clear();
    doc.getReceivers().addAll(receivers);

    // The raw "tags" cell is handed to attachTag unchanged.
    // NOTE(review): whether attachTag removes previously attached tags is not visible
    // from here — confirm it prunes, otherwise stale tags survive a re-import.
    attachTag(doc, row.get("tags"));
}
LocalDate date = parseIsoDate(row.get("date_iso")); // S3 key, content type, status, and the index-derived title.
DatePrecision precision = parsePrecision(row.get("date_precision")); private void applyFileMetadata(Document doc, String s3Key, String contentType,
LocalDate dateEnd = parseIsoDate(row.get("date_end")); DocumentStatus status, String index) {
String dateRaw = blankToNull(row.get("date_raw"));
String location = blankToNull(row.get("location"));
doc.setTitle(buildTitle(index, date, precision, dateEnd, dateRaw, location));
doc.setStatus(status); doc.setStatus(status);
doc.setFilePath(s3Key); doc.setFilePath(s3Key);
doc.setContentType(contentType); doc.setContentType(contentType);
doc.setSender(sender); doc.setTitle(buildTitle(index, doc.getDocumentDate(), doc.getMetaDatePrecision(),
doc.setSenderText(blankToNull(senderName)); doc.getMetaDateEnd(), doc.getMetaDateRaw(), doc.getLocation()));
// The canonical row is authoritative for receivers/tags (ADR-025): clear then }
// re-populate so a shrunk set on re-import prunes stale links rather than
// accumulating them. The raw sender_text/receiver_text retention is separate. // metadataComplete: a document counts as fully described if any of the three "who/when"
doc.getReceivers().clear(); // pieces is filled. Called last so the upstream setters have already populated the doc.
doc.getReceivers().addAll(receivers); private void applyComputedFlags(Document doc) {
doc.setReceiverText(blankToNull(receiverNames)); doc.setMetadataComplete(doc.getDocumentDate() != null
doc.setDocumentDate(date); || doc.getSender() != null
doc.setMetaDatePrecision(precision); || !doc.getReceivers().isEmpty());
doc.setMetaDateEnd(dateEnd);
doc.setMetaDateRaw(dateRaw);
doc.setLocation(location);
doc.setSummary(blankToNull(row.get("summary")));
attachTag(doc, row.get("tags"));
doc.setMetadataComplete(doc.getDocumentDate() != null || sender != null || !receivers.isEmpty());
return doc;
} }
// The title carries the date at the HONEST precision (never a fabricated day) via the // The title carries the date at the HONEST precision (never a fabricated day) via the