@@ -0,0 +1,324 @@
package org.raddatz.familienarchiv.importing ;
import lombok.RequiredArgsConstructor ;
import lombok.extern.slf4j.Slf4j ;
import org.raddatz.familienarchiv.document.DatePrecision ;
import org.raddatz.familienarchiv.document.Document ;
import org.raddatz.familienarchiv.document.DocumentService ;
import org.raddatz.familienarchiv.document.DocumentStatus ;
import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner ;
import org.raddatz.familienarchiv.exception.DomainException ;
import org.raddatz.familienarchiv.exception.ErrorCode ;
import org.raddatz.familienarchiv.person.Person ;
import org.raddatz.familienarchiv.person.PersonService ;
import org.raddatz.familienarchiv.person.PersonType ;
import org.raddatz.familienarchiv.person.PersonUpsertCommand ;
import org.raddatz.familienarchiv.tag.Tag ;
import org.springframework.beans.factory.annotation.Value ;
import org.springframework.stereotype.Component ;
import org.springframework.transaction.annotation.Transactional ;
import software.amazon.awssdk.core.sync.RequestBody ;
import software.amazon.awssdk.services.s3.S3Client ;
import software.amazon.awssdk.services.s3.model.PutObjectRequest ;
import org.raddatz.familienarchiv.tag.TagService ;
import java.io.File ;
import java.io.FileInputStream ;
import java.io.IOException ;
import java.io.InputStream ;
import java.nio.file.Files ;
import java.nio.file.Path ;
import java.nio.file.Paths ;
import java.time.LocalDate ;
import java.time.format.DateTimeParseException ;
import java.util.ArrayList ;
import java.util.LinkedHashSet ;
import java.util.List ;
import java.util.Optional ;
import java.util.Set ;
import java.util.UUID ;
import java.util.stream.Stream ;
/**
* Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
* semantic transformation: the normalizer already resolved people to slugs and dates to
* ISO values. This loader maps columns by header name, routes each attribution
* register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}),
* parses clean dates, and keeps the file/S3/thumbnail plumbing.
*
* <p>The {@code file} value is hostile input regardless of upstream trust (CWE-22 does not
* care that it came from our Python tool): its basename is validated with
* {@link #isValidImportFilename} and then resolved with canonical-path containment in
* {@link #findFileRecursive}.
*/
@Component
@RequiredArgsConstructor
@Slf4j
public class DocumentImporter {
/** Column headers handed to {@code CanonicalSheetReader.readRows} when the sheet is loaded. */
static final List < String > REQUIRED_HEADERS = List . of (
" index " , " file " , " sender_person_id " , " sender_name " ,
" receiver_person_ids " , " receiver_names " , " date_iso " , " date_raw " , " date_precision " ) ;
// Collaborators; the constructor is generated by @RequiredArgsConstructor from these final fields.
/** Looks up existing documents by original filename and saves the imported ones. */
private final DocumentService documentService ;
/** Finds or provisionally creates the people referenced by sender/receiver slugs. */
private final PersonService personService ;
/** Looks up the tag referenced by a row's tag path. */
private final TagService tagService ;
/** Client used to upload the resolved file. */
private final S3Client s3Client ;
/** Asked to dispatch thumbnail generation for documents that were saved with a file. */
private final ThumbnailAsyncRunner thumbnailAsyncRunner ;
/** Bucket the files are uploaded to; injected from the property expression below. */
@Value ( " ${app.s3.bucket:familienarchiv} " )
private String bucketName ;
/** Root directory that is searched for the files named in the sheet; injected from the property expression below. */
@Value ( " ${app.import.dir:/import} " )
private String importDir ;
/**
 * Outcome of loading the document sheet: processed count + per-file skips.
 *
 * @param processed number of rows for which the import reported no skip reason
 * @param skippedFiles one entry per skipped row, pairing a display name with the skip reason
 */
public record LoadResult ( int processed , List < ImportStatus . SkippedFile > skippedFiles ) { }
public LoadResult load ( File artifact ) {
List < CanonicalSheetReader . Row > rows = CanonicalSheetReader . readRows ( artifact , REQUIRED_HEADERS ) ;
int processed = 0 ;
List < ImportStatus . SkippedFile > skipped = new ArrayList < > ( ) ;
for ( CanonicalSheetReader . Row row : rows ) {
String index = row . get ( " index " ) ;
if ( index . isBlank ( ) ) continue ;
Optional < ImportStatus . SkipReason > skipReason = importRow ( row , index , skipped ) ;
if ( skipReason . isPresent ( ) ) {
skipped . add ( new ImportStatus . SkippedFile ( displayName ( row , index ) , skipReason . get ( ) ) ) ;
} else {
processed + + ;
}
}
log . info ( " Imported {} documents from {} ({} skipped) " , processed , artifact . getName ( ) , skipped . size ( ) ) ;
return new LoadResult ( processed , skipped ) ;
}
/**
 * Runs the pre-persistence checks for one row and, if they pass, persists it.
 *
 * <p>Steps: resolve the row's file cell to a file on disk; if a file was found, verify
 * that it starts with the PDF signature; then delegate to {@link #persist}. A row without
 * a resolvable file is still persisted (without a file).
 *
 * @param row the sheet row being imported
 * @param index the row's non-blank index value
 * @param skipped the caller's skip list; currently not read or modified by this method
 * @return the reason the row was skipped, or empty if it was imported
 */
private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index,
        List<ImportStatus.SkippedFile> skipped) {
    final Optional<File> located;
    try {
        located = resolveFile(row.get(" file "));
    } catch (InvalidImportFilenameException rejected) {
        log.warn(" Skipping import row {}: filename rejected ", index);
        return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
    }

    File candidate = located.orElse(null);
    if (candidate != null) {
        boolean looksLikePdf;
        try {
            looksLikePdf = isPdfMagicBytes(candidate);
        } catch (IOException e) {
            log.error(" Magic-byte check failed for row {} ", index, e);
            return Optional.of(ImportStatus.SkipReason.FILE_READ_ERROR);
        }
        if (!looksLikePdf) {
            return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
        }
    }

    return persist(row, index, located);
}
@Transactional
protected Optional < ImportStatus . SkipReason > persist ( CanonicalSheetReader . Row row , String index , Optional < File > file ) {
Document existing = documentService . findByOriginalFilename ( index ) . orElse ( null ) ;
if ( existing ! = null & & existing . getStatus ( ) ! = DocumentStatus . PLACEHOLDER ) {
return Optional . of ( ImportStatus . SkipReason . ALREADY_EXISTS ) ;
}
String s3Key = null ;
String contentType = null ;
DocumentStatus status = DocumentStatus . PLACEHOLDER ;
if ( file . isPresent ( ) ) {
contentType = probeContentType ( file . get ( ) ) ;
s3Key = " documents/ " + UUID . randomUUID ( ) + " _ " + file . get ( ) . getName ( ) ;
try {
uploadToS3 ( file . get ( ) , s3Key , contentType ) ;
status = DocumentStatus . UPLOADED ;
} catch ( Exception e ) {
log . error ( " S3 upload failed for {} " , file . get ( ) . getName ( ) , e ) ;
return Optional . of ( ImportStatus . SkipReason . S3_UPLOAD_FAILED ) ;
}
}
Document doc = buildDocument ( row , index , existing , s3Key , contentType , status ) ;
Document saved = documentService . save ( doc ) ;
if ( file . isPresent ( ) ) {
thumbnailAsyncRunner . dispatchAfterCommit ( saved . getId ( ) ) ;
}
return Optional . empty ( ) ;
}
private Document buildDocument ( CanonicalSheetReader . Row row , String index , Document existing ,
String s3Key , String contentType , DocumentStatus status ) {
Document doc = existing ! = null ? existing
: Document . builder ( ) . originalFilename ( index ) . build ( ) ;
String senderName = row . get ( " sender_name " ) ;
String receiverNames = row . get ( " receiver_names " ) ;
Person sender = resolveSender ( row . get ( " sender_person_id " ) , senderName ) ;
Set < Person > receivers = resolveReceivers ( row . get ( " receiver_person_ids " ) ) ;
doc . setTitle ( index ) ;
doc . setStatus ( status ) ;
doc . setFilePath ( s3Key ) ;
doc . setContentType ( contentType ) ;
doc . setSender ( sender ) ;
doc . setSenderText ( blankToNull ( senderName ) ) ;
doc . getReceivers ( ) . addAll ( receivers ) ;
doc . setReceiverText ( blankToNull ( receiverNames ) ) ;
doc . setDocumentDate ( parseIsoDate ( row . get ( " date_iso " ) ) ) ;
doc . setMetaDatePrecision ( parsePrecision ( row . get ( " date_precision " ) ) ) ;
doc . setMetaDateEnd ( parseIsoDate ( row . get ( " date_end " ) ) ) ;
doc . setMetaDateRaw ( blankToNull ( row . get ( " date_raw " ) ) ) ;
doc . setLocation ( blankToNull ( row . get ( " location " ) ) ) ;
doc . setSummary ( blankToNull ( row . get ( " summary " ) ) ) ;
attachTag ( doc , row . get ( " tags " ) ) ;
doc . setMetadataComplete ( doc . getDocumentDate ( ) ! = null | | sender ! = null | | ! receivers . isEmpty ( ) ) ;
return doc ;
}
// ─── attribution routing — register-first, always retain raw ─────────────────────
/**
 * Resolves the sender for a row.
 *
 * @param slug the sender's source reference; a blank value means "no sender"
 * @param rawName the raw sender cell, used as the name if a person has to be created
 * @return the resolved person, or {@code null} when {@code slug} is blank
 */
private Person resolveSender(String slug, String rawName) {
    return slug.isBlank() ? null : resolvePerson(slug, rawName);
}
/**
 * Resolves every receiver slug of a row to a person.
 *
 * <p>The slug doubles as the raw name passed to {@link #resolvePerson}.
 *
 * @param slugs the receiver cell, split with {@code CanonicalSheetReader.splitList}
 * @return the resolved people in encounter order, without duplicates
 */
private Set<Person> resolveReceivers(String slugs) {
    Set<Person> resolved = new LinkedHashSet<>();
    for (String receiverSlug : CanonicalSheetReader.splitList(slugs)) {
        Person receiver = resolvePerson(receiverSlug, receiverSlug);
        resolved.add(receiver);
    }
    return resolved;
}
private Person resolvePerson ( String slug , String rawName ) {
return personService . findBySourceRef ( slug )
. orElseGet ( ( ) - > personService . upsertBySourceRef ( PersonUpsertCommand . builder ( )
. sourceRef ( slug )
. lastName ( blankToNull ( rawName ) = = null ? slug : rawName )
. personType ( PersonType . PERSON )
. provisional ( true )
. build ( ) ) ) ;
}
private void attachTag ( Document doc , String tagPath ) {
if ( tagPath . isBlank ( ) ) return ;
tagService . findBySourceRef ( tagPath ) . ifPresent ( tag - > doc . getTags ( ) . add ( tag ) ) ;
}
// ─── clean-value parsing (no semantic logic) ─────────────────────────────────────
private static LocalDate parseIsoDate ( String value ) {
if ( value = = null | | value . isBlank ( ) ) return null ;
try {
return LocalDate . parse ( value . trim ( ) ) ;
} catch ( DateTimeParseException e ) {
return null ;
}
}
private static DatePrecision parsePrecision ( String value ) {
if ( value = = null | | value . isBlank ( ) ) return DatePrecision . UNKNOWN ;
try {
return DatePrecision . valueOf ( value . trim ( ) ) ;
} catch ( IllegalArgumentException e ) {
return DatePrecision . UNKNOWN ;
}
}
// ─── file handling + S3 (small ≤20-line methods) ─────────────────────────────────
private Optional < File > resolveFile ( String fileColumn ) {
if ( fileColumn = = null | | fileColumn . isBlank ( ) ) return Optional . empty ( ) ;
String basename = basenameOf ( fileColumn ) ;
if ( ! isValidImportFilename ( basename ) ) {
throw new InvalidImportFilenameException ( ) ;
}
return findFileRecursive ( basename ) ;
}
/**
 * Extracts the trimmed last path segment of a file cell.
 *
 * <p>Backslashes are treated like forward slashes, so both separator styles are handled.
 *
 * @param fileColumn raw file cell; must not be {@code null}
 * @return the text after the last separator (the whole value if there is none), trimmed
 */
private static String basenameOf(String fileColumn) {
    String unified = fileColumn.replace('\\', '/');
    // lastIndexOf yields -1 when there is no separator, so the substring starts at 0.
    int segmentStart = unified.lastIndexOf('/') + 1;
    return unified.substring(segmentStart).trim();
}
private String probeContentType ( File file ) {
try {
String probed = Files . probeContentType ( file . toPath ( ) ) ;
return probed ! = null ? probed : " application/octet-stream " ;
} catch ( IOException e ) {
return " application/octet-stream " ;
}
}
/**
 * Uploads a file to the configured bucket.
 *
 * @param file the local file whose content is sent
 * @param s3Key object key to store the file under
 * @param contentType content type recorded on the object
 */
private void uploadToS3(File file, String s3Key, String contentType) {
    PutObjectRequest request = PutObjectRequest.builder()
            .bucket(bucketName)
            .key(s3Key)
            .contentType(contentType)
            .build();
    s3Client.putObject(request, RequestBody.fromFile(file));
}
// ─── security guards — ported verbatim from MassImportService — do not weaken ────
/**
 * Decides whether a basename taken from the sheet may be searched for in the import directory.
 *
 * <p>Rejects {@code null} and blank names; names containing any of the separator,
 * separator look-alike, double-dot or NUL literals checked below; a name equal to the
 * single-dot literal; and names that {@link Paths#get} reports as absolute. The checks are
 * independent of each other; the first one that matches rejects the name.
 *
 * @param filename basename to validate; may be {@code null}
 * @return {@code true} only if no rejection rule matched
 */
private boolean isValidImportFilename ( String filename ) {
if ( filename = = null | | filename . isBlank ( ) ) return false ;
// Directory separators in both styles.
if ( filename . contains ( " / " ) ) return false ;
if ( filename . contains ( " \\ " ) ) return false ;
// Unicode characters that look like separators.
if ( filename . contains ( " ∕ " ) ) return false ; // U+2215 DIVISION SLASH
// NOTE(review): this literal renders the same as the ASCII solidus checked above. Confirm the
// file really contains U+FF0F here; if it was normalised to ASCII, this check is a duplicate
// and the fullwidth solidus is no longer rejected.
if ( filename . contains ( " / " ) ) return false ; // U+FF0F FULLWIDTH SOLIDUS
if ( filename . contains ( " ⧵ " ) ) return false ; // U+29F5 REVERSE SOLIDUS OPERATOR
// Parent-directory and current-directory references.
if ( filename . contains ( " .. " ) ) return false ;
if ( filename . equals ( " . " ) ) return false ;
// Embedded NUL.
if ( filename . contains ( " \ 0 " ) ) return false ;
if ( Paths . get ( filename ) . isAbsolute ( ) ) return false ;
return true ;
}
/**
 * Opens the given file for reading with a {@link FileInputStream}.
 *
 * <p>Used by {@code isPdfMagicBytes}; the caller is responsible for closing the stream.
 *
 * @param file the file to open
 * @return a new input stream over the file
 * @throws IOException if the stream cannot be opened
 */
// package-private: a Mockito spy in tests can override to inject IOException
InputStream openFileStream ( File file ) throws IOException {
return new FileInputStream ( file ) ;
}
private boolean isPdfMagicBytes ( File file ) throws IOException {
try ( InputStream is = openFileStream ( file ) ) {
byte [ ] header = is . readNBytes ( 4 ) ;
return header . length = = 4
& & header [ 0 ] = = 0x25 // %
& & header [ 1 ] = = 0x50 // P
& & header [ 2 ] = = 0x44 // D
& & header [ 3 ] = = 0x46 ; // F
}
}
private Optional < File > findFileRecursive ( String filename ) {
File baseDir = new File ( importDir ) ;
try ( Stream < Path > walk = Files . walk ( baseDir . toPath ( ) ) ) {
Optional < Path > match = walk . filter ( p - > ! Files . isDirectory ( p ) )
. filter ( p - > p . getFileName ( ) . toString ( ) . equals ( filename ) )
. findFirst ( ) ;
if ( match . isEmpty ( ) ) return Optional . empty ( ) ;
File candidate = match . get ( ) . toFile ( ) ;
String baseDirCanonical = baseDir . getCanonicalPath ( ) ;
if ( ! candidate . getCanonicalPath ( ) . startsWith ( baseDirCanonical + File . separator ) ) {
throw DomainException . internal ( ErrorCode . INTERNAL_ERROR , " Path escape detected: " + candidate ) ;
}
return Optional . of ( candidate ) ;
} catch ( IOException e ) {
return Optional . empty ( ) ;
}
}
/**
 * Chooses the name under which a skipped row is reported.
 *
 * <p>Prefers the basename of the row's file cell. Falls back to {@code index} when the
 * cell is null or blank, and also when its basename is blank (for example a cell that ends
 * in a path separator), so that a skipped entry never carries an empty name.
 *
 * @param row the skipped row
 * @param index the row's index, used as fallback
 * @return the file's basename, or {@code index} if no usable basename exists
 */
private static String displayName(CanonicalSheetReader.Row row, String index) {
    String file = row.get(" file ");
    if (file == null || file.isBlank()) {
        return index;
    }
    String basename = basenameOf(file);
    return basename.isBlank() ? index : basename;
}
private static String blankToNull ( String s ) {
return ( s = = null | | s . isBlank ( ) ) ? null : s ;
}
/**
 * Signals that the basename of a row's file cell failed {@code isValidImportFilename}.
 * Thrown by {@code resolveFile} and translated into a skip reason by {@code importRow};
 * it carries no message or cause.
 */
private static final class InvalidImportFilenameException extends RuntimeException {
}
}