Refactor phases 4.2 - email adapter + tests
parent
f3fcdfab11
commit
90093ab98d
@ -0,0 +1,148 @@
|
|||||||
|
package at.procon.dip.extraction.impl;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.extraction.spi.DocumentExtractor;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||||
|
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.apache.poi.ss.usermodel.Cell;
|
||||||
|
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||||
|
import org.apache.poi.ss.usermodel.Row;
|
||||||
|
import org.apache.poi.ss.usermodel.Sheet;
|
||||||
|
import org.apache.poi.ss.usermodel.Workbook;
|
||||||
|
import org.apache.poi.ss.usermodel.WorkbookFactory;
|
||||||
|
import org.springframework.core.annotation.Order;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@Order(50)
|
||||||
|
public class SpreadsheetDocumentExtractor implements DocumentExtractor {
|
||||||
|
@Override
|
||||||
|
public boolean supports(DocumentType documentType, String mimeType) {
|
||||||
|
return DocumentImportSupport.isSpreadsheetMime(mimeType);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ExtractionResult extract(ExtractionRequest extractionRequest) {
|
||||||
|
String mimeType = DocumentImportSupport.normalizeMediaType(
|
||||||
|
extractionRequest.detectionResult().mimeType());
|
||||||
|
String extension = DocumentImportSupport.extensionOf(
|
||||||
|
extractionRequest.sourceDescriptor().fileName());
|
||||||
|
try {
|
||||||
|
String extracted = extractSpreadsheetText(
|
||||||
|
mimeType,
|
||||||
|
extension,
|
||||||
|
extractionRequest.binaryContent(),
|
||||||
|
extractionRequest.textContent());
|
||||||
|
|
||||||
|
if (!StringUtils.hasText(extracted)) {
|
||||||
|
return new ExtractionResult(
|
||||||
|
Map.of(),
|
||||||
|
List.of(),
|
||||||
|
List.of("No spreadsheet text content extracted"));
|
||||||
|
}
|
||||||
|
|
||||||
|
String normalized = normalizeText(extracted);
|
||||||
|
String title = extractionRequest.sourceDescriptor().fileName();
|
||||||
|
return new ExtractionResult(
|
||||||
|
Map.of(ContentRole.NORMALIZED_TEXT, normalized),
|
||||||
|
List.of(new ExtractedStructuredPayload("spreadsheet-document", Map.of("title", title))),
|
||||||
|
List.of());
|
||||||
|
} catch (Exception e) {
|
||||||
|
return new ExtractionResult(
|
||||||
|
Map.of(),
|
||||||
|
List.of(),
|
||||||
|
List.of("Spreadsheet extraction failed: " + e.getMessage()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String extractSpreadsheetText(
|
||||||
|
String mimeType,
|
||||||
|
String extension,
|
||||||
|
byte[] binaryContent,
|
||||||
|
String textContent) throws IOException {
|
||||||
|
|
||||||
|
if ("text/csv".equals(mimeType)
|
||||||
|
|| "text/tab-separated-values".equals(mimeType)
|
||||||
|
|| "application/csv".equals(mimeType)
|
||||||
|
|| "application/x-csv".equals(mimeType)
|
||||||
|
|| "csv".equals(extension)
|
||||||
|
|| "tsv".equals(extension)) {
|
||||||
|
if (StringUtils.hasText(textContent)) {
|
||||||
|
return textContent;
|
||||||
|
}
|
||||||
|
return binaryContent == null ? null : new String(binaryContent, StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (binaryContent == null || binaryContent.length == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ("ods".equals(extension)
|
||||||
|
|| "application/vnd.oasis.opendocument.spreadsheet".equals(mimeType)) {
|
||||||
|
throw new IOException("ODS extraction not supported yet");
|
||||||
|
}
|
||||||
|
|
||||||
|
try (Workbook workbook = WorkbookFactory.create(new ByteArrayInputStream(binaryContent))) {
|
||||||
|
return extractWorkbookText(workbook);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String extractWorkbookText(Workbook workbook) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
DataFormatter formatter = new DataFormatter();
|
||||||
|
|
||||||
|
for (int s = 0; s < workbook.getNumberOfSheets(); s++) {
|
||||||
|
Sheet sheet = workbook.getSheetAt(s);
|
||||||
|
if (sheet == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sb.length() > 0) {
|
||||||
|
sb.append("\n\n");
|
||||||
|
}
|
||||||
|
sb.append("# Sheet: ").append(sheet.getSheetName()).append("\n");
|
||||||
|
|
||||||
|
for (Row row : sheet) {
|
||||||
|
List<String> cells = new ArrayList<>();
|
||||||
|
for (Cell cell : row) {
|
||||||
|
String value = formatter.formatCellValue(cell);
|
||||||
|
if (value != null) {
|
||||||
|
value = value.trim();
|
||||||
|
}
|
||||||
|
cells.add(value == null ? "" : value);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!cells.isEmpty() && cells.get(cells.size() - 1).isBlank()) {
|
||||||
|
cells.remove(cells.size() - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!cells.isEmpty()) {
|
||||||
|
sb.append(String.join("\t", cells)).append("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String normalizeText(String text) {
|
||||||
|
if (text == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return text.replace("\r\n", "\n")
|
||||||
|
.replace('\r', '\n')
|
||||||
|
.replaceAll("\\n{3,}", "\n\n")
|
||||||
|
.replaceAll("[ \\t]+\\n", "\n")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
package at.procon.dip;
|
||||||
|
|
||||||
|
import org.testcontainers.containers.PostgreSQLContainer;
|
||||||
|
|
||||||
|
public class FixedPortPostgreSQLContainer<SELF extends FixedPortPostgreSQLContainer<SELF>>
|
||||||
|
extends PostgreSQLContainer<SELF> {
|
||||||
|
|
||||||
|
private final int hostPort;
|
||||||
|
|
||||||
|
public FixedPortPostgreSQLContainer(String imageName, int hostPort) {
|
||||||
|
super(imageName);
|
||||||
|
this.hostPort = hostPort;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void configure() {
|
||||||
|
super.configure();
|
||||||
|
addFixedExposedPort(hostPort, PostgreSQLContainer.POSTGRESQL_PORT);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,65 @@
|
|||||||
|
package at.procon.dip.extraction.impl;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.detector.BasicMimeAndExtensionDocumentTypeDetector;
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class BasicMimeAndExtensionDocumentTypeDetectorTest {
|
||||||
|
|
||||||
|
private final at.procon.dip.classification.detector.BasicMimeAndExtensionDocumentTypeDetector detector = new BasicMimeAndExtensionDocumentTypeDetector();
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldResolveKnownExcelMimeFromExtensionWhenMailUsesOctetStream() {
|
||||||
|
SourceDescriptor source = new SourceDescriptor(
|
||||||
|
DocumentAccessContext.publicDocument(),
|
||||||
|
SourceType.MAIL,
|
||||||
|
"mail-1:attachment:test.xls",
|
||||||
|
"mail://message/1",
|
||||||
|
"test.xls",
|
||||||
|
"application/octet-stream",
|
||||||
|
new byte[] {1,2,3},
|
||||||
|
null,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
OriginalContentStoragePolicy.STORE,
|
||||||
|
Map.of()
|
||||||
|
);
|
||||||
|
|
||||||
|
DetectionResult result = detector.detect(source);
|
||||||
|
|
||||||
|
assertThat(result.documentType()).isEqualTo(DocumentType.GENERIC_BINARY);
|
||||||
|
assertThat(result.mimeType()).isEqualTo("application/vnd.ms-excel");
|
||||||
|
assertThat(result.attributes()).containsEntry("detectedExtension", "xls");
|
||||||
|
assertThat(result.attributes()).containsEntry("effectiveMediaType", "application/vnd.ms-excel");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldKeepCsvAsTextAndUseCsvMime() {
|
||||||
|
SourceDescriptor source = new SourceDescriptor(
|
||||||
|
DocumentAccessContext.publicDocument(),
|
||||||
|
SourceType.MAIL,
|
||||||
|
"mail-1:attachment:data.csv",
|
||||||
|
"mail://message/1",
|
||||||
|
"data.csv",
|
||||||
|
"application/octet-stream",
|
||||||
|
"a,b1,2".getBytes(),
|
||||||
|
null,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
OriginalContentStoragePolicy.STORE,
|
||||||
|
Map.of()
|
||||||
|
);
|
||||||
|
|
||||||
|
DetectionResult result = detector.detect(source);
|
||||||
|
|
||||||
|
assertThat(result.documentType()).isEqualTo(DocumentType.TEXT);
|
||||||
|
assertThat(result.mimeType()).isEqualTo("text/csv");
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,93 @@
|
|||||||
|
package at.procon.dip.extraction.impl;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||||
|
import org.apache.poi.ss.usermodel.Workbook;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class SpreadsheetDocumentExtractorTest {
|
||||||
|
|
||||||
|
private final SpreadsheetDocumentExtractor extractor = new SpreadsheetDocumentExtractor();
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void extractsOldExcelBinaryXls() throws Exception {
|
||||||
|
byte[] data = createLegacyXls();
|
||||||
|
SourceDescriptor source = new SourceDescriptor(
|
||||||
|
null,
|
||||||
|
SourceType.MAIL,
|
||||||
|
"mail-1:attachment:report.xls",
|
||||||
|
null,
|
||||||
|
"report.xls",
|
||||||
|
"application/vnd.ms-excel",
|
||||||
|
data,
|
||||||
|
null,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
OriginalContentStoragePolicy.STORE,
|
||||||
|
Map.of("title", "report.xls")
|
||||||
|
);
|
||||||
|
DetectionResult detection = new DetectionResult(DocumentType.GENERIC_BINARY, DocumentFamily.GENERIC,
|
||||||
|
"application/vnd.ms-excel", null, Map.of());
|
||||||
|
|
||||||
|
ExtractionResult result = extractor.extract(new ExtractionRequest(source, detection, null, data));
|
||||||
|
|
||||||
|
String text = result.derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
|
||||||
|
assertNotNull(text);
|
||||||
|
assertTrue(text.contains("Sheet: Sheet1"));
|
||||||
|
assertTrue(text.contains("Name | Amount"));
|
||||||
|
assertTrue(text.contains("Alice | 42"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void extractsCsvAsNormalizedText() {
|
||||||
|
String csv = "Name,Amount\nAlice,42\nBob,77\n";
|
||||||
|
byte[] data = csv.getBytes(StandardCharsets.UTF_8);
|
||||||
|
SourceDescriptor source = new SourceDescriptor(
|
||||||
|
null,
|
||||||
|
SourceType.FILE_SYSTEM,
|
||||||
|
"csv-1",
|
||||||
|
null,
|
||||||
|
"report.csv",
|
||||||
|
"text/csv",
|
||||||
|
data,
|
||||||
|
csv,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
OriginalContentStoragePolicy.STORE,
|
||||||
|
Map.of("title", "report.csv")
|
||||||
|
);
|
||||||
|
DetectionResult detection = new DetectionResult(DocumentType.TEXT, DocumentFamily.GENERIC,
|
||||||
|
"text/csv", null, Map.of());
|
||||||
|
|
||||||
|
ExtractionResult result = extractor.extract(new ExtractionRequest(source, detection, csv, data));
|
||||||
|
|
||||||
|
String text = result.derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
|
||||||
|
assertEquals("Name | Amount\nAlice | 42\nBob | 77", text);
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] createLegacyXls() throws Exception {
|
||||||
|
try (Workbook workbook = new HSSFWorkbook(); ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||||
|
var sheet = workbook.createSheet("Sheet1");
|
||||||
|
var header = sheet.createRow(0);
|
||||||
|
header.createCell(0).setCellValue("Name");
|
||||||
|
header.createCell(1).setCellValue("Amount");
|
||||||
|
var row = sheet.createRow(1);
|
||||||
|
row.createCell(0).setCellValue("Alice");
|
||||||
|
row.createCell(1).setCellValue(42);
|
||||||
|
workbook.write(out);
|
||||||
|
return out.toByteArray();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,175 @@
|
|||||||
|
package at.procon.dip.ingestion.adapter;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RelationType;
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentRelationService;
|
||||||
|
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||||
|
import at.procon.dip.ingestion.service.GenericDocumentImportService;
|
||||||
|
import at.procon.dip.ingestion.service.MailMessageExtractionService;
|
||||||
|
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import at.procon.ted.service.attachment.ZipExtractionService;
|
||||||
|
import at.procon.dip.testsupport.MailBundleTestSupport;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.params.ParameterizedTest;
|
||||||
|
import org.junit.jupiter.params.provider.MethodSource;
|
||||||
|
import org.mockito.ArgumentCaptor;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.Mockito.*;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class MailDocumentIngestionAdapterBundleTest {
|
||||||
|
|
||||||
|
private static Path bundleRoot;
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private GenericDocumentImportService importService;
|
||||||
|
@Mock
|
||||||
|
private DocumentRelationService relationService;
|
||||||
|
@Mock
|
||||||
|
private ZipExtractionService zipExtractionService;
|
||||||
|
|
||||||
|
private MailDocumentIngestionAdapter adapter;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
static void extractBundle() throws Exception {
|
||||||
|
bundleRoot = MailBundleTestSupport.extractBundleToTempDir();
|
||||||
|
}
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
TedProcessorProperties properties = new TedProcessorProperties();
|
||||||
|
properties.getGenericIngestion().setEnabled(true);
|
||||||
|
properties.getGenericIngestion().setMailAdapterEnabled(true);
|
||||||
|
properties.getGenericIngestion().setExpandMailZipAttachments(false);
|
||||||
|
properties.getGenericIngestion().setMailImportBatchId("test-mail-bundle");
|
||||||
|
when(zipExtractionService.canHandle(any(), any())).thenReturn(false);
|
||||||
|
adapter = new MailDocumentIngestionAdapter(properties, importService, new MailMessageExtractionService(), relationService, zipExtractionService);
|
||||||
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest(name = "ingest {0}")
|
||||||
|
@MethodSource("at.procon.dip.testsupport.MailBundleTestSupport#bundleMailNames")
|
||||||
|
void ingest_should_import_root_and_all_attachments_from_bundle(String fileName) throws Exception {
|
||||||
|
Path eml = bundleRoot.resolve("eml").resolve(fileName);
|
||||||
|
byte[] rawMime = Files.readAllBytes(eml);
|
||||||
|
List<String> expectedAttachmentNames = MailBundleTestSupport.EXPECTED_ATTACHMENT_NAMES.get(fileName);
|
||||||
|
|
||||||
|
AtomicInteger sequence = new AtomicInteger();
|
||||||
|
when(importService.importDocument(any(SourceDescriptor.class))).thenAnswer(invocation -> {
|
||||||
|
SourceDescriptor sd = invocation.getArgument(0);
|
||||||
|
int idx = sequence.incrementAndGet();
|
||||||
|
Document document = Document.builder()
|
||||||
|
.id(UUID.nameUUIDFromBytes((fileName + ":" + idx).getBytes()))
|
||||||
|
.documentType(sd.sourceType() == SourceType.MAIL && "message/rfc822".equals(sd.mediaType()) ? DocumentType.MIME_MESSAGE : DocumentType.GENERIC_BINARY)
|
||||||
|
.documentFamily(DocumentFamily.MAIL)
|
||||||
|
.status(DocumentStatus.RECEIVED)
|
||||||
|
.title(sd.fileName())
|
||||||
|
.mimeType(sd.mediaType())
|
||||||
|
.dedupHash(Integer.toHexString(idx))
|
||||||
|
.build();
|
||||||
|
DetectionResult detection = new DetectionResult(document.getDocumentType(), document.getDocumentFamily(), document.getMimeType(), null, java.util.Map.of());
|
||||||
|
return new ImportedDocumentResult(document, detection, List.of(), false);
|
||||||
|
});
|
||||||
|
|
||||||
|
SourceDescriptor source = new SourceDescriptor(
|
||||||
|
DocumentAccessContext.publicDocument(),
|
||||||
|
SourceType.MAIL,
|
||||||
|
fileName,
|
||||||
|
eml.toUri().toString(),
|
||||||
|
fileName,
|
||||||
|
"message/rfc822",
|
||||||
|
rawMime,
|
||||||
|
null,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
null,
|
||||||
|
java.util.Map.of()
|
||||||
|
);
|
||||||
|
|
||||||
|
IngestionResult result = adapter.ingest(source);
|
||||||
|
|
||||||
|
assertEquals(1 + expectedAttachmentNames.size(), result.documents().size(), "root + each attachment should be imported");
|
||||||
|
assertTrue(result.warnings().isEmpty(), "bundle sample should import without warnings: " + fileName);
|
||||||
|
|
||||||
|
ArgumentCaptor<SourceDescriptor> sourceCaptor = ArgumentCaptor.forClass(SourceDescriptor.class);
|
||||||
|
verify(importService, times(1 + expectedAttachmentNames.size())).importDocument(sourceCaptor.capture());
|
||||||
|
verify(relationService, times(expectedAttachmentNames.size())).ensureRelation(any());
|
||||||
|
|
||||||
|
List<SourceDescriptor> descriptors = sourceCaptor.getAllValues();
|
||||||
|
SourceDescriptor root = descriptors.getFirst();
|
||||||
|
assertEquals("message/rfc822", root.mediaType());
|
||||||
|
assertNotNull(root.textContent(), "root mail should carry serialized message text");
|
||||||
|
assertEquals(fileName, root.fileName());
|
||||||
|
|
||||||
|
List<String> importedAttachmentNames = new ArrayList<>();
|
||||||
|
for (int i = 1; i < descriptors.size(); i++) {
|
||||||
|
importedAttachmentNames.add(descriptors.get(i).fileName());
|
||||||
|
}
|
||||||
|
assertEquals(expectedAttachmentNames, importedAttachmentNames);
|
||||||
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest(name = "octet-stream preview guard {0}")
|
||||||
|
@MethodSource("octetStreamMailNames")
|
||||||
|
void ingest_should_not_pass_preview_text_for_generic_octet_stream_attachments(String fileName) throws Exception {
|
||||||
|
Path eml = bundleRoot.resolve("eml").resolve(fileName);
|
||||||
|
byte[] rawMime = Files.readAllBytes(eml);
|
||||||
|
|
||||||
|
when(importService.importDocument(any(SourceDescriptor.class))).thenAnswer(invocation -> {
|
||||||
|
SourceDescriptor sd = invocation.getArgument(0);
|
||||||
|
Document document = Document.builder()
|
||||||
|
.id(UUID.randomUUID())
|
||||||
|
.documentType(DocumentType.UNKNOWN)
|
||||||
|
.documentFamily(DocumentFamily.MAIL)
|
||||||
|
.status(DocumentStatus.RECEIVED)
|
||||||
|
.title(sd.fileName())
|
||||||
|
.mimeType(sd.mediaType())
|
||||||
|
.build();
|
||||||
|
DetectionResult detection = new DetectionResult(DocumentType.UNKNOWN, DocumentFamily.MAIL, sd.mediaType(), null, java.util.Map.of());
|
||||||
|
return new ImportedDocumentResult(document, detection, List.of(), false);
|
||||||
|
});
|
||||||
|
|
||||||
|
adapter.ingest(new SourceDescriptor(
|
||||||
|
DocumentAccessContext.publicDocument(),
|
||||||
|
SourceType.MAIL,
|
||||||
|
fileName,
|
||||||
|
eml.toUri().toString(),
|
||||||
|
fileName,
|
||||||
|
"message/rfc822",
|
||||||
|
rawMime,
|
||||||
|
null,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
null,
|
||||||
|
java.util.Map.of()
|
||||||
|
));
|
||||||
|
|
||||||
|
ArgumentCaptor<SourceDescriptor> sourceCaptor = ArgumentCaptor.forClass(SourceDescriptor.class);
|
||||||
|
verify(importService, atLeast(1)).importDocument(sourceCaptor.capture());
|
||||||
|
List<SourceDescriptor> attachments = sourceCaptor.getAllValues().subList(1, sourceCaptor.getAllValues().size());
|
||||||
|
assertEquals(3, attachments.size());
|
||||||
|
assertTrue(attachments.stream().allMatch(sd -> sd.textContent() == null),
|
||||||
|
"octet-stream attachments should not get inline preview text");
|
||||||
|
}
|
||||||
|
|
||||||
|
static java.util.stream.Stream<String> octetStreamMailNames() {
|
||||||
|
return java.util.stream.Stream.of("sample-mail-05-generic-octet-stream.eml");
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,177 @@
|
|||||||
|
package at.procon.dip.ingestion.adapter;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RelationType;
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentRelationService;
|
||||||
|
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
|
||||||
|
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||||
|
import at.procon.dip.ingestion.service.GenericDocumentImportService;
|
||||||
|
import at.procon.dip.ingestion.service.MailMessageExtractionService;
|
||||||
|
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import at.procon.ted.service.attachment.ZipExtractionService;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.DisplayName;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.ArgumentCaptor;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.Mockito.times;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class MailDocumentIngestionAdapterFileSystemTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private GenericDocumentImportService importService;
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private DocumentRelationService relationService;
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private ZipExtractionService zipExtractionService;
|
||||||
|
|
||||||
|
private MailDocumentIngestionAdapter adapter;
|
||||||
|
private final List<SourceDescriptor> importedDescriptors = new ArrayList<>();
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
TedProcessorProperties properties = new TedProcessorProperties();
|
||||||
|
properties.getGenericIngestion().setEnabled(true);
|
||||||
|
properties.getGenericIngestion().setMailAdapterEnabled(true);
|
||||||
|
properties.getGenericIngestion().setMailImportBatchId("test-mail-batch");
|
||||||
|
properties.getGenericIngestion().setDefaultOwnerTenantKey("tenant-a");
|
||||||
|
properties.getGenericIngestion().setMailDefaultVisibility(DocumentVisibility.TENANT);
|
||||||
|
|
||||||
|
MailMessageExtractionService extractionService = new MailMessageExtractionService();
|
||||||
|
adapter = new MailDocumentIngestionAdapter(
|
||||||
|
properties,
|
||||||
|
importService,
|
||||||
|
extractionService,
|
||||||
|
relationService,
|
||||||
|
zipExtractionService
|
||||||
|
);
|
||||||
|
|
||||||
|
when(zipExtractionService.canHandle(any(), any())).thenReturn(false);
|
||||||
|
when(relationService.ensureRelation(any())).thenReturn(null);
|
||||||
|
when(importService.importDocument(any())).thenAnswer(invocation -> {
|
||||||
|
SourceDescriptor descriptor = invocation.getArgument(0);
|
||||||
|
importedDescriptors.add(descriptor);
|
||||||
|
return new ImportedDocumentResult(
|
||||||
|
buildDocumentFor(descriptor),
|
||||||
|
new DetectionResult(inferType(descriptor), inferFamily(descriptor), descriptor.mediaType(), "en", Map.of()),
|
||||||
|
List.of(),
|
||||||
|
false
|
||||||
|
);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should ingest filesystem-loaded mail message with text and binary attachments")
|
||||||
|
void shouldIngestFileSystemLoadedMailMessage() throws Exception {
|
||||||
|
Path emlPath = Path.of("src", "test", "resources", "mail", "sample-message.eml");
|
||||||
|
assertTrue(Files.exists(emlPath), "sample .eml test file must exist");
|
||||||
|
byte[] mimeBytes = Files.readAllBytes(emlPath);
|
||||||
|
|
||||||
|
SourceDescriptor sourceDescriptor = new SourceDescriptor(
|
||||||
|
null,
|
||||||
|
SourceType.MAIL,
|
||||||
|
"fs-mail-001",
|
||||||
|
emlPath.toAbsolutePath().toUri().toString(),
|
||||||
|
emlPath.getFileName().toString(),
|
||||||
|
"message/rfc822",
|
||||||
|
mimeBytes,
|
||||||
|
null,
|
||||||
|
OffsetDateTime.parse("2026-03-18T15:27:59+01:00"),
|
||||||
|
null,
|
||||||
|
Map.of("source", "filesystem-test")
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTrue(adapter.supports(sourceDescriptor));
|
||||||
|
|
||||||
|
IngestionResult result = adapter.ingest(sourceDescriptor);
|
||||||
|
|
||||||
|
assertEquals(3, result.documents().size(), "expected root mail document plus 2 attachment documents");
|
||||||
|
assertTrue(result.warnings().isEmpty(), "mail import should not create warnings for the sample message");
|
||||||
|
assertEquals(3, importedDescriptors.size(), "root + notes.txt + legacy.xls should be imported");
|
||||||
|
|
||||||
|
SourceDescriptor root = importedDescriptors.get(0);
|
||||||
|
assertEquals("message/rfc822", root.mediaType());
|
||||||
|
assertEquals("sample-message.eml", root.fileName());
|
||||||
|
assertNotNull(root.textContent());
|
||||||
|
assertTrue(root.textContent().contains("Subject: Sample mail with filesystem-loaded attachments"));
|
||||||
|
assertTrue(root.textContent().contains("Hello from the filesystem-backed sample message."));
|
||||||
|
assertEquals(DocumentVisibility.TENANT, root.accessContext().visibility());
|
||||||
|
assertNotNull(root.accessContext().ownerTenant());
|
||||||
|
assertEquals("tenant-a", root.accessContext().ownerTenant().tenantKey());
|
||||||
|
|
||||||
|
SourceDescriptor textAttachment = importedDescriptors.stream()
|
||||||
|
.filter(d -> "notes.txt".equals(d.fileName()))
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow();
|
||||||
|
assertEquals("text/plain", textAttachment.mediaType());
|
||||||
|
assertNotNull(textAttachment.textContent(), "plain text attachment should expose preview text");
|
||||||
|
assertTrue(textAttachment.textContent().contains("attachment notes"));
|
||||||
|
|
||||||
|
SourceDescriptor binaryAttachment = importedDescriptors.stream()
|
||||||
|
.filter(d -> "legacy.xls".equals(d.fileName()))
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow();
|
||||||
|
assertNull(binaryAttachment.textContent(), "binary old Excel attachment must not be passed as text content");
|
||||||
|
assertEquals("application/vnd.ms-excel", binaryAttachment.mediaType());
|
||||||
|
assertNotNull(binaryAttachment.binaryContent());
|
||||||
|
assertTrue(binaryAttachment.binaryContent().length > 0);
|
||||||
|
|
||||||
|
ArgumentCaptor<CreateDocumentRelationCommand> relationCaptor = ArgumentCaptor.forClass(CreateDocumentRelationCommand.class);
|
||||||
|
verify(relationService, times(2)).ensureRelation(relationCaptor.capture());
|
||||||
|
assertTrue(relationCaptor.getAllValues().stream().allMatch(cmd -> cmd.relationType() == RelationType.ATTACHMENT_OF));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Document buildDocumentFor(SourceDescriptor descriptor) {
|
||||||
|
return Document.builder()
|
||||||
|
.id(UUID.nameUUIDFromBytes((descriptor.sourceIdentifier() + ":" + descriptor.fileName()).getBytes()))
|
||||||
|
.visibility(descriptor.accessContext() == null ? DocumentVisibility.PUBLIC : descriptor.accessContext().visibility())
|
||||||
|
.documentType(inferType(descriptor))
|
||||||
|
.documentFamily(inferFamily(descriptor))
|
||||||
|
.status(DocumentStatus.RECEIVED)
|
||||||
|
.title(descriptor.fileName())
|
||||||
|
.mimeType(descriptor.mediaType())
|
||||||
|
.dedupHash(Integer.toHexString((descriptor.sourceIdentifier() + descriptor.fileName()).hashCode()))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentType inferType(SourceDescriptor descriptor) {
|
||||||
|
if (descriptor.sourceType() == SourceType.MAIL && "message/rfc822".equals(descriptor.mediaType())) {
|
||||||
|
return DocumentType.EMAIL;
|
||||||
|
}
|
||||||
|
String fileName = descriptor.fileName() == null ? "" : descriptor.fileName().toLowerCase();
|
||||||
|
if (fileName.endsWith(".txt")) {
|
||||||
|
return DocumentType.TEXT;
|
||||||
|
}
|
||||||
|
return DocumentType.GENERIC_BINARY;
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentFamily inferFamily(SourceDescriptor descriptor) {
|
||||||
|
return descriptor.sourceType() == SourceType.MAIL ? DocumentFamily.MAIL : DocumentFamily.GENERIC;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,374 @@
|
|||||||
|
package at.procon.dip.ingestion.integration;
|
||||||
|
|
||||||
|
import at.procon.dip.FixedPortPostgreSQLContainer;
|
||||||
|
import at.procon.dip.classification.detector.BasicMimeAndExtensionDocumentTypeDetector;
|
||||||
|
import at.procon.dip.classification.service.DocumentClassificationService;
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.StorageType;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentSource;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentContentRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentRelationRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentContentService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentRelationService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentRepresentationService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentSourceService;
|
||||||
|
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
|
||||||
|
import at.procon.dip.extraction.impl.*;
|
||||||
|
import at.procon.dip.extraction.service.DocumentExtractionService;
|
||||||
|
import at.procon.dip.ingestion.adapter.MailDocumentIngestionAdapter;
|
||||||
|
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
|
||||||
|
import at.procon.dip.ingestion.service.GenericDocumentImportService;
|
||||||
|
import at.procon.dip.ingestion.service.MailMessageExtractionService;
|
||||||
|
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import at.procon.dip.normalization.impl.DefaultGenericTextRepresentationBuilder;
|
||||||
|
import at.procon.dip.normalization.service.TextRepresentationBuildService;
|
||||||
|
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import at.procon.ted.service.attachment.PdfExtractionService;
|
||||||
|
import at.procon.ted.service.attachment.ZipExtractionService;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.UUID;
|
||||||
|
import java.util.zip.ZipEntry;
|
||||||
|
import java.util.zip.ZipInputStream;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.TestInstance;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.boot.SpringBootConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.domain.EntityScan;
|
||||||
|
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.jdbc.JdbcTemplateAutoConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.task.TaskSchedulingAutoConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.transaction.TransactionAutoConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.web.servlet.ServletWebServerFactoryAutoConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
|
||||||
|
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||||
|
import org.springframework.boot.test.context.SpringBootTest;
|
||||||
|
import org.springframework.context.annotation.Import;
|
||||||
|
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||||
|
import org.springframework.test.context.DynamicPropertyRegistry;
|
||||||
|
import org.springframework.test.context.DynamicPropertySource;
|
||||||
|
import org.springframework.test.context.TestPropertySource;
|
||||||
|
import org.testcontainers.containers.PostgreSQLContainer;
|
||||||
|
import org.testcontainers.junit.jupiter.Container;
|
||||||
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
@SpringBootTest(classes = MailBundleProcessingIntegrationTest.TestApplication.class)
|
||||||
|
@Testcontainers
|
||||||
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||||
|
@TestPropertySource(properties = {
|
||||||
|
"spring.jpa.hibernate.ddl-auto=create-drop",
|
||||||
|
"spring.jpa.show-sql=false",
|
||||||
|
"spring.jpa.open-in-view=false",
|
||||||
|
"spring.jpa.properties.hibernate.default_schema=DOC",
|
||||||
|
"ted.vectorization.enabled=false",
|
||||||
|
"ted.generic-ingestion.enabled=true",
|
||||||
|
"ted.generic-ingestion.mail-adapter-enabled=true",
|
||||||
|
"ted.generic-ingestion.file-system-enabled=false",
|
||||||
|
"ted.generic-ingestion.rest-upload-enabled=false",
|
||||||
|
"ted.generic-ingestion.deduplicate-by-content-hash=false",
|
||||||
|
"ted.generic-ingestion.expand-mail-zip-attachments=true",
|
||||||
|
"ted.generic-ingestion.default-visibility=PUBLIC",
|
||||||
|
"ted.generic-ingestion.mail-default-visibility=RESTRICTED",
|
||||||
|
"ted.generic-ingestion.import-batch-id=test-mail-bundle",
|
||||||
|
"ted.generic-ingestion.mail-import-batch-id=test-mail-bundle-mail"
|
||||||
|
})
|
||||||
|
class MailBundleProcessingIntegrationTest {
|
||||||
|
|
||||||
|
@Container
|
||||||
|
static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", 15432)
|
||||||
|
.withDatabaseName("dip_test")
|
||||||
|
.withUsername("test")
|
||||||
|
.withPassword("test")
|
||||||
|
.withInitScript("sql/create-doc-test-schemas.sql");
|
||||||
|
|
||||||
|
static {
|
||||||
|
postgres.start();
|
||||||
|
}
|
||||||
|
|
||||||
|
@DynamicPropertySource
|
||||||
|
static void registerProperties(DynamicPropertyRegistry registry) {
|
||||||
|
registry.add("spring.datasource.url", postgres::getJdbcUrl);
|
||||||
|
registry.add("spring.datasource.username", postgres::getUsername);
|
||||||
|
registry.add("spring.datasource.password", postgres::getPassword);
|
||||||
|
registry.add("spring.datasource.driver-class-name", postgres::getDriverClassName);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private DocumentIngestionGateway gateway;
|
||||||
|
@Autowired
|
||||||
|
private MailMessageExtractionService mailMessageExtractionService;
|
||||||
|
@Autowired
|
||||||
|
private DocumentRepository documentRepository;
|
||||||
|
@Autowired
|
||||||
|
private DocumentSourceRepository documentSourceRepository;
|
||||||
|
@Autowired
|
||||||
|
private DocumentContentRepository documentContentRepository;
|
||||||
|
@Autowired
|
||||||
|
private DocumentRelationRepository documentRelationRepository;
|
||||||
|
@Autowired
|
||||||
|
private DocumentTextRepresentationRepository documentTextRepresentationRepository;
|
||||||
|
@Autowired
|
||||||
|
private DocumentEmbeddingRepository documentEmbeddingRepository;
|
||||||
|
@Autowired
|
||||||
|
private DocumentEmbeddingModelRepository documentEmbeddingModelRepository;
|
||||||
|
@Autowired
|
||||||
|
private DocumentTenantRepository documentTenantRepository;
|
||||||
|
|
||||||
|
private Path bundleDirectory;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() throws Exception {
|
||||||
|
System.out.println("TEST setUp start");
|
||||||
|
cleanupDatabase();
|
||||||
|
bundleDirectory = Files.createTempDirectory("mail-bundle-");
|
||||||
|
try (InputStream in = getClass().getResourceAsStream("/mail-sample-eml-bundle.zip")) {
|
||||||
|
assertThat(in).isNotNull();
|
||||||
|
unzip(in, bundleDirectory);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
void tearDown() throws Exception {
|
||||||
|
//cleanupDatabase();
|
||||||
|
if (bundleDirectory != null && Files.exists(bundleDirectory)) {
|
||||||
|
Files.walk(bundleDirectory)
|
||||||
|
.sorted(Comparator.reverseOrder())
|
||||||
|
.forEach(path -> {
|
||||||
|
try {
|
||||||
|
Files.deleteIfExists(path);
|
||||||
|
} catch (IOException ignored) {
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@org.junit.jupiter.api.Timeout(120)
|
||||||
|
@Test
|
||||||
|
void processesEntireMailBundleThroughRealGatewayAndPersistsResults() throws Exception {
|
||||||
|
List<Path> emlFiles = Files.walk(bundleDirectory)
|
||||||
|
.filter(path -> path.getFileName().toString().endsWith(".eml"))
|
||||||
|
.sorted()
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
assertThat(emlFiles).hasSizeGreaterThanOrEqualTo(5);
|
||||||
|
|
||||||
|
int expectedRootDocuments = 0;
|
||||||
|
int expectedAttachmentDocuments = 0;
|
||||||
|
|
||||||
|
for (Path eml : emlFiles) {
|
||||||
|
byte[] raw = Files.readAllBytes(eml);
|
||||||
|
var parsed = mailMessageExtractionService.parse(raw);
|
||||||
|
expectedRootDocuments++;
|
||||||
|
expectedAttachmentDocuments += parsed.attachments().size();
|
||||||
|
|
||||||
|
IngestionResult result = gateway.ingest(new SourceDescriptor(
|
||||||
|
null,
|
||||||
|
at.procon.dip.domain.document.SourceType.MAIL,
|
||||||
|
eml.getFileName().toString(),
|
||||||
|
eml.toString(),
|
||||||
|
eml.getFileName().toString(),
|
||||||
|
"message/rfc822",
|
||||||
|
raw,
|
||||||
|
null,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
OriginalContentStoragePolicy.STORE,
|
||||||
|
Map.of("title", eml.getFileName().toString())
|
||||||
|
));
|
||||||
|
|
||||||
|
assertThat(result.documents()).isNotEmpty();
|
||||||
|
assertThat(result.documents().get(0).documentType()).isEqualTo(DocumentType.MIME_MESSAGE);
|
||||||
|
}
|
||||||
|
|
||||||
|
long totalDocuments = documentRepository.count();
|
||||||
|
long totalSources = documentSourceRepository.count();
|
||||||
|
long totalRelations = documentRelationRepository.count();
|
||||||
|
|
||||||
|
assertThat(totalDocuments).isEqualTo(expectedRootDocuments + expectedAttachmentDocuments);
|
||||||
|
assertThat(totalSources).isEqualTo(totalDocuments);
|
||||||
|
assertThat(totalRelations).isEqualTo(expectedAttachmentDocuments);
|
||||||
|
|
||||||
|
List<Document> allDocuments = documentRepository.findAll();
|
||||||
|
long rootCount = allDocuments.stream().filter(d -> d.getDocumentType() == DocumentType.MIME_MESSAGE).count();
|
||||||
|
assertThat(rootCount).isEqualTo(expectedRootDocuments);
|
||||||
|
|
||||||
|
List<DocumentSource> allSources = documentSourceRepository.findAll();
|
||||||
|
List<DocumentContent> allContent = documentContentRepository.findAll();
|
||||||
|
|
||||||
|
long mimeOriginalCount = allContent.stream()
|
||||||
|
.filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
|
||||||
|
.filter(c -> c.getMimeType() != null && c.getMimeType().startsWith("message/rfc822"))
|
||||||
|
.count();
|
||||||
|
assertThat(mimeOriginalCount).isEqualTo(expectedRootDocuments);
|
||||||
|
|
||||||
|
assertThat(documentTextRepresentationRepository.count()).isGreaterThanOrEqualTo(expectedRootDocuments);
|
||||||
|
|
||||||
|
List<UUID> pdfDocumentIds = allSources.stream()
|
||||||
|
.filter(s -> s.getSourceFilename() != null && s.getSourceFilename().toLowerCase().endsWith(".pdf"))
|
||||||
|
.map(s -> s.getDocument().getId())
|
||||||
|
.toList();
|
||||||
|
assertThat(pdfDocumentIds).isNotEmpty();
|
||||||
|
assertThat(allContent.stream()
|
||||||
|
.filter(c -> pdfDocumentIds.contains(c.getDocument().getId()))
|
||||||
|
.anyMatch(c -> c.getContentRole() == ContentRole.NORMALIZED_TEXT && c.getTextContent() != null && !c.getTextContent().isBlank()))
|
||||||
|
.isTrue();
|
||||||
|
|
||||||
|
List<UUID> spreadsheetIds = allSources.stream()
|
||||||
|
.filter(s -> s.getSourceFilename() != null && (s.getSourceFilename().toLowerCase().endsWith(".xlsx") || s.getSourceFilename().toLowerCase().endsWith(".xls")))
|
||||||
|
.map(s -> s.getDocument().getId())
|
||||||
|
.toList();
|
||||||
|
if (!spreadsheetIds.isEmpty()) {
|
||||||
|
assertThat(allContent.stream()
|
||||||
|
.filter(c -> spreadsheetIds.contains(c.getDocument().getId()))
|
||||||
|
.filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
|
||||||
|
.anyMatch(c -> c.getStorageType() == StorageType.DB_BINARY || c.getStorageType() == StorageType.EXTERNAL_REFERENCE))
|
||||||
|
.isTrue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void processesSingleFilesystemMailAndPersistsAttachmentsAndRelations() throws Exception {
|
||||||
|
Path sample = Files.walk(bundleDirectory)
|
||||||
|
//.filter(path -> path.getFileName().toString().equals("sample-mail-02-office-and-text.eml"))
|
||||||
|
.filter(path -> path.getFileName().toString().equals("sample-mail-01-basic-reporting.eml"))
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow();
|
||||||
|
|
||||||
|
byte[] raw = Files.readAllBytes(sample);
|
||||||
|
var parsed = mailMessageExtractionService.parse(raw);
|
||||||
|
|
||||||
|
IngestionResult result = gateway.ingest(new SourceDescriptor(
|
||||||
|
null,
|
||||||
|
at.procon.dip.domain.document.SourceType.MAIL,
|
||||||
|
"filesystem-sample-02",
|
||||||
|
sample.toString(),
|
||||||
|
sample.getFileName().toString(),
|
||||||
|
"message/rfc822",
|
||||||
|
raw,
|
||||||
|
null,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
OriginalContentStoragePolicy.STORE,
|
||||||
|
Map.of("title", sample.getFileName().toString())
|
||||||
|
));
|
||||||
|
|
||||||
|
assertThat(result.documents()).hasSize(1 + parsed.attachments().size());
|
||||||
|
assertThat(documentRepository.count()).isEqualTo(1 + parsed.attachments().size());
|
||||||
|
assertThat(documentRelationRepository.count()).isEqualTo(parsed.attachments().size());
|
||||||
|
|
||||||
|
List<DocumentSource> sources = documentSourceRepository.findAll();
|
||||||
|
assertThat(sources).anyMatch(s -> sample.getFileName().toString().equals(s.getSourceFilename()));
|
||||||
|
assertThat(sources).anyMatch(s -> s.getSourceFilename() != null && s.getSourceFilename().toLowerCase().endsWith(".pdf"));
|
||||||
|
assertThat(sources).anyMatch(s -> s.getSourceFilename() != null && s.getSourceFilename().toLowerCase().endsWith(".csv"));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void cleanupDatabase() {
|
||||||
|
System.out.println("cleanup: relations");
|
||||||
|
documentRelationRepository.deleteAll();
|
||||||
|
System.out.println("cleanup: embeddings");
|
||||||
|
documentEmbeddingRepository.deleteAll();
|
||||||
|
System.out.println("cleanup: text reps");
|
||||||
|
documentTextRepresentationRepository.deleteAll();
|
||||||
|
System.out.println("cleanup: content");
|
||||||
|
documentContentRepository.deleteAll();
|
||||||
|
System.out.println("cleanup: sources");
|
||||||
|
documentSourceRepository.deleteAll();
|
||||||
|
System.out.println("cleanup: documents");
|
||||||
|
documentRepository.deleteAll();
|
||||||
|
System.out.println("cleanup: models");
|
||||||
|
documentEmbeddingModelRepository.deleteAll();
|
||||||
|
System.out.println("cleanup: tenants");
|
||||||
|
documentTenantRepository.deleteAll();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void unzip(InputStream inputStream, Path targetDir) throws IOException {
|
||||||
|
try (ZipInputStream zis = new ZipInputStream(inputStream)) {
|
||||||
|
ZipEntry entry;
|
||||||
|
while ((entry = zis.getNextEntry()) != null) {
|
||||||
|
Path out = targetDir.resolve(entry.getName()).normalize();
|
||||||
|
if (!out.startsWith(targetDir)) {
|
||||||
|
throw new IOException("Zip entry outside target dir: " + entry.getName());
|
||||||
|
}
|
||||||
|
if (entry.isDirectory()) {
|
||||||
|
Files.createDirectories(out);
|
||||||
|
} else {
|
||||||
|
Files.createDirectories(out.getParent());
|
||||||
|
Files.copy(zis, out);
|
||||||
|
}
|
||||||
|
zis.closeEntry();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SpringBootConfiguration
|
||||||
|
@ImportAutoConfiguration({
|
||||||
|
DataSourceAutoConfiguration.class,
|
||||||
|
HibernateJpaAutoConfiguration.class,
|
||||||
|
TransactionAutoConfiguration.class,
|
||||||
|
JdbcTemplateAutoConfiguration.class
|
||||||
|
})
|
||||||
|
@EnableConfigurationProperties(TedProcessorProperties.class)
|
||||||
|
@EntityScan(basePackages = {
|
||||||
|
"at.procon.dip.domain.document.entity",
|
||||||
|
"at.procon.dip.domain.tenant.entity"
|
||||||
|
})
|
||||||
|
@EnableJpaRepositories(basePackages = {
|
||||||
|
"at.procon.dip.domain.document.repository",
|
||||||
|
"at.procon.dip.domain.tenant.repository"
|
||||||
|
})
|
||||||
|
@Import({
|
||||||
|
DocumentIngestionGateway.class,
|
||||||
|
GenericDocumentImportService.class,
|
||||||
|
MailDocumentIngestionAdapter.class,
|
||||||
|
MailMessageExtractionService.class,
|
||||||
|
ZipExtractionService.class,
|
||||||
|
DocumentService.class,
|
||||||
|
DocumentSourceService.class,
|
||||||
|
DocumentContentService.class,
|
||||||
|
DocumentRepresentationService.class,
|
||||||
|
DocumentEmbeddingService.class,
|
||||||
|
DocumentRelationService.class,
|
||||||
|
DocumentClassificationService.class,
|
||||||
|
BasicMimeAndExtensionDocumentTypeDetector.class,
|
||||||
|
DocumentExtractionService.class,
|
||||||
|
PlainTextDocumentExtractor.class,
|
||||||
|
HtmlDocumentExtractor.class,
|
||||||
|
PdfDocumentExtractor.class,
|
||||||
|
BinaryPassThroughDocumentExtractor.class,
|
||||||
|
MimeMessageDocumentExtractor.class,
|
||||||
|
SpreadsheetDocumentExtractor.class,
|
||||||
|
TextRepresentationBuildService.class,
|
||||||
|
DefaultGenericTextRepresentationBuilder.class,
|
||||||
|
PdfExtractionService.class,
|
||||||
|
DocumentExtractionService.class,
|
||||||
|
GenericDocumentImportService.class,
|
||||||
|
StructuredDocumentProcessingService.class,
|
||||||
|
|
||||||
|
})
|
||||||
|
static class TestApplication {
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,66 @@
|
|||||||
|
package at.procon.dip.ingestion.service;
|
||||||
|
|
||||||
|
import at.procon.dip.ingestion.service.MailMessageExtractionService.MailAttachment;
|
||||||
|
import at.procon.dip.testsupport.MailBundleTestSupport;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.List;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.params.ParameterizedTest;
|
||||||
|
import org.junit.jupiter.params.provider.MethodSource;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class MailMessageExtractionServiceBundleTest {
|
||||||
|
|
||||||
|
private static Path bundleRoot;
|
||||||
|
|
||||||
|
private final MailMessageExtractionService service = new MailMessageExtractionService();
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
static void extractBundle() throws Exception {
|
||||||
|
bundleRoot = MailBundleTestSupport.extractBundleToTempDir();
|
||||||
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest(name = "parse {0}")
|
||||||
|
@MethodSource("at.procon.dip.testsupport.MailBundleTestSupport#bundleMailNames")
|
||||||
|
void parse_should_extract_expected_attachments_from_filesystem_bundle(String fileName) throws Exception {
|
||||||
|
Path eml = bundleRoot.resolve("eml").resolve(fileName);
|
||||||
|
byte[] rawMime = Files.readAllBytes(eml);
|
||||||
|
|
||||||
|
MailMessageExtractionService.ParsedMailMessage parsed = service.parse(rawMime);
|
||||||
|
|
||||||
|
assertNotNull(parsed.subject(), "subject should be parsed for " + fileName);
|
||||||
|
assertNotNull(parsed.receivedAt(), "receivedAt should be parsed for " + fileName);
|
||||||
|
assertFalse(parsed.attachments().isEmpty(), "attachments should be extracted for " + fileName);
|
||||||
|
|
||||||
|
List<String> actualNames = parsed.attachments().stream().map(MailAttachment::fileName).toList();
|
||||||
|
assertEquals(MailBundleTestSupport.EXPECTED_ATTACHMENT_NAMES.get(fileName), actualNames,
|
||||||
|
"attachment filenames should match validation bundle for " + fileName);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void parse_should_preserve_utf8_attachment_filenames() throws Exception {
|
||||||
|
Path eml = bundleRoot.resolve("eml").resolve("sample-mail-04-utf8-filenames.eml");
|
||||||
|
byte[] rawMime = Files.readAllBytes(eml);
|
||||||
|
|
||||||
|
MailMessageExtractionService.ParsedMailMessage parsed = service.parse(rawMime);
|
||||||
|
|
||||||
|
List<String> actualNames = parsed.attachments().stream().map(MailAttachment::fileName).toList();
|
||||||
|
assertEquals(List.of("prüfbericht.pdf", "данни.xlsx", "überblick.csv"), actualNames);
|
||||||
|
assertTrue(parsed.textBody().contains("UTF") || !parsed.textBody().isBlank(),
|
||||||
|
"UTF-8 sample should produce a readable body");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void parse_should_keep_octet_stream_attachments_in_bundle() throws Exception {
|
||||||
|
Path eml = bundleRoot.resolve("eml").resolve("sample-mail-05-generic-octet-stream.eml");
|
||||||
|
byte[] rawMime = Files.readAllBytes(eml);
|
||||||
|
|
||||||
|
MailMessageExtractionService.ParsedMailMessage parsed = service.parse(rawMime);
|
||||||
|
|
||||||
|
assertEquals(3, parsed.attachments().size());
|
||||||
|
assertTrue(parsed.attachments().stream().allMatch(a -> a.contentType() != null && a.contentType().contains("application/octet-stream")));
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,65 @@
|
|||||||
|
package at.procon.dip.testsupport;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardCopyOption;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
import org.junit.jupiter.api.Named;
|
||||||
|
|
||||||
|
public final class MailBundleTestSupport {
|
||||||
|
|
||||||
|
private MailBundleTestSupport() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final Map<String, List<String>> EXPECTED_ATTACHMENT_NAMES = Map.of(
|
||||||
|
"sample-mail-01-basic-reporting.eml", List.of("sample-report.xlsx", "invoice-demo.pdf", "contacts.csv"),
|
||||||
|
"sample-mail-02-office-and-text.eml", List.of("purchase-order.docx", "readme.txt", "metadata.json", "payload.xml"),
|
||||||
|
"sample-mail-03-inline-image-and-files.eml", List.of("inline-logo.png", "overview.pdf", "contacts.csv"),
|
||||||
|
"sample-mail-04-utf8-filenames.eml", List.of("prüfbericht.pdf", "данни.xlsx", "überblick.csv"),
|
||||||
|
"sample-mail-05-generic-octet-stream.eml", List.of("generic-report.xlsx", "generic-invoice.pdf", "generic-data.csv")
|
||||||
|
);
|
||||||
|
|
||||||
|
public static Path extractBundleToTempDir() throws IOException {
|
||||||
|
Path tempDir = Files.createTempDirectory("mail-bundle-");
|
||||||
|
try (InputStream in = MailBundleTestSupport.class.getResourceAsStream("/mail/sample-eml-bundle.zip")) {
|
||||||
|
if (in == null) {
|
||||||
|
throw new IOException("Missing test resource /mail/sample-eml-bundle.zip");
|
||||||
|
}
|
||||||
|
Path zipPath = tempDir.resolve("sample-eml-bundle.zip");
|
||||||
|
Files.copy(in, zipPath, StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
unzip(zipPath, tempDir);
|
||||||
|
}
|
||||||
|
return tempDir.resolve("eml_samples");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void unzip(Path zipPath, Path targetDir) throws IOException {
|
||||||
|
try (java.util.zip.ZipInputStream zin = new java.util.zip.ZipInputStream(Files.newInputStream(zipPath))) {
|
||||||
|
java.util.zip.ZipEntry entry;
|
||||||
|
while ((entry = zin.getNextEntry()) != null) {
|
||||||
|
Path out = targetDir.resolve(entry.getName()).normalize();
|
||||||
|
if (!out.startsWith(targetDir)) {
|
||||||
|
throw new IOException("Zip slip attempt: " + entry.getName());
|
||||||
|
}
|
||||||
|
if (entry.isDirectory()) {
|
||||||
|
Files.createDirectories(out);
|
||||||
|
} else {
|
||||||
|
Files.createDirectories(out.getParent());
|
||||||
|
Files.copy(zin, out, StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
}
|
||||||
|
zin.closeEntry();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Stream<String> bundleMailNames() {
|
||||||
|
return EXPECTED_ATTACHMENT_NAMES.keySet().stream().sorted();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Stream<Named<String>> namedBundleMailNames() {
|
||||||
|
return bundleMailNames().map(name -> Named.of(name, name));
|
||||||
|
}
|
||||||
|
}
|
||||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,28 @@
|
|||||||
|
From: Sender Example <sender@example.com>
|
||||||
|
To: Receiver Example <receiver@example.com>
|
||||||
|
Subject: Sample mail with filesystem-loaded attachments
|
||||||
|
Date: Tue, 18 Mar 2026 15:27:59 +0100
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: multipart/mixed; boundary="boundary42"
|
||||||
|
|
||||||
|
--boundary42
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: quoted-printable
|
||||||
|
|
||||||
|
Hello from the filesystem-backed sample message.
|
||||||
|
This mail has one text attachment and one old Excel attachment.
|
||||||
|
|
||||||
|
--boundary42
|
||||||
|
Content-Type: text/plain; charset=UTF-8; name="notes.txt"
|
||||||
|
Content-Disposition: attachment; filename="notes.txt"
|
||||||
|
Content-Transfer-Encoding: quoted-printable
|
||||||
|
|
||||||
|
These are the attachment notes.
|
||||||
|
|
||||||
|
--boundary42
|
||||||
|
Content-Type: application/vnd.ms-excel; name="legacy.xls"
|
||||||
|
Content-Disposition: attachment; filename="legacy.xls"
|
||||||
|
Content-Transfer-Encoding: base64
|
||||||
|
|
||||||
|
0M8R4KGxGuEAAAAAAAAAAEZBS0VYTFM=
|
||||||
|
--boundary42--
|
||||||
@ -0,0 +1,2 @@
|
|||||||
|
CREATE SCHEMA IF NOT EXISTS DOC;
|
||||||
|
CREATE SCHEMA IF NOT EXISTS TED;
|
||||||
Loading…
Reference in New Issue