// Extracted source listing: 377 lines, 17 KiB, Java.
package at.procon.dip.ingestion.integration;

import at.procon.dip.FixedPortPostgreSQLContainer;
import at.procon.dip.classification.detector.BasicMimeAndExtensionDocumentTypeDetector;
import at.procon.dip.classification.service.DocumentClassificationService;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentSource;
import at.procon.dip.domain.document.repository.DocumentContentRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRelationRepository;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.document.service.DocumentContentService;
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
import at.procon.dip.domain.document.service.DocumentRelationService;
import at.procon.dip.domain.document.service.DocumentRepresentationService;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.document.service.DocumentSourceService;
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
import at.procon.dip.extraction.impl.*;
import at.procon.dip.extraction.service.DocumentExtractionService;
import at.procon.dip.ingestion.adapter.MailDocumentIngestionAdapter;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.MailMessageExtractionService;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.normalization.impl.DefaultGenericTextRepresentationBuilder;
import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.service.attachment.PdfExtractionService;
import at.procon.ted.service.attachment.ZipExtractionService;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Stream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringBootConfiguration;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.boot.autoconfigure.jdbc.JdbcTemplateAutoConfiguration;
import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration;
import org.springframework.boot.autoconfigure.task.TaskSchedulingAutoConfiguration;
import org.springframework.boot.autoconfigure.transaction.TransactionAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.ServletWebServerFactoryAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Import;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
import org.springframework.test.context.DynamicPropertyRegistry;
import org.springframework.test.context.DynamicPropertySource;
import org.springframework.test.context.TestPropertySource;
import org.testcontainers.containers.PostgreSQLContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;

import static org.assertj.core.api.Assertions.assertThat;

@SpringBootTest(classes = MailBundleProcessingIntegrationTest.TestApplication.class)
|
|
@Testcontainers
|
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
|
@TestPropertySource(properties = {
|
|
"spring.jpa.hibernate.ddl-auto=create-drop",
|
|
"spring.jpa.show-sql=false",
|
|
"spring.jpa.open-in-view=false",
|
|
"spring.jpa.properties.hibernate.default_schema=DOC",
|
|
"ted.vectorization.enabled=false",
|
|
"ted.generic-ingestion.enabled=true",
|
|
"ted.generic-ingestion.mail-adapter-enabled=true",
|
|
"ted.generic-ingestion.file-system-enabled=false",
|
|
"ted.generic-ingestion.rest-upload-enabled=false",
|
|
"ted.generic-ingestion.deduplicate-by-content-hash=false",
|
|
"ted.generic-ingestion.expand-mail-zip-attachments=true",
|
|
"ted.generic-ingestion.default-visibility=PUBLIC",
|
|
"ted.generic-ingestion.mail-default-visibility=RESTRICTED",
|
|
"ted.generic-ingestion.import-batch-id=test-mail-bundle",
|
|
"ted.generic-ingestion.mail-import-batch-id=test-mail-bundle-mail"
|
|
})
|
|
class MailBundleProcessingIntegrationTest {
|
|
|
|
private static final int HOST_PORT = 15433;
|
|
|
|
@Container
|
|
static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", HOST_PORT)
|
|
.withDatabaseName("dip_test")
|
|
.withUsername("test")
|
|
.withPassword("test")
|
|
.withInitScript("sql/create-doc-test-schemas.sql");
|
|
|
|
static {
|
|
postgres.start();
|
|
}
|
|
|
|
@DynamicPropertySource
|
|
static void registerProperties(DynamicPropertyRegistry registry) {
|
|
registry.add("spring.datasource.url", postgres::getJdbcUrl);
|
|
registry.add("spring.datasource.username", postgres::getUsername);
|
|
registry.add("spring.datasource.password", postgres::getPassword);
|
|
registry.add("spring.datasource.driver-class-name", postgres::getDriverClassName);
|
|
}
|
|
|
|
@Autowired
|
|
private DocumentIngestionGateway gateway;
|
|
@Autowired
|
|
private MailMessageExtractionService mailMessageExtractionService;
|
|
@Autowired
|
|
private DocumentRepository documentRepository;
|
|
@Autowired
|
|
private DocumentSourceRepository documentSourceRepository;
|
|
@Autowired
|
|
private DocumentContentRepository documentContentRepository;
|
|
@Autowired
|
|
private DocumentRelationRepository documentRelationRepository;
|
|
@Autowired
|
|
private DocumentTextRepresentationRepository documentTextRepresentationRepository;
|
|
@Autowired
|
|
private DocumentEmbeddingRepository documentEmbeddingRepository;
|
|
@Autowired
|
|
private DocumentEmbeddingModelRepository documentEmbeddingModelRepository;
|
|
@Autowired
|
|
private DocumentTenantRepository documentTenantRepository;
|
|
|
|
private Path bundleDirectory;
|
|
|
|
@BeforeEach
|
|
void setUp() throws Exception {
|
|
System.out.println("TEST setUp start");
|
|
cleanupDatabase();
|
|
bundleDirectory = Files.createTempDirectory("mail-bundle-");
|
|
try (InputStream in = getClass().getResourceAsStream("/mail-sample-eml-bundle.zip")) {
|
|
assertThat(in).isNotNull();
|
|
unzip(in, bundleDirectory);
|
|
}
|
|
}
|
|
|
|
@AfterEach
|
|
void tearDown() throws Exception {
|
|
//cleanupDatabase();
|
|
if (bundleDirectory != null && Files.exists(bundleDirectory)) {
|
|
Files.walk(bundleDirectory)
|
|
.sorted(Comparator.reverseOrder())
|
|
.forEach(path -> {
|
|
try {
|
|
Files.deleteIfExists(path);
|
|
} catch (IOException ignored) {
|
|
}
|
|
});
|
|
}
|
|
}
|
|
|
|
@org.junit.jupiter.api.Timeout(120)
|
|
@Test
|
|
void processesEntireMailBundleThroughRealGatewayAndPersistsResults() throws Exception {
|
|
List<Path> emlFiles = Files.walk(bundleDirectory)
|
|
.filter(path -> path.getFileName().toString().endsWith(".eml"))
|
|
.sorted()
|
|
.toList();
|
|
|
|
assertThat(emlFiles).hasSizeGreaterThanOrEqualTo(5);
|
|
|
|
int expectedRootDocuments = 0;
|
|
int expectedAttachmentDocuments = 0;
|
|
|
|
for (Path eml : emlFiles) {
|
|
byte[] raw = Files.readAllBytes(eml);
|
|
var parsed = mailMessageExtractionService.parse(raw);
|
|
expectedRootDocuments++;
|
|
expectedAttachmentDocuments += parsed.attachments().size();
|
|
|
|
IngestionResult result = gateway.ingest(new SourceDescriptor(
|
|
null,
|
|
at.procon.dip.domain.document.SourceType.MAIL,
|
|
eml.getFileName().toString(),
|
|
eml.toString(),
|
|
eml.getFileName().toString(),
|
|
"message/rfc822",
|
|
raw,
|
|
null,
|
|
OffsetDateTime.now(),
|
|
OriginalContentStoragePolicy.STORE,
|
|
Map.of("title", eml.getFileName().toString())
|
|
));
|
|
|
|
assertThat(result.documents()).isNotEmpty();
|
|
assertThat(result.documents().get(0).documentType()).isEqualTo(DocumentType.MIME_MESSAGE);
|
|
}
|
|
|
|
long totalDocuments = documentRepository.count();
|
|
long totalSources = documentSourceRepository.count();
|
|
long totalRelations = documentRelationRepository.count();
|
|
|
|
assertThat(totalDocuments).isEqualTo(expectedRootDocuments + expectedAttachmentDocuments);
|
|
assertThat(totalSources).isEqualTo(totalDocuments);
|
|
assertThat(totalRelations).isEqualTo(expectedAttachmentDocuments);
|
|
|
|
List<Document> allDocuments = documentRepository.findAll();
|
|
long rootCount = allDocuments.stream().filter(d -> d.getDocumentType() == DocumentType.MIME_MESSAGE).count();
|
|
assertThat(rootCount).isEqualTo(expectedRootDocuments);
|
|
|
|
List<DocumentSource> allSources = documentSourceRepository.findAll();
|
|
List<DocumentContent> allContent = documentContentRepository.findAll();
|
|
|
|
long mimeOriginalCount = allContent.stream()
|
|
.filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
|
|
.filter(c -> c.getMimeType() != null && c.getMimeType().startsWith("message/rfc822"))
|
|
.count();
|
|
assertThat(mimeOriginalCount).isEqualTo(expectedRootDocuments);
|
|
|
|
assertThat(documentTextRepresentationRepository.count()).isGreaterThanOrEqualTo(expectedRootDocuments);
|
|
|
|
List<UUID> pdfDocumentIds = allSources.stream()
|
|
.filter(s -> s.getSourceFilename() != null && s.getSourceFilename().toLowerCase().endsWith(".pdf"))
|
|
.map(s -> s.getDocument().getId())
|
|
.toList();
|
|
assertThat(pdfDocumentIds).isNotEmpty();
|
|
assertThat(allContent.stream()
|
|
.filter(c -> pdfDocumentIds.contains(c.getDocument().getId()))
|
|
.anyMatch(c -> c.getContentRole() == ContentRole.NORMALIZED_TEXT && c.getTextContent() != null && !c.getTextContent().isBlank()))
|
|
.isTrue();
|
|
|
|
List<UUID> spreadsheetIds = allSources.stream()
|
|
.filter(s -> s.getSourceFilename() != null && (s.getSourceFilename().toLowerCase().endsWith(".xlsx") || s.getSourceFilename().toLowerCase().endsWith(".xls")))
|
|
.map(s -> s.getDocument().getId())
|
|
.toList();
|
|
if (!spreadsheetIds.isEmpty()) {
|
|
assertThat(allContent.stream()
|
|
.filter(c -> spreadsheetIds.contains(c.getDocument().getId()))
|
|
.filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
|
|
.anyMatch(c -> c.getStorageType() == StorageType.DB_BINARY || c.getStorageType() == StorageType.EXTERNAL_REFERENCE))
|
|
.isTrue();
|
|
}
|
|
}
|
|
|
|
@Test
|
|
void processesSingleFilesystemMailAndPersistsAttachmentsAndRelations() throws Exception {
|
|
Path sample = Files.walk(bundleDirectory)
|
|
//.filter(path -> path.getFileName().toString().equals("sample-mail-02-office-and-text.eml"))
|
|
.filter(path -> path.getFileName().toString().equals("sample-mail-01-basic-reporting.eml"))
|
|
.findFirst()
|
|
.orElseThrow();
|
|
|
|
byte[] raw = Files.readAllBytes(sample);
|
|
var parsed = mailMessageExtractionService.parse(raw);
|
|
|
|
IngestionResult result = gateway.ingest(new SourceDescriptor(
|
|
null,
|
|
at.procon.dip.domain.document.SourceType.MAIL,
|
|
"filesystem-sample-02",
|
|
sample.toString(),
|
|
sample.getFileName().toString(),
|
|
"message/rfc822",
|
|
raw,
|
|
null,
|
|
OffsetDateTime.now(),
|
|
OriginalContentStoragePolicy.STORE,
|
|
Map.of("title", sample.getFileName().toString())
|
|
));
|
|
|
|
assertThat(result.documents()).hasSize(1 + parsed.attachments().size());
|
|
assertThat(documentRepository.count()).isEqualTo(1 + parsed.attachments().size());
|
|
assertThat(documentRelationRepository.count()).isEqualTo(parsed.attachments().size());
|
|
|
|
List<DocumentSource> sources = documentSourceRepository.findAll();
|
|
assertThat(sources).anyMatch(s -> sample.getFileName().toString().equals(s.getSourceFilename()));
|
|
assertThat(sources).anyMatch(s -> s.getSourceFilename() != null && s.getSourceFilename().toLowerCase().endsWith(".pdf"));
|
|
assertThat(sources).anyMatch(s -> s.getSourceFilename() != null && s.getSourceFilename().toLowerCase().endsWith(".csv"));
|
|
}
|
|
|
|
private void cleanupDatabase() {
|
|
System.out.println("cleanup: relations");
|
|
documentRelationRepository.deleteAll();
|
|
System.out.println("cleanup: embeddings");
|
|
documentEmbeddingRepository.deleteAll();
|
|
System.out.println("cleanup: text reps");
|
|
documentTextRepresentationRepository.deleteAll();
|
|
System.out.println("cleanup: content");
|
|
documentContentRepository.deleteAll();
|
|
System.out.println("cleanup: sources");
|
|
documentSourceRepository.deleteAll();
|
|
System.out.println("cleanup: documents");
|
|
documentRepository.deleteAll();
|
|
System.out.println("cleanup: models");
|
|
documentEmbeddingModelRepository.deleteAll();
|
|
System.out.println("cleanup: tenants");
|
|
documentTenantRepository.deleteAll();
|
|
}
|
|
|
|
private static void unzip(InputStream inputStream, Path targetDir) throws IOException {
|
|
try (ZipInputStream zis = new ZipInputStream(inputStream)) {
|
|
ZipEntry entry;
|
|
while ((entry = zis.getNextEntry()) != null) {
|
|
Path out = targetDir.resolve(entry.getName()).normalize();
|
|
if (!out.startsWith(targetDir)) {
|
|
throw new IOException("Zip entry outside target dir: " + entry.getName());
|
|
}
|
|
if (entry.isDirectory()) {
|
|
Files.createDirectories(out);
|
|
} else {
|
|
Files.createDirectories(out.getParent());
|
|
Files.copy(zis, out);
|
|
}
|
|
zis.closeEntry();
|
|
}
|
|
}
|
|
}
|
|
|
|
@SpringBootConfiguration
|
|
@ImportAutoConfiguration({
|
|
DataSourceAutoConfiguration.class,
|
|
HibernateJpaAutoConfiguration.class,
|
|
TransactionAutoConfiguration.class,
|
|
JdbcTemplateAutoConfiguration.class
|
|
})
|
|
@EnableConfigurationProperties(TedProcessorProperties.class)
|
|
@EntityScan(basePackages = {
|
|
"at.procon.dip.domain.document.entity",
|
|
"at.procon.dip.domain.tenant.entity"
|
|
})
|
|
@EnableJpaRepositories(basePackages = {
|
|
"at.procon.dip.domain.document.repository",
|
|
"at.procon.dip.domain.tenant.repository"
|
|
})
|
|
@Import({
|
|
DocumentIngestionGateway.class,
|
|
GenericDocumentImportService.class,
|
|
MailDocumentIngestionAdapter.class,
|
|
MailMessageExtractionService.class,
|
|
ZipExtractionService.class,
|
|
DocumentService.class,
|
|
DocumentSourceService.class,
|
|
DocumentContentService.class,
|
|
DocumentRepresentationService.class,
|
|
DocumentEmbeddingService.class,
|
|
DocumentRelationService.class,
|
|
DocumentClassificationService.class,
|
|
BasicMimeAndExtensionDocumentTypeDetector.class,
|
|
DocumentExtractionService.class,
|
|
PlainTextDocumentExtractor.class,
|
|
HtmlDocumentExtractor.class,
|
|
PdfDocumentExtractor.class,
|
|
BinaryPassThroughDocumentExtractor.class,
|
|
MimeMessageDocumentExtractor.class,
|
|
SpreadsheetDocumentExtractor.class,
|
|
TextRepresentationBuildService.class,
|
|
DefaultGenericTextRepresentationBuilder.class,
|
|
PdfExtractionService.class,
|
|
DocumentExtractionService.class,
|
|
GenericDocumentImportService.class,
|
|
StructuredDocumentProcessingService.class,
|
|
|
|
})
|
|
static class TestApplication {
|
|
}
|
|
}
|