You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

377 lines
17 KiB
Java

package at.procon.dip.ingestion.integration;
import at.procon.dip.FixedPortPostgreSQLContainer;
import at.procon.dip.classification.detector.BasicMimeAndExtensionDocumentTypeDetector;
import at.procon.dip.classification.service.DocumentClassificationService;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentSource;
import at.procon.dip.domain.document.repository.DocumentContentRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRelationRepository;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.document.service.DocumentContentService;
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
import at.procon.dip.domain.document.service.DocumentRelationService;
import at.procon.dip.domain.document.service.DocumentRepresentationService;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.document.service.DocumentSourceService;
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
import at.procon.dip.extraction.impl.*;
import at.procon.dip.extraction.service.DocumentExtractionService;
import at.procon.dip.ingestion.adapter.MailDocumentIngestionAdapter;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.MailMessageExtractionService;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.normalization.impl.DefaultGenericTextRepresentationBuilder;
import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.service.attachment.PdfExtractionService;
import at.procon.ted.service.attachment.ZipExtractionService;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringBootConfiguration;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.boot.autoconfigure.jdbc.JdbcTemplateAutoConfiguration;
import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration;
import org.springframework.boot.autoconfigure.task.TaskSchedulingAutoConfiguration;
import org.springframework.boot.autoconfigure.transaction.TransactionAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.ServletWebServerFactoryAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Import;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
import org.springframework.test.context.DynamicPropertyRegistry;
import org.springframework.test.context.DynamicPropertySource;
import org.springframework.test.context.TestPropertySource;
import org.testcontainers.containers.PostgreSQLContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Integration test that pushes the {@code .eml} files contained in the classpath resource
 * {@code /mail-sample-eml-bundle.zip} through the real {@link DocumentIngestionGateway} and then
 * checks what was persisted (documents, sources, contents, relations, text representations).
 *
 * <p>The database is a PostgreSQL Testcontainers instance bound to the fixed host port
 * {@link #HOST_PORT}; the Spring context is assembled by the nested {@link TestApplication}.
 */
@SpringBootTest(classes = MailBundleProcessingIntegrationTest.TestApplication.class)
@Testcontainers
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@TestPropertySource(properties = {
        "spring.jpa.hibernate.ddl-auto=create-drop",
        "spring.jpa.show-sql=false",
        "spring.jpa.open-in-view=false",
        "spring.jpa.properties.hibernate.default_schema=DOC",
        "ted.vectorization.enabled=false",
        "ted.generic-ingestion.enabled=true",
        "ted.generic-ingestion.mail-adapter-enabled=true",
        "ted.generic-ingestion.file-system-enabled=false",
        "ted.generic-ingestion.rest-upload-enabled=false",
        "ted.generic-ingestion.deduplicate-by-content-hash=false",
        "ted.generic-ingestion.expand-mail-zip-attachments=true",
        "ted.generic-ingestion.default-visibility=PUBLIC",
        "ted.generic-ingestion.mail-default-visibility=RESTRICTED",
        "ted.generic-ingestion.import-batch-id=test-mail-bundle",
        "ted.generic-ingestion.mail-import-batch-id=test-mail-bundle-mail"
})
class MailBundleProcessingIntegrationTest {

    /** Host port the PostgreSQL container is bound to; must be free on the machine running the test. */
    private static final int HOST_PORT = 15433;

    @Container
    static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", HOST_PORT)
            .withDatabaseName("dip_test")
            .withUsername("test")
            .withPassword("test")
            .withInitScript("sql/create-doc-test-schemas.sql");

    // NOTE(review): the container is started eagerly here in addition to being managed by
    // @Container. Presumably this guarantees it is running before the Spring context resolves
    // the datasource properties below (relevant with the PER_CLASS lifecycle) — confirm before
    // removing this block.
    static {
        postgres.start();
    }

    /** Points the Spring datasource at the running PostgreSQL container. */
    @DynamicPropertySource
    static void registerProperties(DynamicPropertyRegistry registry) {
        registry.add("spring.datasource.url", postgres::getJdbcUrl);
        registry.add("spring.datasource.username", postgres::getUsername);
        registry.add("spring.datasource.password", postgres::getPassword);
        registry.add("spring.datasource.driver-class-name", postgres::getDriverClassName);
    }

    @Autowired
    private DocumentIngestionGateway gateway;
    @Autowired
    private MailMessageExtractionService mailMessageExtractionService;
    @Autowired
    private DocumentRepository documentRepository;
    @Autowired
    private DocumentSourceRepository documentSourceRepository;
    @Autowired
    private DocumentContentRepository documentContentRepository;
    @Autowired
    private DocumentRelationRepository documentRelationRepository;
    @Autowired
    private DocumentTextRepresentationRepository documentTextRepresentationRepository;
    @Autowired
    private DocumentEmbeddingRepository documentEmbeddingRepository;
    @Autowired
    private DocumentEmbeddingModelRepository documentEmbeddingModelRepository;
    @Autowired
    private DocumentTenantRepository documentTenantRepository;

    /** Temporary directory the sample bundle is unpacked into; recreated for every test. */
    private Path bundleDirectory;

    /**
     * Empties the database and unpacks the sample mail bundle into a fresh temporary directory.
     *
     * @throws Exception if the temporary directory cannot be created or the bundle cannot be unpacked
     */
    @BeforeEach
    void setUp() throws Exception {
        System.out.println("TEST setUp start");
        cleanupDatabase();
        bundleDirectory = Files.createTempDirectory("mail-bundle-");
        try (InputStream in = getClass().getResourceAsStream("/mail-sample-eml-bundle.zip")) {
            assertThat(in).isNotNull();
            unzip(in, bundleDirectory);
        }
    }

    /**
     * Deletes the temporary bundle directory.
     *
     * <p>The database is not cleaned here; {@link #setUp()} wipes it before each test, so the rows
     * written by the last test stay in place after it finishes.
     *
     * @throws Exception if the directory tree cannot be walked
     */
    @AfterEach
    void tearDown() throws Exception {
        if (bundleDirectory == null || !Files.exists(bundleDirectory)) {
            return;
        }
        // Files.walk keeps directory handles open until the stream is closed.
        try (var paths = Files.walk(bundleDirectory)) {
            // Reverse order visits children before their parent directory.
            paths.sorted(Comparator.reverseOrder()).forEach(path -> {
                try {
                    Files.deleteIfExists(path);
                } catch (IOException ignored) {
                    // Best-effort cleanup: a leftover temp file must not fail the test.
                }
            });
        }
    }

    /**
     * Ingests every {@code .eml} file of the bundle and verifies the persisted totals: one root
     * document per mail, one document and one relation per parsed attachment, one source per
     * document, plus content checks for PDF and spreadsheet attachments.
     */
    @org.junit.jupiter.api.Timeout(120)
    @Test
    void processesEntireMailBundleThroughRealGatewayAndPersistsResults() throws Exception {
        List<Path> emlFiles = findEmlFiles();
        assertThat(emlFiles).hasSizeGreaterThanOrEqualTo(5);

        int expectedRootDocuments = 0;
        int expectedAttachmentDocuments = 0;
        for (Path eml : emlFiles) {
            byte[] raw = Files.readAllBytes(eml);
            // Parse independently of the gateway to derive the expected attachment count.
            var parsed = mailMessageExtractionService.parse(raw);
            expectedRootDocuments++;
            expectedAttachmentDocuments += parsed.attachments().size();

            IngestionResult result = ingestMail(eml.getFileName().toString(), eml, raw);
            assertThat(result.documents()).isNotEmpty();
            assertThat(result.documents().get(0).documentType()).isEqualTo(DocumentType.MIME_MESSAGE);
        }

        long totalDocuments = documentRepository.count();
        long totalSources = documentSourceRepository.count();
        long totalRelations = documentRelationRepository.count();
        assertThat(totalDocuments).isEqualTo(expectedRootDocuments + expectedAttachmentDocuments);
        assertThat(totalSources).isEqualTo(totalDocuments);
        assertThat(totalRelations).isEqualTo(expectedAttachmentDocuments);

        List<Document> allDocuments = documentRepository.findAll();
        long rootCount = allDocuments.stream()
                .filter(d -> d.getDocumentType() == DocumentType.MIME_MESSAGE)
                .count();
        assertThat(rootCount).isEqualTo(expectedRootDocuments);

        List<DocumentSource> allSources = documentSourceRepository.findAll();
        List<DocumentContent> allContent = documentContentRepository.findAll();

        // Every mail must have its original MIME payload stored.
        long mimeOriginalCount = allContent.stream()
                .filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
                .filter(c -> c.getMimeType() != null && c.getMimeType().startsWith("message/rfc822"))
                .count();
        assertThat(mimeOriginalCount).isEqualTo(expectedRootDocuments);
        assertThat(documentTextRepresentationRepository.count()).isGreaterThanOrEqualTo(expectedRootDocuments);

        // At least one PDF attachment must have non-blank normalized text.
        List<UUID> pdfDocumentIds = allSources.stream()
                .filter(s -> hasExtension(s, ".pdf"))
                .map(s -> s.getDocument().getId())
                .toList();
        assertThat(pdfDocumentIds).isNotEmpty();
        assertThat(allContent.stream()
                .filter(c -> pdfDocumentIds.contains(c.getDocument().getId()))
                .anyMatch(c -> c.getContentRole() == ContentRole.NORMALIZED_TEXT
                        && c.getTextContent() != null
                        && !c.getTextContent().isBlank()))
                .isTrue();

        // Spreadsheets are optional in the bundle; when present, their original must be stored.
        List<UUID> spreadsheetIds = allSources.stream()
                .filter(s -> hasExtension(s, ".xlsx", ".xls"))
                .map(s -> s.getDocument().getId())
                .toList();
        if (!spreadsheetIds.isEmpty()) {
            assertThat(allContent.stream()
                    .filter(c -> spreadsheetIds.contains(c.getDocument().getId()))
                    .filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
                    .anyMatch(c -> c.getStorageType() == StorageType.DB_BINARY
                            || c.getStorageType() == StorageType.EXTERNAL_REFERENCE))
                    .isTrue();
        }
    }

    /**
     * Ingests the single fixture {@code sample-mail-01-basic-reporting.eml} and verifies that the
     * mail plus each parsed attachment became a document, that one relation exists per
     * attachment, and that PDF and CSV sources were recorded.
     */
    @Test
    void processesSingleFilesystemMailAndPersistsAttachmentsAndRelations() throws Exception {
        // An earlier revision targeted "sample-mail-02-office-and-text.eml" instead.
        Path sample = findBundleFile("sample-mail-01-basic-reporting.eml");
        byte[] raw = Files.readAllBytes(sample);
        var parsed = mailMessageExtractionService.parse(raw);

        // NOTE(review): the identifier still says "02" although fixture 01 is ingested — confirm
        // whether that is intended before renaming it.
        IngestionResult result = ingestMail("filesystem-sample-02", sample, raw);

        int attachmentCount = parsed.attachments().size();
        assertThat(result.documents()).hasSize(1 + attachmentCount);
        assertThat(documentRepository.count()).isEqualTo(1 + attachmentCount);
        assertThat(documentRelationRepository.count()).isEqualTo(attachmentCount);

        String sampleFileName = sample.getFileName().toString();
        List<DocumentSource> sources = documentSourceRepository.findAll();
        assertThat(sources).anyMatch(s -> sampleFileName.equals(s.getSourceFilename()));
        assertThat(sources).anyMatch(s -> hasExtension(s, ".pdf"));
        assertThat(sources).anyMatch(s -> hasExtension(s, ".csv"));
    }

    /**
     * Submits one raw mail to the gateway with the descriptor values shared by all tests.
     *
     * @param sourceIdentifier value passed as the descriptor's third constructor argument
     *                         (presumably the source identifier — confirm against SourceDescriptor)
     * @param emlFile          path of the mail file; its string form and file name are passed on
     * @param raw              raw bytes of the mail
     * @return the gateway's ingestion result
     */
    private IngestionResult ingestMail(String sourceIdentifier, Path emlFile, byte[] raw) throws Exception {
        String fileName = emlFile.getFileName().toString();
        return gateway.ingest(new SourceDescriptor(
                null,
                at.procon.dip.domain.document.SourceType.MAIL,
                sourceIdentifier,
                emlFile.toString(),
                fileName,
                "message/rfc822",
                raw,
                null,
                OffsetDateTime.now(),
                OriginalContentStoragePolicy.STORE,
                Map.of("title", fileName)
        ));
    }

    /** Returns all {@code .eml} files below the bundle directory in sorted order. */
    private List<Path> findEmlFiles() throws IOException {
        try (var paths = Files.walk(bundleDirectory)) {
            return paths
                    .filter(path -> path.getFileName().toString().endsWith(".eml"))
                    .sorted()
                    .toList();
        }
    }

    /**
     * Returns the first path below the bundle directory whose file name equals {@code fileName}.
     *
     * @throws java.util.NoSuchElementException if the bundle contains no such file
     */
    private Path findBundleFile(String fileName) throws IOException {
        try (var paths = Files.walk(bundleDirectory)) {
            return paths
                    .filter(path -> path.getFileName().toString().equals(fileName))
                    .findFirst()
                    .orElseThrow(() -> new java.util.NoSuchElementException(
                            "Fixture not found in bundle: " + fileName));
        }
    }

    /**
     * Tells whether the source's file name ends, ignoring case, with one of the given extensions.
     * A source without a file name never matches.
     */
    private static boolean hasExtension(DocumentSource source, String... extensions) {
        String fileName = source.getSourceFilename();
        if (fileName == null) {
            return false;
        }
        // Locale.ROOT keeps the comparison independent of the JVM's default locale.
        String lowerCased = fileName.toLowerCase(java.util.Locale.ROOT);
        for (String extension : extensions) {
            if (lowerCased.endsWith(extension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Deletes all rows through the repositories, printing a progress line before each step.
     *
     * <p>NOTE(review): the order looks like dependent rows are removed before the rows they
     * reference — keep it unless the foreign-key constraints have been checked.
     */
    private void cleanupDatabase() {
        System.out.println("cleanup: relations");
        documentRelationRepository.deleteAll();
        System.out.println("cleanup: embeddings");
        documentEmbeddingRepository.deleteAll();
        System.out.println("cleanup: text reps");
        documentTextRepresentationRepository.deleteAll();
        System.out.println("cleanup: content");
        documentContentRepository.deleteAll();
        System.out.println("cleanup: sources");
        documentSourceRepository.deleteAll();
        System.out.println("cleanup: documents");
        documentRepository.deleteAll();
        System.out.println("cleanup: models");
        documentEmbeddingModelRepository.deleteAll();
        System.out.println("cleanup: tenants");
        documentTenantRepository.deleteAll();
    }

    /**
     * Extracts a ZIP stream into {@code targetDir}, creating directories as needed.
     *
     * @param inputStream ZIP data; wrapped in a {@link ZipInputStream} that is closed on return
     * @param targetDir   directory to extract into
     * @throws IOException if an entry would resolve outside {@code targetDir} or on any I/O failure
     */
    private static void unzip(InputStream inputStream, Path targetDir) throws IOException {
        try (ZipInputStream zis = new ZipInputStream(inputStream)) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                Path out = targetDir.resolve(entry.getName()).normalize();
                // Reject entries such as "../x" that would escape the target directory.
                if (!out.startsWith(targetDir)) {
                    throw new IOException("Zip entry outside target dir: " + entry.getName());
                }
                if (entry.isDirectory()) {
                    Files.createDirectories(out);
                } else {
                    Files.createDirectories(out.getParent());
                    Files.copy(zis, out);
                }
                zis.closeEntry();
            }
        }
    }

    /**
     * Minimal Spring Boot configuration for this test: datasource/JPA/transaction/JDBC
     * auto-configuration, the document and tenant entities and repositories, and the explicitly
     * imported ingestion, extraction and normalization beans.
     */
    @SpringBootConfiguration
    @ImportAutoConfiguration({
            DataSourceAutoConfiguration.class,
            HibernateJpaAutoConfiguration.class,
            TransactionAutoConfiguration.class,
            JdbcTemplateAutoConfiguration.class
    })
    @EnableConfigurationProperties(TedProcessorProperties.class)
    @EntityScan(basePackages = {
            "at.procon.dip.domain.document.entity",
            "at.procon.dip.domain.tenant.entity"
    })
    @EnableJpaRepositories(basePackages = {
            "at.procon.dip.domain.document.repository",
            "at.procon.dip.domain.tenant.repository"
    })
    @Import({
            DocumentIngestionGateway.class,
            GenericDocumentImportService.class,
            MailDocumentIngestionAdapter.class,
            MailMessageExtractionService.class,
            ZipExtractionService.class,
            DocumentService.class,
            DocumentSourceService.class,
            DocumentContentService.class,
            DocumentRepresentationService.class,
            DocumentEmbeddingService.class,
            DocumentRelationService.class,
            DocumentClassificationService.class,
            BasicMimeAndExtensionDocumentTypeDetector.class,
            DocumentExtractionService.class,
            PlainTextDocumentExtractor.class,
            HtmlDocumentExtractor.class,
            PdfDocumentExtractor.class,
            BinaryPassThroughDocumentExtractor.class,
            MimeMessageDocumentExtractor.class,
            SpreadsheetDocumentExtractor.class,
            TextRepresentationBuildService.class,
            DefaultGenericTextRepresentationBuilder.class,
            PdfExtractionService.class,
            StructuredDocumentProcessingService.class
    })
    static class TestApplication {
    }
}