package at.procon.dip.ingestion.integration;

import at.procon.dip.FixedPortPostgreSQLContainer;
import at.procon.dip.classification.detector.BasicMimeAndExtensionDocumentTypeDetector;
import at.procon.dip.classification.service.DocumentClassificationService;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentSource;
import at.procon.dip.domain.document.repository.DocumentContentRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRelationRepository;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.document.service.DocumentContentService;
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
import at.procon.dip.domain.document.service.DocumentRelationService;
import at.procon.dip.domain.document.service.DocumentRepresentationService;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.document.service.DocumentSourceService;
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
import at.procon.dip.extraction.impl.*;
import at.procon.dip.extraction.service.DocumentExtractionService;
import at.procon.dip.ingestion.adapter.MailDocumentIngestionAdapter;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.MailMessageExtractionService;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.normalization.impl.DefaultGenericTextRepresentationBuilder;
import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.service.attachment.PdfExtractionService;
import at.procon.ted.service.attachment.ZipExtractionService;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringBootConfiguration;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.boot.autoconfigure.jdbc.JdbcTemplateAutoConfiguration;
import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration;
import org.springframework.boot.autoconfigure.task.TaskSchedulingAutoConfiguration;
import org.springframework.boot.autoconfigure.transaction.TransactionAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.ServletWebServerFactoryAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Import;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
import org.springframework.test.context.DynamicPropertyRegistry;
import org.springframework.test.context.DynamicPropertySource;
import org.springframework.test.context.TestPropertySource;
import org.testcontainers.containers.PostgreSQLContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import static org.assertj.core.api.Assertions.assertThat;

/**
 * Integration test that feeds sample {@code .eml} files through the real
 * {@link DocumentIngestionGateway} and verifies what ends up in the document repositories.
 *
 * <p>The sample mails are read from the classpath resource {@code /mail-sample-eml-bundle.zip},
 * which is unpacked into a fresh temporary directory before every test. Persistence runs against
 * a PostgreSQL Testcontainers instance; the Spring context is assembled by the nested
 * {@link TestApplication} configuration.
 */
@SpringBootTest(classes = MailBundleProcessingIntegrationTest.TestApplication.class)
@Testcontainers
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@TestPropertySource(properties = {
        "spring.jpa.hibernate.ddl-auto=create-drop",
        "spring.jpa.show-sql=false",
        "spring.jpa.open-in-view=false",
        "spring.jpa.properties.hibernate.default_schema=DOC",
        "ted.vectorization.enabled=false",
        "ted.generic-ingestion.enabled=true",
        "ted.generic-ingestion.mail-adapter-enabled=true",
        "ted.generic-ingestion.file-system-enabled=false",
        "ted.generic-ingestion.rest-upload-enabled=false",
        "ted.generic-ingestion.deduplicate-by-content-hash=false",
        "ted.generic-ingestion.expand-mail-zip-attachments=true",
        "ted.generic-ingestion.default-visibility=PUBLIC",
        "ted.generic-ingestion.mail-default-visibility=RESTRICTED",
        "ted.generic-ingestion.import-batch-id=test-mail-bundle",
        "ted.generic-ingestion.mail-import-batch-id=test-mail-bundle-mail"
})
class MailBundleProcessingIntegrationTest {

    /** Host port handed to the fixed-port PostgreSQL container. */
    private static final int HOST_PORT = 15433;

    /** MIME type used for every ingested mail and checked on the stored original content. */
    private static final String MAIL_MIME_TYPE = "message/rfc822";

    /** Classpath location of the zipped sample mails. */
    private static final String BUNDLE_RESOURCE = "/mail-sample-eml-bundle.zip";

    @Container
    static final PostgreSQLContainer<?> postgres =
            new FixedPortPostgreSQLContainer<>("postgres:16-alpine", HOST_PORT)
                    .withDatabaseName("dip_test")
                    .withUsername("test")
                    .withPassword("test")
                    .withInitScript("sql/create-doc-test-schemas.sql");

    static {
        // Started eagerly in addition to the @Container management.
        // NOTE(review): presumably required because, with PER_CLASS lifecycle, the Spring context
        // (and thus the datasource properties below) may be resolved before the Testcontainers
        // extension starts the container - confirm before removing.
        postgres.start();
    }

    /**
     * Points the Spring datasource at the running PostgreSQL container.
     *
     * @param registry registry receiving the {@code spring.datasource.*} properties
     */
    @DynamicPropertySource
    static void registerProperties(DynamicPropertyRegistry registry) {
        registry.add("spring.datasource.url", postgres::getJdbcUrl);
        registry.add("spring.datasource.username", postgres::getUsername);
        registry.add("spring.datasource.password", postgres::getPassword);
        registry.add("spring.datasource.driver-class-name", postgres::getDriverClassName);
    }

    @Autowired
    private DocumentIngestionGateway gateway;

    @Autowired
    private MailMessageExtractionService mailMessageExtractionService;

    @Autowired
    private DocumentRepository documentRepository;

    @Autowired
    private DocumentSourceRepository documentSourceRepository;

    @Autowired
    private DocumentContentRepository documentContentRepository;

    @Autowired
    private DocumentRelationRepository documentRelationRepository;

    @Autowired
    private DocumentTextRepresentationRepository documentTextRepresentationRepository;

    @Autowired
    private DocumentEmbeddingRepository documentEmbeddingRepository;

    @Autowired
    private DocumentEmbeddingModelRepository documentEmbeddingModelRepository;

    @Autowired
    private DocumentTenantRepository documentTenantRepository;

    /** Temporary directory holding the unpacked sample bundle for the current test. */
    private Path bundleDirectory;

    /**
     * Empties all repositories and unpacks the sample bundle into a new temporary directory.
     *
     * @throws Exception if the temporary directory cannot be created or the bundle cannot be read
     */
    @BeforeEach
    void setUp() throws Exception {
        System.out.println("TEST setUp start");
        cleanupDatabase();
        bundleDirectory = Files.createTempDirectory("mail-bundle-");
        try (InputStream in = getClass().getResourceAsStream(BUNDLE_RESOURCE)) {
            assertThat(in).isNotNull();
            unzip(in, bundleDirectory);
        }
    }

    /**
     * Removes the temporary bundle directory. The database is intentionally left untouched here;
     * it is cleaned at the start of {@link #setUp()} instead.
     *
     * @throws Exception if the directory tree cannot be traversed
     */
    @AfterEach
    void tearDown() throws Exception {
        if (bundleDirectory == null || !Files.exists(bundleDirectory)) {
            return;
        }
        // Files.walk keeps directory handles open until the stream is closed.
        try (var paths = Files.walk(bundleDirectory)) {
            // Reverse order visits children before their parent directories.
            paths.sorted(Comparator.reverseOrder()).forEach(path -> {
                try {
                    Files.deleteIfExists(path);
                } catch (IOException ignored) {
                    // Best-effort cleanup of a temp directory; a leftover file must not fail the test.
                }
            });
        }
    }

    /**
     * Ingests every {@code .eml} file of the bundle and checks the persisted totals: one document
     * per mail plus one per parsed attachment, one source per document, one relation per
     * attachment, stored original mail content, text representations, extracted PDF text and
     * (when present) stored spreadsheet originals.
     *
     * @throws Exception if a sample file cannot be read
     */
    @org.junit.jupiter.api.Timeout(120)
    @Test
    void processesEntireMailBundleThroughRealGatewayAndPersistsResults() throws Exception {
        List<Path> emlFiles = findEmlFiles();
        assertThat(emlFiles).hasSizeGreaterThanOrEqualTo(5);

        int expectedRootDocuments = 0;
        int expectedAttachmentDocuments = 0;
        for (Path eml : emlFiles) {
            byte[] raw = Files.readAllBytes(eml);
            var parsed = mailMessageExtractionService.parse(raw);
            expectedRootDocuments++;
            expectedAttachmentDocuments += parsed.attachments().size();

            IngestionResult result = gateway.ingest(mailDescriptor(eml.getFileName().toString(), eml, raw));

            assertThat(result.documents()).isNotEmpty();
            assertThat(result.documents().get(0).documentType()).isEqualTo(DocumentType.MIME_MESSAGE);
        }

        long totalDocuments = documentRepository.count();
        long totalSources = documentSourceRepository.count();
        long totalRelations = documentRelationRepository.count();
        assertThat(totalDocuments).isEqualTo(expectedRootDocuments + expectedAttachmentDocuments);
        assertThat(totalSources).isEqualTo(totalDocuments);
        assertThat(totalRelations).isEqualTo(expectedAttachmentDocuments);

        List<Document> allDocuments = documentRepository.findAll();
        long rootCount = allDocuments.stream()
                .filter(d -> d.getDocumentType() == DocumentType.MIME_MESSAGE)
                .count();
        assertThat(rootCount).isEqualTo(expectedRootDocuments);

        List<DocumentSource> allSources = documentSourceRepository.findAll();
        List<DocumentContent> allContent = documentContentRepository.findAll();

        long mimeOriginalCount = allContent.stream()
                .filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
                .filter(c -> c.getMimeType() != null && c.getMimeType().startsWith(MAIL_MIME_TYPE))
                .count();
        assertThat(mimeOriginalCount).isEqualTo(expectedRootDocuments);
        assertThat(documentTextRepresentationRepository.count())
                .isGreaterThanOrEqualTo(expectedRootDocuments);

        // At least one PDF attachment must have produced non-blank normalized text.
        List<UUID> pdfDocumentIds = allSources.stream()
                .filter(s -> hasExtension(s, ".pdf"))
                .map(s -> s.getDocument().getId())
                .toList();
        assertThat(pdfDocumentIds).isNotEmpty();
        assertThat(allContent.stream()
                .filter(c -> pdfDocumentIds.contains(c.getDocument().getId()))
                .anyMatch(c -> c.getContentRole() == ContentRole.NORMALIZED_TEXT
                        && c.getTextContent() != null
                        && !c.getTextContent().isBlank()))
                .isTrue();

        // Spreadsheets are optional in the bundle; when present their original must be stored.
        List<UUID> spreadsheetIds = allSources.stream()
                .filter(s -> hasExtension(s, ".xlsx") || hasExtension(s, ".xls"))
                .map(s -> s.getDocument().getId())
                .toList();
        if (!spreadsheetIds.isEmpty()) {
            assertThat(allContent.stream()
                    .filter(c -> spreadsheetIds.contains(c.getDocument().getId()))
                    .filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
                    .anyMatch(c -> c.getStorageType() == StorageType.DB_BINARY
                            || c.getStorageType() == StorageType.EXTERNAL_REFERENCE))
                    .isTrue();
        }
    }

    /**
     * Ingests the single sample {@code sample-mail-01-basic-reporting.eml} and checks that the
     * mail, its attachments and the mail-to-attachment relations are persisted, including a
     * {@code .pdf} and a {@code .csv} source.
     *
     * @throws Exception if the sample file cannot be located or read
     */
    @Test
    void processesSingleFilesystemMailAndPersistsAttachmentsAndRelations() throws Exception {
        // "sample-mail-02-office-and-text.eml" was previously used here as an alternative sample.
        Path sample = findFileByName("sample-mail-01-basic-reporting.eml");
        byte[] raw = Files.readAllBytes(sample);
        var parsed = mailMessageExtractionService.parse(raw);

        IngestionResult result = gateway.ingest(mailDescriptor("filesystem-sample-02", sample, raw));

        int attachmentCount = parsed.attachments().size();
        assertThat(result.documents()).hasSize(1 + attachmentCount);
        assertThat(documentRepository.count()).isEqualTo(1 + attachmentCount);
        assertThat(documentRelationRepository.count()).isEqualTo(attachmentCount);

        List<DocumentSource> sources = documentSourceRepository.findAll();
        assertThat(sources).anyMatch(s -> sample.getFileName().toString().equals(s.getSourceFilename()));
        assertThat(sources).anyMatch(s -> hasExtension(s, ".pdf"));
        assertThat(sources).anyMatch(s -> hasExtension(s, ".csv"));
    }

    /**
     * Builds the descriptor used to ingest one mail file.
     *
     * @param identifier value passed as the descriptor's third constructor argument (the bundle
     *                   test passes the file name, the single-mail test a fixed label)
     * @param file       path of the {@code .eml} file; supplies the path, file name and title
     * @param raw        raw bytes of the mail
     * @return a descriptor of source type {@code MAIL} with storage policy {@code STORE}
     */
    private static SourceDescriptor mailDescriptor(String identifier, Path file, byte[] raw) {
        String fileName = file.getFileName().toString();
        return new SourceDescriptor(
                null,
                at.procon.dip.domain.document.SourceType.MAIL,
                identifier,
                file.toString(),
                fileName,
                MAIL_MIME_TYPE,
                raw,
                null,
                OffsetDateTime.now(),
                OriginalContentStoragePolicy.STORE,
                Map.of("title", fileName)
        );
    }

    /**
     * Tells whether a source's file name ends with the given extension, ignoring case.
     *
     * @param source    source whose file name is inspected; a {@code null} file name never matches
     * @param extension lower-case extension including the dot, e.g. {@code ".pdf"}
     * @return {@code true} if the lower-cased file name ends with {@code extension}
     */
    private static boolean hasExtension(DocumentSource source, String extension) {
        String filename = source.getSourceFilename();
        return filename != null && filename.toLowerCase().endsWith(extension);
    }

    /**
     * Lists all {@code .eml} files below the bundle directory in sorted path order.
     *
     * @return sorted, unmodifiable list of mail files
     * @throws IOException if the directory tree cannot be traversed
     */
    private List<Path> findEmlFiles() throws IOException {
        try (var paths = Files.walk(bundleDirectory)) {
            return paths
                    .filter(path -> path.getFileName().toString().endsWith(".eml"))
                    .sorted()
                    .toList();
        }
    }

    /**
     * Finds the first entry below the bundle directory whose file name equals {@code fileName}.
     *
     * @param fileName exact file name to look for
     * @return path of the first match
     * @throws IOException                      if the directory tree cannot be traversed
     * @throws java.util.NoSuchElementException if no entry has that name
     */
    private Path findFileByName(String fileName) throws IOException {
        try (var paths = Files.walk(bundleDirectory)) {
            return paths
                    .filter(path -> path.getFileName().toString().equals(fileName))
                    .findFirst()
                    .orElseThrow();
        }
    }

    /**
     * Deletes the content of every repository used by the tests.
     *
     * <p>The order (relations, embeddings, text representations, content, sources, documents,
     * embedding models, tenants) is kept as is - presumably dependents are removed before the
     * rows they reference; verify against the schema before reordering.
     */
    private void cleanupDatabase() {
        System.out.println("cleanup: relations");
        documentRelationRepository.deleteAll();
        System.out.println("cleanup: embeddings");
        documentEmbeddingRepository.deleteAll();
        System.out.println("cleanup: text reps");
        documentTextRepresentationRepository.deleteAll();
        System.out.println("cleanup: content");
        documentContentRepository.deleteAll();
        System.out.println("cleanup: sources");
        documentSourceRepository.deleteAll();
        System.out.println("cleanup: documents");
        documentRepository.deleteAll();
        System.out.println("cleanup: models");
        documentEmbeddingModelRepository.deleteAll();
        System.out.println("cleanup: tenants");
        documentTenantRepository.deleteAll();
    }

    /**
     * Extracts a zip stream into {@code targetDir}, rejecting entries that would land outside it.
     *
     * @param inputStream zip data; consumed and closed by this method
     * @param targetDir   directory receiving the extracted entries
     * @throws IOException if an entry resolves outside {@code targetDir} or on any I/O failure
     */
    private static void unzip(InputStream inputStream, Path targetDir) throws IOException {
        // Compare like with like: the resolved entry path is normalized, so the root must be too.
        Path root = targetDir.toAbsolutePath().normalize();
        try (ZipInputStream zis = new ZipInputStream(inputStream)) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                Path out = root.resolve(entry.getName()).normalize();
                if (!out.startsWith(root)) {
                    throw new IOException("Zip entry outside target dir: " + entry.getName());
                }
                if (entry.isDirectory()) {
                    Files.createDirectories(out);
                } else {
                    Files.createDirectories(out.getParent());
                    Files.copy(zis, out);
                }
                zis.closeEntry();
            }
        }
    }

    /**
     * Minimal Spring configuration for this test: datasource/JPA/transaction/JDBC
     * auto-configuration, the document and tenant entities and repositories, and the explicitly
     * imported ingestion, extraction, classification and normalization beans.
     */
    @SpringBootConfiguration
    @ImportAutoConfiguration({
            DataSourceAutoConfiguration.class,
            HibernateJpaAutoConfiguration.class,
            TransactionAutoConfiguration.class,
            JdbcTemplateAutoConfiguration.class
    })
    @EnableConfigurationProperties(TedProcessorProperties.class)
    @EntityScan(basePackages = {
            "at.procon.dip.domain.document.entity",
            "at.procon.dip.domain.tenant.entity"
    })
    @EnableJpaRepositories(basePackages = {
            "at.procon.dip.domain.document.repository",
            "at.procon.dip.domain.tenant.repository"
    })
    @Import({
            DocumentIngestionGateway.class,
            GenericDocumentImportService.class,
            MailDocumentIngestionAdapter.class,
            MailMessageExtractionService.class,
            ZipExtractionService.class,
            DocumentService.class,
            DocumentSourceService.class,
            DocumentContentService.class,
            DocumentRepresentationService.class,
            DocumentEmbeddingService.class,
            DocumentRelationService.class,
            DocumentClassificationService.class,
            BasicMimeAndExtensionDocumentTypeDetector.class,
            DocumentExtractionService.class,
            PlainTextDocumentExtractor.class,
            HtmlDocumentExtractor.class,
            PdfDocumentExtractor.class,
            BinaryPassThroughDocumentExtractor.class,
            MimeMessageDocumentExtractor.class,
            SpreadsheetDocumentExtractor.class,
            TextRepresentationBuildService.class,
            DefaultGenericTextRepresentationBuilder.class,
            PdfExtractionService.class,
            StructuredDocumentProcessingService.class
    })
    static class TestApplication {
    }
}