Refactor phase 4.2 - email adapter + tests

master
trifonovt 1 month ago
parent f3fcdfab11
commit 90093ab98d

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<!-- <!--
TED Procurement Document Processor Document Intelligence Platform
Author: Martin.Schweitzer@procon.co.at and claude.ai Author: Martin.Schweitzer@procon.co.at and claude.ai
Spring Boot application for processing EU eForms public procurement notices. Spring Boot application for processing EU eForms public procurement notices.
@ -25,7 +25,7 @@
<groupId>at.procon.dip</groupId> <groupId>at.procon.dip</groupId>
<artifactId>document-intelligence-platform</artifactId> <artifactId>document-intelligence-platform</artifactId>
<version>1.0.0-SNAPSHOT</version> <version>1.0.0-SNAPSHOT</version>
<name>Procon Document Intelligence Platform</name> <name>Document Intelligence Platform</name>
<description>Generic document ingestion, normalization, and semantic search platform with TED support</description> <description>Generic document ingestion, normalization, and semantic search platform with TED support</description>
<properties> <properties>
@ -232,6 +232,12 @@
<groupId>org.flywaydb</groupId> <groupId>org.flywaydb</groupId>
<artifactId>flyway-database-postgresql</artifactId> <artifactId>flyway-database-postgresql</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.testcontainers</groupId>
<artifactId>junit-jupiter</artifactId>
<version>1.21.4</version>
<scope>test</scope>
</dependency>
</dependencies> </dependencies>
<build> <build>

@ -25,7 +25,19 @@ public class BasicMimeAndExtensionDocumentTypeDetector implements DocumentTypeDe
@Override @Override
public DetectionResult detect(SourceDescriptor sourceDescriptor) { public DetectionResult detect(SourceDescriptor sourceDescriptor) {
String normalizedMediaType = DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType()); String normalizedMediaType = DocumentImportSupport.inferKnownMimeType(sourceDescriptor.fileName(), sourceDescriptor.mediaType());
String extension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName());
if (sourceDescriptor.attributes() != null && StringUtils.hasText(sourceDescriptor.attributes().get("documentTypeHint"))) {
try {
DocumentType hintedType = DocumentType.valueOf(sourceDescriptor.attributes().get("documentTypeHint").trim());
return new DetectionResult(hintedType, DocumentImportSupport.familyFor(hintedType),
normalizedMediaType,
sourceDescriptor.attributes().get("languageCode"),
Map.of("documentTypeHint", hintedType.name()));
} catch (IllegalArgumentException ignored) {
// fall back to normal detection
}
}
if (sourceDescriptor.sourceType() == at.procon.dip.domain.document.SourceType.TED_PACKAGE) { if (sourceDescriptor.sourceType() == at.procon.dip.domain.document.SourceType.TED_PACKAGE) {
Map<String, String> attributes = new HashMap<>(); Map<String, String> attributes = new HashMap<>();
@ -37,10 +49,7 @@ public class BasicMimeAndExtensionDocumentTypeDetector implements DocumentTypeDe
normalizedMediaType != null ? normalizedMediaType : "application/gzip", null, attributes); normalizedMediaType != null ? normalizedMediaType : "application/gzip", null, attributes);
} }
DocumentType hintedType = detectByHint(sourceDescriptor); DocumentType documentType = detectByMediaType(normalizedMediaType);
String extension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName());
DocumentType documentType = hintedType != null ? hintedType : detectByMediaType(normalizedMediaType);
if (documentType == DocumentType.UNKNOWN) { if (documentType == DocumentType.UNKNOWN) {
documentType = detectByExtension(extension); documentType = detectByExtension(extension);
} }
@ -58,28 +67,10 @@ public class BasicMimeAndExtensionDocumentTypeDetector implements DocumentTypeDe
if (StringUtils.hasText(sourceDescriptor.fileName())) { if (StringUtils.hasText(sourceDescriptor.fileName())) {
attributes.put("fileName", sourceDescriptor.fileName()); attributes.put("fileName", sourceDescriptor.fileName());
} }
if (hintedType != null) {
attributes.put("documentTypeHint", hintedType.name());
}
return new DetectionResult(documentType, family, normalizedMediaType, languageCode, attributes); return new DetectionResult(documentType, family, normalizedMediaType, languageCode, attributes);
} }
private DocumentType detectByHint(SourceDescriptor sourceDescriptor) {
if (sourceDescriptor.attributes() == null) {
return null;
}
String hint = sourceDescriptor.attributes().get("documentTypeHint");
if (!StringUtils.hasText(hint)) {
return null;
}
try {
return DocumentType.valueOf(hint.trim().toUpperCase(Locale.ROOT));
} catch (IllegalArgumentException ignored) {
return null;
}
}
private DocumentType detectByMediaType(String mediaType) { private DocumentType detectByMediaType(String mediaType) {
if (!StringUtils.hasText(mediaType)) { if (!StringUtils.hasText(mediaType)) {
return DocumentType.UNKNOWN; return DocumentType.UNKNOWN;
@ -87,11 +78,24 @@ public class BasicMimeAndExtensionDocumentTypeDetector implements DocumentTypeDe
return switch (mediaType.toLowerCase(Locale.ROOT)) { return switch (mediaType.toLowerCase(Locale.ROOT)) {
case "application/pdf", "application/x-pdf" -> DocumentType.PDF; case "application/pdf", "application/x-pdf" -> DocumentType.PDF;
case "text/html", "application/xhtml+xml" -> DocumentType.HTML; case "text/html", "application/xhtml+xml" -> DocumentType.HTML;
case "text/plain" -> DocumentType.TEXT; case "text/plain", "text/csv", "text/tab-separated-values", "application/csv", "application/x-csv" -> DocumentType.TEXT;
case "text/markdown", "text/x-markdown" -> DocumentType.MARKDOWN; case "text/markdown", "text/x-markdown" -> DocumentType.MARKDOWN;
case "application/xml", "text/xml" -> DocumentType.XML_GENERIC; case "application/xml", "text/xml", "application/json", "application/yaml" -> DocumentType.XML_GENERIC;
case "message/rfc822" -> DocumentType.MIME_MESSAGE; case "message/rfc822" -> DocumentType.MIME_MESSAGE;
case "application/zip", "application/x-zip-compressed" -> DocumentType.ZIP_ARCHIVE; case "application/zip", "application/x-zip-compressed" -> DocumentType.ZIP_ARCHIVE;
case "application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-excel.sheet.macroenabled.12",
"application/vnd.ms-excel.sheet.binary.macroenabled.12",
"application/vnd.oasis.opendocument.spreadsheet",
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.oasis.opendocument.text",
"application/vnd.oasis.opendocument.presentation",
"application/octet-stream",
"binary/octet-stream" -> DocumentType.GENERIC_BINARY;
default -> mediaType.startsWith("text/") ? DocumentType.TEXT : DocumentType.UNKNOWN; default -> mediaType.startsWith("text/") ? DocumentType.TEXT : DocumentType.UNKNOWN;
}; };
} }
@ -100,12 +104,13 @@ public class BasicMimeAndExtensionDocumentTypeDetector implements DocumentTypeDe
return switch (extension) { return switch (extension) {
case "pdf" -> DocumentType.PDF; case "pdf" -> DocumentType.PDF;
case "html", "htm" -> DocumentType.HTML; case "html", "htm" -> DocumentType.HTML;
case "txt", "log", "csv", "json", "yaml", "yml" -> DocumentType.TEXT; case "txt", "log", "csv", "tsv", "json", "yaml", "yml" -> DocumentType.TEXT;
case "md", "markdown" -> DocumentType.MARKDOWN; case "md", "markdown" -> DocumentType.MARKDOWN;
case "xml", "xsd", "xslt" -> DocumentType.XML_GENERIC; case "xml", "xsd", "xslt" -> DocumentType.XML_GENERIC;
case "eml", "msg" -> DocumentType.MIME_MESSAGE; case "eml", "msg" -> DocumentType.MIME_MESSAGE;
case "zip" -> DocumentType.ZIP_ARCHIVE; case "zip" -> DocumentType.ZIP_ARCHIVE;
case "docx" -> DocumentType.DOCX; case "docx" -> DocumentType.DOCX;
case "xls", "xlsx", "xlsm", "xlsb", "ods", "doc", "ppt", "pptx", "odt", "odp" -> DocumentType.GENERIC_BINARY;
default -> DocumentType.UNKNOWN; default -> DocumentType.UNKNOWN;
}; };
} }

@ -19,7 +19,8 @@ public class BinaryPassThroughDocumentExtractor implements DocumentExtractor {
return documentType == DocumentType.DOCX return documentType == DocumentType.DOCX
|| documentType == DocumentType.ZIP_ARCHIVE || documentType == DocumentType.ZIP_ARCHIVE
|| documentType == DocumentType.GENERIC_BINARY || documentType == DocumentType.GENERIC_BINARY
|| documentType == DocumentType.MIME_MESSAGE; || documentType == DocumentType.MIME_MESSAGE
|| documentType == DocumentType.UNKNOWN;
} }
@Override @Override

@ -21,7 +21,7 @@ public class PlainTextDocumentExtractor implements DocumentExtractor {
return documentType == DocumentType.TEXT return documentType == DocumentType.TEXT
|| documentType == DocumentType.MARKDOWN || documentType == DocumentType.MARKDOWN
|| documentType == DocumentType.XML_GENERIC || documentType == DocumentType.XML_GENERIC
|| documentType == DocumentType.UNKNOWN || (documentType == DocumentType.UNKNOWN && DocumentImportSupport.isLikelyTextMime(mimeType))
|| DocumentImportSupport.isLikelyTextMime(mimeType); || DocumentImportSupport.isLikelyTextMime(mimeType);
} }

@ -0,0 +1,148 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;

/**
 * Extracts a plain-text rendering from spreadsheet-family documents.
 *
 * <p>CSV/TSV inputs are passed through as text (preferring any already-decoded
 * {@code textContent}, else decoding the binary payload as UTF-8). Excel-family
 * workbooks are parsed via Apache POI's {@link WorkbookFactory} and flattened to
 * tab-separated rows, one "# Sheet: name" heading per sheet. ODS is explicitly
 * unsupported and surfaces as a warning rather than a silent empty result.
 *
 * <p>On success the extractor emits a {@link ContentRole#NORMALIZED_TEXT} entry
 * plus one "spreadsheet-document" structured payload; all failures are reported
 * as warnings in the {@link ExtractionResult} instead of propagating exceptions.
 */
@Component
@Order(50)
public class SpreadsheetDocumentExtractor implements DocumentExtractor {

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        return DocumentImportSupport.isSpreadsheetMime(mimeType);
    }

    /**
     * Runs spreadsheet extraction for the given request.
     *
     * @param extractionRequest carries the detection result, source descriptor and payloads
     * @return a result with NORMALIZED_TEXT content on success, or warnings on failure;
     *         this method never throws — all errors are converted to warnings
     */
    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        String mimeType = DocumentImportSupport.normalizeMediaType(
                extractionRequest.detectionResult().mimeType());
        String extension = DocumentImportSupport.extensionOf(
                extractionRequest.sourceDescriptor().fileName());
        try {
            String extracted = extractSpreadsheetText(
                    mimeType,
                    extension,
                    extractionRequest.binaryContent(),
                    extractionRequest.textContent());
            if (!StringUtils.hasText(extracted)) {
                return new ExtractionResult(
                        Map.of(),
                        List.of(),
                        List.of("No spreadsheet text content extracted"));
            }
            String normalized = normalizeText(extracted);
            // fileName may be null/blank; Map.of is null-hostile, so guard the title
            // instead of letting an NPE masquerade as "extraction failed: null".
            String title = extractionRequest.sourceDescriptor().fileName();
            Map<String, Object> payloadAttributes = StringUtils.hasText(title)
                    ? Map.of("title", title)
                    : Map.of();
            return new ExtractionResult(
                    Map.of(ContentRole.NORMALIZED_TEXT, normalized),
                    List.of(new ExtractedStructuredPayload("spreadsheet-document", payloadAttributes)),
                    List.of());
        } catch (Exception e) {
            // Boundary catch: extraction is best-effort; report the failure as a warning.
            return new ExtractionResult(
                    Map.of(),
                    List.of(),
                    List.of("Spreadsheet extraction failed: " + e.getMessage()));
        }
    }

    /**
     * Chooses the extraction strategy by MIME type / extension.
     *
     * @return the raw extracted text, or {@code null} when there is nothing to extract
     * @throws IOException for ODS input (unsupported) or POI parse failures
     */
    private String extractSpreadsheetText(
            String mimeType,
            String extension,
            byte[] binaryContent,
            String textContent) throws IOException {
        // Delimiter-separated text formats need no parsing — pass through verbatim.
        if ("text/csv".equals(mimeType)
                || "text/tab-separated-values".equals(mimeType)
                || "application/csv".equals(mimeType)
                || "application/x-csv".equals(mimeType)
                || "csv".equals(extension)
                || "tsv".equals(extension)) {
            if (StringUtils.hasText(textContent)) {
                return textContent;
            }
            return binaryContent == null ? null : new String(binaryContent, StandardCharsets.UTF_8);
        }
        if (binaryContent == null || binaryContent.length == 0) {
            return null;
        }
        if ("ods".equals(extension)
                || "application/vnd.oasis.opendocument.spreadsheet".equals(mimeType)) {
            throw new IOException("ODS extraction not supported yet");
        }
        // WorkbookFactory sniffs the container format (OLE2 .xls vs OOXML .xlsx).
        try (Workbook workbook = WorkbookFactory.create(new ByteArrayInputStream(binaryContent))) {
            return extractWorkbookText(workbook);
        }
    }

    /**
     * Flattens every sheet of the workbook to text: a "# Sheet: name" heading
     * followed by one tab-joined line per non-empty row. Trailing blank cells
     * are trimmed from each row; fully blank rows are skipped.
     */
    private String extractWorkbookText(Workbook workbook) {
        StringBuilder sb = new StringBuilder();
        // DataFormatter renders cell values as they appear in Excel (dates, numbers, formulas).
        DataFormatter formatter = new DataFormatter();
        for (int s = 0; s < workbook.getNumberOfSheets(); s++) {
            Sheet sheet = workbook.getSheetAt(s);
            if (sheet == null) {
                continue;
            }
            if (!sb.isEmpty()) {
                sb.append("\n\n");
            }
            sb.append("# Sheet: ").append(sheet.getSheetName()).append("\n");
            for (Row row : sheet) {
                List<String> cells = new ArrayList<>();
                for (Cell cell : row) {
                    String value = formatter.formatCellValue(cell);
                    if (value != null) {
                        value = value.trim();
                    }
                    cells.add(value == null ? "" : value);
                }
                // Drop trailing blanks so sparse rows don't end in tab runs.
                while (!cells.isEmpty() && cells.get(cells.size() - 1).isBlank()) {
                    cells.remove(cells.size() - 1);
                }
                if (!cells.isEmpty()) {
                    sb.append(String.join("\t", cells)).append("\n");
                }
            }
        }
        return sb.toString();
    }

    /**
     * Canonicalizes line endings to {@code \n}, collapses 3+ blank lines to one
     * blank line, strips trailing whitespace before newlines, and trims the ends.
     */
    private String normalizeText(String text) {
        if (text == null) {
            return null;
        }
        return text.replace("\r\n", "\n")
                .replace('\r', '\n')
                .replaceAll("\\n{3,}", "\n\n")
                .replaceAll("[ \\t]+\\n", "\n")
                .trim();
    }
}

@ -158,11 +158,11 @@ public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter {
private String previewTextIfLikelyText(MailAttachment attachment) { private String previewTextIfLikelyText(MailAttachment attachment) {
String mime = DocumentImportSupport.normalizeMediaType(attachment.contentType()); String mime = DocumentImportSupport.normalizeMediaType(attachment.contentType());
if (DocumentImportSupport.isLikelyTextMime(mime)) {
return attachment.safeTextPreview();
}
String ext = DocumentImportSupport.extensionOf(attachment.fileName()); String ext = DocumentImportSupport.extensionOf(attachment.fileName());
if ("txt".equals(ext) || "xml".equals(ext) || "html".equals(ext) || "htm".equals(ext) || "md".equals(ext)) { if (DocumentImportSupport.isSpreadsheetMime(mime) || DocumentImportSupport.isSpreadsheetExtension(ext)) {
return null;
}
if (DocumentImportSupport.isLikelyTextMime(mime) || DocumentImportSupport.isLikelyTextExtension(ext)) {
return attachment.safeTextPreview(); return attachment.safeTextPreview();
} }
return null; return null;

@ -4,12 +4,15 @@ import at.procon.dip.classification.service.DocumentClassificationService;
import at.procon.dip.classification.spi.DetectionResult; import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.access.DocumentAccessContext; import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility; import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
import at.procon.dip.domain.document.ContentRole; import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentStatus; import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.StorageType; import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document; import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent; import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
import at.procon.dip.domain.document.entity.DocumentSource;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRepository; import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository; import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.service.DocumentContentService; import at.procon.dip.domain.document.service.DocumentContentService;
@ -30,9 +33,9 @@ import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.util.DocumentImportSupport; import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.normalization.service.TextRepresentationBuildService; import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationDraft; import at.procon.dip.normalization.spi.TextRepresentationDraft;
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.dip.processing.spi.DocumentProcessingPolicy; import at.procon.dip.processing.spi.DocumentProcessingPolicy;
import at.procon.dip.processing.spi.StructuredProcessingRequest; import at.procon.dip.processing.spi.StructuredProcessingRequest;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
@ -44,6 +47,7 @@ import java.util.LinkedHashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.UUID;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -51,7 +55,7 @@ import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils; import org.springframework.util.StringUtils;
/** /**
* Generic import pipeline that persists arbitrary document types into the DOC model. * Phase 4 generic import pipeline that persists arbitrary document types into the DOC model.
*/ */
@Service @Service
@RequiredArgsConstructor @RequiredArgsConstructor
@ -61,6 +65,7 @@ public class GenericDocumentImportService {
private final TedProcessorProperties properties; private final TedProcessorProperties properties;
private final DocumentRepository documentRepository; private final DocumentRepository documentRepository;
private final DocumentSourceRepository documentSourceRepository; private final DocumentSourceRepository documentSourceRepository;
private final DocumentEmbeddingRepository documentEmbeddingRepository;
private final DocumentService documentService; private final DocumentService documentService;
private final DocumentSourceService documentSourceService; private final DocumentSourceService documentSourceService;
private final DocumentContentService documentContentService; private final DocumentContentService documentContentService;
@ -114,11 +119,12 @@ public class GenericDocumentImportService {
: null; : null;
List<String> warnings = new ArrayList<>(); List<String> warnings = new ArrayList<>();
DocumentProcessingPolicy processingPolicy = structuredProcessingService.resolvePolicy(sourceDescriptor, detection);
ExtractionResult extractionResult = emptyExtractionResult(); ExtractionResult extractionResult = emptyExtractionResult();
Map<ContentRole, DocumentContent> persistedDerivedContent = new LinkedHashMap<>(); Map<ContentRole, DocumentContent> persistedDerivedContent = java.util.Collections.emptyMap();
if (persistOriginalContent) { if (persistOriginalContent) {
DocumentProcessingPolicy processingPolicy = structuredProcessingService.resolvePolicy(sourceDescriptor, detection);
if (processingPolicy.runGenericExtraction()) { if (processingPolicy.runGenericExtraction()) {
extractionResult = extractionService.extract(new ExtractionRequest( extractionResult = extractionService.extract(new ExtractionRequest(
sourceDescriptor, sourceDescriptor,
@ -127,16 +133,10 @@ public class GenericDocumentImportService {
payload.binaryContent() payload.binaryContent()
)); ));
warnings.addAll(extractionResult.warnings()); warnings.addAll(extractionResult.warnings());
if (processingPolicy.persistExtractedContent()) {
persistedDerivedContent.putAll(persistDerivedContent(document, detection, extractionResult, dedupHash, "generic"));
}
if (!extractionResult.derivedTextByRole().isEmpty()) {
documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
}
} }
if (processingPolicy.invokeStructuredProcessor()) { if (processingPolicy.invokeStructuredProcessor()) {
Optional<ExtractionResult> structuredExtractionResult = structuredProcessingService.process(new StructuredProcessingRequest( var structuredResult = structuredProcessingService.process(new StructuredProcessingRequest(
document, document,
originalContent, originalContent,
sourceDescriptor, sourceDescriptor,
@ -145,23 +145,25 @@ public class GenericDocumentImportService {
payload.textContent(), payload.textContent(),
dedupHash dedupHash
)); ));
if (structuredExtractionResult.isPresent()) { if (structuredResult.isPresent()) {
ExtractionResult result = structuredExtractionResult.get(); extractionResult = mergeExtractionResults(extractionResult, structuredResult.get());
warnings.addAll(result.warnings()); warnings.addAll(structuredResult.get().warnings());
extractionResult = mergeExtractionResults(extractionResult, result);
if (processingPolicy.persistExtractedContent()) {
persistedDerivedContent.putAll(persistDerivedContent(document, detection, result, dedupHash, "structured"));
}
if (!result.derivedTextByRole().isEmpty()) {
documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
}
} }
} }
if (processingPolicy.persistExtractedContent()) {
persistedDerivedContent = persistDerivedContent(document, detection, extractionResult, dedupHash);
documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
}
if (processingPolicy.runRepresentationBuilders()) { if (processingPolicy.runRepresentationBuilders()) {
var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult)); var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult));
persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts); persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts);
} }
if (processingPolicy.applyStructuredTitleIfMissing() && !extractionResult.structuredPayloads().isEmpty()) {
applyStructuredTitleIfMissing(documentService.getRequired(document.getId()), extractionResult);
}
} else { } else {
warnings.add("Original content storage disabled for this document; skipped extraction and text-representation processing"); warnings.add("Original content storage disabled for this document; skipped extraction and text-representation processing");
} }
@ -172,38 +174,9 @@ public class GenericDocumentImportService {
reloaded = documentService.getRequired(reloaded.getId()); reloaded = documentService.getRequired(reloaded.getId());
} }
if (processingPolicy.applyStructuredTitleIfMissing() && !extractionResult.structuredPayloads().isEmpty()) {
applyStructuredTitleIfMissing(reloaded, extractionResult);
reloaded = documentService.getRequired(reloaded.getId());
}
return new ImportedDocumentResult(reloaded, detection, warnings, false); return new ImportedDocumentResult(reloaded, detection, warnings, false);
} }
private ExtractionResult mergeExtractionResults(ExtractionResult left, ExtractionResult right) {
Map<ContentRole, String> derivedText = new LinkedHashMap<>();
if (left != null && left.derivedTextByRole() != null) {
derivedText.putAll(left.derivedTextByRole());
}
if (right != null && right.derivedTextByRole() != null) {
derivedText.putAll(right.derivedTextByRole());
}
List<at.procon.dip.extraction.spi.ExtractedStructuredPayload> payloads = new ArrayList<>();
if (left != null && left.structuredPayloads() != null) {
payloads.addAll(left.structuredPayloads());
}
if (right != null && right.structuredPayloads() != null) {
payloads.addAll(right.structuredPayloads());
}
List<String> warnings = new ArrayList<>();
if (left != null && left.warnings() != null) {
warnings.addAll(left.warnings());
}
if (right != null && right.warnings() != null) {
warnings.addAll(right.warnings());
}
return new ExtractionResult(derivedText, payloads, warnings);
}
private ExtractionResult emptyExtractionResult() { private ExtractionResult emptyExtractionResult() {
return new ExtractionResult(java.util.Collections.emptyMap(), java.util.Collections.emptyList(), java.util.Collections.emptyList()); return new ExtractionResult(java.util.Collections.emptyMap(), java.util.Collections.emptyList(), java.util.Collections.emptyList());
@ -271,19 +244,7 @@ public class GenericDocumentImportService {
} }
private String inferMediaType(SourceDescriptor sourceDescriptor) { private String inferMediaType(SourceDescriptor sourceDescriptor) {
if (StringUtils.hasText(sourceDescriptor.mediaType())) { return DocumentImportSupport.inferKnownMimeType(sourceDescriptor.fileName(), sourceDescriptor.mediaType());
return DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType());
}
String extension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName());
return switch (extension) {
case "pdf" -> "application/pdf";
case "html", "htm" -> "text/html";
case "md", "markdown" -> "text/markdown";
case "xml" -> "application/xml";
case "txt", "log", "csv", "json", "yaml", "yml" -> "text/plain";
case "eml" -> "message/rfc822";
default -> null;
};
} }
private DocumentAccessContext defaultAccessContext() { private DocumentAccessContext defaultAccessContext() {
@ -303,7 +264,7 @@ public class GenericDocumentImportService {
return sourceDescriptor.fileName(); return sourceDescriptor.fileName();
} }
if (StringUtils.hasText(payload.textContent())) { if (StringUtils.hasText(payload.textContent())) {
for (String line : payload.textContent().split("\n")) { for (String line : payload.textContent().split("\\n")) {
if (StringUtils.hasText(line)) { if (StringUtils.hasText(line)) {
return DocumentImportSupport.ellipsize(line.trim(), 240); return DocumentImportSupport.ellipsize(line.trim(), 240);
} }
@ -402,14 +363,13 @@ public class GenericDocumentImportService {
private Map<ContentRole, DocumentContent> persistDerivedContent(Document document, private Map<ContentRole, DocumentContent> persistDerivedContent(Document document,
DetectionResult detection, DetectionResult detection,
ExtractionResult extractionResult, ExtractionResult extractionResult,
String baseHash, String baseHash) {
String hashNamespace) {
Map<ContentRole, DocumentContent> result = new LinkedHashMap<>(); Map<ContentRole, DocumentContent> result = new LinkedHashMap<>();
extractionResult.derivedTextByRole().forEach((role, text) -> { extractionResult.derivedTextByRole().forEach((role, text) -> {
if (!StringUtils.hasText(text)) { if (!StringUtils.hasText(text)) {
return; return;
} }
String contentHash = HashUtils.computeSha256(baseHash + ":" + hashNamespace + ":" + role.name() + ":" + text); String contentHash = HashUtils.computeSha256(baseHash + ":" + role.name() + ":" + text);
DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand( DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand(
document.getId(), document.getId(),
role, role,
@ -452,13 +412,16 @@ public class GenericDocumentImportService {
if (!StringUtils.hasText(draft.textBody())) { if (!StringUtils.hasText(draft.textBody())) {
continue; continue;
} }
DocumentContent linkedContent = resolveLinkedContent(originalContent, derivedContent, draft); DocumentContent linkedContent = switch (draft.representationType()) {
case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK ->
derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
};
var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand( var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
document.getId(), document.getId(),
linkedContent == null ? null : linkedContent.getId(), linkedContent == null ? null : linkedContent.getId(),
draft.representationType(), draft.representationType(),
draft.builderKey() == null ? "phase4-generic-builder" : draft.builderKey(), StringUtils.hasText(draft.builderKey()) ? draft.builderKey() : "phase4-generic-builder",
draft.languageCode(), draft.languageCode(),
null, null,
draft.chunkIndex(), draft.chunkIndex(),
@ -475,26 +438,50 @@ public class GenericDocumentImportService {
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
} }
private DocumentContent resolveLinkedContent(DocumentContent originalContent, private DocumentContent resolveLinkedContent(TextRepresentationDraft draft,
Map<ContentRole, DocumentContent> derivedContent, DocumentContent originalContent,
TextRepresentationDraft draft) { Map<ContentRole, DocumentContent> derivedContent) {
ContentRole sourceRole = draft.sourceContentRole(); if (draft.sourceContentRole() != null && derivedContent.containsKey(draft.sourceContentRole())) {
if (sourceRole == null) { return derivedContent.get(draft.sourceContentRole());
sourceRole = ContentRole.NORMALIZED_TEXT;
}
if (sourceRole == ContentRole.ORIGINAL) {
return originalContent;
} }
return derivedContent.getOrDefault(sourceRole, originalContent); return derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
} }
private boolean shouldQueueEmbedding(TextRepresentationDraft draft) { private boolean shouldQueueEmbedding(TextRepresentationDraft draft) {
if (Boolean.FALSE.equals(draft.queueForEmbedding())) { if (draft.queueForEmbedding() != null) {
return false; return draft.queueForEmbedding();
} }
return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true; return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true;
} }
private ExtractionResult mergeExtractionResults(ExtractionResult base, ExtractionResult override) {
Map<ContentRole, String> mergedText = new LinkedHashMap<>();
if (base != null && base.derivedTextByRole() != null) {
mergedText.putAll(base.derivedTextByRole());
}
if (override != null && override.derivedTextByRole() != null) {
mergedText.putAll(override.derivedTextByRole());
}
List<at.procon.dip.extraction.spi.ExtractedStructuredPayload> mergedPayloads = new ArrayList<>();
if (base != null && base.structuredPayloads() != null) {
mergedPayloads.addAll(base.structuredPayloads());
}
if (override != null && override.structuredPayloads() != null) {
mergedPayloads.addAll(override.structuredPayloads());
}
List<String> mergedWarnings = new ArrayList<>();
if (base != null && base.warnings() != null) {
mergedWarnings.addAll(base.warnings());
}
if (override != null && override.warnings() != null) {
mergedWarnings.addAll(override.warnings());
}
return new ExtractionResult(mergedText, mergedPayloads, mergedWarnings);
}
private void applyStructuredTitleIfMissing(Document document, ExtractionResult extractionResult) { private void applyStructuredTitleIfMissing(Document document, ExtractionResult extractionResult) {
boolean missingTitle = !StringUtils.hasText(document.getTitle()) || document.getTitle().equals(document.getDocumentType().name()); boolean missingTitle = !StringUtils.hasText(document.getTitle()) || document.getTitle().equals(document.getDocumentType().name());
if (!missingTitle) { if (!missingTitle) {

@ -1,5 +1,6 @@
package at.procon.dip.ingestion.service; package at.procon.dip.ingestion.service;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import jakarta.mail.BodyPart; import jakarta.mail.BodyPart;
import jakarta.mail.Multipart; import jakarta.mail.Multipart;
import jakarta.mail.Part; import jakarta.mail.Part;
@ -63,9 +64,9 @@ public class MailMessageExtractionService {
processPart(bodyPart, text, html, attachments); processPart(bodyPart, text, html, attachments);
} }
} else if (contentType.toLowerCase().contains("text/plain")) { } else if (contentType.toLowerCase().contains("text/plain")) {
text.append(content.toString()).append("\n"); text.append(content.toString()).append("");
} else if (contentType.toLowerCase().contains("text/html")) { } else if (contentType.toLowerCase().contains("text/html")) {
html.append(content.toString()).append("\n"); html.append(content.toString()).append("");
} else if (part.getFileName() != null) { } else if (part.getFileName() != null) {
attachments.add(extractAttachment(part)); attachments.add(extractAttachment(part));
} }
@ -103,10 +104,10 @@ public class MailMessageExtractionService {
public String serializeMessage(ParsedMailMessage parsed) { public String serializeMessage(ParsedMailMessage parsed) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
if (parsed.subject() != null) sb.append("Subject: ").append(parsed.subject()).append("\n"); if (parsed.subject() != null) sb.append("Subject: ").append(parsed.subject()).append("");
if (parsed.from() != null) sb.append("From: ").append(parsed.from()).append("\n"); if (parsed.from() != null) sb.append("From: ").append(parsed.from()).append("");
if (!parsed.recipients().isEmpty()) sb.append("To: ").append(String.join(", ", parsed.recipients())).append("\n"); if (!parsed.recipients().isEmpty()) sb.append("To: ").append(String.join(", ", parsed.recipients())).append("");
sb.append("\n"); sb.append("");
if (parsed.textBody() != null) sb.append(parsed.textBody()); if (parsed.textBody() != null) sb.append(parsed.textBody());
return sb.toString().trim(); return sb.toString().trim();
} }
@ -116,7 +117,15 @@ public class MailMessageExtractionService {
public record MailAttachment(String fileName, String contentType, byte[] data, long sizeBytes, String path) { public record MailAttachment(String fileName, String contentType, byte[] data, long sizeBytes, String path) {
public String safeTextPreview() { public String safeTextPreview() {
return new String(data, StandardCharsets.UTF_8); String extension = DocumentImportSupport.extensionOf(fileName);
String mime = DocumentImportSupport.normalizeMediaType(contentType);
if (DocumentImportSupport.isSpreadsheetMime(mime)
|| DocumentImportSupport.isSpreadsheetExtension(extension)
|| (!DocumentImportSupport.isLikelyTextMime(mime) && !DocumentImportSupport.isLikelyTextExtension(extension))
|| DocumentImportSupport.looksBinary(data)) {
return null;
}
return DocumentImportSupport.normalizeText(new String(data, StandardCharsets.UTF_8));
} }
} }
} }

@ -5,6 +5,7 @@ import at.procon.dip.domain.document.DocumentType;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Set;
import org.springframework.util.StringUtils; import org.springframework.util.StringUtils;
/** /**
@ -12,6 +13,41 @@ import org.springframework.util.StringUtils;
*/ */
public final class DocumentImportSupport { public final class DocumentImportSupport {
// Lower-case extensions treated as clearly textual content (see isClearlyTextExtension).
private static final Set<String> TEXT_EXTENSIONS = Set.of(
        "txt", "log", "csv", "tsv", "json", "yaml", "yml", "xml", "xsd", "xslt", "html", "htm", "md", "markdown"
);

// Canonical MIME type per lower-case file extension. Used to repair generic or absent media
// types (e.g. mail attachments declared as application/octet-stream).
// NOTE(review): keep this table in sync with the switch inside inferKnownMimeType(String) —
// at review time the switch lacked the doc/ppt/pptx/odt/odp families listed here.
private static final Map<String, String> KNOWN_MIME_BY_EXTENSION = Map.ofEntries(
        Map.entry("pdf", "application/pdf"),
        Map.entry("html", "text/html"),
        Map.entry("htm", "text/html"),
        Map.entry("md", "text/markdown"),
        Map.entry("markdown", "text/markdown"),
        Map.entry("xml", "application/xml"),
        Map.entry("xsd", "application/xml"),
        Map.entry("xslt", "application/xml"),
        Map.entry("txt", "text/plain"),
        Map.entry("log", "text/plain"),
        Map.entry("csv", "text/csv"),
        Map.entry("tsv", "text/tab-separated-values"),
        Map.entry("json", "application/json"),
        Map.entry("yaml", "application/yaml"),
        Map.entry("yml", "application/yaml"),
        Map.entry("eml", "message/rfc822"),
        Map.entry("zip", "application/zip"),
        Map.entry("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        Map.entry("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        Map.entry("xlsm", "application/vnd.ms-excel.sheet.macroenabled.12"),
        Map.entry("xlsb", "application/vnd.ms-excel.sheet.binary.macroenabled.12"),
        Map.entry("xls", "application/vnd.ms-excel"),
        Map.entry("ods", "application/vnd.oasis.opendocument.spreadsheet"),
        Map.entry("doc", "application/msword"),
        Map.entry("ppt", "application/vnd.ms-powerpoint"),
        Map.entry("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        Map.entry("odt", "application/vnd.oasis.opendocument.text"),
        Map.entry("odp", "application/vnd.oasis.opendocument.presentation")
);
private DocumentImportSupport() { private DocumentImportSupport() {
} }
@ -28,12 +64,31 @@ public final class DocumentImportSupport {
} }
String normalized = normalizeMediaType(mediaType); String normalized = normalizeMediaType(mediaType);
return normalized.startsWith("text/") return normalized.startsWith("text/")
|| normalized.contains("json") || normalized.contains("application/json")
|| normalized.contains("xml") || normalized.contains("application/xml")
|| normalized.contains("javascript") || normalized.contains("javascript")
|| normalized.equals("application/xhtml+xml"); || normalized.equals("application/xhtml+xml");
} }
/**
 * Returns {@code true} for file extensions that usually carry human-readable text
 * (case-insensitive; blank/null input is not textual).
 */
public static boolean isLikelyTextExtension(String extension) {
    if (!StringUtils.hasText(extension)) {
        return false;
    }
    // Switch expression instead of a long equals-chain; same extension list as before.
    return switch (extension.toLowerCase(Locale.ROOT)) {
        case "txt", "xml", "html", "htm", "md", "markdown",
             "json", "yaml", "yml", "csv", "log", "eml" -> true;
        default -> false;
    };
}
public static String normalizeMediaType(String mediaType) { public static String normalizeMediaType(String mediaType) {
if (!StringUtils.hasText(mediaType)) { if (!StringUtils.hasText(mediaType)) {
return null; return null;
@ -43,6 +98,80 @@ public final class DocumentImportSupport {
return result.trim().toLowerCase(Locale.ROOT); return result.trim().toLowerCase(Locale.ROOT);
} }
/**
 * Resolves an effective MIME type for a source: the declared media type wins unless it is
 * missing or a generic "binary blob" placeholder, in which case the file extension's known
 * MIME type (if any) is used instead.
 *
 * @param fileName  source file name; only its extension is consulted
 * @param mediaType declared media type, may be {@code null}/blank
 * @return the normalized declared type, a table lookup by extension, or {@code null} if neither
 *         is available
 */
public static String inferKnownMimeType(String fileName, String mediaType) {
    String declared = normalizeMediaType(mediaType);
    String fromExtension = KNOWN_MIME_BY_EXTENSION.get(extensionOf(fileName));
    if (!StringUtils.hasText(declared)) {
        return fromExtension;
    }
    boolean genericBinary = switch (declared) {
        case "application/octet-stream", "binary/octet-stream", "application/x-download" -> true;
        default -> false;
    };
    if (genericBinary) {
        // Prefer the extension-derived type over a meaningless octet-stream declaration.
        return fromExtension != null ? fromExtension : declared;
    }
    return declared;
}
/**
 * Returns {@code true} when the (normalized) media type denotes spreadsheet-like content:
 * Excel binary/OOXML/macro variants, OpenDocument spreadsheets, or CSV/TSV.
 */
public static boolean isSpreadsheetMime(String mediaType) {
    String normalized = normalizeMediaType(mediaType);
    if (!StringUtils.hasText(normalized)) {
        return false;
    }
    // Same type list as before, expressed as a switch instead of chained equals.
    return switch (normalized) {
        case "application/vnd.ms-excel",
             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
             "application/vnd.ms-excel.sheet.macroenabled.12",
             "application/vnd.ms-excel.sheet.binary.macroenabled.12",
             "application/vnd.oasis.opendocument.spreadsheet",
             "text/csv",
             "text/tab-separated-values",
             "application/csv",
             "application/x-csv" -> true;
        default -> false;
    };
}
/**
 * Returns {@code true} for file extensions of spreadsheet formats (Excel family,
 * OpenDocument spreadsheet, CSV/TSV). Case-insensitive; blank/null is {@code false}.
 */
public static boolean isSpreadsheetExtension(String extension) {
    if (!StringUtils.hasText(extension)) {
        return false;
    }
    return switch (extension.toLowerCase(Locale.ROOT)) {
        case "xls", "xlsx", "xlsm", "xlsb", "ods", "csv", "tsv" -> true;
        default -> false;
    };
}
/**
 * Looks up the canonical MIME type for a bare file extension (case-insensitive).
 *
 * <p>Delegates to {@code KNOWN_MIME_BY_EXTENSION} so this overload and the
 * file-name/media-type overload can never disagree. The previous hand-written switch had
 * drifted out of sync with the table: it was missing the docx/doc/ppt/pptx/odt/odp entries
 * and would return {@code null} for them.</p>
 *
 * @param extension file extension without the dot, may be {@code null}/blank
 * @return the known MIME type, or {@code null} when the extension is blank or unknown
 */
public static String inferKnownMimeType(String extension) {
    if (!StringUtils.hasText(extension)) {
        return null;
    }
    return KNOWN_MIME_BY_EXTENSION.get(extension.toLowerCase(Locale.ROOT));
}
/**
 * Returns {@code true} when the given value maps to one of the known plain-text extensions.
 *
 * <p>NOTE(review): the parameter is named {@code extension}, yet it is passed through
 * {@code extensionOf}, which elsewhere extracts an extension from a file name. If
 * {@code extensionOf} does not return dot-less input unchanged, calling this with a bare
 * extension such as {@code "txt"} would always yield {@code false} — confirm the intended
 * argument (file name vs. extension).</p>
 */
public static boolean isClearlyTextExtension(String extension) {
    return TEXT_EXTENSIONS.contains(extensionOf(extension));
}
public static DocumentFamily familyFor(DocumentType documentType) { public static DocumentFamily familyFor(DocumentType documentType) {
return switch (documentType) { return switch (documentType) {
case TED_PACKAGE, TED_NOTICE -> DocumentFamily.PROCUREMENT; case TED_PACKAGE, TED_NOTICE -> DocumentFamily.PROCUREMENT;
@ -81,4 +210,45 @@ public final class DocumentImportSupport {
} }
return text.substring(0, maxLength - 3) + "..."; return text.substring(0, maxLength - 3) + "...";
} }
/**
 * Scans the whole buffer for a NUL (0x00) byte — a strong indicator of binary content.
 *
 * @param bytes buffer to inspect; {@code null} is treated as containing no zero byte
 * @return {@code true} iff at least one byte equals zero
 */
public static boolean containsZeroByte(byte[] bytes) {
    if (bytes == null) {
        return false;
    }
    for (int i = 0; i < bytes.length; i++) {
        if (bytes[i] == 0) {
            return true;
        }
    }
    return false;
}
/**
 * Heuristically decides whether a buffer is binary rather than text.
 *
 * <p>Any NUL byte anywhere makes it binary. Otherwise the first 512 bytes are sampled and the
 * buffer counts as binary when more than 10% of the sample are control characters outside the
 * usual whitespace range (TAB..CR, i.e. 0x09-0x0D).</p>
 *
 * @param bytes buffer to classify; {@code null} or empty is treated as not binary
 * @return {@code true} when the content looks binary
 */
public static boolean looksBinary(byte[] bytes) {
    if (bytes == null || bytes.length == 0) {
        return false;
    }
    // Inlined zero-byte scan over the FULL buffer (not just the sample window).
    for (byte b : bytes) {
        if (b == 0) {
            return true;
        }
    }
    int sample = Math.min(bytes.length, 512);
    int suspicious = 0;
    for (int i = 0; i < sample; i++) {
        int v = bytes[i] & 0xFF;
        if (v < 0x09 || (v > 0x0D && v < 0x20)) {
            suspicious++;
        }
    }
    return suspicious > sample / 10;
}
/**
 * Normalizes extracted text: unifies CR/CRLF line endings to {@code \n}, strips trailing
 * spaces/tabs from each line, collapses runs of blank lines to a single blank line, and trims
 * leading/trailing whitespace.
 *
 * @param text raw text, may be {@code null}
 * @return normalized text, or {@code null} when the input was {@code null}
 */
public static String normalizeText(String text) {
    if (text == null) {
        return null;
    }
    // Strip line-trailing whitespace BEFORE collapsing newline runs, so lines containing only
    // spaces/tabs also merge into one blank line. The previous order left e.g.
    // "a\n   \n   \nb" with three consecutive newlines because the blanks kept the
    // newlines from being adjacent when the \n{3,} pass ran.
    return text.replace("\r\n", "\n")
            .replace('\r', '\n')
            .replaceAll("[ \\t]+\\n", "\n")
            .replaceAll("\\n{3,}", "\n\n")
            .trim();
}
} }

@ -5,16 +5,18 @@ import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentFamily; import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentStatus; import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.DocumentType; import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload; import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.ExtractionResult; import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.processing.spi.DocumentProcessingPolicy; import at.procon.dip.processing.spi.DocumentProcessingPolicy;
import at.procon.dip.processing.spi.StructuredDocumentProcessor; import at.procon.dip.processing.spi.StructuredDocumentProcessor;
import at.procon.dip.processing.spi.StructuredProcessingRequest; import at.procon.dip.processing.spi.StructuredProcessingRequest;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
import at.procon.ted.model.entity.ProcurementDocument; import at.procon.ted.model.entity.ProcurementDocument;
import at.procon.ted.service.TedPhase2GenericDocumentService;
import at.procon.ted.service.XmlParserService; import at.procon.ted.service.XmlParserService;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -30,6 +32,7 @@ public class TedStructuredDocumentProcessor implements StructuredDocumentProcess
private final XmlParserService xmlParserService; private final XmlParserService xmlParserService;
private final DocumentService documentService; private final DocumentService documentService;
private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
private final TedNoticeProjectionService tedNoticeProjectionService; private final TedNoticeProjectionService tedNoticeProjectionService;
@Override @Override
@ -46,7 +49,7 @@ public class TedStructuredDocumentProcessor implements StructuredDocumentProcess
public ExtractionResult process(StructuredProcessingRequest request) { public ExtractionResult process(StructuredProcessingRequest request) {
String xml = request.textContent(); String xml = request.textContent();
if (!StringUtils.hasText(xml) && request.binaryContent() != null) { if (!StringUtils.hasText(xml) && request.binaryContent() != null) {
xml = new String(request.binaryContent(), java.nio.charset.StandardCharsets.UTF_8); xml = new String(request.binaryContent(), StandardCharsets.UTF_8);
} }
if (!StringUtils.hasText(xml)) { if (!StringUtils.hasText(xml)) {
return new ExtractionResult(Map.of(), List.of(), List.of("TED structured processor received no XML payload")); return new ExtractionResult(Map.of(), List.of(), List.of("TED structured processor received no XML payload"));
@ -74,24 +77,15 @@ public class TedStructuredDocumentProcessor implements StructuredDocumentProcess
} }
documentService.save(canonical); documentService.save(canonical);
tedPhase2GenericDocumentService.syncTedDocument(tedDocument);
tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId()); tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId());
Map<String, Object> payload = new LinkedHashMap<>(); Map<String, Object> payload = new LinkedHashMap<>();
if (StringUtils.hasText(tedDocument.getProjectTitle())) { if (StringUtils.hasText(tedDocument.getProjectTitle())) payload.put("title", tedDocument.getProjectTitle());
payload.put("title", tedDocument.getProjectTitle()); if (StringUtils.hasText(tedDocument.getProjectDescription())) payload.put("description", tedDocument.getProjectDescription());
} if (StringUtils.hasText(tedDocument.getBuyerName())) payload.put("buyerName", tedDocument.getBuyerName());
if (StringUtils.hasText(tedDocument.getProjectDescription())) { if (tedDocument.getCpvCodes() != null && tedDocument.getCpvCodes().length > 0) payload.put("cpvCodes", String.join(", ", tedDocument.getCpvCodes()));
payload.put("description", tedDocument.getProjectDescription()); if (tedDocument.getNutsCodes() != null && tedDocument.getNutsCodes().length > 0) payload.put("nutsCodes", String.join(", ", tedDocument.getNutsCodes()));
}
if (StringUtils.hasText(tedDocument.getBuyerName())) {
payload.put("buyerName", tedDocument.getBuyerName());
}
if (tedDocument.getCpvCodes() != null && tedDocument.getCpvCodes().length > 0) {
payload.put("cpvCodes", String.join(", ", tedDocument.getCpvCodes()));
}
if (tedDocument.getNutsCodes() != null && tedDocument.getNutsCodes().length > 0) {
payload.put("nutsCodes", String.join(", ", tedDocument.getNutsCodes()));
}
payload.put("lotCount", tedDocument.getLots() == null ? 0 : tedDocument.getLots().size()); payload.put("lotCount", tedDocument.getLots() == null ? 0 : tedDocument.getLots().size());
payload.put("noticeId", tedDocument.getNoticeId()); payload.put("noticeId", tedDocument.getNoticeId());
payload.put("publicationId", tedDocument.getPublicationId()); payload.put("publicationId", tedDocument.getPublicationId());

@ -1,11 +1,11 @@
package at.procon.dip.processing.service; package at.procon.dip.processing.service;
import at.procon.dip.classification.spi.DetectionResult; import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.processing.spi.DocumentProcessingPolicy; import at.procon.dip.processing.spi.DocumentProcessingPolicy;
import at.procon.dip.processing.spi.StructuredDocumentProcessor; import at.procon.dip.processing.spi.StructuredDocumentProcessor;
import at.procon.dip.processing.spi.StructuredProcessingRequest; import at.procon.dip.processing.spi.StructuredProcessingRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;

@ -0,0 +1,20 @@
package at.procon.dip;
import org.testcontainers.containers.PostgreSQLContainer;
/**
 * {@link PostgreSQLContainer} variant that maps the container's PostgreSQL port onto a fixed,
 * caller-chosen host port instead of a random free one.
 *
 * <p>NOTE(review): fixed host ports can clash when several test JVMs run on the same machine;
 * Testcontainers recommends random mapped ports unless an external tool must reach the
 * database on a stable, well-known port.</p>
 *
 * @param <SELF> self type, required by the Testcontainers fluent builder API
 */
public class FixedPortPostgreSQLContainer<SELF extends FixedPortPostgreSQLContainer<SELF>>
        extends PostgreSQLContainer<SELF> {

    // Host-side port bound to the container's PostgreSQL port; fixed for the container's lifetime.
    private final int hostPort;

    /**
     * @param imageName Docker image reference (passed straight to the superclass)
     * @param hostPort  host port on which PostgreSQL will be reachable
     */
    public FixedPortPostgreSQLContainer(String imageName, int hostPort) {
        super(imageName);
        this.hostPort = hostPort;
    }

    @Override
    protected void configure() {
        super.configure();
        // Bind the fixed host port to the container's standard PostgreSQL port (5432).
        addFixedExposedPort(hostPort, PostgreSQLContainer.POSTGRESQL_PORT);
    }
}

@ -0,0 +1,65 @@
package at.procon.dip.extraction.impl;
import static org.assertj.core.api.Assertions.assertThat;
import at.procon.dip.classification.detector.BasicMimeAndExtensionDocumentTypeDetector;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.time.OffsetDateTime;
import java.util.Map;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link BasicMimeAndExtensionDocumentTypeDetector}: when a mail client
 * declares attachments as {@code application/octet-stream}, the detector must fall back to
 * the file extension to derive the effective MIME type.
 */
class BasicMimeAndExtensionDocumentTypeDetectorTest {

    // The class is imported; the original field redundantly spelled out the fully-qualified name.
    private final BasicMimeAndExtensionDocumentTypeDetector detector = new BasicMimeAndExtensionDocumentTypeDetector();

    @Test
    void shouldResolveKnownExcelMimeFromExtensionWhenMailUsesOctetStream() {
        SourceDescriptor source = new SourceDescriptor(
                DocumentAccessContext.publicDocument(),
                SourceType.MAIL,
                "mail-1:attachment:test.xls",
                "mail://message/1",
                "test.xls",
                "application/octet-stream",
                new byte[] {1, 2, 3},
                null,
                OffsetDateTime.now(),
                OriginalContentStoragePolicy.STORE,
                Map.of()
        );

        DetectionResult result = detector.detect(source);

        // The .xls extension must override the generic octet-stream declaration.
        assertThat(result.documentType()).isEqualTo(DocumentType.GENERIC_BINARY);
        assertThat(result.mimeType()).isEqualTo("application/vnd.ms-excel");
        assertThat(result.attributes()).containsEntry("detectedExtension", "xls");
        assertThat(result.attributes()).containsEntry("effectiveMediaType", "application/vnd.ms-excel");
    }

    @Test
    void shouldKeepCsvAsTextAndUseCsvMime() {
        SourceDescriptor source = new SourceDescriptor(
                DocumentAccessContext.publicDocument(),
                SourceType.MAIL,
                "mail-1:attachment:data.csv",
                "mail://message/1",
                "data.csv",
                "application/octet-stream",
                // NOTE(review): this fixture looks like it lost a newline ("a,b\n1,2"?). Detection
                // here only depends on the name/MIME, but confirm the intended payload.
                "a,b1,2".getBytes(),
                null,
                OffsetDateTime.now(),
                OriginalContentStoragePolicy.STORE,
                Map.of()
        );

        DetectionResult result = detector.detect(source);

        // CSV stays a TEXT document and gets the proper text/csv MIME type.
        assertThat(result.documentType()).isEqualTo(DocumentType.TEXT);
        assertThat(result.mimeType()).isEqualTo("text/csv");
    }
}

@ -0,0 +1,93 @@
package at.procon.dip.extraction.impl;
import static org.junit.jupiter.api.Assertions.*;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.domain.document.SourceType;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.Map;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Workbook;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link SpreadsheetDocumentExtractor}: legacy binary .xls workbooks and plain
 * CSV payloads must both be rendered as pipe-separated normalized text.
 */
class SpreadsheetDocumentExtractorTest {

    private final SpreadsheetDocumentExtractor extractor = new SpreadsheetDocumentExtractor();

    @Test
    void extractsOldExcelBinaryXls() throws Exception {
        byte[] workbookBytes = createLegacyXls();
        SourceDescriptor descriptor = new SourceDescriptor(
                null,
                SourceType.MAIL,
                "mail-1:attachment:report.xls",
                null,
                "report.xls",
                "application/vnd.ms-excel",
                workbookBytes,
                null,
                OffsetDateTime.now(),
                OriginalContentStoragePolicy.STORE,
                Map.of("title", "report.xls")
        );
        DetectionResult detection = new DetectionResult(
                DocumentType.GENERIC_BINARY, DocumentFamily.GENERIC, "application/vnd.ms-excel", null, Map.of());

        ExtractionResult extraction = extractor.extract(new ExtractionRequest(descriptor, detection, null, workbookBytes));

        // Each sheet is rendered as a "Sheet: <name>" header followed by pipe-joined rows.
        String normalized = extraction.derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
        assertNotNull(normalized);
        assertTrue(normalized.contains("Sheet: Sheet1"));
        assertTrue(normalized.contains("Name | Amount"));
        assertTrue(normalized.contains("Alice | 42"));
    }

    @Test
    void extractsCsvAsNormalizedText() {
        String csv = "Name,Amount\nAlice,42\nBob,77\n";
        byte[] csvBytes = csv.getBytes(StandardCharsets.UTF_8);
        SourceDescriptor descriptor = new SourceDescriptor(
                null,
                SourceType.FILE_SYSTEM,
                "csv-1",
                null,
                "report.csv",
                "text/csv",
                csvBytes,
                csv,
                OffsetDateTime.now(),
                OriginalContentStoragePolicy.STORE,
                Map.of("title", "report.csv")
        );
        DetectionResult detection = new DetectionResult(
                DocumentType.TEXT, DocumentFamily.GENERIC, "text/csv", null, Map.of());

        ExtractionResult extraction = extractor.extract(new ExtractionRequest(descriptor, detection, csv, csvBytes));

        // Commas become pipe separators; the trailing newline is trimmed away.
        assertEquals("Name | Amount\nAlice | 42\nBob | 77",
                extraction.derivedTextByRole().get(ContentRole.NORMALIZED_TEXT));
    }

    // Builds a minimal two-row legacy (BIFF) workbook entirely in memory.
    private byte[] createLegacyXls() throws Exception {
        try (Workbook workbook = new HSSFWorkbook(); ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            var sheet = workbook.createSheet("Sheet1");
            var headerRow = sheet.createRow(0);
            headerRow.createCell(0).setCellValue("Name");
            headerRow.createCell(1).setCellValue("Amount");
            var dataRow = sheet.createRow(1);
            dataRow.createCell(0).setCellValue("Alice");
            dataRow.createCell(1).setCellValue(42);
            workbook.write(out);
            return out.toByteArray();
        }
    }
}

@ -0,0 +1,175 @@
package at.procon.dip.ingestion.adapter;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RelationType;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.service.DocumentRelationService;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.MailMessageExtractionService;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.service.attachment.ZipExtractionService;
import at.procon.dip.testsupport.MailBundleTestSupport;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.junit.jupiter.api.extension.ExtendWith;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.*;
/**
 * Exercises {@code MailDocumentIngestionAdapter} against the bundled sample .eml corpus:
 * every mail must produce one root MIME document plus one imported document per attachment,
 * with a relation registered for each attachment.
 */
@ExtendWith(MockitoExtension.class)
class MailDocumentIngestionAdapterBundleTest {

    // Temp directory the shared mail fixture bundle is unpacked into (once per class).
    private static Path bundleRoot;

    @Mock
    private GenericDocumentImportService importService;

    @Mock
    private DocumentRelationService relationService;

    @Mock
    private ZipExtractionService zipExtractionService;

    private MailDocumentIngestionAdapter adapter;

    // Unpack the .eml bundle once; tests resolve individual files below bundleRoot/eml.
    @BeforeAll
    static void extractBundle() throws Exception {
        bundleRoot = MailBundleTestSupport.extractBundleToTempDir();
    }

    @BeforeEach
    void setUp() {
        TedProcessorProperties properties = new TedProcessorProperties();
        properties.getGenericIngestion().setEnabled(true);
        properties.getGenericIngestion().setMailAdapterEnabled(true);
        // ZIP expansion is covered elsewhere; keeping it off makes attachment counts predictable.
        properties.getGenericIngestion().setExpandMailZipAttachments(false);
        properties.getGenericIngestion().setMailImportBatchId("test-mail-bundle");
        when(zipExtractionService.canHandle(any(), any())).thenReturn(false);
        adapter = new MailDocumentIngestionAdapter(properties, importService, new MailMessageExtractionService(), relationService, zipExtractionService);
    }

    @ParameterizedTest(name = "ingest {0}")
    @MethodSource("at.procon.dip.testsupport.MailBundleTestSupport#bundleMailNames")
    void ingest_should_import_root_and_all_attachments_from_bundle(String fileName) throws Exception {
        Path eml = bundleRoot.resolve("eml").resolve(fileName);
        byte[] rawMime = Files.readAllBytes(eml);
        List<String> expectedAttachmentNames = MailBundleTestSupport.EXPECTED_ATTACHMENT_NAMES.get(fileName);
        // Counter gives each mocked import a unique id/dedup hash; also encodes call order.
        AtomicInteger sequence = new AtomicInteger();
        when(importService.importDocument(any(SourceDescriptor.class))).thenAnswer(invocation -> {
            SourceDescriptor sd = invocation.getArgument(0);
            int idx = sequence.incrementAndGet();
            // Echo the descriptor back as a minimal Document; the root mail (message/rfc822)
            // becomes MIME_MESSAGE, every attachment becomes GENERIC_BINARY.
            Document document = Document.builder()
                    .id(UUID.nameUUIDFromBytes((fileName + ":" + idx).getBytes()))
                    .documentType(sd.sourceType() == SourceType.MAIL && "message/rfc822".equals(sd.mediaType()) ? DocumentType.MIME_MESSAGE : DocumentType.GENERIC_BINARY)
                    .documentFamily(DocumentFamily.MAIL)
                    .status(DocumentStatus.RECEIVED)
                    .title(sd.fileName())
                    .mimeType(sd.mediaType())
                    .dedupHash(Integer.toHexString(idx))
                    .build();
            DetectionResult detection = new DetectionResult(document.getDocumentType(), document.getDocumentFamily(), document.getMimeType(), null, java.util.Map.of());
            return new ImportedDocumentResult(document, detection, List.of(), false);
        });
        SourceDescriptor source = new SourceDescriptor(
                DocumentAccessContext.publicDocument(),
                SourceType.MAIL,
                fileName,
                eml.toUri().toString(),
                fileName,
                "message/rfc822",
                rawMime,
                null,
                OffsetDateTime.now(),
                null,
                java.util.Map.of()
        );
        IngestionResult result = adapter.ingest(source);
        assertEquals(1 + expectedAttachmentNames.size(), result.documents().size(), "root + each attachment should be imported");
        assertTrue(result.warnings().isEmpty(), "bundle sample should import without warnings: " + fileName);
        ArgumentCaptor<SourceDescriptor> sourceCaptor = ArgumentCaptor.forClass(SourceDescriptor.class);
        verify(importService, times(1 + expectedAttachmentNames.size())).importDocument(sourceCaptor.capture());
        // One relation per attachment links it back to the root mail document.
        verify(relationService, times(expectedAttachmentNames.size())).ensureRelation(any());
        // Captured call order: index 0 is the root mail, the rest are attachments in mail order.
        List<SourceDescriptor> descriptors = sourceCaptor.getAllValues();
        SourceDescriptor root = descriptors.getFirst();
        assertEquals("message/rfc822", root.mediaType());
        assertNotNull(root.textContent(), "root mail should carry serialized message text");
        assertEquals(fileName, root.fileName());
        List<String> importedAttachmentNames = new ArrayList<>();
        for (int i = 1; i < descriptors.size(); i++) {
            importedAttachmentNames.add(descriptors.get(i).fileName());
        }
        assertEquals(expectedAttachmentNames, importedAttachmentNames);
    }

    @ParameterizedTest(name = "octet-stream preview guard {0}")
    @MethodSource("octetStreamMailNames")
    void ingest_should_not_pass_preview_text_for_generic_octet_stream_attachments(String fileName) throws Exception {
        Path eml = bundleRoot.resolve("eml").resolve(fileName);
        byte[] rawMime = Files.readAllBytes(eml);
        when(importService.importDocument(any(SourceDescriptor.class))).thenAnswer(invocation -> {
            SourceDescriptor sd = invocation.getArgument(0);
            Document document = Document.builder()
                    .id(UUID.randomUUID())
                    .documentType(DocumentType.UNKNOWN)
                    .documentFamily(DocumentFamily.MAIL)
                    .status(DocumentStatus.RECEIVED)
                    .title(sd.fileName())
                    .mimeType(sd.mediaType())
                    .build();
            DetectionResult detection = new DetectionResult(DocumentType.UNKNOWN, DocumentFamily.MAIL, sd.mediaType(), null, java.util.Map.of());
            return new ImportedDocumentResult(document, detection, List.of(), false);
        });
        adapter.ingest(new SourceDescriptor(
                DocumentAccessContext.publicDocument(),
                SourceType.MAIL,
                fileName,
                eml.toUri().toString(),
                fileName,
                "message/rfc822",
                rawMime,
                null,
                OffsetDateTime.now(),
                null,
                java.util.Map.of()
        ));
        ArgumentCaptor<SourceDescriptor> sourceCaptor = ArgumentCaptor.forClass(SourceDescriptor.class);
        verify(importService, atLeast(1)).importDocument(sourceCaptor.capture());
        // Drop the first capture (root mail) — only attachment descriptors are checked here.
        List<SourceDescriptor> attachments = sourceCaptor.getAllValues().subList(1, sourceCaptor.getAllValues().size());
        assertEquals(3, attachments.size());
        // Attachments declared as application/octet-stream must not carry inline preview text.
        assertTrue(attachments.stream().allMatch(sd -> sd.textContent() == null),
                "octet-stream attachments should not get inline preview text");
    }

    // Fixture mails whose attachments are all declared application/octet-stream.
    static java.util.stream.Stream<String> octetStreamMailNames() {
        return java.util.stream.Stream.of("sample-mail-05-generic-octet-stream.eml");
    }
}

@ -0,0 +1,177 @@
package at.procon.dip.ingestion.adapter;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RelationType;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.service.DocumentRelationService;
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.MailMessageExtractionService;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.service.attachment.ZipExtractionService;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
/**
 * Unit test for {@code MailDocumentIngestionAdapter} driven by a real {@code .eml}
 * file loaded from the test-resources filesystem.
 *
 * <p>The import, relation, and ZIP services are Mockito mocks; only the MIME
 * parsing ({@link MailMessageExtractionService}) is real. Every descriptor handed
 * to the mocked import service is recorded in {@link #importedDescriptors} so the
 * test can assert on exactly what the adapter tried to import.</p>
 */
@ExtendWith(MockitoExtension.class)
class MailDocumentIngestionAdapterFileSystemTest {

    @Mock
    private GenericDocumentImportService importService;

    @Mock
    private DocumentRelationService relationService;

    @Mock
    private ZipExtractionService zipExtractionService;

    private MailDocumentIngestionAdapter adapter;

    /** Descriptors passed to the mocked import service, in import order (root first). */
    private final List<SourceDescriptor> importedDescriptors = new ArrayList<>();

    @BeforeEach
    void setUp() {
        TedProcessorProperties properties = new TedProcessorProperties();
        properties.getGenericIngestion().setEnabled(true);
        properties.getGenericIngestion().setMailAdapterEnabled(true);
        properties.getGenericIngestion().setMailImportBatchId("test-mail-batch");
        properties.getGenericIngestion().setDefaultOwnerTenantKey("tenant-a");
        properties.getGenericIngestion().setMailDefaultVisibility(DocumentVisibility.TENANT);

        MailMessageExtractionService extractionService = new MailMessageExtractionService();
        adapter = new MailDocumentIngestionAdapter(
                properties,
                importService,
                extractionService,
                relationService,
                zipExtractionService
        );

        // ZIP expansion is out of scope for this test; the relation result is not
        // consumed by the adapter, so null suffices.
        when(zipExtractionService.canHandle(any(), any())).thenReturn(false);
        when(relationService.ensureRelation(any())).thenReturn(null);
        // Record each imported descriptor and answer with a minimal synthetic result
        // so the adapter can build relations between root mail and attachments.
        when(importService.importDocument(any())).thenAnswer(invocation -> {
            SourceDescriptor descriptor = invocation.getArgument(0);
            importedDescriptors.add(descriptor);
            return new ImportedDocumentResult(
                    buildDocumentFor(descriptor),
                    new DetectionResult(inferType(descriptor), inferFamily(descriptor), descriptor.mediaType(), "en", Map.of()),
                    List.of(),
                    false
            );
        });
    }

    @Test
    @DisplayName("Should ingest filesystem-loaded mail message with text and binary attachments")
    void shouldIngestFileSystemLoadedMailMessage() throws Exception {
        Path emlPath = Path.of("src", "test", "resources", "mail", "sample-message.eml");
        assertTrue(Files.exists(emlPath), "sample .eml test file must exist");
        byte[] mimeBytes = Files.readAllBytes(emlPath);

        SourceDescriptor sourceDescriptor = new SourceDescriptor(
                null,
                SourceType.MAIL,
                "fs-mail-001",
                emlPath.toAbsolutePath().toUri().toString(),
                emlPath.getFileName().toString(),
                "message/rfc822",
                mimeBytes,
                null,
                OffsetDateTime.parse("2026-03-18T15:27:59+01:00"),
                null,
                Map.of("source", "filesystem-test")
        );

        assertTrue(adapter.supports(sourceDescriptor));
        IngestionResult result = adapter.ingest(sourceDescriptor);

        assertEquals(3, result.documents().size(), "expected root mail document plus 2 attachment documents");
        assertTrue(result.warnings().isEmpty(), "mail import should not create warnings for the sample message");
        assertEquals(3, importedDescriptors.size(), "root + notes.txt + legacy.xls should be imported");

        // Root mail document: carries the raw MIME type, a text preview, and
        // tenant-scoped access from the configured mail defaults.
        SourceDescriptor root = importedDescriptors.get(0);
        assertEquals("message/rfc822", root.mediaType());
        assertEquals("sample-message.eml", root.fileName());
        assertNotNull(root.textContent());
        assertTrue(root.textContent().contains("Subject: Sample mail with filesystem-loaded attachments"));
        assertTrue(root.textContent().contains("Hello from the filesystem-backed sample message."));
        assertEquals(DocumentVisibility.TENANT, root.accessContext().visibility());
        assertNotNull(root.accessContext().ownerTenant());
        assertEquals("tenant-a", root.accessContext().ownerTenant().tenantKey());

        // Plain-text attachment exposes readable preview text.
        SourceDescriptor textAttachment = importedDescriptors.stream()
                .filter(d -> "notes.txt".equals(d.fileName()))
                .findFirst()
                .orElseThrow();
        assertEquals("text/plain", textAttachment.mediaType());
        assertNotNull(textAttachment.textContent(), "plain text attachment should expose preview text");
        assertTrue(textAttachment.textContent().contains("attachment notes"));

        // Binary attachment stays binary-only.
        SourceDescriptor binaryAttachment = importedDescriptors.stream()
                .filter(d -> "legacy.xls".equals(d.fileName()))
                .findFirst()
                .orElseThrow();
        assertNull(binaryAttachment.textContent(), "binary old Excel attachment must not be passed as text content");
        assertEquals("application/vnd.ms-excel", binaryAttachment.mediaType());
        assertNotNull(binaryAttachment.binaryContent());
        assertTrue(binaryAttachment.binaryContent().length > 0);

        // Both attachments must be linked to the root mail as ATTACHMENT_OF.
        ArgumentCaptor<CreateDocumentRelationCommand> relationCaptor = ArgumentCaptor.forClass(CreateDocumentRelationCommand.class);
        verify(relationService, times(2)).ensureRelation(relationCaptor.capture());
        assertTrue(relationCaptor.getAllValues().stream().allMatch(cmd -> cmd.relationType() == RelationType.ATTACHMENT_OF));
    }

    /**
     * Builds a minimal synthetic {@link Document} standing in for the real import
     * result. The ID and dedup hash are derived deterministically from the
     * descriptor so repeated imports of the same descriptor map to the same document.
     */
    private Document buildDocumentFor(SourceDescriptor descriptor) {
        return Document.builder()
                // Explicit charset: the no-arg getBytes() uses the platform default
                // charset and is not deterministic across environments.
                .id(UUID.nameUUIDFromBytes(
                        (descriptor.sourceIdentifier() + ":" + descriptor.fileName()).getBytes(StandardCharsets.UTF_8)))
                .visibility(descriptor.accessContext() == null ? DocumentVisibility.PUBLIC : descriptor.accessContext().visibility())
                .documentType(inferType(descriptor))
                .documentFamily(inferFamily(descriptor))
                .status(DocumentStatus.RECEIVED)
                .title(descriptor.fileName())
                .mimeType(descriptor.mediaType())
                .dedupHash(Integer.toHexString((descriptor.sourceIdentifier() + descriptor.fileName()).hashCode()))
                .build();
    }

    /** Mirrors the detector's coarse mapping: mail root → EMAIL, .txt → TEXT, else binary. */
    private DocumentType inferType(SourceDescriptor descriptor) {
        if (descriptor.sourceType() == SourceType.MAIL && "message/rfc822".equals(descriptor.mediaType())) {
            return DocumentType.EMAIL;
        }
        String fileName = descriptor.fileName() == null ? "" : descriptor.fileName().toLowerCase();
        if (fileName.endsWith(".txt")) {
            return DocumentType.TEXT;
        }
        return DocumentType.GENERIC_BINARY;
    }

    /** Mail-sourced descriptors belong to the MAIL family, everything else is GENERIC. */
    private DocumentFamily inferFamily(SourceDescriptor descriptor) {
        return descriptor.sourceType() == SourceType.MAIL ? DocumentFamily.MAIL : DocumentFamily.GENERIC;
    }
}

@ -0,0 +1,374 @@
package at.procon.dip.ingestion.integration;
import at.procon.dip.FixedPortPostgreSQLContainer;
import at.procon.dip.classification.detector.BasicMimeAndExtensionDocumentTypeDetector;
import at.procon.dip.classification.service.DocumentClassificationService;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentSource;
import at.procon.dip.domain.document.repository.DocumentContentRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRelationRepository;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.document.service.DocumentContentService;
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
import at.procon.dip.domain.document.service.DocumentRelationService;
import at.procon.dip.domain.document.service.DocumentRepresentationService;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.document.service.DocumentSourceService;
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
import at.procon.dip.extraction.impl.*;
import at.procon.dip.extraction.service.DocumentExtractionService;
import at.procon.dip.ingestion.adapter.MailDocumentIngestionAdapter;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.MailMessageExtractionService;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.normalization.impl.DefaultGenericTextRepresentationBuilder;
import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.service.attachment.PdfExtractionService;
import at.procon.ted.service.attachment.ZipExtractionService;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringBootConfiguration;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.boot.autoconfigure.jdbc.JdbcTemplateAutoConfiguration;
import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration;
import org.springframework.boot.autoconfigure.task.TaskSchedulingAutoConfiguration;
import org.springframework.boot.autoconfigure.transaction.TransactionAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.ServletWebServerFactoryAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Import;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
import org.springframework.test.context.DynamicPropertyRegistry;
import org.springframework.test.context.DynamicPropertySource;
import org.springframework.test.context.TestPropertySource;
import org.testcontainers.containers.PostgreSQLContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Full-stack integration test: ingests a bundle of real {@code .eml} files through
 * the actual {@code DocumentIngestionGateway} wired against a PostgreSQL
 * Testcontainer, then asserts on the persisted documents, sources, content rows,
 * relations, and text representations.
 *
 * <p>Vectorization and the other ingestion adapters are disabled via
 * {@code @TestPropertySource} so only the mail path is exercised.</p>
 */
@SpringBootTest(classes = MailBundleProcessingIntegrationTest.TestApplication.class)
@Testcontainers
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@TestPropertySource(properties = {
        "spring.jpa.hibernate.ddl-auto=create-drop",
        "spring.jpa.show-sql=false",
        "spring.jpa.open-in-view=false",
        "spring.jpa.properties.hibernate.default_schema=DOC",
        "ted.vectorization.enabled=false",
        "ted.generic-ingestion.enabled=true",
        "ted.generic-ingestion.mail-adapter-enabled=true",
        "ted.generic-ingestion.file-system-enabled=false",
        "ted.generic-ingestion.rest-upload-enabled=false",
        "ted.generic-ingestion.deduplicate-by-content-hash=false",
        "ted.generic-ingestion.expand-mail-zip-attachments=true",
        "ted.generic-ingestion.default-visibility=PUBLIC",
        "ted.generic-ingestion.mail-default-visibility=RESTRICTED",
        "ted.generic-ingestion.import-batch-id=test-mail-bundle",
        "ted.generic-ingestion.mail-import-batch-id=test-mail-bundle-mail"
})
class MailBundleProcessingIntegrationTest {

    // Fixed host port so the JDBC URL stays stable across the PER_CLASS lifecycle.
    @Container
    static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", 15432)
            .withDatabaseName("dip_test")
            .withUsername("test")
            .withPassword("test")
            .withInitScript("sql/create-doc-test-schemas.sql");

    static {
        // Start eagerly so @DynamicPropertySource can read the JDBC coordinates.
        postgres.start();
    }

    @DynamicPropertySource
    static void registerProperties(DynamicPropertyRegistry registry) {
        registry.add("spring.datasource.url", postgres::getJdbcUrl);
        registry.add("spring.datasource.username", postgres::getUsername);
        registry.add("spring.datasource.password", postgres::getPassword);
        registry.add("spring.datasource.driver-class-name", postgres::getDriverClassName);
    }

    @Autowired
    private DocumentIngestionGateway gateway;
    @Autowired
    private MailMessageExtractionService mailMessageExtractionService;
    @Autowired
    private DocumentRepository documentRepository;
    @Autowired
    private DocumentSourceRepository documentSourceRepository;
    @Autowired
    private DocumentContentRepository documentContentRepository;
    @Autowired
    private DocumentRelationRepository documentRelationRepository;
    @Autowired
    private DocumentTextRepresentationRepository documentTextRepresentationRepository;
    @Autowired
    private DocumentEmbeddingRepository documentEmbeddingRepository;
    @Autowired
    private DocumentEmbeddingModelRepository documentEmbeddingModelRepository;
    @Autowired
    private DocumentTenantRepository documentTenantRepository;

    /** Temp directory holding the extracted .eml sample bundle for the current test. */
    private Path bundleDirectory;

    @BeforeEach
    void setUp() throws Exception {
        // Cleanup runs BEFORE each test (not after) so that on failure the database
        // state is left in place for inspection.
        cleanupDatabase();
        bundleDirectory = Files.createTempDirectory("mail-bundle-");
        try (InputStream in = getClass().getResourceAsStream("/mail-sample-eml-bundle.zip")) {
            assertThat(in).isNotNull();
            unzip(in, bundleDirectory);
        }
    }

    @AfterEach
    void tearDown() throws Exception {
        // Only remove the temp bundle; DB cleanup is deferred to the next setUp().
        if (bundleDirectory != null && Files.exists(bundleDirectory)) {
            Files.walk(bundleDirectory)
                    .sorted(Comparator.reverseOrder())
                    .forEach(path -> {
                        try {
                            Files.deleteIfExists(path);
                        } catch (IOException ignored) {
                            // best-effort temp cleanup; leftover files are harmless
                        }
                    });
        }
    }

    @org.junit.jupiter.api.Timeout(120)
    @Test
    void processesEntireMailBundleThroughRealGatewayAndPersistsResults() throws Exception {
        List<Path> emlFiles = Files.walk(bundleDirectory)
                .filter(path -> path.getFileName().toString().endsWith(".eml"))
                .sorted()
                .toList();
        assertThat(emlFiles).hasSizeGreaterThanOrEqualTo(5);

        // Derive the expected document counts by parsing each mail independently,
        // then ingest it through the real gateway.
        int expectedRootDocuments = 0;
        int expectedAttachmentDocuments = 0;
        for (Path eml : emlFiles) {
            byte[] raw = Files.readAllBytes(eml);
            var parsed = mailMessageExtractionService.parse(raw);
            expectedRootDocuments++;
            expectedAttachmentDocuments += parsed.attachments().size();

            IngestionResult result = gateway.ingest(new SourceDescriptor(
                    null,
                    at.procon.dip.domain.document.SourceType.MAIL,
                    eml.getFileName().toString(),
                    eml.toString(),
                    eml.getFileName().toString(),
                    "message/rfc822",
                    raw,
                    null,
                    OffsetDateTime.now(),
                    OriginalContentStoragePolicy.STORE,
                    Map.of("title", eml.getFileName().toString())
            ));
            assertThat(result.documents()).isNotEmpty();
            assertThat(result.documents().get(0).documentType()).isEqualTo(DocumentType.MIME_MESSAGE);
        }

        // One document + one source per mail and per attachment; one relation per attachment.
        long totalDocuments = documentRepository.count();
        long totalSources = documentSourceRepository.count();
        long totalRelations = documentRelationRepository.count();
        assertThat(totalDocuments).isEqualTo(expectedRootDocuments + expectedAttachmentDocuments);
        assertThat(totalSources).isEqualTo(totalDocuments);
        assertThat(totalRelations).isEqualTo(expectedAttachmentDocuments);

        List<Document> allDocuments = documentRepository.findAll();
        long rootCount = allDocuments.stream().filter(d -> d.getDocumentType() == DocumentType.MIME_MESSAGE).count();
        assertThat(rootCount).isEqualTo(expectedRootDocuments);

        // Every root mail stores its raw MIME as ORIGINAL content.
        List<DocumentSource> allSources = documentSourceRepository.findAll();
        List<DocumentContent> allContent = documentContentRepository.findAll();
        long mimeOriginalCount = allContent.stream()
                .filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
                .filter(c -> c.getMimeType() != null && c.getMimeType().startsWith("message/rfc822"))
                .count();
        assertThat(mimeOriginalCount).isEqualTo(expectedRootDocuments);
        assertThat(documentTextRepresentationRepository.count()).isGreaterThanOrEqualTo(expectedRootDocuments);

        // PDF attachments must yield non-blank normalized text.
        List<UUID> pdfDocumentIds = allSources.stream()
                .filter(s -> s.getSourceFilename() != null && s.getSourceFilename().toLowerCase().endsWith(".pdf"))
                .map(s -> s.getDocument().getId())
                .toList();
        assertThat(pdfDocumentIds).isNotEmpty();
        assertThat(allContent.stream()
                .filter(c -> pdfDocumentIds.contains(c.getDocument().getId()))
                .anyMatch(c -> c.getContentRole() == ContentRole.NORMALIZED_TEXT && c.getTextContent() != null && !c.getTextContent().isBlank()))
                .isTrue();

        // Spreadsheet attachments (if any) must retain their ORIGINAL binary payload.
        List<UUID> spreadsheetIds = allSources.stream()
                .filter(s -> s.getSourceFilename() != null && (s.getSourceFilename().toLowerCase().endsWith(".xlsx") || s.getSourceFilename().toLowerCase().endsWith(".xls")))
                .map(s -> s.getDocument().getId())
                .toList();
        if (!spreadsheetIds.isEmpty()) {
            assertThat(allContent.stream()
                    .filter(c -> spreadsheetIds.contains(c.getDocument().getId()))
                    .filter(c -> c.getContentRole() == ContentRole.ORIGINAL)
                    .anyMatch(c -> c.getStorageType() == StorageType.DB_BINARY || c.getStorageType() == StorageType.EXTERNAL_REFERENCE))
                    .isTrue();
        }
    }

    @Test
    void processesSingleFilesystemMailAndPersistsAttachmentsAndRelations() throws Exception {
        Path sample = Files.walk(bundleDirectory)
                .filter(path -> path.getFileName().toString().equals("sample-mail-01-basic-reporting.eml"))
                .findFirst()
                .orElseThrow();
        byte[] raw = Files.readAllBytes(sample);
        var parsed = mailMessageExtractionService.parse(raw);

        IngestionResult result = gateway.ingest(new SourceDescriptor(
                null,
                at.procon.dip.domain.document.SourceType.MAIL,
                // Identifier kept in sync with the sample file selected above.
                "filesystem-sample-01",
                sample.toString(),
                sample.getFileName().toString(),
                "message/rfc822",
                raw,
                null,
                OffsetDateTime.now(),
                OriginalContentStoragePolicy.STORE,
                Map.of("title", sample.getFileName().toString())
        ));

        assertThat(result.documents()).hasSize(1 + parsed.attachments().size());
        assertThat(documentRepository.count()).isEqualTo(1 + parsed.attachments().size());
        assertThat(documentRelationRepository.count()).isEqualTo(parsed.attachments().size());

        List<DocumentSource> sources = documentSourceRepository.findAll();
        assertThat(sources).anyMatch(s -> sample.getFileName().toString().equals(s.getSourceFilename()));
        assertThat(sources).anyMatch(s -> s.getSourceFilename() != null && s.getSourceFilename().toLowerCase().endsWith(".pdf"));
        assertThat(sources).anyMatch(s -> s.getSourceFilename() != null && s.getSourceFilename().toLowerCase().endsWith(".csv"));
    }

    /** Deletes all rows in FK-safe order: children (relations/embeddings/content) before documents. */
    private void cleanupDatabase() {
        documentRelationRepository.deleteAll();
        documentEmbeddingRepository.deleteAll();
        documentTextRepresentationRepository.deleteAll();
        documentContentRepository.deleteAll();
        documentSourceRepository.deleteAll();
        documentRepository.deleteAll();
        documentEmbeddingModelRepository.deleteAll();
        documentTenantRepository.deleteAll();
    }

    /** Extracts a zip stream into {@code targetDir}, rejecting zip-slip entry names. */
    private static void unzip(InputStream inputStream, Path targetDir) throws IOException {
        try (ZipInputStream zis = new ZipInputStream(inputStream)) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                Path out = targetDir.resolve(entry.getName()).normalize();
                if (!out.startsWith(targetDir)) {
                    throw new IOException("Zip entry outside target dir: " + entry.getName());
                }
                if (entry.isDirectory()) {
                    Files.createDirectories(out);
                } else {
                    Files.createDirectories(out.getParent());
                    Files.copy(zis, out);
                }
                zis.closeEntry();
            }
        }
    }

    /**
     * Minimal Spring configuration: only the JPA/JDBC auto-configurations plus the
     * exact beans the mail ingestion pipeline needs (no web layer, no scheduling).
     */
    @SpringBootConfiguration
    @ImportAutoConfiguration({
            DataSourceAutoConfiguration.class,
            HibernateJpaAutoConfiguration.class,
            TransactionAutoConfiguration.class,
            JdbcTemplateAutoConfiguration.class
    })
    @EnableConfigurationProperties(TedProcessorProperties.class)
    @EntityScan(basePackages = {
            "at.procon.dip.domain.document.entity",
            "at.procon.dip.domain.tenant.entity"
    })
    @EnableJpaRepositories(basePackages = {
            "at.procon.dip.domain.document.repository",
            "at.procon.dip.domain.tenant.repository"
    })
    @Import({
            DocumentIngestionGateway.class,
            GenericDocumentImportService.class,
            MailDocumentIngestionAdapter.class,
            MailMessageExtractionService.class,
            ZipExtractionService.class,
            DocumentService.class,
            DocumentSourceService.class,
            DocumentContentService.class,
            DocumentRepresentationService.class,
            DocumentEmbeddingService.class,
            DocumentRelationService.class,
            DocumentClassificationService.class,
            BasicMimeAndExtensionDocumentTypeDetector.class,
            DocumentExtractionService.class,
            PlainTextDocumentExtractor.class,
            HtmlDocumentExtractor.class,
            PdfDocumentExtractor.class,
            BinaryPassThroughDocumentExtractor.class,
            MimeMessageDocumentExtractor.class,
            SpreadsheetDocumentExtractor.class,
            TextRepresentationBuildService.class,
            DefaultGenericTextRepresentationBuilder.class,
            PdfExtractionService.class,
            StructuredDocumentProcessingService.class,
    })
    static class TestApplication {
    }
}

@ -0,0 +1,66 @@
package at.procon.dip.ingestion.service;
import at.procon.dip.ingestion.service.MailMessageExtractionService.MailAttachment;
import at.procon.dip.testsupport.MailBundleTestSupport;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Parsing tests for {@code MailMessageExtractionService} driven by the sample
 * .eml validation bundle extracted to a temp directory by
 * {@code MailBundleTestSupport}.
 */
class MailMessageExtractionServiceBundleTest {
// Root of the extracted bundle; populated once per class in extractBundle().
private static Path bundleRoot;
private final MailMessageExtractionService service = new MailMessageExtractionService();
@BeforeAll
static void extractBundle() throws Exception {
bundleRoot = MailBundleTestSupport.extractBundleToTempDir();
}
// Runs once per sample mail name supplied by MailBundleTestSupport#bundleMailNames.
@ParameterizedTest(name = "parse {0}")
@MethodSource("at.procon.dip.testsupport.MailBundleTestSupport#bundleMailNames")
void parse_should_extract_expected_attachments_from_filesystem_bundle(String fileName) throws Exception {
Path eml = bundleRoot.resolve("eml").resolve(fileName);
byte[] rawMime = Files.readAllBytes(eml);
MailMessageExtractionService.ParsedMailMessage parsed = service.parse(rawMime);
assertNotNull(parsed.subject(), "subject should be parsed for " + fileName);
assertNotNull(parsed.receivedAt(), "receivedAt should be parsed for " + fileName);
assertFalse(parsed.attachments().isEmpty(), "attachments should be extracted for " + fileName);
// Attachment names (and their order) must match the curated expectations.
List<String> actualNames = parsed.attachments().stream().map(MailAttachment::fileName).toList();
assertEquals(MailBundleTestSupport.EXPECTED_ATTACHMENT_NAMES.get(fileName), actualNames,
"attachment filenames should match validation bundle for " + fileName);
}
@Test
void parse_should_preserve_utf8_attachment_filenames() throws Exception {
Path eml = bundleRoot.resolve("eml").resolve("sample-mail-04-utf8-filenames.eml");
byte[] rawMime = Files.readAllBytes(eml);
MailMessageExtractionService.ParsedMailMessage parsed = service.parse(rawMime);
// Non-ASCII attachment filenames must survive parsing unchanged.
List<String> actualNames = parsed.attachments().stream().map(MailAttachment::fileName).toList();
assertEquals(List.of("prüfbericht.pdf", "данни.xlsx", "überblick.csv"), actualNames);
// NOTE(review): the "|| !isBlank()" fallback makes contains("UTF") effectively
// optional for any non-empty body — consider tightening this assertion.
assertTrue(parsed.textBody().contains("UTF") || !parsed.textBody().isBlank(),
"UTF-8 sample should produce a readable body");
}
@Test
void parse_should_keep_octet_stream_attachments_in_bundle() throws Exception {
Path eml = bundleRoot.resolve("eml").resolve("sample-mail-05-generic-octet-stream.eml");
byte[] rawMime = Files.readAllBytes(eml);
MailMessageExtractionService.ParsedMailMessage parsed = service.parse(rawMime);
// Attachments declared as application/octet-stream must be kept, not dropped.
assertEquals(3, parsed.attachments().size());
assertTrue(parsed.attachments().stream().allMatch(a -> a.contentType() != null && a.contentType().contains("application/octet-stream")));
}
}

@ -0,0 +1,65 @@
package at.procon.dip.testsupport;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import org.junit.jupiter.api.Named;
/**
 * Shared helpers for tests that consume the sample .eml validation bundle:
 * extracts the zipped bundle into a temp directory and publishes the expected
 * attachment names for each sample mail.
 */
public final class MailBundleTestSupport {

    /** Expected attachment file names (in order) per sample mail in the bundle. */
    public static final Map<String, List<String>> EXPECTED_ATTACHMENT_NAMES = Map.of(
            "sample-mail-01-basic-reporting.eml", List.of("sample-report.xlsx", "invoice-demo.pdf", "contacts.csv"),
            "sample-mail-02-office-and-text.eml", List.of("purchase-order.docx", "readme.txt", "metadata.json", "payload.xml"),
            "sample-mail-03-inline-image-and-files.eml", List.of("inline-logo.png", "overview.pdf", "contacts.csv"),
            "sample-mail-04-utf8-filenames.eml", List.of("prüfbericht.pdf", "данни.xlsx", "überblick.csv"),
            "sample-mail-05-generic-octet-stream.eml", List.of("generic-report.xlsx", "generic-invoice.pdf", "generic-data.csv")
    );

    private MailBundleTestSupport() {
        // static utility holder — no instances
    }

    /**
     * Copies the classpath bundle zip into a fresh temp directory, unpacks it
     * there, and returns the path of the extracted {@code eml_samples} directory.
     *
     * @throws IOException if the resource is missing or extraction fails
     */
    public static Path extractBundleToTempDir() throws IOException {
        Path workDir = Files.createTempDirectory("mail-bundle-");
        InputStream bundle = MailBundleTestSupport.class.getResourceAsStream("/mail/sample-eml-bundle.zip");
        if (bundle == null) {
            throw new IOException("Missing test resource /mail/sample-eml-bundle.zip");
        }
        try (bundle) {
            Path archive = workDir.resolve("sample-eml-bundle.zip");
            Files.copy(bundle, archive, StandardCopyOption.REPLACE_EXISTING);
            unzip(archive, workDir);
        }
        return workDir.resolve("eml_samples");
    }

    /** Sample mail file names in sorted order — usable as a {@code @MethodSource}. */
    public static Stream<String> bundleMailNames() {
        return EXPECTED_ATTACHMENT_NAMES.keySet().stream().sorted();
    }

    /** Same names wrapped as {@link Named} for nicer parameterized-test display labels. */
    public static Stream<Named<String>> namedBundleMailNames() {
        return bundleMailNames().map(name -> Named.of(name, name));
    }

    /** Unpacks {@code zipPath} into {@code targetDir}, rejecting zip-slip entry names. */
    private static void unzip(Path zipPath, Path targetDir) throws IOException {
        try (java.util.zip.ZipInputStream archiveStream =
                new java.util.zip.ZipInputStream(Files.newInputStream(zipPath))) {
            for (java.util.zip.ZipEntry current = archiveStream.getNextEntry();
                    current != null;
                    current = archiveStream.getNextEntry()) {
                Path destination = targetDir.resolve(current.getName()).normalize();
                if (!destination.startsWith(targetDir)) {
                    throw new IOException("Zip slip attempt: " + current.getName());
                }
                if (current.isDirectory()) {
                    Files.createDirectories(destination);
                } else {
                    Files.createDirectories(destination.getParent());
                    Files.copy(archiveStream, destination, StandardCopyOption.REPLACE_EXISTING);
                }
                archiveStream.closeEntry();
            }
        }
    }
}

@ -0,0 +1,28 @@
From: Sender Example <sender@example.com>
To: Receiver Example <receiver@example.com>
Subject: Sample mail with filesystem-loaded attachments
Date: Tue, 18 Mar 2026 15:27:59 +0100
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="boundary42"
--boundary42
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
Hello from the filesystem-backed sample message.
This mail has one text attachment and one old Excel attachment.
--boundary42
Content-Type: text/plain; charset=UTF-8; name="notes.txt"
Content-Disposition: attachment; filename="notes.txt"
Content-Transfer-Encoding: quoted-printable
These are the attachment notes.
--boundary42
Content-Type: application/vnd.ms-excel; name="legacy.xls"
Content-Disposition: attachment; filename="legacy.xls"
Content-Transfer-Encoding: base64
0M8R4KGxGuEAAAAAAAAAAEZBS0VYTFM=
--boundary42--

@ -0,0 +1,2 @@
-- Test bootstrap script (run via Testcontainers withInitScript) that creates the
-- schemas the integration tests expect before Hibernate generates tables:
-- DOC is the configured hibernate.default_schema; TED presumably holds the
-- TED-specific tables — confirm against the main Flyway migrations.
CREATE SCHEMA IF NOT EXISTS DOC;
CREATE SCHEMA IF NOT EXISTS TED;
Loading…
Cancel
Save