diff --git a/docs/architecture/PHASE4_1_ADAPTERS.md b/docs/architecture/PHASE4_1_ADAPTERS.md
new file mode 100644
index 0000000..14e9944
--- /dev/null
+++ b/docs/architecture/PHASE4_1_ADAPTERS.md
@@ -0,0 +1,28 @@
+# Phase 4.1 adapter extensions
+
+## Added adapters
+
+### TED package adapter
+
+- Source type: `TED_PACKAGE`
+- Root access: `PUBLIC`, no owner tenant
+- Root document type: `TED_PACKAGE`
+- Child source type: `PACKAGE_CHILD`
+- Child relation: `EXTRACTED_FROM`
+
+The adapter imports the package artifact plus its XML members into the generic `DOC` model.
+It does not replace the existing legacy TED package processing path; instead it complements it, so the later legacy TED parsing step can still enrich the same canonical child documents into proper `TED_NOTICE` projections by dedup hash.
+
+### Mail/document adapter
+
+- Source type: `MAIL`
+- Root document type: `MIME_MESSAGE`
+- Child relation: `ATTACHMENT_OF`
+- Access: configurable via `mail-default-owner-tenant-key` and `mail-default-visibility`
+
+The adapter stores the message body as the semantic root text and imports attachments as child documents. ZIP attachments can optionally be expanded recursively.
+
+## Deduplication
+
+Phase 4 deduplication by content hash is refined so the same payload is only deduplicated within the same access scope (`visibility` + `owner tenant`).
+This prevents private documents from different tenants from being merged into one canonical document accidentally.
diff --git a/pom.xml b/pom.xml
index 007f25d..2106724 100644
--- a/pom.xml
+++ b/pom.xml
@@ -224,6 +224,14 @@
postgresql
test
+        <dependency>
+            <groupId>org.flywaydb</groupId>
+            <artifactId>flyway-core</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.flywaydb</groupId>
+            <artifactId>flyway-database-postgresql</artifactId>
+        </dependency>
diff --git a/src/main/java/at/procon/dip/README_PHASE4_1.md b/src/main/java/at/procon/dip/README_PHASE4_1.md
new file mode 100644
index 0000000..b4796fb
--- /dev/null
+++ b/src/main/java/at/procon/dip/README_PHASE4_1.md
@@ -0,0 +1,34 @@
+# Phase 4.1 – TED package and mail/document adapters
+
+This phase extends the generic DOC ingestion SPI with two richer adapters:
+
+- `TedPackageDocumentIngestionAdapter`
+- `MailDocumentIngestionAdapter`
+
+## TED package adapter
+- imports the package artifact itself as a public DOC document
+- expands the `.tar.gz` package into XML child payloads
+- imports each child XML as a generic DOC child document
+- links children to the package root via `EXTRACTED_FROM`
+- keeps the existing legacy TED package processing path intact
+
+## Mail/document adapter
+- imports the MIME message as a DOC document
+- extracts subject/from/to/body into the mail root semantic text
+- imports attachments as child DOC documents
+- links attachments via `ATTACHMENT_OF`
+- optionally expands ZIP attachments recursively
+
+## Access semantics
+- TED packages and TED XML children are imported as `PUBLIC` with no owner tenant
+- mail documents use a dedicated default mail access context (`mail-default-owner-tenant-key`, `mail-default-visibility`)
+- deduplication is access-scope aware so private content is not merged across different tenants
+
+Additional note:
+- wrapper/container documents (for example TED package roots or ZIP wrapper documents expanded into child documents) can skip persistence of ORIGINAL content via `ted.generic-ingestion.store-original-content-for-wrapper-documents=false`. Adapters can override that default per imported document through `SourceDescriptor.originalContentStoragePolicy` (`STORE` / `SKIP` / `DEFAULT`), while still keeping metadata, derived representations and child relations.
+
+- when original content storage is skipped for a document, GenericDocumentImportService now also skips extraction, derived-content persistence, representation building, and embedding queueing for that document.
+
+
+Schema note:
+- `V8__doc_phase4_1_expand_document_and_source_types.sql` expands the generic `DOC` document/source type domain for `TED_PACKAGE` and `PACKAGE_CHILD`, and also repairs older local/dev schemas that used CHECK constraints instead of PostgreSQL ENUM types.
diff --git a/src/main/java/at/procon/dip/domain/document/DocumentType.java b/src/main/java/at/procon/dip/domain/document/DocumentType.java
index f6a651b..1ff0720 100644
--- a/src/main/java/at/procon/dip/domain/document/DocumentType.java
+++ b/src/main/java/at/procon/dip/domain/document/DocumentType.java
@@ -4,6 +4,7 @@ package at.procon.dip.domain.document;
* Canonical technical document type.
*/
public enum DocumentType {
+ TED_PACKAGE,
TED_NOTICE,
EMAIL,
MIME_MESSAGE,
diff --git a/src/main/java/at/procon/dip/domain/document/SourceType.java b/src/main/java/at/procon/dip/domain/document/SourceType.java
index d53b841..ccb0e62 100644
--- a/src/main/java/at/procon/dip/domain/document/SourceType.java
+++ b/src/main/java/at/procon/dip/domain/document/SourceType.java
@@ -5,6 +5,7 @@ package at.procon.dip.domain.document;
*/
public enum SourceType {
TED_PACKAGE,
+ PACKAGE_CHILD,
MAIL,
FILE_SYSTEM,
REST_UPLOAD,
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java
index a039470..c1cf504 100644
--- a/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java
@@ -13,4 +13,6 @@ public interface DocumentRelationRepository extends JpaRepository findByChildDocument_Id(UUID childDocumentId);
List findByParentDocument_IdAndRelationType(UUID parentDocumentId, RelationType relationType);
+
+ boolean existsByParentDocument_IdAndChildDocument_IdAndRelationType(UUID parentDocumentId, UUID childDocumentId, RelationType relationType);
}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java
index 6746b75..3e5192c 100644
--- a/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java
@@ -15,6 +15,8 @@ public interface DocumentRepository extends JpaRepository {
Optional findByDedupHash(String dedupHash);
+ List findAllByDedupHash(String dedupHash);
+
boolean existsByDedupHash(String dedupHash);
List findByDocumentType(DocumentType documentType);
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java
index f9c1bb1..2c59ff0 100644
--- a/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java
@@ -28,6 +28,19 @@ public class DocumentRelationService {
return relationRepository.save(relation);
}
+    public DocumentRelation ensureRelation(CreateDocumentRelationCommand command) { // idempotent variant of createRelation: never creates a duplicate (parent, child, type) edge
+        boolean exists = relationRepository.existsByParentDocument_IdAndChildDocument_IdAndRelationType(
+                command.parentDocumentId(), command.childDocumentId(), command.relationType()); // cheap existence probe for the common "not yet linked" case
+        if (exists) {
+            return relationRepository.findByParentDocument_IdAndRelationType(command.parentDocumentId(), command.relationType())
+                    .stream()
+                    .filter(rel -> rel.getChildDocument() != null && command.childDocumentId().equals(rel.getChildDocument().getId()))
+                    .findFirst()
+                    .orElseGet(() -> createRelation(command)); // exists-probe raced with a concurrent delete: fall back to creating the relation
+        }
+        return createRelation(command);
+    }
+
@Transactional(readOnly = true)
public List findChildren(UUID parentDocumentId) {
return relationRepository.findByParentDocument_Id(parentDocumentId);
diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
index b6e3cb7..081b744 100644
--- a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
+++ b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
@@ -15,7 +15,6 @@ import at.procon.ted.model.entity.ProcurementLot;
import at.procon.ted.service.TedPhase2GenericDocumentService;
import java.util.ArrayList;
import java.util.List;
-import java.util.Optional;
import java.util.UUID;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@@ -63,8 +62,7 @@ public class TedNoticeProjectionService {
Document genericDocument = documentRepository.findById(resolvedDocumentId)
.orElseThrow(() -> new IllegalArgumentException("Unknown DOC document id: " + finalResolvedDocumentId));
- TedNoticeProjection projection = projectionRepository.findByLegacyProcurementDocumentId(legacyDocument.getId())
- .or(() -> projectionRepository.findByDocument_Id(genericDocument.getId()))
+ TedNoticeProjection projection = projectionRepository.findByDocument_Id(genericDocument.getId())
.orElseGet(TedNoticeProjection::new);
mapProjection(projection, genericDocument, legacyDocument);
@@ -72,19 +70,13 @@ public class TedNoticeProjectionService {
replaceLots(projection, legacyDocument.getLots());
replaceOrganizations(projection, legacyDocument.getOrganizations());
- log.debug("Phase 3 TED projection ensured for legacy {} -> projection {} / doc {}",
- legacyDocument.getId(), projection.getId(), genericDocument.getId());
+ log.debug("Phase 3 TED projection ensured for generic doc {} -> projection {} (noticeId={}, publicationId={})",
+ genericDocument.getId(), projection.getId(), legacyDocument.getNoticeId(), legacyDocument.getPublicationId());
return projection.getId();
}
- @Transactional(readOnly = true)
- public Optional findByLegacyProcurementDocumentId(UUID legacyDocumentId) {
- return projectionRepository.findByLegacyProcurementDocumentId(legacyDocumentId);
- }
-
private void mapProjection(TedNoticeProjection projection, Document genericDocument, ProcurementDocument legacyDocument) {
projection.setDocument(genericDocument);
- projection.setLegacyProcurementDocumentId(legacyDocument.getId());
projection.setNoticeId(legacyDocument.getNoticeId());
projection.setPublicationId(legacyDocument.getPublicationId());
projection.setNoticeUrl(legacyDocument.getNoticeUrl());
diff --git a/src/main/java/at/procon/dip/extraction/impl/MimeMessageDocumentExtractor.java b/src/main/java/at/procon/dip/extraction/impl/MimeMessageDocumentExtractor.java
new file mode 100644
index 0000000..9b4c5f0
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/impl/MimeMessageDocumentExtractor.java
@@ -0,0 +1,56 @@
+package at.procon.dip.extraction.impl;
+
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.extraction.spi.DocumentExtractor;
+import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
+import at.procon.dip.extraction.spi.ExtractionRequest;
+import at.procon.dip.extraction.spi.ExtractionResult;
+import at.procon.dip.ingestion.util.DocumentImportSupport;
+import java.nio.charset.StandardCharsets;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import org.springframework.stereotype.Component;
+import org.springframework.util.StringUtils;
+
+@Component
+public class MimeMessageDocumentExtractor implements DocumentExtractor { // extracts the normalized text body of MIME_MESSAGE/EMAIL documents
+
+    @Override
+    public boolean supports(DocumentType documentType, String mimeType) {
+        return documentType == DocumentType.MIME_MESSAGE || documentType == DocumentType.EMAIL;
+    }
+
+    @Override
+    public ExtractionResult extract(ExtractionRequest extractionRequest) {
+        String text = extractionRequest.textContent();
+        if (!StringUtils.hasText(text) && extractionRequest.binaryContent() != null) { // fall back to decoding the raw bytes as UTF-8
+            text = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
+        }
+        text = text == null ? null : text.replace("\r\n", "\n").replace('\r', '\n').trim(); // normalize CRLF/CR line endings before persisting
+
+        Map<String, String> attributes = new LinkedHashMap<>(); // generics restored; original diff text was garbled to a raw Map
+        if (extractionRequest.sourceDescriptor().attributes() != null) {
+            attributes.putAll(extractionRequest.sourceDescriptor().attributes());
+        }
+        String title = DocumentImportSupport.firstNonBlank(extractionRequest.sourceDescriptor().attributes(), "title", "subject");
+        if (!StringUtils.hasText(title)) {
+            title = extractionRequest.sourceDescriptor().fileName();
+        }
+        if (StringUtils.hasText(title)) {
+            attributes.put("title", title);
+        }
+
+        if (!StringUtils.hasText(text)) { // no extractable body: still emit the structured payload, plus a warning
+            return new ExtractionResult(Map.of(), List.of(new ExtractedStructuredPayload("mail-message", attributes)),
+                    List.of("Mail message did not contain extractable text body"));
+        }
+
+        return new ExtractionResult(
+                Map.of(ContentRole.NORMALIZED_TEXT, text),
+                List.of(new ExtractedStructuredPayload("mail-message", attributes)),
+                List.of()
+        );
+    }
+}
diff --git a/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java b/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java
index 4eb6f07..4dfaa9a 100644
--- a/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java
+++ b/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java
@@ -22,7 +22,7 @@ public class PdfDocumentExtractor implements DocumentExtractor {
@Override
public boolean supports(DocumentType documentType, String mimeType) {
- return documentType == DocumentType.PDF || pdfExtractionService.canHandle("dummy.pdf", mimeType);
+ return documentType == DocumentType.PDF;
}
@Override
diff --git a/src/main/java/at/procon/dip/extraction/impl/TedPackageManifestExtractor.java b/src/main/java/at/procon/dip/extraction/impl/TedPackageManifestExtractor.java
new file mode 100644
index 0000000..e6873eb
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/impl/TedPackageManifestExtractor.java
@@ -0,0 +1,49 @@
+package at.procon.dip.extraction.impl;
+
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.extraction.spi.DocumentExtractor;
+import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
+import at.procon.dip.extraction.spi.ExtractionRequest;
+import at.procon.dip.extraction.spi.ExtractionResult;
+import at.procon.dip.ingestion.util.DocumentImportSupport;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import org.springframework.stereotype.Component;
+import org.springframework.util.StringUtils;
+
+//@Component // NOTE(review): registration deliberately(?) disabled — TED package roots are imported with OriginalContentStoragePolicy.SKIP, which bypasses extraction; confirm intent or remove the class
+public class TedPackageManifestExtractor implements DocumentExtractor { // builds a minimal manifest text for TED_PACKAGE root documents
+
+    @Override
+    public boolean supports(DocumentType documentType, String mimeType) {
+        return documentType == DocumentType.TED_PACKAGE;
+    }
+
+    @Override
+    public ExtractionResult extract(ExtractionRequest extractionRequest) {
+        String manifest = extractionRequest.textContent();
+        if (!StringUtils.hasText(manifest)) { // no expansion-provided manifest: synthesize one from the source identifier
+            manifest = "TED package: " + extractionRequest.sourceDescriptor().sourceIdentifier();
+        }
+
+        Map<String, String> attributes = new LinkedHashMap<>(); // generics restored; original diff text was garbled to a raw Map
+        if (extractionRequest.sourceDescriptor().attributes() != null) {
+            attributes.putAll(extractionRequest.sourceDescriptor().attributes());
+        }
+        String title = DocumentImportSupport.firstNonBlank(extractionRequest.sourceDescriptor().attributes(), "title", "packageId");
+        if (!StringUtils.hasText(title)) {
+            title = extractionRequest.sourceDescriptor().fileName();
+        }
+        if (StringUtils.hasText(title)) {
+            attributes.put("title", title);
+        }
+
+        return new ExtractionResult(
+                Map.of(ContentRole.NORMALIZED_TEXT, manifest),
+                List.of(new ExtractedStructuredPayload("ted-package-manifest", attributes)),
+                List.of()
+        );
+    }
+}
diff --git a/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java
new file mode 100644
index 0000000..70ba90b
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java
@@ -0,0 +1,170 @@
+package at.procon.dip.ingestion.adapter;
+
+import at.procon.dip.domain.access.DocumentAccessContext;
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.RelationType;
+import at.procon.dip.domain.document.SourceType;
+import at.procon.dip.domain.tenant.TenantRef;
+import at.procon.dip.domain.document.service.DocumentRelationService;
+import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
+import at.procon.dip.ingestion.dto.ImportedDocumentResult;
+import at.procon.dip.ingestion.service.GenericDocumentImportService;
+import at.procon.dip.ingestion.service.MailMessageExtractionService;
+import at.procon.dip.ingestion.service.MailMessageExtractionService.MailAttachment;
+import at.procon.dip.ingestion.service.MailMessageExtractionService.ParsedMailMessage;
+import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
+import at.procon.dip.ingestion.spi.IngestionResult;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+import at.procon.dip.ingestion.util.DocumentImportSupport;
+import at.procon.ted.config.TedProcessorProperties;
+import at.procon.ted.service.attachment.AttachmentExtractor;
+import at.procon.ted.service.attachment.ZipExtractionService;
+import java.time.OffsetDateTime;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Component;
+
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter { // imports a MIME message as a DOC root and its attachments as ATTACHMENT_OF children
+
+    private final TedProcessorProperties properties;
+    private final GenericDocumentImportService importService;
+    private final MailMessageExtractionService mailExtractionService;
+    private final DocumentRelationService relationService;
+    private final ZipExtractionService zipExtractionService;
+
+    @Override
+    public boolean supports(SourceDescriptor sourceDescriptor) {
+        return sourceDescriptor.sourceType() == SourceType.MAIL
+                && properties.getGenericIngestion().isEnabled()
+                && properties.getGenericIngestion().isMailAdapterEnabled();
+    }
+
+    @Override
+    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
+        byte[] rawMime = sourceDescriptor.binaryContent();
+        if (rawMime == null || rawMime.length == 0) {
+            throw new IllegalArgumentException("Mail adapter requires raw MIME bytes");
+        }
+        ParsedMailMessage parsed = mailExtractionService.parse(rawMime);
+        DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null ? defaultMailAccessContext() : sourceDescriptor.accessContext();
+
+        Map<String, String> rootAttributes = new LinkedHashMap<>(sourceDescriptor.attributes() == null ? Map.of() : sourceDescriptor.attributes()); // generics restored (diff garbled raw Map)
+        if (parsed.subject() != null) rootAttributes.put("subject", parsed.subject());
+        if (parsed.from() != null) rootAttributes.put("from", parsed.from());
+        if (!parsed.recipients().isEmpty()) rootAttributes.put("to", String.join(", ", parsed.recipients()));
+        rootAttributes.putIfAbsent("title", parsed.subject() != null ? parsed.subject() : sourceDescriptor.fileName());
+        rootAttributes.put("attachmentCount", Integer.toString(parsed.attachments().size()));
+        rootAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId());
+
+        ImportedDocumentResult rootResult = importService.importDocument(new SourceDescriptor(
+                accessContext,
+                SourceType.MAIL,
+                sourceDescriptor.sourceIdentifier(),
+                sourceDescriptor.sourceUri(),
+                sourceDescriptor.fileName() != null ? sourceDescriptor.fileName() : fallbackMailFileName(parsed),
+                "message/rfc822",
+                rawMime,
+                mailExtractionService.serializeMessage(parsed),
+                parsed.receivedAt() == null ? OffsetDateTime.now() : parsed.receivedAt(),
+                OriginalContentStoragePolicy.STORE,
+                rootAttributes
+        ));
+
+        List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents = new ArrayList<>(); // element type inferred from toCanonicalMetadata() — TODO confirm; qualified inline to keep the import block untouched
+        List<String> warnings = new ArrayList<>(rootResult.warnings());
+        documents.add(rootResult.document().toCanonicalMetadata());
+
+        int sortOrder = 0;
+        for (MailAttachment attachment : parsed.attachments()) {
+            importAttachment(rootResult.document().getId(), accessContext, sourceDescriptor, attachment, documents, warnings, ++sortOrder, 0);
+        }
+
+        return new IngestionResult(documents, warnings);
+    }
+
+    private void importAttachment(java.util.UUID parentDocumentId, DocumentAccessContext accessContext, SourceDescriptor parentSource,
+                                  MailAttachment attachment, List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents,
+                                  List<String> warnings, int sortOrder, int depth) {
+        boolean expandableWrapper = properties.getGenericIngestion().isExpandMailZipAttachments()
+                && zipExtractionService.canHandle(attachment.fileName(), attachment.contentType());
+
+        Map<String, String> attachmentAttributes = new LinkedHashMap<>(); // generics restored (diff garbled raw Map)
+        attachmentAttributes.put("title", attachment.fileName());
+        attachmentAttributes.put("mailSourceIdentifier", parentSource.sourceIdentifier());
+        attachmentAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId());
+        if (expandableWrapper) {
+            attachmentAttributes.put("wrapperDocument", Boolean.TRUE.toString());
+        }
+
+        ImportedDocumentResult attachmentResult = importService.importDocument(new SourceDescriptor(
+                accessContext,
+                SourceType.MAIL,
+                parentSource.sourceIdentifier() + ":attachment:" + depth + ":" + attachment.fileName(),
+                parentSource.sourceUri(),
+                attachment.fileName(),
+                DocumentImportSupport.normalizeMediaType(attachment.contentType()),
+                attachment.data(),
+                previewTextIfLikelyText(attachment),
+                parentSource.receivedAt() == null ? OffsetDateTime.now() : parentSource.receivedAt(),
+                expandableWrapper ? OriginalContentStoragePolicy.SKIP : OriginalContentStoragePolicy.STORE,
+                attachmentAttributes
+        ));
+        documents.add(attachmentResult.document().toCanonicalMetadata());
+        warnings.addAll(attachmentResult.warnings());
+        RelationType relationType = depth > 0 || attachment.path() != null ? RelationType.EXTRACTED_FROM : RelationType.ATTACHMENT_OF; // ZIP members link as EXTRACTED_FROM, direct attachments as ATTACHMENT_OF
+        relationService.ensureRelation(new CreateDocumentRelationCommand(
+                parentDocumentId, attachmentResult.document().getId(), relationType, sortOrder, attachment.fileName()));
+
+        if (expandableWrapper) {
+            AttachmentExtractor.ExtractionResult zipResult = zipExtractionService.extract(attachment.data(), attachment.fileName(), attachment.contentType());
+            if (!zipResult.success()) { // best effort: a broken archive keeps the wrapper document and records a warning
+                warnings.add("ZIP attachment extraction failed for " + attachment.fileName() + ": " + zipResult.errorMessage());
+                return;
+            }
+            int childSort = 0;
+            for (AttachmentExtractor.ChildAttachment child : zipResult.childAttachments()) {
+                importAttachment(attachmentResult.document().getId(), accessContext, parentSource,
+                        new MailAttachment(child.filename(), child.contentType(), child.data(), child.data().length, child.pathInArchive()),
+                        documents, warnings, ++childSort, depth + 1);
+            }
+        }
+    }
+
+    private String fallbackMailFileName(ParsedMailMessage parsed) { // derive a filesystem-safe .eml name when the descriptor has no file name
+        String subject = parsed.subject() == null || parsed.subject().isBlank() ? "mail-message" : parsed.subject().replaceAll("[^A-Za-z0-9._-]", "_");
+        return subject + ".eml";
+    }
+
+    private DocumentAccessContext defaultMailAccessContext() { // mail-specific owner/visibility defaults, falling back to the generic ingestion defaults
+        String tenantKey = properties.getGenericIngestion().getMailDefaultOwnerTenantKey();
+        if (tenantKey == null || tenantKey.isBlank()) {
+            tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
+        }
+        DocumentVisibility visibility = properties.getGenericIngestion().getMailDefaultVisibility();
+        TenantRef tenant = (tenantKey == null || tenantKey.isBlank()) ? null : new TenantRef(null, tenantKey, tenantKey);
+        if (tenant == null && visibility == DocumentVisibility.TENANT) { // TENANT visibility is meaningless without an owner tenant
+            visibility = DocumentVisibility.RESTRICTED;
+        }
+        return new DocumentAccessContext(tenant, visibility);
+    }
+
+    private String previewTextIfLikelyText(MailAttachment attachment) { // returns a text preview only for attachments that look textual, else null
+        String mime = DocumentImportSupport.normalizeMediaType(attachment.contentType());
+        if (DocumentImportSupport.isLikelyTextMime(mime)) {
+            return attachment.safeTextPreview();
+        }
+        String ext = DocumentImportSupport.extensionOf(attachment.fileName());
+        if ("txt".equals(ext) || "xml".equals(ext) || "html".equals(ext) || "htm".equals(ext) || "md".equals(ext)) {
+            return attachment.safeTextPreview();
+        }
+        return null;
+    }
+}
diff --git a/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java
new file mode 100644
index 0000000..fbbd535
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java
@@ -0,0 +1,130 @@
+package at.procon.dip.ingestion.adapter;
+
+import at.procon.dip.domain.access.DocumentAccessContext;
+import at.procon.dip.domain.document.RelationType;
+import at.procon.dip.domain.document.SourceType;
+import at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.service.DocumentRelationService;
+import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
+import at.procon.dip.ingestion.dto.ImportedDocumentResult;
+import at.procon.dip.ingestion.service.GenericDocumentImportService;
+import at.procon.dip.ingestion.service.TedPackageExpansionService;
+import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
+import at.procon.dip.ingestion.spi.IngestionResult;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+import at.procon.ted.config.TedProcessorProperties;
+import java.nio.charset.StandardCharsets;
+import java.time.OffsetDateTime;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Component;
+import org.springframework.transaction.annotation.Transactional; // NOTE(review): imported but no @Transactional annotation is visible below — confirm whether ingest() was meant to be transactional
+
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdapter { // imports a TED tar.gz as a wrapper DOC root plus EXTRACTED_FROM XML children
+
+    private final TedProcessorProperties properties;
+    private final GenericDocumentImportService importService;
+    private final TedPackageExpansionService expansionService;
+    private final DocumentRelationService relationService;
+
+    @Override
+    public boolean supports(SourceDescriptor sourceDescriptor) {
+        return sourceDescriptor.sourceType() == SourceType.TED_PACKAGE
+                && properties.getGenericIngestion().isEnabled()
+                && properties.getGenericIngestion().isTedPackageAdapterEnabled();
+    }
+
+    @Override
+    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
+        byte[] packageBytes = sourceDescriptor.binaryContent();
+        if (packageBytes == null || packageBytes.length == 0) {
+            throw new IllegalArgumentException("TED package adapter requires tar.gz bytes");
+        }
+
+        TedPackageExpansionService.TedPackageExpansionResult expanded = expansionService.expand(packageBytes);
+        Map<String, String> rootAttributes = new LinkedHashMap<>(sourceDescriptor.attributes() == null ? Map.of() : sourceDescriptor.attributes()); // generics restored (diff garbled raw Map)
+        rootAttributes.putIfAbsent("packageId", sourceDescriptor.sourceIdentifier());
+        rootAttributes.putIfAbsent("title", sourceDescriptor.fileName() != null ? sourceDescriptor.fileName() : sourceDescriptor.sourceIdentifier());
+        rootAttributes.put("xmlEntryCount", Integer.toString(expanded.entries().size()));
+        rootAttributes.put("wrapperDocument", Boolean.TRUE.toString());
+        rootAttributes.put("importBatchId", properties.getGenericIngestion().getTedPackageImportBatchId());
+
+        ImportedDocumentResult packageDocument = importService.importDocument(new SourceDescriptor(
+                sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(),
+                SourceType.TED_PACKAGE,
+                sourceDescriptor.sourceIdentifier(),
+                sourceDescriptor.sourceUri(),
+                sourceDescriptor.fileName(),
+                sourceDescriptor.mediaType() == null ? "application/gzip" : sourceDescriptor.mediaType(),
+                packageBytes,
+                expanded.manifestText(),
+                sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt(),
+                OriginalContentStoragePolicy.SKIP,
+                rootAttributes
+        ));
+
+        List<String> warnings = new ArrayList<>(packageDocument.warnings());
+        List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents = new ArrayList<>(); // element type inferred from toCanonicalMetadata() — TODO confirm; qualified inline to keep the import block untouched
+        documents.add(packageDocument.document().toCanonicalMetadata());
+
+        int sortOrder = 0;
+        for (TedPackageExpansionService.TedPackageEntry entry : expanded.entries()) {
+            sortOrder++;
+            String childUri = "tedpkg://" + sourceDescriptor.sourceIdentifier() + "/" + entry.archivePath();
+            String childIdentifier = sourceDescriptor.sourceIdentifier() + ":" + entry.archivePath();
+            String xmlContent = resolveXmlContent(entry);
+
+            Map<String, String> childAttributes = new LinkedHashMap<>(); // generics restored (diff garbled raw Map)
+            childAttributes.put("documentTypeHint", "TED_NOTICE");
+            childAttributes.put("packageId", sourceDescriptor.sourceIdentifier());
+            childAttributes.put("archivePath", entry.archivePath());
+            childAttributes.put("title", entry.fileName());
+            childAttributes.put("importBatchId", properties.getGenericIngestion().getTedPackageImportBatchId());
+
+            ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor(
+                    sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(),
+                    SourceType.PACKAGE_CHILD,
+                    childIdentifier,
+                    childUri,
+                    entry.fileName(),
+                    entry.mediaType() == null ? "application/xml" : entry.mediaType(),
+                    entry.data(),
+                    xmlContent,
+                    sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt(),
+                    OriginalContentStoragePolicy.STORE,
+                    childAttributes
+            ));
+
+            Document childDocument = childResult.document();
+            documents.add(childDocument.toCanonicalMetadata());
+            warnings.addAll(childResult.warnings());
+            if (childResult.deduplicated()) { // child already existed (same dedup hash): surface this for the import report
+                warnings.add("TED XML child already existed and was linked to package: " + entry.archivePath());
+            }
+            relationService.ensureRelation(new CreateDocumentRelationCommand(
+                    packageDocument.document().getId(),
+                    childDocument.getId(),
+                    RelationType.EXTRACTED_FROM,
+                    sortOrder,
+                    entry.archivePath()
+            ));
+        }
+
+        return new IngestionResult(documents, warnings);
+    }
+
+    private String resolveXmlContent(TedPackageExpansionService.TedPackageEntry entry) { // prefer pre-decoded text, else decode raw entry bytes as UTF-8
+        if (entry.textUtf8() != null && !entry.textUtf8().isBlank()) {
+            return entry.textUtf8();
+        }
+        return new String(entry.data(), StandardCharsets.UTF_8);
+    }
+}
diff --git a/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java b/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java
index 6dea73e..a0a16b4 100644
--- a/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java
+++ b/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java
@@ -5,6 +5,7 @@ import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.tenant.TenantRef;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import java.nio.file.Files;
@@ -72,6 +73,7 @@ public class GenericFileSystemIngestionRoute extends RouteBuilder {
payload,
null,
OffsetDateTime.now(),
+ OriginalContentStoragePolicy.DEFAULT,
attributes
);
ingestionGateway.ingest(descriptor);
diff --git a/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java b/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java
index 33f544a..86b91c3 100644
--- a/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java
+++ b/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java
@@ -9,6 +9,7 @@ import at.procon.dip.ingestion.dto.GenericImportResponse;
import at.procon.dip.ingestion.dto.GenericTextImportRequest;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.IngestionResult;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import java.time.OffsetDateTime;
@@ -62,6 +63,7 @@ public class GenericDocumentImportController {
file.getBytes(),
null,
OffsetDateTime.now(),
+ OriginalContentStoragePolicy.DEFAULT,
attributes
);
IngestionResult result = ingestionGateway.ingest(descriptor);
@@ -89,6 +91,7 @@ public class GenericDocumentImportController {
request.text() == null ? null : request.text().getBytes(java.nio.charset.StandardCharsets.UTF_8),
request.text(),
OffsetDateTime.now(),
+ OriginalContentStoragePolicy.DEFAULT,
attributes
);
IngestionResult result = ingestionGateway.ingest(descriptor);
diff --git a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java
index fbdaa5b..ec987b8 100644
--- a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java
+++ b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java
@@ -4,15 +4,12 @@ import at.procon.dip.classification.service.DocumentClassificationService;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility;
-import at.procon.dip.domain.document.CanonicalDocumentMetadata;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
-import at.procon.dip.domain.document.entity.DocumentSource;
-import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.service.DocumentContentService;
@@ -29,11 +26,15 @@ import at.procon.dip.extraction.service.DocumentExtractionService;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
+import at.procon.dip.processing.service.StructuredDocumentProcessingService;
+import at.procon.dip.processing.spi.DocumentProcessingPolicy;
+import at.procon.dip.processing.spi.StructuredProcessingRequest;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.util.HashUtils;
import java.nio.charset.StandardCharsets;
@@ -43,7 +44,6 @@ import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
-import java.util.UUID;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
@@ -51,7 +51,7 @@ import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
/**
- * Phase 4 generic import pipeline that persists arbitrary document types into the DOC model.
+ * Generic import pipeline that persists arbitrary document types into the DOC model.
*/
@Service
@RequiredArgsConstructor
@@ -61,7 +61,6 @@ public class GenericDocumentImportService {
private final TedProcessorProperties properties;
private final DocumentRepository documentRepository;
private final DocumentSourceRepository documentSourceRepository;
- private final DocumentEmbeddingRepository documentEmbeddingRepository;
private final DocumentService documentService;
private final DocumentSourceService documentSourceService;
private final DocumentContentService documentContentService;
@@ -70,6 +69,7 @@ public class GenericDocumentImportService {
private final DocumentClassificationService classificationService;
private final DocumentExtractionService extractionService;
private final TextRepresentationBuildService representationBuildService;
+ private final StructuredDocumentProcessingService structuredProcessingService;
@Transactional
public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) {
@@ -77,20 +77,20 @@ public class GenericDocumentImportService {
DetectionResult detection = classificationService.detect(withResolvedMediaType(sourceDescriptor, payload));
String dedupHash = HashUtils.computeSha256(payload.binaryContent());
+ DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null
+ ? defaultAccessContext()
+ : sourceDescriptor.accessContext();
+
if (properties.getGenericIngestion().isDeduplicateByContentHash()) {
- Optional existing = documentRepository.findByDedupHash(dedupHash);
+ Optional existing = resolveDeduplicatedDocument(dedupHash, accessContext);
if (existing.isPresent()) {
Document document = existing.get();
ensureSource(document, sourceDescriptor);
- List warnings = List.of("Document content hash already imported; linked new source to existing document");
+ List warnings = List.of("Document content hash already imported within the same access scope; linked new source to existing document");
return new ImportedDocumentResult(document, detection, warnings, true);
}
}
- DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null
- ? defaultAccessContext()
- : sourceDescriptor.accessContext();
-
Document document = documentService.create(new CreateDocumentCommand(
accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey(),
accessContext.visibility(),
@@ -108,21 +108,63 @@ public class GenericDocumentImportService {
ensureSource(document, sourceDescriptor);
documentService.updateStatus(document.getId(), DocumentStatus.CLASSIFIED);
- DocumentContent originalContent = persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash);
-
- ExtractionResult extractionResult = extractionService.extract(new ExtractionRequest(
- sourceDescriptor,
- detection,
- payload.textContent(),
- payload.binaryContent()
- ));
- List warnings = new ArrayList<>(extractionResult.warnings());
+ boolean persistOriginalContent = shouldPersistOriginalContent(sourceDescriptor);
+ DocumentContent originalContent = persistOriginalContent
+ ? persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash)
+ : null;
+
+ List warnings = new ArrayList<>();
+ DocumentProcessingPolicy processingPolicy = structuredProcessingService.resolvePolicy(sourceDescriptor, detection);
+ ExtractionResult extractionResult = emptyExtractionResult();
+ Map persistedDerivedContent = new LinkedHashMap<>();
+
+ if (persistOriginalContent) {
+ if (processingPolicy.runGenericExtraction()) {
+ extractionResult = extractionService.extract(new ExtractionRequest(
+ sourceDescriptor,
+ detection,
+ payload.textContent(),
+ payload.binaryContent()
+ ));
+ warnings.addAll(extractionResult.warnings());
+ if (processingPolicy.persistExtractedContent()) {
+ persistedDerivedContent.putAll(persistDerivedContent(document, detection, extractionResult, dedupHash, "generic"));
+ }
+ if (!extractionResult.derivedTextByRole().isEmpty()) {
+ documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
+ }
+ }
- Map persistedDerivedContent = persistDerivedContent(document, detection, extractionResult, dedupHash);
- documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
+ if (processingPolicy.invokeStructuredProcessor()) {
+ Optional structuredExtractionResult = structuredProcessingService.process(new StructuredProcessingRequest(
+ document,
+ originalContent,
+ sourceDescriptor,
+ detection,
+ payload.binaryContent(),
+ payload.textContent(),
+ dedupHash
+ ));
+ if (structuredExtractionResult.isPresent()) {
+ ExtractionResult result = structuredExtractionResult.get();
+ warnings.addAll(result.warnings());
+ extractionResult = mergeExtractionResults(extractionResult, result);
+ if (processingPolicy.persistExtractedContent()) {
+ persistedDerivedContent.putAll(persistDerivedContent(document, detection, result, dedupHash, "structured"));
+ }
+ if (!result.derivedTextByRole().isEmpty()) {
+ documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
+ }
+ }
+ }
- var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult));
- persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts);
+ if (processingPolicy.runRepresentationBuilders()) {
+ var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult));
+ persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts);
+ }
+ } else {
+ warnings.add("Original content storage disabled for this document; skipped extraction and text-representation processing");
+ }
Document reloaded = documentService.getRequired(document.getId());
if (reloaded.getStatus() == DocumentStatus.EXTRACTED) {
@@ -130,7 +172,7 @@ public class GenericDocumentImportService {
reloaded = documentService.getRequired(reloaded.getId());
}
- if (!extractionResult.structuredPayloads().isEmpty()) {
+ if (processingPolicy.applyStructuredTitleIfMissing() && !extractionResult.structuredPayloads().isEmpty()) {
applyStructuredTitleIfMissing(reloaded, extractionResult);
reloaded = documentService.getRequired(reloaded.getId());
}
@@ -138,6 +180,50 @@ public class GenericDocumentImportService {
return new ImportedDocumentResult(reloaded, detection, warnings, false);
}
+ /**
+ * Merges two extraction results into one, null-tolerant on both sides.
+ * Role entries from {@code right} overwrite {@code left} on collisions
+ * (second putAll wins), so structured-processor output takes precedence
+ * over generic extraction; payloads and warnings are concatenated.
+ */
+ private ExtractionResult mergeExtractionResults(ExtractionResult left, ExtractionResult right) {
+ Map derivedText = new LinkedHashMap<>();
+ if (left != null && left.derivedTextByRole() != null) {
+ derivedText.putAll(left.derivedTextByRole());
+ }
+ if (right != null && right.derivedTextByRole() != null) {
+ derivedText.putAll(right.derivedTextByRole());
+ }
+ List payloads = new ArrayList<>();
+ if (left != null && left.structuredPayloads() != null) {
+ payloads.addAll(left.structuredPayloads());
+ }
+ if (right != null && right.structuredPayloads() != null) {
+ payloads.addAll(right.structuredPayloads());
+ }
+ List warnings = new ArrayList<>();
+ if (left != null && left.warnings() != null) {
+ warnings.addAll(left.warnings());
+ }
+ if (right != null && right.warnings() != null) {
+ warnings.addAll(right.warnings());
+ }
+ return new ExtractionResult(derivedText, payloads, warnings);
+ }
+
+ /** Returns a neutral extraction result: no derived text, payloads, or warnings. */
+ private ExtractionResult emptyExtractionResult() {
+ return new ExtractionResult(java.util.Collections.emptyMap(), java.util.Collections.emptyList(), java.util.Collections.emptyList());
+ }
+
+ /**
+ * Finds an existing document with the same content hash in the same access
+ * scope (visibility + owner tenant), so deduplication never merges documents
+ * across tenant boundaries. Returns the first match in repository order.
+ */
+ private Optional resolveDeduplicatedDocument(String dedupHash, DocumentAccessContext accessContext) {
+ return documentRepository.findAllByDedupHash(dedupHash).stream()
+ .filter(existing -> sameAccessScope(existing, accessContext))
+ .findFirst();
+ }
+
+ /**
+ * True when the existing document shares both visibility and owner tenant
+ * with the requested access context. Tenant keys are compared null-safely,
+ * so two documents with no owner tenant count as the same scope.
+ */
+ private boolean sameAccessScope(Document existing, DocumentAccessContext accessContext) {
+ if (existing.getVisibility() != accessContext.visibility()) {
+ return false;
+ }
+ String existingTenantKey = existing.getOwnerTenant() == null ? null : existing.getOwnerTenant().getTenantKey();
+ String requestedTenantKey = accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey();
+ return java.util.Objects.equals(existingTenantKey, requestedTenantKey);
+ }
+
private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) {
if (StringUtils.hasText(sourceDescriptor.mediaType())) {
return sourceDescriptor;
@@ -152,6 +238,7 @@ public class GenericDocumentImportService {
sourceDescriptor.binaryContent(),
sourceDescriptor.textContent(),
sourceDescriptor.receivedAt(),
+ sourceDescriptor.originalContentStoragePolicy(),
sourceDescriptor.attributes()
);
}
@@ -216,7 +303,7 @@ public class GenericDocumentImportService {
return sourceDescriptor.fileName();
}
if (StringUtils.hasText(payload.textContent())) {
- for (String line : payload.textContent().split("\\n")) {
+ for (String line : payload.textContent().split("\n")) {
if (StringUtils.hasText(line)) {
return DocumentImportSupport.ellipsize(line.trim(), 240);
}
@@ -244,6 +331,10 @@ public class GenericDocumentImportService {
return;
}
+ String importBatchId = sourceDescriptor.attributes() != null && StringUtils.hasText(sourceDescriptor.attributes().get("importBatchId"))
+ ? sourceDescriptor.attributes().get("importBatchId")
+ : properties.getGenericIngestion().getImportBatchId();
+
documentSourceService.addSource(new AddDocumentSourceCommand(
document.getId(),
sourceDescriptor.sourceType(),
@@ -251,11 +342,35 @@ public class GenericDocumentImportService {
sourceDescriptor.sourceUri(),
sourceDescriptor.fileName(),
null,
- properties.getGenericIngestion().getImportBatchId(),
+ importBatchId,
sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt()
));
}
+ /**
+ * Decides whether the ORIGINAL payload is persisted. Precedence:
+ * 1. explicit STORE/SKIP policy on the descriptor always wins;
+ * 2. the global store-original-content-for-wrapper-documents flag forces storage;
+ * 3. otherwise wrapper/container documents are skipped, everything else stored.
+ */
+ private boolean shouldPersistOriginalContent(SourceDescriptor sourceDescriptor) {
+ if (sourceDescriptor.originalContentStoragePolicy() == OriginalContentStoragePolicy.STORE) {
+ return true;
+ }
+ if (sourceDescriptor.originalContentStoragePolicy() == OriginalContentStoragePolicy.SKIP) {
+ return false;
+ }
+ if (properties.getGenericIngestion().isStoreOriginalContentForWrapperDocuments()) {
+ return true;
+ }
+ return !isWrapperDocument(sourceDescriptor);
+ }
+
+ /**
+ * True when the descriptor is flagged as a wrapper/container document via the
+ * "wrapperDocument" attribute (falling back to "containerDocument").
+ * Boolean.parseBoolean(null) yields false, so missing flags mean "not a wrapper".
+ */
+ private boolean isWrapperDocument(SourceDescriptor sourceDescriptor) {
+ if (sourceDescriptor.attributes() == null || sourceDescriptor.attributes().isEmpty()) {
+ return false;
+ }
+ String wrapperFlag = sourceDescriptor.attributes().get("wrapperDocument");
+ if (wrapperFlag == null) {
+ wrapperFlag = sourceDescriptor.attributes().get("containerDocument");
+ }
+ return Boolean.parseBoolean(wrapperFlag);
+ }
+
private DocumentContent persistOriginalContent(Document document,
SourceDescriptor sourceDescriptor,
DetectionResult detection,
@@ -287,13 +402,14 @@ public class GenericDocumentImportService {
private Map persistDerivedContent(Document document,
DetectionResult detection,
ExtractionResult extractionResult,
- String baseHash) {
+ String baseHash,
+ String hashNamespace) {
Map result = new LinkedHashMap<>();
extractionResult.derivedTextByRole().forEach((role, text) -> {
if (!StringUtils.hasText(text)) {
return;
}
- String contentHash = HashUtils.computeSha256(baseHash + ":" + role.name() + ":" + text);
+ String contentHash = HashUtils.computeSha256(baseHash + ":" + hashNamespace + ":" + role.name() + ":" + text);
DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand(
document.getId(),
role,
@@ -336,16 +452,13 @@ public class GenericDocumentImportService {
if (!StringUtils.hasText(draft.textBody())) {
continue;
}
- DocumentContent linkedContent = switch (draft.representationType()) {
- case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK ->
- derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
- };
+ DocumentContent linkedContent = resolveLinkedContent(originalContent, derivedContent, draft);
var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
document.getId(),
linkedContent == null ? null : linkedContent.getId(),
draft.representationType(),
- "phase4-generic-builder",
+ draft.builderKey() == null ? "phase4-generic-builder" : draft.builderKey(),
draft.languageCode(),
null,
draft.chunkIndex(),
@@ -362,7 +475,23 @@ public class GenericDocumentImportService {
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
}
+ /**
+ * Resolves which persisted content a text representation links to: the draft's
+ * declared source role (defaulting to NORMALIZED_TEXT), where ORIGINAL maps to
+ * the raw content and missing derived roles fall back to the original content.
+ * May return null when original content storage was skipped.
+ */
+ private DocumentContent resolveLinkedContent(DocumentContent originalContent,
+ Map derivedContent,
+ TextRepresentationDraft draft) {
+ ContentRole sourceRole = draft.sourceContentRole();
+ if (sourceRole == null) {
+ sourceRole = ContentRole.NORMALIZED_TEXT;
+ }
+ if (sourceRole == ContentRole.ORIGINAL) {
+ return originalContent;
+ }
+ return derivedContent.getOrDefault(sourceRole, originalContent);
+ }
+
private boolean shouldQueueEmbedding(TextRepresentationDraft draft) {
+ // A draft may explicitly opt out of embedding; null/TRUE falls through
+ // to the global primary-representation-only setting below.
+ if (Boolean.FALSE.equals(draft.queueForEmbedding())) {
+ return false;
+ }
return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true;
}
diff --git a/src/main/java/at/procon/dip/ingestion/service/MailMessageExtractionService.java b/src/main/java/at/procon/dip/ingestion/service/MailMessageExtractionService.java
new file mode 100644
index 0000000..3c9aea0
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/service/MailMessageExtractionService.java
@@ -0,0 +1,152 @@
+package at.procon.dip.ingestion.service;
+
+import jakarta.mail.BodyPart;
+import jakarta.mail.Multipart;
+import jakarta.mail.Part;
+import jakarta.mail.Session;
+import jakarta.mail.internet.MimeMessage;
+import jakarta.mail.internet.MimeUtility;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.time.OffsetDateTime;
+import java.time.ZoneId;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Properties;
+import lombok.extern.slf4j.Slf4j;
+import org.jsoup.Jsoup;
+import org.springframework.stereotype.Service;
+
+/**
+ * Parses raw MIME messages into subject, sender, recipients, text/HTML bodies
+ * and binary attachments for the mail ingestion adapter.
+ */
+@Service
+@Slf4j
+public class MailMessageExtractionService {
+
+    /**
+     * Parses the given raw RFC 822 / MIME bytes.
+     *
+     * @param rawMime the raw message bytes
+     * @return the parsed message; the text body falls back to a text rendering
+     *         of the HTML body when no text/plain part is present
+     * @throws IllegalArgumentException if the bytes cannot be parsed as MIME
+     */
+    public ParsedMailMessage parse(byte[] rawMime) {
+        try {
+            // Session.getInstance avoids pinning the JVM-wide default session
+            // (Session.getDefaultInstance caches the first caller's properties).
+            Session session = Session.getInstance(new Properties());
+            MimeMessage message = new MimeMessage(session, new ByteArrayInputStream(rawMime));
+            String subject = message.getSubject();
+            String from = message.getFrom() != null && message.getFrom().length > 0
+                    ? message.getFrom()[0].toString()
+                    : null;
+            List<String> recipients = new ArrayList<>();
+            if (message.getAllRecipients() != null) {
+                for (var recipient : message.getAllRecipients()) {
+                    recipients.add(recipient.toString());
+                }
+            }
+            StringBuilder text = new StringBuilder();
+            StringBuilder html = new StringBuilder();
+            List<MailAttachment> attachments = new ArrayList<>();
+            processPart(message, text, html, attachments);
+            // Prefer the plain-text body; otherwise render the HTML body to text.
+            String normalizedText = text.length() > 0 ? text.toString().trim() : htmlToText(html.toString());
+            OffsetDateTime receivedAt = message.getReceivedDate() == null
+                    ? OffsetDateTime.now()
+                    : message.getReceivedDate().toInstant().atZone(ZoneId.systemDefault()).toOffsetDateTime();
+            return new ParsedMailMessage(subject, from, recipients, receivedAt, normalizedText, html.toString(), attachments);
+        } catch (Exception e) {
+            throw new IllegalArgumentException("Failed to parse MIME message", e);
+        }
+    }
+
+    /** Recursively walks a MIME part tree, accumulating bodies and attachments. */
+    private void processPart(Part part, StringBuilder text, StringBuilder html, List<MailAttachment> attachments) throws Exception {
+        String disposition = part.getDisposition();
+        String contentType = part.getContentType() == null ? "application/octet-stream" : part.getContentType();
+        // Named attachment/inline parts become attachments rather than body text.
+        if (disposition != null && (Part.ATTACHMENT.equalsIgnoreCase(disposition) || Part.INLINE.equalsIgnoreCase(disposition))
+                && part.getFileName() != null) {
+            attachments.add(extractAttachment(part));
+            return;
+        }
+        Object content = part.getContent();
+        if (content instanceof Multipart multipart) {
+            for (int i = 0; i < multipart.getCount(); i++) {
+                BodyPart bodyPart = multipart.getBodyPart(i);
+                processPart(bodyPart, text, html, attachments);
+            }
+        } else if (contentType.toLowerCase(Locale.ROOT).contains("text/plain")) {
+            text.append(content.toString()).append("\n");
+        } else if (contentType.toLowerCase(Locale.ROOT).contains("text/html")) {
+            html.append(content.toString()).append("\n");
+        } else if (part.getFileName() != null) {
+            // No declared disposition but a file name: treat it as an attachment.
+            attachments.add(extractAttachment(part));
+        }
+    }
+
+    /** Copies a part's payload into an in-memory attachment, decoding its file name. */
+    private MailAttachment extractAttachment(Part part) throws Exception {
+        String fileName = part.getFileName();
+        if (fileName == null) {
+            fileName = "attachment";
+        }
+        try {
+            fileName = MimeUtility.decodeText(fileName);
+        } catch (Exception ignored) {
+            // Keep the raw (possibly MIME-encoded) name if decoding fails.
+        }
+        String contentType = part.getContentType();
+        byte[] data;
+        try (InputStream in = part.getInputStream(); ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+            in.transferTo(out);
+            data = out.toByteArray();
+        }
+        return new MailAttachment(fileName, contentType, data, data.length, null);
+    }
+
+    /** Renders HTML to whitespace-normalized plain text via jsoup, with a naive fallback. */
+    private String htmlToText(String html) {
+        if (html == null || html.isBlank()) {
+            return "";
+        }
+        try {
+            // "\\s+" is the regex whitespace class; a bare "\s+" only matches literal spaces.
+            return Jsoup.parse(html).text().replaceAll("\\s+", " ").trim();
+        } catch (Exception e) {
+            log.debug("Falling back to naive HTML cleanup: {}", e.getMessage());
+            return html.replaceAll("<[^>]+>", " ").replaceAll("\\s+", " ").trim();
+        }
+    }
+
+    /** Serializes the parsed message into a simple header + body text block. */
+    public String serializeMessage(ParsedMailMessage parsed) {
+        StringBuilder sb = new StringBuilder();
+        if (parsed.subject() != null) sb.append("Subject: ").append(parsed.subject()).append("\n");
+        if (parsed.from() != null) sb.append("From: ").append(parsed.from()).append("\n");
+        if (!parsed.recipients().isEmpty()) sb.append("To: ").append(String.join(", ", parsed.recipients())).append("\n");
+        sb.append("\n");
+        if (parsed.textBody() != null) sb.append(parsed.textBody());
+        return sb.toString().trim();
+    }
+
+    /** Immutable view of a parsed mail message. */
+    public record ParsedMailMessage(String subject, String from, List<String> recipients, OffsetDateTime receivedAt,
+            String textBody, String htmlBody, List<MailAttachment> attachments) {}
+
+    /** An attachment extracted from a message; {@code path} is optional source metadata. */
+    public record MailAttachment(String fileName, String contentType, byte[] data, long sizeBytes, String path) {
+        /** NOTE(review): assumes the payload is UTF-8 text; binary data yields mojibake. */
+        public String safeTextPreview() {
+            return new String(data, StandardCharsets.UTF_8);
+        }
+    }
+}
diff --git a/src/main/java/at/procon/dip/ingestion/service/TedPackageExpansionService.java b/src/main/java/at/procon/dip/ingestion/service/TedPackageExpansionService.java
new file mode 100644
index 0000000..67bfd83
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/service/TedPackageExpansionService.java
@@ -0,0 +1,109 @@
+package at.procon.dip.ingestion.service;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.springframework.stereotype.Service;
+
+/**
+ * Expands TED daily packages (tar.gz) into their XML notice entries, enforcing
+ * zip-bomb style limits on file count and extracted sizes.
+ */
+@Service
+@Slf4j
+public class TedPackageExpansionService {
+
+    /** Maximum number of XML entries imported from a single package. */
+    private static final int MAX_FILES = 10000;
+    /** Maximum size of a single extracted entry (20 MiB). */
+    private static final long MAX_SINGLE_FILE_SIZE = 20L * 1024 * 1024;
+    /** Maximum total extracted size across all entries (1 GiB). */
+    private static final long MAX_TOTAL_EXTRACTED_SIZE = 1024L * 1024 * 1024;
+
+    /**
+     * Expands the given tar.gz bytes into XML entries plus a plain-text manifest.
+     *
+     * @throws IllegalArgumentException if the archive is corrupt or exceeds limits
+     */
+    public TedPackageExpansionResult expand(byte[] tarGzBytes) {
+        List<TedPackageEntry> entries = new ArrayList<>();
+        long total = 0;
+        try (TarArchiveInputStream tais = new TarArchiveInputStream(
+                new GzipCompressorInputStream(new ByteArrayInputStream(tarGzBytes)))) {
+            TarArchiveEntry entry;
+            while ((entry = tais.getNextTarEntry()) != null) {
+                if (entry.isDirectory()) {
+                    continue;
+                }
+                if (entries.size() >= MAX_FILES) {
+                    // Don't fail the whole import, but make the truncation visible.
+                    log.warn("TED package contains more than {} XML entries; remaining entries are skipped", MAX_FILES);
+                    break;
+                }
+                String entryName = entry.getName();
+                if (!entryName.toLowerCase(Locale.ROOT).endsWith(".xml")) {
+                    continue;
+                }
+                // Guard against path-traversal ("zip slip") entry names.
+                if (entryName.contains("..") || entryName.startsWith("/") || entryName.startsWith("\\")) {
+                    log.warn("Skipping suspicious TED package entry {}", entryName);
+                    continue;
+                }
+                ByteArrayOutputStream baos = new ByteArrayOutputStream();
+                byte[] buffer = new byte[8192];
+                long fileSize = 0;
+                int read;
+                while ((read = tais.read(buffer)) > 0) {
+                    fileSize += read;
+                    total += read;
+                    // Enforce limits while streaming so a bomb cannot fill the heap first.
+                    if (fileSize > MAX_SINGLE_FILE_SIZE || total > MAX_TOTAL_EXTRACTED_SIZE) {
+                        throw new IOException("TED package extraction limits exceeded");
+                    }
+                    baos.write(buffer, 0, read);
+                }
+                byte[] data = baos.toByteArray();
+                entries.add(new TedPackageEntry(extractFilename(entryName), entryName, data, data.length, "application/xml"));
+            }
+        } catch (Exception e) {
+            throw new IllegalArgumentException("Failed to expand TED package", e);
+        }
+
+        String manifest = buildManifest(entries);
+        return new TedPackageExpansionResult(entries, manifest);
+    }
+
+    /** Builds a short plain-text listing of the extracted entries. */
+    private String buildManifest(List<TedPackageEntry> entries) {
+        StringBuilder sb = new StringBuilder();
+        sb.append("TED package contains ").append(entries.size()).append(" XML notice files\n");
+        for (TedPackageEntry entry : entries) {
+            sb.append("- ").append(entry.archivePath()).append(" (").append(entry.sizeBytes()).append(" bytes)\n");
+        }
+        return sb.toString().trim();
+    }
+
+    /** Returns the last path segment, handling both '/' and '\\' separators. */
+    private String extractFilename(String path) {
+        int idx = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
+        return idx >= 0 ? path.substring(idx + 1) : path;
+    }
+
+    public record TedPackageExpansionResult(List<TedPackageEntry> entries, String manifestText) {}
+
+    /** A single XML member of the package, fully materialized in memory. */
+    public record TedPackageEntry(String fileName, String archivePath, byte[] data, long sizeBytes, String mediaType) {
+        /** Decodes the raw bytes as UTF-8 text. */
+        public String textUtf8() {
+            return new String(data, StandardCharsets.UTF_8);
+        }
+    }
+}
diff --git a/src/main/java/at/procon/dip/ingestion/spi/OriginalContentStoragePolicy.java b/src/main/java/at/procon/dip/ingestion/spi/OriginalContentStoragePolicy.java
new file mode 100644
index 0000000..8fcf2d3
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/spi/OriginalContentStoragePolicy.java
@@ -0,0 +1,17 @@
+package at.procon.dip.ingestion.spi;
+
+/**
+ * Controls whether the ORIGINAL raw payload should be persisted for a single imported document.
+ * {@link #DEFAULT} defers to the global ingestion configuration and wrapper-document heuristics.
+ */
+public enum OriginalContentStoragePolicy {
+
+    /** Defer to the global ingestion configuration and wrapper-document heuristics. */
+    DEFAULT,
+
+    /** Always persist the original payload, overriding global configuration. */
+    STORE,
+
+    /** Never persist the original payload, overriding global configuration. */
+    SKIP
+}
diff --git a/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java
index 60183ff..cc47fa7 100644
--- a/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java
+++ b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java
@@ -21,9 +21,16 @@ public record SourceDescriptor(
byte[] binaryContent,
String textContent,
OffsetDateTime receivedAt,
+ OriginalContentStoragePolicy originalContentStoragePolicy,
Map attributes
) {
+ public SourceDescriptor {
+ if (originalContentStoragePolicy == null) {
+ originalContentStoragePolicy = OriginalContentStoragePolicy.DEFAULT;
+ }
+ }
+
public boolean hasInlineBinaryContent() {
return binaryContent != null && binaryContent.length > 0;
}
diff --git a/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java b/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java
index 272b114..db3587a 100644
--- a/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java
+++ b/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java
@@ -45,7 +45,7 @@ public final class DocumentImportSupport {
public static DocumentFamily familyFor(DocumentType documentType) {
return switch (documentType) {
- case TED_NOTICE -> DocumentFamily.PROCUREMENT;
+ case TED_PACKAGE, TED_NOTICE -> DocumentFamily.PROCUREMENT;
case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL;
case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN ->
DocumentFamily.GENERIC;
diff --git a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java
index 6d0e2d7..7b5e1ab 100644
--- a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java
+++ b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java
@@ -12,12 +12,16 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
+import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
@Component
+@Order(100)
public class DefaultGenericTextRepresentationBuilder implements TextRepresentationBuilder {
+ public static final String BUILDER_KEY = "default-generic-text";
+
@Override
public boolean supports(DocumentType documentType) {
return documentType != DocumentType.TED_NOTICE;
@@ -39,25 +43,34 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
List drafts = new ArrayList<>();
drafts.add(new TextRepresentationDraft(
RepresentationType.FULLTEXT,
+ BUILDER_KEY,
request.detectionResult().languageCode(),
baseText,
false,
- null
+ null,
+ ContentRole.NORMALIZED_TEXT,
+ Boolean.TRUE
));
drafts.add(new TextRepresentationDraft(
RepresentationType.SEMANTIC_TEXT,
+ BUILDER_KEY,
request.detectionResult().languageCode(),
semantic,
true,
- null
+ null,
+ ContentRole.NORMALIZED_TEXT,
+ Boolean.TRUE
));
if (StringUtils.hasText(title)) {
drafts.add(new TextRepresentationDraft(
RepresentationType.TITLE_ABSTRACT,
+ BUILDER_KEY,
request.detectionResult().languageCode(),
title + "\n\n" + summary,
false,
- null
+ null,
+ ContentRole.NORMALIZED_TEXT,
+ Boolean.FALSE
));
}
return drafts;
diff --git a/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java
new file mode 100644
index 0000000..834082c
--- /dev/null
+++ b/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java
@@ -0,0 +1,138 @@
+package at.procon.dip.normalization.impl;
+
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
+import at.procon.dip.ingestion.util.DocumentImportSupport;
+import at.procon.dip.normalization.spi.RepresentationBuildRequest;
+import at.procon.dip.normalization.spi.TextRepresentationBuilder;
+import at.procon.dip.normalization.spi.TextRepresentationDraft;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import org.springframework.core.annotation.Order;
+import org.springframework.stereotype.Component;
+import org.springframework.util.StringUtils;
+
+@Component
+@Order(10)
+public class TedStructuredTextRepresentationBuilder implements TextRepresentationBuilder {
+
+ public static final String BUILDER_KEY = "ted-structured-text";
+
+ @Override
+ public boolean supports(DocumentType documentType) {
+ return documentType == DocumentType.TED_NOTICE;
+ }
+
+ @Override
+ public List build(RepresentationBuildRequest request) {
+ String normalizedText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
+ if (!StringUtils.hasText(normalizedText)) {
+ return List.of();
+ }
+
+ Map attributes = request.extractionResult().structuredPayloads().stream()
+ .filter(payload -> Objects.equals(payload.projectionName(), "ted-notice"))
+ .map(ExtractedStructuredPayload::attributes)
+ .filter(Objects::nonNull)
+ .findFirst()
+ .orElse(Map.of());
+
+ String title = asString(attributes.get("title"));
+ String description = asString(attributes.get("description"));
+ String buyerName = asString(attributes.get("buyerName"));
+ String cpvCodes = asString(attributes.get("cpvCodes"));
+ String nutsCodes = asString(attributes.get("nutsCodes"));
+ String publicationId = asString(attributes.get("publicationId"));
+ String semanticText = buildSemanticText(title, description, buyerName, cpvCodes, nutsCodes, publicationId, normalizedText);
+ String summary = DocumentImportSupport.ellipsize(
+ StringUtils.hasText(description) ? description.trim() : normalizedText.replace('\n', ' ').trim(),
+ 1200
+ );
+
+ List drafts = new ArrayList<>();
+ drafts.add(new TextRepresentationDraft(
+ RepresentationType.SEMANTIC_TEXT,
+ BUILDER_KEY,
+ request.detectionResult().languageCode(),
+ semanticText,
+ true,
+ null,
+ ContentRole.NORMALIZED_TEXT,
+ Boolean.TRUE
+ ));
+ /*
+ drafts.add(new TextRepresentationDraft(
+ RepresentationType.FULLTEXT,
+ BUILDER_KEY,
+ request.detectionResult().languageCode(),
+ normalizedText,
+ false,
+ null,
+ ContentRole.NORMALIZED_TEXT,
+ Boolean.TRUE
+ ));
+ if (StringUtils.hasText(title)) {
+ drafts.add(new TextRepresentationDraft(
+ RepresentationType.TITLE_ABSTRACT,
+ BUILDER_KEY,
+ request.detectionResult().languageCode(),
+ title + "\n\n" + summary,
+ false,
+ null,
+ ContentRole.NORMALIZED_TEXT,
+ Boolean.FALSE
+ ));
+ }
+ drafts.add(new TextRepresentationDraft(
+ RepresentationType.SUMMARY,
+ BUILDER_KEY,
+ request.detectionResult().languageCode(),
+ summary,
+ false,
+ null,
+ ContentRole.NORMALIZED_TEXT,
+ Boolean.FALSE
+ ));
+ */
+ return drafts;
+ }
+
+ private String buildSemanticText(String title,
+ String description,
+ String buyerName,
+ String cpvCodes,
+ String nutsCodes,
+ String publicationId,
+ String normalizedText) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("Document type: TED_NOTICE\n");
+ if (StringUtils.hasText(publicationId)) {
+ sb.append("Publication: ").append(publicationId.trim()).append('\n');
+ }
+ if (StringUtils.hasText(title)) {
+ sb.append("Title: ").append(title.trim()).append("\n\n");
+ }
+ if (StringUtils.hasText(buyerName)) {
+ sb.append("Contracting Authority: ").append(buyerName.trim()).append('\n');
+ }
+ if (StringUtils.hasText(cpvCodes)) {
+ sb.append("CPV Codes: ").append(cpvCodes.trim()).append('\n');
+ }
+ if (StringUtils.hasText(nutsCodes)) {
+ sb.append("NUTS Codes: ").append(nutsCodes.trim()).append('\n');
+ }
+ if (StringUtils.hasText(description)) {
+ sb.append("\nDescription: ").append(description.trim()).append("\n\n");
+ }
+ sb.append(normalizedText.trim());
+ return sb.toString().trim();
+ }
+
+ private String asString(Object value) {
+ return value instanceof String s && StringUtils.hasText(s) ? s : null;
+ }
+}
diff --git a/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java b/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java
index 38f2d8a..bb2f3f7 100644
--- a/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java
+++ b/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java
@@ -3,8 +3,10 @@ package at.procon.dip.normalization.service;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
+import java.util.ArrayList;
import java.util.List;
import lombok.RequiredArgsConstructor;
+import org.springframework.core.annotation.AnnotationAwareOrderComparator;
import org.springframework.stereotype.Service;
@Service
@@ -14,11 +16,22 @@ public class TextRepresentationBuildService {
private final List builders;
public List build(RepresentationBuildRequest request) {
- return builders.stream()
+ List matchingBuilders = builders.stream()
.filter(builder -> builder.supports(request.detectionResult().documentType()))
- .findFirst()
- .orElseThrow(() -> new IllegalStateException(
- "No text representation builder registered for " + request.detectionResult().documentType()))
- .build(request);
+ .sorted(AnnotationAwareOrderComparator.INSTANCE)
+ .toList();
+ if (matchingBuilders.isEmpty()) {
+ throw new IllegalStateException(
+ "No text representation builder registered for " + request.detectionResult().documentType());
+ }
+
+ List result = new ArrayList<>();
+ for (TextRepresentationBuilder builder : matchingBuilders) {
+ List drafts = builder.build(request);
+ if (drafts != null && !drafts.isEmpty()) {
+ result.addAll(drafts);
+ }
+ }
+ return result;
}
}
diff --git a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java
index af1f49a..4b7322d 100644
--- a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java
+++ b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java
@@ -1,5 +1,6 @@
package at.procon.dip.normalization.spi;
+import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.RepresentationType;
/**
@@ -7,9 +8,20 @@ import at.procon.dip.domain.document.RepresentationType;
*/
public record TextRepresentationDraft(
RepresentationType representationType,
+ String builderKey,
String languageCode,
String textBody,
boolean primary,
- Integer chunkIndex
+ Integer chunkIndex,
+ ContentRole sourceContentRole,
+ Boolean queueForEmbedding
) {
+
+ public TextRepresentationDraft(RepresentationType representationType,
+ String languageCode,
+ String textBody,
+ boolean primary,
+ Integer chunkIndex) {
+ this(representationType, null, languageCode, textBody, primary, chunkIndex, ContentRole.NORMALIZED_TEXT, null);
+ }
}
diff --git a/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java b/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java
new file mode 100644
index 0000000..2cdfc14
--- /dev/null
+++ b/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java
@@ -0,0 +1,110 @@
+package at.procon.dip.processing.impl;
+
+import at.procon.dip.classification.spi.DetectionResult;
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentStatus;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
+import at.procon.dip.extraction.spi.ExtractionResult;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+import at.procon.dip.processing.spi.DocumentProcessingPolicy;
+import at.procon.dip.processing.spi.StructuredDocumentProcessor;
+import at.procon.dip.processing.spi.StructuredProcessingRequest;
+import at.procon.dip.domain.document.service.DocumentService;
+import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
+import at.procon.ted.model.entity.ProcurementDocument;
+import at.procon.ted.service.XmlParserService;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Component;
+import org.springframework.util.StringUtils;
+
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class TedStructuredDocumentProcessor implements StructuredDocumentProcessor {
+
+ private final XmlParserService xmlParserService;
+ private final DocumentService documentService;
+ private final TedNoticeProjectionService tedNoticeProjectionService;
+
+ @Override
+ public boolean supports(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
+ return detectionResult.documentType() == DocumentType.TED_NOTICE;
+ }
+
+ @Override
+ public DocumentProcessingPolicy processingPolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
+ return DocumentProcessingPolicy.replaceGenericTextProcessing();
+ }
+
+ @Override
+ public ExtractionResult process(StructuredProcessingRequest request) {
+ String xml = request.textContent();
+ if (!StringUtils.hasText(xml) && request.binaryContent() != null) {
+ xml = new String(request.binaryContent(), java.nio.charset.StandardCharsets.UTF_8);
+ }
+ if (!StringUtils.hasText(xml)) {
+ return new ExtractionResult(Map.of(), List.of(), List.of("TED structured processor received no XML payload"));
+ }
+
+ ProcurementDocument tedDocument = xmlParserService.parseDocument(xml);
+ tedDocument.setDocumentHash(request.dedupHash());
+ tedDocument.setXmlDocument(xml);
+ tedDocument.setSourceFilename(request.sourceDescriptor().fileName());
+ tedDocument.setSourcePath(request.sourceDescriptor().sourceUri());
+ tedDocument.setFileSizeBytes(request.binaryContent() == null ? null : (long) request.binaryContent().length);
+
+ var canonical = request.document();
+ canonical.setDocumentType(DocumentType.TED_NOTICE);
+ canonical.setDocumentFamily(DocumentFamily.PROCUREMENT);
+ canonical.setStatus(DocumentStatus.CLASSIFIED);
+ canonical.setTitle(tedDocument.getProjectTitle());
+ canonical.setSummary(tedDocument.getProjectDescription());
+ canonical.setLanguageCode(tedDocument.getLanguageCode());
+ canonical.setMimeType(request.detectionResult().mimeType() == null ? "application/xml" : request.detectionResult().mimeType());
+ if (StringUtils.hasText(tedDocument.getPublicationId())) {
+ canonical.setBusinessKey("TED_NOTICE:" + tedDocument.getPublicationId());
+ } else if (StringUtils.hasText(tedDocument.getNoticeId())) {
+ canonical.setBusinessKey("TED_NOTICE:" + tedDocument.getNoticeId());
+ }
+ documentService.save(canonical);
+
+ tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId());
+
+ Map payload = new LinkedHashMap<>();
+ if (StringUtils.hasText(tedDocument.getProjectTitle())) {
+ payload.put("title", tedDocument.getProjectTitle());
+ }
+ if (StringUtils.hasText(tedDocument.getProjectDescription())) {
+ payload.put("description", tedDocument.getProjectDescription());
+ }
+ if (StringUtils.hasText(tedDocument.getBuyerName())) {
+ payload.put("buyerName", tedDocument.getBuyerName());
+ }
+ if (tedDocument.getCpvCodes() != null && tedDocument.getCpvCodes().length > 0) {
+ payload.put("cpvCodes", String.join(", ", tedDocument.getCpvCodes()));
+ }
+ if (tedDocument.getNutsCodes() != null && tedDocument.getNutsCodes().length > 0) {
+ payload.put("nutsCodes", String.join(", ", tedDocument.getNutsCodes()));
+ }
+ payload.put("lotCount", tedDocument.getLots() == null ? 0 : tedDocument.getLots().size());
+ payload.put("noticeId", tedDocument.getNoticeId());
+ payload.put("publicationId", tedDocument.getPublicationId());
+
+ Map derivedText = new LinkedHashMap<>();
+ if (StringUtils.hasText(tedDocument.getTextContent())) {
+ derivedText.put(ContentRole.NORMALIZED_TEXT, tedDocument.getTextContent());
+ }
+
+ return new ExtractionResult(
+ derivedText,
+ List.of(new ExtractedStructuredPayload("ted-notice", payload)),
+ List.of()
+ );
+ }
+}
diff --git a/src/main/java/at/procon/dip/processing/service/StructuredDocumentProcessingService.java b/src/main/java/at/procon/dip/processing/service/StructuredDocumentProcessingService.java
new file mode 100644
index 0000000..0ff0357
--- /dev/null
+++ b/src/main/java/at/procon/dip/processing/service/StructuredDocumentProcessingService.java
@@ -0,0 +1,36 @@
+package at.procon.dip.processing.service;
+
+import at.procon.dip.classification.spi.DetectionResult;
+import at.procon.dip.processing.spi.DocumentProcessingPolicy;
+import at.procon.dip.processing.spi.StructuredDocumentProcessor;
+import at.procon.dip.processing.spi.StructuredProcessingRequest;
+import at.procon.dip.extraction.spi.ExtractionResult;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+import java.util.List;
+import java.util.Optional;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+
+@Service
+@RequiredArgsConstructor
+public class StructuredDocumentProcessingService {
+
+ private final List processors;
+
+ public Optional resolve(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
+ return processors.stream()
+ .filter(processor -> processor.supports(sourceDescriptor, detectionResult))
+ .findFirst();
+ }
+
+ public DocumentProcessingPolicy resolvePolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
+ return resolve(sourceDescriptor, detectionResult)
+ .map(processor -> processor.processingPolicy(sourceDescriptor, detectionResult))
+ .orElse(DocumentProcessingPolicy.genericDefault());
+ }
+
+ public Optional process(StructuredProcessingRequest request) {
+ return resolve(request.sourceDescriptor(), request.detectionResult())
+ .map(processor -> processor.process(request));
+ }
+}
diff --git a/src/main/java/at/procon/dip/processing/spi/DocumentProcessingPolicy.java b/src/main/java/at/procon/dip/processing/spi/DocumentProcessingPolicy.java
new file mode 100644
index 0000000..a4cddc9
--- /dev/null
+++ b/src/main/java/at/procon/dip/processing/spi/DocumentProcessingPolicy.java
@@ -0,0 +1,22 @@
+package at.procon.dip.processing.spi;
+
/**
 * Controls which generic pipeline stages should run for a document and whether
 * a structured processor should be invoked.
 *
 * @param runGenericExtraction          run the generic extraction stage
 * @param persistExtractedContent       persist the extracted content
 * @param runRepresentationBuilders     run the text representation builders
 * @param invokeStructuredProcessor     invoke the matching structured processor, if any
 * @param applyStructuredTitleIfMissing apply a structured title when the canonical
 *                                      document has none (name-inferred — confirm at call site)
 */
public record DocumentProcessingPolicy(
        boolean runGenericExtraction,
        boolean persistExtractedContent,
        boolean runRepresentationBuilders,
        boolean invokeStructuredProcessor,
        boolean applyStructuredTitleIfMissing
) {

    /** Default policy: every generic stage runs and the structured processor is invoked. */
    public static DocumentProcessingPolicy genericDefault() {
        return new DocumentProcessingPolicy(true, true, true, true, true);
    }

    /** Policy for processors that replace generic extraction but keep persistence and representations. */
    public static DocumentProcessingPolicy replaceGenericTextProcessing() {
        return new DocumentProcessingPolicy(false, true, true, true, true);
    }
}
diff --git a/src/main/java/at/procon/dip/processing/spi/StructuredDocumentProcessor.java b/src/main/java/at/procon/dip/processing/spi/StructuredDocumentProcessor.java
new file mode 100644
index 0000000..2fbe670
--- /dev/null
+++ b/src/main/java/at/procon/dip/processing/spi/StructuredDocumentProcessor.java
@@ -0,0 +1,19 @@
+package at.procon.dip.processing.spi;
+
+import at.procon.dip.classification.spi.DetectionResult;
+import at.procon.dip.extraction.spi.ExtractionResult;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+
+/**
+ * Optional type-specific enrichment layer on top of the canonical DOC import.
+ */
+public interface StructuredDocumentProcessor {
+
+ boolean supports(SourceDescriptor sourceDescriptor, DetectionResult detectionResult);
+
+ default DocumentProcessingPolicy processingPolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
+ return DocumentProcessingPolicy.replaceGenericTextProcessing();
+ }
+
+ ExtractionResult process(StructuredProcessingRequest request);
+}
diff --git a/src/main/java/at/procon/dip/processing/spi/StructuredProcessingRequest.java b/src/main/java/at/procon/dip/processing/spi/StructuredProcessingRequest.java
new file mode 100644
index 0000000..8d7652b
--- /dev/null
+++ b/src/main/java/at/procon/dip/processing/spi/StructuredProcessingRequest.java
@@ -0,0 +1,20 @@
+package at.procon.dip.processing.spi;
+
+import at.procon.dip.classification.spi.DetectionResult;
+import at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.entity.DocumentContent;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+
+/**
+ * Canonical import context handed to a structured document processor.
+ */
+public record StructuredProcessingRequest(
+ Document document,
+ DocumentContent originalContent,
+ SourceDescriptor sourceDescriptor,
+ DetectionResult detectionResult,
+ byte[] binaryContent,
+ String textContent,
+ String dedupHash
+) {
+}
diff --git a/src/main/java/at/procon/ted/camel/MailRoute.java b/src/main/java/at/procon/ted/camel/MailRoute.java
index 2a56137..49e9e6b 100644
--- a/src/main/java/at/procon/ted/camel/MailRoute.java
+++ b/src/main/java/at/procon/ted/camel/MailRoute.java
@@ -1,6 +1,10 @@
package at.procon.ted.camel;
+import at.procon.dip.domain.document.SourceType;
import at.procon.ted.config.TedProcessorProperties;
+import at.procon.dip.ingestion.service.DocumentIngestionGateway;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.service.attachment.AttachmentExtractor;
import at.procon.ted.service.attachment.AttachmentProcessingService;
import jakarta.mail.BodyPart;
@@ -14,16 +18,18 @@ import lombok.extern.slf4j.Slf4j;
import org.apache.camel.Exchange;
import org.apache.camel.LoggingLevel;
import org.apache.camel.builder.RouteBuilder;
+import org.apache.camel.component.mail.MailMessage;
import org.jsoup.Jsoup;
import org.springframework.stereotype.Component;
import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Properties;
+import java.time.OffsetDateTime;
+import java.time.ZoneId;
+import java.util.*;
/**
* Apache Camel route for IMAP mail processing.
@@ -51,6 +57,7 @@ public class MailRoute extends RouteBuilder {
private final TedProcessorProperties properties;
private final AttachmentProcessingService attachmentProcessingService;
+ private final DocumentIngestionGateway documentIngestionGateway;
@Override
public void configure() throws Exception {
@@ -105,7 +112,14 @@ public class MailRoute extends RouteBuilder {
from("direct:mime")
.routeId(ROUTE_ID_MIME)
.process(exchange -> {
- Message mailMessage = exchange.getIn().getBody(Message.class);
+ Message mailMessage = null;
+ MailMessage mailMessage_ = exchange.getIn().getBody(MailMessage.class);
+ if(mailMessage_ != null) {
+ mailMessage = mailMessage_.getMessage();
+ }
+ else {
+ mailMessage = exchange.getIn().getBody(Message.class);
+ }
if (mailMessage == null) {
log.warn("Received null mail message, skipping");
@@ -147,6 +161,41 @@ public class MailRoute extends RouteBuilder {
log.info("MIME decoded: subject='{}', textLength={}, htmlLength={}, attachments={}",
subject, finalTextContent.length(), htmlContent.length(), attachments.size());
+
+ if (properties.getGenericIngestion().isEnabled() && properties.getGenericIngestion().isMailAdapterEnabled()) {
+ try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+ mailMessage.writeTo(baos);
+ String sourceIdentifier = mailMessage.getHeader("Message-ID") != null ?
+ Arrays.stream(mailMessage.getHeader("Message-ID")).findFirst().orElse(null) : null;
+ if (sourceIdentifier == null || sourceIdentifier.isBlank()) {
+ sourceIdentifier = UUID.randomUUID().toString();
+ }
+ var result = documentIngestionGateway.ingest(new SourceDescriptor(
+ null,
+ SourceType.MAIL,
+ sourceIdentifier,
+ null,
+ subject != null ? subject.replaceAll("[^A-Za-z0-9._-]", "_") + ".eml" : "mail-message.eml",
+ "message/rfc822",
+ baos.toByteArray(),
+ null,
+ exchange.getIn().getHeader("mailReceivedDate", Date.class) == null
+ ? OffsetDateTime.now()
+ : exchange.getIn().getHeader("mailReceivedDate", Date.class).toInstant()
+ .atZone(ZoneId.systemDefault()).toOffsetDateTime(),
+ OriginalContentStoragePolicy.DEFAULT,
+ Map.of(
+ "subject", subject != null ? subject : "",
+ "from", from != null ? from : ""
+ )
+ ));
+ if (!result.warnings().isEmpty()) {
+ log.info("Mail adapter imported MIME message with {} warnings", result.warnings().size());
+ }
+ } catch (Exception e) {
+ log.warn("Phase 4.1 mail adapter import failed for subject '{}': {}", subject, e.getMessage());
+ }
+ }
})
// Queue attachments for async processing
.choice()
diff --git a/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java b/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java
index 4eabe9e..2f290ff 100644
--- a/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java
+++ b/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java
@@ -1,5 +1,9 @@
package at.procon.ted.camel;
+import at.procon.dip.ingestion.service.DocumentIngestionGateway;
+import at.procon.dip.ingestion.spi.IngestionResult;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.TedDailyPackageRepository;
@@ -53,6 +57,7 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
private final TedProcessorProperties properties;
private final TedDailyPackageRepository packageRepository;
private final TedPackageDownloadService downloadService;
+ private final DocumentIngestionGateway documentIngestionGateway;
private final BatchDocumentProcessingService batchProcessingService;
/**
@@ -146,9 +151,21 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
.process(this::markPackageDuplicate)
.otherwise()
.process(this::saveDownloadedPackage)
- .to("direct:extract-tar-gz")
+ .choice()
+ .when(header("skipLegacyXmlProcessing").isEqualTo(true))
+ .log(LoggingLevel.INFO, "Package ${header.packageId}: generic ingestion gateway-only mode active, skipping legacy XML batch persistence")
+ .to("direct:complete-package-after-gateway")
+ .otherwise()
+ .to("direct:extract-tar-gz")
+ .endChoice()
.end();
+ from("direct:complete-package-after-gateway")
+ .routeId("ted-package-gateway-only-completer")
+ .setHeader("processingStartTime", constant(System.currentTimeMillis()))
+ .process(this::markPackageCompleted)
+ .process(this::logPackageStatistics);
+
// tar.gz Extraction Route
from("direct:extract-tar-gz")
.routeId(ROUTE_ID_EXTRACTOR)
@@ -214,7 +231,7 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
long runningCount = downloadingCount + processingCount;
exchange.getIn().setHeader("runningCount", runningCount);
- exchange.getIn().setHeader("tooManyRunning", runningCount >= 2);
+ exchange.getIn().setHeader("tooManyRunning", runningCount >= 1);
if (runningCount > 0) {
log.info("Currently {} packages in progress ({} downloading, {} processing)",
@@ -278,21 +295,9 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
Integer serialNumber = exchange.getIn().getHeader("serialNumber", Integer.class);
String downloadUrl = exchange.getIn().getHeader("downloadUrl", String.class);
- Optional existing = packageRepository.findByPackageIdentifier(packageId);
- if (existing.isPresent()) {
- TedDailyPackage pkg = existing.get();
- if (pkg.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND) {
- log.info("Retrying existing NOT_FOUND package in Camel route: {}", packageId);
- pkg.setDownloadUrl(downloadUrl);
- pkg.setErrorMessage(null);
- pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING);
- pkg = packageRepository.save(pkg);
- exchange.getIn().setHeader("packageDbId", pkg.getId());
- return;
- }
-
- log.debug("Package {} already exists in DB with status {}", packageId, pkg.getDownloadStatus());
- exchange.getIn().setHeader("packageDbId", pkg.getId());
+ // Check if already exists
+ if (packageRepository.existsByPackageIdentifier(packageId)) {
+ log.debug("Package {} already exists in DB", packageId);
return;
}
@@ -372,6 +377,50 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
exchange.getIn().setHeader("downloadPath", downloadPath.toString());
exchange.getIn().setHeader("deleteAfterExtraction",
properties.getDownload().isDeleteAfterExtraction());
+
+ exchange.getIn().setHeader("skipLegacyXmlProcessing", false);
+
+ if (properties.getGenericIngestion().isEnabled() && properties.getGenericIngestion().isTedPackageAdapterEnabled()) {
+ try {
+ IngestionResult ingestionResult = documentIngestionGateway.ingest(new SourceDescriptor(
+ null,
+ at.procon.dip.domain.document.SourceType.TED_PACKAGE,
+ packageId,
+ downloadPath.toString(),
+ packageId + ".tar.gz",
+ "application/gzip",
+ body,
+ null,
+ OffsetDateTime.now(),
+ OriginalContentStoragePolicy.DEFAULT,
+ java.util.Map.of(
+ "packageId", packageId,
+ "title", packageId + ".tar.gz"
+ )
+ ));
+
+ int importedChildCount = Math.max(0, ingestionResult.documents().size() - 1);
+ exchange.getIn().setHeader("gatewayImportedChildCount", importedChildCount);
+ exchange.getIn().setHeader("gatewayImportWarnings", ingestionResult.warnings().size());
+
+ if (properties.getGenericIngestion().isGatewayOnlyForTedPackages()) {
+ packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
+ pkg.setXmlFileCount(importedChildCount);
+ pkg.setProcessedCount(importedChildCount);
+ pkg.setFailedCount(0);
+ packageRepository.save(pkg);
+ });
+
+ if (properties.getDownload().isDeleteAfterExtraction()) {
+ Files.deleteIfExists(downloadPath);
+ }
+
+ exchange.getIn().setHeader("skipLegacyXmlProcessing", true);
+ }
+ } catch (Exception e) {
+ log.warn("Phase 4.1 TED package adapter import failed for {}: {}", packageId, e.getMessage());
+ }
+ }
}
/**
diff --git a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java
index 8a46df5..59bf0e7 100644
--- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java
+++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java
@@ -510,7 +510,7 @@ public class TedProcessorProperties {
/**
* Input directory for the generic filesystem importer.
*/
- private String inputDirectory = "D:/ted.europe/generic-input";
+ private String inputDirectory = "/ted.europe/generic-input";
/**
* Regular-expression file pattern used by the Camel file route.
@@ -570,6 +570,14 @@ public class TedProcessorProperties {
*/
private boolean deduplicateByContentHash = true;
+ /**
+ * Persist ORIGINAL content rows for wrapper/container documents that primarily exist
+ * to group or reference child documents (for example TED packages or expanded ZIP wrappers).
+ * When disabled, wrappers are still classified, extracted and represented, but the raw
+ * ORIGINAL content payload is not stored in DOC.doc_content.
+ */
+ private boolean storeOriginalContentForWrapperDocuments = true;
+
/**
* Queue only the primary text representation for embedding.
*/
@@ -580,6 +588,50 @@ public class TedProcessorProperties {
*/
@NotBlank
private String importBatchId = "phase4-generic";
+
+ /**
+ * Enable the Phase 4.1 TED package adapter built on top of the generic DOC ingestion SPI.
+ */
+ private boolean tedPackageAdapterEnabled = true;
+
+ /**
+ * Enable the Phase 4.1 mail/document adapter built on top of the generic DOC ingestion SPI.
+ */
+ private boolean mailAdapterEnabled = false;
+
+ /**
+ * Optional dedicated owner tenant key for imported mail messages and attachments.
+ * Falls back to defaultOwnerTenantKey when not configured.
+ */
+ private String mailDefaultOwnerTenantKey;
+
+ /**
+ * Default visibility for imported mail messages and attachments.
+ */
+ private at.procon.dip.domain.access.DocumentVisibility mailDefaultVisibility = at.procon.dip.domain.access.DocumentVisibility.TENANT;
+
+ /**
+ * Expand ZIP attachments recursively through the mail adapter.
+ */
+ private boolean expandMailZipAttachments = true;
+
+ /**
+ * Import batch identifier for TED package roots and extracted TED child documents.
+ */
+ @NotBlank
+ private String tedPackageImportBatchId = "phase41-ted-package";
+
+ /**
+ * When true, TED packages are persisted only through the generic ingestion gateway
+ * and the legacy XML batch persistence path is skipped.
+ */
+ private boolean gatewayOnlyForTedPackages = false;
+
+ /**
+ * Import batch identifier for imported mail root messages and child attachments.
+ */
+ @NotBlank
+ private String mailImportBatchId = "phase41-mail";
}
}
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index 0d9f1ec..743539b 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -39,7 +39,7 @@ spring:
order_updates: true
flyway:
- enabled: true
+ enabled: false
locations: classpath:db/migration
baseline-on-migrate: true
create-schemas: true
@@ -128,7 +128,9 @@ ted:
# TED Daily Package Download configuration
download:
# Enable/disable automatic package download
- enabled: false
+ enabled: true
+ # Use service-based Camel route
+ use-service-based: false
# Base URL for TED Daily Packages
base-url: https://ted.europa.eu/packages/daily/
# Download directory for tar.gz files
@@ -136,11 +138,11 @@ ted:
# Extract directory for XML files
extract-directory: /ted.europe/extracted
# Start year for downloads
- start-year: 2023
+ start-year: 2026
# Max consecutive 404 errors before stopping
max-consecutive-404: 4
# Polling interval (milliseconds) - 2 minutes
- poll-interval: 120000
+ poll-interval: 3600000
# Retry interval for tail NOT_FOUND packages - 6 hours
not-found-retry-interval: 21600000
# Grace period after year end before a previous-year tail 404 is treated as final
@@ -150,7 +152,7 @@ ted:
# Download timeout (milliseconds) - 5 minutes
download-timeout: 300000
# Max concurrent downloads
- max-concurrent-downloads: 2
+ max-concurrent-downloads: 1
# Delay between downloads (milliseconds) for rate limiting - 5 seconds
delay-between-downloads: 3000
# Delete tar.gz after extraction
@@ -207,13 +209,13 @@ ted:
# Phase 4 generic ingestion configuration
generic-ingestion:
# Master switch for arbitrary document ingestion into the DOC model
- enabled: false
+ enabled: true
# Enable file-system polling for non-TED documents
file-system-enabled: false
# Allow REST/API upload endpoints for arbitrary documents
rest-upload-enabled: true
# Input directory for the generic Camel file route
- input-directory: D:/ted.europe/generic-input
+ input-directory: /ted.europe/generic-input
# Regex for files accepted by the generic file route
file-pattern: .*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$
# Move successfully processed files here
@@ -236,10 +238,29 @@ ted:
max-binary-bytes-in-db: 5242880
# Deduplicate by content hash and attach additional sources to the same canonical document
deduplicate-by-content-hash: true
+ # Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers
+ store-original-content-for-wrapper-documents: true
# Queue only the primary text representation for vectorization
vectorize-primary-representation-only: true
# Import batch marker written to DOC.doc_source.import_batch_id
import-batch-id: phase4-generic
+ # Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI
+ ted-package-adapter-enabled: true
+ # Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI
+ mail-adapter-enabled: false
+ # Optional dedicated mail owner tenant, falls back to default-owner-tenant-key
+ mail-default-owner-tenant-key:
+ # Visibility for imported mail messages and attachments
+ mail-default-visibility: TENANT
+ # Expand ZIP attachments recursively through the mail adapter
+ expand-mail-zip-attachments: true
+ # Import batch marker for TED package roots and children
+ ted-package-import-batch-id: phase41-ted-package
+ # When true, TED package documents are stored only through the generic ingestion gateway
+ # and the legacy XML batch processing path is skipped
+ gateway-only-for-ted-packages: true
+ # Import batch marker for mail roots and attachments
+ mail-import-batch-id: phase41-mail
# Solution Brief processing configuration
solution-brief:
diff --git a/src/main/resources/db/migration/V8__doc_phase4_1_expand_document_and_source_types.sql b/src/main/resources/db/migration/V8__doc_phase4_1_expand_document_and_source_types.sql
new file mode 100644
index 0000000..d2420ca
--- /dev/null
+++ b/src/main/resources/db/migration/V8__doc_phase4_1_expand_document_and_source_types.sql
@@ -0,0 +1,80 @@
+-- Phase 4.1 enum/check-constraint expansion for newly introduced generic document/source types.
+-- Supports both ENUM-backed columns (PostgreSQL ENUM types created by Flyway) and legacy
+--   VARCHAR + CHECK constraint variants that may exist in local/dev databases.
+-- NOTE: ALTER TYPE ... ADD VALUE in a transactional migration requires PostgreSQL 12+;
+--       the added enum values are not usable until this migration's transaction commits.
+
+DO $$
+BEGIN
+ IF EXISTS (
+ SELECT 1
+ FROM pg_type t
+ JOIN pg_namespace n ON n.oid = t.typnamespace
+ WHERE n.nspname = 'doc'
+ AND t.typname = 'doc_document_type'
+ ) THEN
+ ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TED_PACKAGE';
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF EXISTS (
+ SELECT 1
+ FROM pg_type t
+ JOIN pg_namespace n ON n.oid = t.typnamespace
+ WHERE n.nspname = 'doc'
+ AND t.typname = 'doc_source_type'
+ ) THEN
+ ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'PACKAGE_CHILD';
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF EXISTS (
+ SELECT 1
+ FROM pg_constraint c
+ JOIN pg_class r ON r.oid = c.conrelid
+ JOIN pg_namespace n ON n.oid = r.relnamespace
+ WHERE n.nspname = 'doc'
+ AND r.relname = 'doc_document'
+ AND c.conname = 'doc_document_document_type_check'
+ ) THEN
+ ALTER TABLE DOC.doc_document DROP CONSTRAINT doc_document_document_type_check;
+ ALTER TABLE DOC.doc_document
+ ADD CONSTRAINT doc_document_document_type_check
+ CHECK (
+ document_type IN (
+ 'TED_PACKAGE', 'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
+ 'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN'
+ )
+ );
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF EXISTS (
+ SELECT 1
+ FROM pg_constraint c
+ JOIN pg_class r ON r.oid = c.conrelid
+ JOIN pg_namespace n ON n.oid = r.relnamespace
+ WHERE n.nspname = 'doc'
+ AND r.relname = 'doc_source'
+ AND c.conname = 'doc_source_source_type_check'
+ ) THEN
+ ALTER TABLE DOC.doc_source DROP CONSTRAINT doc_source_source_type_check;
+ ALTER TABLE DOC.doc_source
+ ADD CONSTRAINT doc_source_source_type_check
+ CHECK (
+ source_type IN (
+ 'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD',
+ 'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION'
+ )
+ );
+ END IF;
+END
+$$;