Refactor phases 4.1

master
trifonovt 1 month ago
parent ac59730f3e
commit f337af56b5

@ -0,0 +1,28 @@
# Phase 4.1 adapter extensions
## Added adapters
### TED package adapter
- Source type: `TED_PACKAGE`
- Root access: `PUBLIC`, no owner tenant
- Root document type: `TED_PACKAGE`
- Child source type: `PACKAGE_CHILD`
- Child relation: `EXTRACTED_FROM`
The adapter imports the package artifact plus its XML members into the generic `DOC` model.
It does not replace the existing legacy TED package processing path; it complements it. The later legacy TED parsing step can therefore still match the same canonical child documents via their dedup hash and enrich them into proper `TED_NOTICE` projections.
### Mail/document adapter
- Source type: `MAIL`
- Root document type: `MIME_MESSAGE`
- Child relation: `ATTACHMENT_OF`
- Access: configurable via `mail-default-owner-tenant-key` and `mail-default-visibility`
The adapter stores the message body as the semantic root text and imports attachments as child documents. ZIP attachments can optionally be expanded recursively.
## Deduplication
Phase 4 deduplication by content hash is refined so the same payload is only deduplicated within the same access scope (`visibility` + `owner tenant`).
This prevents private documents from different tenants from being merged into one canonical document accidentally.

@ -224,6 +224,14 @@
<artifactId>postgresql</artifactId> <artifactId>postgresql</artifactId>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency>
<groupId>org.flywaydb</groupId>
<artifactId>flyway-core</artifactId>
</dependency>
<dependency>
<groupId>org.flywaydb</groupId>
<artifactId>flyway-database-postgresql</artifactId>
</dependency>
</dependencies> </dependencies>
<build> <build>

@ -0,0 +1,34 @@
# Phase 4.1 TED package and mail/document adapters
This phase extends the generic DOC ingestion SPI with two richer adapters:
- `TedPackageDocumentIngestionAdapter`
- `MailDocumentIngestionAdapter`
## TED package adapter
- imports the package artifact itself as a public DOC document
- expands the `.tar.gz` package into XML child payloads
- imports each child XML as a generic DOC child document
- links children to the package root via `EXTRACTED_FROM`
- keeps the existing legacy TED package processing path intact
## Mail/document adapter
- imports the MIME message as a DOC document
- extracts subject/from/to/body into the mail root semantic text
- imports attachments as child DOC documents
- links attachments via `ATTACHMENT_OF`
- optionally expands ZIP attachments recursively
## Access semantics
- TED packages and TED XML children are imported as `PUBLIC` with no owner tenant
- mail documents use a dedicated default mail access context (`mail-default-owner-tenant-key`, `mail-default-visibility`)
- deduplication is access-scope aware so private content is not merged across different tenants
Additional note:
- wrapper/container documents (for example TED package roots, or ZIP wrapper documents expanded into child documents) can skip persistence of ORIGINAL content via `ted.generic-ingestion.store-original-content-for-wrapper-documents=false`. Adapters can also override that default per imported document through `SourceDescriptor.originalContentStoragePolicy` (`STORE` / `SKIP` / `DEFAULT`), while metadata, derived representations and child relations are still kept.
- when original content storage is skipped for a document, GenericDocumentImportService now also skips extraction, derived-content persistence, representation building, and embedding queueing for that document
Schema note:
- `V8__doc_phase4_1_expand_document_and_source_types.sql` expands the generic `DOC` document/source type domain for `TED_PACKAGE` and `PACKAGE_CHILD`, and also repairs older local/dev schemas that used CHECK constraints instead of PostgreSQL ENUM types.

@ -4,6 +4,7 @@ package at.procon.dip.domain.document;
* Canonical technical document type. * Canonical technical document type.
*/ */
public enum DocumentType { public enum DocumentType {
TED_PACKAGE,
TED_NOTICE, TED_NOTICE,
EMAIL, EMAIL,
MIME_MESSAGE, MIME_MESSAGE,

@ -5,6 +5,7 @@ package at.procon.dip.domain.document;
*/ */
public enum SourceType { public enum SourceType {
TED_PACKAGE, TED_PACKAGE,
PACKAGE_CHILD,
MAIL, MAIL,
FILE_SYSTEM, FILE_SYSTEM,
REST_UPLOAD, REST_UPLOAD,

@ -13,4 +13,6 @@ public interface DocumentRelationRepository extends JpaRepository<DocumentRelati
List<DocumentRelation> findByChildDocument_Id(UUID childDocumentId); List<DocumentRelation> findByChildDocument_Id(UUID childDocumentId);
List<DocumentRelation> findByParentDocument_IdAndRelationType(UUID parentDocumentId, RelationType relationType); List<DocumentRelation> findByParentDocument_IdAndRelationType(UUID parentDocumentId, RelationType relationType);
boolean existsByParentDocument_IdAndChildDocument_IdAndRelationType(UUID parentDocumentId, UUID childDocumentId, RelationType relationType);
} }

@ -15,6 +15,8 @@ public interface DocumentRepository extends JpaRepository<Document, UUID> {
Optional<Document> findByDedupHash(String dedupHash); Optional<Document> findByDedupHash(String dedupHash);
List<Document> findAllByDedupHash(String dedupHash);
boolean existsByDedupHash(String dedupHash); boolean existsByDedupHash(String dedupHash);
List<Document> findByDocumentType(DocumentType documentType); List<Document> findByDocumentType(DocumentType documentType);

@ -28,6 +28,19 @@ public class DocumentRelationService {
return relationRepository.save(relation); return relationRepository.save(relation);
} }
public DocumentRelation ensureRelation(CreateDocumentRelationCommand command) {
boolean exists = relationRepository.existsByParentDocument_IdAndChildDocument_IdAndRelationType(
command.parentDocumentId(), command.childDocumentId(), command.relationType());
if (exists) {
return relationRepository.findByParentDocument_IdAndRelationType(command.parentDocumentId(), command.relationType())
.stream()
.filter(rel -> rel.getChildDocument() != null && command.childDocumentId().equals(rel.getChildDocument().getId()))
.findFirst()
.orElseGet(() -> createRelation(command));
}
return createRelation(command);
}
@Transactional(readOnly = true) @Transactional(readOnly = true)
public List<DocumentRelation> findChildren(UUID parentDocumentId) { public List<DocumentRelation> findChildren(UUID parentDocumentId) {
return relationRepository.findByParentDocument_Id(parentDocumentId); return relationRepository.findByParentDocument_Id(parentDocumentId);

@ -15,7 +15,6 @@ import at.procon.ted.model.entity.ProcurementLot;
import at.procon.ted.service.TedPhase2GenericDocumentService; import at.procon.ted.service.TedPhase2GenericDocumentService;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Optional;
import java.util.UUID; import java.util.UUID;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -63,8 +62,7 @@ public class TedNoticeProjectionService {
Document genericDocument = documentRepository.findById(resolvedDocumentId) Document genericDocument = documentRepository.findById(resolvedDocumentId)
.orElseThrow(() -> new IllegalArgumentException("Unknown DOC document id: " + finalResolvedDocumentId)); .orElseThrow(() -> new IllegalArgumentException("Unknown DOC document id: " + finalResolvedDocumentId));
TedNoticeProjection projection = projectionRepository.findByLegacyProcurementDocumentId(legacyDocument.getId()) TedNoticeProjection projection = projectionRepository.findByDocument_Id(genericDocument.getId())
.or(() -> projectionRepository.findByDocument_Id(genericDocument.getId()))
.orElseGet(TedNoticeProjection::new); .orElseGet(TedNoticeProjection::new);
mapProjection(projection, genericDocument, legacyDocument); mapProjection(projection, genericDocument, legacyDocument);
@ -72,19 +70,13 @@ public class TedNoticeProjectionService {
replaceLots(projection, legacyDocument.getLots()); replaceLots(projection, legacyDocument.getLots());
replaceOrganizations(projection, legacyDocument.getOrganizations()); replaceOrganizations(projection, legacyDocument.getOrganizations());
log.debug("Phase 3 TED projection ensured for legacy {} -> projection {} / doc {}", log.debug("Phase 3 TED projection ensured for generic doc {} -> projection {} (noticeId={}, publicationId={})",
legacyDocument.getId(), projection.getId(), genericDocument.getId()); genericDocument.getId(), projection.getId(), legacyDocument.getNoticeId(), legacyDocument.getPublicationId());
return projection.getId(); return projection.getId();
} }
@Transactional(readOnly = true)
public Optional<TedNoticeProjection> findByLegacyProcurementDocumentId(UUID legacyDocumentId) {
return projectionRepository.findByLegacyProcurementDocumentId(legacyDocumentId);
}
private void mapProjection(TedNoticeProjection projection, Document genericDocument, ProcurementDocument legacyDocument) { private void mapProjection(TedNoticeProjection projection, Document genericDocument, ProcurementDocument legacyDocument) {
projection.setDocument(genericDocument); projection.setDocument(genericDocument);
projection.setLegacyProcurementDocumentId(legacyDocument.getId());
projection.setNoticeId(legacyDocument.getNoticeId()); projection.setNoticeId(legacyDocument.getNoticeId());
projection.setPublicationId(legacyDocument.getPublicationId()); projection.setPublicationId(legacyDocument.getPublicationId());
projection.setNoticeUrl(legacyDocument.getNoticeUrl()); projection.setNoticeUrl(legacyDocument.getNoticeUrl());

@ -0,0 +1,56 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Extractor for mail documents ({@code MIME_MESSAGE} / {@code EMAIL}).
 *
 * <p>Promotes the message body to {@code NORMALIZED_TEXT} (preferring the
 * pre-extracted text, falling back to decoding the raw bytes as UTF-8) and
 * mirrors the source attributes into a {@code "mail-message"} structured
 * payload, resolving a {@code title} from the attributes or file name.</p>
 */
@Component
public class MimeMessageDocumentExtractor implements DocumentExtractor {

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        // Handles both the raw MIME wrapper type and the higher-level EMAIL type.
        return documentType == DocumentType.EMAIL || documentType == DocumentType.MIME_MESSAGE;
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        var descriptor = extractionRequest.sourceDescriptor();
        String body = resolveBody(extractionRequest);

        Map<String, Object> payloadAttributes = new LinkedHashMap<>();
        if (descriptor.attributes() != null) {
            payloadAttributes.putAll(descriptor.attributes());
        }
        String resolvedTitle = DocumentImportSupport.firstNonBlank(descriptor.attributes(), "title", "subject");
        if (!StringUtils.hasText(resolvedTitle)) {
            resolvedTitle = descriptor.fileName();
        }
        if (StringUtils.hasText(resolvedTitle)) {
            payloadAttributes.put("title", resolvedTitle);
        }
        ExtractedStructuredPayload payload = new ExtractedStructuredPayload("mail-message", payloadAttributes);

        if (!StringUtils.hasText(body)) {
            // No usable body text: still emit the structured payload, but warn.
            return new ExtractionResult(Map.of(), List.of(payload),
                    List.of("Mail message did not contain extractable text body"));
        }
        return new ExtractionResult(Map.of(ContentRole.NORMALIZED_TEXT, body), List.of(payload), List.of());
    }

    /**
     * Prefers the pre-extracted text content; otherwise decodes the binary
     * content as UTF-8. Normalizes CRLF/CR line endings to LF and trims.
     */
    private String resolveBody(ExtractionRequest extractionRequest) {
        String body = extractionRequest.textContent();
        if (!StringUtils.hasText(body) && extractionRequest.binaryContent() != null) {
            body = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
        }
        if (body == null) {
            return null;
        }
        return body.replace("\r\n", "\n").replace('\r', '\n').trim();
    }
}

@ -22,7 +22,7 @@ public class PdfDocumentExtractor implements DocumentExtractor {
@Override @Override
public boolean supports(DocumentType documentType, String mimeType) { public boolean supports(DocumentType documentType, String mimeType) {
return documentType == DocumentType.PDF || pdfExtractionService.canHandle("dummy.pdf", mimeType); return documentType == DocumentType.PDF;
} }
@Override @Override

@ -0,0 +1,49 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Extractor for TED package root documents ({@code TED_PACKAGE}).
 *
 * <p>Emits a manifest text as {@code NORMALIZED_TEXT} (the provided text
 * content, or a synthetic one-liner naming the package) and mirrors the
 * source attributes into a {@code "ted-package-manifest"} structured payload
 * with a resolved {@code title}.</p>
 */
// NOTE(review): @Component is commented out, so this extractor is currently
// not registered as a Spring bean — confirm whether that is intentional.
//@Component
public class TedPackageManifestExtractor implements DocumentExtractor {

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        return documentType == DocumentType.TED_PACKAGE;
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        var descriptor = extractionRequest.sourceDescriptor();

        String manifestText = extractionRequest.textContent();
        if (!StringUtils.hasText(manifestText)) {
            // Fall back to a synthetic single-line manifest naming the package.
            manifestText = "TED package: " + descriptor.sourceIdentifier();
        }

        Map<String, Object> payloadAttributes = new LinkedHashMap<>();
        if (descriptor.attributes() != null) {
            payloadAttributes.putAll(descriptor.attributes());
        }
        String resolvedTitle = DocumentImportSupport.firstNonBlank(descriptor.attributes(), "title", "packageId");
        if (!StringUtils.hasText(resolvedTitle)) {
            resolvedTitle = descriptor.fileName();
        }
        if (StringUtils.hasText(resolvedTitle)) {
            payloadAttributes.put("title", resolvedTitle);
        }

        return new ExtractionResult(
                Map.of(ContentRole.NORMALIZED_TEXT, manifestText),
                List.of(new ExtractedStructuredPayload("ted-package-manifest", payloadAttributes)),
                List.of()
        );
    }
}

@ -0,0 +1,170 @@
package at.procon.dip.ingestion.adapter;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.RelationType;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.tenant.TenantRef;
import at.procon.dip.domain.document.service.DocumentRelationService;
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.MailMessageExtractionService;
import at.procon.dip.ingestion.service.MailMessageExtractionService.MailAttachment;
import at.procon.dip.ingestion.service.MailMessageExtractionService.ParsedMailMessage;
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.service.attachment.AttachmentExtractor;
import at.procon.ted.service.attachment.ZipExtractionService;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
/**
 * Generic DOC ingestion adapter for mail messages ({@code SourceType.MAIL}).
 *
 * <p>Imports the raw MIME message as the root document (semantic text = the
 * serialized message) and every attachment as a child DOC document linked via
 * {@code ATTACHMENT_OF}. ZIP attachments can optionally be expanded
 * recursively; extracted entries are linked via {@code EXTRACTED_FROM} and the
 * ZIP wrapper itself skips ORIGINAL content storage.</p>
 *
 * <p>Access: uses the descriptor's access context when present, otherwise the
 * configured default mail access context ({@code mail-default-owner-tenant-key},
 * {@code mail-default-visibility}).</p>
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter {

    /**
     * Hard cap on recursive wrapper expansion. Guards against deeply nested or
     * maliciously crafted archives causing unbounded recursion.
     */
    private static final int MAX_EXPANSION_DEPTH = 10;

    private final TedProcessorProperties properties;
    private final GenericDocumentImportService importService;
    private final MailMessageExtractionService mailExtractionService;
    private final DocumentRelationService relationService;
    private final ZipExtractionService zipExtractionService;

    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        return sourceDescriptor.sourceType() == SourceType.MAIL
                && properties.getGenericIngestion().isEnabled()
                && properties.getGenericIngestion().isMailAdapterEnabled();
    }

    /**
     * Imports the MIME root document, then every attachment (recursively for
     * expandable ZIP wrappers).
     *
     * @throws IllegalArgumentException if the descriptor carries no MIME bytes
     */
    @Override
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        byte[] rawMime = sourceDescriptor.binaryContent();
        if (rawMime == null || rawMime.length == 0) {
            throw new IllegalArgumentException("Mail adapter requires raw MIME bytes");
        }
        ParsedMailMessage parsed = mailExtractionService.parse(rawMime);
        DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null
                ? defaultMailAccessContext()
                : sourceDescriptor.accessContext();
        Map<String, String> rootAttributes = new LinkedHashMap<>(sourceDescriptor.attributes() == null ? Map.of() : sourceDescriptor.attributes());
        if (parsed.subject() != null) {
            rootAttributes.put("subject", parsed.subject());
        }
        if (parsed.from() != null) {
            rootAttributes.put("from", parsed.from());
        }
        if (!parsed.recipients().isEmpty()) {
            rootAttributes.put("to", String.join(", ", parsed.recipients()));
        }
        rootAttributes.putIfAbsent("title", parsed.subject() != null ? parsed.subject() : sourceDescriptor.fileName());
        rootAttributes.put("attachmentCount", Integer.toString(parsed.attachments().size()));
        rootAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId());
        ImportedDocumentResult rootResult = importService.importDocument(new SourceDescriptor(
                accessContext,
                SourceType.MAIL,
                sourceDescriptor.sourceIdentifier(),
                sourceDescriptor.sourceUri(),
                sourceDescriptor.fileName() != null ? sourceDescriptor.fileName() : fallbackMailFileName(parsed),
                "message/rfc822",
                rawMime,
                mailExtractionService.serializeMessage(parsed),
                parsed.receivedAt() == null ? OffsetDateTime.now() : parsed.receivedAt(),
                OriginalContentStoragePolicy.STORE,
                rootAttributes
        ));
        List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents = new ArrayList<>();
        List<String> warnings = new ArrayList<>(rootResult.warnings());
        documents.add(rootResult.document().toCanonicalMetadata());
        int sortOrder = 0;
        for (MailAttachment attachment : parsed.attachments()) {
            importAttachment(rootResult.document().getId(), accessContext, sourceDescriptor, attachment, documents, warnings, ++sortOrder, 0);
        }
        return new IngestionResult(documents, warnings);
    }

    /**
     * Imports one attachment (or archive entry) as a child DOC document and
     * links it to {@code parentDocumentId}.
     *
     * <p>Direct mail attachments (depth 0, no archive path) are linked via
     * {@code ATTACHMENT_OF}; entries extracted from a wrapper archive via
     * {@code EXTRACTED_FROM}. Expandable ZIP wrappers skip ORIGINAL content
     * storage and are expanded recursively up to {@link #MAX_EXPANSION_DEPTH}.</p>
     */
    private void importAttachment(java.util.UUID parentDocumentId, DocumentAccessContext accessContext, SourceDescriptor parentSource,
            MailAttachment attachment, List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents,
            List<String> warnings, int sortOrder, int depth) {
        if (depth > MAX_EXPANSION_DEPTH) {
            // Robustness fix: previously the recursion was unbounded, so a
            // deeply nested (or malicious) archive could recurse indefinitely.
            warnings.add("Skipped attachment beyond max expansion depth " + MAX_EXPANSION_DEPTH + ": " + attachment.fileName());
            return;
        }
        boolean expandableWrapper = properties.getGenericIngestion().isExpandMailZipAttachments()
                && zipExtractionService.canHandle(attachment.fileName(), attachment.contentType());
        Map<String, String> attachmentAttributes = new LinkedHashMap<>();
        attachmentAttributes.put("title", attachment.fileName());
        attachmentAttributes.put("mailSourceIdentifier", parentSource.sourceIdentifier());
        attachmentAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId());
        if (expandableWrapper) {
            attachmentAttributes.put("wrapperDocument", Boolean.TRUE.toString());
        }
        ImportedDocumentResult attachmentResult = importService.importDocument(new SourceDescriptor(
                accessContext,
                SourceType.MAIL,
                // Bug fix: include sortOrder in the identifier — sibling attachments
                // with the same file name at the same depth previously collided.
                parentSource.sourceIdentifier() + ":attachment:" + depth + ":" + sortOrder + ":" + attachment.fileName(),
                parentSource.sourceUri(),
                attachment.fileName(),
                DocumentImportSupport.normalizeMediaType(attachment.contentType()),
                attachment.data(),
                previewTextIfLikelyText(attachment),
                parentSource.receivedAt() == null ? OffsetDateTime.now() : parentSource.receivedAt(),
                expandableWrapper ? OriginalContentStoragePolicy.SKIP : OriginalContentStoragePolicy.STORE,
                attachmentAttributes
        ));
        documents.add(attachmentResult.document().toCanonicalMetadata());
        warnings.addAll(attachmentResult.warnings());
        RelationType relationType = depth > 0 || attachment.path() != null ? RelationType.EXTRACTED_FROM : RelationType.ATTACHMENT_OF;
        relationService.ensureRelation(new CreateDocumentRelationCommand(
                parentDocumentId, attachmentResult.document().getId(), relationType, sortOrder, attachment.fileName()));
        if (expandableWrapper) {
            AttachmentExtractor.ExtractionResult zipResult = zipExtractionService.extract(attachment.data(), attachment.fileName(), attachment.contentType());
            if (!zipResult.success()) {
                warnings.add("ZIP attachment extraction failed for " + attachment.fileName() + ": " + zipResult.errorMessage());
                return;
            }
            int childSort = 0;
            for (AttachmentExtractor.ChildAttachment child : zipResult.childAttachments()) {
                importAttachment(attachmentResult.document().getId(), accessContext, parentSource,
                        new MailAttachment(child.filename(), child.contentType(), child.data(), child.data().length, child.pathInArchive()),
                        documents, warnings, ++childSort, depth + 1);
            }
        }
    }

    /** Derives a filesystem-safe {@code .eml} file name from the subject (or a generic fallback). */
    private String fallbackMailFileName(ParsedMailMessage parsed) {
        String subject = parsed.subject() == null || parsed.subject().isBlank() ? "mail-message" : parsed.subject().replaceAll("[^A-Za-z0-9._-]", "_");
        return subject + ".eml";
    }

    /**
     * Builds the configured default access context for mail documents. Falls
     * back to the generic default tenant key; downgrades TENANT visibility to
     * RESTRICTED when no tenant can be resolved (TENANT without a tenant would
     * be inconsistent).
     */
    private DocumentAccessContext defaultMailAccessContext() {
        String tenantKey = properties.getGenericIngestion().getMailDefaultOwnerTenantKey();
        if (tenantKey == null || tenantKey.isBlank()) {
            tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
        }
        DocumentVisibility visibility = properties.getGenericIngestion().getMailDefaultVisibility();
        TenantRef tenant = (tenantKey == null || tenantKey.isBlank()) ? null : new TenantRef(null, tenantKey, tenantKey);
        if (tenant == null && visibility == DocumentVisibility.TENANT) {
            visibility = DocumentVisibility.RESTRICTED;
        }
        return new DocumentAccessContext(tenant, visibility);
    }

    /** Returns a text preview for text-like attachments (by MIME type or extension), else null. */
    private String previewTextIfLikelyText(MailAttachment attachment) {
        String mime = DocumentImportSupport.normalizeMediaType(attachment.contentType());
        if (DocumentImportSupport.isLikelyTextMime(mime)) {
            return attachment.safeTextPreview();
        }
        String ext = DocumentImportSupport.extensionOf(attachment.fileName());
        if ("txt".equals(ext) || "xml".equals(ext) || "html".equals(ext) || "htm".equals(ext) || "md".equals(ext)) {
            return attachment.safeTextPreview();
        }
        return null;
    }
}

@ -0,0 +1,130 @@
package at.procon.dip.ingestion.adapter;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.document.RelationType;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.service.DocumentRelationService;
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.TedPackageExpansionService;
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
/**
 * Generic DOC ingestion adapter for TED packages ({@code SourceType.TED_PACKAGE}).
 *
 * <p>Imports the tar.gz package artifact as a wrapper root document (ORIGINAL
 * content storage skipped, manifest text as semantic content) and each
 * contained XML entry as a {@code PACKAGE_CHILD} document linked to the root
 * via {@code EXTRACTED_FROM}. Defaults to public access when the descriptor
 * carries no access context.</p>
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdapter {

    private final TedProcessorProperties properties;
    private final GenericDocumentImportService importService;
    private final TedPackageExpansionService expansionService;
    private final DocumentRelationService relationService;

    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        return sourceDescriptor.sourceType() == SourceType.TED_PACKAGE
                && properties.getGenericIngestion().isEnabled()
                && properties.getGenericIngestion().isTedPackageAdapterEnabled();
    }

    /**
     * Expands the package and imports the root plus all XML children.
     *
     * @throws IllegalArgumentException if the descriptor carries no package bytes
     */
    @Override
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        byte[] archiveBytes = sourceDescriptor.binaryContent();
        if (archiveBytes == null || archiveBytes.length == 0) {
            throw new IllegalArgumentException("TED package adapter requires tar.gz bytes");
        }
        TedPackageExpansionService.TedPackageExpansionResult expansion = expansionService.expand(archiveBytes);
        DocumentAccessContext accessContext = resolveAccessContext(sourceDescriptor);
        String batchId = properties.getGenericIngestion().getTedPackageImportBatchId();

        Map<String, String> packageAttributes = new LinkedHashMap<>(sourceDescriptor.attributes() == null ? Map.of() : sourceDescriptor.attributes());
        packageAttributes.putIfAbsent("packageId", sourceDescriptor.sourceIdentifier());
        packageAttributes.putIfAbsent("title", sourceDescriptor.fileName() != null ? sourceDescriptor.fileName() : sourceDescriptor.sourceIdentifier());
        packageAttributes.put("xmlEntryCount", Integer.toString(expansion.entries().size()));
        packageAttributes.put("wrapperDocument", Boolean.TRUE.toString());
        packageAttributes.put("importBatchId", batchId);

        // The package root is a wrapper document: its ORIGINAL bytes are not persisted.
        ImportedDocumentResult rootResult = importService.importDocument(new SourceDescriptor(
                accessContext,
                SourceType.TED_PACKAGE,
                sourceDescriptor.sourceIdentifier(),
                sourceDescriptor.sourceUri(),
                sourceDescriptor.fileName(),
                sourceDescriptor.mediaType() == null ? "application/gzip" : sourceDescriptor.mediaType(),
                archiveBytes,
                expansion.manifestText(),
                receivedAtOrNow(sourceDescriptor),
                OriginalContentStoragePolicy.SKIP,
                packageAttributes
        ));
        List<String> warnings = new ArrayList<>(rootResult.warnings());
        List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents = new ArrayList<>();
        documents.add(rootResult.document().toCanonicalMetadata());

        int position = 0;
        for (TedPackageExpansionService.TedPackageEntry entry : expansion.entries()) {
            position++;
            importPackageEntry(sourceDescriptor, accessContext, batchId, rootResult.document(), entry, position, documents, warnings);
        }
        return new IngestionResult(documents, warnings);
    }

    /** Imports one XML entry of the package and links it to the package root via EXTRACTED_FROM. */
    private void importPackageEntry(SourceDescriptor packageDescriptor, DocumentAccessContext accessContext, String batchId,
            Document packageRoot, TedPackageExpansionService.TedPackageEntry entry, int position,
            List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents, List<String> warnings) {
        String childUri = "tedpkg://" + packageDescriptor.sourceIdentifier() + "/" + entry.archivePath();
        String childIdentifier = packageDescriptor.sourceIdentifier() + ":" + entry.archivePath();
        Map<String, String> childAttributes = new LinkedHashMap<>();
        childAttributes.put("documentTypeHint", "TED_NOTICE");
        childAttributes.put("packageId", packageDescriptor.sourceIdentifier());
        childAttributes.put("archivePath", entry.archivePath());
        childAttributes.put("title", entry.fileName());
        childAttributes.put("importBatchId", batchId);
        ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor(
                accessContext,
                SourceType.PACKAGE_CHILD,
                childIdentifier,
                childUri,
                entry.fileName(),
                entry.mediaType() == null ? "application/xml" : entry.mediaType(),
                entry.data(),
                resolveXmlContent(entry),
                receivedAtOrNow(packageDescriptor),
                OriginalContentStoragePolicy.STORE,
                childAttributes
        ));
        Document childDocument = childResult.document();
        documents.add(childDocument.toCanonicalMetadata());
        warnings.addAll(childResult.warnings());
        if (childResult.deduplicated()) {
            warnings.add("TED XML child already existed and was linked to package: " + entry.archivePath());
        }
        relationService.ensureRelation(new CreateDocumentRelationCommand(
                packageRoot.getId(),
                childDocument.getId(),
                RelationType.EXTRACTED_FROM,
                position,
                entry.archivePath()
        ));
    }

    /** Descriptor access context, or public access when none is provided. */
    private DocumentAccessContext resolveAccessContext(SourceDescriptor sourceDescriptor) {
        return sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext();
    }

    /** Descriptor receive timestamp, or the current time when none is provided. */
    private OffsetDateTime receivedAtOrNow(SourceDescriptor sourceDescriptor) {
        return sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt();
    }

    /** Pre-decoded UTF-8 text of the entry, or a UTF-8 decode of the raw bytes. */
    private String resolveXmlContent(TedPackageExpansionService.TedPackageEntry entry) {
        String text = entry.textUtf8();
        if (text != null && !text.isBlank()) {
            return text;
        }
        return new String(entry.data(), StandardCharsets.UTF_8);
    }
}

@ -5,6 +5,7 @@ import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.SourceType; import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.tenant.TenantRef; import at.procon.dip.domain.tenant.TenantRef;
import at.procon.dip.ingestion.service.DocumentIngestionGateway; import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
import java.nio.file.Files; import java.nio.file.Files;
@ -72,6 +73,7 @@ public class GenericFileSystemIngestionRoute extends RouteBuilder {
payload, payload,
null, null,
OffsetDateTime.now(), OffsetDateTime.now(),
OriginalContentStoragePolicy.DEFAULT,
attributes attributes
); );
ingestionGateway.ingest(descriptor); ingestionGateway.ingest(descriptor);

@ -9,6 +9,7 @@ import at.procon.dip.ingestion.dto.GenericImportResponse;
import at.procon.dip.ingestion.dto.GenericTextImportRequest; import at.procon.dip.ingestion.dto.GenericTextImportRequest;
import at.procon.dip.ingestion.service.DocumentIngestionGateway; import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
import java.time.OffsetDateTime; import java.time.OffsetDateTime;
@ -62,6 +63,7 @@ public class GenericDocumentImportController {
file.getBytes(), file.getBytes(),
null, null,
OffsetDateTime.now(), OffsetDateTime.now(),
OriginalContentStoragePolicy.DEFAULT,
attributes attributes
); );
IngestionResult result = ingestionGateway.ingest(descriptor); IngestionResult result = ingestionGateway.ingest(descriptor);
@ -89,6 +91,7 @@ public class GenericDocumentImportController {
request.text() == null ? null : request.text().getBytes(java.nio.charset.StandardCharsets.UTF_8), request.text() == null ? null : request.text().getBytes(java.nio.charset.StandardCharsets.UTF_8),
request.text(), request.text(),
OffsetDateTime.now(), OffsetDateTime.now(),
OriginalContentStoragePolicy.DEFAULT,
attributes attributes
); );
IngestionResult result = ingestionGateway.ingest(descriptor); IngestionResult result = ingestionGateway.ingest(descriptor);

@ -4,15 +4,12 @@ import at.procon.dip.classification.service.DocumentClassificationService;
import at.procon.dip.classification.spi.DetectionResult; import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.access.DocumentAccessContext; import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility; import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
import at.procon.dip.domain.document.ContentRole; import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentStatus; import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.StorageType; import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document; import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent; import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
import at.procon.dip.domain.document.entity.DocumentSource;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRepository; import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository; import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.service.DocumentContentService; import at.procon.dip.domain.document.service.DocumentContentService;
@ -29,11 +26,15 @@ import at.procon.dip.extraction.service.DocumentExtractionService;
import at.procon.dip.extraction.spi.ExtractionRequest; import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult; import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.util.DocumentImportSupport; import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.normalization.service.TextRepresentationBuildService; import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationDraft; import at.procon.dip.normalization.spi.TextRepresentationDraft;
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.dip.processing.spi.DocumentProcessingPolicy;
import at.procon.dip.processing.spi.StructuredProcessingRequest;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.util.HashUtils; import at.procon.ted.util.HashUtils;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
@ -43,7 +44,6 @@ import java.util.LinkedHashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.UUID;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -51,7 +51,7 @@ import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils; import org.springframework.util.StringUtils;
/** /**
* Phase 4 generic import pipeline that persists arbitrary document types into the DOC model. * Generic import pipeline that persists arbitrary document types into the DOC model.
*/ */
@Service @Service
@RequiredArgsConstructor @RequiredArgsConstructor
@ -61,7 +61,6 @@ public class GenericDocumentImportService {
private final TedProcessorProperties properties; private final TedProcessorProperties properties;
private final DocumentRepository documentRepository; private final DocumentRepository documentRepository;
private final DocumentSourceRepository documentSourceRepository; private final DocumentSourceRepository documentSourceRepository;
private final DocumentEmbeddingRepository documentEmbeddingRepository;
private final DocumentService documentService; private final DocumentService documentService;
private final DocumentSourceService documentSourceService; private final DocumentSourceService documentSourceService;
private final DocumentContentService documentContentService; private final DocumentContentService documentContentService;
@ -70,6 +69,7 @@ public class GenericDocumentImportService {
private final DocumentClassificationService classificationService; private final DocumentClassificationService classificationService;
private final DocumentExtractionService extractionService; private final DocumentExtractionService extractionService;
private final TextRepresentationBuildService representationBuildService; private final TextRepresentationBuildService representationBuildService;
private final StructuredDocumentProcessingService structuredProcessingService;
@Transactional @Transactional
public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) { public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) {
@ -77,20 +77,20 @@ public class GenericDocumentImportService {
DetectionResult detection = classificationService.detect(withResolvedMediaType(sourceDescriptor, payload)); DetectionResult detection = classificationService.detect(withResolvedMediaType(sourceDescriptor, payload));
String dedupHash = HashUtils.computeSha256(payload.binaryContent()); String dedupHash = HashUtils.computeSha256(payload.binaryContent());
DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null
? defaultAccessContext()
: sourceDescriptor.accessContext();
if (properties.getGenericIngestion().isDeduplicateByContentHash()) { if (properties.getGenericIngestion().isDeduplicateByContentHash()) {
Optional<Document> existing = documentRepository.findByDedupHash(dedupHash); Optional<Document> existing = resolveDeduplicatedDocument(dedupHash, accessContext);
if (existing.isPresent()) { if (existing.isPresent()) {
Document document = existing.get(); Document document = existing.get();
ensureSource(document, sourceDescriptor); ensureSource(document, sourceDescriptor);
List<String> warnings = List.of("Document content hash already imported; linked new source to existing document"); List<String> warnings = List.of("Document content hash already imported within the same access scope; linked new source to existing document");
return new ImportedDocumentResult(document, detection, warnings, true); return new ImportedDocumentResult(document, detection, warnings, true);
} }
} }
DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null
? defaultAccessContext()
: sourceDescriptor.accessContext();
Document document = documentService.create(new CreateDocumentCommand( Document document = documentService.create(new CreateDocumentCommand(
accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey(), accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey(),
accessContext.visibility(), accessContext.visibility(),
@ -108,21 +108,63 @@ public class GenericDocumentImportService {
ensureSource(document, sourceDescriptor); ensureSource(document, sourceDescriptor);
documentService.updateStatus(document.getId(), DocumentStatus.CLASSIFIED); documentService.updateStatus(document.getId(), DocumentStatus.CLASSIFIED);
DocumentContent originalContent = persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash); boolean persistOriginalContent = shouldPersistOriginalContent(sourceDescriptor);
DocumentContent originalContent = persistOriginalContent
? persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash)
: null;
List<String> warnings = new ArrayList<>();
DocumentProcessingPolicy processingPolicy = structuredProcessingService.resolvePolicy(sourceDescriptor, detection);
ExtractionResult extractionResult = emptyExtractionResult();
Map<ContentRole, DocumentContent> persistedDerivedContent = new LinkedHashMap<>();
ExtractionResult extractionResult = extractionService.extract(new ExtractionRequest( if (persistOriginalContent) {
if (processingPolicy.runGenericExtraction()) {
extractionResult = extractionService.extract(new ExtractionRequest(
sourceDescriptor, sourceDescriptor,
detection, detection,
payload.textContent(), payload.textContent(),
payload.binaryContent() payload.binaryContent()
)); ));
List<String> warnings = new ArrayList<>(extractionResult.warnings()); warnings.addAll(extractionResult.warnings());
if (processingPolicy.persistExtractedContent()) {
persistedDerivedContent.putAll(persistDerivedContent(document, detection, extractionResult, dedupHash, "generic"));
}
if (!extractionResult.derivedTextByRole().isEmpty()) {
documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
}
}
Map<ContentRole, DocumentContent> persistedDerivedContent = persistDerivedContent(document, detection, extractionResult, dedupHash); if (processingPolicy.invokeStructuredProcessor()) {
Optional<ExtractionResult> structuredExtractionResult = structuredProcessingService.process(new StructuredProcessingRequest(
document,
originalContent,
sourceDescriptor,
detection,
payload.binaryContent(),
payload.textContent(),
dedupHash
));
if (structuredExtractionResult.isPresent()) {
ExtractionResult result = structuredExtractionResult.get();
warnings.addAll(result.warnings());
extractionResult = mergeExtractionResults(extractionResult, result);
if (processingPolicy.persistExtractedContent()) {
persistedDerivedContent.putAll(persistDerivedContent(document, detection, result, dedupHash, "structured"));
}
if (!result.derivedTextByRole().isEmpty()) {
documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED); documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
}
}
}
if (processingPolicy.runRepresentationBuilders()) {
var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult)); var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult));
persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts); persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts);
}
} else {
warnings.add("Original content storage disabled for this document; skipped extraction and text-representation processing");
}
Document reloaded = documentService.getRequired(document.getId()); Document reloaded = documentService.getRequired(document.getId());
if (reloaded.getStatus() == DocumentStatus.EXTRACTED) { if (reloaded.getStatus() == DocumentStatus.EXTRACTED) {
@ -130,7 +172,7 @@ public class GenericDocumentImportService {
reloaded = documentService.getRequired(reloaded.getId()); reloaded = documentService.getRequired(reloaded.getId());
} }
if (!extractionResult.structuredPayloads().isEmpty()) { if (processingPolicy.applyStructuredTitleIfMissing() && !extractionResult.structuredPayloads().isEmpty()) {
applyStructuredTitleIfMissing(reloaded, extractionResult); applyStructuredTitleIfMissing(reloaded, extractionResult);
reloaded = documentService.getRequired(reloaded.getId()); reloaded = documentService.getRequired(reloaded.getId());
} }
@ -138,6 +180,50 @@ public class GenericDocumentImportService {
return new ImportedDocumentResult(reloaded, detection, warnings, false); return new ImportedDocumentResult(reloaded, detection, warnings, false);
} }
/**
 * Merges two extraction results into one: derived text maps are merged with
 * {@code right} winning on duplicate roles, while structured payloads and
 * warnings are concatenated in left-then-right order. Either side may be null.
 */
private ExtractionResult mergeExtractionResults(ExtractionResult left, ExtractionResult right) {
    Map<ContentRole, String> mergedText = new LinkedHashMap<>();
    List<at.procon.dip.extraction.spi.ExtractedStructuredPayload> mergedPayloads = new ArrayList<>();
    List<String> mergedWarnings = new ArrayList<>();
    // Accumulate in order so entries from `right` overwrite duplicate roles from `left`.
    for (ExtractionResult part : new ExtractionResult[] {left, right}) {
        if (part == null) {
            continue;
        }
        if (part.derivedTextByRole() != null) {
            mergedText.putAll(part.derivedTextByRole());
        }
        if (part.structuredPayloads() != null) {
            mergedPayloads.addAll(part.structuredPayloads());
        }
        if (part.warnings() != null) {
            mergedWarnings.addAll(part.warnings());
        }
    }
    return new ExtractionResult(mergedText, mergedPayloads, mergedWarnings);
}
/**
 * A neutral extraction result used before any extractor has run.
 * Uses the immutable {@code Map.of()}/{@code List.of()} factories (the file
 * already imports java.util.Map/List) instead of fully-qualified
 * {@code java.util.Collections} calls; the empty result is read-only downstream.
 */
private ExtractionResult emptyExtractionResult() {
    return new ExtractionResult(Map.of(), List.of(), List.of());
}
/**
 * Finds an already-imported document with the same content hash AND the same
 * access scope (visibility + owner tenant), so documents from different tenants
 * are never merged into one canonical document by accident.
 */
private Optional<Document> resolveDeduplicatedDocument(String dedupHash, DocumentAccessContext accessContext) {
    for (Document candidate : documentRepository.findAllByDedupHash(dedupHash)) {
        if (sameAccessScope(candidate, accessContext)) {
            return Optional.of(candidate);
        }
    }
    return Optional.empty();
}
/**
 * Two documents share an access scope when both the visibility and the
 * owner-tenant key match (a null tenant key means a tenant-less/public document).
 */
private boolean sameAccessScope(Document existing, DocumentAccessContext accessContext) {
    // Enum identity comparison is safe for visibility.
    if (existing.getVisibility() != accessContext.visibility()) {
        return false;
    }
    String existingKey = existing.getOwnerTenant() == null ? null : existing.getOwnerTenant().getTenantKey();
    String requestedKey = accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey();
    // Objects.equals treats two null keys as the same (shared "no tenant" scope).
    return java.util.Objects.equals(existingKey, requestedKey);
}
private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) { private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) {
if (StringUtils.hasText(sourceDescriptor.mediaType())) { if (StringUtils.hasText(sourceDescriptor.mediaType())) {
return sourceDescriptor; return sourceDescriptor;
@ -152,6 +238,7 @@ public class GenericDocumentImportService {
sourceDescriptor.binaryContent(), sourceDescriptor.binaryContent(),
sourceDescriptor.textContent(), sourceDescriptor.textContent(),
sourceDescriptor.receivedAt(), sourceDescriptor.receivedAt(),
sourceDescriptor.originalContentStoragePolicy(),
sourceDescriptor.attributes() sourceDescriptor.attributes()
); );
} }
@ -216,7 +303,7 @@ public class GenericDocumentImportService {
return sourceDescriptor.fileName(); return sourceDescriptor.fileName();
} }
if (StringUtils.hasText(payload.textContent())) { if (StringUtils.hasText(payload.textContent())) {
for (String line : payload.textContent().split("\\n")) { for (String line : payload.textContent().split("\n")) {
if (StringUtils.hasText(line)) { if (StringUtils.hasText(line)) {
return DocumentImportSupport.ellipsize(line.trim(), 240); return DocumentImportSupport.ellipsize(line.trim(), 240);
} }
@ -244,6 +331,10 @@ public class GenericDocumentImportService {
return; return;
} }
String importBatchId = sourceDescriptor.attributes() != null && StringUtils.hasText(sourceDescriptor.attributes().get("importBatchId"))
? sourceDescriptor.attributes().get("importBatchId")
: properties.getGenericIngestion().getImportBatchId();
documentSourceService.addSource(new AddDocumentSourceCommand( documentSourceService.addSource(new AddDocumentSourceCommand(
document.getId(), document.getId(),
sourceDescriptor.sourceType(), sourceDescriptor.sourceType(),
@ -251,11 +342,35 @@ public class GenericDocumentImportService {
sourceDescriptor.sourceUri(), sourceDescriptor.sourceUri(),
sourceDescriptor.fileName(), sourceDescriptor.fileName(),
null, null,
properties.getGenericIngestion().getImportBatchId(), importBatchId,
sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt() sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt()
)); ));
} }
/**
 * Decides whether the ORIGINAL raw payload should be stored for this import.
 * An explicit STORE/SKIP policy on the descriptor wins; otherwise (DEFAULT) the
 * global configuration flag wins, and failing that only non-wrapper documents
 * persist their original content.
 */
private boolean shouldPersistOriginalContent(SourceDescriptor sourceDescriptor) {
    OriginalContentStoragePolicy policy = sourceDescriptor.originalContentStoragePolicy();
    if (policy == OriginalContentStoragePolicy.STORE) {
        return true;
    }
    if (policy == OriginalContentStoragePolicy.SKIP) {
        return false;
    }
    // DEFAULT: global override keeps originals for wrappers too; otherwise
    // only wrapper/container documents skip original storage.
    return properties.getGenericIngestion().isStoreOriginalContentForWrapperDocuments()
            || !isWrapperDocument(sourceDescriptor);
}
/**
 * True when the descriptor's attributes flag this document as a wrapper
 * (container) rather than a content-bearing document. Recognizes both the
 * "wrapperDocument" and the legacy "containerDocument" attribute keys.
 */
private boolean isWrapperDocument(SourceDescriptor sourceDescriptor) {
    Map<String, String> attrs = sourceDescriptor.attributes();
    if (attrs == null || attrs.isEmpty()) {
        return false;
    }
    String flag = attrs.get("wrapperDocument");
    // Fall back to the alternate attribute key when the primary one is absent.
    return Boolean.parseBoolean(flag != null ? flag : attrs.get("containerDocument"));
}
private DocumentContent persistOriginalContent(Document document, private DocumentContent persistOriginalContent(Document document,
SourceDescriptor sourceDescriptor, SourceDescriptor sourceDescriptor,
DetectionResult detection, DetectionResult detection,
@ -287,13 +402,14 @@ public class GenericDocumentImportService {
private Map<ContentRole, DocumentContent> persistDerivedContent(Document document, private Map<ContentRole, DocumentContent> persistDerivedContent(Document document,
DetectionResult detection, DetectionResult detection,
ExtractionResult extractionResult, ExtractionResult extractionResult,
String baseHash) { String baseHash,
String hashNamespace) {
Map<ContentRole, DocumentContent> result = new LinkedHashMap<>(); Map<ContentRole, DocumentContent> result = new LinkedHashMap<>();
extractionResult.derivedTextByRole().forEach((role, text) -> { extractionResult.derivedTextByRole().forEach((role, text) -> {
if (!StringUtils.hasText(text)) { if (!StringUtils.hasText(text)) {
return; return;
} }
String contentHash = HashUtils.computeSha256(baseHash + ":" + role.name() + ":" + text); String contentHash = HashUtils.computeSha256(baseHash + ":" + hashNamespace + ":" + role.name() + ":" + text);
DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand( DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand(
document.getId(), document.getId(),
role, role,
@ -336,16 +452,13 @@ public class GenericDocumentImportService {
if (!StringUtils.hasText(draft.textBody())) { if (!StringUtils.hasText(draft.textBody())) {
continue; continue;
} }
DocumentContent linkedContent = switch (draft.representationType()) { DocumentContent linkedContent = resolveLinkedContent(originalContent, derivedContent, draft);
case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK ->
derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
};
var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand( var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
document.getId(), document.getId(),
linkedContent == null ? null : linkedContent.getId(), linkedContent == null ? null : linkedContent.getId(),
draft.representationType(), draft.representationType(),
"phase4-generic-builder", draft.builderKey() == null ? "phase4-generic-builder" : draft.builderKey(),
draft.languageCode(), draft.languageCode(),
null, null,
draft.chunkIndex(), draft.chunkIndex(),
@ -362,7 +475,23 @@ public class GenericDocumentImportService {
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
} }
/**
 * Resolves which persisted content a text representation should link to:
 * the draft's declared source role (defaulting to NORMALIZED_TEXT), falling
 * back to the original content when no derived content exists for that role.
 */
private DocumentContent resolveLinkedContent(DocumentContent originalContent,
                                             Map<ContentRole, DocumentContent> derivedContent,
                                             TextRepresentationDraft draft) {
    ContentRole role = draft.sourceContentRole() == null
            ? ContentRole.NORMALIZED_TEXT
            : draft.sourceContentRole();
    return role == ContentRole.ORIGINAL
            ? originalContent
            : derivedContent.getOrDefault(role, originalContent);
}
private boolean shouldQueueEmbedding(TextRepresentationDraft draft) { private boolean shouldQueueEmbedding(TextRepresentationDraft draft) {
if (Boolean.FALSE.equals(draft.queueForEmbedding())) {
return false;
}
return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true; return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true;
} }

@ -0,0 +1,122 @@
package at.procon.dip.ingestion.service;
import jakarta.mail.BodyPart;
import jakarta.mail.Multipart;
import jakarta.mail.Part;
import jakarta.mail.Session;
import jakarta.mail.internet.MimeMessage;
import jakarta.mail.internet.MimeUtility;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.springframework.stereotype.Service;
@Service
@Slf4j
public class MailMessageExtractionService {

    /**
     * Parses a raw RFC 822 / MIME message into subject, sender, recipients,
     * body text and attachments.
     *
     * @param rawMime raw bytes of the MIME message
     * @return the parsed message; the text body falls back to a text rendering
     *         of the HTML body when no text/plain part is present
     * @throws IllegalArgumentException when the payload is not a parseable MIME message
     */
    public ParsedMailMessage parse(byte[] rawMime) {
        try {
            // Session.getInstance avoids the JVM-wide cached session that
            // Session.getDefaultInstance would create and then reuse for every caller.
            Session session = Session.getInstance(new Properties());
            MimeMessage message = new MimeMessage(session, new ByteArrayInputStream(rawMime));
            String subject = message.getSubject();
            String from = message.getFrom() != null && message.getFrom().length > 0 ? message.getFrom()[0].toString() : null;
            List<String> recipients = new ArrayList<>();
            if (message.getAllRecipients() != null) {
                for (var recipient : message.getAllRecipients()) {
                    recipients.add(recipient.toString());
                }
            }
            StringBuilder text = new StringBuilder();
            StringBuilder html = new StringBuilder();
            List<MailAttachment> attachments = new ArrayList<>();
            processPart(message, text, html, attachments);
            // Prefer accumulated text/plain parts; otherwise derive text from the HTML body.
            String normalizedText = text.length() > 0 ? text.toString().trim() : htmlToText(html.toString());
            OffsetDateTime receivedAt = message.getReceivedDate() == null ? OffsetDateTime.now()
                    : message.getReceivedDate().toInstant().atZone(ZoneId.systemDefault()).toOffsetDateTime();
            return new ParsedMailMessage(subject, from, recipients, receivedAt, normalizedText, html.toString(), attachments);
        } catch (Exception e) {
            throw new IllegalArgumentException("Failed to parse MIME message", e);
        }
    }

    /**
     * Recursively walks a MIME part tree, appending text/plain and text/html
     * bodies and collecting attachment (or named inline) parts.
     */
    private void processPart(Part part, StringBuilder text, StringBuilder html, List<MailAttachment> attachments) throws Exception {
        String disposition = part.getDisposition();
        String contentType = part.getContentType() == null ? "application/octet-stream" : part.getContentType();
        // Explicitly disposed parts with a file name are attachments, even when inline.
        if (disposition != null && (Part.ATTACHMENT.equalsIgnoreCase(disposition) || Part.INLINE.equalsIgnoreCase(disposition))
                && part.getFileName() != null) {
            attachments.add(extractAttachment(part));
            return;
        }
        Object content = part.getContent();
        if (content instanceof Multipart multipart) {
            for (int i = 0; i < multipart.getCount(); i++) {
                BodyPart bodyPart = multipart.getBodyPart(i);
                processPart(bodyPart, text, html, attachments);
            }
        } else if (contentType.toLowerCase().contains("text/plain")) {
            text.append(content.toString()).append("\n");
        } else if (contentType.toLowerCase().contains("text/html")) {
            html.append(content.toString()).append("\n");
        } else if (part.getFileName() != null) {
            // Non-text part with a file name but no disposition: treat as attachment.
            attachments.add(extractAttachment(part));
        }
    }

    /** Reads one MIME part fully into memory as a {@link MailAttachment}. */
    private MailAttachment extractAttachment(Part part) throws Exception {
        String fileName = part.getFileName();
        if (fileName == null) {
            fileName = "attachment";
        }
        try {
            // Decode RFC 2047 encoded-word file names; keep the raw name on failure.
            fileName = MimeUtility.decodeText(fileName);
        } catch (Exception ignored) {
        }
        String contentType = part.getContentType();
        byte[] data;
        try (InputStream in = part.getInputStream(); ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            in.transferTo(out);
            data = out.toByteArray();
        }
        return new MailAttachment(fileName, contentType, data, data.length, null);
    }

    /** Renders HTML to plain text, collapsing all whitespace runs to single spaces. */
    private String htmlToText(String html) {
        if (html == null || html.isBlank()) {
            return "";
        }
        try {
            // FIX: was "\s+" — since Java 15 that string escape is a literal SPACE,
            // so only space runs were collapsed. "\\s+" matches all whitespace
            // (tabs, newlines) as intended.
            return Jsoup.parse(html).text().replaceAll("\\s+", " ").trim();
        } catch (Exception e) {
            log.debug("Falling back to naive HTML cleanup: {}", e.getMessage());
            return html.replaceAll("<[^>]+>", " ").replaceAll("\\s+", " ").trim();
        }
    }

    /**
     * Serializes a parsed message into a simple header + body text form,
     * suitable for use as the semantic root text of the mail document.
     */
    public String serializeMessage(ParsedMailMessage parsed) {
        StringBuilder sb = new StringBuilder();
        if (parsed.subject() != null) sb.append("Subject: ").append(parsed.subject()).append("\n");
        if (parsed.from() != null) sb.append("From: ").append(parsed.from()).append("\n");
        if (!parsed.recipients().isEmpty()) sb.append("To: ").append(String.join(", ", parsed.recipients())).append("\n");
        sb.append("\n");
        if (parsed.textBody() != null) sb.append(parsed.textBody());
        return sb.toString().trim();
    }

    /** Immutable view of a parsed MIME message. */
    public record ParsedMailMessage(String subject, String from, List<String> recipients, OffsetDateTime receivedAt,
                                    String textBody, String htmlBody, List<MailAttachment> attachments) {}

    /** One extracted attachment; {@code path} is optional and may be null. */
    public record MailAttachment(String fileName, String contentType, byte[] data, long sizeBytes, String path) {
        // NOTE(review): assumes the attachment bytes are UTF-8 text — only
        // meaningful for text-like attachments; binary data will be mangled.
        public String safeTextPreview() {
            return new String(data, StandardCharsets.UTF_8);
        }
    }
}

@ -0,0 +1,88 @@
package at.procon.dip.ingestion.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.springframework.stereotype.Service;
@Service
@Slf4j
public class TedPackageExpansionService {

    // Hard limits guarding against archive bombs.
    private static final int MAX_FILES = 10000;
    private static final long MAX_SINGLE_FILE_SIZE = 20L * 1024 * 1024;
    private static final long MAX_TOTAL_EXTRACTED_SIZE = 1024L * 1024 * 1024;

    /**
     * Expands a TED daily package (tar.gz) into its XML notice entries plus a
     * human-readable manifest. Non-XML entries are skipped, suspicious entry
     * names (absolute or parent-relative paths) are logged and ignored, and
     * per-file / total-size / file-count limits are enforced.
     *
     * @throws IllegalArgumentException when the archive is unreadable or exceeds limits
     */
    public TedPackageExpansionResult expand(byte[] tarGzBytes) {
        List<TedPackageEntry> collected = new ArrayList<>();
        long extractedTotal = 0;
        try (TarArchiveInputStream archive = new TarArchiveInputStream(
                new GzipCompressorInputStream(new ByteArrayInputStream(tarGzBytes)))) {
            for (TarArchiveEntry tarEntry = archive.getNextTarEntry();
                 tarEntry != null;
                 tarEntry = archive.getNextTarEntry()) {
                if (tarEntry.isDirectory()) {
                    continue;
                }
                if (collected.size() >= MAX_FILES) {
                    break;
                }
                String name = tarEntry.getName();
                if (!name.toLowerCase().endsWith(".xml")) {
                    continue;
                }
                // Zip-slip style guard: refuse absolute or parent-relative entry names.
                if (name.contains("..") || name.startsWith("/") || name.startsWith("\\")) {
                    log.warn("Skipping suspicious TED package entry {}", name);
                    continue;
                }
                byte[] data = readEntry(archive, extractedTotal);
                extractedTotal += data.length;
                collected.add(new TedPackageEntry(extractFilename(name), name, data, data.length, "application/xml"));
            }
        } catch (Exception e) {
            throw new IllegalArgumentException("Failed to expand TED package", e);
        }
        return new TedPackageExpansionResult(collected, buildManifest(collected));
    }

    /** Copies the current entry's bytes, enforcing single-file and total-size limits. */
    private byte[] readEntry(TarArchiveInputStream archive, long totalSoFar) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        long entrySize = 0;
        long runningTotal = totalSoFar;
        for (int read = archive.read(chunk); read > 0; read = archive.read(chunk)) {
            entrySize += read;
            runningTotal += read;
            if (entrySize > MAX_SINGLE_FILE_SIZE || runningTotal > MAX_TOTAL_EXTRACTED_SIZE) {
                throw new IOException("TED package extraction limits exceeded");
            }
            out.write(chunk, 0, read);
        }
        return out.toByteArray();
    }

    /** Builds a one-line-per-entry textual manifest of the expanded package. */
    private String buildManifest(List<TedPackageEntry> entries) {
        StringBuilder manifest = new StringBuilder();
        manifest.append("TED package contains ").append(entries.size()).append(" XML notice files\n");
        for (TedPackageEntry entry : entries) {
            manifest.append("- ").append(entry.archivePath())
                    .append(" (").append(entry.sizeBytes()).append(" bytes)\n");
        }
        return manifest.toString().trim();
    }

    /** Returns the last path segment (handles both '/' and '\\' separators). */
    private String extractFilename(String path) {
        int separator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        return separator < 0 ? path : path.substring(separator + 1);
    }

    /** The expanded entries plus a human-readable manifest of the package contents. */
    public record TedPackageExpansionResult(List<TedPackageEntry> entries, String manifestText) {}

    /** One extracted XML notice file from the package. */
    public record TedPackageEntry(String fileName, String archivePath, byte[] data, long sizeBytes, String mediaType) {
        // NOTE(review): assumes TED notice XML is UTF-8 encoded — confirm against feed spec.
        public String textUtf8() {
            return new String(data, StandardCharsets.UTF_8);
        }
    }
}

@ -0,0 +1,11 @@
package at.procon.dip.ingestion.spi;
/**
 * Controls whether the ORIGINAL raw payload should be persisted for a single imported document.
 * DEFAULT defers to the global ingestion configuration and wrapper-document heuristics.
 */
public enum OriginalContentStoragePolicy {
    /** Defer to the global ingestion configuration and wrapper-document heuristics. */
    DEFAULT,
    /** Always persist the original payload for this document. */
    STORE,
    /** Never persist the original payload for this document. */
    SKIP
}

@ -21,9 +21,16 @@ public record SourceDescriptor(
byte[] binaryContent, byte[] binaryContent,
String textContent, String textContent,
OffsetDateTime receivedAt, OffsetDateTime receivedAt,
OriginalContentStoragePolicy originalContentStoragePolicy,
Map<String, String> attributes Map<String, String> attributes
) { ) {
public SourceDescriptor {
if (originalContentStoragePolicy == null) {
originalContentStoragePolicy = OriginalContentStoragePolicy.DEFAULT;
}
}
public boolean hasInlineBinaryContent() { public boolean hasInlineBinaryContent() {
return binaryContent != null && binaryContent.length > 0; return binaryContent != null && binaryContent.length > 0;
} }

@ -45,7 +45,7 @@ public final class DocumentImportSupport {
public static DocumentFamily familyFor(DocumentType documentType) { public static DocumentFamily familyFor(DocumentType documentType) {
return switch (documentType) { return switch (documentType) {
case TED_NOTICE -> DocumentFamily.PROCUREMENT; case TED_PACKAGE, TED_NOTICE -> DocumentFamily.PROCUREMENT;
case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL; case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL;
case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN -> case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN ->
DocumentFamily.GENERIC; DocumentFamily.GENERIC;

@ -12,12 +12,16 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils; import org.springframework.util.StringUtils;
@Component @Component
@Order(100)
public class DefaultGenericTextRepresentationBuilder implements TextRepresentationBuilder { public class DefaultGenericTextRepresentationBuilder implements TextRepresentationBuilder {
public static final String BUILDER_KEY = "default-generic-text";
@Override @Override
public boolean supports(DocumentType documentType) { public boolean supports(DocumentType documentType) {
return documentType != DocumentType.TED_NOTICE; return documentType != DocumentType.TED_NOTICE;
@ -39,25 +43,34 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
List<TextRepresentationDraft> drafts = new ArrayList<>(); List<TextRepresentationDraft> drafts = new ArrayList<>();
drafts.add(new TextRepresentationDraft( drafts.add(new TextRepresentationDraft(
RepresentationType.FULLTEXT, RepresentationType.FULLTEXT,
BUILDER_KEY,
request.detectionResult().languageCode(), request.detectionResult().languageCode(),
baseText, baseText,
false, false,
null null,
ContentRole.NORMALIZED_TEXT,
Boolean.TRUE
)); ));
drafts.add(new TextRepresentationDraft( drafts.add(new TextRepresentationDraft(
RepresentationType.SEMANTIC_TEXT, RepresentationType.SEMANTIC_TEXT,
BUILDER_KEY,
request.detectionResult().languageCode(), request.detectionResult().languageCode(),
semantic, semantic,
true, true,
null null,
ContentRole.NORMALIZED_TEXT,
Boolean.TRUE
)); ));
if (StringUtils.hasText(title)) { if (StringUtils.hasText(title)) {
drafts.add(new TextRepresentationDraft( drafts.add(new TextRepresentationDraft(
RepresentationType.TITLE_ABSTRACT, RepresentationType.TITLE_ABSTRACT,
BUILDER_KEY,
request.detectionResult().languageCode(), request.detectionResult().languageCode(),
title + "\n\n" + summary, title + "\n\n" + summary,
false, false,
null null,
ContentRole.NORMALIZED_TEXT,
Boolean.FALSE
)); ));
} }
return drafts; return drafts;

@ -0,0 +1,138 @@
package at.procon.dip.normalization.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
@Component
@Order(10)
public class TedStructuredTextRepresentationBuilder implements TextRepresentationBuilder {
public static final String BUILDER_KEY = "ted-structured-text";
@Override
public boolean supports(DocumentType documentType) {
    // Only structured TED notices; all other document types are handled by the
    // default generic builder (which declares supports(!= TED_NOTICE)).
    return documentType == DocumentType.TED_NOTICE;
}
@Override
public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
String normalizedText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
if (!StringUtils.hasText(normalizedText)) {
return List.of();
}
Map<String, Object> attributes = request.extractionResult().structuredPayloads().stream()
.filter(payload -> Objects.equals(payload.projectionName(), "ted-notice"))
.map(ExtractedStructuredPayload::attributes)
.filter(Objects::nonNull)
.findFirst()
.orElse(Map.of());
String title = asString(attributes.get("title"));
String description = asString(attributes.get("description"));
String buyerName = asString(attributes.get("buyerName"));
String cpvCodes = asString(attributes.get("cpvCodes"));
String nutsCodes = asString(attributes.get("nutsCodes"));
String publicationId = asString(attributes.get("publicationId"));
String semanticText = buildSemanticText(title, description, buyerName, cpvCodes, nutsCodes, publicationId, normalizedText);
String summary = DocumentImportSupport.ellipsize(
StringUtils.hasText(description) ? description.trim() : normalizedText.replace('\n', ' ').trim(),
1200
);
List<TextRepresentationDraft> drafts = new ArrayList<>();
drafts.add(new TextRepresentationDraft(
RepresentationType.SEMANTIC_TEXT,
BUILDER_KEY,
request.detectionResult().languageCode(),
semanticText,
true,
null,
ContentRole.NORMALIZED_TEXT,
Boolean.TRUE
));
/*
drafts.add(new TextRepresentationDraft(
RepresentationType.FULLTEXT,
BUILDER_KEY,
request.detectionResult().languageCode(),
normalizedText,
false,
null,
ContentRole.NORMALIZED_TEXT,
Boolean.TRUE
));
if (StringUtils.hasText(title)) {
drafts.add(new TextRepresentationDraft(
RepresentationType.TITLE_ABSTRACT,
BUILDER_KEY,
request.detectionResult().languageCode(),
title + "\n\n" + summary,
false,
null,
ContentRole.NORMALIZED_TEXT,
Boolean.FALSE
));
}
drafts.add(new TextRepresentationDraft(
RepresentationType.SUMMARY,
BUILDER_KEY,
request.detectionResult().languageCode(),
summary,
false,
null,
ContentRole.NORMALIZED_TEXT,
Boolean.FALSE
));
*/
return drafts;
}
private String buildSemanticText(String title,
String description,
String buyerName,
String cpvCodes,
String nutsCodes,
String publicationId,
String normalizedText) {
StringBuilder sb = new StringBuilder();
sb.append("Document type: TED_NOTICE\n");
if (StringUtils.hasText(publicationId)) {
sb.append("Publication: ").append(publicationId.trim()).append('\n');
}
if (StringUtils.hasText(title)) {
sb.append("Title: ").append(title.trim()).append("\n\n");
}
if (StringUtils.hasText(buyerName)) {
sb.append("Contracting Authority: ").append(buyerName.trim()).append('\n');
}
if (StringUtils.hasText(cpvCodes)) {
sb.append("CPV Codes: ").append(cpvCodes.trim()).append('\n');
}
if (StringUtils.hasText(nutsCodes)) {
sb.append("NUTS Codes: ").append(nutsCodes.trim()).append('\n');
}
if (StringUtils.hasText(description)) {
sb.append("\nDescription: ").append(description.trim()).append("\n\n");
}
sb.append(normalizedText.trim());
return sb.toString().trim();
}
private String asString(Object value) {
return value instanceof String s && StringUtils.hasText(s) ? s : null;
}
}

@ -3,8 +3,10 @@ package at.procon.dip.normalization.service;
import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationBuilder; import at.procon.dip.normalization.spi.TextRepresentationBuilder;
import at.procon.dip.normalization.spi.TextRepresentationDraft; import at.procon.dip.normalization.spi.TextRepresentationDraft;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import org.springframework.core.annotation.AnnotationAwareOrderComparator;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@Service @Service
@ -14,11 +16,22 @@ public class TextRepresentationBuildService {
private final List<TextRepresentationBuilder> builders; private final List<TextRepresentationBuilder> builders;
public List<TextRepresentationDraft> build(RepresentationBuildRequest request) { public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
return builders.stream() List<TextRepresentationBuilder> matchingBuilders = builders.stream()
.filter(builder -> builder.supports(request.detectionResult().documentType())) .filter(builder -> builder.supports(request.detectionResult().documentType()))
.findFirst() .sorted(AnnotationAwareOrderComparator.INSTANCE)
.orElseThrow(() -> new IllegalStateException( .toList();
"No text representation builder registered for " + request.detectionResult().documentType())) if (matchingBuilders.isEmpty()) {
.build(request); throw new IllegalStateException(
"No text representation builder registered for " + request.detectionResult().documentType());
}
List<TextRepresentationDraft> result = new ArrayList<>();
for (TextRepresentationBuilder builder : matchingBuilders) {
List<TextRepresentationDraft> drafts = builder.build(request);
if (drafts != null && !drafts.isEmpty()) {
result.addAll(drafts);
}
}
return result;
} }
} }

@ -1,5 +1,6 @@
package at.procon.dip.normalization.spi; package at.procon.dip.normalization.spi;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.RepresentationType; import at.procon.dip.domain.document.RepresentationType;
/** /**
@ -7,9 +8,20 @@ import at.procon.dip.domain.document.RepresentationType;
*/ */
public record TextRepresentationDraft( public record TextRepresentationDraft(
RepresentationType representationType, RepresentationType representationType,
String builderKey,
String languageCode, String languageCode,
String textBody, String textBody,
boolean primary, boolean primary,
Integer chunkIndex Integer chunkIndex,
ContentRole sourceContentRole,
Boolean queueForEmbedding
) { ) {
public TextRepresentationDraft(RepresentationType representationType,
String languageCode,
String textBody,
boolean primary,
Integer chunkIndex) {
this(representationType, null, languageCode, textBody, primary, chunkIndex, ContentRole.NORMALIZED_TEXT, null);
}
} }

@ -0,0 +1,110 @@
package at.procon.dip.processing.impl;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.processing.spi.DocumentProcessingPolicy;
import at.procon.dip.processing.spi.StructuredDocumentProcessor;
import at.procon.dip.processing.spi.StructuredProcessingRequest;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
import at.procon.ted.model.entity.ProcurementDocument;
import at.procon.ted.service.XmlParserService;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
@Component
@RequiredArgsConstructor
@Slf4j
public class TedStructuredDocumentProcessor implements StructuredDocumentProcessor {

    private final XmlParserService xmlParserService;
    private final DocumentService documentService;
    private final TedNoticeProjectionService tedNoticeProjectionService;

    @Override
    public boolean supports(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
        return detectionResult.documentType() == DocumentType.TED_NOTICE;
    }

    @Override
    public DocumentProcessingPolicy processingPolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
        return DocumentProcessingPolicy.replaceGenericTextProcessing();
    }

    /**
     * Parses the TED notice XML, enriches the canonical DOC document, refreshes the
     * TED notice projection and returns the structured extraction result.
     *
     * @param request canonical import context (document, content, detection, payload)
     * @return extraction result with normalized text and the "ted-notice" payload,
     *         or a warning-only result when no XML payload is present
     */
    @Override
    public ExtractionResult process(StructuredProcessingRequest request) {
        String xml = resolveXmlPayload(request);
        if (!StringUtils.hasText(xml)) {
            return new ExtractionResult(Map.of(), List.of(), List.of("TED structured processor received no XML payload"));
        }
        ProcurementDocument tedDocument = xmlParserService.parseDocument(xml);
        applySourceMetadata(tedDocument, request, xml);

        // Enrich and persist the canonical document BEFORE registering the projection,
        // which needs the (possibly generated) canonical id.
        var canonical = request.document();
        canonical.setDocumentType(DocumentType.TED_NOTICE);
        canonical.setDocumentFamily(DocumentFamily.PROCUREMENT);
        canonical.setStatus(DocumentStatus.CLASSIFIED);
        canonical.setTitle(tedDocument.getProjectTitle());
        canonical.setSummary(tedDocument.getProjectDescription());
        canonical.setLanguageCode(tedDocument.getLanguageCode());
        canonical.setMimeType(request.detectionResult().mimeType() == null ? "application/xml" : request.detectionResult().mimeType());
        if (StringUtils.hasText(tedDocument.getPublicationId())) {
            canonical.setBusinessKey("TED_NOTICE:" + tedDocument.getPublicationId());
        } else if (StringUtils.hasText(tedDocument.getNoticeId())) {
            canonical.setBusinessKey("TED_NOTICE:" + tedDocument.getNoticeId());
        }
        documentService.save(canonical);
        tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId());

        Map<ContentRole, String> derivedText = new LinkedHashMap<>();
        if (StringUtils.hasText(tedDocument.getTextContent())) {
            derivedText.put(ContentRole.NORMALIZED_TEXT, tedDocument.getTextContent());
        }
        return new ExtractionResult(
                derivedText,
                List.of(new ExtractedStructuredPayload("ted-notice", buildNoticeAttributes(tedDocument))),
                List.of()
        );
    }

    /** Resolves the XML payload, falling back to UTF-8 decoding of the binary content. */
    private String resolveXmlPayload(StructuredProcessingRequest request) {
        String xml = request.textContent();
        if (!StringUtils.hasText(xml) && request.binaryContent() != null) {
            xml = new String(request.binaryContent(), java.nio.charset.StandardCharsets.UTF_8);
        }
        return xml;
    }

    /** Copies import-source bookkeeping (hash, raw XML, filename, path, size) onto the TED entity. */
    private void applySourceMetadata(ProcurementDocument tedDocument, StructuredProcessingRequest request, String xml) {
        tedDocument.setDocumentHash(request.dedupHash());
        tedDocument.setXmlDocument(xml);
        tedDocument.setSourceFilename(request.sourceDescriptor().fileName());
        tedDocument.setSourcePath(request.sourceDescriptor().sourceUri());
        tedDocument.setFileSizeBytes(request.binaryContent() == null ? null : (long) request.binaryContent().length);
    }

    /** Assembles the "ted-notice" structured payload map; blank attributes are omitted. */
    private Map<String, Object> buildNoticeAttributes(ProcurementDocument tedDocument) {
        Map<String, Object> payload = new LinkedHashMap<>();
        putIfHasText(payload, "title", tedDocument.getProjectTitle());
        putIfHasText(payload, "description", tedDocument.getProjectDescription());
        putIfHasText(payload, "buyerName", tedDocument.getBuyerName());
        if (tedDocument.getCpvCodes() != null && tedDocument.getCpvCodes().length > 0) {
            payload.put("cpvCodes", String.join(", ", tedDocument.getCpvCodes()));
        }
        if (tedDocument.getNutsCodes() != null && tedDocument.getNutsCodes().length > 0) {
            payload.put("nutsCodes", String.join(", ", tedDocument.getNutsCodes()));
        }
        payload.put("lotCount", tedDocument.getLots() == null ? 0 : tedDocument.getLots().size());
        payload.put("noticeId", tedDocument.getNoticeId());
        payload.put("publicationId", tedDocument.getPublicationId());
        return payload;
    }

    /** Puts the value under the key only when it is a non-blank string. */
    private void putIfHasText(Map<String, Object> payload, String key, String value) {
        if (StringUtils.hasText(value)) {
            payload.put(key, value);
        }
    }
}

@ -0,0 +1,36 @@
package at.procon.dip.processing.service;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.processing.spi.DocumentProcessingPolicy;
import at.procon.dip.processing.spi.StructuredDocumentProcessor;
import at.procon.dip.processing.spi.StructuredProcessingRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
import java.util.Optional;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
@Service
@RequiredArgsConstructor
public class StructuredDocumentProcessingService {

    private final List<StructuredDocumentProcessor> processors;

    /**
     * Returns the first registered processor that accepts the given source/detection pair,
     * or empty when no structured processor applies.
     */
    public Optional<StructuredDocumentProcessor> resolve(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
        for (StructuredDocumentProcessor candidate : processors) {
            if (candidate.supports(sourceDescriptor, detectionResult)) {
                return Optional.of(candidate);
            }
        }
        return Optional.empty();
    }

    /**
     * Resolves the processing policy for the document; falls back to the generic
     * default policy when no structured processor matches.
     */
    public DocumentProcessingPolicy resolvePolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
        return resolve(sourceDescriptor, detectionResult)
                .map(match -> match.processingPolicy(sourceDescriptor, detectionResult))
                .orElseGet(DocumentProcessingPolicy::genericDefault);
    }

    /** Runs the matching structured processor, if any, and returns its extraction result. */
    public Optional<ExtractionResult> process(StructuredProcessingRequest request) {
        Optional<StructuredDocumentProcessor> match = resolve(request.sourceDescriptor(), request.detectionResult());
        return match.map(processor -> processor.process(request));
    }
}

@ -0,0 +1,22 @@
package at.procon.dip.processing.spi;
/**
 * Describes which generic pipeline stages should run for a document and whether a
 * structured (type-specific) processor should be invoked on top of them.
 *
 * @param runGenericExtraction        run the generic text/content extraction stage
 * @param persistExtractedContent     persist extracted content rows
 * @param runRepresentationBuilders   run text representation builders
 * @param invokeStructuredProcessor   invoke a matching structured processor, if any
 * @param applyStructuredTitleIfMissing let the structured processor fill a missing title
 */
public record DocumentProcessingPolicy(
        boolean runGenericExtraction,
        boolean persistExtractedContent,
        boolean runRepresentationBuilders,
        boolean invokeStructuredProcessor,
        boolean applyStructuredTitleIfMissing
) {

    // Records have value semantics, so the canonical policies can be shared safely.
    private static final DocumentProcessingPolicy GENERIC_DEFAULT =
            new DocumentProcessingPolicy(true, true, true, true, true);
    private static final DocumentProcessingPolicy REPLACE_GENERIC_TEXT =
            new DocumentProcessingPolicy(false, true, true, true, true);

    /** Policy with every generic stage enabled (the pipeline default). */
    public static DocumentProcessingPolicy genericDefault() {
        return GENERIC_DEFAULT;
    }

    /** Policy where the structured processor replaces generic extraction, everything else still runs. */
    public static DocumentProcessingPolicy replaceGenericTextProcessing() {
        return REPLACE_GENERIC_TEXT;
    }
}

@ -0,0 +1,19 @@
package at.procon.dip.processing.spi;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
/**
 * Optional type-specific enrichment layer on top of the canonical DOC import.
 *
 * <p>Implementations are discovered as Spring beans and consulted by
 * {@code StructuredDocumentProcessingService}; the first one whose
 * {@link #supports} returns true handles the document.
 */
public interface StructuredDocumentProcessor {

    /**
     * Returns true when this processor handles the given source/detection combination.
     *
     * @param sourceDescriptor describes where the document came from
     * @param detectionResult  classification output (document type, mime type, ...)
     */
    boolean supports(SourceDescriptor sourceDescriptor, DetectionResult detectionResult);

    /**
     * Policy controlling which generic pipeline stages still run for documents handled
     * by this processor. Defaults to replacing the generic text processing stage.
     */
    default DocumentProcessingPolicy processingPolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
        return DocumentProcessingPolicy.replaceGenericTextProcessing();
    }

    /**
     * Performs the type-specific processing and returns the extraction result that feeds
     * the downstream representation/embedding stages.
     */
    ExtractionResult process(StructuredProcessingRequest request);
}

@ -0,0 +1,20 @@
package at.procon.dip.processing.spi;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.ingestion.spi.SourceDescriptor;
/**
 * Canonical import context handed to a structured document processor.
 *
 * @param document        the canonical DOC document entity being imported/enriched
 * @param originalContent the persisted ORIGINAL content row for the document
 * @param sourceDescriptor describes the ingestion source (type, identifiers, file name)
 * @param detectionResult classification output for the payload
 * @param binaryContent   raw payload bytes; may be null when only text is available.
 *                        NOTE(review): the array is not defensively copied — callers and
 *                        the record share the same array; confirm no mutation after construction.
 * @param textContent     textual payload, if already decoded; may be null
 * @param dedupHash       content hash used for deduplication bookkeeping
 */
public record StructuredProcessingRequest(
Document document,
DocumentContent originalContent,
SourceDescriptor sourceDescriptor,
DetectionResult detectionResult,
byte[] binaryContent,
String textContent,
String dedupHash
) {
}

@ -1,6 +1,10 @@
package at.procon.ted.camel; package at.procon.ted.camel;
import at.procon.dip.domain.document.SourceType;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.service.attachment.AttachmentExtractor; import at.procon.ted.service.attachment.AttachmentExtractor;
import at.procon.ted.service.attachment.AttachmentProcessingService; import at.procon.ted.service.attachment.AttachmentProcessingService;
import jakarta.mail.BodyPart; import jakarta.mail.BodyPart;
@ -14,16 +18,18 @@ import lombok.extern.slf4j.Slf4j;
import org.apache.camel.Exchange; import org.apache.camel.Exchange;
import org.apache.camel.LoggingLevel; import org.apache.camel.LoggingLevel;
import org.apache.camel.builder.RouteBuilder; import org.apache.camel.builder.RouteBuilder;
import org.apache.camel.component.mail.MailMessage;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.time.OffsetDateTime;
import java.util.List; import java.time.ZoneId;
import java.util.Properties; import java.util.*;
/** /**
* Apache Camel route for IMAP mail processing. * Apache Camel route for IMAP mail processing.
@ -51,6 +57,7 @@ public class MailRoute extends RouteBuilder {
private final TedProcessorProperties properties; private final TedProcessorProperties properties;
private final AttachmentProcessingService attachmentProcessingService; private final AttachmentProcessingService attachmentProcessingService;
private final DocumentIngestionGateway documentIngestionGateway;
@Override @Override
public void configure() throws Exception { public void configure() throws Exception {
@ -105,7 +112,14 @@ public class MailRoute extends RouteBuilder {
from("direct:mime") from("direct:mime")
.routeId(ROUTE_ID_MIME) .routeId(ROUTE_ID_MIME)
.process(exchange -> { .process(exchange -> {
Message mailMessage = exchange.getIn().getBody(Message.class); Message mailMessage = null;
MailMessage mailMessage_ = exchange.getIn().getBody(MailMessage.class);
if(mailMessage_ != null) {
mailMessage = mailMessage_.getMessage();
}
else {
mailMessage = exchange.getIn().getBody(Message.class);
}
if (mailMessage == null) { if (mailMessage == null) {
log.warn("Received null mail message, skipping"); log.warn("Received null mail message, skipping");
@ -147,6 +161,41 @@ public class MailRoute extends RouteBuilder {
log.info("MIME decoded: subject='{}', textLength={}, htmlLength={}, attachments={}", log.info("MIME decoded: subject='{}', textLength={}, htmlLength={}, attachments={}",
subject, finalTextContent.length(), htmlContent.length(), attachments.size()); subject, finalTextContent.length(), htmlContent.length(), attachments.size());
if (properties.getGenericIngestion().isEnabled() && properties.getGenericIngestion().isMailAdapterEnabled()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
mailMessage.writeTo(baos);
String sourceIdentifier = mailMessage.getHeader("Message-ID") != null ?
Arrays.stream(mailMessage.getHeader("Message-ID")).findFirst().orElse(null) : null;
if (sourceIdentifier == null || sourceIdentifier.isBlank()) {
sourceIdentifier = UUID.randomUUID().toString();
}
var result = documentIngestionGateway.ingest(new SourceDescriptor(
null,
SourceType.MAIL,
sourceIdentifier,
null,
subject != null ? subject.replaceAll("[^A-Za-z0-9._-]", "_") + ".eml" : "mail-message.eml",
"message/rfc822",
baos.toByteArray(),
null,
exchange.getIn().getHeader("mailReceivedDate", Date.class) == null
? OffsetDateTime.now()
: exchange.getIn().getHeader("mailReceivedDate", Date.class).toInstant()
.atZone(ZoneId.systemDefault()).toOffsetDateTime(),
OriginalContentStoragePolicy.DEFAULT,
Map.of(
"subject", subject != null ? subject : "",
"from", from != null ? from : ""
)
));
if (!result.warnings().isEmpty()) {
log.info("Mail adapter imported MIME message with {} warnings", result.warnings().size());
}
} catch (Exception e) {
log.warn("Phase 4.1 mail adapter import failed for subject '{}': {}", subject, e.getMessage());
}
}
}) })
// Queue attachments for async processing // Queue attachments for async processing
.choice() .choice()

@ -1,5 +1,9 @@
package at.procon.ted.camel; package at.procon.ted.camel;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.model.entity.TedDailyPackage; import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.TedDailyPackageRepository; import at.procon.ted.repository.TedDailyPackageRepository;
@ -53,6 +57,7 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
private final TedProcessorProperties properties; private final TedProcessorProperties properties;
private final TedDailyPackageRepository packageRepository; private final TedDailyPackageRepository packageRepository;
private final TedPackageDownloadService downloadService; private final TedPackageDownloadService downloadService;
private final DocumentIngestionGateway documentIngestionGateway;
private final BatchDocumentProcessingService batchProcessingService; private final BatchDocumentProcessingService batchProcessingService;
/** /**
@ -146,9 +151,21 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
.process(this::markPackageDuplicate) .process(this::markPackageDuplicate)
.otherwise() .otherwise()
.process(this::saveDownloadedPackage) .process(this::saveDownloadedPackage)
.choice()
.when(header("skipLegacyXmlProcessing").isEqualTo(true))
.log(LoggingLevel.INFO, "Package ${header.packageId}: generic ingestion gateway-only mode active, skipping legacy XML batch persistence")
.to("direct:complete-package-after-gateway")
.otherwise()
.to("direct:extract-tar-gz") .to("direct:extract-tar-gz")
.endChoice()
.end(); .end();
from("direct:complete-package-after-gateway")
.routeId("ted-package-gateway-only-completer")
.setHeader("processingStartTime", constant(System.currentTimeMillis()))
.process(this::markPackageCompleted)
.process(this::logPackageStatistics);
// tar.gz Extraction Route // tar.gz Extraction Route
from("direct:extract-tar-gz") from("direct:extract-tar-gz")
.routeId(ROUTE_ID_EXTRACTOR) .routeId(ROUTE_ID_EXTRACTOR)
@ -214,7 +231,7 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
long runningCount = downloadingCount + processingCount; long runningCount = downloadingCount + processingCount;
exchange.getIn().setHeader("runningCount", runningCount); exchange.getIn().setHeader("runningCount", runningCount);
exchange.getIn().setHeader("tooManyRunning", runningCount >= 2); exchange.getIn().setHeader("tooManyRunning", runningCount >= 1);
if (runningCount > 0) { if (runningCount > 0) {
log.info("Currently {} packages in progress ({} downloading, {} processing)", log.info("Currently {} packages in progress ({} downloading, {} processing)",
@ -278,21 +295,9 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
Integer serialNumber = exchange.getIn().getHeader("serialNumber", Integer.class); Integer serialNumber = exchange.getIn().getHeader("serialNumber", Integer.class);
String downloadUrl = exchange.getIn().getHeader("downloadUrl", String.class); String downloadUrl = exchange.getIn().getHeader("downloadUrl", String.class);
Optional<TedDailyPackage> existing = packageRepository.findByPackageIdentifier(packageId); // Check if already exists
if (existing.isPresent()) { if (packageRepository.existsByPackageIdentifier(packageId)) {
TedDailyPackage pkg = existing.get(); log.debug("Package {} already exists in DB", packageId);
if (pkg.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND) {
log.info("Retrying existing NOT_FOUND package in Camel route: {}", packageId);
pkg.setDownloadUrl(downloadUrl);
pkg.setErrorMessage(null);
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING);
pkg = packageRepository.save(pkg);
exchange.getIn().setHeader("packageDbId", pkg.getId());
return;
}
log.debug("Package {} already exists in DB with status {}", packageId, pkg.getDownloadStatus());
exchange.getIn().setHeader("packageDbId", pkg.getId());
return; return;
} }
@ -372,6 +377,50 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder {
exchange.getIn().setHeader("downloadPath", downloadPath.toString()); exchange.getIn().setHeader("downloadPath", downloadPath.toString());
exchange.getIn().setHeader("deleteAfterExtraction", exchange.getIn().setHeader("deleteAfterExtraction",
properties.getDownload().isDeleteAfterExtraction()); properties.getDownload().isDeleteAfterExtraction());
exchange.getIn().setHeader("skipLegacyXmlProcessing", false);
if (properties.getGenericIngestion().isEnabled() && properties.getGenericIngestion().isTedPackageAdapterEnabled()) {
try {
IngestionResult ingestionResult = documentIngestionGateway.ingest(new SourceDescriptor(
null,
at.procon.dip.domain.document.SourceType.TED_PACKAGE,
packageId,
downloadPath.toString(),
packageId + ".tar.gz",
"application/gzip",
body,
null,
OffsetDateTime.now(),
OriginalContentStoragePolicy.DEFAULT,
java.util.Map.of(
"packageId", packageId,
"title", packageId + ".tar.gz"
)
));
int importedChildCount = Math.max(0, ingestionResult.documents().size() - 1);
exchange.getIn().setHeader("gatewayImportedChildCount", importedChildCount);
exchange.getIn().setHeader("gatewayImportWarnings", ingestionResult.warnings().size());
if (properties.getGenericIngestion().isGatewayOnlyForTedPackages()) {
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
pkg.setXmlFileCount(importedChildCount);
pkg.setProcessedCount(importedChildCount);
pkg.setFailedCount(0);
packageRepository.save(pkg);
});
if (properties.getDownload().isDeleteAfterExtraction()) {
Files.deleteIfExists(downloadPath);
}
exchange.getIn().setHeader("skipLegacyXmlProcessing", true);
}
} catch (Exception e) {
log.warn("Phase 4.1 TED package adapter import failed for {}: {}", packageId, e.getMessage());
}
}
} }
/** /**

@ -510,7 +510,7 @@ public class TedProcessorProperties {
/** /**
* Input directory for the generic filesystem importer. * Input directory for the generic filesystem importer.
*/ */
private String inputDirectory = "D:/ted.europe/generic-input"; private String inputDirectory = "/ted.europe/generic-input";
/** /**
* Regular-expression file pattern used by the Camel file route. * Regular-expression file pattern used by the Camel file route.
@ -570,6 +570,14 @@ public class TedProcessorProperties {
*/ */
private boolean deduplicateByContentHash = true; private boolean deduplicateByContentHash = true;
/**
* Persist ORIGINAL content rows for wrapper/container documents that primarily exist
* to group or reference child documents (for example TED packages or expanded ZIP wrappers).
* When disabled, wrappers are still classified, extracted and represented, but the raw
* ORIGINAL content payload is not stored in DOC.doc_content.
*/
private boolean storeOriginalContentForWrapperDocuments = true;
/** /**
* Queue only the primary text representation for embedding. * Queue only the primary text representation for embedding.
*/ */
@ -580,6 +588,50 @@ public class TedProcessorProperties {
*/ */
@NotBlank @NotBlank
private String importBatchId = "phase4-generic"; private String importBatchId = "phase4-generic";
/**
* Enable the Phase 4.1 TED package adapter built on top of the generic DOC ingestion SPI.
*/
private boolean tedPackageAdapterEnabled = true;
/**
* Enable the Phase 4.1 mail/document adapter built on top of the generic DOC ingestion SPI.
*/
private boolean mailAdapterEnabled = false;
/**
* Optional dedicated owner tenant key for imported mail messages and attachments.
* Falls back to defaultOwnerTenantKey when not configured.
*/
private String mailDefaultOwnerTenantKey;
/**
* Default visibility for imported mail messages and attachments.
*/
private at.procon.dip.domain.access.DocumentVisibility mailDefaultVisibility = at.procon.dip.domain.access.DocumentVisibility.TENANT;
/**
* Expand ZIP attachments recursively through the mail adapter.
*/
private boolean expandMailZipAttachments = true;
/**
* Import batch identifier for TED package roots and extracted TED child documents.
*/
@NotBlank
private String tedPackageImportBatchId = "phase41-ted-package";
/**
* When true, TED packages are persisted only through the generic ingestion gateway
* and the legacy XML batch persistence path is skipped.
*/
private boolean gatewayOnlyForTedPackages = false;
/**
* Import batch identifier for imported mail root messages and child attachments.
*/
@NotBlank
private String mailImportBatchId = "phase41-mail";
} }
} }

@ -39,7 +39,7 @@ spring:
order_updates: true order_updates: true
flyway: flyway:
enabled: true enabled: false
locations: classpath:db/migration locations: classpath:db/migration
baseline-on-migrate: true baseline-on-migrate: true
create-schemas: true create-schemas: true
@ -128,7 +128,9 @@ ted:
# TED Daily Package Download configuration # TED Daily Package Download configuration
download: download:
# Enable/disable automatic package download # Enable/disable automatic package download
enabled: false enabled: true
      # Use service-based camel route
use-service-based: false
# Base URL for TED Daily Packages # Base URL for TED Daily Packages
base-url: https://ted.europa.eu/packages/daily/ base-url: https://ted.europa.eu/packages/daily/
# Download directory for tar.gz files # Download directory for tar.gz files
@ -136,11 +138,11 @@ ted:
# Extract directory for XML files # Extract directory for XML files
extract-directory: /ted.europe/extracted extract-directory: /ted.europe/extracted
# Start year for downloads # Start year for downloads
start-year: 2023 start-year: 2026
# Max consecutive 404 errors before stopping # Max consecutive 404 errors before stopping
max-consecutive-404: 4 max-consecutive-404: 4
# Polling interval (milliseconds) - 2 minutes # Polling interval (milliseconds) - 2 minutes
poll-interval: 120000 poll-interval: 3600000
# Retry interval for tail NOT_FOUND packages - 6 hours # Retry interval for tail NOT_FOUND packages - 6 hours
not-found-retry-interval: 21600000 not-found-retry-interval: 21600000
# Grace period after year end before a previous-year tail 404 is treated as final # Grace period after year end before a previous-year tail 404 is treated as final
@ -150,7 +152,7 @@ ted:
# Download timeout (milliseconds) - 5 minutes # Download timeout (milliseconds) - 5 minutes
download-timeout: 300000 download-timeout: 300000
# Max concurrent downloads # Max concurrent downloads
max-concurrent-downloads: 2 max-concurrent-downloads: 1
# Delay between downloads (milliseconds) for rate limiting - 5 seconds # Delay between downloads (milliseconds) for rate limiting - 5 seconds
delay-between-downloads: 3000 delay-between-downloads: 3000
# Delete tar.gz after extraction # Delete tar.gz after extraction
@ -207,13 +209,13 @@ ted:
# Phase 4 generic ingestion configuration # Phase 4 generic ingestion configuration
generic-ingestion: generic-ingestion:
# Master switch for arbitrary document ingestion into the DOC model # Master switch for arbitrary document ingestion into the DOC model
enabled: false enabled: true
# Enable file-system polling for non-TED documents # Enable file-system polling for non-TED documents
file-system-enabled: false file-system-enabled: false
# Allow REST/API upload endpoints for arbitrary documents # Allow REST/API upload endpoints for arbitrary documents
rest-upload-enabled: true rest-upload-enabled: true
# Input directory for the generic Camel file route # Input directory for the generic Camel file route
input-directory: D:/ted.europe/generic-input input-directory: /ted.europe/generic-input
# Regex for files accepted by the generic file route # Regex for files accepted by the generic file route
file-pattern: .*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$ file-pattern: .*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$
# Move successfully processed files here # Move successfully processed files here
@ -236,10 +238,29 @@ ted:
max-binary-bytes-in-db: 5242880
# Deduplicate by content hash and attach additional sources to the same canonical document
deduplicate-by-content-hash: true
# Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers
store-original-content-for-wrapper-documents: true
# Queue only the primary text representation for vectorization
vectorize-primary-representation-only: true
# Import batch marker written to DOC.doc_source.import_batch_id
import-batch-id: phase4-generic
# Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI
ted-package-adapter-enabled: true
# Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI
mail-adapter-enabled: false
# Optional dedicated mail owner tenant, falls back to default-owner-tenant-key
mail-default-owner-tenant-key:
# Visibility for imported mail messages and attachments
mail-default-visibility: TENANT
# Expand ZIP attachments recursively through the mail adapter
expand-mail-zip-attachments: true
# Import batch marker for TED package roots and children
ted-package-import-batch-id: phase41-ted-package
# When true, TED package documents are stored only through the generic ingestion gateway
# and the legacy XML batch processing path is skipped
gateway-only-for-ted-packages: true
# Import batch marker for mail roots and attachments
mail-import-batch-id: phase41-mail
# Solution Brief processing configuration
solution-brief:

@ -0,0 +1,80 @@
-- Phase 4.1 enum/check-constraint expansion for newly introduced generic document/source types.
-- Supports both:
--   1) PostgreSQL ENUM-backed columns created by Flyway
--   2) legacy VARCHAR + CHECK constraint variants that may exist in local/dev databases
--
-- Register the new TED_PACKAGE root document type on the ENUM-backed schema variant.
-- The pg_type/pg_namespace lookup makes the script a no-op on databases where the
-- 'doc.doc_document_type' enum does not exist (legacy VARCHAR variant); those are
-- handled by the CHECK-constraint blocks further below.
-- ADD VALUE IF NOT EXISTS keeps the migration idempotent on re-runs.
-- NOTE(review): 'ALTER TYPE ... ADD VALUE' inside a transactional (Flyway) migration
-- requires PostgreSQL 12+; on older servers this DO block fails with
-- "cannot run inside a transaction block" - confirm the minimum supported server version.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_type t
JOIN pg_namespace n ON n.oid = t.typnamespace
WHERE n.nspname = 'doc'
AND t.typname = 'doc_document_type'
) THEN
-- Unquoted DOC folds to lowercase 'doc', matching the nspname check above.
ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TED_PACKAGE';
END IF;
END
$$;
-- Register the new PACKAGE_CHILD source type (children extracted from a TED package)
-- on the ENUM-backed schema variant. No-op when the 'doc.doc_source_type' enum does
-- not exist; idempotent via ADD VALUE IF NOT EXISTS.
-- NOTE(review): same PostgreSQL 12+ requirement as the block above for
-- 'ALTER TYPE ... ADD VALUE' inside a transactional migration.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_type t
JOIN pg_namespace n ON n.oid = t.typnamespace
WHERE n.nspname = 'doc'
AND t.typname = 'doc_source_type'
) THEN
ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'PACKAGE_CHILD';
END IF;
END
$$;
-- Legacy VARCHAR variant: widen the CHECK constraint on doc_document.document_type
-- to accept the Phase 4.1 'TED_PACKAGE' value. Only fires when a constraint with the
-- exact name 'doc_document_document_type_check' exists, so databases using the enum
-- variant (or an auto-generated constraint name) are left untouched.
-- NOTE(review): drop + re-add validates every existing row under an ACCESS EXCLUSIVE
-- lock, and the value list below must be the full superset of previously allowed
-- document types - confirm it matches the original constraint definition.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_constraint c
JOIN pg_class r ON r.oid = c.conrelid
JOIN pg_namespace n ON n.oid = r.relnamespace
WHERE n.nspname = 'doc'
AND r.relname = 'doc_document'
AND c.conname = 'doc_document_document_type_check'
) THEN
ALTER TABLE DOC.doc_document DROP CONSTRAINT doc_document_document_type_check;
ALTER TABLE DOC.doc_document
ADD CONSTRAINT doc_document_document_type_check
CHECK (
document_type IN (
'TED_PACKAGE', 'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN'
)
);
END IF;
END
$$;
-- Legacy VARCHAR variant: widen the CHECK constraint on doc_source.source_type to
-- accept the Phase 4.1 'TED_PACKAGE' and 'PACKAGE_CHILD' values. Mirrors the block
-- above: only fires when the named constraint 'doc_source_source_type_check' exists.
-- NOTE(review): the value list must remain a superset of the previously allowed
-- source types, otherwise existing rows fail validation when the constraint is
-- re-added - confirm against the original constraint definition.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_constraint c
JOIN pg_class r ON r.oid = c.conrelid
JOIN pg_namespace n ON n.oid = r.relnamespace
WHERE n.nspname = 'doc'
AND r.relname = 'doc_source'
AND c.conname = 'doc_source_source_type_check'
) THEN
ALTER TABLE DOC.doc_source DROP CONSTRAINT doc_source_source_type_check;
ALTER TABLE DOC.doc_source
ADD CONSTRAINT doc_source_source_type_check
CHECK (
source_type IN (
'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD',
'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION'
)
);
END IF;
END
$$;
Loading…
Cancel
Save