diff --git a/docs/architecture/PHASE4_1_ADAPTERS.md b/docs/architecture/PHASE4_1_ADAPTERS.md new file mode 100644 index 0000000..14e9944 --- /dev/null +++ b/docs/architecture/PHASE4_1_ADAPTERS.md @@ -0,0 +1,28 @@ +# Phase 4.1 adapter extensions + +## Added adapters + +### TED package adapter + +- Source type: `TED_PACKAGE` +- Root access: `PUBLIC`, no owner tenant +- Root document type: `TED_PACKAGE` +- Child source type: `PACKAGE_CHILD` +- Child relation: `EXTRACTED_FROM` + +The adapter imports the package artifact plus its XML members into the generic `DOC` model. +It does not replace the existing legacy TED package processing path; instead it complements it, so the later legacy TED parsing step can still enrich the same canonical child documents into proper `TED_NOTICE` projections by dedup hash. + +### Mail/document adapter + +- Source type: `MAIL` +- Root document type: `MIME_MESSAGE` +- Child relation: `ATTACHMENT_OF` +- Access: configurable via `mail-default-owner-tenant-key` and `mail-default-visibility` + +The adapter stores the message body as the semantic root text and imports attachments as child documents. ZIP attachments can optionally be expanded recursively. + +## Deduplication + +Phase 4 deduplication by content hash is refined so the same payload is only deduplicated within the same access scope (`visibility` + `owner tenant`). +This prevents private documents from different tenants from being merged into one canonical document accidentally. 
diff --git a/pom.xml b/pom.xml index 007f25d..2106724 100644 --- a/pom.xml +++ b/pom.xml @@ -224,6 +224,14 @@ postgresql test + + org.flywaydb + flyway-core + + + org.flywaydb + flyway-database-postgresql + diff --git a/src/main/java/at/procon/dip/README_PHASE4_1.md b/src/main/java/at/procon/dip/README_PHASE4_1.md new file mode 100644 index 0000000..b4796fb --- /dev/null +++ b/src/main/java/at/procon/dip/README_PHASE4_1.md @@ -0,0 +1,34 @@ +# Phase 4.1 – TED package and mail/document adapters + +This phase extends the generic DOC ingestion SPI with two richer adapters: + +- `TedPackageDocumentIngestionAdapter` +- `MailDocumentIngestionAdapter` + +## TED package adapter +- imports the package artifact itself as a public DOC document +- expands the `.tar.gz` package into XML child payloads +- imports each child XML as a generic DOC child document +- links children to the package root via `EXTRACTED_FROM` +- keeps the existing legacy TED package processing path intact + +## Mail/document adapter +- imports the MIME message as a DOC document +- extracts subject/from/to/body into the mail root semantic text +- imports attachments as child DOC documents +- links attachments via `ATTACHMENT_OF` +- optionally expands ZIP attachments recursively + +## Access semantics +- TED packages and TED XML children are imported as `PUBLIC` with no owner tenant +- mail documents use a dedicated default mail access context (`mail-default-owner-tenant-key`, `mail-default-visibility`) +- deduplication is access-scope aware so private content is not merged across different tenants + +Additional note: +- wrapper/container documents (for example TED package roots or ZIP wrapper documents expanded into child documents) can skip persistence of ORIGINAL content via `ted.generic-ingestion.store-original-content-for-wrapper-documents=false`, and adapters can now override that default per imported document through `SourceDescriptor.originalContentStoragePolicy` (`STORE` / `SKIP` / `DEFAULT`), 
while still keeping metadata, derived representations and child relations. + +- when original content storage is skipped for a document, GenericDocumentImportService now also skips extraction, derived-content persistence, representation building, and embedding queueing for that document + + +Schema note: +- `V8__doc_phase4_1_expand_document_and_source_types.sql` expands the generic `DOC` document/source type domain for `TED_PACKAGE` and `PACKAGE_CHILD`, and also repairs older local/dev schemas that used CHECK constraints instead of PostgreSQL ENUM types. diff --git a/src/main/java/at/procon/dip/domain/document/DocumentType.java b/src/main/java/at/procon/dip/domain/document/DocumentType.java index f6a651b..1ff0720 100644 --- a/src/main/java/at/procon/dip/domain/document/DocumentType.java +++ b/src/main/java/at/procon/dip/domain/document/DocumentType.java @@ -4,6 +4,7 @@ package at.procon.dip.domain.document; * Canonical technical document type. */ public enum DocumentType { + TED_PACKAGE, TED_NOTICE, EMAIL, MIME_MESSAGE, diff --git a/src/main/java/at/procon/dip/domain/document/SourceType.java b/src/main/java/at/procon/dip/domain/document/SourceType.java index d53b841..ccb0e62 100644 --- a/src/main/java/at/procon/dip/domain/document/SourceType.java +++ b/src/main/java/at/procon/dip/domain/document/SourceType.java @@ -5,6 +5,7 @@ package at.procon.dip.domain.document; */ public enum SourceType { TED_PACKAGE, + PACKAGE_CHILD, MAIL, FILE_SYSTEM, REST_UPLOAD, diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java index a039470..c1cf504 100644 --- a/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java @@ -13,4 +13,6 @@ public interface DocumentRelationRepository extends JpaRepository findByChildDocument_Id(UUID 
childDocumentId);

    List findByParentDocument_IdAndRelationType(UUID parentDocumentId, RelationType relationType);
+
+    boolean existsByParentDocument_IdAndChildDocument_IdAndRelationType(UUID parentDocumentId, UUID childDocumentId, RelationType relationType);
 }
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java
index 6746b75..3e5192c 100644
--- a/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java
@@ -15,6 +15,8 @@ public interface DocumentRepository extends JpaRepository {
 
     Optional findByDedupHash(String dedupHash);
 
+    List findAllByDedupHash(String dedupHash);
+
     boolean existsByDedupHash(String dedupHash);
 
     List findByDocumentType(DocumentType documentType);
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java
index f9c1bb1..2c59ff0 100644
--- a/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java
@@ -28,6 +28,19 @@ public class DocumentRelationService {
         return relationRepository.save(relation);
     }
 
+    /**
+     * Idempotently ensures a parent/child relation: when a relation with the same
+     * (parent, child, type) triple is already persisted it is returned, otherwise a
+     * new one is created. Uses a single fetch-and-filter query instead of the former
+     * existsBy check followed by a second fetch (one DB round trip fewer, and no
+     * exists/fetch race window within this method).
+     * NOTE(review): still not concurrency-safe across transactions without a DB
+     * unique constraint on (parent, child, type) — TODO confirm such an index exists.
+     */
+    public DocumentRelation ensureRelation(CreateDocumentRelationCommand command) {
+        DocumentRelation existing = relationRepository.findByParentDocument_IdAndRelationType(command.parentDocumentId(), command.relationType())
+                .stream()
+                .filter(rel -> rel.getChildDocument() != null && command.childDocumentId().equals(rel.getChildDocument().getId()))
+                .findFirst()
+                .orElse(null);
+        if (existing != null) {
+            return existing;
+        }
+        return 
createRelation(command); + } + @Transactional(readOnly = true) public List findChildren(UUID parentDocumentId) { return relationRepository.findByParentDocument_Id(parentDocumentId); diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java index b6e3cb7..081b744 100644 --- a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java +++ b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java @@ -15,7 +15,6 @@ import at.procon.ted.model.entity.ProcurementLot; import at.procon.ted.service.TedPhase2GenericDocumentService; import java.util.ArrayList; import java.util.List; -import java.util.Optional; import java.util.UUID; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -63,8 +62,7 @@ public class TedNoticeProjectionService { Document genericDocument = documentRepository.findById(resolvedDocumentId) .orElseThrow(() -> new IllegalArgumentException("Unknown DOC document id: " + finalResolvedDocumentId)); - TedNoticeProjection projection = projectionRepository.findByLegacyProcurementDocumentId(legacyDocument.getId()) - .or(() -> projectionRepository.findByDocument_Id(genericDocument.getId())) + TedNoticeProjection projection = projectionRepository.findByDocument_Id(genericDocument.getId()) .orElseGet(TedNoticeProjection::new); mapProjection(projection, genericDocument, legacyDocument); @@ -72,19 +70,13 @@ public class TedNoticeProjectionService { replaceLots(projection, legacyDocument.getLots()); replaceOrganizations(projection, legacyDocument.getOrganizations()); - log.debug("Phase 3 TED projection ensured for legacy {} -> projection {} / doc {}", - legacyDocument.getId(), projection.getId(), genericDocument.getId()); + log.debug("Phase 3 TED projection ensured for generic doc {} -> projection {} (noticeId={}, publicationId={})", + genericDocument.getId(), projection.getId(), 
legacyDocument.getNoticeId(), legacyDocument.getPublicationId());
 
         return projection.getId();
     }
 
-    @Transactional(readOnly = true)
-    public Optional findByLegacyProcurementDocumentId(UUID legacyDocumentId) {
-        return projectionRepository.findByLegacyProcurementDocumentId(legacyDocumentId);
-    }
-
     private void mapProjection(TedNoticeProjection projection, Document genericDocument, ProcurementDocument legacyDocument) {
         projection.setDocument(genericDocument);
-        projection.setLegacyProcurementDocumentId(legacyDocument.getId());
         projection.setNoticeId(legacyDocument.getNoticeId());
         projection.setPublicationId(legacyDocument.getPublicationId());
         projection.setNoticeUrl(legacyDocument.getNoticeUrl());
diff --git a/src/main/java/at/procon/dip/extraction/impl/MimeMessageDocumentExtractor.java b/src/main/java/at/procon/dip/extraction/impl/MimeMessageDocumentExtractor.java
new file mode 100644
index 0000000..9b4c5f0
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/impl/MimeMessageDocumentExtractor.java
@@ -0,0 +1,56 @@
+package at.procon.dip.extraction.impl;
+
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.extraction.spi.DocumentExtractor;
+import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
+import at.procon.dip.extraction.spi.ExtractionRequest;
+import at.procon.dip.extraction.spi.ExtractionResult;
+import at.procon.dip.ingestion.util.DocumentImportSupport;
+import java.nio.charset.StandardCharsets;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import org.springframework.stereotype.Component;
+import org.springframework.util.StringUtils;
+
+// Extracts a normalized text body and a structured "mail-message" payload for
+// MIME_MESSAGE / EMAIL documents.
+@Component
+public class MimeMessageDocumentExtractor implements DocumentExtractor {
+
+    @Override
+    public boolean supports(DocumentType documentType, String mimeType) {
+        // mimeType is deliberately ignored here: selection is purely by document type.
+        return documentType == DocumentType.MIME_MESSAGE || documentType == DocumentType.EMAIL;
+    }
+
+    @Override
+    public ExtractionResult extract(ExtractionRequest extractionRequest) {
+        String text = extractionRequest.textContent();
+        if (!StringUtils.hasText(text) && extractionRequest.binaryContent() != null) {
+            // NOTE(review): decodes the raw bytes as UTF-8 unconditionally — MIME bodies
+            // may use other charsets; confirm the upstream mail parser already normalized.
+            text = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
+        }
+        // Normalize CRLF and lone CR line endings to LF, then trim.
+        text = text == null ? null : text.replace("\r\n", "\n").replace('\r', '\n').trim();
+
+        // Structured payload carries the source attributes plus a resolved title.
+        Map attributes = new LinkedHashMap<>();
+        if (extractionRequest.sourceDescriptor().attributes() != null) {
+            attributes.putAll(extractionRequest.sourceDescriptor().attributes());
+        }
+        // Title preference: explicit "title"/"subject" attribute, then the file name.
+        String title = DocumentImportSupport.firstNonBlank(extractionRequest.sourceDescriptor().attributes(), "title", "subject");
+        if (!StringUtils.hasText(title)) {
+            title = extractionRequest.sourceDescriptor().fileName();
+        }
+        if (StringUtils.hasText(title)) {
+            attributes.put("title", title);
+        }
+
+        if (!StringUtils.hasText(text)) {
+            // No extractable body: still emit the structured payload, with a warning.
+            return new ExtractionResult(Map.of(), List.of(new ExtractedStructuredPayload("mail-message", attributes)),
+                List.of("Mail message did not contain extractable text body"));
+        }
+
+        return new ExtractionResult(
+            Map.of(ContentRole.NORMALIZED_TEXT, text),
+            List.of(new ExtractedStructuredPayload("mail-message", attributes)),
+            List.of()
+        );
+    }
+}
diff --git a/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java b/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java
index 4eb6f07..4dfaa9a 100644
--- a/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java
+++ b/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java
@@ -22,7 +22,7 @@ public class PdfDocumentExtractor implements DocumentExtractor {
 
     @Override
     public boolean supports(DocumentType documentType, String mimeType) {
-        return documentType == DocumentType.PDF || pdfExtractionService.canHandle("dummy.pdf", mimeType);
+        return documentType == DocumentType.PDF;
     }
 
     @Override
diff --git a/src/main/java/at/procon/dip/extraction/impl/TedPackageManifestExtractor.java
b/src/main/java/at/procon/dip/extraction/impl/TedPackageManifestExtractor.java
new file mode 100644
index 0000000..e6873eb
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/impl/TedPackageManifestExtractor.java
@@ -0,0 +1,49 @@
+package at.procon.dip.extraction.impl;
+
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.extraction.spi.DocumentExtractor;
+import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
+import at.procon.dip.extraction.spi.ExtractionRequest;
+import at.procon.dip.extraction.spi.ExtractionResult;
+import at.procon.dip.ingestion.util.DocumentImportSupport;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import org.springframework.stereotype.Component;
+import org.springframework.util.StringUtils;
+
+// Builds a normalized-text manifest plus a structured "ted-package-manifest"
+// payload for TED_PACKAGE root documents.
+// NOTE(review): @Component below is commented out, so this extractor is NEVER
+// registered as a Spring bean and will never be selected for TED_PACKAGE documents.
+// Confirm whether that is intentional (e.g. superseded by the wrapper-document
+// skip-extraction behavior described in the README) or an oversight.
+//@Component
+public class TedPackageManifestExtractor implements DocumentExtractor {
+
+    @Override
+    public boolean supports(DocumentType documentType, String mimeType) {
+        return documentType == DocumentType.TED_PACKAGE;
+    }
+
+    @Override
+    public ExtractionResult extract(ExtractionRequest extractionRequest) {
+        String manifest = extractionRequest.textContent();
+        if (!StringUtils.hasText(manifest)) {
+            // Fallback manifest line when the package carries no text content.
+            manifest = "TED package: " + extractionRequest.sourceDescriptor().sourceIdentifier();
+        }
+
+        // Structured payload carries the source attributes plus a resolved title.
+        Map attributes = new LinkedHashMap<>();
+        if (extractionRequest.sourceDescriptor().attributes() != null) {
+            attributes.putAll(extractionRequest.sourceDescriptor().attributes());
+        }
+        // Title preference: "title"/"packageId" attribute, then the file name.
+        String title = DocumentImportSupport.firstNonBlank(extractionRequest.sourceDescriptor().attributes(), "title", "packageId");
+        if (!StringUtils.hasText(title)) {
+            title = extractionRequest.sourceDescriptor().fileName();
+        }
+        if (StringUtils.hasText(title)) {
+            attributes.put("title", title);
+        }
+
+        return new ExtractionResult(
+            Map.of(ContentRole.NORMALIZED_TEXT, manifest),
+            List.of(new ExtractedStructuredPayload("ted-package-manifest",
attributes)),
+            List.of()
+        );
+    }
+}
diff --git a/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java
new file mode 100644
index 0000000..70ba90b
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java
@@ -0,0 +1,170 @@
+package at.procon.dip.ingestion.adapter;
+
+import at.procon.dip.domain.access.DocumentAccessContext;
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.RelationType;
+import at.procon.dip.domain.document.SourceType;
+import at.procon.dip.domain.tenant.TenantRef;
+import at.procon.dip.domain.document.service.DocumentRelationService;
+import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
+import at.procon.dip.ingestion.dto.ImportedDocumentResult;
+import at.procon.dip.ingestion.service.GenericDocumentImportService;
+import at.procon.dip.ingestion.service.MailMessageExtractionService;
+import at.procon.dip.ingestion.service.MailMessageExtractionService.MailAttachment;
+import at.procon.dip.ingestion.service.MailMessageExtractionService.ParsedMailMessage;
+import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
+import at.procon.dip.ingestion.spi.IngestionResult;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+import at.procon.dip.ingestion.util.DocumentImportSupport;
+import at.procon.ted.config.TedProcessorProperties;
+import at.procon.ted.service.attachment.AttachmentExtractor;
+import at.procon.ted.service.attachment.ZipExtractionService;
+import java.time.OffsetDateTime;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Component;
+
+/**
+ * Ingestion adapter for raw MIME mail messages ({@code SourceType.MAIL}).
+ * Imports the message itself as the root DOC document and every attachment as a
+ * child document. Direct attachments are linked via ATTACHMENT_OF; members of an
+ * expanded ZIP attachment via EXTRACTED_FROM. ZIP expansion is recursive and
+ * controlled by {@code expand-mail-zip-attachments}.
+ * NOTE(review): recursion depth for nested archives is unbounded here — presumably
+ * ZipExtractionService guards against archive bombs; confirm.
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter {
+
+    private final TedProcessorProperties properties;
+    private final GenericDocumentImportService importService;
+    private final MailMessageExtractionService mailExtractionService;
+    private final DocumentRelationService relationService;
+    private final ZipExtractionService zipExtractionService;
+
+    @Override
+    public boolean supports(SourceDescriptor sourceDescriptor) {
+        // Only active when generic ingestion AND the mail adapter are both enabled.
+        return sourceDescriptor.sourceType() == SourceType.MAIL
+                && properties.getGenericIngestion().isEnabled()
+                && properties.getGenericIngestion().isMailAdapterEnabled();
+    }
+
+    @Override
+    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
+        byte[] rawMime = sourceDescriptor.binaryContent();
+        if (rawMime == null || rawMime.length == 0) {
+            throw new IllegalArgumentException("Mail adapter requires raw MIME bytes");
+        }
+        ParsedMailMessage parsed = mailExtractionService.parse(rawMime);
+        // Caller-supplied access context wins; otherwise fall back to the configured
+        // mail default (tenant key + visibility).
+        DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null ? defaultMailAccessContext() : sourceDescriptor.accessContext();
+
+        // Root attributes: source attributes + parsed mail headers.
+        Map rootAttributes = new LinkedHashMap<>(sourceDescriptor.attributes() == null ? Map.of() : sourceDescriptor.attributes());
+        if (parsed.subject() != null) rootAttributes.put("subject", parsed.subject());
+        if (parsed.from() != null) rootAttributes.put("from", parsed.from());
+        if (!parsed.recipients().isEmpty()) rootAttributes.put("to", String.join(", ", parsed.recipients()));
+        rootAttributes.putIfAbsent("title", parsed.subject() != null ? parsed.subject() : sourceDescriptor.fileName());
+        rootAttributes.put("attachmentCount", Integer.toString(parsed.attachments().size()));
+        rootAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId());
+
+        // Root document: the MIME message itself; ORIGINAL content is always stored.
+        ImportedDocumentResult rootResult = importService.importDocument(new SourceDescriptor(
+                accessContext,
+                SourceType.MAIL,
+                sourceDescriptor.sourceIdentifier(),
+                sourceDescriptor.sourceUri(),
+                sourceDescriptor.fileName() != null ? sourceDescriptor.fileName() : fallbackMailFileName(parsed),
+                "message/rfc822",
+                rawMime,
+                mailExtractionService.serializeMessage(parsed),
+                parsed.receivedAt() == null ? OffsetDateTime.now() : parsed.receivedAt(),
+                OriginalContentStoragePolicy.STORE,
+                rootAttributes
+        ));
+
+        List documents = new ArrayList<>();
+        List warnings = new ArrayList<>(rootResult.warnings());
+        documents.add(rootResult.document().toCanonicalMetadata());
+
+        int sortOrder = 0;
+        for (MailAttachment attachment : parsed.attachments()) {
+            sortOrder++;
+            importAttachment(rootResult.document().getId(), accessContext, sourceDescriptor, attachment, documents, warnings, sortOrder, 0);
+        }
+
+        return new IngestionResult(documents, warnings);
+    }
+
+    /**
+     * Imports one attachment (or archive member) as a child document and links it to
+     * its parent. ZIP attachments are expanded recursively when configured; the ZIP
+     * wrapper itself then skips ORIGINAL content storage.
+     */
+    private void importAttachment(UUID parentDocumentId, DocumentAccessContext accessContext, SourceDescriptor parentSource,
+                                  MailAttachment attachment, List documents,
+                                  List warnings, int sortOrder, int depth) {
+        boolean expandableWrapper = properties.getGenericIngestion().isExpandMailZipAttachments()
+                && zipExtractionService.canHandle(attachment.fileName(), attachment.contentType());
+
+        // Guard against a missing attachment file name (identifier/title would
+        // otherwise contain the literal "null").
+        String safeName = attachment.fileName() == null || attachment.fileName().isBlank()
+                ? "attachment-" + sortOrder
+                : attachment.fileName();
+
+        Map attachmentAttributes = new LinkedHashMap<>();
+        attachmentAttributes.put("title", safeName);
+        attachmentAttributes.put("mailSourceIdentifier", parentSource.sourceIdentifier());
+        attachmentAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId());
+        if (expandableWrapper) {
+            attachmentAttributes.put("wrapperDocument", Boolean.TRUE.toString());
+        }
+
+        // BUG FIX: the identifier previously was "<mail>:attachment:<depth>:<fileName>",
+        // so two equally named attachments at the same depth produced the SAME source
+        // identifier and were wrongly collapsed into one source. Including sortOrder
+        // keeps siblings distinct.
+        String attachmentIdentifier = parentSource.sourceIdentifier() + ":attachment:" + depth + ":" + sortOrder + ":" + safeName;
+
+        ImportedDocumentResult attachmentResult = importService.importDocument(new SourceDescriptor(
+                accessContext,
+                SourceType.MAIL,
+                attachmentIdentifier,
+                parentSource.sourceUri(),
+                attachment.fileName(),
+                DocumentImportSupport.normalizeMediaType(attachment.contentType()),
+                attachment.data(),
+                previewTextIfLikelyText(attachment),
+                parentSource.receivedAt() == null ? OffsetDateTime.now() : parentSource.receivedAt(),
+                expandableWrapper ? OriginalContentStoragePolicy.SKIP : OriginalContentStoragePolicy.STORE,
+                attachmentAttributes
+        ));
+        documents.add(attachmentResult.document().toCanonicalMetadata());
+        warnings.addAll(attachmentResult.warnings());
+        // Archive members (depth > 0 or carrying an archive path) are EXTRACTED_FROM;
+        // direct mail attachments are ATTACHMENT_OF.
+        RelationType relationType = depth > 0 || attachment.path() != null ? RelationType.EXTRACTED_FROM : RelationType.ATTACHMENT_OF;
+        relationService.ensureRelation(new CreateDocumentRelationCommand(
+                parentDocumentId, attachmentResult.document().getId(), relationType, sortOrder, attachment.fileName()));
+
+        if (expandableWrapper) {
+            AttachmentExtractor.ExtractionResult zipResult = zipExtractionService.extract(attachment.data(), attachment.fileName(), attachment.contentType());
+            if (!zipResult.success()) {
+                // Best-effort: the wrapper document itself was imported; record the failure.
+                warnings.add("ZIP attachment extraction failed for " + attachment.fileName() + ": " + zipResult.errorMessage());
+                return;
+            }
+            int childSort = 0;
+            for (AttachmentExtractor.ChildAttachment child : zipResult.childAttachments()) {
+                childSort++;
+                importAttachment(attachmentResult.document().getId(), accessContext, parentSource,
+                        new MailAttachment(child.filename(), child.contentType(), child.data(), child.data().length, child.pathInArchive()),
+                        documents, warnings, childSort, depth + 1);
+            }
+        }
+    }
+
+    /** Derives a filesystem-safe .eml file name from the subject (or a fixed fallback). */
+    private String fallbackMailFileName(ParsedMailMessage parsed) {
+        String subject = parsed.subject() == null || parsed.subject().isBlank() ? "mail-message" : parsed.subject().replaceAll("[^A-Za-z0-9._-]", "_");
+        return subject + ".eml";
+    }
+
+    /**
+     * Builds the configured default mail access context. Falls back to the global
+     * default tenant key; TENANT visibility without a tenant is downgraded to
+     * RESTRICTED so no document becomes accidentally tenant-less-but-tenant-visible.
+     */
+    private DocumentAccessContext defaultMailAccessContext() {
+        String tenantKey = properties.getGenericIngestion().getMailDefaultOwnerTenantKey();
+        if (tenantKey == null || tenantKey.isBlank()) {
+            tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
+        }
+        DocumentVisibility visibility = properties.getGenericIngestion().getMailDefaultVisibility();
+        TenantRef tenant = (tenantKey == null || tenantKey.isBlank()) ? null : new TenantRef(null, tenantKey, tenantKey);
+        if (tenant == null && visibility == DocumentVisibility.TENANT) {
+            visibility = DocumentVisibility.RESTRICTED;
+        }
+        return new DocumentAccessContext(tenant, visibility);
+    }
+
+    /** Returns a text preview only for attachments that look textual (by MIME or extension). */
+    private String previewTextIfLikelyText(MailAttachment attachment) {
+        String mime = DocumentImportSupport.normalizeMediaType(attachment.contentType());
+        if (DocumentImportSupport.isLikelyTextMime(mime)) {
+            return attachment.safeTextPreview();
+        }
+        String ext = DocumentImportSupport.extensionOf(attachment.fileName());
+        if ("txt".equals(ext) || "xml".equals(ext) || "html".equals(ext) || "htm".equals(ext) || "md".equals(ext)) {
+            return attachment.safeTextPreview();
+        }
+        return null;
+    }
+}
diff --git a/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java
new file mode 100644
index 0000000..fbbd535
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java
@@ -0,0 +1,130 @@
+package at.procon.dip.ingestion.adapter;
+
+import at.procon.dip.domain.access.DocumentAccessContext;
+import at.procon.dip.domain.document.RelationType;
+import at.procon.dip.domain.document.SourceType;
+import at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.service.DocumentRelationService;
+import 
at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
+import at.procon.dip.ingestion.dto.ImportedDocumentResult;
+import at.procon.dip.ingestion.service.GenericDocumentImportService;
+import at.procon.dip.ingestion.service.TedPackageExpansionService;
+import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
+import at.procon.dip.ingestion.spi.IngestionResult;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+import at.procon.ted.config.TedProcessorProperties;
+import java.nio.charset.StandardCharsets;
+import java.time.OffsetDateTime;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Component;
+
+/**
+ * Ingestion adapter for TED tar.gz packages ({@code SourceType.TED_PACKAGE}).
+ * Imports the package artifact as a public wrapper root document (ORIGINAL content
+ * skipped) and each contained XML as a PACKAGE_CHILD document (ORIGINAL stored),
+ * linked to the root via EXTRACTED_FROM.
+ * (Removed the unused org.springframework.transaction.annotation.Transactional
+ * import — nothing in this class was annotated with it.)
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdapter {
+
+    private final TedProcessorProperties properties;
+    private final GenericDocumentImportService importService;
+    private final TedPackageExpansionService expansionService;
+    private final DocumentRelationService relationService;
+
+    @Override
+    public boolean supports(SourceDescriptor sourceDescriptor) {
+        // Only active when generic ingestion AND the TED package adapter are enabled.
+        return sourceDescriptor.sourceType() == SourceType.TED_PACKAGE
+                && properties.getGenericIngestion().isEnabled()
+                && properties.getGenericIngestion().isTedPackageAdapterEnabled();
+    }
+
+    @Override
+    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
+        byte[] packageBytes = sourceDescriptor.binaryContent();
+        if (packageBytes == null || packageBytes.length == 0) {
+            throw new IllegalArgumentException("TED package adapter requires tar.gz bytes");
+        }
+
+        TedPackageExpansionService.TedPackageExpansionResult expanded = expansionService.expand(packageBytes);
+
+        // Hoisted out of the per-entry loop: the access context and the fallback
+        // received-at timestamp are invariant, and hoisting also gives the root and
+        // all children ONE consistent fallback timestamp instead of slightly
+        // different OffsetDateTime.now() values per child.
+        DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null
+                ? DocumentAccessContext.publicDocument()
+                : sourceDescriptor.accessContext();
+        OffsetDateTime receivedAt = sourceDescriptor.receivedAt() == null
+                ? OffsetDateTime.now()
+                : sourceDescriptor.receivedAt();
+
+        Map rootAttributes = new LinkedHashMap<>(sourceDescriptor.attributes() == null ? Map.of() : sourceDescriptor.attributes());
+        rootAttributes.putIfAbsent("packageId", sourceDescriptor.sourceIdentifier());
+        rootAttributes.putIfAbsent("title", sourceDescriptor.fileName() != null ? sourceDescriptor.fileName() : sourceDescriptor.sourceIdentifier());
+        rootAttributes.put("xmlEntryCount", Integer.toString(expanded.entries().size()));
+        rootAttributes.put("wrapperDocument", Boolean.TRUE.toString());
+        rootAttributes.put("importBatchId", properties.getGenericIngestion().getTedPackageImportBatchId());
+
+        // Root wrapper document: ORIGINAL content storage is skipped (SKIP) —
+        // the payload lives on in the extracted children.
+        ImportedDocumentResult packageDocument = importService.importDocument(new SourceDescriptor(
+                accessContext,
+                SourceType.TED_PACKAGE,
+                sourceDescriptor.sourceIdentifier(),
+                sourceDescriptor.sourceUri(),
+                sourceDescriptor.fileName(),
+                sourceDescriptor.mediaType() == null ? "application/gzip" : sourceDescriptor.mediaType(),
+                packageBytes,
+                expanded.manifestText(),
+                receivedAt,
+                OriginalContentStoragePolicy.SKIP,
+                rootAttributes
+        ));
+
+        List warnings = new ArrayList<>(packageDocument.warnings());
+        List documents = new ArrayList<>();
+        documents.add(packageDocument.document().toCanonicalMetadata());
+
+        int sortOrder = 0;
+        for (TedPackageExpansionService.TedPackageEntry entry : expanded.entries()) {
+            sortOrder++;
+            String childUri = "tedpkg://" + sourceDescriptor.sourceIdentifier() + "/" + entry.archivePath();
+            String childIdentifier = sourceDescriptor.sourceIdentifier() + ":" + entry.archivePath();
+            String xmlContent = resolveXmlContent(entry);
+
+            Map childAttributes = new LinkedHashMap<>();
+            // Hint for downstream classification: children are expected TED notices.
+            childAttributes.put("documentTypeHint", "TED_NOTICE");
+            childAttributes.put("packageId", sourceDescriptor.sourceIdentifier());
+            childAttributes.put("archivePath", entry.archivePath());
+            childAttributes.put("title", entry.fileName());
+            childAttributes.put("importBatchId", properties.getGenericIngestion().getTedPackageImportBatchId());
+
+            ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor(
+                    accessContext,
+                    SourceType.PACKAGE_CHILD,
+                    childIdentifier,
+                    childUri,
+                    entry.fileName(),
+                    entry.mediaType() == null ? "application/xml" : entry.mediaType(),
+                    entry.data(),
+                    xmlContent,
+                    receivedAt,
+                    OriginalContentStoragePolicy.STORE,
+                    childAttributes
+            ));
+
+            Document childDocument = childResult.document();
+            documents.add(childDocument.toCanonicalMetadata());
+            warnings.addAll(childResult.warnings());
+            if (childResult.deduplicated()) {
+                // Dedup within the same access scope: the child already existed;
+                // it is only (re-)linked to this package.
+                warnings.add("TED XML child already existed and was linked to package: " + entry.archivePath());
+            }
+            relationService.ensureRelation(new CreateDocumentRelationCommand(
+                    packageDocument.document().getId(),
+                    childDocument.getId(),
+                    RelationType.EXTRACTED_FROM,
+                    sortOrder,
+                    entry.archivePath()
+            ));
+        }
+
+        return new IngestionResult(documents, warnings);
+    }
+
+    /** Prefers the pre-decoded UTF-8 text of the entry; falls back to decoding the raw bytes. */
+    private String resolveXmlContent(TedPackageExpansionService.TedPackageEntry entry) {
+        if (entry.textUtf8() != null && !entry.textUtf8().isBlank()) {
+            return entry.textUtf8();
+        }
+        return new String(entry.data(), StandardCharsets.UTF_8);
+    }
+}
diff --git a/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java b/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java
index 6dea73e..a0a16b4 100644
--- a/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java
+++ b/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java
@@ -5,6 +5,7 @@ import at.procon.dip.domain.access.DocumentVisibility;
 import at.procon.dip.domain.document.SourceType;
 import at.procon.dip.domain.tenant.TenantRef;
 import at.procon.dip.ingestion.service.DocumentIngestionGateway;
+import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
 import at.procon.dip.ingestion.spi.SourceDescriptor;
 import at.procon.ted.config.TedProcessorProperties;
 import java.nio.file.Files;
@@ -72,6 +73,7 @@ public class GenericFileSystemIngestionRoute extends RouteBuilder {
                 payload,
                 null,
                 OffsetDateTime.now(),
+                OriginalContentStoragePolicy.DEFAULT,
                 attributes
         );
         ingestionGateway.ingest(descriptor);
diff --git 
a/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java b/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java index 33f544a..86b91c3 100644 --- a/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java +++ b/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java @@ -9,6 +9,7 @@ import at.procon.dip.ingestion.dto.GenericImportResponse; import at.procon.dip.ingestion.dto.GenericTextImportRequest; import at.procon.dip.ingestion.service.DocumentIngestionGateway; import at.procon.dip.ingestion.spi.IngestionResult; +import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.ted.config.TedProcessorProperties; import java.time.OffsetDateTime; @@ -62,6 +63,7 @@ public class GenericDocumentImportController { file.getBytes(), null, OffsetDateTime.now(), + OriginalContentStoragePolicy.DEFAULT, attributes ); IngestionResult result = ingestionGateway.ingest(descriptor); @@ -89,6 +91,7 @@ public class GenericDocumentImportController { request.text() == null ? 
null : request.text().getBytes(java.nio.charset.StandardCharsets.UTF_8), request.text(), OffsetDateTime.now(), + OriginalContentStoragePolicy.DEFAULT, attributes ); IngestionResult result = ingestionGateway.ingest(descriptor); diff --git a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java index fbdaa5b..ec987b8 100644 --- a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java +++ b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java @@ -4,15 +4,12 @@ import at.procon.dip.classification.service.DocumentClassificationService; import at.procon.dip.classification.spi.DetectionResult; import at.procon.dip.domain.access.DocumentAccessContext; import at.procon.dip.domain.access.DocumentVisibility; -import at.procon.dip.domain.document.CanonicalDocumentMetadata; import at.procon.dip.domain.document.ContentRole; import at.procon.dip.domain.document.DocumentStatus; import at.procon.dip.domain.document.StorageType; import at.procon.dip.domain.document.entity.Document; import at.procon.dip.domain.document.entity.DocumentContent; import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; -import at.procon.dip.domain.document.entity.DocumentSource; -import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; import at.procon.dip.domain.document.repository.DocumentRepository; import at.procon.dip.domain.document.repository.DocumentSourceRepository; import at.procon.dip.domain.document.service.DocumentContentService; @@ -29,11 +26,15 @@ import at.procon.dip.extraction.service.DocumentExtractionService; import at.procon.dip.extraction.spi.ExtractionRequest; import at.procon.dip.extraction.spi.ExtractionResult; import at.procon.dip.ingestion.dto.ImportedDocumentResult; +import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; import 
at.procon.dip.ingestion.util.DocumentImportSupport; import at.procon.dip.normalization.service.TextRepresentationBuildService; import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.TextRepresentationDraft; +import at.procon.dip.processing.service.StructuredDocumentProcessingService; +import at.procon.dip.processing.spi.DocumentProcessingPolicy; +import at.procon.dip.processing.spi.StructuredProcessingRequest; import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.util.HashUtils; import java.nio.charset.StandardCharsets; @@ -43,7 +44,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.UUID; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; @@ -51,7 +51,7 @@ import org.springframework.transaction.annotation.Transactional; import org.springframework.util.StringUtils; /** - * Phase 4 generic import pipeline that persists arbitrary document types into the DOC model. + * Generic import pipeline that persists arbitrary document types into the DOC model. 
*/ @Service @RequiredArgsConstructor @@ -61,7 +61,6 @@ public class GenericDocumentImportService { private final TedProcessorProperties properties; private final DocumentRepository documentRepository; private final DocumentSourceRepository documentSourceRepository; - private final DocumentEmbeddingRepository documentEmbeddingRepository; private final DocumentService documentService; private final DocumentSourceService documentSourceService; private final DocumentContentService documentContentService; @@ -70,6 +69,7 @@ public class GenericDocumentImportService { private final DocumentClassificationService classificationService; private final DocumentExtractionService extractionService; private final TextRepresentationBuildService representationBuildService; + private final StructuredDocumentProcessingService structuredProcessingService; @Transactional public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) { @@ -77,20 +77,20 @@ public class GenericDocumentImportService { DetectionResult detection = classificationService.detect(withResolvedMediaType(sourceDescriptor, payload)); String dedupHash = HashUtils.computeSha256(payload.binaryContent()); + DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null + ? 
defaultAccessContext() + : sourceDescriptor.accessContext(); + if (properties.getGenericIngestion().isDeduplicateByContentHash()) { - Optional existing = documentRepository.findByDedupHash(dedupHash); + Optional existing = resolveDeduplicatedDocument(dedupHash, accessContext); if (existing.isPresent()) { Document document = existing.get(); ensureSource(document, sourceDescriptor); - List warnings = List.of("Document content hash already imported; linked new source to existing document"); + List warnings = List.of("Document content hash already imported within the same access scope; linked new source to existing document"); return new ImportedDocumentResult(document, detection, warnings, true); } } - DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null - ? defaultAccessContext() - : sourceDescriptor.accessContext(); - Document document = documentService.create(new CreateDocumentCommand( accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey(), accessContext.visibility(), @@ -108,21 +108,63 @@ public class GenericDocumentImportService { ensureSource(document, sourceDescriptor); documentService.updateStatus(document.getId(), DocumentStatus.CLASSIFIED); - DocumentContent originalContent = persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash); - - ExtractionResult extractionResult = extractionService.extract(new ExtractionRequest( - sourceDescriptor, - detection, - payload.textContent(), - payload.binaryContent() - )); - List warnings = new ArrayList<>(extractionResult.warnings()); + boolean persistOriginalContent = shouldPersistOriginalContent(sourceDescriptor); + DocumentContent originalContent = persistOriginalContent + ? 
persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash) + : null; + + List warnings = new ArrayList<>(); + DocumentProcessingPolicy processingPolicy = structuredProcessingService.resolvePolicy(sourceDescriptor, detection); + ExtractionResult extractionResult = emptyExtractionResult(); + Map persistedDerivedContent = new LinkedHashMap<>(); + + if (persistOriginalContent) { + if (processingPolicy.runGenericExtraction()) { + extractionResult = extractionService.extract(new ExtractionRequest( + sourceDescriptor, + detection, + payload.textContent(), + payload.binaryContent() + )); + warnings.addAll(extractionResult.warnings()); + if (processingPolicy.persistExtractedContent()) { + persistedDerivedContent.putAll(persistDerivedContent(document, detection, extractionResult, dedupHash, "generic")); + } + if (!extractionResult.derivedTextByRole().isEmpty()) { + documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED); + } + } - Map persistedDerivedContent = persistDerivedContent(document, detection, extractionResult, dedupHash); - documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED); + if (processingPolicy.invokeStructuredProcessor()) { + Optional structuredExtractionResult = structuredProcessingService.process(new StructuredProcessingRequest( + document, + originalContent, + sourceDescriptor, + detection, + payload.binaryContent(), + payload.textContent(), + dedupHash + )); + if (structuredExtractionResult.isPresent()) { + ExtractionResult result = structuredExtractionResult.get(); + warnings.addAll(result.warnings()); + extractionResult = mergeExtractionResults(extractionResult, result); + if (processingPolicy.persistExtractedContent()) { + persistedDerivedContent.putAll(persistDerivedContent(document, detection, result, dedupHash, "structured")); + } + if (!result.derivedTextByRole().isEmpty()) { + documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED); + } + } + } - var drafts = 
representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult)); - persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts); + if (processingPolicy.runRepresentationBuilders()) { + var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult)); + persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts); + } + } else { + warnings.add("Original content storage disabled for this document; skipped extraction and text-representation processing"); + } Document reloaded = documentService.getRequired(document.getId()); if (reloaded.getStatus() == DocumentStatus.EXTRACTED) { @@ -130,7 +172,7 @@ public class GenericDocumentImportService { reloaded = documentService.getRequired(reloaded.getId()); } - if (!extractionResult.structuredPayloads().isEmpty()) { + if (processingPolicy.applyStructuredTitleIfMissing() && !extractionResult.structuredPayloads().isEmpty()) { applyStructuredTitleIfMissing(reloaded, extractionResult); reloaded = documentService.getRequired(reloaded.getId()); } @@ -138,6 +180,50 @@ public class GenericDocumentImportService { return new ImportedDocumentResult(reloaded, detection, warnings, false); } + private ExtractionResult mergeExtractionResults(ExtractionResult left, ExtractionResult right) { + Map derivedText = new LinkedHashMap<>(); + if (left != null && left.derivedTextByRole() != null) { + derivedText.putAll(left.derivedTextByRole()); + } + if (right != null && right.derivedTextByRole() != null) { + derivedText.putAll(right.derivedTextByRole()); + } + List payloads = new ArrayList<>(); + if (left != null && left.structuredPayloads() != null) { + payloads.addAll(left.structuredPayloads()); + } + if (right != null && right.structuredPayloads() != null) { + payloads.addAll(right.structuredPayloads()); + } + List warnings = new ArrayList<>(); + if (left != null && 
left.warnings() != null) { + warnings.addAll(left.warnings()); + } + if (right != null && right.warnings() != null) { + warnings.addAll(right.warnings()); + } + return new ExtractionResult(derivedText, payloads, warnings); + } + + private ExtractionResult emptyExtractionResult() { + return new ExtractionResult(java.util.Collections.emptyMap(), java.util.Collections.emptyList(), java.util.Collections.emptyList()); + } + + private Optional resolveDeduplicatedDocument(String dedupHash, DocumentAccessContext accessContext) { + return documentRepository.findAllByDedupHash(dedupHash).stream() + .filter(existing -> sameAccessScope(existing, accessContext)) + .findFirst(); + } + + private boolean sameAccessScope(Document existing, DocumentAccessContext accessContext) { + if (existing.getVisibility() != accessContext.visibility()) { + return false; + } + String existingTenantKey = existing.getOwnerTenant() == null ? null : existing.getOwnerTenant().getTenantKey(); + String requestedTenantKey = accessContext.ownerTenant() == null ? 
null : accessContext.ownerTenant().tenantKey(); + return java.util.Objects.equals(existingTenantKey, requestedTenantKey); + } + private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) { if (StringUtils.hasText(sourceDescriptor.mediaType())) { return sourceDescriptor; @@ -152,6 +238,7 @@ public class GenericDocumentImportService { sourceDescriptor.binaryContent(), sourceDescriptor.textContent(), sourceDescriptor.receivedAt(), + sourceDescriptor.originalContentStoragePolicy(), sourceDescriptor.attributes() ); } @@ -216,7 +303,7 @@ public class GenericDocumentImportService { return sourceDescriptor.fileName(); } if (StringUtils.hasText(payload.textContent())) { - for (String line : payload.textContent().split("\\n")) { + for (String line : payload.textContent().split("\n")) { if (StringUtils.hasText(line)) { return DocumentImportSupport.ellipsize(line.trim(), 240); } @@ -244,6 +331,10 @@ public class GenericDocumentImportService { return; } + String importBatchId = sourceDescriptor.attributes() != null && StringUtils.hasText(sourceDescriptor.attributes().get("importBatchId")) + ? sourceDescriptor.attributes().get("importBatchId") + : properties.getGenericIngestion().getImportBatchId(); + documentSourceService.addSource(new AddDocumentSourceCommand( document.getId(), sourceDescriptor.sourceType(), @@ -251,11 +342,35 @@ public class GenericDocumentImportService { sourceDescriptor.sourceUri(), sourceDescriptor.fileName(), null, - properties.getGenericIngestion().getImportBatchId(), + importBatchId, sourceDescriptor.receivedAt() == null ? 
OffsetDateTime.now() : sourceDescriptor.receivedAt() )); } + private boolean shouldPersistOriginalContent(SourceDescriptor sourceDescriptor) { + if (sourceDescriptor.originalContentStoragePolicy() == OriginalContentStoragePolicy.STORE) { + return true; + } + if (sourceDescriptor.originalContentStoragePolicy() == OriginalContentStoragePolicy.SKIP) { + return false; + } + if (properties.getGenericIngestion().isStoreOriginalContentForWrapperDocuments()) { + return true; + } + return !isWrapperDocument(sourceDescriptor); + } + + private boolean isWrapperDocument(SourceDescriptor sourceDescriptor) { + if (sourceDescriptor.attributes() == null || sourceDescriptor.attributes().isEmpty()) { + return false; + } + String wrapperFlag = sourceDescriptor.attributes().get("wrapperDocument"); + if (wrapperFlag == null) { + wrapperFlag = sourceDescriptor.attributes().get("containerDocument"); + } + return Boolean.parseBoolean(wrapperFlag); + } + private DocumentContent persistOriginalContent(Document document, SourceDescriptor sourceDescriptor, DetectionResult detection, @@ -287,13 +402,14 @@ public class GenericDocumentImportService { private Map persistDerivedContent(Document document, DetectionResult detection, ExtractionResult extractionResult, - String baseHash) { + String baseHash, + String hashNamespace) { Map result = new LinkedHashMap<>(); extractionResult.derivedTextByRole().forEach((role, text) -> { if (!StringUtils.hasText(text)) { return; } - String contentHash = HashUtils.computeSha256(baseHash + ":" + role.name() + ":" + text); + String contentHash = HashUtils.computeSha256(baseHash + ":" + hashNamespace + ":" + role.name() + ":" + text); DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand( document.getId(), role, @@ -336,16 +452,13 @@ public class GenericDocumentImportService { if (!StringUtils.hasText(draft.textBody())) { continue; } - DocumentContent linkedContent = switch (draft.representationType()) { - case FULLTEXT, 
SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK -> - derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent); - }; + DocumentContent linkedContent = resolveLinkedContent(originalContent, derivedContent, draft); var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand( document.getId(), linkedContent == null ? null : linkedContent.getId(), draft.representationType(), - "phase4-generic-builder", + draft.builderKey() == null ? "phase4-generic-builder" : draft.builderKey(), draft.languageCode(), null, draft.chunkIndex(), @@ -362,7 +475,23 @@ public class GenericDocumentImportService { documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); } + private DocumentContent resolveLinkedContent(DocumentContent originalContent, + Map derivedContent, + TextRepresentationDraft draft) { + ContentRole sourceRole = draft.sourceContentRole(); + if (sourceRole == null) { + sourceRole = ContentRole.NORMALIZED_TEXT; + } + if (sourceRole == ContentRole.ORIGINAL) { + return originalContent; + } + return derivedContent.getOrDefault(sourceRole, originalContent); + } + private boolean shouldQueueEmbedding(TextRepresentationDraft draft) { + if (Boolean.FALSE.equals(draft.queueForEmbedding())) { + return false; + } return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? 
draft.primary() : true; } diff --git a/src/main/java/at/procon/dip/ingestion/service/MailMessageExtractionService.java b/src/main/java/at/procon/dip/ingestion/service/MailMessageExtractionService.java new file mode 100644 index 0000000..3c9aea0 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/service/MailMessageExtractionService.java @@ -0,0 +1,122 @@ +package at.procon.dip.ingestion.service; + +import jakarta.mail.BodyPart; +import jakarta.mail.Multipart; +import jakarta.mail.Part; +import jakarta.mail.Session; +import jakarta.mail.internet.MimeMessage; +import jakarta.mail.internet.MimeUtility; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.time.OffsetDateTime; +import java.time.ZoneId; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; +import lombok.extern.slf4j.Slf4j; +import org.jsoup.Jsoup; +import org.springframework.stereotype.Service; + +@Service +@Slf4j +public class MailMessageExtractionService { + + public ParsedMailMessage parse(byte[] rawMime) { + try { + Session session = Session.getDefaultInstance(new Properties()); + MimeMessage message = new MimeMessage(session, new ByteArrayInputStream(rawMime)); + String subject = message.getSubject(); + String from = message.getFrom() != null && message.getFrom().length > 0 ? message.getFrom()[0].toString() : null; + List recipients = new ArrayList<>(); + if (message.getAllRecipients() != null) { + for (var recipient : message.getAllRecipients()) { + recipients.add(recipient.toString()); + } + } + StringBuilder text = new StringBuilder(); + StringBuilder html = new StringBuilder(); + List attachments = new ArrayList<>(); + processPart(message, text, html, attachments); + String normalizedText = text.length() > 0 ? text.toString().trim() : htmlToText(html.toString()); + OffsetDateTime receivedAt = message.getReceivedDate() == null ? 
OffsetDateTime.now() + : message.getReceivedDate().toInstant().atZone(ZoneId.systemDefault()).toOffsetDateTime(); + return new ParsedMailMessage(subject, from, recipients, receivedAt, normalizedText, html.toString(), attachments); + } catch (Exception e) { + throw new IllegalArgumentException("Failed to parse MIME message", e); + } + } + + private void processPart(Part part, StringBuilder text, StringBuilder html, List attachments) throws Exception { + String disposition = part.getDisposition(); + String contentType = part.getContentType() == null ? "application/octet-stream" : part.getContentType(); + if (disposition != null && (Part.ATTACHMENT.equalsIgnoreCase(disposition) || Part.INLINE.equalsIgnoreCase(disposition)) + && part.getFileName() != null) { + attachments.add(extractAttachment(part)); + return; + } + Object content = part.getContent(); + if (content instanceof Multipart multipart) { + for (int i = 0; i < multipart.getCount(); i++) { + BodyPart bodyPart = multipart.getBodyPart(i); + processPart(bodyPart, text, html, attachments); + } + } else if (contentType.toLowerCase().contains("text/plain")) { + text.append(content.toString()).append("\n"); + } else if (contentType.toLowerCase().contains("text/html")) { + html.append(content.toString()).append("\n"); + } else if (part.getFileName() != null) { + attachments.add(extractAttachment(part)); + } + } + + private MailAttachment extractAttachment(Part part) throws Exception { + String fileName = part.getFileName(); + if (fileName == null) { + fileName = "attachment"; + } + try { + fileName = MimeUtility.decodeText(fileName); + } catch (Exception ignored) { + } + String contentType = part.getContentType(); + byte[] data; + try (InputStream in = part.getInputStream(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { + in.transferTo(out); + data = out.toByteArray(); + } + return new MailAttachment(fileName, contentType, data, data.length, null); + } + + private String htmlToText(String html) { + if 
(html == null || html.isBlank()) { + return ""; + } + try { + return Jsoup.parse(html).text().replaceAll("\s+", " ").trim(); + } catch (Exception e) { + log.debug("Falling back to naive HTML cleanup: {}", e.getMessage()); + return html.replaceAll("<[^>]+>", " ").replaceAll("\s+", " ").trim(); + } + } + + public String serializeMessage(ParsedMailMessage parsed) { + StringBuilder sb = new StringBuilder(); + if (parsed.subject() != null) sb.append("Subject: ").append(parsed.subject()).append("\n"); + if (parsed.from() != null) sb.append("From: ").append(parsed.from()).append("\n"); + if (!parsed.recipients().isEmpty()) sb.append("To: ").append(String.join(", ", parsed.recipients())).append("\n"); + sb.append("\n"); + if (parsed.textBody() != null) sb.append(parsed.textBody()); + return sb.toString().trim(); + } + + public record ParsedMailMessage(String subject, String from, List recipients, OffsetDateTime receivedAt, + String textBody, String htmlBody, List attachments) {} + + public record MailAttachment(String fileName, String contentType, byte[] data, long sizeBytes, String path) { + public String safeTextPreview() { + return new String(data, StandardCharsets.UTF_8); + } + } +} diff --git a/src/main/java/at/procon/dip/ingestion/service/TedPackageExpansionService.java b/src/main/java/at/procon/dip/ingestion/service/TedPackageExpansionService.java new file mode 100644 index 0000000..67bfd83 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/service/TedPackageExpansionService.java @@ -0,0 +1,88 @@ +package at.procon.dip.ingestion.service; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import 
org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.springframework.stereotype.Service; + +@Service +@Slf4j +public class TedPackageExpansionService { + + private static final int MAX_FILES = 10000; + private static final long MAX_SINGLE_FILE_SIZE = 20L * 1024 * 1024; + private static final long MAX_TOTAL_EXTRACTED_SIZE = 1024L * 1024 * 1024; + + public TedPackageExpansionResult expand(byte[] tarGzBytes) { + List entries = new ArrayList<>(); + long total = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream( + new GzipCompressorInputStream(new ByteArrayInputStream(tarGzBytes)))) { + TarArchiveEntry entry; + while ((entry = tais.getNextTarEntry()) != null) { + if (entry.isDirectory()) { + continue; + } + if (entries.size() >= MAX_FILES) { + break; + } + String entryName = entry.getName(); + if (!entryName.toLowerCase().endsWith(".xml")) { + continue; + } + if (entryName.contains("..") || entryName.startsWith("/") || entryName.startsWith("\\")) { + log.warn("Skipping suspicious TED package entry {}", entryName); + continue; + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] buffer = new byte[8192]; + long fileSize = 0; + int read; + while ((read = tais.read(buffer)) > 0) { + fileSize += read; + total += read; + if (fileSize > MAX_SINGLE_FILE_SIZE || total > MAX_TOTAL_EXTRACTED_SIZE) { + throw new IOException("TED package extraction limits exceeded"); + } + baos.write(buffer, 0, read); + } + byte[] data = baos.toByteArray(); + entries.add(new TedPackageEntry(extractFilename(entryName), entryName, data, data.length, "application/xml")); + } + } catch (Exception e) { + throw new IllegalArgumentException("Failed to expand TED package", e); + } + + String manifest = buildManifest(entries); + return new TedPackageExpansionResult(entries, manifest); + } + + private String buildManifest(List entries) { + StringBuilder sb = new StringBuilder(); + sb.append("TED package contains 
").append(entries.size()).append(" XML notice files\n"); + for (TedPackageEntry entry : entries) { + sb.append("- ").append(entry.archivePath()).append(" (" ).append(entry.sizeBytes()).append(" bytes)\n"); + } + return sb.toString().trim(); + } + + private String extractFilename(String path) { + int idx = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\')); + return idx >= 0 ? path.substring(idx + 1) : path; + } + + public record TedPackageExpansionResult(List entries, String manifestText) {} + + public record TedPackageEntry(String fileName, String archivePath, byte[] data, long sizeBytes, String mediaType) { + public String textUtf8() { + return new String(data, StandardCharsets.UTF_8); + } + } +} diff --git a/src/main/java/at/procon/dip/ingestion/spi/OriginalContentStoragePolicy.java b/src/main/java/at/procon/dip/ingestion/spi/OriginalContentStoragePolicy.java new file mode 100644 index 0000000..8fcf2d3 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/spi/OriginalContentStoragePolicy.java @@ -0,0 +1,11 @@ +package at.procon.dip.ingestion.spi; + +/** + * Controls whether the ORIGINAL raw payload should be persisted for a single imported document. + * DEFAULT defers to the global ingestion configuration and wrapper-document heuristics. 
+ */ +public enum OriginalContentStoragePolicy { + DEFAULT, + STORE, + SKIP +} diff --git a/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java index 60183ff..cc47fa7 100644 --- a/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java +++ b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java @@ -21,9 +21,16 @@ public record SourceDescriptor( byte[] binaryContent, String textContent, OffsetDateTime receivedAt, + OriginalContentStoragePolicy originalContentStoragePolicy, Map attributes ) { + public SourceDescriptor { + if (originalContentStoragePolicy == null) { + originalContentStoragePolicy = OriginalContentStoragePolicy.DEFAULT; + } + } + public boolean hasInlineBinaryContent() { return binaryContent != null && binaryContent.length > 0; } diff --git a/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java b/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java index 272b114..db3587a 100644 --- a/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java +++ b/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java @@ -45,7 +45,7 @@ public final class DocumentImportSupport { public static DocumentFamily familyFor(DocumentType documentType) { return switch (documentType) { - case TED_NOTICE -> DocumentFamily.PROCUREMENT; + case TED_PACKAGE, TED_NOTICE -> DocumentFamily.PROCUREMENT; case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL; case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN -> DocumentFamily.GENERIC; diff --git a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java index 6d0e2d7..7b5e1ab 100644 --- a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java +++ 
b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java @@ -12,12 +12,16 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.Optional; +import org.springframework.core.annotation.Order; import org.springframework.stereotype.Component; import org.springframework.util.StringUtils; @Component +@Order(100) public class DefaultGenericTextRepresentationBuilder implements TextRepresentationBuilder { + public static final String BUILDER_KEY = "default-generic-text"; + @Override public boolean supports(DocumentType documentType) { return documentType != DocumentType.TED_NOTICE; @@ -39,25 +43,34 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati List drafts = new ArrayList<>(); drafts.add(new TextRepresentationDraft( RepresentationType.FULLTEXT, + BUILDER_KEY, request.detectionResult().languageCode(), baseText, false, - null + null, + ContentRole.NORMALIZED_TEXT, + Boolean.TRUE )); drafts.add(new TextRepresentationDraft( RepresentationType.SEMANTIC_TEXT, + BUILDER_KEY, request.detectionResult().languageCode(), semantic, true, - null + null, + ContentRole.NORMALIZED_TEXT, + Boolean.TRUE )); if (StringUtils.hasText(title)) { drafts.add(new TextRepresentationDraft( RepresentationType.TITLE_ABSTRACT, + BUILDER_KEY, request.detectionResult().languageCode(), title + "\n\n" + summary, false, - null + null, + ContentRole.NORMALIZED_TEXT, + Boolean.FALSE )); } return drafts; diff --git a/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java new file mode 100644 index 0000000..834082c --- /dev/null +++ b/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java @@ -0,0 +1,138 @@ +package at.procon.dip.normalization.impl; + +import at.procon.dip.domain.document.ContentRole; +import 
at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.extraction.spi.ExtractedStructuredPayload; +import at.procon.dip.ingestion.util.DocumentImportSupport; +import at.procon.dip.normalization.spi.RepresentationBuildRequest; +import at.procon.dip.normalization.spi.TextRepresentationBuilder; +import at.procon.dip.normalization.spi.TextRepresentationDraft; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.springframework.core.annotation.Order; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +@Component +@Order(10) +public class TedStructuredTextRepresentationBuilder implements TextRepresentationBuilder { + + public static final String BUILDER_KEY = "ted-structured-text"; + + @Override + public boolean supports(DocumentType documentType) { + return documentType == DocumentType.TED_NOTICE; + } + + @Override + public List build(RepresentationBuildRequest request) { + String normalizedText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT); + if (!StringUtils.hasText(normalizedText)) { + return List.of(); + } + + Map attributes = request.extractionResult().structuredPayloads().stream() + .filter(payload -> Objects.equals(payload.projectionName(), "ted-notice")) + .map(ExtractedStructuredPayload::attributes) + .filter(Objects::nonNull) + .findFirst() + .orElse(Map.of()); + + String title = asString(attributes.get("title")); + String description = asString(attributes.get("description")); + String buyerName = asString(attributes.get("buyerName")); + String cpvCodes = asString(attributes.get("cpvCodes")); + String nutsCodes = asString(attributes.get("nutsCodes")); + String publicationId = asString(attributes.get("publicationId")); + String semanticText = buildSemanticText(title, description, buyerName, cpvCodes, nutsCodes, publicationId, normalizedText); + String 
summary = DocumentImportSupport.ellipsize( + StringUtils.hasText(description) ? description.trim() : normalizedText.replace('\n', ' ').trim(), + 1200 + ); + + List drafts = new ArrayList<>(); + drafts.add(new TextRepresentationDraft( + RepresentationType.SEMANTIC_TEXT, + BUILDER_KEY, + request.detectionResult().languageCode(), + semanticText, + true, + null, + ContentRole.NORMALIZED_TEXT, + Boolean.TRUE + )); + /* + drafts.add(new TextRepresentationDraft( + RepresentationType.FULLTEXT, + BUILDER_KEY, + request.detectionResult().languageCode(), + normalizedText, + false, + null, + ContentRole.NORMALIZED_TEXT, + Boolean.TRUE + )); + if (StringUtils.hasText(title)) { + drafts.add(new TextRepresentationDraft( + RepresentationType.TITLE_ABSTRACT, + BUILDER_KEY, + request.detectionResult().languageCode(), + title + "\n\n" + summary, + false, + null, + ContentRole.NORMALIZED_TEXT, + Boolean.FALSE + )); + } + drafts.add(new TextRepresentationDraft( + RepresentationType.SUMMARY, + BUILDER_KEY, + request.detectionResult().languageCode(), + summary, + false, + null, + ContentRole.NORMALIZED_TEXT, + Boolean.FALSE + )); + */ + return drafts; + } + + private String buildSemanticText(String title, + String description, + String buyerName, + String cpvCodes, + String nutsCodes, + String publicationId, + String normalizedText) { + StringBuilder sb = new StringBuilder(); + sb.append("Document type: TED_NOTICE\n"); + if (StringUtils.hasText(publicationId)) { + sb.append("Publication: ").append(publicationId.trim()).append('\n'); + } + if (StringUtils.hasText(title)) { + sb.append("Title: ").append(title.trim()).append("\n\n"); + } + if (StringUtils.hasText(buyerName)) { + sb.append("Contracting Authority: ").append(buyerName.trim()).append('\n'); + } + if (StringUtils.hasText(cpvCodes)) { + sb.append("CPV Codes: ").append(cpvCodes.trim()).append('\n'); + } + if (StringUtils.hasText(nutsCodes)) { + sb.append("NUTS Codes: ").append(nutsCodes.trim()).append('\n'); + } + if 
(StringUtils.hasText(description)) { + sb.append("\nDescription: ").append(description.trim()).append("\n\n"); + } + sb.append(normalizedText.trim()); + return sb.toString().trim(); + } + + private String asString(Object value) { + return value instanceof String s && StringUtils.hasText(s) ? s : null; + } +} diff --git a/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java b/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java index 38f2d8a..bb2f3f7 100644 --- a/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java +++ b/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java @@ -3,8 +3,10 @@ package at.procon.dip.normalization.service; import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.TextRepresentationBuilder; import at.procon.dip.normalization.spi.TextRepresentationDraft; +import java.util.ArrayList; import java.util.List; import lombok.RequiredArgsConstructor; +import org.springframework.core.annotation.AnnotationAwareOrderComparator; import org.springframework.stereotype.Service; @Service @@ -14,11 +16,22 @@ public class TextRepresentationBuildService { private final List builders; public List build(RepresentationBuildRequest request) { - return builders.stream() + List matchingBuilders = builders.stream() .filter(builder -> builder.supports(request.detectionResult().documentType())) - .findFirst() - .orElseThrow(() -> new IllegalStateException( - "No text representation builder registered for " + request.detectionResult().documentType())) - .build(request); + .sorted(AnnotationAwareOrderComparator.INSTANCE) + .toList(); + if (matchingBuilders.isEmpty()) { + throw new IllegalStateException( + "No text representation builder registered for " + request.detectionResult().documentType()); + } + + List result = new ArrayList<>(); + for (TextRepresentationBuilder builder : 
matchingBuilders) { + List drafts = builder.build(request); + if (drafts != null && !drafts.isEmpty()) { + result.addAll(drafts); + } + } + return result; } } diff --git a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java index af1f49a..4b7322d 100644 --- a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java +++ b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java @@ -1,5 +1,6 @@ package at.procon.dip.normalization.spi; +import at.procon.dip.domain.document.ContentRole; import at.procon.dip.domain.document.RepresentationType; /** @@ -7,9 +8,20 @@ import at.procon.dip.domain.document.RepresentationType; */ public record TextRepresentationDraft( RepresentationType representationType, + String builderKey, String languageCode, String textBody, boolean primary, - Integer chunkIndex + Integer chunkIndex, + ContentRole sourceContentRole, + Boolean queueForEmbedding ) { + + public TextRepresentationDraft(RepresentationType representationType, + String languageCode, + String textBody, + boolean primary, + Integer chunkIndex) { + this(representationType, null, languageCode, textBody, primary, chunkIndex, ContentRole.NORMALIZED_TEXT, null); + } } diff --git a/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java b/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java new file mode 100644 index 0000000..2cdfc14 --- /dev/null +++ b/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java @@ -0,0 +1,110 @@ +package at.procon.dip.processing.impl; + +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.domain.document.ContentRole; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.DocumentType; +import 
at.procon.dip.extraction.spi.ExtractedStructuredPayload; +import at.procon.dip.extraction.spi.ExtractionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import at.procon.dip.processing.spi.DocumentProcessingPolicy; +import at.procon.dip.processing.spi.StructuredDocumentProcessor; +import at.procon.dip.processing.spi.StructuredProcessingRequest; +import at.procon.dip.domain.document.service.DocumentService; +import at.procon.dip.domain.ted.service.TedNoticeProjectionService; +import at.procon.ted.model.entity.ProcurementDocument; +import at.procon.ted.service.XmlParserService; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +@Component +@RequiredArgsConstructor +@Slf4j +public class TedStructuredDocumentProcessor implements StructuredDocumentProcessor { + + private final XmlParserService xmlParserService; + private final DocumentService documentService; + private final TedNoticeProjectionService tedNoticeProjectionService; + + @Override + public boolean supports(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) { + return detectionResult.documentType() == DocumentType.TED_NOTICE; + } + + @Override + public DocumentProcessingPolicy processingPolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) { + return DocumentProcessingPolicy.replaceGenericTextProcessing(); + } + + @Override + public ExtractionResult process(StructuredProcessingRequest request) { + String xml = request.textContent(); + if (!StringUtils.hasText(xml) && request.binaryContent() != null) { + xml = new String(request.binaryContent(), java.nio.charset.StandardCharsets.UTF_8); + } + if (!StringUtils.hasText(xml)) { + return new ExtractionResult(Map.of(), List.of(), List.of("TED structured processor received no XML payload")); + } + + 
ProcurementDocument tedDocument = xmlParserService.parseDocument(xml); + tedDocument.setDocumentHash(request.dedupHash()); + tedDocument.setXmlDocument(xml); + tedDocument.setSourceFilename(request.sourceDescriptor().fileName()); + tedDocument.setSourcePath(request.sourceDescriptor().sourceUri()); + tedDocument.setFileSizeBytes(request.binaryContent() == null ? null : (long) request.binaryContent().length); + + var canonical = request.document(); + canonical.setDocumentType(DocumentType.TED_NOTICE); + canonical.setDocumentFamily(DocumentFamily.PROCUREMENT); + canonical.setStatus(DocumentStatus.CLASSIFIED); + canonical.setTitle(tedDocument.getProjectTitle()); + canonical.setSummary(tedDocument.getProjectDescription()); + canonical.setLanguageCode(tedDocument.getLanguageCode()); + canonical.setMimeType(request.detectionResult().mimeType() == null ? "application/xml" : request.detectionResult().mimeType()); + if (StringUtils.hasText(tedDocument.getPublicationId())) { + canonical.setBusinessKey("TED_NOTICE:" + tedDocument.getPublicationId()); + } else if (StringUtils.hasText(tedDocument.getNoticeId())) { + canonical.setBusinessKey("TED_NOTICE:" + tedDocument.getNoticeId()); + } + documentService.save(canonical); + + tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId()); + + Map payload = new LinkedHashMap<>(); + if (StringUtils.hasText(tedDocument.getProjectTitle())) { + payload.put("title", tedDocument.getProjectTitle()); + } + if (StringUtils.hasText(tedDocument.getProjectDescription())) { + payload.put("description", tedDocument.getProjectDescription()); + } + if (StringUtils.hasText(tedDocument.getBuyerName())) { + payload.put("buyerName", tedDocument.getBuyerName()); + } + if (tedDocument.getCpvCodes() != null && tedDocument.getCpvCodes().length > 0) { + payload.put("cpvCodes", String.join(", ", tedDocument.getCpvCodes())); + } + if (tedDocument.getNutsCodes() != null && tedDocument.getNutsCodes().length > 0) { + 
payload.put("nutsCodes", String.join(", ", tedDocument.getNutsCodes())); + } + payload.put("lotCount", tedDocument.getLots() == null ? 0 : tedDocument.getLots().size()); + payload.put("noticeId", tedDocument.getNoticeId()); + payload.put("publicationId", tedDocument.getPublicationId()); + + Map derivedText = new LinkedHashMap<>(); + if (StringUtils.hasText(tedDocument.getTextContent())) { + derivedText.put(ContentRole.NORMALIZED_TEXT, tedDocument.getTextContent()); + } + + return new ExtractionResult( + derivedText, + List.of(new ExtractedStructuredPayload("ted-notice", payload)), + List.of() + ); + } +} diff --git a/src/main/java/at/procon/dip/processing/service/StructuredDocumentProcessingService.java b/src/main/java/at/procon/dip/processing/service/StructuredDocumentProcessingService.java new file mode 100644 index 0000000..0ff0357 --- /dev/null +++ b/src/main/java/at/procon/dip/processing/service/StructuredDocumentProcessingService.java @@ -0,0 +1,36 @@ +package at.procon.dip.processing.service; + +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.processing.spi.DocumentProcessingPolicy; +import at.procon.dip.processing.spi.StructuredDocumentProcessor; +import at.procon.dip.processing.spi.StructuredProcessingRequest; +import at.procon.dip.extraction.spi.ExtractionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import java.util.List; +import java.util.Optional; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +public class StructuredDocumentProcessingService { + + private final List processors; + + public Optional resolve(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) { + return processors.stream() + .filter(processor -> processor.supports(sourceDescriptor, detectionResult)) + .findFirst(); + } + + public DocumentProcessingPolicy resolvePolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) { + return 
resolve(sourceDescriptor, detectionResult) + .map(processor -> processor.processingPolicy(sourceDescriptor, detectionResult)) + .orElse(DocumentProcessingPolicy.genericDefault()); + } + + public Optional process(StructuredProcessingRequest request) { + return resolve(request.sourceDescriptor(), request.detectionResult()) + .map(processor -> processor.process(request)); + } +} diff --git a/src/main/java/at/procon/dip/processing/spi/DocumentProcessingPolicy.java b/src/main/java/at/procon/dip/processing/spi/DocumentProcessingPolicy.java new file mode 100644 index 0000000..a4cddc9 --- /dev/null +++ b/src/main/java/at/procon/dip/processing/spi/DocumentProcessingPolicy.java @@ -0,0 +1,22 @@ +package at.procon.dip.processing.spi; + +/** + * Controls which generic pipeline stages should run for a document and whether + * a structured processor should be invoked. + */ +public record DocumentProcessingPolicy( + boolean runGenericExtraction, + boolean persistExtractedContent, + boolean runRepresentationBuilders, + boolean invokeStructuredProcessor, + boolean applyStructuredTitleIfMissing +) { + + public static DocumentProcessingPolicy genericDefault() { + return new DocumentProcessingPolicy(true, true, true, true, true); + } + + public static DocumentProcessingPolicy replaceGenericTextProcessing() { + return new DocumentProcessingPolicy(false, true, true, true, true); + } +} diff --git a/src/main/java/at/procon/dip/processing/spi/StructuredDocumentProcessor.java b/src/main/java/at/procon/dip/processing/spi/StructuredDocumentProcessor.java new file mode 100644 index 0000000..2fbe670 --- /dev/null +++ b/src/main/java/at/procon/dip/processing/spi/StructuredDocumentProcessor.java @@ -0,0 +1,19 @@ +package at.procon.dip.processing.spi; + +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.extraction.spi.ExtractionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; + +/** + * Optional type-specific enrichment layer on top of the canonical DOC 
import. + */ +public interface StructuredDocumentProcessor { + + boolean supports(SourceDescriptor sourceDescriptor, DetectionResult detectionResult); + + default DocumentProcessingPolicy processingPolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) { + return DocumentProcessingPolicy.replaceGenericTextProcessing(); + } + + ExtractionResult process(StructuredProcessingRequest request); +} diff --git a/src/main/java/at/procon/dip/processing/spi/StructuredProcessingRequest.java b/src/main/java/at/procon/dip/processing/spi/StructuredProcessingRequest.java new file mode 100644 index 0000000..8d7652b --- /dev/null +++ b/src/main/java/at/procon/dip/processing/spi/StructuredProcessingRequest.java @@ -0,0 +1,20 @@ +package at.procon.dip.processing.spi; + +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.domain.document.entity.DocumentContent; +import at.procon.dip.ingestion.spi.SourceDescriptor; + +/** + * Canonical import context handed to a structured document processor. 
+ */ +public record StructuredProcessingRequest( + Document document, + DocumentContent originalContent, + SourceDescriptor sourceDescriptor, + DetectionResult detectionResult, + byte[] binaryContent, + String textContent, + String dedupHash +) { +} diff --git a/src/main/java/at/procon/ted/camel/MailRoute.java b/src/main/java/at/procon/ted/camel/MailRoute.java index 2a56137..49e9e6b 100644 --- a/src/main/java/at/procon/ted/camel/MailRoute.java +++ b/src/main/java/at/procon/ted/camel/MailRoute.java @@ -1,6 +1,10 @@ package at.procon.ted.camel; +import at.procon.dip.domain.document.SourceType; import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.ingestion.service.DocumentIngestionGateway; +import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; +import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.ted.service.attachment.AttachmentExtractor; import at.procon.ted.service.attachment.AttachmentProcessingService; import jakarta.mail.BodyPart; @@ -14,16 +18,18 @@ import lombok.extern.slf4j.Slf4j; import org.apache.camel.Exchange; import org.apache.camel.LoggingLevel; import org.apache.camel.builder.RouteBuilder; +import org.apache.camel.component.mail.MailMessage; import org.jsoup.Jsoup; import org.springframework.stereotype.Component; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.InputStream; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; +import java.time.OffsetDateTime; +import java.time.ZoneId; +import java.util.*; /** * Apache Camel route for IMAP mail processing. 
@@ -51,6 +57,7 @@ public class MailRoute extends RouteBuilder { private final TedProcessorProperties properties; private final AttachmentProcessingService attachmentProcessingService; + private final DocumentIngestionGateway documentIngestionGateway; @Override public void configure() throws Exception { @@ -105,7 +112,14 @@ public class MailRoute extends RouteBuilder { from("direct:mime") .routeId(ROUTE_ID_MIME) .process(exchange -> { - Message mailMessage = exchange.getIn().getBody(Message.class); + Message mailMessage = null; + MailMessage mailMessage_ = exchange.getIn().getBody(MailMessage.class); + if(mailMessage_ != null) { + mailMessage = mailMessage_.getMessage(); + } + else { + mailMessage = exchange.getIn().getBody(Message.class); + } if (mailMessage == null) { log.warn("Received null mail message, skipping"); @@ -147,6 +161,41 @@ public class MailRoute extends RouteBuilder { log.info("MIME decoded: subject='{}', textLength={}, htmlLength={}, attachments={}", subject, finalTextContent.length(), htmlContent.length(), attachments.size()); + + if (properties.getGenericIngestion().isEnabled() && properties.getGenericIngestion().isMailAdapterEnabled()) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + mailMessage.writeTo(baos); + String sourceIdentifier = mailMessage.getHeader("Message-ID") != null ? + Arrays.stream(mailMessage.getHeader("Message-ID")).findFirst().orElse(null) : null; + if (sourceIdentifier == null || sourceIdentifier.isBlank()) { + sourceIdentifier = UUID.randomUUID().toString(); + } + var result = documentIngestionGateway.ingest(new SourceDescriptor( + null, + SourceType.MAIL, + sourceIdentifier, + null, + subject != null ? subject.replaceAll("[^A-Za-z0-9._-]", "_") + ".eml" : "mail-message.eml", + "message/rfc822", + baos.toByteArray(), + null, + exchange.getIn().getHeader("mailReceivedDate", Date.class) == null + ? 
OffsetDateTime.now() + : exchange.getIn().getHeader("mailReceivedDate", Date.class).toInstant() + .atZone(ZoneId.systemDefault()).toOffsetDateTime(), + OriginalContentStoragePolicy.DEFAULT, + Map.of( + "subject", subject != null ? subject : "", + "from", from != null ? from : "" + ) + )); + if (!result.warnings().isEmpty()) { + log.info("Mail adapter imported MIME message with {} warnings", result.warnings().size()); + } + } catch (Exception e) { + log.warn("Phase 4.1 mail adapter import failed for subject '{}': {}", subject, e.getMessage()); + } + } }) // Queue attachments for async processing .choice() diff --git a/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java b/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java index 4eabe9e..2f290ff 100644 --- a/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java +++ b/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java @@ -1,5 +1,9 @@ package at.procon.ted.camel; +import at.procon.dip.ingestion.service.DocumentIngestionGateway; +import at.procon.dip.ingestion.spi.IngestionResult; +import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; +import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.model.entity.TedDailyPackage; import at.procon.ted.repository.TedDailyPackageRepository; @@ -53,6 +57,7 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder { private final TedProcessorProperties properties; private final TedDailyPackageRepository packageRepository; private final TedPackageDownloadService downloadService; + private final DocumentIngestionGateway documentIngestionGateway; private final BatchDocumentProcessingService batchProcessingService; /** @@ -146,9 +151,21 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder { .process(this::markPackageDuplicate) .otherwise() .process(this::saveDownloadedPackage) - .to("direct:extract-tar-gz") + .choice() + 
.when(header("skipLegacyXmlProcessing").isEqualTo(true)) + .log(LoggingLevel.INFO, "Package ${header.packageId}: generic ingestion gateway-only mode active, skipping legacy XML batch persistence") + .to("direct:complete-package-after-gateway") + .otherwise() + .to("direct:extract-tar-gz") + .endChoice() .end(); + from("direct:complete-package-after-gateway") + .routeId("ted-package-gateway-only-completer") + .setHeader("processingStartTime", constant(System.currentTimeMillis())) + .process(this::markPackageCompleted) + .process(this::logPackageStatistics); + // tar.gz Extraction Route from("direct:extract-tar-gz") .routeId(ROUTE_ID_EXTRACTOR) @@ -214,7 +231,7 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder { long runningCount = downloadingCount + processingCount; exchange.getIn().setHeader("runningCount", runningCount); - exchange.getIn().setHeader("tooManyRunning", runningCount >= 2); + exchange.getIn().setHeader("tooManyRunning", runningCount >= 1); if (runningCount > 0) { log.info("Currently {} packages in progress ({} downloading, {} processing)", @@ -278,21 +295,9 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder { Integer serialNumber = exchange.getIn().getHeader("serialNumber", Integer.class); String downloadUrl = exchange.getIn().getHeader("downloadUrl", String.class); - Optional existing = packageRepository.findByPackageIdentifier(packageId); - if (existing.isPresent()) { - TedDailyPackage pkg = existing.get(); - if (pkg.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND) { - log.info("Retrying existing NOT_FOUND package in Camel route: {}", packageId); - pkg.setDownloadUrl(downloadUrl); - pkg.setErrorMessage(null); - pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING); - pkg = packageRepository.save(pkg); - exchange.getIn().setHeader("packageDbId", pkg.getId()); - return; - } - - log.debug("Package {} already exists in DB with status {}", packageId, pkg.getDownloadStatus()); - 
exchange.getIn().setHeader("packageDbId", pkg.getId()); + // Check if already exists + if (packageRepository.existsByPackageIdentifier(packageId)) { + log.debug("Package {} already exists in DB", packageId); return; } @@ -372,6 +377,50 @@ public class TedPackageDownloadCamelRoute extends RouteBuilder { exchange.getIn().setHeader("downloadPath", downloadPath.toString()); exchange.getIn().setHeader("deleteAfterExtraction", properties.getDownload().isDeleteAfterExtraction()); + + exchange.getIn().setHeader("skipLegacyXmlProcessing", false); + + if (properties.getGenericIngestion().isEnabled() && properties.getGenericIngestion().isTedPackageAdapterEnabled()) { + try { + IngestionResult ingestionResult = documentIngestionGateway.ingest(new SourceDescriptor( + null, + at.procon.dip.domain.document.SourceType.TED_PACKAGE, + packageId, + downloadPath.toString(), + packageId + ".tar.gz", + "application/gzip", + body, + null, + OffsetDateTime.now(), + OriginalContentStoragePolicy.DEFAULT, + java.util.Map.of( + "packageId", packageId, + "title", packageId + ".tar.gz" + ) + )); + + int importedChildCount = Math.max(0, ingestionResult.documents().size() - 1); + exchange.getIn().setHeader("gatewayImportedChildCount", importedChildCount); + exchange.getIn().setHeader("gatewayImportWarnings", ingestionResult.warnings().size()); + + if (properties.getGenericIngestion().isGatewayOnlyForTedPackages()) { + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + pkg.setXmlFileCount(importedChildCount); + pkg.setProcessedCount(importedChildCount); + pkg.setFailedCount(0); + packageRepository.save(pkg); + }); + + if (properties.getDownload().isDeleteAfterExtraction()) { + Files.deleteIfExists(downloadPath); + } + + exchange.getIn().setHeader("skipLegacyXmlProcessing", true); + } + } catch (Exception e) { + log.warn("Phase 4.1 TED package adapter import failed for {}: {}", packageId, e.getMessage()); + } + } } /** diff --git 
a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java index 8a46df5..59bf0e7 100644 --- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java +++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java @@ -510,7 +510,7 @@ public class TedProcessorProperties { /** * Input directory for the generic filesystem importer. */ - private String inputDirectory = "D:/ted.europe/generic-input"; + private String inputDirectory = "/ted.europe/generic-input"; /** * Regular-expression file pattern used by the Camel file route. @@ -570,6 +570,14 @@ public class TedProcessorProperties { */ private boolean deduplicateByContentHash = true; + /** + * Persist ORIGINAL content rows for wrapper/container documents that primarily exist + * to group or reference child documents (for example TED packages or expanded ZIP wrappers). + * When disabled, wrappers are still classified, extracted and represented, but the raw + * ORIGINAL content payload is not stored in DOC.doc_content. + */ + private boolean storeOriginalContentForWrapperDocuments = true; + /** * Queue only the primary text representation for embedding. */ @@ -580,6 +588,50 @@ public class TedProcessorProperties { */ @NotBlank private String importBatchId = "phase4-generic"; + + /** + * Enable the Phase 4.1 TED package adapter built on top of the generic DOC ingestion SPI. + */ + private boolean tedPackageAdapterEnabled = true; + + /** + * Enable the Phase 4.1 mail/document adapter built on top of the generic DOC ingestion SPI. + */ + private boolean mailAdapterEnabled = false; + + /** + * Optional dedicated owner tenant key for imported mail messages and attachments. + * Falls back to defaultOwnerTenantKey when not configured. + */ + private String mailDefaultOwnerTenantKey; + + /** + * Default visibility for imported mail messages and attachments. 
+ */ + private at.procon.dip.domain.access.DocumentVisibility mailDefaultVisibility = at.procon.dip.domain.access.DocumentVisibility.TENANT; + + /** + * Expand ZIP attachments recursively through the mail adapter. + */ + private boolean expandMailZipAttachments = true; + + /** + * Import batch identifier for TED package roots and extracted TED child documents. + */ + @NotBlank + private String tedPackageImportBatchId = "phase41-ted-package"; + + /** + * When true, TED packages are persisted only through the generic ingestion gateway + * and the legacy XML batch persistence path is skipped. + */ + private boolean gatewayOnlyForTedPackages = false; + + /** + * Import batch identifier for imported mail root messages and child attachments. + */ + @NotBlank + private String mailImportBatchId = "phase41-mail"; } } diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 0d9f1ec..743539b 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -39,7 +39,7 @@ spring: order_updates: true flyway: - enabled: true + enabled: false # NOTE(review): this change also adds flyway-core/flyway-database-postgresql and migration V8, which will not run while Flyway is disabled — confirm intentional locations: classpath:db/migration baseline-on-migrate: true create-schemas: true @@ -128,7 +128,9 @@ ted: # TED Daily Package Download configuration download: # Enable/disable automatic package download - enabled: false + enabled: true + # Use service-based camel route + use-service-based: false # Base URL for TED Daily Packages base-url: https://ted.europa.eu/packages/daily/ # Download directory for tar.gz files @@ -136,11 +138,11 @@ ted: # Extract directory for XML files extract-directory: /ted.europe/extracted # Start year for downloads - start-year: 2023 + start-year: 2026 # Max consecutive 404 errors before stopping max-consecutive-404: 4 # Polling interval (milliseconds) - 2 minutes - poll-interval: 120000 + poll-interval: 3600000 # Retry interval for tail NOT_FOUND packages - 6 hours not-found-retry-interval: 21600000 # Grace period after year end before a previous-year tail
404 is treated as final @@ -150,7 +152,7 @@ ted: # Download timeout (milliseconds) - 5 minutes download-timeout: 300000 # Max concurrent downloads - max-concurrent-downloads: 2 + max-concurrent-downloads: 1 # Delay between downloads (milliseconds) for rate limiting - 5 seconds delay-between-downloads: 3000 # Delete tar.gz after extraction @@ -207,13 +209,13 @@ ted: # Phase 4 generic ingestion configuration generic-ingestion: # Master switch for arbitrary document ingestion into the DOC model - enabled: false + enabled: true # Enable file-system polling for non-TED documents file-system-enabled: false # Allow REST/API upload endpoints for arbitrary documents rest-upload-enabled: true # Input directory for the generic Camel file route - input-directory: D:/ted.europe/generic-input + input-directory: /ted.europe/generic-input # Regex for files accepted by the generic file route file-pattern: .*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$ # Move successfully processed files here @@ -236,10 +238,29 @@ ted: max-binary-bytes-in-db: 5242880 # Deduplicate by content hash and attach additional sources to the same canonical document deduplicate-by-content-hash: true + # Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers + store-original-content-for-wrapper-documents: true # Queue only the primary text representation for vectorization vectorize-primary-representation-only: true # Import batch marker written to DOC.doc_source.import_batch_id import-batch-id: phase4-generic + # Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI + ted-package-adapter-enabled: true + # Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI + mail-adapter-enabled: false + # Optional dedicated mail owner tenant, falls back to default-owner-tenant-key + mail-default-owner-tenant-key: + # Visibility for imported mail messages and attachments + mail-default-visibility: TENANT + # Expand ZIP 
attachments recursively through the mail adapter + expand-mail-zip-attachments: true + # Import batch marker for TED package roots and children + ted-package-import-batch-id: phase41-ted-package + # When true, TED package documents are stored only through the generic ingestion gateway + # and the legacy XML batch processing path is skipped + gateway-only-for-ted-packages: true + # Import batch marker for mail roots and attachments + mail-import-batch-id: phase41-mail # Solution Brief processing configuration solution-brief: diff --git a/src/main/resources/db/migration/V8__doc_phase4_1_expand_document_and_source_types.sql b/src/main/resources/db/migration/V8__doc_phase4_1_expand_document_and_source_types.sql new file mode 100644 index 0000000..d2420ca --- /dev/null +++ b/src/main/resources/db/migration/V8__doc_phase4_1_expand_document_and_source_types.sql @@ -0,0 +1,80 @@ +-- Phase 4.1 enum/check-constraint expansion for newly introduced generic document/source types. +-- Supports both: +-- 1) PostgreSQL ENUM-backed columns created by Flyway +-- 2) legacy VARCHAR + CHECK constraint variants that may exist in local/dev databases + +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_type t + JOIN pg_namespace n ON n.oid = t.typnamespace + WHERE n.nspname = 'doc' + AND t.typname = 'doc_document_type' + ) THEN + ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TED_PACKAGE'; + END IF; +END +$$; + +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_type t + JOIN pg_namespace n ON n.oid = t.typnamespace + WHERE n.nspname = 'doc' + AND t.typname = 'doc_source_type' + ) THEN + ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'PACKAGE_CHILD'; + END IF; +END +$$; + +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_constraint c + JOIN pg_class r ON r.oid = c.conrelid + JOIN pg_namespace n ON n.oid = r.relnamespace + WHERE n.nspname = 'doc' + AND r.relname = 'doc_document' + AND c.conname = 'doc_document_document_type_check' + ) THEN + ALTER TABLE DOC.doc_document DROP 
CONSTRAINT doc_document_document_type_check; + ALTER TABLE DOC.doc_document + ADD CONSTRAINT doc_document_document_type_check + CHECK ( + document_type IN ( + 'TED_PACKAGE', 'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML', + 'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN' + ) + ); + END IF; +END +$$; + +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_constraint c + JOIN pg_class r ON r.oid = c.conrelid + JOIN pg_namespace n ON n.oid = r.relnamespace + WHERE n.nspname = 'doc' + AND r.relname = 'doc_source' + AND c.conname = 'doc_source_source_type_check' + ) THEN + ALTER TABLE DOC.doc_source DROP CONSTRAINT doc_source_source_type_check; + ALTER TABLE DOC.doc_source + ADD CONSTRAINT doc_source_source_type_check + CHECK ( + source_type IN ( + 'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD', + 'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION' + ) + ); + END IF; +END +$$;