Refactor phases 4.1
parent
ac59730f3e
commit
f337af56b5
@ -0,0 +1,28 @@
|
||||
# Phase 4.1 adapter extensions
|
||||
|
||||
## Added adapters
|
||||
|
||||
### TED package adapter
|
||||
|
||||
- Source type: `TED_PACKAGE`
|
||||
- Root access: `PUBLIC`, no owner tenant
|
||||
- Root document type: `TED_PACKAGE`
|
||||
- Child source type: `PACKAGE_CHILD`
|
||||
- Child relation: `EXTRACTED_FROM`
|
||||
|
||||
The adapter imports the package artifact plus its XML members into the generic `DOC` model.
|
||||
It does not replace the existing legacy TED package processing path; it complements it, so the subsequent legacy TED parsing step can still enrich the same canonical child documents (matched via dedup hash) into proper `TED_NOTICE` projections.
|
||||
|
||||
### Mail/document adapter
|
||||
|
||||
- Source type: `MAIL`
|
||||
- Root document type: `MIME_MESSAGE`
|
||||
- Child relation: `ATTACHMENT_OF`
|
||||
- Access: configurable via `mail-default-owner-tenant-key` and `mail-default-visibility`
|
||||
|
||||
The adapter stores the message body as the semantic root text and imports attachments as child documents. ZIP attachments can optionally be expanded recursively.
|
||||
|
||||
## Deduplication
|
||||
|
||||
Phase 4 deduplication by content hash is refined so the same payload is only deduplicated within the same access scope (`visibility` + `owner tenant`).
|
||||
This prevents private documents from different tenants from being merged into one canonical document accidentally.
|
||||
@ -0,0 +1,56 @@
|
||||
package at.procon.dip.extraction.impl;
|
||||
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.extraction.spi.DocumentExtractor;
|
||||
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
@Component
|
||||
public class MimeMessageDocumentExtractor implements DocumentExtractor {
|
||||
|
||||
@Override
|
||||
public boolean supports(DocumentType documentType, String mimeType) {
|
||||
return documentType == DocumentType.MIME_MESSAGE || documentType == DocumentType.EMAIL;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExtractionResult extract(ExtractionRequest extractionRequest) {
|
||||
String text = extractionRequest.textContent();
|
||||
if (!StringUtils.hasText(text) && extractionRequest.binaryContent() != null) {
|
||||
text = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
|
||||
}
|
||||
text = text == null ? null : text.replace("\r\n", "\n").replace('\r', '\n').trim();
|
||||
|
||||
Map<String, Object> attributes = new LinkedHashMap<>();
|
||||
if (extractionRequest.sourceDescriptor().attributes() != null) {
|
||||
attributes.putAll(extractionRequest.sourceDescriptor().attributes());
|
||||
}
|
||||
String title = DocumentImportSupport.firstNonBlank(extractionRequest.sourceDescriptor().attributes(), "title", "subject");
|
||||
if (!StringUtils.hasText(title)) {
|
||||
title = extractionRequest.sourceDescriptor().fileName();
|
||||
}
|
||||
if (StringUtils.hasText(title)) {
|
||||
attributes.put("title", title);
|
||||
}
|
||||
|
||||
if (!StringUtils.hasText(text)) {
|
||||
return new ExtractionResult(Map.of(), List.of(new ExtractedStructuredPayload("mail-message", attributes)),
|
||||
List.of("Mail message did not contain extractable text body"));
|
||||
}
|
||||
|
||||
return new ExtractionResult(
|
||||
Map.of(ContentRole.NORMALIZED_TEXT, text),
|
||||
List.of(new ExtractedStructuredPayload("mail-message", attributes)),
|
||||
List.of()
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,49 @@
|
||||
package at.procon.dip.extraction.impl;
|
||||
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.extraction.spi.DocumentExtractor;
|
||||
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
//@Component
|
||||
public class TedPackageManifestExtractor implements DocumentExtractor {
|
||||
|
||||
@Override
|
||||
public boolean supports(DocumentType documentType, String mimeType) {
|
||||
return documentType == DocumentType.TED_PACKAGE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExtractionResult extract(ExtractionRequest extractionRequest) {
|
||||
String manifest = extractionRequest.textContent();
|
||||
if (!StringUtils.hasText(manifest)) {
|
||||
manifest = "TED package: " + extractionRequest.sourceDescriptor().sourceIdentifier();
|
||||
}
|
||||
|
||||
Map<String, Object> attributes = new LinkedHashMap<>();
|
||||
if (extractionRequest.sourceDescriptor().attributes() != null) {
|
||||
attributes.putAll(extractionRequest.sourceDescriptor().attributes());
|
||||
}
|
||||
String title = DocumentImportSupport.firstNonBlank(extractionRequest.sourceDescriptor().attributes(), "title", "packageId");
|
||||
if (!StringUtils.hasText(title)) {
|
||||
title = extractionRequest.sourceDescriptor().fileName();
|
||||
}
|
||||
if (StringUtils.hasText(title)) {
|
||||
attributes.put("title", title);
|
||||
}
|
||||
|
||||
return new ExtractionResult(
|
||||
Map.of(ContentRole.NORMALIZED_TEXT, manifest),
|
||||
List.of(new ExtractedStructuredPayload("ted-package-manifest", attributes)),
|
||||
List.of()
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,170 @@
|
||||
package at.procon.dip.ingestion.adapter;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.RelationType;
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import at.procon.dip.domain.tenant.TenantRef;
|
||||
import at.procon.dip.domain.document.service.DocumentRelationService;
|
||||
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
|
||||
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||
import at.procon.dip.ingestion.service.GenericDocumentImportService;
|
||||
import at.procon.dip.ingestion.service.MailMessageExtractionService;
|
||||
import at.procon.dip.ingestion.service.MailMessageExtractionService.MailAttachment;
|
||||
import at.procon.dip.ingestion.service.MailMessageExtractionService.ParsedMailMessage;
|
||||
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
|
||||
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import at.procon.ted.service.attachment.AttachmentExtractor;
|
||||
import at.procon.ted.service.attachment.ZipExtractionService;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
/**
 * Ingestion adapter for {@link SourceType#MAIL} sources.
 *
 * <p>Imports the raw MIME message as the root document (body text stored as the
 * root's semantic text), then imports each attachment as a child document.
 * ZIP-like attachments can optionally be expanded recursively, in which case
 * the wrapper attachment's original bytes are not stored.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter {

    private final TedProcessorProperties properties;
    private final GenericDocumentImportService importService;
    private final MailMessageExtractionService mailExtractionService;
    private final DocumentRelationService relationService;
    private final ZipExtractionService zipExtractionService;

    /** Active only for MAIL sources when generic ingestion and the mail adapter are enabled. */
    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        return sourceDescriptor.sourceType() == SourceType.MAIL
                && properties.getGenericIngestion().isEnabled()
                && properties.getGenericIngestion().isMailAdapterEnabled();
    }

    /**
     * Imports the mail message and its attachments.
     *
     * @param sourceDescriptor must carry the raw MIME bytes in {@code binaryContent()}
     * @return the imported root document plus all attachment documents, with accumulated warnings
     * @throws IllegalArgumentException when no raw MIME bytes are present
     */
    @Override
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        byte[] rawMime = sourceDescriptor.binaryContent();
        if (rawMime == null || rawMime.length == 0) {
            throw new IllegalArgumentException("Mail adapter requires raw MIME bytes");
        }
        ParsedMailMessage parsed = mailExtractionService.parse(rawMime);
        // Fall back to the configured default mail access context when the caller supplied none.
        DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null ? defaultMailAccessContext() : sourceDescriptor.accessContext();

        // Root attributes: caller-supplied attributes enriched with mail headers and batch metadata.
        Map<String, String> rootAttributes = new LinkedHashMap<>(sourceDescriptor.attributes() == null ? Map.of() : sourceDescriptor.attributes());
        if (parsed.subject() != null) rootAttributes.put("subject", parsed.subject());
        if (parsed.from() != null) rootAttributes.put("from", parsed.from());
        if (!parsed.recipients().isEmpty()) rootAttributes.put("to", String.join(", ", parsed.recipients()));
        rootAttributes.putIfAbsent("title", parsed.subject() != null ? parsed.subject() : sourceDescriptor.fileName());
        rootAttributes.put("attachmentCount", Integer.toString(parsed.attachments().size()));
        rootAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId());

        // Import the message itself as the root document; the serialized header+body
        // text becomes its semantic text, and the raw MIME bytes are stored.
        ImportedDocumentResult rootResult = importService.importDocument(new SourceDescriptor(
                accessContext,
                SourceType.MAIL,
                sourceDescriptor.sourceIdentifier(),
                sourceDescriptor.sourceUri(),
                sourceDescriptor.fileName() != null ? sourceDescriptor.fileName() : fallbackMailFileName(parsed),
                "message/rfc822",
                rawMime,
                mailExtractionService.serializeMessage(parsed),
                parsed.receivedAt() == null ? OffsetDateTime.now() : parsed.receivedAt(),
                OriginalContentStoragePolicy.STORE,
                rootAttributes
        ));

        List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents = new ArrayList<>();
        List<String> warnings = new ArrayList<>(rootResult.warnings());
        documents.add(rootResult.document().toCanonicalMetadata());

        // Import each top-level attachment (depth 0); 1-based sort order.
        int sortOrder = 0;
        for (MailAttachment attachment : parsed.attachments()) {
            importAttachment(rootResult.document().getId(), accessContext, sourceDescriptor, attachment, documents, warnings, ++sortOrder, 0);
        }

        return new IngestionResult(documents, warnings);
    }

    /**
     * Imports one attachment as a child document and relates it to {@code parentDocumentId}.
     *
     * <p>When ZIP expansion is enabled and the attachment is a supported archive,
     * the archive itself is imported as a wrapper document (original bytes skipped)
     * and its entries are imported recursively with {@code depth + 1}.
     * Extraction failures are reported as warnings, not exceptions.
     */
    private void importAttachment(java.util.UUID parentDocumentId, DocumentAccessContext accessContext, SourceDescriptor parentSource,
            MailAttachment attachment, List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents,
            List<String> warnings, int sortOrder, int depth) {
        boolean expandableWrapper = properties.getGenericIngestion().isExpandMailZipAttachments()
                && zipExtractionService.canHandle(attachment.fileName(), attachment.contentType());

        Map<String, String> attachmentAttributes = new LinkedHashMap<>();
        attachmentAttributes.put("title", attachment.fileName());
        attachmentAttributes.put("mailSourceIdentifier", parentSource.sourceIdentifier());
        attachmentAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId());
        if (expandableWrapper) {
            attachmentAttributes.put("wrapperDocument", Boolean.TRUE.toString());
        }

        // Child identifier embeds depth + file name to stay unique within the message.
        ImportedDocumentResult attachmentResult = importService.importDocument(new SourceDescriptor(
                accessContext,
                SourceType.MAIL,
                parentSource.sourceIdentifier() + ":attachment:" + depth + ":" + attachment.fileName(),
                parentSource.sourceUri(),
                attachment.fileName(),
                DocumentImportSupport.normalizeMediaType(attachment.contentType()),
                attachment.data(),
                previewTextIfLikelyText(attachment),
                parentSource.receivedAt() == null ? OffsetDateTime.now() : parentSource.receivedAt(),
                // Wrapper archives are expanded below, so their raw bytes are not persisted.
                expandableWrapper ? OriginalContentStoragePolicy.SKIP : OriginalContentStoragePolicy.STORE,
                attachmentAttributes
        ));
        documents.add(attachmentResult.document().toCanonicalMetadata());
        warnings.addAll(attachmentResult.warnings());
        // Direct attachments use ATTACHMENT_OF; entries pulled out of an archive
        // (depth > 0 or carrying an archive path) use EXTRACTED_FROM.
        RelationType relationType = depth > 0 || attachment.path() != null ? RelationType.EXTRACTED_FROM : RelationType.ATTACHMENT_OF;
        relationService.ensureRelation(new CreateDocumentRelationCommand(
                parentDocumentId, attachmentResult.document().getId(), relationType, sortOrder, attachment.fileName()));

        if (expandableWrapper) {
            AttachmentExtractor.ExtractionResult zipResult = zipExtractionService.extract(attachment.data(), attachment.fileName(), attachment.contentType());
            if (!zipResult.success()) {
                warnings.add("ZIP attachment extraction failed for " + attachment.fileName() + ": " + zipResult.errorMessage());
                return;
            }
            int childSort = 0;
            for (AttachmentExtractor.ChildAttachment child : zipResult.childAttachments()) {
                importAttachment(attachmentResult.document().getId(), accessContext, parentSource,
                        new MailAttachment(child.filename(), child.contentType(), child.data(), child.data().length, child.pathInArchive()),
                        documents, warnings, ++childSort, depth + 1);
            }
        }
    }

    /** Derives a safe {@code .eml} file name from the subject when the source has none. */
    private String fallbackMailFileName(ParsedMailMessage parsed) {
        String subject = parsed.subject() == null || parsed.subject().isBlank() ? "mail-message" : parsed.subject().replaceAll("[^A-Za-z0-9._-]", "_");
        return subject + ".eml";
    }

    /**
     * Builds the default access context for mail documents from configuration:
     * mail-specific owner tenant key (falling back to the global default) and
     * the configured mail visibility. A TENANT visibility without a resolvable
     * tenant is downgraded to RESTRICTED to avoid an unscoped tenant document.
     */
    private DocumentAccessContext defaultMailAccessContext() {
        String tenantKey = properties.getGenericIngestion().getMailDefaultOwnerTenantKey();
        if (tenantKey == null || tenantKey.isBlank()) {
            tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
        }
        DocumentVisibility visibility = properties.getGenericIngestion().getMailDefaultVisibility();
        TenantRef tenant = (tenantKey == null || tenantKey.isBlank()) ? null : new TenantRef(null, tenantKey, tenantKey);
        if (tenant == null && visibility == DocumentVisibility.TENANT) {
            visibility = DocumentVisibility.RESTRICTED;
        }
        return new DocumentAccessContext(tenant, visibility);
    }

    /**
     * Returns a text preview for attachments that look textual (by MIME type or
     * a small set of known text extensions); {@code null} otherwise.
     */
    private String previewTextIfLikelyText(MailAttachment attachment) {
        String mime = DocumentImportSupport.normalizeMediaType(attachment.contentType());
        if (DocumentImportSupport.isLikelyTextMime(mime)) {
            return attachment.safeTextPreview();
        }
        String ext = DocumentImportSupport.extensionOf(attachment.fileName());
        if ("txt".equals(ext) || "xml".equals(ext) || "html".equals(ext) || "htm".equals(ext) || "md".equals(ext)) {
            return attachment.safeTextPreview();
        }
        return null;
    }
}
|
||||
@ -0,0 +1,130 @@
|
||||
package at.procon.dip.ingestion.adapter;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||
import at.procon.dip.domain.document.RelationType;
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.domain.document.service.DocumentRelationService;
|
||||
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
|
||||
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||
import at.procon.dip.ingestion.service.GenericDocumentImportService;
|
||||
import at.procon.dip.ingestion.service.TedPackageExpansionService;
|
||||
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
|
||||
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
/**
 * Ingestion adapter for {@link SourceType#TED_PACKAGE} sources (tar.gz bundles
 * of TED XML notices).
 *
 * <p>Imports the package itself as a wrapper root document (raw archive bytes
 * are NOT stored; a textual manifest serves as its semantic text) and each
 * contained XML entry as a {@link SourceType#PACKAGE_CHILD} document related
 * via {@link RelationType#EXTRACTED_FROM}.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdapter {

    private final TedProcessorProperties properties;
    private final GenericDocumentImportService importService;
    private final TedPackageExpansionService expansionService;
    private final DocumentRelationService relationService;

    /** Active only for TED_PACKAGE sources when generic ingestion and this adapter are enabled. */
    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        return sourceDescriptor.sourceType() == SourceType.TED_PACKAGE
                && properties.getGenericIngestion().isEnabled()
                && properties.getGenericIngestion().isTedPackageAdapterEnabled();
    }

    /**
     * Expands the package and imports the wrapper plus all XML children.
     *
     * @param sourceDescriptor must carry the tar.gz bytes in {@code binaryContent()}
     * @return wrapper + child document metadata with accumulated warnings
     * @throws IllegalArgumentException when no package bytes are present
     */
    @Override
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        byte[] packageBytes = sourceDescriptor.binaryContent();
        if (packageBytes == null || packageBytes.length == 0) {
            throw new IllegalArgumentException("TED package adapter requires tar.gz bytes");
        }

        TedPackageExpansionService.TedPackageExpansionResult expanded = expansionService.expand(packageBytes);
        Map<String, String> rootAttributes = new LinkedHashMap<>(sourceDescriptor.attributes() == null ? Map.of() : sourceDescriptor.attributes());
        rootAttributes.putIfAbsent("packageId", sourceDescriptor.sourceIdentifier());
        rootAttributes.putIfAbsent("title", sourceDescriptor.fileName() != null ? sourceDescriptor.fileName() : sourceDescriptor.sourceIdentifier());
        rootAttributes.put("xmlEntryCount", Integer.toString(expanded.entries().size()));
        rootAttributes.put("wrapperDocument", Boolean.TRUE.toString());
        rootAttributes.put("importBatchId", properties.getGenericIngestion().getTedPackageImportBatchId());

        // TED packages default to PUBLIC access when the caller supplied no context.
        // SKIP: the archive bytes are recoverable from the children, so the wrapper's
        // original content is not persisted; the manifest text stands in for it.
        ImportedDocumentResult packageDocument = importService.importDocument(new SourceDescriptor(
                sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(),
                SourceType.TED_PACKAGE,
                sourceDescriptor.sourceIdentifier(),
                sourceDescriptor.sourceUri(),
                sourceDescriptor.fileName(),
                sourceDescriptor.mediaType() == null ? "application/gzip" : sourceDescriptor.mediaType(),
                packageBytes,
                expanded.manifestText(),
                sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt(),
                OriginalContentStoragePolicy.SKIP,
                rootAttributes
        ));

        List<String> warnings = new ArrayList<>(packageDocument.warnings());
        List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents = new ArrayList<>();
        documents.add(packageDocument.document().toCanonicalMetadata());

        // Import each XML entry as a child document; 1-based sort order in archive order.
        int sortOrder = 0;
        for (TedPackageExpansionService.TedPackageEntry entry : expanded.entries()) {
            sortOrder++;
            String childUri = "tedpkg://" + sourceDescriptor.sourceIdentifier() + "/" + entry.archivePath();
            String childIdentifier = sourceDescriptor.sourceIdentifier() + ":" + entry.archivePath();
            String xmlContent = resolveXmlContent(entry);

            Map<String, String> childAttributes = new LinkedHashMap<>();
            // Hint for the later legacy TED parsing step that enriches this child.
            childAttributes.put("documentTypeHint", "TED_NOTICE");
            childAttributes.put("packageId", sourceDescriptor.sourceIdentifier());
            childAttributes.put("archivePath", entry.archivePath());
            childAttributes.put("title", entry.fileName());
            childAttributes.put("importBatchId", properties.getGenericIngestion().getTedPackageImportBatchId());

            ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor(
                    sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(),
                    SourceType.PACKAGE_CHILD,
                    childIdentifier,
                    childUri,
                    entry.fileName(),
                    entry.mediaType() == null ? "application/xml" : entry.mediaType(),
                    entry.data(),
                    xmlContent,
                    sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt(),
                    OriginalContentStoragePolicy.STORE,
                    childAttributes
            ));

            Document childDocument = childResult.document();
            documents.add(childDocument.toCanonicalMetadata());
            warnings.addAll(childResult.warnings());
            if (childResult.deduplicated()) {
                // Dedup by content hash: the XML already existed; it is only linked, not re-imported.
                warnings.add("TED XML child already existed and was linked to package: " + entry.archivePath());
            }
            relationService.ensureRelation(new CreateDocumentRelationCommand(
                    packageDocument.document().getId(),
                    childDocument.getId(),
                    RelationType.EXTRACTED_FROM,
                    sortOrder,
                    entry.archivePath()
            ));
        }

        return new IngestionResult(documents, warnings);
    }

    /** Prefers the entry's pre-decoded UTF-8 text; otherwise decodes the raw bytes as UTF-8. */
    private String resolveXmlContent(TedPackageExpansionService.TedPackageEntry entry) {
        if (entry.textUtf8() != null && !entry.textUtf8().isBlank()) {
            return entry.textUtf8();
        }
        return new String(entry.data(), StandardCharsets.UTF_8);
    }
}
|
||||
@ -0,0 +1,122 @@
|
||||
package at.procon.dip.ingestion.service;
|
||||
|
||||
import jakarta.mail.BodyPart;
|
||||
import jakarta.mail.Multipart;
|
||||
import jakarta.mail.Part;
|
||||
import jakarta.mail.Session;
|
||||
import jakarta.mail.internet.MimeMessage;
|
||||
import jakarta.mail.internet.MimeUtility;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@Slf4j
|
||||
public class MailMessageExtractionService {
|
||||
|
||||
public ParsedMailMessage parse(byte[] rawMime) {
|
||||
try {
|
||||
Session session = Session.getDefaultInstance(new Properties());
|
||||
MimeMessage message = new MimeMessage(session, new ByteArrayInputStream(rawMime));
|
||||
String subject = message.getSubject();
|
||||
String from = message.getFrom() != null && message.getFrom().length > 0 ? message.getFrom()[0].toString() : null;
|
||||
List<String> recipients = new ArrayList<>();
|
||||
if (message.getAllRecipients() != null) {
|
||||
for (var recipient : message.getAllRecipients()) {
|
||||
recipients.add(recipient.toString());
|
||||
}
|
||||
}
|
||||
StringBuilder text = new StringBuilder();
|
||||
StringBuilder html = new StringBuilder();
|
||||
List<MailAttachment> attachments = new ArrayList<>();
|
||||
processPart(message, text, html, attachments);
|
||||
String normalizedText = text.length() > 0 ? text.toString().trim() : htmlToText(html.toString());
|
||||
OffsetDateTime receivedAt = message.getReceivedDate() == null ? OffsetDateTime.now()
|
||||
: message.getReceivedDate().toInstant().atZone(ZoneId.systemDefault()).toOffsetDateTime();
|
||||
return new ParsedMailMessage(subject, from, recipients, receivedAt, normalizedText, html.toString(), attachments);
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("Failed to parse MIME message", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void processPart(Part part, StringBuilder text, StringBuilder html, List<MailAttachment> attachments) throws Exception {
|
||||
String disposition = part.getDisposition();
|
||||
String contentType = part.getContentType() == null ? "application/octet-stream" : part.getContentType();
|
||||
if (disposition != null && (Part.ATTACHMENT.equalsIgnoreCase(disposition) || Part.INLINE.equalsIgnoreCase(disposition))
|
||||
&& part.getFileName() != null) {
|
||||
attachments.add(extractAttachment(part));
|
||||
return;
|
||||
}
|
||||
Object content = part.getContent();
|
||||
if (content instanceof Multipart multipart) {
|
||||
for (int i = 0; i < multipart.getCount(); i++) {
|
||||
BodyPart bodyPart = multipart.getBodyPart(i);
|
||||
processPart(bodyPart, text, html, attachments);
|
||||
}
|
||||
} else if (contentType.toLowerCase().contains("text/plain")) {
|
||||
text.append(content.toString()).append("\n");
|
||||
} else if (contentType.toLowerCase().contains("text/html")) {
|
||||
html.append(content.toString()).append("\n");
|
||||
} else if (part.getFileName() != null) {
|
||||
attachments.add(extractAttachment(part));
|
||||
}
|
||||
}
|
||||
|
||||
private MailAttachment extractAttachment(Part part) throws Exception {
|
||||
String fileName = part.getFileName();
|
||||
if (fileName == null) {
|
||||
fileName = "attachment";
|
||||
}
|
||||
try {
|
||||
fileName = MimeUtility.decodeText(fileName);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
String contentType = part.getContentType();
|
||||
byte[] data;
|
||||
try (InputStream in = part.getInputStream(); ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||
in.transferTo(out);
|
||||
data = out.toByteArray();
|
||||
}
|
||||
return new MailAttachment(fileName, contentType, data, data.length, null);
|
||||
}
|
||||
|
||||
private String htmlToText(String html) {
|
||||
if (html == null || html.isBlank()) {
|
||||
return "";
|
||||
}
|
||||
try {
|
||||
return Jsoup.parse(html).text().replaceAll("\s+", " ").trim();
|
||||
} catch (Exception e) {
|
||||
log.debug("Falling back to naive HTML cleanup: {}", e.getMessage());
|
||||
return html.replaceAll("<[^>]+>", " ").replaceAll("\s+", " ").trim();
|
||||
}
|
||||
}
|
||||
|
||||
public String serializeMessage(ParsedMailMessage parsed) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (parsed.subject() != null) sb.append("Subject: ").append(parsed.subject()).append("\n");
|
||||
if (parsed.from() != null) sb.append("From: ").append(parsed.from()).append("\n");
|
||||
if (!parsed.recipients().isEmpty()) sb.append("To: ").append(String.join(", ", parsed.recipients())).append("\n");
|
||||
sb.append("\n");
|
||||
if (parsed.textBody() != null) sb.append(parsed.textBody());
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
public record ParsedMailMessage(String subject, String from, List<String> recipients, OffsetDateTime receivedAt,
|
||||
String textBody, String htmlBody, List<MailAttachment> attachments) {}
|
||||
|
||||
public record MailAttachment(String fileName, String contentType, byte[] data, long sizeBytes, String path) {
|
||||
public String safeTextPreview() {
|
||||
return new String(data, StandardCharsets.UTF_8);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,88 @@
|
||||
package at.procon.dip.ingestion.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
 * Streams a TED tar.gz package in memory and extracts its XML entries, guarded
 * by zip-bomb limits (max file count, max per-file size, max total size).
 */
@Service
@Slf4j
public class TedPackageExpansionService {

    // Hard limits against malicious/degenerate archives.
    private static final int MAX_FILES = 10000;
    private static final long MAX_SINGLE_FILE_SIZE = 20L * 1024 * 1024;
    private static final long MAX_TOTAL_EXTRACTED_SIZE = 1024L * 1024 * 1024;

    /**
     * Expands {@code tarGzBytes} into the list of contained XML entries plus a
     * textual manifest listing them.
     *
     * <p>Non-XML entries and directories are skipped; entries with suspicious
     * paths ("..", absolute paths) are skipped with a warning. Extraction
     * silently stops (no error) once {@link #MAX_FILES} entries were collected.
     *
     * @throws IllegalArgumentException when the archive cannot be read or a size limit is exceeded
     */
    public TedPackageExpansionResult expand(byte[] tarGzBytes) {
        List<TedPackageEntry> entries = new ArrayList<>();
        long total = 0;
        try (TarArchiveInputStream tais = new TarArchiveInputStream(
                new GzipCompressorInputStream(new ByteArrayInputStream(tarGzBytes)))) {
            TarArchiveEntry entry;
            // NOTE(review): getNextTarEntry() is deprecated in newer commons-compress
            // in favor of getNextEntry() — consider migrating when upgrading.
            while ((entry = tais.getNextTarEntry()) != null) {
                if (entry.isDirectory()) {
                    continue;
                }
                if (entries.size() >= MAX_FILES) {
                    break;
                }
                String entryName = entry.getName();
                if (!entryName.toLowerCase().endsWith(".xml")) {
                    continue;
                }
                // Path-traversal guard: reject "..", absolute and backslash-rooted names.
                if (entryName.contains("..") || entryName.startsWith("/") || entryName.startsWith("\\")) {
                    log.warn("Skipping suspicious TED package entry {}", entryName);
                    continue;
                }
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                byte[] buffer = new byte[8192];
                long fileSize = 0;
                int read;
                // Copy manually (not transferTo) so the size limits are enforced mid-stream.
                while ((read = tais.read(buffer)) > 0) {
                    fileSize += read;
                    total += read;
                    if (fileSize > MAX_SINGLE_FILE_SIZE || total > MAX_TOTAL_EXTRACTED_SIZE) {
                        throw new IOException("TED package extraction limits exceeded");
                    }
                    baos.write(buffer, 0, read);
                }
                byte[] data = baos.toByteArray();
                entries.add(new TedPackageEntry(extractFilename(entryName), entryName, data, data.length, "application/xml"));
            }
        } catch (Exception e) {
            throw new IllegalArgumentException("Failed to expand TED package", e);
        }

        String manifest = buildManifest(entries);
        return new TedPackageExpansionResult(entries, manifest);
    }

    /** Builds the human-readable manifest: a count line plus one bullet per entry. */
    private String buildManifest(List<TedPackageEntry> entries) {
        StringBuilder sb = new StringBuilder();
        sb.append("TED package contains ").append(entries.size()).append(" XML notice files\n");
        for (TedPackageEntry entry : entries) {
            sb.append("- ").append(entry.archivePath()).append(" (" ).append(entry.sizeBytes()).append(" bytes)\n");
        }
        return sb.toString().trim();
    }

    /** Returns the last path segment after '/' or '\' (the whole path when neither occurs). */
    private String extractFilename(String path) {
        int idx = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        return idx >= 0 ? path.substring(idx + 1) : path;
    }

    /** The extracted XML entries plus a textual manifest describing them. */
    public record TedPackageExpansionResult(List<TedPackageEntry> entries, String manifestText) {}

    /** One XML file pulled from the archive. */
    public record TedPackageEntry(String fileName, String archivePath, byte[] data, long sizeBytes, String mediaType) {
        // Decodes the raw bytes as UTF-8 on every call (not cached).
        public String textUtf8() {
            return new String(data, StandardCharsets.UTF_8);
        }
    }
}
|
||||
@ -0,0 +1,11 @@
|
||||
package at.procon.dip.ingestion.spi;
|
||||
|
||||
/**
 * Controls whether the ORIGINAL raw payload should be persisted for a single imported document.
 * DEFAULT defers to the global ingestion configuration and wrapper-document heuristics.
 */
public enum OriginalContentStoragePolicy {
    /** Defer to the global ingestion configuration and wrapper-document heuristics. */
    DEFAULT,
    /** Always persist the original raw payload for this document. */
    STORE,
    /** Never persist the original raw payload (e.g. wrapper archives whose children are stored). */
    SKIP
}
|
||||
@ -0,0 +1,138 @@
|
||||
package at.procon.dip.normalization.impl;
|
||||
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
|
||||
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import org.springframework.core.annotation.Order;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
@Component
|
||||
@Order(10)
|
||||
public class TedStructuredTextRepresentationBuilder implements TextRepresentationBuilder {
|
||||
|
||||
public static final String BUILDER_KEY = "ted-structured-text";
|
||||
|
||||
@Override
|
||||
public boolean supports(DocumentType documentType) {
|
||||
return documentType == DocumentType.TED_NOTICE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
|
||||
String normalizedText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
|
||||
if (!StringUtils.hasText(normalizedText)) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
Map<String, Object> attributes = request.extractionResult().structuredPayloads().stream()
|
||||
.filter(payload -> Objects.equals(payload.projectionName(), "ted-notice"))
|
||||
.map(ExtractedStructuredPayload::attributes)
|
||||
.filter(Objects::nonNull)
|
||||
.findFirst()
|
||||
.orElse(Map.of());
|
||||
|
||||
String title = asString(attributes.get("title"));
|
||||
String description = asString(attributes.get("description"));
|
||||
String buyerName = asString(attributes.get("buyerName"));
|
||||
String cpvCodes = asString(attributes.get("cpvCodes"));
|
||||
String nutsCodes = asString(attributes.get("nutsCodes"));
|
||||
String publicationId = asString(attributes.get("publicationId"));
|
||||
String semanticText = buildSemanticText(title, description, buyerName, cpvCodes, nutsCodes, publicationId, normalizedText);
|
||||
String summary = DocumentImportSupport.ellipsize(
|
||||
StringUtils.hasText(description) ? description.trim() : normalizedText.replace('\n', ' ').trim(),
|
||||
1200
|
||||
);
|
||||
|
||||
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.SEMANTIC_TEXT,
|
||||
BUILDER_KEY,
|
||||
request.detectionResult().languageCode(),
|
||||
semanticText,
|
||||
true,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.TRUE
|
||||
));
|
||||
/*
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.FULLTEXT,
|
||||
BUILDER_KEY,
|
||||
request.detectionResult().languageCode(),
|
||||
normalizedText,
|
||||
false,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.TRUE
|
||||
));
|
||||
if (StringUtils.hasText(title)) {
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.TITLE_ABSTRACT,
|
||||
BUILDER_KEY,
|
||||
request.detectionResult().languageCode(),
|
||||
title + "\n\n" + summary,
|
||||
false,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.FALSE
|
||||
));
|
||||
}
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.SUMMARY,
|
||||
BUILDER_KEY,
|
||||
request.detectionResult().languageCode(),
|
||||
summary,
|
||||
false,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.FALSE
|
||||
));
|
||||
*/
|
||||
return drafts;
|
||||
}
|
||||
|
||||
private String buildSemanticText(String title,
|
||||
String description,
|
||||
String buyerName,
|
||||
String cpvCodes,
|
||||
String nutsCodes,
|
||||
String publicationId,
|
||||
String normalizedText) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("Document type: TED_NOTICE\n");
|
||||
if (StringUtils.hasText(publicationId)) {
|
||||
sb.append("Publication: ").append(publicationId.trim()).append('\n');
|
||||
}
|
||||
if (StringUtils.hasText(title)) {
|
||||
sb.append("Title: ").append(title.trim()).append("\n\n");
|
||||
}
|
||||
if (StringUtils.hasText(buyerName)) {
|
||||
sb.append("Contracting Authority: ").append(buyerName.trim()).append('\n');
|
||||
}
|
||||
if (StringUtils.hasText(cpvCodes)) {
|
||||
sb.append("CPV Codes: ").append(cpvCodes.trim()).append('\n');
|
||||
}
|
||||
if (StringUtils.hasText(nutsCodes)) {
|
||||
sb.append("NUTS Codes: ").append(nutsCodes.trim()).append('\n');
|
||||
}
|
||||
if (StringUtils.hasText(description)) {
|
||||
sb.append("\nDescription: ").append(description.trim()).append("\n\n");
|
||||
}
|
||||
sb.append(normalizedText.trim());
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
private String asString(Object value) {
|
||||
return value instanceof String s && StringUtils.hasText(s) ? s : null;
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,110 @@
|
||||
package at.procon.dip.processing.impl;
|
||||
|
||||
import at.procon.dip.classification.spi.DetectionResult;
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentStatus;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import at.procon.dip.processing.spi.DocumentProcessingPolicy;
|
||||
import at.procon.dip.processing.spi.StructuredDocumentProcessor;
|
||||
import at.procon.dip.processing.spi.StructuredProcessingRequest;
|
||||
import at.procon.dip.domain.document.service.DocumentService;
|
||||
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
||||
import at.procon.ted.model.entity.ProcurementDocument;
|
||||
import at.procon.ted.service.XmlParserService;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
/**
 * Structured processor for TED notices: parses the notice XML with the legacy
 * {@link XmlParserService}, enriches the canonical DOC document, registers the
 * TED-notice projection, and returns the parsed text and metadata as the
 * extraction result.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class TedStructuredDocumentProcessor implements StructuredDocumentProcessor {

    private final XmlParserService xmlParserService;
    private final DocumentService documentService;
    private final TedNoticeProjectionService tedNoticeProjectionService;

    /** Handles only documents detected as TED notices. */
    @Override
    public boolean supports(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
        return detectionResult.documentType() == DocumentType.TED_NOTICE;
    }

    /** Skips generic text extraction; this processor derives the text itself. */
    @Override
    public DocumentProcessingPolicy processingPolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
        return DocumentProcessingPolicy.replaceGenericTextProcessing();
    }

    @Override
    public ExtractionResult process(StructuredProcessingRequest request) {
        // Prefer the text payload; fall back to decoding the binary payload as UTF-8.
        String xml = request.textContent();
        if (!StringUtils.hasText(xml) && request.binaryContent() != null) {
            xml = new String(request.binaryContent(), java.nio.charset.StandardCharsets.UTF_8);
        }
        if (!StringUtils.hasText(xml)) {
            // No payload at all: return a result carrying a diagnostic message
            // instead of throwing, so the pipeline can continue.
            return new ExtractionResult(Map.of(), List.of(), List.of("TED structured processor received no XML payload"));
        }

        // Parse with the legacy TED XML parser and carry over source bookkeeping.
        ProcurementDocument tedDocument = xmlParserService.parseDocument(xml);
        tedDocument.setDocumentHash(request.dedupHash());
        tedDocument.setXmlDocument(xml);
        tedDocument.setSourceFilename(request.sourceDescriptor().fileName());
        tedDocument.setSourcePath(request.sourceDescriptor().sourceUri());
        tedDocument.setFileSizeBytes(request.binaryContent() == null ? null : (long) request.binaryContent().length);

        // Enrich the canonical DOC document with the parsed notice metadata.
        var canonical = request.document();
        canonical.setDocumentType(DocumentType.TED_NOTICE);
        canonical.setDocumentFamily(DocumentFamily.PROCUREMENT);
        canonical.setStatus(DocumentStatus.CLASSIFIED);
        canonical.setTitle(tedDocument.getProjectTitle());
        canonical.setSummary(tedDocument.getProjectDescription());
        canonical.setLanguageCode(tedDocument.getLanguageCode());
        canonical.setMimeType(request.detectionResult().mimeType() == null ? "application/xml" : request.detectionResult().mimeType());
        // Business key: prefer the publication id, fall back to the notice id.
        if (StringUtils.hasText(tedDocument.getPublicationId())) {
            canonical.setBusinessKey("TED_NOTICE:" + tedDocument.getPublicationId());
        } else if (StringUtils.hasText(tedDocument.getNoticeId())) {
            canonical.setBusinessKey("TED_NOTICE:" + tedDocument.getNoticeId());
        }
        documentService.save(canonical);

        tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId());

        // Structured payload published under projection name "ted-notice";
        // string attributes are emitted only when non-blank.
        Map<String, Object> payload = new LinkedHashMap<>();
        if (StringUtils.hasText(tedDocument.getProjectTitle())) {
            payload.put("title", tedDocument.getProjectTitle());
        }
        if (StringUtils.hasText(tedDocument.getProjectDescription())) {
            payload.put("description", tedDocument.getProjectDescription());
        }
        if (StringUtils.hasText(tedDocument.getBuyerName())) {
            payload.put("buyerName", tedDocument.getBuyerName());
        }
        if (tedDocument.getCpvCodes() != null && tedDocument.getCpvCodes().length > 0) {
            payload.put("cpvCodes", String.join(", ", tedDocument.getCpvCodes()));
        }
        if (tedDocument.getNutsCodes() != null && tedDocument.getNutsCodes().length > 0) {
            payload.put("nutsCodes", String.join(", ", tedDocument.getNutsCodes()));
        }
        payload.put("lotCount", tedDocument.getLots() == null ? 0 : tedDocument.getLots().size());
        payload.put("noticeId", tedDocument.getNoticeId());
        payload.put("publicationId", tedDocument.getPublicationId());

        // The parsed plain text becomes the NORMALIZED_TEXT role, when present.
        Map<ContentRole, String> derivedText = new LinkedHashMap<>();
        if (StringUtils.hasText(tedDocument.getTextContent())) {
            derivedText.put(ContentRole.NORMALIZED_TEXT, tedDocument.getTextContent());
        }

        return new ExtractionResult(
                derivedText,
                List.of(new ExtractedStructuredPayload("ted-notice", payload)),
                List.of()
        );
    }
}
|
||||
@ -0,0 +1,36 @@
|
||||
package at.procon.dip.processing.service;
|
||||
|
||||
import at.procon.dip.classification.spi.DetectionResult;
|
||||
import at.procon.dip.processing.spi.DocumentProcessingPolicy;
|
||||
import at.procon.dip.processing.spi.StructuredDocumentProcessor;
|
||||
import at.procon.dip.processing.spi.StructuredProcessingRequest;
|
||||
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class StructuredDocumentProcessingService {
|
||||
|
||||
private final List<StructuredDocumentProcessor> processors;
|
||||
|
||||
public Optional<StructuredDocumentProcessor> resolve(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
|
||||
return processors.stream()
|
||||
.filter(processor -> processor.supports(sourceDescriptor, detectionResult))
|
||||
.findFirst();
|
||||
}
|
||||
|
||||
public DocumentProcessingPolicy resolvePolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
|
||||
return resolve(sourceDescriptor, detectionResult)
|
||||
.map(processor -> processor.processingPolicy(sourceDescriptor, detectionResult))
|
||||
.orElse(DocumentProcessingPolicy.genericDefault());
|
||||
}
|
||||
|
||||
public Optional<ExtractionResult> process(StructuredProcessingRequest request) {
|
||||
return resolve(request.sourceDescriptor(), request.detectionResult())
|
||||
.map(processor -> processor.process(request));
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,22 @@
|
||||
package at.procon.dip.processing.spi;
|
||||
|
||||
/**
 * Controls which generic pipeline stages should run for a document and whether
 * a structured processor should be invoked.
 *
 * @param runGenericExtraction          run the generic extraction stage
 * @param persistExtractedContent       persist the extracted content
 * @param runRepresentationBuilders     run the text-representation builders
 * @param invokeStructuredProcessor     invoke a matching structured processor, if any
 * @param applyStructuredTitleIfMissing apply the structured result's title when the document has none
 */
public record DocumentProcessingPolicy(
        boolean runGenericExtraction,
        boolean persistExtractedContent,
        boolean runRepresentationBuilders,
        boolean invokeStructuredProcessor,
        boolean applyStructuredTitleIfMissing
) {

    /** All stages enabled — the default when no structured processor overrides the policy. */
    public static DocumentProcessingPolicy genericDefault() {
        return new DocumentProcessingPolicy(true, true, true, true, true);
    }

    /**
     * Identical to {@link #genericDefault()} except generic extraction is skipped,
     * intended for processors that replace the generic text processing themselves.
     */
    public static DocumentProcessingPolicy replaceGenericTextProcessing() {
        return new DocumentProcessingPolicy(false, true, true, true, true);
    }
}
|
||||
@ -0,0 +1,19 @@
|
||||
package at.procon.dip.processing.spi;
|
||||
|
||||
import at.procon.dip.classification.spi.DetectionResult;
|
||||
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
|
||||
/**
 * Optional type-specific enrichment layer on top of the canonical DOC import.
 */
public interface StructuredDocumentProcessor {

    /** Whether this processor handles the given source/detection combination. */
    boolean supports(SourceDescriptor sourceDescriptor, DetectionResult detectionResult);

    /**
     * Pipeline stages to run for a supported document. By default the processor
     * replaces the generic text processing (generic extraction is skipped).
     */
    default DocumentProcessingPolicy processingPolicy(SourceDescriptor sourceDescriptor, DetectionResult detectionResult) {
        return DocumentProcessingPolicy.replaceGenericTextProcessing();
    }

    /** Performs the type-specific processing and returns the extraction outcome. */
    ExtractionResult process(StructuredProcessingRequest request);
}
|
||||
@ -0,0 +1,20 @@
|
||||
package at.procon.dip.processing.spi;
|
||||
|
||||
import at.procon.dip.classification.spi.DetectionResult;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
|
||||
/**
 * Canonical import context handed to a structured document processor.
 *
 * @param document        canonical DOC document entity being imported
 * @param originalContent ORIGINAL content record — NOTE(review): may be null when
 *                        original storage was skipped; confirm against callers
 * @param sourceDescriptor source metadata (file name, URI, source type)
 * @param detectionResult  classification outcome for the payload
 * @param binaryContent    raw payload bytes; may be null (exposed without copy)
 * @param textContent      text payload; may be null or blank
 * @param dedupHash        content hash used for deduplication
 */
public record StructuredProcessingRequest(
        Document document,
        DocumentContent originalContent,
        SourceDescriptor sourceDescriptor,
        DetectionResult detectionResult,
        byte[] binaryContent,
        String textContent,
        String dedupHash
) {
}
|
||||
@ -0,0 +1,80 @@
|
||||
-- Phase 4.1 enum/check-constraint expansion for newly introduced generic document/source types.
|
||||
-- Supports both:
|
||||
-- 1) PostgreSQL ENUM-backed columns created by Flyway
|
||||
-- 2) legacy VARCHAR + CHECK constraint variants that may exist in local/dev databases
|
||||
|
||||
-- ENUM-backed variant: extend doc_document_type with the document types that
-- phase 4.1 introduces.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM pg_type t
        JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc'
          AND t.typname = 'doc_document_type'
    ) THEN
        ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TED_PACKAGE';
        -- Keep the ENUM variant in sync with the CHECK-constraint variant below,
        -- which already lists MIME_MESSAGE (mail adapter root documents).
        ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'MIME_MESSAGE';
    END IF;
END
$$;
|
||||
|
||||
-- ENUM-backed variant: extend doc_source_type with the source types that
-- phase 4.1 introduces.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM pg_type t
        JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc'
          AND t.typname = 'doc_source_type'
    ) THEN
        ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'PACKAGE_CHILD';
        -- Keep the ENUM variant in sync with the CHECK-constraint variant below,
        -- which already lists MAIL (mail adapter source).
        ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'MAIL';
    END IF;
END
$$;
|
||||
|
||||
-- Legacy VARCHAR variant: rebuild the document_type CHECK constraint so it
-- accepts the full phase 4.1 set of document types. The constraint is dropped
-- and re-added because CHECK expressions cannot be altered in place.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM pg_constraint c
        JOIN pg_class r ON r.oid = c.conrelid
        JOIN pg_namespace n ON n.oid = r.relnamespace
        WHERE n.nspname = 'doc'
          AND r.relname = 'doc_document'
          AND c.conname = 'doc_document_document_type_check'
    ) THEN
        ALTER TABLE DOC.doc_document DROP CONSTRAINT doc_document_document_type_check;
        ALTER TABLE DOC.doc_document
            ADD CONSTRAINT doc_document_document_type_check
            CHECK (
                document_type IN (
                    'TED_PACKAGE', 'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
                    'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN'
                )
            );
    END IF;
END
$$;
|
||||
|
||||
-- Legacy VARCHAR variant: rebuild the source_type CHECK constraint so it
-- accepts the full phase 4.1 set of source types. The constraint is dropped
-- and re-added because CHECK expressions cannot be altered in place.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM pg_constraint c
        JOIN pg_class r ON r.oid = c.conrelid
        JOIN pg_namespace n ON n.oid = r.relnamespace
        WHERE n.nspname = 'doc'
          AND r.relname = 'doc_source'
          AND c.conname = 'doc_source_source_type_check'
    ) THEN
        ALTER TABLE DOC.doc_source DROP CONSTRAINT doc_source_source_type_check;
        ALTER TABLE DOC.doc_source
            ADD CONSTRAINT doc_source_source_type_check
            CHECK (
                source_type IN (
                    'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD',
                    'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION'
                )
            );
    END IF;
END
$$;
|
||||
Loading…
Reference in New Issue