diff --git a/docs/architecture/PHASE4_GENERIC_INGESTION_PIPELINE.md b/docs/architecture/PHASE4_GENERIC_INGESTION_PIPELINE.md new file mode 100644 index 0000000..24297b2 --- /dev/null +++ b/docs/architecture/PHASE4_GENERIC_INGESTION_PIPELINE.md @@ -0,0 +1,58 @@ +# Phase 4 - Generic Ingestion Pipeline + +## Goal + +Add the first generic ingestion path so arbitrary documents can be imported into the canonical DOC model, +normalized into text representations, and queued for vectorization without depending on the TED-specific model. + +## Scope implemented + +### Input channels + +- file-system polling route for arbitrary documents +- REST/API upload endpoints + +### Detection + +- file extension + media type based classification + +### Extraction + +- PDF -> text via PDFBox +- HTML -> cleaned text via JSoup +- text / markdown / generic XML -> normalized UTF-8 text +- unsupported binary types -> fallback warning only + +### Representation building + +- default generic builder creates: + - FULLTEXT + - SEMANTIC_TEXT + - TITLE_ABSTRACT + +### Persistence + +- original content stored in DOC.doc_content +- binary originals can now be stored inline in `binary_content` +- derived text variants persisted as additional DOC.doc_content rows +- text representations persisted in DOC.doc_text_representation +- pending embeddings created in DOC.doc_embedding when enabled + +## Access model + +The generic pipeline uses the Phase 0/1 access model: + +- optional owner tenant +- mandatory visibility + +This supports both: +- public documents (`owner_tenant_id = null`, `visibility = PUBLIC`) +- tenant-owned documents (`owner_tenant_id != null`, `visibility = TENANT/SHARED/...`) + +## Deliberately deferred + +- DOCX extraction +- ZIP recursive child import in the generic pipeline +- MIME/EML structured parsing +- generic structured projections beyond TED +- chunked long-document representations diff --git a/src/main/java/at/procon/dip/README_PHASE4.md 
b/src/main/java/at/procon/dip/README_PHASE4.md new file mode 100644 index 0000000..47eb95d --- /dev/null +++ b/src/main/java/at/procon/dip/README_PHASE4.md @@ -0,0 +1,40 @@ +# Phase 4 - Generic Ingestion Pipeline + +Phase 4 introduces the first generalized ingestion flow on top of the DOC backbone. + +## What is included + +- generic ingestion gateway with adapter selection +- file-system ingestion adapter and Camel route +- REST/API upload controller for arbitrary documents +- document type detection by media type / extension +- first extractors for: + - plain text / markdown / generic XML + - HTML + - PDF + - binary fallback +- default representation builder for non-TED documents +- binary payload support in `DOC.doc_content.binary_content` +- automatic creation of pending generic embeddings for imported representations + +## Important behavior + +- current TED runtime remains intact +- generic ingestion is disabled by default and must be enabled with: + - `ted.generic-ingestion.enabled=true` +- file-system polling is separately controlled with: + - `ted.generic-ingestion.file-system-enabled=true` +- REST/API upload endpoints are under: + - `/api/v1/dip/import/upload` + - `/api/v1/dip/import/text` + +## Current supported generic document types + +- PDF +- HTML +- TEXT +- MARKDOWN +- XML_GENERIC +- UNKNOWN text-like files + +DOCX, ZIP child extraction, and MIME body parsing are intentionally left for later phases. 
diff --git a/src/main/java/at/procon/dip/classification/detector/BasicMimeAndExtensionDocumentTypeDetector.java b/src/main/java/at/procon/dip/classification/detector/BasicMimeAndExtensionDocumentTypeDetector.java new file mode 100644 index 0000000..743b6c7 --- /dev/null +++ b/src/main/java/at/procon/dip/classification/detector/BasicMimeAndExtensionDocumentTypeDetector.java @@ -0,0 +1,82 @@ +package at.procon.dip.classification.detector; + +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.classification.spi.DocumentTypeDetector; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import at.procon.dip.ingestion.util.DocumentImportSupport; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +/** + * Basic Phase 4 detector using file extension, media type, and lightweight heuristics. + */ +@Component +public class BasicMimeAndExtensionDocumentTypeDetector implements DocumentTypeDetector { + + @Override + public boolean supports(SourceDescriptor sourceDescriptor) { + return true; + } + + @Override + public DetectionResult detect(SourceDescriptor sourceDescriptor) { + String normalizedMediaType = DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType()); + String extension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName()); + + DocumentType documentType = detectByMediaType(normalizedMediaType); + if (documentType == DocumentType.UNKNOWN) { + documentType = detectByExtension(extension); + } + if (documentType == DocumentType.UNKNOWN && sourceDescriptor.hasInlineTextContent()) { + documentType = DocumentType.TEXT; + } + + DocumentFamily family = DocumentImportSupport.familyFor(documentType); + String languageCode = sourceDescriptor.attributes() == null ? 
null : sourceDescriptor.attributes().get("languageCode");
+        // Restored generic type parameters stripped by HTML escaping in the patch:
+        // all keys/values here are Strings (see puts below and DetectionResult attributes).
+        Map<String, String> attributes = new HashMap<>();
+        attributes.put("detectedExtension", extension);
+        if (normalizedMediaType != null) {
+            attributes.put("normalizedMediaType", normalizedMediaType);
+        }
+        if (StringUtils.hasText(sourceDescriptor.fileName())) {
+            attributes.put("fileName", sourceDescriptor.fileName());
+        }
+
+        return new DetectionResult(documentType, family, normalizedMediaType, languageCode, attributes);
+    }
+
+    private DocumentType detectByMediaType(String mediaType) {
+        if (!StringUtils.hasText(mediaType)) {
+            return DocumentType.UNKNOWN;
+        }
+        return switch (mediaType.toLowerCase(Locale.ROOT)) {
+            case "application/pdf", "application/x-pdf" -> DocumentType.PDF;
+            case "text/html", "application/xhtml+xml" -> DocumentType.HTML;
+            case "text/plain" -> DocumentType.TEXT;
+            case "text/markdown", "text/x-markdown" -> DocumentType.MARKDOWN;
+            case "application/xml", "text/xml" -> DocumentType.XML_GENERIC;
+            case "message/rfc822" -> DocumentType.MIME_MESSAGE;
+            case "application/zip", "application/x-zip-compressed" -> DocumentType.ZIP_ARCHIVE;
+            default -> mediaType.startsWith("text/") ?
DocumentType.TEXT : DocumentType.UNKNOWN; + }; + } + + private DocumentType detectByExtension(String extension) { + return switch (extension) { + case "pdf" -> DocumentType.PDF; + case "html", "htm" -> DocumentType.HTML; + case "txt", "log", "csv", "json", "yaml", "yml" -> DocumentType.TEXT; + case "md", "markdown" -> DocumentType.MARKDOWN; + case "xml", "xsd", "xslt" -> DocumentType.XML_GENERIC; + case "eml", "msg" -> DocumentType.MIME_MESSAGE; + case "zip" -> DocumentType.ZIP_ARCHIVE; + case "docx" -> DocumentType.DOCX; + default -> DocumentType.UNKNOWN; + }; + } +} diff --git a/src/main/java/at/procon/dip/classification/service/DocumentClassificationService.java b/src/main/java/at/procon/dip/classification/service/DocumentClassificationService.java new file mode 100644 index 0000000..2511a6f --- /dev/null +++ b/src/main/java/at/procon/dip/classification/service/DocumentClassificationService.java @@ -0,0 +1,26 @@ +package at.procon.dip.classification.service; + +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.classification.spi.DocumentTypeDetector; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; + +/** + * Selects the first matching detector for a source descriptor. 
+ */
+@Service
+@RequiredArgsConstructor
+public class DocumentClassificationService {
+
+    // Restored element type stripped by HTML escaping in the patch: the pipeline
+    // below calls detector.supports(...) / detector.detect(...), so this must be
+    // the Spring-injected list of all DocumentTypeDetector beans.
+    private final List<DocumentTypeDetector> detectors;
+
+    public DetectionResult detect(SourceDescriptor sourceDescriptor) {
+        return detectors.stream()
+                .filter(detector -> detector.supports(sourceDescriptor))
+                .findFirst()
+                .orElseThrow(() -> new IllegalStateException("No document type detector available"))
+                .detect(sourceDescriptor);
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java
index af746bf..e7c9f76 100644
--- a/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java
+++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java
@@ -13,6 +13,7 @@ import jakarta.persistence.GenerationType;
 import jakarta.persistence.Id;
 import jakarta.persistence.Index;
 import jakarta.persistence.JoinColumn;
+import jakarta.persistence.Lob;
 import jakarta.persistence.ManyToOne;
 import jakarta.persistence.PrePersist;
 import jakarta.persistence.Table;
@@ -66,6 +67,10 @@ public class DocumentContent {
     @Column(name = "text_content", columnDefinition = "TEXT")
     private String textContent;
 
+    @Lob
+    @Column(name = "binary_content")
+    private byte[] binaryContent;
+
     @Column(name = "binary_ref", columnDefinition = "TEXT")
     private String binaryRef;
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java
index 27ee608..a97f14f 100644
--- a/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java
@@ -25,6 +25,7 @@ public class DocumentContentService {
                 .mimeType(command.mimeType())
                 .charsetName(command.charsetName())
                 .textContent(command.textContent())
+                .binaryContent(command.binaryContent())
                 .binaryRef(command.binaryRef())
.contentHash(command.contentHash()) .sizeBytes(command.sizeBytes()) diff --git a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java index 284f9be..3af2e65 100644 --- a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java +++ b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java @@ -11,6 +11,7 @@ public record AddDocumentContentCommand( String mimeType, String charsetName, String textContent, + byte[] binaryContent, String binaryRef, String contentHash, Long sizeBytes diff --git a/src/main/java/at/procon/dip/extraction/impl/BinaryPassThroughDocumentExtractor.java b/src/main/java/at/procon/dip/extraction/impl/BinaryPassThroughDocumentExtractor.java new file mode 100644 index 0000000..63e7f24 --- /dev/null +++ b/src/main/java/at/procon/dip/extraction/impl/BinaryPassThroughDocumentExtractor.java @@ -0,0 +1,30 @@ +package at.procon.dip.extraction.impl; + +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.extraction.spi.DocumentExtractor; +import at.procon.dip.extraction.spi.ExtractionRequest; +import at.procon.dip.extraction.spi.ExtractionResult; +import java.util.List; +import java.util.Map; +import org.springframework.stereotype.Component; + +/** + * Fallback extractor for binary formats not yet supported by specialized extractors. 
+ */ +@Component +public class BinaryPassThroughDocumentExtractor implements DocumentExtractor { + + @Override + public boolean supports(DocumentType documentType, String mimeType) { + return documentType == DocumentType.DOCX + || documentType == DocumentType.ZIP_ARCHIVE + || documentType == DocumentType.GENERIC_BINARY + || documentType == DocumentType.MIME_MESSAGE; + } + + @Override + public ExtractionResult extract(ExtractionRequest extractionRequest) { + return new ExtractionResult(Map.of(), List.of(), List.of( + "No specialized extractor available yet for " + extractionRequest.detectionResult().documentType())); + } +} diff --git a/src/main/java/at/procon/dip/extraction/impl/HtmlDocumentExtractor.java b/src/main/java/at/procon/dip/extraction/impl/HtmlDocumentExtractor.java new file mode 100644 index 0000000..1e03e70 --- /dev/null +++ b/src/main/java/at/procon/dip/extraction/impl/HtmlDocumentExtractor.java @@ -0,0 +1,60 @@ +package at.procon.dip.extraction.impl; + +import at.procon.dip.domain.document.ContentRole; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.extraction.spi.ExtractedStructuredPayload; +import at.procon.dip.extraction.spi.DocumentExtractor; +import at.procon.dip.extraction.spi.ExtractionRequest; +import at.procon.dip.extraction.spi.ExtractionResult; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +@Component +public class HtmlDocumentExtractor implements DocumentExtractor { + + @Override + public boolean supports(DocumentType documentType, String mimeType) { + return documentType == DocumentType.HTML || "text/html".equalsIgnoreCase(mimeType) + || "application/xhtml+xml".equalsIgnoreCase(mimeType); + } + + @Override + public ExtractionResult 
extract(ExtractionRequest extractionRequest) {
+        String html = extractionRequest.textContent();
+        if (!StringUtils.hasText(html) && extractionRequest.binaryContent() != null) {
+            html = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
+        }
+        if (!StringUtils.hasText(html)) {
+            return new ExtractionResult(Map.of(), List.of(), List.of("No HTML content available"));
+        }
+
+        Document parsed = Jsoup.parse(html);
+        parsed.outputSettings().prettyPrint(false);
+        String cleanText = parsed.text();
+
+        // Restored generic type parameters stripped by HTML escaping in the patch:
+        // derived text variants are keyed by ContentRole; payload is String -> Object
+        // (bodyTextLength is an int), matching the escaped declarations elsewhere.
+        Map<ContentRole, String> derivedText = new LinkedHashMap<>();
+        derivedText.put(ContentRole.HTML_CLEAN, parsed.body() != null ? parsed.body().text() : cleanText);
+        derivedText.put(ContentRole.NORMALIZED_TEXT, cleanText);
+
+        Map<String, Object> payload = new LinkedHashMap<>();
+        if (StringUtils.hasText(parsed.title())) {
+            payload.put("title", parsed.title().trim());
+        }
+        if (parsed.body() != null) {
+            payload.put("bodyTextLength", parsed.body().text().length());
+        }
+        List<String> warnings = new ArrayList<>();
+        if (!StringUtils.hasText(parsed.title())) {
+            // The "<title>" token was eaten by the same angle-bracket stripping; restored.
+            warnings.add("HTML document has no <title> element");
+        }
+
+        return new ExtractionResult(derivedText, List.of(new ExtractedStructuredPayload("html-document", payload)), warnings);
+    }
+}
diff --git a/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java b/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java
new file mode 100644
index 0000000..4eb6f07
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/impl/PdfDocumentExtractor.java
@@ -0,0 +1,69 @@
+package at.procon.dip.extraction.impl;
+
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
+import at.procon.dip.extraction.spi.DocumentExtractor;
+import at.procon.dip.extraction.spi.ExtractionRequest;
+import at.procon.dip.extraction.spi.ExtractionResult;
+import at.procon.ted.service.attachment.PdfExtractionService;
+import java.util.LinkedHashMap;
+import java.util.List; +import java.util.Map; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +@Component +@RequiredArgsConstructor +public class PdfDocumentExtractor implements DocumentExtractor { + + private final PdfExtractionService pdfExtractionService; + + @Override + public boolean supports(DocumentType documentType, String mimeType) { + return documentType == DocumentType.PDF || pdfExtractionService.canHandle("dummy.pdf", mimeType); + } + + @Override + public ExtractionResult extract(ExtractionRequest extractionRequest) { + byte[] binary = extractionRequest.binaryContent(); + if (binary == null || binary.length == 0) { + return new ExtractionResult(Map.of(), List.of(), List.of("No PDF binary payload available")); + } + + PdfExtractionService.ExtractionResult extraction = pdfExtractionService.extract( + binary, + extractionRequest.sourceDescriptor().fileName(), + extractionRequest.detectionResult().mimeType() + ); + if (!extraction.success()) { + return new ExtractionResult(Map.of(), List.of(), List.of(extraction.errorMessage())); + } + + String text = extraction.extractedText(); + if (!StringUtils.hasText(text)) { + return new ExtractionResult(Map.of(), List.of(), List.of("PDF text extraction returned no text")); + } + + Map<String, Object> payload = new LinkedHashMap<>(); + payload.put("title", deriveTitle(text, extractionRequest.sourceDescriptor().fileName())); + payload.put("extractor", "pdfbox"); + + return new ExtractionResult( + Map.of(ContentRole.NORMALIZED_TEXT, text), + List.of(new ExtractedStructuredPayload("pdf-document", payload)), + List.of() + ); + } + + private String deriveTitle(String text, String fallback) { + for (String line : text.split("\\n")) { + if (StringUtils.hasText(line)) { + String trimmed = line.trim(); + return trimmed.length() > 240 ? 
trimmed.substring(0, 240) : trimmed; + } + } + return fallback; + } +} diff --git a/src/main/java/at/procon/dip/extraction/impl/PlainTextDocumentExtractor.java b/src/main/java/at/procon/dip/extraction/impl/PlainTextDocumentExtractor.java new file mode 100644 index 0000000..4202ac2 --- /dev/null +++ b/src/main/java/at/procon/dip/extraction/impl/PlainTextDocumentExtractor.java @@ -0,0 +1,69 @@ +package at.procon.dip.extraction.impl; + +import at.procon.dip.domain.document.ContentRole; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.extraction.spi.ExtractedStructuredPayload; +import at.procon.dip.extraction.spi.DocumentExtractor; +import at.procon.dip.extraction.spi.ExtractionRequest; +import at.procon.dip.extraction.spi.ExtractionResult; +import at.procon.dip.ingestion.util.DocumentImportSupport; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +@Component +public class PlainTextDocumentExtractor implements DocumentExtractor { + + @Override + public boolean supports(DocumentType documentType, String mimeType) { + return documentType == DocumentType.TEXT + || documentType == DocumentType.MARKDOWN + || documentType == DocumentType.XML_GENERIC + || documentType == DocumentType.UNKNOWN + || DocumentImportSupport.isLikelyTextMime(mimeType); + } + + @Override + public ExtractionResult extract(ExtractionRequest extractionRequest) { + String text = extractionRequest.textContent(); + if (!StringUtils.hasText(text) && extractionRequest.binaryContent() != null) { + text = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8); + } + text = normalizeText(text); + + if (!StringUtils.hasText(text)) { + return new ExtractionResult(Map.of(), List.of(), List.of("No text content extracted")); + } + + String title = deriveTitle(text, extractionRequest.sourceDescriptor().fileName()); + return new 
ExtractionResult( + Map.of(ContentRole.NORMALIZED_TEXT, text), + List.of(new ExtractedStructuredPayload("generic-document", Map.of("title", title))), + List.of() + ); + } + + private String normalizeText(String text) { + if (text == null) { + return null; + } + return text.replace("\r\n", "\n") + .replace('\r', '\n') + .replaceAll("\\n{3,}", "\n\n") + .replaceAll("[ \t]+\n", "\n") + .trim(); + } + + private String deriveTitle(String text, String fallback) { + if (StringUtils.hasText(text)) { + for (String line : text.split("\\n")) { + if (StringUtils.hasText(line)) { + return DocumentImportSupport.ellipsize(line.trim(), 240); + } + } + } + return fallback; + } +} diff --git a/src/main/java/at/procon/dip/extraction/service/DocumentExtractionService.java b/src/main/java/at/procon/dip/extraction/service/DocumentExtractionService.java new file mode 100644 index 0000000..79da558 --- /dev/null +++ b/src/main/java/at/procon/dip/extraction/service/DocumentExtractionService.java @@ -0,0 +1,26 @@ +package at.procon.dip.extraction.service; + +import at.procon.dip.extraction.spi.DocumentExtractor; +import at.procon.dip.extraction.spi.ExtractionRequest; +import at.procon.dip.extraction.spi.ExtractionResult; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +public class DocumentExtractionService { + + private final List<DocumentExtractor> extractors; + + public ExtractionResult extract(ExtractionRequest extractionRequest) { + return extractors.stream() + .filter(extractor -> extractor.supports( + extractionRequest.detectionResult().documentType(), + extractionRequest.detectionResult().mimeType())) + .findFirst() + .orElseThrow(() -> new IllegalStateException( + "No extractor registered for type " + extractionRequest.detectionResult().documentType())) + .extract(extractionRequest); + } +} diff --git 
a/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java new file mode 100644 index 0000000..91025ec --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java @@ -0,0 +1,28 @@ +package at.procon.dip.ingestion.adapter; + +import at.procon.dip.ingestion.dto.ImportedDocumentResult; +import at.procon.dip.ingestion.service.GenericDocumentImportService; +import at.procon.dip.ingestion.spi.DocumentIngestionAdapter; +import at.procon.dip.ingestion.spi.IngestionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; + +@Component +@RequiredArgsConstructor +public class FileSystemDocumentIngestionAdapter implements DocumentIngestionAdapter { + + private final GenericDocumentImportService importService; + + @Override + public boolean supports(SourceDescriptor sourceDescriptor) { + return sourceDescriptor.sourceType() == at.procon.dip.domain.document.SourceType.FILE_SYSTEM; + } + + @Override + public IngestionResult ingest(SourceDescriptor sourceDescriptor) { + ImportedDocumentResult imported = importService.importDocument(sourceDescriptor); + return new IngestionResult(List.of(imported.document().toCanonicalMetadata()), imported.warnings()); + } +} diff --git a/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java new file mode 100644 index 0000000..39088ed --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java @@ -0,0 +1,31 @@ +package at.procon.dip.ingestion.adapter; + +import at.procon.dip.domain.document.SourceType; +import at.procon.dip.ingestion.dto.ImportedDocumentResult; +import 
at.procon.dip.ingestion.service.GenericDocumentImportService; +import at.procon.dip.ingestion.spi.DocumentIngestionAdapter; +import at.procon.dip.ingestion.spi.IngestionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; + +@Component +@RequiredArgsConstructor +public class InlineContentDocumentIngestionAdapter implements DocumentIngestionAdapter { + + private final GenericDocumentImportService importService; + + @Override + public boolean supports(SourceDescriptor sourceDescriptor) { + return sourceDescriptor.sourceType() == SourceType.REST_UPLOAD + || sourceDescriptor.sourceType() == SourceType.MANUAL_UPLOAD + || sourceDescriptor.sourceType() == SourceType.API; + } + + @Override + public IngestionResult ingest(SourceDescriptor sourceDescriptor) { + ImportedDocumentResult imported = importService.importDocument(sourceDescriptor); + return new IngestionResult(List.of(imported.document().toCanonicalMetadata()), imported.warnings()); + } +} diff --git a/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java b/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java new file mode 100644 index 0000000..6dea73e --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java @@ -0,0 +1,88 @@ +package at.procon.dip.ingestion.camel; + +import at.procon.dip.domain.access.DocumentAccessContext; +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.SourceType; +import at.procon.dip.domain.tenant.TenantRef; +import at.procon.dip.ingestion.service.DocumentIngestionGateway; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import at.procon.ted.config.TedProcessorProperties; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.OffsetDateTime; +import java.util.LinkedHashMap; +import java.util.Map; +import 
lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.apache.camel.Exchange; +import org.apache.camel.builder.RouteBuilder; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +@Component +@RequiredArgsConstructor +@Slf4j +public class GenericFileSystemIngestionRoute extends RouteBuilder { + + private final TedProcessorProperties properties; + private final DocumentIngestionGateway ingestionGateway; + + @Override + public void configure() { + if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isFileSystemEnabled()) { + log.info("Phase 4 generic filesystem ingestion route disabled"); + return; + } + + var config = properties.getGenericIngestion(); + log.info("Configuring Phase 4 generic filesystem ingestion from {}", config.getInputDirectory()); + + fromF("file:%s?recursive=true&include=%s&delay=%d&maxMessagesPerPoll=%d&move=%s&moveFailed=%s", + config.getInputDirectory(), + config.getFilePattern(), + config.getPollInterval(), + config.getMaxMessagesPerPoll(), + config.getProcessedDirectory(), + config.getErrorDirectory()) + .routeId("dip-generic-filesystem-ingestion") + .process(exchange -> processFile(exchange)) + .log("Imported generic document from ${header.CamelFileAbsolutePath}"); + } + + private void processFile(Exchange exchange) throws Exception { + Path path = exchange.getIn().getBody(Path.class); + if (path == null) { + String absolutePath = exchange.getIn().getHeader(Exchange.FILE_PATH, String.class); + path = Path.of(absolutePath); + } + byte[] payload = Files.readAllBytes(path); + Map<String, String> attributes = new LinkedHashMap<>(); + String languageCode = properties.getGenericIngestion().getDefaultLanguageCode(); + if (StringUtils.hasText(languageCode)) { + attributes.put("languageCode", languageCode); + } + + SourceDescriptor descriptor = new SourceDescriptor( + buildDefaultAccessContext(), + SourceType.FILE_SYSTEM, + 
path.getFileName().toString(), + path.toAbsolutePath().toString(), + path.getFileName().toString(), + Files.probeContentType(path), + payload, + null, + OffsetDateTime.now(), + attributes + ); + ingestionGateway.ingest(descriptor); + } + + private DocumentAccessContext buildDefaultAccessContext() { + String ownerTenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey(); + DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility(); + if (!StringUtils.hasText(ownerTenantKey)) { + return new DocumentAccessContext(null, visibility); + } + return new DocumentAccessContext(new TenantRef(null, ownerTenantKey, ownerTenantKey), visibility); + } +} diff --git a/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java b/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java new file mode 100644 index 0000000..33f544a --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java @@ -0,0 +1,128 @@ +package at.procon.dip.ingestion.controller; + +import at.procon.dip.domain.access.DocumentAccessContext; +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.CanonicalDocumentMetadata; +import at.procon.dip.domain.document.SourceType; +import at.procon.dip.domain.tenant.TenantRef; +import at.procon.dip.ingestion.dto.GenericImportResponse; +import at.procon.dip.ingestion.dto.GenericTextImportRequest; +import at.procon.dip.ingestion.service.DocumentIngestionGateway; +import at.procon.dip.ingestion.spi.IngestionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import at.procon.ted.config.TedProcessorProperties; +import java.time.OffsetDateTime; +import java.util.LinkedHashMap; +import java.util.Map; +import lombok.RequiredArgsConstructor; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.util.StringUtils; +import 
org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestBody; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.multipart.MultipartFile; + +@RestController +@RequestMapping("/v1/dip/import") +@RequiredArgsConstructor +public class GenericDocumentImportController { + + private final TedProcessorProperties properties; + private final DocumentIngestionGateway ingestionGateway; + + @PostMapping(path = "/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) + public ResponseEntity<GenericImportResponse> upload( + @RequestParam("file") MultipartFile file, + @RequestParam(value = "ownerTenantKey", required = false) String ownerTenantKey, + @RequestParam(value = "visibility", required = false) DocumentVisibility visibility, + @RequestParam(value = "languageCode", required = false) String languageCode, + @RequestParam(value = "title", required = false) String title, + @RequestParam(value = "sourceIdentifier", required = false) String sourceIdentifier + ) throws Exception { + ensureRestUploadEnabled(); + + Map<String, String> attributes = new LinkedHashMap<>(); + if (StringUtils.hasText(languageCode)) { + attributes.put("languageCode", languageCode); + } + if (StringUtils.hasText(title)) { + attributes.put("title", title); + } + + SourceDescriptor descriptor = new SourceDescriptor( + buildAccessContext(ownerTenantKey, visibility), + SourceType.REST_UPLOAD, + StringUtils.hasText(sourceIdentifier) ? 
sourceIdentifier : file.getOriginalFilename(), + null, + file.getOriginalFilename(), + file.getContentType(), + file.getBytes(), + null, + OffsetDateTime.now(), + attributes + ); + IngestionResult result = ingestionGateway.ingest(descriptor); + return ResponseEntity.ok(toResponse(result)); + } + + @PostMapping(path = "/text", consumes = MediaType.APPLICATION_JSON_VALUE) + public ResponseEntity<GenericImportResponse> importText(@RequestBody GenericTextImportRequest request) { + ensureRestUploadEnabled(); + Map<String, String> attributes = new LinkedHashMap<>(); + if (StringUtils.hasText(request.languageCode())) { + attributes.put("languageCode", request.languageCode()); + } + if (StringUtils.hasText(request.title())) { + attributes.put("title", request.title()); + } + + SourceDescriptor descriptor = new SourceDescriptor( + buildAccessContext(request.ownerTenantKey(), request.visibility()), + SourceType.API, + StringUtils.hasText(request.sourceIdentifier()) ? request.sourceIdentifier() : request.fileName(), + null, + request.fileName(), + request.mediaType(), + request.text() == null ? null : request.text().getBytes(java.nio.charset.StandardCharsets.UTF_8), + request.text(), + OffsetDateTime.now(), + attributes + ); + IngestionResult result = ingestionGateway.ingest(descriptor); + return ResponseEntity.ok(toResponse(result)); + } + + private void ensureRestUploadEnabled() { + if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isRestUploadEnabled()) { + throw new IllegalStateException("Generic REST import is disabled"); + } + } + + private DocumentAccessContext buildAccessContext(String ownerTenantKey, DocumentVisibility visibility) { + DocumentVisibility effectiveVisibility = visibility != null + ? 
visibility + : properties.getGenericIngestion().getDefaultVisibility(); + if (!StringUtils.hasText(ownerTenantKey)) { + return new DocumentAccessContext(null, effectiveVisibility); + } + return new DocumentAccessContext(new TenantRef(null, ownerTenantKey, ownerTenantKey), effectiveVisibility); + } + + private GenericImportResponse toResponse(IngestionResult result) { + CanonicalDocumentMetadata metadata = result.documents().stream() + .findFirst() + .orElseThrow(() -> new IllegalStateException("No imported document metadata returned")); + return new GenericImportResponse( + metadata.documentId(), + metadata.title(), + metadata.documentType(), + metadata.documentFamily(), + metadata.status(), + result.warnings().stream().anyMatch(w -> w != null && w.contains("already imported")), + result.warnings() + ); + } +} diff --git a/src/main/java/at/procon/dip/ingestion/dto/GenericImportResponse.java b/src/main/java/at/procon/dip/ingestion/dto/GenericImportResponse.java new file mode 100644 index 0000000..a2da5af --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/dto/GenericImportResponse.java @@ -0,0 +1,18 @@ +package at.procon.dip.ingestion.dto; + +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.DocumentStatus; +import java.util.List; +import java.util.UUID; + +public record GenericImportResponse( + UUID documentId, + String title, + DocumentType documentType, + DocumentFamily documentFamily, + DocumentStatus status, + boolean deduplicated, + List<String> warnings +) { +} diff --git a/src/main/java/at/procon/dip/ingestion/dto/GenericTextImportRequest.java b/src/main/java/at/procon/dip/ingestion/dto/GenericTextImportRequest.java new file mode 100644 index 0000000..c6560f9 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/dto/GenericTextImportRequest.java @@ -0,0 +1,15 @@ +package at.procon.dip.ingestion.dto; + +import at.procon.dip.domain.access.DocumentVisibility; 
+ +public record GenericTextImportRequest( + String text, + String fileName, + String mediaType, + String ownerTenantKey, + DocumentVisibility visibility, + String languageCode, + String title, + String sourceIdentifier +) { +} diff --git a/src/main/java/at/procon/dip/ingestion/dto/ImportedDocumentResult.java b/src/main/java/at/procon/dip/ingestion/dto/ImportedDocumentResult.java new file mode 100644 index 0000000..fc85716 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/dto/ImportedDocumentResult.java @@ -0,0 +1,16 @@ +package at.procon.dip.ingestion.dto; + +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.domain.document.entity.Document; +import java.util.List; + +/** + * Internal result of the generic Phase 4 import pipeline. + */ +public record ImportedDocumentResult( + Document document, + DetectionResult detectionResult, + List<String> warnings, + boolean deduplicated +) { +} diff --git a/src/main/java/at/procon/dip/ingestion/service/DocumentIngestionGateway.java b/src/main/java/at/procon/dip/ingestion/service/DocumentIngestionGateway.java new file mode 100644 index 0000000..0666467 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/service/DocumentIngestionGateway.java @@ -0,0 +1,24 @@ +package at.procon.dip.ingestion.service; + +import at.procon.dip.ingestion.spi.DocumentIngestionAdapter; +import at.procon.dip.ingestion.spi.IngestionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +public class DocumentIngestionGateway { + + private final List<DocumentIngestionAdapter> adapters; + + public IngestionResult ingest(SourceDescriptor sourceDescriptor) { + return adapters.stream() + .filter(adapter -> adapter.supports(sourceDescriptor)) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException( + "No ingestion adapter registered for source type " + 
sourceDescriptor.sourceType())) + .ingest(sourceDescriptor); + } +} diff --git a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java new file mode 100644 index 0000000..fbdaa5b --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java @@ -0,0 +1,393 @@ +package at.procon.dip.ingestion.service; + +import at.procon.dip.classification.service.DocumentClassificationService; +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.domain.access.DocumentAccessContext; +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.CanonicalDocumentMetadata; +import at.procon.dip.domain.document.ContentRole; +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.StorageType; +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.domain.document.entity.DocumentContent; +import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; +import at.procon.dip.domain.document.entity.DocumentSource; +import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; +import at.procon.dip.domain.document.repository.DocumentRepository; +import at.procon.dip.domain.document.repository.DocumentSourceRepository; +import at.procon.dip.domain.document.service.DocumentContentService; +import at.procon.dip.domain.document.service.DocumentEmbeddingService; +import at.procon.dip.domain.document.service.DocumentRepresentationService; +import at.procon.dip.domain.document.service.DocumentService; +import at.procon.dip.domain.document.service.DocumentSourceService; +import at.procon.dip.domain.document.service.command.AddDocumentContentCommand; +import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand; +import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand; +import 
at.procon.dip.domain.document.service.command.CreateDocumentCommand; +import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand; +import at.procon.dip.extraction.service.DocumentExtractionService; +import at.procon.dip.extraction.spi.ExtractionRequest; +import at.procon.dip.extraction.spi.ExtractionResult; +import at.procon.dip.ingestion.dto.ImportedDocumentResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import at.procon.dip.ingestion.util.DocumentImportSupport; +import at.procon.dip.normalization.service.TextRepresentationBuildService; +import at.procon.dip.normalization.spi.RepresentationBuildRequest; +import at.procon.dip.normalization.spi.TextRepresentationDraft; +import at.procon.ted.config.TedProcessorProperties; +import at.procon.ted.util.HashUtils; +import java.nio.charset.StandardCharsets; +import java.time.OffsetDateTime; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; +import org.springframework.util.StringUtils; + +/** + * Phase 4 generic import pipeline that persists arbitrary document types into the DOC model. 
+ */ +@Service +@RequiredArgsConstructor +@Slf4j +public class GenericDocumentImportService { + + private final TedProcessorProperties properties; + private final DocumentRepository documentRepository; + private final DocumentSourceRepository documentSourceRepository; + private final DocumentEmbeddingRepository documentEmbeddingRepository; + private final DocumentService documentService; + private final DocumentSourceService documentSourceService; + private final DocumentContentService documentContentService; + private final DocumentRepresentationService documentRepresentationService; + private final DocumentEmbeddingService documentEmbeddingService; + private final DocumentClassificationService classificationService; + private final DocumentExtractionService extractionService; + private final TextRepresentationBuildService representationBuildService; + + @Transactional + public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) { + ResolvedPayload payload = resolvePayload(sourceDescriptor); + DetectionResult detection = classificationService.detect(withResolvedMediaType(sourceDescriptor, payload)); + String dedupHash = HashUtils.computeSha256(payload.binaryContent()); + + if (properties.getGenericIngestion().isDeduplicateByContentHash()) { + Optional<Document> existing = documentRepository.findByDedupHash(dedupHash); + if (existing.isPresent()) { + Document document = existing.get(); + ensureSource(document, sourceDescriptor); + List<String> warnings = List.of("Document content hash already imported; linked new source to existing document"); + return new ImportedDocumentResult(document, detection, warnings, true); + } + } + + DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null + ? defaultAccessContext() + : sourceDescriptor.accessContext(); + + Document document = documentService.create(new CreateDocumentCommand( + accessContext.ownerTenant() == null ? 
null : accessContext.ownerTenant().tenantKey(), + accessContext.visibility(), + detection.documentType(), + detection.documentFamily(), + DocumentStatus.RECEIVED, + determineInitialTitle(sourceDescriptor, detection, payload), + null, + detection.languageCode(), + detection.mimeType(), + buildBusinessKey(sourceDescriptor), + dedupHash + )); + + ensureSource(document, sourceDescriptor); + documentService.updateStatus(document.getId(), DocumentStatus.CLASSIFIED); + + DocumentContent originalContent = persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash); + + ExtractionResult extractionResult = extractionService.extract(new ExtractionRequest( + sourceDescriptor, + detection, + payload.textContent(), + payload.binaryContent() + )); + List<String> warnings = new ArrayList<>(extractionResult.warnings()); + + Map<ContentRole, DocumentContent> persistedDerivedContent = persistDerivedContent(document, detection, extractionResult, dedupHash); + documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED); + + var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult)); + persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts); + + Document reloaded = documentService.getRequired(document.getId()); + if (reloaded.getStatus() == DocumentStatus.EXTRACTED) { + documentService.updateStatus(reloaded.getId(), DocumentStatus.REPRESENTED); + reloaded = documentService.getRequired(reloaded.getId()); + } + + if (!extractionResult.structuredPayloads().isEmpty()) { + applyStructuredTitleIfMissing(reloaded, extractionResult); + reloaded = documentService.getRequired(reloaded.getId()); + } + + return new ImportedDocumentResult(reloaded, detection, warnings, false); + } + + private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) { + if (StringUtils.hasText(sourceDescriptor.mediaType())) { + return 
sourceDescriptor; + } + return new SourceDescriptor( + sourceDescriptor.accessContext(), + sourceDescriptor.sourceType(), + sourceDescriptor.sourceIdentifier(), + sourceDescriptor.sourceUri(), + sourceDescriptor.fileName(), + payload.mediaType(), + sourceDescriptor.binaryContent(), + sourceDescriptor.textContent(), + sourceDescriptor.receivedAt(), + sourceDescriptor.attributes() + ); + } + + private ResolvedPayload resolvePayload(SourceDescriptor sourceDescriptor) { + byte[] binary = sourceDescriptor.binaryContent(); + String text = sourceDescriptor.textContent(); + + if ((binary == null || binary.length == 0) && StringUtils.hasText(text)) { + binary = text.getBytes(StandardCharsets.UTF_8); + } + if ((binary == null || binary.length == 0) && StringUtils.hasText(sourceDescriptor.sourceUri())) { + try { + java.nio.file.Path path = java.nio.file.Path.of(sourceDescriptor.sourceUri()); + if (java.nio.file.Files.exists(path)) { + binary = java.nio.file.Files.readAllBytes(path); + } + } catch (Exception e) { + throw new IllegalArgumentException("Failed to read source payload from " + sourceDescriptor.sourceUri(), e); + } + } + if (binary == null || binary.length == 0) { + throw new IllegalArgumentException("No payload content available for source " + sourceDescriptor.sourceIdentifier()); + } + if (!StringUtils.hasText(text) && DocumentImportSupport.isLikelyTextMime(sourceDescriptor.mediaType())) { + text = new String(binary, StandardCharsets.UTF_8); + } + + return new ResolvedPayload(binary, text, inferMediaType(sourceDescriptor)); + } + + private String inferMediaType(SourceDescriptor sourceDescriptor) { + if (StringUtils.hasText(sourceDescriptor.mediaType())) { + return DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType()); + } + String extension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName()); + return switch (extension) { + case "pdf" -> "application/pdf"; + case "html", "htm" -> "text/html"; + case "md", "markdown" -> 
"text/markdown"; + case "xml" -> "application/xml"; + case "txt", "log", "csv", "json", "yaml", "yml" -> "text/plain"; + case "eml" -> "message/rfc822"; + default -> null; + }; + } + + private DocumentAccessContext defaultAccessContext() { + String tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey(); + DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility(); + if (!StringUtils.hasText(tenantKey)) { + return new DocumentAccessContext(null, visibility); + } + return new DocumentAccessContext(new at.procon.dip.domain.tenant.TenantRef(null, tenantKey, tenantKey), visibility); + } + + private String determineInitialTitle(SourceDescriptor sourceDescriptor, DetectionResult detection, ResolvedPayload payload) { + if (sourceDescriptor.attributes() != null && StringUtils.hasText(sourceDescriptor.attributes().get("title"))) { + return sourceDescriptor.attributes().get("title"); + } + if (StringUtils.hasText(sourceDescriptor.fileName())) { + return sourceDescriptor.fileName(); + } + if (StringUtils.hasText(payload.textContent())) { + for (String line : payload.textContent().split("\\n")) { + if (StringUtils.hasText(line)) { + return DocumentImportSupport.ellipsize(line.trim(), 240); + } + } + } + return detection.documentType().name(); + } + + private String buildBusinessKey(SourceDescriptor sourceDescriptor) { + if (StringUtils.hasText(sourceDescriptor.sourceIdentifier())) { + return sourceDescriptor.sourceType() + ":" + sourceDescriptor.sourceIdentifier(); + } + if (StringUtils.hasText(sourceDescriptor.sourceUri())) { + return sourceDescriptor.sourceType() + ":" + sourceDescriptor.sourceUri(); + } + return sourceDescriptor.sourceType() + ":" + java.util.UUID.randomUUID(); + } + + private void ensureSource(Document document, SourceDescriptor sourceDescriptor) { + boolean alreadyLinked = documentSourceRepository.findByDocument_Id(document.getId()).stream().anyMatch(existing -> + equalsNullable(existing.getExternalSourceId(), 
sourceDescriptor.sourceIdentifier()) + && equalsNullable(existing.getSourceUri(), sourceDescriptor.sourceUri()) + && equalsNullable(existing.getSourceFilename(), sourceDescriptor.fileName())); + if (alreadyLinked) { + return; + } + + documentSourceService.addSource(new AddDocumentSourceCommand( + document.getId(), + sourceDescriptor.sourceType(), + sourceDescriptor.sourceIdentifier(), + sourceDescriptor.sourceUri(), + sourceDescriptor.fileName(), + null, + properties.getGenericIngestion().getImportBatchId(), + sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt() + )); + } + + private DocumentContent persistOriginalContent(Document document, + SourceDescriptor sourceDescriptor, + DetectionResult detection, + ResolvedPayload payload, + String dedupHash) { + boolean storeBinaryInDb = shouldStoreBinaryInDb(payload.binaryContent()); + boolean textPreferred = DocumentImportSupport.isLikelyTextMime(detection.mimeType()) || sourceDescriptor.hasInlineTextContent(); + + return documentContentService.addContent(new AddDocumentContentCommand( + document.getId(), + ContentRole.ORIGINAL, + textPreferred ? StorageType.DB_TEXT : (storeBinaryInDb ? StorageType.DB_BINARY : StorageType.EXTERNAL_REFERENCE), + detection.mimeType(), + textPreferred ? StandardCharsets.UTF_8.name() : null, + textPreferred ? payload.textContent() : null, + storeBinaryInDb && !textPreferred ? payload.binaryContent() : null, + !storeBinaryInDb && !textPreferred ? 
sourceDescriptor.sourceUri() : null, + dedupHash, + (long) payload.binaryContent().length + )); + } + + private boolean shouldStoreBinaryInDb(byte[] binaryContent) { + return properties.getGenericIngestion().isStoreOriginalBinaryInDb() + && binaryContent != null + && binaryContent.length <= properties.getGenericIngestion().getMaxBinaryBytesInDb(); + } + + private Map<ContentRole, DocumentContent> persistDerivedContent(Document document, + DetectionResult detection, + ExtractionResult extractionResult, + String baseHash) { + Map<ContentRole, DocumentContent> result = new LinkedHashMap<>(); + extractionResult.derivedTextByRole().forEach((role, text) -> { + if (!StringUtils.hasText(text)) { + return; + } + String contentHash = HashUtils.computeSha256(baseHash + ":" + role.name() + ":" + text); + DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand( + document.getId(), + role, + StorageType.DB_TEXT, + detection.mimeType(), + StandardCharsets.UTF_8.name(), + text, + null, + null, + contentHash, + (long) text.length() + )); + result.put(role, content); + }); + return result; + } + + private void persistRepresentationsAndEmbeddings(Document document, + DocumentContent originalContent, + Map<ContentRole, DocumentContent> derivedContent, + List<TextRepresentationDraft> drafts) { + if (drafts == null || drafts.isEmpty()) { + return; + } + + DocumentEmbeddingModel model = null; + if (properties.getVectorization().isEnabled() && properties.getVectorization().isGenericPipelineEnabled()) { + model = documentEmbeddingService.registerModel(new RegisterEmbeddingModelCommand( + properties.getVectorization().getModelName(), + properties.getVectorization().getEmbeddingProvider(), + properties.getVectorization().getModelName(), + properties.getVectorization().getDimensions(), + null, + false, + true + )); + } + + for (TextRepresentationDraft draft : drafts) { + if (!StringUtils.hasText(draft.textBody())) { + continue; + } + DocumentContent 
linkedContent = switch (draft.representationType()) { + case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK -> + derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent); + }; + + var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand( + document.getId(), + linkedContent == null ? null : linkedContent.getId(), + draft.representationType(), + "phase4-generic-builder", + draft.languageCode(), + null, + draft.chunkIndex(), + null, + null, + draft.primary(), + draft.textBody() + )); + + if (model != null && shouldQueueEmbedding(draft)) { + documentEmbeddingService.ensurePendingEmbedding(document.getId(), representation.getId(), model.getId()); + } + } + documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); + } + + private boolean shouldQueueEmbedding(TextRepresentationDraft draft) { + return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true; + } + + private void applyStructuredTitleIfMissing(Document document, ExtractionResult extractionResult) { + boolean missingTitle = !StringUtils.hasText(document.getTitle()) || document.getTitle().equals(document.getDocumentType().name()); + if (!missingTitle) { + return; + } + for (var payload : extractionResult.structuredPayloads()) { + if (payload.attributes() == null) { + continue; + } + Object title = payload.attributes().get("title"); + if (title instanceof String titleValue && StringUtils.hasText(titleValue)) { + document.setTitle(titleValue); + documentService.save(document); + return; + } + } + } + + private boolean equalsNullable(String left, String right) { + return java.util.Objects.equals(left, right); + } + + private record ResolvedPayload(byte[] binaryContent, String textContent, String mediaType) { + } +} diff --git a/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java 
index 644bd5e..60183ff 100644 --- a/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java +++ b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java @@ -2,10 +2,14 @@ package at.procon.dip.ingestion.spi; import at.procon.dip.domain.access.DocumentAccessContext; import at.procon.dip.domain.document.SourceType; +import java.time.OffsetDateTime; import java.util.Map; /** * Describes a source object that should be ingested into the canonical document model. + * <p> + * Phase 4 extends the descriptor with optional inline payload fields so adapters can support + * both file-backed imports and direct upload/API ingestion without introducing separate DTO trees. */ public record SourceDescriptor( DocumentAccessContext accessContext, @@ -14,6 +18,17 @@ public record SourceDescriptor( String sourceUri, String fileName, String mediaType, + byte[] binaryContent, + String textContent, + OffsetDateTime receivedAt, Map<String, String> attributes ) { + + public boolean hasInlineBinaryContent() { + return binaryContent != null && binaryContent.length > 0; + } + + public boolean hasInlineTextContent() { + return textContent != null && !textContent.isBlank(); + } } diff --git a/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java b/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java new file mode 100644 index 0000000..272b114 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java @@ -0,0 +1,84 @@ +package at.procon.dip.ingestion.util; + +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import java.nio.charset.StandardCharsets; +import java.util.Locale; +import java.util.Map; +import org.springframework.util.StringUtils; + +/** + * Shared utility methods for Phase 4 generic ingestion. 
+ */ +public final class DocumentImportSupport { + + private DocumentImportSupport() { + } + + public static String extensionOf(String fileName) { + if (!StringUtils.hasText(fileName) || !fileName.contains(".")) { + return ""; + } + return fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase(Locale.ROOT); + } + + public static boolean isLikelyTextMime(String mediaType) { + if (!StringUtils.hasText(mediaType)) { + return false; + } + String normalized = normalizeMediaType(mediaType); + return normalized.startsWith("text/") + || normalized.contains("json") + || normalized.contains("xml") + || normalized.contains("javascript") + || normalized.equals("application/xhtml+xml"); + } + + public static String normalizeMediaType(String mediaType) { + if (!StringUtils.hasText(mediaType)) { + return null; + } + int idx = mediaType.indexOf(';'); + String result = idx >= 0 ? mediaType.substring(0, idx) : mediaType; + return result.trim().toLowerCase(Locale.ROOT); + } + + public static DocumentFamily familyFor(DocumentType documentType) { + return switch (documentType) { + case TED_NOTICE -> DocumentFamily.PROCUREMENT; + case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL; + case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN -> + DocumentFamily.GENERIC; + }; + } + + public static String safeUtf8(byte[] bytes) { + if (bytes == null) { + return null; + } + return new String(bytes, StandardCharsets.UTF_8); + } + + public static String firstNonBlank(Map<String, String> values, String... 
keys) { + for (String key : keys) { + if (values == null) { + continue; + } + String value = values.get(key); + if (StringUtils.hasText(value)) { + return value; + } + } + return null; + } + + public static String ellipsize(String text, int maxLength) { + if (text == null || text.length() <= maxLength) { + return text; + } + if (maxLength <= 3) { + return text.substring(0, maxLength); + } + return text.substring(0, maxLength - 3) + "..."; + } +} diff --git a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java new file mode 100644 index 0000000..6d0e2d7 --- /dev/null +++ b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java @@ -0,0 +1,86 @@ +package at.procon.dip.normalization.impl; + +import at.procon.dip.domain.document.ContentRole; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.extraction.spi.ExtractedStructuredPayload; +import at.procon.dip.ingestion.util.DocumentImportSupport; +import at.procon.dip.normalization.spi.RepresentationBuildRequest; +import at.procon.dip.normalization.spi.TextRepresentationBuilder; +import at.procon.dip.normalization.spi.TextRepresentationDraft; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +@Component +public class DefaultGenericTextRepresentationBuilder implements TextRepresentationBuilder { + + @Override + public boolean supports(DocumentType documentType) { + return documentType != DocumentType.TED_NOTICE; + } + + @Override + public List<TextRepresentationDraft> build(RepresentationBuildRequest request) { + String normalizedText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT); + String 
htmlCleanText = request.extractionResult().derivedTextByRole().get(ContentRole.HTML_CLEAN); + String baseText = StringUtils.hasText(normalizedText) ? normalizedText : htmlCleanText; + if (!StringUtils.hasText(baseText)) { + return List.of(); + } + + String title = findStructuredTitle(request).orElse(request.sourceDescriptor().fileName()); + String summary = DocumentImportSupport.ellipsize(baseText.replace('\n', ' ').trim(), 1200); + String semantic = buildSemanticText(title, summary, request.detectionResult().documentType()); + + List<TextRepresentationDraft> drafts = new ArrayList<>(); + drafts.add(new TextRepresentationDraft( + RepresentationType.FULLTEXT, + request.detectionResult().languageCode(), + baseText, + false, + null + )); + drafts.add(new TextRepresentationDraft( + RepresentationType.SEMANTIC_TEXT, + request.detectionResult().languageCode(), + semantic, + true, + null + )); + if (StringUtils.hasText(title)) { + drafts.add(new TextRepresentationDraft( + RepresentationType.TITLE_ABSTRACT, + request.detectionResult().languageCode(), + title + "\n\n" + summary, + false, + null + )); + } + return drafts; + } + + private Optional<String> findStructuredTitle(RepresentationBuildRequest request) { + return request.extractionResult().structuredPayloads().stream() + .map(ExtractedStructuredPayload::attributes) + .filter(Objects::nonNull) + .map(attributes -> attributes.get("title")) + .filter(String.class::isInstance) + .map(String.class::cast) + .filter(StringUtils::hasText) + .findFirst(); + } + + private String buildSemanticText(String title, String summary, DocumentType documentType) { + StringBuilder sb = new StringBuilder(); + sb.append("Document type: ").append(documentType.name()).append('\n'); + if (StringUtils.hasText(title)) { + sb.append("Title: ").append(title.trim()).append("\n\n"); + } + sb.append(summary); + return sb.toString().trim(); + } +} diff --git a/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java 
b/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java new file mode 100644 index 0000000..38f2d8a --- /dev/null +++ b/src/main/java/at/procon/dip/normalization/service/TextRepresentationBuildService.java @@ -0,0 +1,24 @@ +package at.procon.dip.normalization.service; + +import at.procon.dip.normalization.spi.RepresentationBuildRequest; +import at.procon.dip.normalization.spi.TextRepresentationBuilder; +import at.procon.dip.normalization.spi.TextRepresentationDraft; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +public class TextRepresentationBuildService { + + private final List<TextRepresentationBuilder> builders; + + public List<TextRepresentationDraft> build(RepresentationBuildRequest request) { + return builders.stream() + .filter(builder -> builder.supports(request.detectionResult().documentType())) + .findFirst() + .orElseThrow(() -> new IllegalStateException( + "No text representation builder registered for " + request.detectionResult().documentType())) + .build(request); + } +} diff --git a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java index e804f5e..8a46df5 100644 --- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java +++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java @@ -28,6 +28,7 @@ public class TedProcessorProperties { private MailProperties mail = new MailProperties(); private SolutionBriefProperties solutionBrief = new SolutionBriefProperties(); private ProjectionProperties projection = new ProjectionProperties(); + private GenericIngestionProperties genericIngestion = new GenericIngestionProperties(); /** * Input directory configuration for Apache Camel file consumer. 
    /**
     * Phase 4 configuration for the generic (non-TED) ingestion pipeline.
     *
     * <p>Bound from the {@code ted.generic-ingestion.*} keys in application.yml;
     * field names must stay stable because Spring relaxed binding maps the
     * kebab-case keys onto them.
     */
    @Data
    public static class GenericIngestionProperties {

        /**
         * Master switch for the generic ingestion pipeline; off by default so the
         * existing TED runtime is unaffected.
         */
        private boolean enabled = false;

        /**
         * Enables the filesystem (Camel) import route for arbitrary documents.
         */
        private boolean fileSystemEnabled = false;

        /**
         * Enables the REST/API upload endpoints for arbitrary documents.
         */
        private boolean restUploadEnabled = true;

        /**
         * Directory polled by the generic filesystem importer.
         */
        private String inputDirectory = "D:/ted.europe/generic-input";

        /**
         * Regular expression matched against file names by the Camel file route.
         */
        private String filePattern = ".*\\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$";

        /**
         * Directory where successfully imported files are moved.
         */
        private String processedDirectory = ".dip-processed";

        /**
         * Directory where failed files are moved.
         */
        private String errorDirectory = ".dip-error";

        /**
         * Polling interval of the file route, in milliseconds.
         */
        @Positive
        private long pollInterval = 15000;

        /**
         * Upper bound on the number of files consumed per poll.
         */
        @Positive
        private int maxMessagesPerPoll = 10;

        /**
         * Default owner tenant for files imported through the generic route;
         * leave unset for public (tenant-less) documents.
         */
        private String defaultOwnerTenantKey;

        /**
         * Visibility applied when an import supplies no explicit access context.
         */
        private at.procon.dip.domain.access.DocumentVisibility defaultVisibility =
                at.procon.dip.domain.access.DocumentVisibility.PUBLIC;

        /**
         * Optional default language code applied to filesystem imports.
         */
        private String defaultLanguageCode;

        /**
         * Whether small original binary payloads are persisted in the database.
         */
        private boolean storeOriginalBinaryInDb = true;

        /**
         * Largest binary payload (bytes) stored inline in
         * DOC.doc_content.binary_content; defaults to 5 MiB.
         */
        @Positive
        private int maxBinaryBytesInDb = 5242880;

        /**
         * Whether an already-imported content hash resolves to the existing
         * canonical document instead of creating a duplicate.
         */
        private boolean deduplicateByContentHash = true;

        /**
         * When true, only the primary text representation is queued for embedding.
         */
        private boolean vectorizePrimaryRepresentationOnly = true;

        /**
         * Import batch identifier written to DOC.doc_source.import_batch_id.
         */
        @NotBlank
        private String importBatchId = "phase4-generic";
    }
-- Phase 4: Generic ingestion support for arbitrary document types.
-- Add binary payload storage to the generic DOC content table so non-text originals
-- such as PDFs can be stored directly in the database when configured.

-- Nullable by design: text-only content rows simply leave binary_content unset.
-- IF NOT EXISTS keeps the migration re-runnable against partially migrated schemas.
ALTER TABLE DOC.doc_content
    ADD COLUMN IF NOT EXISTS binary_content BYTEA;

-- Supports the ingestion pipeline's lookup of a document's content rows filtered
-- by role, newest first (hence created_at DESC as the trailing key).
CREATE INDEX IF NOT EXISTS idx_doc_content_document_role
    ON DOC.doc_content(document_id, content_role, created_at DESC);