Refactor phase 4

master
trifonovt 1 month ago
parent adc4f2da43
commit ac59730f3e

@ -0,0 +1,58 @@
# Phase 4 - Generic Ingestion Pipeline
## Goal
Add the first generic ingestion path so arbitrary documents can be imported into the canonical DOC model,
normalized into text representations, and queued for vectorization without depending on the TED-specific model.
## Scope implemented
### Input channels
- file-system polling route for arbitrary documents
- REST/API upload endpoints
### Detection
- file extension + media type based classification
### Extraction
- PDF -> text via PDFBox
- HTML -> cleaned text via JSoup
- text / markdown / generic XML -> normalized UTF-8 text
- unsupported binary types -> fallback warning only
### Representation building
- default generic builder creates:
- FULLTEXT
- SEMANTIC_TEXT
- TITLE_ABSTRACT
### Persistence
- original content stored in DOC.doc_content
- binary originals can now be stored inline in `binary_content`
- derived text variants persisted as additional DOC.doc_content rows
- text representations persisted in DOC.doc_text_representation
- pending embeddings created in DOC.doc_embedding when enabled
## Access model
The generic pipeline uses the Phase 0/1 access model:
- optional owner tenant
- mandatory visibility
This supports both:
- public documents (`owner_tenant_id = null`, `visibility = PUBLIC`)
- tenant-owned documents (`owner_tenant_id != null`, `visibility = TENANT/SHARED/...`)
## Deliberately deferred
- DOCX extraction
- ZIP recursive child import in the generic pipeline
- MIME/EML structured parsing
- generic structured projections beyond TED
- chunked long-document representations

@ -0,0 +1,40 @@
# Phase 4 - Generic Ingestion Pipeline
Phase 4 introduces the first generalized ingestion flow on top of the DOC backbone.
## What is included
- generic ingestion gateway with adapter selection
- file-system ingestion adapter and Camel route
- REST/API upload controller for arbitrary documents
- document type detection by media type / extension
- first extractors for:
- plain text / markdown / generic XML
- HTML
- PDF
- binary fallback
- default representation builder for non-TED documents
- binary payload support in `DOC.doc_content.binary_content`
- automatic creation of pending generic embeddings for imported representations
## Important behavior
- current TED runtime remains intact
- generic ingestion is disabled by default and must be enabled with:
- `ted.generic-ingestion.enabled=true`
- file-system polling is separately controlled with:
- `ted.generic-ingestion.file-system-enabled=true`
- REST/API upload endpoints are under:
- `/api/v1/dip/import/upload`
- `/api/v1/dip/import/text`
## Current supported generic document types
- PDF
- HTML
- TEXT
- MARKDOWN
- XML_GENERIC
- UNKNOWN text-like files
DOCX, ZIP child extraction, and MIME body parsing are intentionally left for later phases.

@ -0,0 +1,82 @@
package at.procon.dip.classification.detector;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.classification.spi.DocumentTypeDetector;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Basic Phase 4 detector that classifies a source by normalized media type
 * first, then by file extension, with a plain-text fallback for descriptors
 * that carry inline textual content.
 */
@Component
public class BasicMimeAndExtensionDocumentTypeDetector implements DocumentTypeDetector {

    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        // Catch-all detector: applicable to every source descriptor.
        return true;
    }

    @Override
    public DetectionResult detect(SourceDescriptor sourceDescriptor) {
        String mediaType = DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType());
        String fileExtension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName());

        DocumentType resolvedType = detectByMediaType(mediaType);
        if (resolvedType == DocumentType.UNKNOWN) {
            resolvedType = detectByExtension(fileExtension);
        }
        if (resolvedType == DocumentType.UNKNOWN && sourceDescriptor.hasInlineTextContent()) {
            // Last resort: inline text content with no other signal is treated as TEXT.
            resolvedType = DocumentType.TEXT;
        }

        DocumentFamily documentFamily = DocumentImportSupport.familyFor(resolvedType);
        Map<String, String> descriptorAttributes = sourceDescriptor.attributes();
        String languageCode = descriptorAttributes == null ? null : descriptorAttributes.get("languageCode");

        Map<String, String> detectionAttributes = new HashMap<>();
        detectionAttributes.put("detectedExtension", fileExtension);
        if (mediaType != null) {
            detectionAttributes.put("normalizedMediaType", mediaType);
        }
        if (StringUtils.hasText(sourceDescriptor.fileName())) {
            detectionAttributes.put("fileName", sourceDescriptor.fileName());
        }
        return new DetectionResult(resolvedType, documentFamily, mediaType, languageCode, detectionAttributes);
    }

    /** Maps a media type to a document type; unrecognized textual types fall back to TEXT. */
    private DocumentType detectByMediaType(String mediaType) {
        if (!StringUtils.hasText(mediaType)) {
            return DocumentType.UNKNOWN;
        }
        String lowerCased = mediaType.toLowerCase(Locale.ROOT);
        return switch (lowerCased) {
            case "application/pdf", "application/x-pdf" -> DocumentType.PDF;
            case "text/html", "application/xhtml+xml" -> DocumentType.HTML;
            case "text/plain" -> DocumentType.TEXT;
            case "text/markdown", "text/x-markdown" -> DocumentType.MARKDOWN;
            case "application/xml", "text/xml" -> DocumentType.XML_GENERIC;
            case "message/rfc822" -> DocumentType.MIME_MESSAGE;
            case "application/zip", "application/x-zip-compressed" -> DocumentType.ZIP_ARCHIVE;
            // Note: the fallback deliberately checks the original (non-lowercased) value,
            // matching the pre-existing behavior.
            default -> mediaType.startsWith("text/") ? DocumentType.TEXT : DocumentType.UNKNOWN;
        };
    }

    /** Maps a file extension to a document type; unknown extensions yield UNKNOWN. */
    private DocumentType detectByExtension(String extension) {
        return switch (extension) {
            case "pdf" -> DocumentType.PDF;
            case "html", "htm" -> DocumentType.HTML;
            case "txt", "log", "csv", "json", "yaml", "yml" -> DocumentType.TEXT;
            case "md", "markdown" -> DocumentType.MARKDOWN;
            case "xml", "xsd", "xslt" -> DocumentType.XML_GENERIC;
            case "eml", "msg" -> DocumentType.MIME_MESSAGE;
            case "zip" -> DocumentType.ZIP_ARCHIVE;
            case "docx" -> DocumentType.DOCX;
            default -> DocumentType.UNKNOWN;
        };
    }
}

@ -0,0 +1,26 @@
package at.procon.dip.classification.service;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.classification.spi.DocumentTypeDetector;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Resolves the first detector (in bean registration order) whose
 * {@code supports} check passes and delegates detection to it.
 */
@Service
@RequiredArgsConstructor
public class DocumentClassificationService {

    private final List<DocumentTypeDetector> detectors;

    /**
     * @throws IllegalStateException when no registered detector supports the descriptor
     */
    public DetectionResult detect(SourceDescriptor sourceDescriptor) {
        for (DocumentTypeDetector candidate : detectors) {
            if (candidate.supports(sourceDescriptor)) {
                return candidate.detect(sourceDescriptor);
            }
        }
        throw new IllegalStateException("No document type detector available");
    }
}

@ -13,6 +13,7 @@ import jakarta.persistence.GenerationType;
import jakarta.persistence.Id;
import jakarta.persistence.Index;
import jakarta.persistence.JoinColumn;
import jakarta.persistence.Lob;
import jakarta.persistence.ManyToOne;
import jakarta.persistence.PrePersist;
import jakarta.persistence.Table;
@ -66,6 +67,10 @@ public class DocumentContent {
@Column(name = "text_content", columnDefinition = "TEXT")
private String textContent;
@Lob
@Column(name = "binary_content")
private byte[] binaryContent;
@Column(name = "binary_ref", columnDefinition = "TEXT")
private String binaryRef;

@ -25,6 +25,7 @@ public class DocumentContentService {
.mimeType(command.mimeType())
.charsetName(command.charsetName())
.textContent(command.textContent())
.binaryContent(command.binaryContent())
.binaryRef(command.binaryRef())
.contentHash(command.contentHash())
.sizeBytes(command.sizeBytes())

@ -11,6 +11,7 @@ public record AddDocumentContentCommand(
String mimeType,
String charsetName,
String textContent,
byte[] binaryContent,
String binaryRef,
String contentHash,
Long sizeBytes

@ -0,0 +1,30 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Component;
/**
 * Fallback extractor for binary formats (DOCX, ZIP, generic binary, MIME
 * messages) that have no specialized extractor yet; it derives no text and
 * only reports a warning.
 */
@Component
public class BinaryPassThroughDocumentExtractor implements DocumentExtractor {

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        // Selection is purely type-based; the mime type is intentionally ignored.
        if (documentType == DocumentType.DOCX || documentType == DocumentType.ZIP_ARCHIVE) {
            return true;
        }
        return documentType == DocumentType.GENERIC_BINARY || documentType == DocumentType.MIME_MESSAGE;
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        String warning = "No specialized extractor available yet for "
                + extractionRequest.detectionResult().documentType();
        return new ExtractionResult(Map.of(), List.of(), List.of(warning));
    }
}

@ -0,0 +1,60 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Extracts cleaned text and lightweight metadata (title, body length) from
 * HTML sources using JSoup.
 */
@Component
public class HtmlDocumentExtractor implements DocumentExtractor {

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        if (documentType == DocumentType.HTML) {
            return true;
        }
        return "text/html".equalsIgnoreCase(mimeType) || "application/xhtml+xml".equalsIgnoreCase(mimeType);
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        String rawHtml = extractionRequest.textContent();
        if (!StringUtils.hasText(rawHtml) && extractionRequest.binaryContent() != null) {
            // Fall back to decoding the binary payload; assumes UTF-8 — TODO confirm
            // the upstream charset handling.
            rawHtml = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
        }
        if (!StringUtils.hasText(rawHtml)) {
            return new ExtractionResult(Map.of(), List.of(), List.of("No HTML content available"));
        }

        Document htmlDocument = Jsoup.parse(rawHtml);
        htmlDocument.outputSettings().prettyPrint(false);

        String fullText = htmlDocument.text();
        String bodyText = htmlDocument.body() != null ? htmlDocument.body().text() : fullText;

        Map<ContentRole, String> derivedText = new LinkedHashMap<>();
        derivedText.put(ContentRole.HTML_CLEAN, bodyText);
        derivedText.put(ContentRole.NORMALIZED_TEXT, fullText);

        String title = htmlDocument.title();
        Map<String, Object> payload = new LinkedHashMap<>();
        if (StringUtils.hasText(title)) {
            payload.put("title", title.trim());
        }
        if (htmlDocument.body() != null) {
            payload.put("bodyTextLength", htmlDocument.body().text().length());
        }

        List<String> warnings = new ArrayList<>();
        if (!StringUtils.hasText(title)) {
            warnings.add("HTML document has no <title> element");
        }
        return new ExtractionResult(derivedText, List.of(new ExtractedStructuredPayload("html-document", payload)), warnings);
    }
}

@ -0,0 +1,69 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.ted.service.attachment.PdfExtractionService;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Extracts normalized text and a derived title from PDF payloads by delegating
 * to the shared PDFBox-based {@link PdfExtractionService}.
 */
@Component
@RequiredArgsConstructor
public class PdfDocumentExtractor implements DocumentExtractor {

    private final PdfExtractionService pdfExtractionService;

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        // NOTE(review): the "dummy.pdf" probe makes canHandle decide by mime type only;
        // confirm PdfExtractionService.canHandle treats the file name as a secondary hint.
        return documentType == DocumentType.PDF || pdfExtractionService.canHandle("dummy.pdf", mimeType);
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        byte[] binary = extractionRequest.binaryContent();
        if (binary == null || binary.length == 0) {
            return new ExtractionResult(Map.of(), List.of(), List.of("No PDF binary payload available"));
        }
        PdfExtractionService.ExtractionResult extraction = pdfExtractionService.extract(
                binary,
                extractionRequest.sourceDescriptor().fileName(),
                extractionRequest.detectionResult().mimeType()
        );
        if (!extraction.success()) {
            // Guard against a null error message: List.of(null) would throw an NPE
            // and mask the actual extraction failure.
            String errorMessage = extraction.errorMessage() != null
                    ? extraction.errorMessage()
                    : "PDF text extraction failed without an error message";
            return new ExtractionResult(Map.of(), List.of(), List.of(errorMessage));
        }
        String text = extraction.extractedText();
        if (!StringUtils.hasText(text)) {
            return new ExtractionResult(Map.of(), List.of(), List.of("PDF text extraction returned no text"));
        }
        Map<String, Object> payload = new LinkedHashMap<>();
        payload.put("title", deriveTitle(text, extractionRequest.sourceDescriptor().fileName()));
        payload.put("extractor", "pdfbox");
        return new ExtractionResult(
                Map.of(ContentRole.NORMALIZED_TEXT, text),
                List.of(new ExtractedStructuredPayload("pdf-document", payload)),
                List.of()
        );
    }

    /**
     * Uses the first non-blank line of the extracted text as the title,
     * truncated to 240 characters; falls back to the source file name.
     */
    private String deriveTitle(String text, String fallback) {
        // \R matches any line break (\n, \r\n, \r, ...), so titles taken from PDFs
        // with Windows line endings no longer carry a stray '\r'.
        for (String line : text.split("\\R")) {
            if (StringUtils.hasText(line)) {
                String trimmed = line.trim();
                return trimmed.length() > 240 ? trimmed.substring(0, 240) : trimmed;
            }
        }
        return fallback;
    }
}

@ -0,0 +1,69 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Extracts normalized plain text from text-like sources: plain text, markdown,
 * generic XML, or unknown types whose mime type looks textual.
 */
@Component
public class PlainTextDocumentExtractor implements DocumentExtractor {

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        boolean textLikeType = documentType == DocumentType.TEXT
                || documentType == DocumentType.MARKDOWN
                || documentType == DocumentType.XML_GENERIC
                || documentType == DocumentType.UNKNOWN;
        return textLikeType || DocumentImportSupport.isLikelyTextMime(mimeType);
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        String rawText = extractionRequest.textContent();
        if (!StringUtils.hasText(rawText) && extractionRequest.binaryContent() != null) {
            // Assumes UTF-8 for binary payloads — TODO confirm the charset is tracked upstream.
            rawText = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
        }
        String normalized = normalizeText(rawText);
        if (!StringUtils.hasText(normalized)) {
            return new ExtractionResult(Map.of(), List.of(), List.of("No text content extracted"));
        }
        String title = deriveTitle(normalized, extractionRequest.sourceDescriptor().fileName());
        return new ExtractionResult(
                Map.of(ContentRole.NORMALIZED_TEXT, normalized),
                List.of(new ExtractedStructuredPayload("generic-document", Map.of("title", title))),
                List.of()
        );
    }

    /**
     * Canonicalizes line endings to '\n', collapses runs of three or more
     * newlines to a single blank line, strips trailing spaces/tabs before
     * line breaks, and trims the result.
     */
    private String normalizeText(String text) {
        if (text == null) {
            return null;
        }
        return text.replace("\r\n", "\n")
                .replace('\r', '\n')
                .replaceAll("\\n{3,}", "\n\n")
                .replaceAll("[ \t]+\n", "\n")
                .trim();
    }

    /** First non-blank line, ellipsized to 240 characters; file name as fallback. */
    private String deriveTitle(String text, String fallback) {
        if (!StringUtils.hasText(text)) {
            return fallback;
        }
        for (String line : text.split("\\n")) {
            if (StringUtils.hasText(line)) {
                return DocumentImportSupport.ellipsize(line.trim(), 240);
            }
        }
        return fallback;
    }
}

@ -0,0 +1,26 @@
package at.procon.dip.extraction.service;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Dispatches an extraction request to the first registered extractor that
 * supports the detected document type / mime type combination.
 */
@Service
@RequiredArgsConstructor
public class DocumentExtractionService {

    private final List<DocumentExtractor> extractors;

    /**
     * @throws IllegalStateException when no extractor supports the detected type
     */
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        var detection = extractionRequest.detectionResult();
        for (DocumentExtractor extractor : extractors) {
            if (extractor.supports(detection.documentType(), detection.mimeType())) {
                return extractor.extract(extractionRequest);
            }
        }
        throw new IllegalStateException(
                "No extractor registered for type " + detection.documentType());
    }
}

@ -0,0 +1,28 @@
package at.procon.dip.ingestion.adapter;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
/**
 * Ingestion adapter for documents discovered on the file system; delegates the
 * actual import to {@link GenericDocumentImportService}.
 */
@Component
@RequiredArgsConstructor
public class FileSystemDocumentIngestionAdapter implements DocumentIngestionAdapter {

    private final GenericDocumentImportService importService;

    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        return at.procon.dip.domain.document.SourceType.FILE_SYSTEM == sourceDescriptor.sourceType();
    }

    @Override
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        ImportedDocumentResult importResult = importService.importDocument(sourceDescriptor);
        var canonicalMetadata = importResult.document().toCanonicalMetadata();
        return new IngestionResult(List.of(canonicalMetadata), importResult.warnings());
    }
}

@ -0,0 +1,31 @@
package at.procon.dip.ingestion.adapter;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
/**
 * Ingestion adapter for documents whose payload is supplied inline via a REST
 * upload, a manual upload, or the API; delegates the actual import to
 * {@link GenericDocumentImportService}.
 */
@Component
@RequiredArgsConstructor
public class InlineContentDocumentIngestionAdapter implements DocumentIngestionAdapter {

    private final GenericDocumentImportService importService;

    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        SourceType sourceType = sourceDescriptor.sourceType();
        if (sourceType == SourceType.REST_UPLOAD || sourceType == SourceType.MANUAL_UPLOAD) {
            return true;
        }
        return sourceType == SourceType.API;
    }

    @Override
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        ImportedDocumentResult importResult = importService.importDocument(sourceDescriptor);
        var canonicalMetadata = importResult.document().toCanonicalMetadata();
        return new IngestionResult(List.of(canonicalMetadata), importResult.warnings());
    }
}

@ -0,0 +1,88 @@
package at.procon.dip.ingestion.camel;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.tenant.TenantRef;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.util.LinkedHashMap;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.camel.Exchange;
import org.apache.camel.builder.RouteBuilder;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Camel route that polls a configured directory for arbitrary documents and
 * feeds each file into the Phase 4 generic ingestion gateway.
 *
 * <p>The route is only wired up when both {@code ted.generic-ingestion.enabled}
 * and {@code ted.generic-ingestion.file-system-enabled} are set; otherwise
 * {@link #configure()} logs and returns without registering anything.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class GenericFileSystemIngestionRoute extends RouteBuilder {

    private final TedProcessorProperties properties;
    private final DocumentIngestionGateway ingestionGateway;

    @Override
    public void configure() {
        if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isFileSystemEnabled()) {
            log.info("Phase 4 generic filesystem ingestion route disabled");
            return;
        }
        var config = properties.getGenericIngestion();
        log.info("Configuring Phase 4 generic filesystem ingestion from {}", config.getInputDirectory());
        // Processed files are moved away; failed ones land in the error directory.
        fromF("file:%s?recursive=true&include=%s&delay=%d&maxMessagesPerPoll=%d&move=%s&moveFailed=%s",
                config.getInputDirectory(),
                config.getFilePattern(),
                config.getPollInterval(),
                config.getMaxMessagesPerPoll(),
                config.getProcessedDirectory(),
                config.getErrorDirectory())
                .routeId("dip-generic-filesystem-ingestion")
                .process(this::processFile)
                .log("Imported generic document from ${header.CamelFileAbsolutePath}");
    }

    /**
     * Reads the polled file and hands it to the ingestion gateway as a
     * FILE_SYSTEM source descriptor.
     *
     * @throws Exception if the file cannot be resolved/read or ingestion fails
     */
    private void processFile(Exchange exchange) throws Exception {
        Path path = resolvePath(exchange);
        byte[] payload = Files.readAllBytes(path);
        Map<String, String> attributes = new LinkedHashMap<>();
        String languageCode = properties.getGenericIngestion().getDefaultLanguageCode();
        if (StringUtils.hasText(languageCode)) {
            attributes.put("languageCode", languageCode);
        }
        SourceDescriptor descriptor = new SourceDescriptor(
                buildDefaultAccessContext(),
                SourceType.FILE_SYSTEM,
                path.getFileName().toString(),
                path.toAbsolutePath().toString(),
                path.getFileName().toString(),
                // probeContentType may return null; detection then falls back to the extension.
                Files.probeContentType(path),
                payload,
                null,
                OffsetDateTime.now(),
                attributes
        );
        ingestionGateway.ingest(descriptor);
    }

    /** Resolves the file path from the exchange body, falling back to the FILE_PATH header. */
    private Path resolvePath(Exchange exchange) {
        Path path = exchange.getIn().getBody(Path.class);
        if (path != null) {
            return path;
        }
        String absolutePath = exchange.getIn().getHeader(Exchange.FILE_PATH, String.class);
        if (!StringUtils.hasText(absolutePath)) {
            // Previously this was a bare NPE from Path.of(null); fail with context instead.
            throw new IllegalStateException(
                    "Exchange carries neither a Path body nor a " + Exchange.FILE_PATH + " header");
        }
        return Path.of(absolutePath);
    }

    /**
     * Builds the access context applied to filesystem imports: tenant-less when
     * no default owner tenant key is configured, otherwise owned by that tenant.
     */
    private DocumentAccessContext buildDefaultAccessContext() {
        String ownerTenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
        DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility();
        if (!StringUtils.hasText(ownerTenantKey)) {
            return new DocumentAccessContext(null, visibility);
        }
        return new DocumentAccessContext(new TenantRef(null, ownerTenantKey, ownerTenantKey), visibility);
    }
}

@ -0,0 +1,128 @@
package at.procon.dip.ingestion.controller;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.tenant.TenantRef;
import at.procon.dip.ingestion.dto.GenericImportResponse;
import at.procon.dip.ingestion.dto.GenericTextImportRequest;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.LinkedHashMap;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.util.StringUtils;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
/**
 * REST entry points for the Phase 4 generic import pipeline: multipart file
 * upload and inline JSON text import. Both endpoints are gated behind the
 * generic-ingestion feature flags.
 *
 * <p>NOTE(review): the phase docs list these endpoints under
 * {@code /api/v1/dip/import/...} while this mapping is {@code /v1/dip/import};
 * presumably the {@code /api} prefix comes from the servlet context path — confirm.
 */
@RestController
@RequestMapping("/v1/dip/import")
@RequiredArgsConstructor
public class GenericDocumentImportController {

    private final TedProcessorProperties properties;
    private final DocumentIngestionGateway ingestionGateway;

    /**
     * Imports an uploaded file as a generic document.
     *
     * @throws IOException           if the multipart payload cannot be read
     * @throws IllegalStateException if generic REST import is disabled
     */
    @PostMapping(path = "/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
    public ResponseEntity<GenericImportResponse> upload(
            @RequestParam("file") MultipartFile file,
            @RequestParam(value = "ownerTenantKey", required = false) String ownerTenantKey,
            @RequestParam(value = "visibility", required = false) DocumentVisibility visibility,
            @RequestParam(value = "languageCode", required = false) String languageCode,
            @RequestParam(value = "title", required = false) String title,
            @RequestParam(value = "sourceIdentifier", required = false) String sourceIdentifier
    ) throws IOException {
        ensureRestUploadEnabled();
        SourceDescriptor descriptor = new SourceDescriptor(
                buildAccessContext(ownerTenantKey, visibility),
                SourceType.REST_UPLOAD,
                StringUtils.hasText(sourceIdentifier) ? sourceIdentifier : file.getOriginalFilename(),
                null,
                file.getOriginalFilename(),
                file.getContentType(),
                file.getBytes(),
                null,
                OffsetDateTime.now(),
                collectAttributes(languageCode, title)
        );
        IngestionResult result = ingestionGateway.ingest(descriptor);
        return ResponseEntity.ok(toResponse(result));
    }

    /**
     * Imports inline text supplied as a JSON request body.
     *
     * @throws IllegalStateException if generic REST import is disabled
     */
    @PostMapping(path = "/text", consumes = MediaType.APPLICATION_JSON_VALUE)
    public ResponseEntity<GenericImportResponse> importText(@RequestBody GenericTextImportRequest request) {
        ensureRestUploadEnabled();
        SourceDescriptor descriptor = new SourceDescriptor(
                buildAccessContext(request.ownerTenantKey(), request.visibility()),
                SourceType.API,
                StringUtils.hasText(request.sourceIdentifier()) ? request.sourceIdentifier() : request.fileName(),
                null,
                request.fileName(),
                request.mediaType(),
                request.text() == null ? null : request.text().getBytes(StandardCharsets.UTF_8),
                request.text(),
                OffsetDateTime.now(),
                collectAttributes(request.languageCode(), request.title())
        );
        IngestionResult result = ingestionGateway.ingest(descriptor);
        return ResponseEntity.ok(toResponse(result));
    }

    /** Collects the optional request metadata into the descriptor attribute map. */
    private Map<String, String> collectAttributes(String languageCode, String title) {
        Map<String, String> attributes = new LinkedHashMap<>();
        if (StringUtils.hasText(languageCode)) {
            attributes.put("languageCode", languageCode);
        }
        if (StringUtils.hasText(title)) {
            attributes.put("title", title);
        }
        return attributes;
    }

    /** Rejects requests when the generic-ingestion or REST-upload flags are off. */
    private void ensureRestUploadEnabled() {
        if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isRestUploadEnabled()) {
            throw new IllegalStateException("Generic REST import is disabled");
        }
    }

    /**
     * Resolves the effective access context: a blank owner tenant key yields an
     * unowned document; a missing visibility falls back to the configured default.
     */
    private DocumentAccessContext buildAccessContext(String ownerTenantKey, DocumentVisibility visibility) {
        DocumentVisibility effectiveVisibility = visibility != null
                ? visibility
                : properties.getGenericIngestion().getDefaultVisibility();
        if (!StringUtils.hasText(ownerTenantKey)) {
            return new DocumentAccessContext(null, effectiveVisibility);
        }
        return new DocumentAccessContext(new TenantRef(null, ownerTenantKey, ownerTenantKey), effectiveVisibility);
    }

    private GenericImportResponse toResponse(IngestionResult result) {
        CanonicalDocumentMetadata metadata = result.documents().stream()
                .findFirst()
                .orElseThrow(() -> new IllegalStateException("No imported document metadata returned"));
        // NOTE(review): detecting deduplication by substring-matching a warning text
        // is fragile; consider surfacing a structured flag on IngestionResult instead.
        boolean deduplicated = result.warnings().stream()
                .anyMatch(w -> w != null && w.contains("already imported"));
        return new GenericImportResponse(
                metadata.documentId(),
                metadata.title(),
                metadata.documentType(),
                metadata.documentFamily(),
                metadata.status(),
                deduplicated,
                result.warnings()
        );
    }
}

@ -0,0 +1,18 @@
package at.procon.dip.ingestion.dto;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.DocumentStatus;
import java.util.List;
import java.util.UUID;
/**
 * Response payload returned by the generic import REST endpoints.
 *
 * @param documentId     id of the imported (or pre-existing, when deduplicated) document
 * @param title          resolved document title
 * @param documentType   detected document type
 * @param documentFamily document family derived from the detected type
 * @param status         document status after the import ran
 * @param deduplicated   true when the import reported the content as already imported
 * @param warnings       non-fatal warnings collected during the import
 */
public record GenericImportResponse(
UUID documentId,
String title,
DocumentType documentType,
DocumentFamily documentFamily,
DocumentStatus status,
boolean deduplicated,
List<String> warnings
) {
}

@ -0,0 +1,15 @@
package at.procon.dip.ingestion.dto;
import at.procon.dip.domain.access.DocumentVisibility;
/**
 * Request body for the generic inline-text import endpoint.
 *
 * @param text             raw text content to import
 * @param fileName         logical file name, used for type detection and as identifier fallback
 * @param mediaType        optional media type hint for classification
 * @param ownerTenantKey   optional owning tenant key; blank or null yields an unowned document
 * @param visibility       optional visibility; null falls back to the configured default
 * @param languageCode     optional language code hint — presumably an ISO code; confirm with callers
 * @param title            optional explicit document title
 * @param sourceIdentifier optional external source identifier; falls back to the file name
 */
public record GenericTextImportRequest(
String text,
String fileName,
String mediaType,
String ownerTenantKey,
DocumentVisibility visibility,
String languageCode,
String title,
String sourceIdentifier
) {
}

@ -0,0 +1,16 @@
package at.procon.dip.ingestion.dto;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.document.entity.Document;
import java.util.List;
/**
 * Internal result of the generic Phase 4 import pipeline.
 *
 * @param document        the persisted (or pre-existing, when deduplicated) document entity
 * @param detectionResult outcome of the document type/family detection step
 * @param warnings        non-fatal issues collected during the import
 * @param deduplicated    true when the content matched an already imported document
 */
public record ImportedDocumentResult(
Document document,
DetectionResult detectionResult,
List<String> warnings,
boolean deduplicated
) {
}

@ -0,0 +1,24 @@
package at.procon.dip.ingestion.service;
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Routes a source descriptor to the first registered ingestion adapter that
 * supports it.
 */
@Service
@RequiredArgsConstructor
public class DocumentIngestionGateway {

    private final List<DocumentIngestionAdapter> adapters;

    /**
     * @throws IllegalArgumentException when no adapter supports the descriptor's source type
     */
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        for (DocumentIngestionAdapter adapter : adapters) {
            if (adapter.supports(sourceDescriptor)) {
                return adapter.ingest(sourceDescriptor);
            }
        }
        throw new IllegalArgumentException(
                "No ingestion adapter registered for source type " + sourceDescriptor.sourceType());
    }
}

@ -0,0 +1,393 @@
package at.procon.dip.ingestion.service;
import at.procon.dip.classification.service.DocumentClassificationService;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
import at.procon.dip.domain.document.entity.DocumentSource;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.service.DocumentContentService;
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
import at.procon.dip.domain.document.service.DocumentRepresentationService;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.document.service.DocumentSourceService;
import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
import at.procon.dip.extraction.service.DocumentExtractionService;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.util.HashUtils;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
/**
 * Phase 4 generic import pipeline that persists arbitrary document types into the DOC model.
 * <p>
 * End-to-end flow (see {@link #importDocument(SourceDescriptor)}): resolve the raw payload,
 * classify it, optionally short-circuit on a duplicate content hash, create the canonical
 * {@code Document}, persist the original and derived content rows, build text representations,
 * and queue pending embeddings when vectorization is enabled. The whole import runs in a
 * single transaction.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class GenericDocumentImportService {
// Configuration root; only the generic-ingestion and vectorization sub-sections are read here.
private final TedProcessorProperties properties;
private final DocumentRepository documentRepository;
private final DocumentSourceRepository documentSourceRepository;
// NOTE(review): appears unused in this class — confirm before removing.
private final DocumentEmbeddingRepository documentEmbeddingRepository;
private final DocumentService documentService;
private final DocumentSourceService documentSourceService;
private final DocumentContentService documentContentService;
private final DocumentRepresentationService documentRepresentationService;
private final DocumentEmbeddingService documentEmbeddingService;
private final DocumentClassificationService classificationService;
private final DocumentExtractionService extractionService;
private final TextRepresentationBuildService representationBuildService;
/**
 * Imports one source payload into the DOC model.
 *
 * @param sourceDescriptor source metadata plus inline binary/text payload (or a readable file URI)
 * @return the created — or, on hash deduplication, the pre-existing — document together with
 *         the detection result, collected warnings, and a duplicate flag
 * @throws IllegalArgumentException when no payload content can be resolved from the descriptor
 */
@Transactional
public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) {
ResolvedPayload payload = resolvePayload(sourceDescriptor);
// Classification needs a media type; substitute the inferred one when the caller omitted it.
DetectionResult detection = classificationService.detect(withResolvedMediaType(sourceDescriptor, payload));
String dedupHash = HashUtils.computeSha256(payload.binaryContent());
if (properties.getGenericIngestion().isDeduplicateByContentHash()) {
Optional<Document> existing = documentRepository.findByDedupHash(dedupHash);
if (existing.isPresent()) {
// Same bytes already imported: only link the new source, do not re-persist any content.
Document document = existing.get();
ensureSource(document, sourceDescriptor);
List<String> warnings = List.of("Document content hash already imported; linked new source to existing document");
return new ImportedDocumentResult(document, detection, warnings, true);
}
}
// Fall back to the configured default tenant/visibility when the adapter supplied no context.
DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null
? defaultAccessContext()
: sourceDescriptor.accessContext();
Document document = documentService.create(new CreateDocumentCommand(
accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey(),
accessContext.visibility(),
detection.documentType(),
detection.documentFamily(),
DocumentStatus.RECEIVED,
determineInitialTitle(sourceDescriptor, detection, payload),
null,
detection.languageCode(),
detection.mimeType(),
buildBusinessKey(sourceDescriptor),
dedupHash
));
ensureSource(document, sourceDescriptor);
documentService.updateStatus(document.getId(), DocumentStatus.CLASSIFIED);
DocumentContent originalContent = persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash);
ExtractionResult extractionResult = extractionService.extract(new ExtractionRequest(
sourceDescriptor,
detection,
payload.textContent(),
payload.binaryContent()
));
List<String> warnings = new ArrayList<>(extractionResult.warnings());
Map<ContentRole, DocumentContent> persistedDerivedContent = persistDerivedContent(document, detection, extractionResult, dedupHash);
documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult));
persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts);
Document reloaded = documentService.getRequired(document.getId());
// When no drafts were persisted the status is still EXTRACTED; advance it so the
// document always leaves this method in (at least) the REPRESENTED state.
if (reloaded.getStatus() == DocumentStatus.EXTRACTED) {
documentService.updateStatus(reloaded.getId(), DocumentStatus.REPRESENTED);
reloaded = documentService.getRequired(reloaded.getId());
}
if (!extractionResult.structuredPayloads().isEmpty()) {
applyStructuredTitleIfMissing(reloaded, extractionResult);
reloaded = documentService.getRequired(reloaded.getId());
}
return new ImportedDocumentResult(reloaded, detection, warnings, false);
}
/**
 * Returns the descriptor unchanged when it already carries a media type; otherwise copies
 * every field and substitutes the media type resolved from the payload.
 */
private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) {
if (StringUtils.hasText(sourceDescriptor.mediaType())) {
return sourceDescriptor;
}
return new SourceDescriptor(
sourceDescriptor.accessContext(),
sourceDescriptor.sourceType(),
sourceDescriptor.sourceIdentifier(),
sourceDescriptor.sourceUri(),
sourceDescriptor.fileName(),
payload.mediaType(),
sourceDescriptor.binaryContent(),
sourceDescriptor.textContent(),
sourceDescriptor.receivedAt(),
sourceDescriptor.attributes()
);
}
/**
 * Resolves the binary payload in priority order: inline binary content, inline text content
 * (encoded as UTF-8), then — last resort — reading the bytes from {@code sourceUri} when it
 * points to an existing local file.
 *
 * @throws IllegalArgumentException when no payload bytes can be obtained from any source
 */
private ResolvedPayload resolvePayload(SourceDescriptor sourceDescriptor) {
byte[] binary = sourceDescriptor.binaryContent();
String text = sourceDescriptor.textContent();
if ((binary == null || binary.length == 0) && StringUtils.hasText(text)) {
binary = text.getBytes(StandardCharsets.UTF_8);
}
if ((binary == null || binary.length == 0) && StringUtils.hasText(sourceDescriptor.sourceUri())) {
try {
java.nio.file.Path path = java.nio.file.Path.of(sourceDescriptor.sourceUri());
if (java.nio.file.Files.exists(path)) {
binary = java.nio.file.Files.readAllBytes(path);
}
} catch (Exception e) {
throw new IllegalArgumentException("Failed to read source payload from " + sourceDescriptor.sourceUri(), e);
}
}
if (binary == null || binary.length == 0) {
throw new IllegalArgumentException("No payload content available for source " + sourceDescriptor.sourceIdentifier());
}
// Derive a text view for text-like media when the caller supplied only binary content.
// NOTE(review): assumes UTF-8 for file-backed text payloads — confirm against adapters.
if (!StringUtils.hasText(text) && DocumentImportSupport.isLikelyTextMime(sourceDescriptor.mediaType())) {
text = new String(binary, StandardCharsets.UTF_8);
}
return new ResolvedPayload(binary, text, inferMediaType(sourceDescriptor));
}
/**
 * Normalizes the declared media type, or falls back to a file-extension mapping.
 * Returns {@code null} for unknown extensions so detection can apply its own fallback.
 */
private String inferMediaType(SourceDescriptor sourceDescriptor) {
if (StringUtils.hasText(sourceDescriptor.mediaType())) {
return DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType());
}
String extension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName());
return switch (extension) {
case "pdf" -> "application/pdf";
case "html", "htm" -> "text/html";
case "md", "markdown" -> "text/markdown";
case "xml" -> "application/xml";
case "txt", "log", "csv", "json", "yaml", "yml" -> "text/plain";
case "eml" -> "message/rfc822";
default -> null;
};
}
/**
 * Builds the access context from configuration: no tenant (public-style documents) when no
 * default tenant key is configured, otherwise a tenant reference keyed by the configured value.
 */
private DocumentAccessContext defaultAccessContext() {
String tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility();
if (!StringUtils.hasText(tenantKey)) {
return new DocumentAccessContext(null, visibility);
}
// Tenant key doubles as the display name here; the reference carries no persistent id.
return new DocumentAccessContext(new at.procon.dip.domain.tenant.TenantRef(null, tenantKey, tenantKey), visibility);
}
/**
 * Chooses the initial title in priority order: explicit "title" attribute, file name,
 * first non-blank line of the text payload (ellipsized to 240 chars), then the
 * document-type name as a last resort.
 */
private String determineInitialTitle(SourceDescriptor sourceDescriptor, DetectionResult detection, ResolvedPayload payload) {
if (sourceDescriptor.attributes() != null && StringUtils.hasText(sourceDescriptor.attributes().get("title"))) {
return sourceDescriptor.attributes().get("title");
}
if (StringUtils.hasText(sourceDescriptor.fileName())) {
return sourceDescriptor.fileName();
}
if (StringUtils.hasText(payload.textContent())) {
for (String line : payload.textContent().split("\\n")) {
if (StringUtils.hasText(line)) {
return DocumentImportSupport.ellipsize(line.trim(), 240);
}
}
}
return detection.documentType().name();
}
/**
 * Builds a business key of the form "sourceType:identifier" preferring the source identifier,
 * then the source URI, then a random UUID so the key is always non-null.
 */
private String buildBusinessKey(SourceDescriptor sourceDescriptor) {
if (StringUtils.hasText(sourceDescriptor.sourceIdentifier())) {
return sourceDescriptor.sourceType() + ":" + sourceDescriptor.sourceIdentifier();
}
if (StringUtils.hasText(sourceDescriptor.sourceUri())) {
return sourceDescriptor.sourceType() + ":" + sourceDescriptor.sourceUri();
}
// NOTE(review): UUID is already imported — the qualified name is redundant.
return sourceDescriptor.sourceType() + ":" + java.util.UUID.randomUUID();
}
/**
 * Links the descriptor as a source of the document unless an identical source
 * (same external id, URI, and filename — all null-tolerant) is already attached.
 */
private void ensureSource(Document document, SourceDescriptor sourceDescriptor) {
boolean alreadyLinked = documentSourceRepository.findByDocument_Id(document.getId()).stream().anyMatch(existing ->
equalsNullable(existing.getExternalSourceId(), sourceDescriptor.sourceIdentifier())
&& equalsNullable(existing.getSourceUri(), sourceDescriptor.sourceUri())
&& equalsNullable(existing.getSourceFilename(), sourceDescriptor.fileName()));
if (alreadyLinked) {
return;
}
documentSourceService.addSource(new AddDocumentSourceCommand(
document.getId(),
sourceDescriptor.sourceType(),
sourceDescriptor.sourceIdentifier(),
sourceDescriptor.sourceUri(),
sourceDescriptor.fileName(),
null,
properties.getGenericIngestion().getImportBatchId(),
sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt()
));
}
/**
 * Persists the ORIGINAL content row. Storage strategy:
 * text-like payloads go to DB_TEXT; otherwise small-enough binaries go to DB_BINARY
 * (per configuration), and anything else is stored as an EXTERNAL_REFERENCE to the source URI.
 */
private DocumentContent persistOriginalContent(Document document,
SourceDescriptor sourceDescriptor,
DetectionResult detection,
ResolvedPayload payload,
String dedupHash) {
boolean storeBinaryInDb = shouldStoreBinaryInDb(payload.binaryContent());
boolean textPreferred = DocumentImportSupport.isLikelyTextMime(detection.mimeType()) || sourceDescriptor.hasInlineTextContent();
return documentContentService.addContent(new AddDocumentContentCommand(
document.getId(),
ContentRole.ORIGINAL,
textPreferred ? StorageType.DB_TEXT : (storeBinaryInDb ? StorageType.DB_BINARY : StorageType.EXTERNAL_REFERENCE),
detection.mimeType(),
textPreferred ? StandardCharsets.UTF_8.name() : null,
textPreferred ? payload.textContent() : null,
storeBinaryInDb && !textPreferred ? payload.binaryContent() : null,
!storeBinaryInDb && !textPreferred ? sourceDescriptor.sourceUri() : null,
dedupHash,
(long) payload.binaryContent().length
));
}
// True when inline binary storage is enabled and the payload fits the configured size cap.
private boolean shouldStoreBinaryInDb(byte[] binaryContent) {
return properties.getGenericIngestion().isStoreOriginalBinaryInDb()
&& binaryContent != null
&& binaryContent.length <= properties.getGenericIngestion().getMaxBinaryBytesInDb();
}
/**
 * Persists each non-blank derived text (per content role) as a DB_TEXT content row.
 * The content hash is derived from the original's hash plus role and text so each
 * derived row hashes distinctly from the original and from its siblings.
 */
private Map<ContentRole, DocumentContent> persistDerivedContent(Document document,
DetectionResult detection,
ExtractionResult extractionResult,
String baseHash) {
Map<ContentRole, DocumentContent> result = new LinkedHashMap<>();
extractionResult.derivedTextByRole().forEach((role, text) -> {
if (!StringUtils.hasText(text)) {
return;
}
String contentHash = HashUtils.computeSha256(baseHash + ":" + role.name() + ":" + text);
DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand(
document.getId(),
role,
StorageType.DB_TEXT,
detection.mimeType(),
StandardCharsets.UTF_8.name(),
text,
null,
null,
contentHash,
(long) text.length()
));
result.put(role, content);
});
return result;
}
/**
 * Persists the representation drafts, registers the embedding model on demand, queues
 * pending embeddings for eligible drafts, and finally advances the document to REPRESENTED.
 * No-op when there are no drafts (the caller then advances the status itself).
 */
private void persistRepresentationsAndEmbeddings(Document document,
DocumentContent originalContent,
Map<ContentRole, DocumentContent> derivedContent,
List<TextRepresentationDraft> drafts) {
if (drafts == null || drafts.isEmpty()) {
return;
}
DocumentEmbeddingModel model = null;
if (properties.getVectorization().isEnabled() && properties.getVectorization().isGenericPipelineEnabled()) {
model = documentEmbeddingService.registerModel(new RegisterEmbeddingModelCommand(
properties.getVectorization().getModelName(),
properties.getVectorization().getEmbeddingProvider(),
properties.getVectorization().getModelName(),
properties.getVectorization().getDimensions(),
null,
false,
true
));
}
for (TextRepresentationDraft draft : drafts) {
if (!StringUtils.hasText(draft.textBody())) {
continue;
}
// Every representation type currently links to the normalized text when present,
// falling back to the original content row.
DocumentContent linkedContent = switch (draft.representationType()) {
case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK ->
derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
};
var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
document.getId(),
linkedContent == null ? null : linkedContent.getId(),
draft.representationType(),
"phase4-generic-builder",
draft.languageCode(),
null,
draft.chunkIndex(),
null,
null,
draft.primary(),
draft.textBody()
));
if (model != null && shouldQueueEmbedding(draft)) {
documentEmbeddingService.ensurePendingEmbedding(document.getId(), representation.getId(), model.getId());
}
}
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
}
// When the "primary only" flag is set, queue only primary drafts; otherwise queue all.
private boolean shouldQueueEmbedding(TextRepresentationDraft draft) {
return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true;
}
/**
 * Replaces a missing/placeholder title (blank, or equal to the document-type name) with the
 * first non-blank "title" attribute found in the structured extraction payloads.
 */
private void applyStructuredTitleIfMissing(Document document, ExtractionResult extractionResult) {
boolean missingTitle = !StringUtils.hasText(document.getTitle()) || document.getTitle().equals(document.getDocumentType().name());
if (!missingTitle) {
return;
}
for (var payload : extractionResult.structuredPayloads()) {
if (payload.attributes() == null) {
continue;
}
Object title = payload.attributes().get("title");
if (title instanceof String titleValue && StringUtils.hasText(titleValue)) {
document.setTitle(titleValue);
documentService.save(document);
return;
}
}
}
// Null-safe string equality used for source matching in ensureSource.
private boolean equalsNullable(String left, String right) {
return java.util.Objects.equals(left, right);
}
/**
 * Internal carrier for the resolved payload: raw bytes, an optional text view,
 * and the (possibly inferred) media type.
 */
private record ResolvedPayload(byte[] binaryContent, String textContent, String mediaType) {
}
}

@ -2,10 +2,14 @@ package at.procon.dip.ingestion.spi;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.document.SourceType;
import java.time.OffsetDateTime;
import java.util.Map;
/**
* Describes a source object that should be ingested into the canonical document model.
* <p>
* Phase 4 extends the descriptor with optional inline payload fields so adapters can support
* both file-backed imports and direct upload/API ingestion without introducing separate DTO trees.
*/
public record SourceDescriptor(
DocumentAccessContext accessContext,
@ -14,6 +18,17 @@ public record SourceDescriptor(
String sourceUri,
String fileName,
String mediaType,
byte[] binaryContent,
String textContent,
OffsetDateTime receivedAt,
Map<String, String> attributes
) {
public boolean hasInlineBinaryContent() {
return binaryContent != null && binaryContent.length > 0;
}
public boolean hasInlineTextContent() {
return textContent != null && !textContent.isBlank();
}
}

@ -0,0 +1,84 @@
package at.procon.dip.ingestion.util;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Map;
import org.springframework.util.StringUtils;
/**
 * Shared utility methods for Phase 4 generic ingestion: file-extension and media-type
 * helpers, document-family mapping, and small string utilities. Stateless and thread-safe.
 */
public final class DocumentImportSupport {

    private DocumentImportSupport() {
        // Static utility holder - no instances.
    }

    /**
     * Returns the lower-cased extension of {@code fileName} (text after the last dot),
     * or an empty string when the name is blank or contains no dot.
     *
     * @param fileName file name, may be {@code null}
     * @return lower-case extension without the dot, never {@code null}
     */
    public static String extensionOf(String fileName) {
        if (!StringUtils.hasText(fileName) || !fileName.contains(".")) {
            return "";
        }
        return fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase(Locale.ROOT);
    }

    /**
     * Heuristically decides whether a media type denotes textual content
     * (text/*, JSON, XML, JavaScript, or XHTML).
     *
     * @param mediaType raw media type, possibly with parameters; may be {@code null}
     * @return {@code true} when the normalized type looks text-like
     */
    public static boolean isLikelyTextMime(String mediaType) {
        if (!StringUtils.hasText(mediaType)) {
            return false;
        }
        String normalized = normalizeMediaType(mediaType);
        return normalized.startsWith("text/")
                || normalized.contains("json")
                || normalized.contains("xml")
                || normalized.contains("javascript")
                || normalized.equals("application/xhtml+xml");
    }

    /**
     * Strips media-type parameters (anything after ';'), trims, and lower-cases.
     *
     * @param mediaType raw media type; may be {@code null} or blank
     * @return normalized type, or {@code null} when the input is blank
     */
    public static String normalizeMediaType(String mediaType) {
        if (!StringUtils.hasText(mediaType)) {
            return null;
        }
        int idx = mediaType.indexOf(';');
        String result = idx >= 0 ? mediaType.substring(0, idx) : mediaType;
        return result.trim().toLowerCase(Locale.ROOT);
    }

    /**
     * Maps a document type to its family; exhaustive over {@link DocumentType},
     * so adding a new type is a compile error until it is classified here.
     */
    public static DocumentFamily familyFor(DocumentType documentType) {
        return switch (documentType) {
            case TED_NOTICE -> DocumentFamily.PROCUREMENT;
            case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL;
            case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN ->
                    DocumentFamily.GENERIC;
        };
    }

    /**
     * Decodes bytes as UTF-8 text; returns {@code null} for {@code null} input.
     * Invalid sequences are replaced per the UTF-8 charset decoder, never thrown.
     */
    public static String safeUtf8(byte[] bytes) {
        if (bytes == null) {
            return null;
        }
        return new String(bytes, StandardCharsets.UTF_8);
    }

    /**
     * Returns the first non-blank value found under the given keys, or {@code null}
     * when the map is {@code null} or no key yields a non-blank value.
     */
    public static String firstNonBlank(Map<String, String> values, String... keys) {
        if (values == null) {
            // Hoisted out of the loop: a null map can never produce a value.
            return null;
        }
        for (String key : keys) {
            String value = values.get(key);
            if (StringUtils.hasText(value)) {
                return value;
            }
        }
        return null;
    }

    /**
     * Truncates {@code text} to at most {@code maxLength} characters, appending "..."
     * when room allows. For {@code maxLength <= 3} the text is hard-truncated without
     * an ellipsis. Callers must pass a non-negative {@code maxLength}.
     *
     * @param text input text; may be {@code null} (returned as-is)
     * @param maxLength maximum result length in characters, expected {@code >= 0}
     */
    public static String ellipsize(String text, int maxLength) {
        if (text == null || text.length() <= maxLength) {
            return text;
        }
        if (maxLength <= 3) {
            return text.substring(0, maxLength);
        }
        return text.substring(0, maxLength - 3) + "...";
    }
}

@ -0,0 +1,86 @@
package at.procon.dip.normalization.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Fallback text-representation builder for all non-TED document types.
 * Emits a FULLTEXT draft, a primary SEMANTIC_TEXT draft, and — when a title is
 * available — a TITLE_ABSTRACT draft, all derived from the extracted text.
 */
@Component
public class DefaultGenericTextRepresentationBuilder implements TextRepresentationBuilder {

    @Override
    public boolean supports(DocumentType documentType) {
        // Everything except TED notices, which have their own dedicated builder.
        return DocumentType.TED_NOTICE != documentType;
    }

    @Override
    public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
        var derived = request.extractionResult().derivedTextByRole();
        String normalized = derived.get(ContentRole.NORMALIZED_TEXT);
        String cleanedHtml = derived.get(ContentRole.HTML_CLEAN);
        // Prefer the normalized text; fall back to the cleaned HTML text.
        String body = StringUtils.hasText(normalized) ? normalized : cleanedHtml;
        if (!StringUtils.hasText(body)) {
            return List.of();
        }

        String title = findStructuredTitle(request).orElse(request.sourceDescriptor().fileName());
        String summary = DocumentImportSupport.ellipsize(body.replace('\n', ' ').trim(), 1200);
        String language = request.detectionResult().languageCode();

        List<TextRepresentationDraft> result = new ArrayList<>();
        result.add(new TextRepresentationDraft(RepresentationType.FULLTEXT, language, body, false, null));
        result.add(new TextRepresentationDraft(
                RepresentationType.SEMANTIC_TEXT,
                language,
                buildSemanticText(title, summary, request.detectionResult().documentType()),
                true,
                null));
        if (StringUtils.hasText(title)) {
            result.add(new TextRepresentationDraft(
                    RepresentationType.TITLE_ABSTRACT,
                    language,
                    title + "\n\n" + summary,
                    false,
                    null));
        }
        return result;
    }

    /** First non-blank String "title" attribute across the structured payloads, if any. */
    private Optional<String> findStructuredTitle(RepresentationBuildRequest request) {
        for (ExtractedStructuredPayload payload : request.extractionResult().structuredPayloads()) {
            var attributes = payload.attributes();
            if (attributes == null) {
                continue;
            }
            Object candidate = attributes.get("title");
            if (candidate instanceof String text && StringUtils.hasText(text)) {
                return Optional.of(text);
            }
        }
        return Optional.empty();
    }

    /** Assembles the semantic text: type line, optional title line, then the summary. */
    private String buildSemanticText(String title, String summary, DocumentType documentType) {
        StringBuilder text = new StringBuilder("Document type: ").append(documentType.name()).append('\n');
        if (StringUtils.hasText(title)) {
            text.append("Title: ").append(title.trim()).append("\n\n");
        }
        return text.append(summary).toString().trim();
    }
}

@ -0,0 +1,24 @@
package at.procon.dip.normalization.service;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Dispatches representation building to the first registered builder that supports
 * the detected document type (builders are consulted in injection order).
 */
@Service
@RequiredArgsConstructor
public class TextRepresentationBuildService {

    private final List<TextRepresentationBuilder> builders;

    /**
     * Builds the text-representation drafts for the request.
     *
     * @throws IllegalStateException when no builder supports the detected document type
     */
    public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
        var documentType = request.detectionResult().documentType();
        for (TextRepresentationBuilder candidate : builders) {
            if (candidate.supports(documentType)) {
                return candidate.build(request);
            }
        }
        throw new IllegalStateException(
                "No text representation builder registered for " + documentType);
    }
}

@ -28,6 +28,7 @@ public class TedProcessorProperties {
private MailProperties mail = new MailProperties();
private SolutionBriefProperties solutionBrief = new SolutionBriefProperties();
private ProjectionProperties projection = new ProjectionProperties();
private GenericIngestionProperties genericIngestion = new GenericIngestionProperties();
/**
* Input directory configuration for Apache Camel file consumer.
@ -484,4 +485,101 @@ public class TedProcessorProperties {
*/
private String idempotentRepository = "./solution-brief-processed.dat";
}
/**
 * Phase 4 generic ingestion configuration.
 * <p>
 * Bound under the {@code generic-ingestion} section of the processor properties
 * (presumably prefix {@code ted.generic-ingestion} — see application config).
 */
@Data
public static class GenericIngestionProperties {
/**
 * Master switch for the generic ingestion pipeline. Disabled by default so the
 * feature is opt-in per environment.
 */
private boolean enabled = false;
/**
 * Enable/disable filesystem import route for arbitrary documents.
 */
private boolean fileSystemEnabled = false;
/**
 * Enable/disable REST/API upload endpoints for arbitrary documents.
 */
private boolean restUploadEnabled = true;
/**
 * Input directory for the generic filesystem importer.
 * NOTE(review): Windows-specific default path — confirm for non-Windows deployments.
 */
private String inputDirectory = "D:/ted.europe/generic-input";
/**
 * Regular-expression file pattern used by the Camel file route to select input files.
 */
private String filePattern = ".*\\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$";
/**
 * Directory where successfully imported files are moved (relative to the input directory).
 */
private String processedDirectory = ".dip-processed";
/**
 * Directory where failed files are moved (relative to the input directory).
 */
private String errorDirectory = ".dip-error";
/**
 * Polling interval in milliseconds for the filesystem route.
 */
@Positive
private long pollInterval = 15000;
/**
 * Maximum number of files consumed per poll.
 */
@Positive
private int maxMessagesPerPoll = 10;
/**
 * Default owner tenant for files imported through the generic route.
 * Leave unset to import documents without an owner tenant (public-style documents).
 */
private String defaultOwnerTenantKey;
/**
 * Default visibility for generic imports when not supplied explicitly.
 */
private at.procon.dip.domain.access.DocumentVisibility defaultVisibility = at.procon.dip.domain.access.DocumentVisibility.PUBLIC;
/**
 * Optional default language code applied to filesystem imports.
 */
private String defaultLanguageCode;
/**
 * Persist original binary payloads in DB when they are small enough
 * (see {@code maxBinaryBytesInDb}); otherwise an external reference is stored.
 */
private boolean storeOriginalBinaryInDb = true;
/**
 * Maximum binary size (bytes) stored inline in DOC.doc_content.binary_content.
 * Default is 5 MiB.
 */
@Positive
private int maxBinaryBytesInDb = 5242880;
/**
 * Whether an already imported content hash should resolve to the existing document
 * (the new source is then linked instead of creating a duplicate document).
 */
private boolean deduplicateByContentHash = true;
/**
 * Queue only the primary text representation for embedding; when false, all
 * non-blank representations are queued.
 */
private boolean vectorizePrimaryRepresentationOnly = true;
/**
 * Import batch identifier written to DOC.doc_source.import_batch_id.
 */
@NotBlank
private String importBatchId = "phase4-generic";
}
}

@ -204,6 +204,43 @@ ted:
# Maximum number of legacy TED documents to backfill during startup
startup-backfill-limit: 250
# Phase 4 generic ingestion configuration
generic-ingestion:
# Master switch for arbitrary document ingestion into the DOC model
enabled: false
# Enable file-system polling for non-TED documents
file-system-enabled: false
# Allow REST/API upload endpoints for arbitrary documents
rest-upload-enabled: true
# Input directory for the generic Camel file route
input-directory: D:/ted.europe/generic-input
# Regex for files accepted by the generic file route
file-pattern: .*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$
# Move successfully processed files here
processed-directory: .dip-processed
# Move failed files here
error-directory: .dip-error
# Polling interval for the generic route
poll-interval: 15000
# Maximum files per poll
max-messages-per-poll: 10
# Optional default owner tenant key; leave empty for public documents (e.g. TED notices or shared knowledge documents)
default-owner-tenant-key:
# Default visibility when no explicit access context is provided
default-visibility: PUBLIC
# Optional default language for filesystem imports
default-language-code:
# Store small binary originals in DOC.doc_content.binary_content
store-original-binary-in-db: true
# Maximum binary payload size persisted inline in DB
max-binary-bytes-in-db: 5242880
# Deduplicate by content hash and attach additional sources to the same canonical document
deduplicate-by-content-hash: true
# Queue only the primary text representation for vectorization
vectorize-primary-representation-only: true
# Import batch marker written to DOC.doc_source.import_batch_id
import-batch-id: phase4-generic
# Solution Brief processing configuration
solution-brief:
# Enable/disable Solution Brief processing

@ -0,0 +1,9 @@
-- Phase 4: Generic ingestion support for arbitrary document types.
-- Add binary payload storage to the generic DOC content table so non-text originals
-- such as PDFs can be stored directly in the database when configured.
-- Idempotent: the IF NOT EXISTS guards make re-running this migration safe.
ALTER TABLE DOC.doc_content
ADD COLUMN IF NOT EXISTS binary_content BYTEA;
-- Composite index supporting lookups of a document's content rows filtered by role,
-- ordered newest-first via created_at DESC.
CREATE INDEX IF NOT EXISTS idx_doc_content_document_role
ON DOC.doc_content(document_id, content_role, created_at DESC);
Loading…
Cancel
Save