Refactor phase 4

master
trifonovt 1 month ago
parent adc4f2da43
commit ac59730f3e

@ -0,0 +1,58 @@
# Phase 4 - Generic Ingestion Pipeline
## Goal
Add the first generic ingestion path so arbitrary documents can be imported into the canonical DOC model,
normalized into text representations, and queued for vectorization without depending on the TED-specific model.
## Scope implemented
### Input channels
- file-system polling route for arbitrary documents
- REST/API upload endpoints
### Detection
- file extension + media type based classification
### Extraction
- PDF -> text via PDFBox
- HTML -> cleaned text via JSoup
- text / markdown / generic XML -> normalized UTF-8 text
- unsupported binary types -> fallback warning only
### Representation building
- default generic builder creates:
- FULLTEXT
- SEMANTIC_TEXT
- TITLE_ABSTRACT
### Persistence
- original content stored in DOC.doc_content
- binary originals can now be stored inline in `binary_content`
- derived text variants persisted as additional DOC.doc_content rows
- text representations persisted in DOC.doc_text_representation
- pending embeddings created in DOC.doc_embedding when enabled
## Access model
The generic pipeline uses the Phase 0/1 access model:
- optional owner tenant
- mandatory visibility
This supports both:
- public documents (`owner_tenant_id = null`, `visibility = PUBLIC`)
- tenant-owned documents (`owner_tenant_id != null`, `visibility = TENANT/SHARED/...`)
## Deliberately deferred
- DOCX extraction
- ZIP recursive child import in the generic pipeline
- MIME/EML structured parsing
- generic structured projections beyond TED
- chunked long-document representations

@ -0,0 +1,40 @@
# Phase 4 - Generic Ingestion Pipeline
Phase 4 introduces the first generalized ingestion flow on top of the DOC backbone.
## What is included
- generic ingestion gateway with adapter selection
- file-system ingestion adapter and Camel route
- REST/API upload controller for arbitrary documents
- document type detection by media type / extension
- first extractors for:
- plain text / markdown / generic XML
- HTML
- PDF
- binary fallback
- default representation builder for non-TED documents
- binary payload support in `DOC.doc_content.binary_content`
- automatic creation of pending generic embeddings for imported representations
## Important behavior
- current TED runtime remains intact
- generic ingestion is disabled by default and must be enabled with:
- `ted.generic-ingestion.enabled=true`
- file-system polling is separately controlled with:
- `ted.generic-ingestion.file-system-enabled=true`
- REST/API upload endpoints are under:
- `/api/v1/dip/import/upload`
- `/api/v1/dip/import/text`
## Current supported generic document types
- PDF
- HTML
- TEXT
- MARKDOWN
- XML_GENERIC
- UNKNOWN text-like files
DOCX, ZIP child extraction, and MIME body parsing are intentionally left for later phases.

@ -0,0 +1,82 @@
package at.procon.dip.classification.detector;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.classification.spi.DocumentTypeDetector;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Basic Phase 4 detector that classifies a source by normalized media type
 * first, then by file extension, with a plain-text fallback for descriptors
 * that carry inline textual content.
 */
@Component
public class BasicMimeAndExtensionDocumentTypeDetector implements DocumentTypeDetector {

    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        // Catch-all detector: applicable to every source descriptor.
        return true;
    }

    @Override
    public DetectionResult detect(SourceDescriptor sourceDescriptor) {
        String mediaType = DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType());
        String fileExtension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName());

        DocumentType resolvedType = detectByMediaType(mediaType);
        if (resolvedType == DocumentType.UNKNOWN) {
            resolvedType = detectByExtension(fileExtension);
        }
        if (resolvedType == DocumentType.UNKNOWN && sourceDescriptor.hasInlineTextContent()) {
            // Last resort: inline text content with no other signal is treated as TEXT.
            resolvedType = DocumentType.TEXT;
        }

        DocumentFamily documentFamily = DocumentImportSupport.familyFor(resolvedType);
        Map<String, String> descriptorAttributes = sourceDescriptor.attributes();
        String languageCode = descriptorAttributes == null ? null : descriptorAttributes.get("languageCode");

        Map<String, String> detectionAttributes = new HashMap<>();
        detectionAttributes.put("detectedExtension", fileExtension);
        if (mediaType != null) {
            detectionAttributes.put("normalizedMediaType", mediaType);
        }
        if (StringUtils.hasText(sourceDescriptor.fileName())) {
            detectionAttributes.put("fileName", sourceDescriptor.fileName());
        }
        return new DetectionResult(resolvedType, documentFamily, mediaType, languageCode, detectionAttributes);
    }

    /** Maps a media type to a document type; unrecognized textual types fall back to TEXT. */
    private DocumentType detectByMediaType(String mediaType) {
        if (!StringUtils.hasText(mediaType)) {
            return DocumentType.UNKNOWN;
        }
        String lowerCased = mediaType.toLowerCase(Locale.ROOT);
        return switch (lowerCased) {
            case "application/pdf", "application/x-pdf" -> DocumentType.PDF;
            case "text/html", "application/xhtml+xml" -> DocumentType.HTML;
            case "text/plain" -> DocumentType.TEXT;
            case "text/markdown", "text/x-markdown" -> DocumentType.MARKDOWN;
            case "application/xml", "text/xml" -> DocumentType.XML_GENERIC;
            case "message/rfc822" -> DocumentType.MIME_MESSAGE;
            case "application/zip", "application/x-zip-compressed" -> DocumentType.ZIP_ARCHIVE;
            // Note: the fallback deliberately checks the original (non-lowercased) value,
            // matching the pre-existing behavior.
            default -> mediaType.startsWith("text/") ? DocumentType.TEXT : DocumentType.UNKNOWN;
        };
    }

    /** Maps a file extension to a document type; unknown extensions yield UNKNOWN. */
    private DocumentType detectByExtension(String extension) {
        return switch (extension) {
            case "pdf" -> DocumentType.PDF;
            case "html", "htm" -> DocumentType.HTML;
            case "txt", "log", "csv", "json", "yaml", "yml" -> DocumentType.TEXT;
            case "md", "markdown" -> DocumentType.MARKDOWN;
            case "xml", "xsd", "xslt" -> DocumentType.XML_GENERIC;
            case "eml", "msg" -> DocumentType.MIME_MESSAGE;
            case "zip" -> DocumentType.ZIP_ARCHIVE;
            case "docx" -> DocumentType.DOCX;
            default -> DocumentType.UNKNOWN;
        };
    }
}

@ -0,0 +1,26 @@
package at.procon.dip.classification.service;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.classification.spi.DocumentTypeDetector;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Resolves the first detector (in bean registration order) whose
 * {@code supports} check passes and delegates detection to it.
 */
@Service
@RequiredArgsConstructor
public class DocumentClassificationService {

    private final List<DocumentTypeDetector> detectors;

    /**
     * @throws IllegalStateException when no registered detector supports the descriptor
     */
    public DetectionResult detect(SourceDescriptor sourceDescriptor) {
        for (DocumentTypeDetector candidate : detectors) {
            if (candidate.supports(sourceDescriptor)) {
                return candidate.detect(sourceDescriptor);
            }
        }
        throw new IllegalStateException("No document type detector available");
    }
}

@ -13,6 +13,7 @@ import jakarta.persistence.GenerationType;
import jakarta.persistence.Id;
import jakarta.persistence.Index;
import jakarta.persistence.JoinColumn;
import jakarta.persistence.Lob;
import jakarta.persistence.ManyToOne;
import jakarta.persistence.PrePersist;
import jakarta.persistence.Table;
@ -66,6 +67,10 @@ public class DocumentContent {
@Column(name = "text_content", columnDefinition = "TEXT")
private String textContent;
@Lob
@Column(name = "binary_content")
private byte[] binaryContent;
@Column(name = "binary_ref", columnDefinition = "TEXT")
private String binaryRef;

@ -25,6 +25,7 @@ public class DocumentContentService {
.mimeType(command.mimeType())
.charsetName(command.charsetName())
.textContent(command.textContent())
.binaryContent(command.binaryContent())
.binaryRef(command.binaryRef())
.contentHash(command.contentHash())
.sizeBytes(command.sizeBytes())

@ -11,6 +11,7 @@ public record AddDocumentContentCommand(
String mimeType,
String charsetName,
String textContent,
byte[] binaryContent,
String binaryRef,
String contentHash,
Long sizeBytes

@ -0,0 +1,30 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Component;
/**
 * Fallback extractor for binary formats (DOCX, ZIP, generic binary, MIME
 * messages) that have no specialized extractor yet; it derives no text and
 * only reports a warning.
 */
@Component
public class BinaryPassThroughDocumentExtractor implements DocumentExtractor {

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        // Selection is purely type-based; the mime type is intentionally ignored.
        if (documentType == DocumentType.DOCX || documentType == DocumentType.ZIP_ARCHIVE) {
            return true;
        }
        return documentType == DocumentType.GENERIC_BINARY || documentType == DocumentType.MIME_MESSAGE;
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        String warning = "No specialized extractor available yet for "
                + extractionRequest.detectionResult().documentType();
        return new ExtractionResult(Map.of(), List.of(), List.of(warning));
    }
}

@ -0,0 +1,60 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Extracts cleaned text and lightweight metadata (title, body length) from
 * HTML sources using JSoup.
 */
@Component
public class HtmlDocumentExtractor implements DocumentExtractor {

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        if (documentType == DocumentType.HTML) {
            return true;
        }
        return "text/html".equalsIgnoreCase(mimeType) || "application/xhtml+xml".equalsIgnoreCase(mimeType);
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        String rawHtml = extractionRequest.textContent();
        if (!StringUtils.hasText(rawHtml) && extractionRequest.binaryContent() != null) {
            // Fall back to decoding the binary payload; assumes UTF-8 — TODO confirm
            // the upstream charset handling.
            rawHtml = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
        }
        if (!StringUtils.hasText(rawHtml)) {
            return new ExtractionResult(Map.of(), List.of(), List.of("No HTML content available"));
        }

        Document htmlDocument = Jsoup.parse(rawHtml);
        htmlDocument.outputSettings().prettyPrint(false);

        String fullText = htmlDocument.text();
        String bodyText = htmlDocument.body() != null ? htmlDocument.body().text() : fullText;

        Map<ContentRole, String> derivedText = new LinkedHashMap<>();
        derivedText.put(ContentRole.HTML_CLEAN, bodyText);
        derivedText.put(ContentRole.NORMALIZED_TEXT, fullText);

        String title = htmlDocument.title();
        Map<String, Object> payload = new LinkedHashMap<>();
        if (StringUtils.hasText(title)) {
            payload.put("title", title.trim());
        }
        if (htmlDocument.body() != null) {
            payload.put("bodyTextLength", htmlDocument.body().text().length());
        }

        List<String> warnings = new ArrayList<>();
        if (!StringUtils.hasText(title)) {
            warnings.add("HTML document has no <title> element");
        }
        return new ExtractionResult(derivedText, List.of(new ExtractedStructuredPayload("html-document", payload)), warnings);
    }
}

@ -0,0 +1,69 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.ted.service.attachment.PdfExtractionService;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Extracts normalized text and a derived title from PDF payloads by delegating
 * to the shared PDFBox-based {@link PdfExtractionService}.
 */
@Component
@RequiredArgsConstructor
public class PdfDocumentExtractor implements DocumentExtractor {

    private final PdfExtractionService pdfExtractionService;

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        // NOTE(review): the "dummy.pdf" probe makes canHandle decide by mime type only;
        // confirm PdfExtractionService.canHandle treats the file name as a secondary hint.
        return documentType == DocumentType.PDF || pdfExtractionService.canHandle("dummy.pdf", mimeType);
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        byte[] binary = extractionRequest.binaryContent();
        if (binary == null || binary.length == 0) {
            return new ExtractionResult(Map.of(), List.of(), List.of("No PDF binary payload available"));
        }
        PdfExtractionService.ExtractionResult extraction = pdfExtractionService.extract(
                binary,
                extractionRequest.sourceDescriptor().fileName(),
                extractionRequest.detectionResult().mimeType()
        );
        if (!extraction.success()) {
            // Guard against a null error message: List.of(null) would throw an NPE
            // and mask the actual extraction failure.
            String errorMessage = extraction.errorMessage() != null
                    ? extraction.errorMessage()
                    : "PDF text extraction failed without an error message";
            return new ExtractionResult(Map.of(), List.of(), List.of(errorMessage));
        }
        String text = extraction.extractedText();
        if (!StringUtils.hasText(text)) {
            return new ExtractionResult(Map.of(), List.of(), List.of("PDF text extraction returned no text"));
        }
        Map<String, Object> payload = new LinkedHashMap<>();
        payload.put("title", deriveTitle(text, extractionRequest.sourceDescriptor().fileName()));
        payload.put("extractor", "pdfbox");
        return new ExtractionResult(
                Map.of(ContentRole.NORMALIZED_TEXT, text),
                List.of(new ExtractedStructuredPayload("pdf-document", payload)),
                List.of()
        );
    }

    /**
     * Uses the first non-blank line of the extracted text as the title,
     * truncated to 240 characters; falls back to the source file name.
     */
    private String deriveTitle(String text, String fallback) {
        // \R matches any line break (\n, \r\n, \r, ...), so titles taken from PDFs
        // with Windows line endings no longer carry a stray '\r'.
        for (String line : text.split("\\R")) {
            if (StringUtils.hasText(line)) {
                String trimmed = line.trim();
                return trimmed.length() > 240 ? trimmed.substring(0, 240) : trimmed;
            }
        }
        return fallback;
    }
}

@ -0,0 +1,69 @@
package at.procon.dip.extraction.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Extracts normalized plain text from text-like sources: plain text, markdown,
 * generic XML, or unknown types whose mime type looks textual.
 */
@Component
public class PlainTextDocumentExtractor implements DocumentExtractor {

    @Override
    public boolean supports(DocumentType documentType, String mimeType) {
        boolean textLikeType = documentType == DocumentType.TEXT
                || documentType == DocumentType.MARKDOWN
                || documentType == DocumentType.XML_GENERIC
                || documentType == DocumentType.UNKNOWN;
        return textLikeType || DocumentImportSupport.isLikelyTextMime(mimeType);
    }

    @Override
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        String rawText = extractionRequest.textContent();
        if (!StringUtils.hasText(rawText) && extractionRequest.binaryContent() != null) {
            // Assumes UTF-8 for binary payloads — TODO confirm the charset is tracked upstream.
            rawText = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
        }
        String normalized = normalizeText(rawText);
        if (!StringUtils.hasText(normalized)) {
            return new ExtractionResult(Map.of(), List.of(), List.of("No text content extracted"));
        }
        String title = deriveTitle(normalized, extractionRequest.sourceDescriptor().fileName());
        return new ExtractionResult(
                Map.of(ContentRole.NORMALIZED_TEXT, normalized),
                List.of(new ExtractedStructuredPayload("generic-document", Map.of("title", title))),
                List.of()
        );
    }

    /**
     * Canonicalizes line endings to '\n', collapses runs of three or more
     * newlines to a single blank line, strips trailing spaces/tabs before
     * line breaks, and trims the result.
     */
    private String normalizeText(String text) {
        if (text == null) {
            return null;
        }
        return text.replace("\r\n", "\n")
                .replace('\r', '\n')
                .replaceAll("\\n{3,}", "\n\n")
                .replaceAll("[ \t]+\n", "\n")
                .trim();
    }

    /** First non-blank line, ellipsized to 240 characters; file name as fallback. */
    private String deriveTitle(String text, String fallback) {
        if (!StringUtils.hasText(text)) {
            return fallback;
        }
        for (String line : text.split("\\n")) {
            if (StringUtils.hasText(line)) {
                return DocumentImportSupport.ellipsize(line.trim(), 240);
            }
        }
        return fallback;
    }
}

@ -0,0 +1,26 @@
package at.procon.dip.extraction.service;
import at.procon.dip.extraction.spi.DocumentExtractor;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Dispatches an extraction request to the first registered extractor that
 * supports the detected document type / mime type combination.
 */
@Service
@RequiredArgsConstructor
public class DocumentExtractionService {

    private final List<DocumentExtractor> extractors;

    /**
     * @throws IllegalStateException when no extractor supports the detected type
     */
    public ExtractionResult extract(ExtractionRequest extractionRequest) {
        var detection = extractionRequest.detectionResult();
        for (DocumentExtractor extractor : extractors) {
            if (extractor.supports(detection.documentType(), detection.mimeType())) {
                return extractor.extract(extractionRequest);
            }
        }
        throw new IllegalStateException(
                "No extractor registered for type " + detection.documentType());
    }
}

@ -0,0 +1,28 @@
package at.procon.dip.ingestion.adapter;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
/**
 * Ingestion adapter for documents discovered on the file system; delegates the
 * actual import to {@link GenericDocumentImportService}.
 */
@Component
@RequiredArgsConstructor
public class FileSystemDocumentIngestionAdapter implements DocumentIngestionAdapter {

    private final GenericDocumentImportService importService;

    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        return at.procon.dip.domain.document.SourceType.FILE_SYSTEM == sourceDescriptor.sourceType();
    }

    @Override
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        ImportedDocumentResult importResult = importService.importDocument(sourceDescriptor);
        var canonicalMetadata = importResult.document().toCanonicalMetadata();
        return new IngestionResult(List.of(canonicalMetadata), importResult.warnings());
    }
}

@ -0,0 +1,31 @@
package at.procon.dip.ingestion.adapter;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
/**
 * Ingestion adapter for documents whose payload is supplied inline via a REST
 * upload, a manual upload, or the API; delegates the actual import to
 * {@link GenericDocumentImportService}.
 */
@Component
@RequiredArgsConstructor
public class InlineContentDocumentIngestionAdapter implements DocumentIngestionAdapter {

    private final GenericDocumentImportService importService;

    @Override
    public boolean supports(SourceDescriptor sourceDescriptor) {
        SourceType sourceType = sourceDescriptor.sourceType();
        if (sourceType == SourceType.REST_UPLOAD || sourceType == SourceType.MANUAL_UPLOAD) {
            return true;
        }
        return sourceType == SourceType.API;
    }

    @Override
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        ImportedDocumentResult importResult = importService.importDocument(sourceDescriptor);
        var canonicalMetadata = importResult.document().toCanonicalMetadata();
        return new IngestionResult(List.of(canonicalMetadata), importResult.warnings());
    }
}

@ -0,0 +1,88 @@
package at.procon.dip.ingestion.camel;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.tenant.TenantRef;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.util.LinkedHashMap;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.camel.Exchange;
import org.apache.camel.builder.RouteBuilder;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Camel route that polls a configured directory for arbitrary documents and
 * feeds each file into the Phase 4 generic ingestion gateway.
 *
 * <p>The route is only wired up when both {@code ted.generic-ingestion.enabled}
 * and {@code ted.generic-ingestion.file-system-enabled} are set; otherwise
 * {@link #configure()} logs and returns without registering anything.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class GenericFileSystemIngestionRoute extends RouteBuilder {

    private final TedProcessorProperties properties;
    private final DocumentIngestionGateway ingestionGateway;

    @Override
    public void configure() {
        if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isFileSystemEnabled()) {
            log.info("Phase 4 generic filesystem ingestion route disabled");
            return;
        }
        var config = properties.getGenericIngestion();
        log.info("Configuring Phase 4 generic filesystem ingestion from {}", config.getInputDirectory());
        // Processed files are moved away; failed ones land in the error directory.
        fromF("file:%s?recursive=true&include=%s&delay=%d&maxMessagesPerPoll=%d&move=%s&moveFailed=%s",
                config.getInputDirectory(),
                config.getFilePattern(),
                config.getPollInterval(),
                config.getMaxMessagesPerPoll(),
                config.getProcessedDirectory(),
                config.getErrorDirectory())
                .routeId("dip-generic-filesystem-ingestion")
                .process(this::processFile)
                .log("Imported generic document from ${header.CamelFileAbsolutePath}");
    }

    /**
     * Reads the polled file and hands it to the ingestion gateway as a
     * FILE_SYSTEM source descriptor.
     *
     * @throws Exception if the file cannot be resolved/read or ingestion fails
     */
    private void processFile(Exchange exchange) throws Exception {
        Path path = resolvePath(exchange);
        byte[] payload = Files.readAllBytes(path);
        Map<String, String> attributes = new LinkedHashMap<>();
        String languageCode = properties.getGenericIngestion().getDefaultLanguageCode();
        if (StringUtils.hasText(languageCode)) {
            attributes.put("languageCode", languageCode);
        }
        SourceDescriptor descriptor = new SourceDescriptor(
                buildDefaultAccessContext(),
                SourceType.FILE_SYSTEM,
                path.getFileName().toString(),
                path.toAbsolutePath().toString(),
                path.getFileName().toString(),
                // probeContentType may return null; detection then falls back to the extension.
                Files.probeContentType(path),
                payload,
                null,
                OffsetDateTime.now(),
                attributes
        );
        ingestionGateway.ingest(descriptor);
    }

    /** Resolves the file path from the exchange body, falling back to the FILE_PATH header. */
    private Path resolvePath(Exchange exchange) {
        Path path = exchange.getIn().getBody(Path.class);
        if (path != null) {
            return path;
        }
        String absolutePath = exchange.getIn().getHeader(Exchange.FILE_PATH, String.class);
        if (!StringUtils.hasText(absolutePath)) {
            // Previously this was a bare NPE from Path.of(null); fail with context instead.
            throw new IllegalStateException(
                    "Exchange carries neither a Path body nor a " + Exchange.FILE_PATH + " header");
        }
        return Path.of(absolutePath);
    }

    /**
     * Builds the access context applied to filesystem imports: tenant-less when
     * no default owner tenant key is configured, otherwise owned by that tenant.
     */
    private DocumentAccessContext buildDefaultAccessContext() {
        String ownerTenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
        DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility();
        if (!StringUtils.hasText(ownerTenantKey)) {
            return new DocumentAccessContext(null, visibility);
        }
        return new DocumentAccessContext(new TenantRef(null, ownerTenantKey, ownerTenantKey), visibility);
    }
}

@ -0,0 +1,128 @@
package at.procon.dip.ingestion.controller;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.tenant.TenantRef;
import at.procon.dip.ingestion.dto.GenericImportResponse;
import at.procon.dip.ingestion.dto.GenericTextImportRequest;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.ted.config.TedProcessorProperties;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.LinkedHashMap;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.util.StringUtils;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
/**
 * REST entry points for the Phase 4 generic import pipeline: multipart file
 * upload and inline JSON text import. Both endpoints are gated behind the
 * generic-ingestion feature flags.
 *
 * <p>NOTE(review): the phase docs list these endpoints under
 * {@code /api/v1/dip/import/...} while this mapping is {@code /v1/dip/import};
 * presumably the {@code /api} prefix comes from the servlet context path — confirm.
 */
@RestController
@RequestMapping("/v1/dip/import")
@RequiredArgsConstructor
public class GenericDocumentImportController {

    private final TedProcessorProperties properties;
    private final DocumentIngestionGateway ingestionGateway;

    /**
     * Imports an uploaded file as a generic document.
     *
     * @throws IOException           if the multipart payload cannot be read
     * @throws IllegalStateException if generic REST import is disabled
     */
    @PostMapping(path = "/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
    public ResponseEntity<GenericImportResponse> upload(
            @RequestParam("file") MultipartFile file,
            @RequestParam(value = "ownerTenantKey", required = false) String ownerTenantKey,
            @RequestParam(value = "visibility", required = false) DocumentVisibility visibility,
            @RequestParam(value = "languageCode", required = false) String languageCode,
            @RequestParam(value = "title", required = false) String title,
            @RequestParam(value = "sourceIdentifier", required = false) String sourceIdentifier
    ) throws IOException {
        ensureRestUploadEnabled();
        SourceDescriptor descriptor = new SourceDescriptor(
                buildAccessContext(ownerTenantKey, visibility),
                SourceType.REST_UPLOAD,
                StringUtils.hasText(sourceIdentifier) ? sourceIdentifier : file.getOriginalFilename(),
                null,
                file.getOriginalFilename(),
                file.getContentType(),
                file.getBytes(),
                null,
                OffsetDateTime.now(),
                collectAttributes(languageCode, title)
        );
        IngestionResult result = ingestionGateway.ingest(descriptor);
        return ResponseEntity.ok(toResponse(result));
    }

    /**
     * Imports inline text supplied as a JSON request body.
     *
     * @throws IllegalStateException if generic REST import is disabled
     */
    @PostMapping(path = "/text", consumes = MediaType.APPLICATION_JSON_VALUE)
    public ResponseEntity<GenericImportResponse> importText(@RequestBody GenericTextImportRequest request) {
        ensureRestUploadEnabled();
        SourceDescriptor descriptor = new SourceDescriptor(
                buildAccessContext(request.ownerTenantKey(), request.visibility()),
                SourceType.API,
                StringUtils.hasText(request.sourceIdentifier()) ? request.sourceIdentifier() : request.fileName(),
                null,
                request.fileName(),
                request.mediaType(),
                request.text() == null ? null : request.text().getBytes(StandardCharsets.UTF_8),
                request.text(),
                OffsetDateTime.now(),
                collectAttributes(request.languageCode(), request.title())
        );
        IngestionResult result = ingestionGateway.ingest(descriptor);
        return ResponseEntity.ok(toResponse(result));
    }

    /** Collects the optional request metadata into the descriptor attribute map. */
    private Map<String, String> collectAttributes(String languageCode, String title) {
        Map<String, String> attributes = new LinkedHashMap<>();
        if (StringUtils.hasText(languageCode)) {
            attributes.put("languageCode", languageCode);
        }
        if (StringUtils.hasText(title)) {
            attributes.put("title", title);
        }
        return attributes;
    }

    /** Rejects requests when the generic-ingestion or REST-upload flags are off. */
    private void ensureRestUploadEnabled() {
        if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isRestUploadEnabled()) {
            throw new IllegalStateException("Generic REST import is disabled");
        }
    }

    /**
     * Resolves the effective access context: a blank owner tenant key yields an
     * unowned document; a missing visibility falls back to the configured default.
     */
    private DocumentAccessContext buildAccessContext(String ownerTenantKey, DocumentVisibility visibility) {
        DocumentVisibility effectiveVisibility = visibility != null
                ? visibility
                : properties.getGenericIngestion().getDefaultVisibility();
        if (!StringUtils.hasText(ownerTenantKey)) {
            return new DocumentAccessContext(null, effectiveVisibility);
        }
        return new DocumentAccessContext(new TenantRef(null, ownerTenantKey, ownerTenantKey), effectiveVisibility);
    }

    private GenericImportResponse toResponse(IngestionResult result) {
        CanonicalDocumentMetadata metadata = result.documents().stream()
                .findFirst()
                .orElseThrow(() -> new IllegalStateException("No imported document metadata returned"));
        // NOTE(review): detecting deduplication by substring-matching a warning text
        // is fragile; consider surfacing a structured flag on IngestionResult instead.
        boolean deduplicated = result.warnings().stream()
                .anyMatch(w -> w != null && w.contains("already imported"));
        return new GenericImportResponse(
                metadata.documentId(),
                metadata.title(),
                metadata.documentType(),
                metadata.documentFamily(),
                metadata.status(),
                deduplicated,
                result.warnings()
        );
    }
}

@ -0,0 +1,18 @@
package at.procon.dip.ingestion.dto;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.DocumentStatus;
import java.util.List;
import java.util.UUID;
/**
 * Response payload returned by the generic import REST endpoints.
 *
 * @param documentId     id of the imported (or pre-existing, when deduplicated) document
 * @param title          resolved document title
 * @param documentType   detected document type
 * @param documentFamily document family derived from the detected type
 * @param status         document status after the import ran
 * @param deduplicated   true when the import reported the content as already imported
 * @param warnings       non-fatal warnings collected during the import
 */
public record GenericImportResponse(
UUID documentId,
String title,
DocumentType documentType,
DocumentFamily documentFamily,
DocumentStatus status,
boolean deduplicated,
List<String> warnings
) {
}

@ -0,0 +1,15 @@
package at.procon.dip.ingestion.dto;
import at.procon.dip.domain.access.DocumentVisibility;
/**
 * Request body for the generic inline-text import endpoint.
 *
 * @param text             raw text content to import
 * @param fileName         logical file name, used for type detection and as identifier fallback
 * @param mediaType        optional media type hint for classification
 * @param ownerTenantKey   optional owning tenant key; blank or null yields an unowned document
 * @param visibility       optional visibility; null falls back to the configured default
 * @param languageCode     optional language code hint — presumably an ISO code; confirm with callers
 * @param title            optional explicit document title
 * @param sourceIdentifier optional external source identifier; falls back to the file name
 */
public record GenericTextImportRequest(
String text,
String fileName,
String mediaType,
String ownerTenantKey,
DocumentVisibility visibility,
String languageCode,
String title,
String sourceIdentifier
) {
}

@ -0,0 +1,16 @@
package at.procon.dip.ingestion.dto;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.document.entity.Document;
import java.util.List;
/**
 * Internal result of the generic Phase 4 import pipeline.
 *
 * @param document        the persisted (or pre-existing, when deduplicated) document entity
 * @param detectionResult outcome of the document type/family detection step
 * @param warnings        non-fatal issues collected during the import
 * @param deduplicated    true when the content matched an already imported document
 */
public record ImportedDocumentResult(
Document document,
DetectionResult detectionResult,
List<String> warnings,
boolean deduplicated
) {
}

@ -0,0 +1,24 @@
package at.procon.dip.ingestion.service;
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Routes a source descriptor to the first registered ingestion adapter that
 * supports it.
 */
@Service
@RequiredArgsConstructor
public class DocumentIngestionGateway {

    private final List<DocumentIngestionAdapter> adapters;

    /**
     * @throws IllegalArgumentException when no adapter supports the descriptor's source type
     */
    public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
        for (DocumentIngestionAdapter adapter : adapters) {
            if (adapter.supports(sourceDescriptor)) {
                return adapter.ingest(sourceDescriptor);
            }
        }
        throw new IllegalArgumentException(
                "No ingestion adapter registered for source type " + sourceDescriptor.sourceType());
    }
}

@ -0,0 +1,393 @@
package at.procon.dip.ingestion.service;
import at.procon.dip.classification.service.DocumentClassificationService;
import at.procon.dip.classification.spi.DetectionResult;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
import at.procon.dip.domain.document.entity.DocumentSource;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.service.DocumentContentService;
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
import at.procon.dip.domain.document.service.DocumentRepresentationService;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.document.service.DocumentSourceService;
import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
import at.procon.dip.extraction.service.DocumentExtractionService;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.util.HashUtils;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
/**
 * Phase 4 generic import pipeline that persists arbitrary document types into the DOC model.
 * <p>
 * End-to-end flow (see {@link #importDocument(SourceDescriptor)}): resolve the raw payload,
 * classify it, optionally short-circuit on a duplicate content hash, create the canonical
 * {@code Document}, persist the original and derived content rows, build text representations,
 * and queue pending embeddings when vectorization is enabled. The whole import runs in a
 * single transaction.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class GenericDocumentImportService {
// Configuration root; only the generic-ingestion and vectorization sub-sections are read here.
private final TedProcessorProperties properties;
private final DocumentRepository documentRepository;
private final DocumentSourceRepository documentSourceRepository;
// NOTE(review): appears unused in this class — confirm before removing.
private final DocumentEmbeddingRepository documentEmbeddingRepository;
private final DocumentService documentService;
private final DocumentSourceService documentSourceService;
private final DocumentContentService documentContentService;
private final DocumentRepresentationService documentRepresentationService;
private final DocumentEmbeddingService documentEmbeddingService;
private final DocumentClassificationService classificationService;
private final DocumentExtractionService extractionService;
private final TextRepresentationBuildService representationBuildService;
/**
 * Imports one source payload into the DOC model.
 *
 * @param sourceDescriptor source metadata plus inline binary/text payload (or a readable file URI)
 * @return the created — or, on hash deduplication, the pre-existing — document together with
 *         the detection result, collected warnings, and a duplicate flag
 * @throws IllegalArgumentException when no payload content can be resolved from the descriptor
 */
@Transactional
public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) {
ResolvedPayload payload = resolvePayload(sourceDescriptor);
// Classification needs a media type; substitute the inferred one when the caller omitted it.
DetectionResult detection = classificationService.detect(withResolvedMediaType(sourceDescriptor, payload));
String dedupHash = HashUtils.computeSha256(payload.binaryContent());
if (properties.getGenericIngestion().isDeduplicateByContentHash()) {
Optional<Document> existing = documentRepository.findByDedupHash(dedupHash);
if (existing.isPresent()) {
// Same bytes already imported: only link the new source, do not re-persist any content.
Document document = existing.get();
ensureSource(document, sourceDescriptor);
List<String> warnings = List.of("Document content hash already imported; linked new source to existing document");
return new ImportedDocumentResult(document, detection, warnings, true);
}
}
// Fall back to the configured default tenant/visibility when the adapter supplied no context.
DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null
? defaultAccessContext()
: sourceDescriptor.accessContext();
Document document = documentService.create(new CreateDocumentCommand(
accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey(),
accessContext.visibility(),
detection.documentType(),
detection.documentFamily(),
DocumentStatus.RECEIVED,
determineInitialTitle(sourceDescriptor, detection, payload),
null,
detection.languageCode(),
detection.mimeType(),
buildBusinessKey(sourceDescriptor),
dedupHash
));
ensureSource(document, sourceDescriptor);
documentService.updateStatus(document.getId(), DocumentStatus.CLASSIFIED);
DocumentContent originalContent = persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash);
ExtractionResult extractionResult = extractionService.extract(new ExtractionRequest(
sourceDescriptor,
detection,
payload.textContent(),
payload.binaryContent()
));
List<String> warnings = new ArrayList<>(extractionResult.warnings());
Map<ContentRole, DocumentContent> persistedDerivedContent = persistDerivedContent(document, detection, extractionResult, dedupHash);
documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);
var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult));
persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts);
Document reloaded = documentService.getRequired(document.getId());
// When no drafts were persisted the status is still EXTRACTED; advance it so the
// document always leaves this method in (at least) the REPRESENTED state.
if (reloaded.getStatus() == DocumentStatus.EXTRACTED) {
documentService.updateStatus(reloaded.getId(), DocumentStatus.REPRESENTED);
reloaded = documentService.getRequired(reloaded.getId());
}
if (!extractionResult.structuredPayloads().isEmpty()) {
applyStructuredTitleIfMissing(reloaded, extractionResult);
reloaded = documentService.getRequired(reloaded.getId());
}
return new ImportedDocumentResult(reloaded, detection, warnings, false);
}
/**
 * Returns the descriptor unchanged when it already carries a media type; otherwise copies
 * every field and substitutes the media type resolved from the payload.
 */
private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) {
if (StringUtils.hasText(sourceDescriptor.mediaType())) {
return sourceDescriptor;
}
return new SourceDescriptor(
sourceDescriptor.accessContext(),
sourceDescriptor.sourceType(),
sourceDescriptor.sourceIdentifier(),
sourceDescriptor.sourceUri(),
sourceDescriptor.fileName(),
payload.mediaType(),
sourceDescriptor.binaryContent(),
sourceDescriptor.textContent(),
sourceDescriptor.receivedAt(),
sourceDescriptor.attributes()
);
}
/**
 * Resolves the binary payload in priority order: inline binary content, inline text content
 * (encoded as UTF-8), then — last resort — reading the bytes from {@code sourceUri} when it
 * points to an existing local file.
 *
 * @throws IllegalArgumentException when no payload bytes can be obtained from any source
 */
private ResolvedPayload resolvePayload(SourceDescriptor sourceDescriptor) {
byte[] binary = sourceDescriptor.binaryContent();
String text = sourceDescriptor.textContent();
if ((binary == null || binary.length == 0) && StringUtils.hasText(text)) {
binary = text.getBytes(StandardCharsets.UTF_8);
}
if ((binary == null || binary.length == 0) && StringUtils.hasText(sourceDescriptor.sourceUri())) {
try {
java.nio.file.Path path = java.nio.file.Path.of(sourceDescriptor.sourceUri());
if (java.nio.file.Files.exists(path)) {
binary = java.nio.file.Files.readAllBytes(path);
}
} catch (Exception e) {
throw new IllegalArgumentException("Failed to read source payload from " + sourceDescriptor.sourceUri(), e);
}
}
if (binary == null || binary.length == 0) {
throw new IllegalArgumentException("No payload content available for source " + sourceDescriptor.sourceIdentifier());
}
// Derive a text view for text-like media when the caller supplied only binary content.
// NOTE(review): assumes UTF-8 for file-backed text payloads — confirm against adapters.
if (!StringUtils.hasText(text) && DocumentImportSupport.isLikelyTextMime(sourceDescriptor.mediaType())) {
text = new String(binary, StandardCharsets.UTF_8);
}
return new ResolvedPayload(binary, text, inferMediaType(sourceDescriptor));
}
/**
 * Normalizes the declared media type, or falls back to a file-extension mapping.
 * Returns {@code null} for unknown extensions so detection can apply its own fallback.
 */
private String inferMediaType(SourceDescriptor sourceDescriptor) {
if (StringUtils.hasText(sourceDescriptor.mediaType())) {
return DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType());
}
String extension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName());
return switch (extension) {
case "pdf" -> "application/pdf";
case "html", "htm" -> "text/html";
case "md", "markdown" -> "text/markdown";
case "xml" -> "application/xml";
case "txt", "log", "csv", "json", "yaml", "yml" -> "text/plain";
case "eml" -> "message/rfc822";
default -> null;
};
}
/**
 * Builds the access context from configuration: no tenant (public-style documents) when no
 * default tenant key is configured, otherwise a tenant reference keyed by the configured value.
 */
private DocumentAccessContext defaultAccessContext() {
String tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility();
if (!StringUtils.hasText(tenantKey)) {
return new DocumentAccessContext(null, visibility);
}
// Tenant key doubles as the display name here; the reference carries no persistent id.
return new DocumentAccessContext(new at.procon.dip.domain.tenant.TenantRef(null, tenantKey, tenantKey), visibility);
}
/**
 * Chooses the initial title in priority order: explicit "title" attribute, file name,
 * first non-blank line of the text payload (ellipsized to 240 chars), then the
 * document-type name as a last resort.
 */
private String determineInitialTitle(SourceDescriptor sourceDescriptor, DetectionResult detection, ResolvedPayload payload) {
if (sourceDescriptor.attributes() != null && StringUtils.hasText(sourceDescriptor.attributes().get("title"))) {
return sourceDescriptor.attributes().get("title");
}
if (StringUtils.hasText(sourceDescriptor.fileName())) {
return sourceDescriptor.fileName();
}
if (StringUtils.hasText(payload.textContent())) {
for (String line : payload.textContent().split("\\n")) {
if (StringUtils.hasText(line)) {
return DocumentImportSupport.ellipsize(line.trim(), 240);
}
}
}
return detection.documentType().name();
}
/**
 * Builds a business key of the form "sourceType:identifier" preferring the source identifier,
 * then the source URI, then a random UUID so the key is always non-null.
 */
private String buildBusinessKey(SourceDescriptor sourceDescriptor) {
if (StringUtils.hasText(sourceDescriptor.sourceIdentifier())) {
return sourceDescriptor.sourceType() + ":" + sourceDescriptor.sourceIdentifier();
}
if (StringUtils.hasText(sourceDescriptor.sourceUri())) {
return sourceDescriptor.sourceType() + ":" + sourceDescriptor.sourceUri();
}
// NOTE(review): UUID is already imported — the qualified name is redundant.
return sourceDescriptor.sourceType() + ":" + java.util.UUID.randomUUID();
}
/**
 * Links the descriptor as a source of the document unless an identical source
 * (same external id, URI, and filename — all null-tolerant) is already attached.
 */
private void ensureSource(Document document, SourceDescriptor sourceDescriptor) {
boolean alreadyLinked = documentSourceRepository.findByDocument_Id(document.getId()).stream().anyMatch(existing ->
equalsNullable(existing.getExternalSourceId(), sourceDescriptor.sourceIdentifier())
&& equalsNullable(existing.getSourceUri(), sourceDescriptor.sourceUri())
&& equalsNullable(existing.getSourceFilename(), sourceDescriptor.fileName()));
if (alreadyLinked) {
return;
}
documentSourceService.addSource(new AddDocumentSourceCommand(
document.getId(),
sourceDescriptor.sourceType(),
sourceDescriptor.sourceIdentifier(),
sourceDescriptor.sourceUri(),
sourceDescriptor.fileName(),
null,
properties.getGenericIngestion().getImportBatchId(),
sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt()
));
}
/**
 * Persists the ORIGINAL content row. Storage strategy:
 * text-like payloads go to DB_TEXT; otherwise small-enough binaries go to DB_BINARY
 * (per configuration), and anything else is stored as an EXTERNAL_REFERENCE to the source URI.
 */
private DocumentContent persistOriginalContent(Document document,
SourceDescriptor sourceDescriptor,
DetectionResult detection,
ResolvedPayload payload,
String dedupHash) {
boolean storeBinaryInDb = shouldStoreBinaryInDb(payload.binaryContent());
boolean textPreferred = DocumentImportSupport.isLikelyTextMime(detection.mimeType()) || sourceDescriptor.hasInlineTextContent();
return documentContentService.addContent(new AddDocumentContentCommand(
document.getId(),
ContentRole.ORIGINAL,
textPreferred ? StorageType.DB_TEXT : (storeBinaryInDb ? StorageType.DB_BINARY : StorageType.EXTERNAL_REFERENCE),
detection.mimeType(),
textPreferred ? StandardCharsets.UTF_8.name() : null,
textPreferred ? payload.textContent() : null,
storeBinaryInDb && !textPreferred ? payload.binaryContent() : null,
!storeBinaryInDb && !textPreferred ? sourceDescriptor.sourceUri() : null,
dedupHash,
(long) payload.binaryContent().length
));
}
// True when inline binary storage is enabled and the payload fits the configured size cap.
private boolean shouldStoreBinaryInDb(byte[] binaryContent) {
return properties.getGenericIngestion().isStoreOriginalBinaryInDb()
&& binaryContent != null
&& binaryContent.length <= properties.getGenericIngestion().getMaxBinaryBytesInDb();
}
/**
 * Persists each non-blank derived text (per content role) as a DB_TEXT content row.
 * The content hash is derived from the original's hash plus role and text so each
 * derived row hashes distinctly from the original and from its siblings.
 */
private Map<ContentRole, DocumentContent> persistDerivedContent(Document document,
DetectionResult detection,
ExtractionResult extractionResult,
String baseHash) {
Map<ContentRole, DocumentContent> result = new LinkedHashMap<>();
extractionResult.derivedTextByRole().forEach((role, text) -> {
if (!StringUtils.hasText(text)) {
return;
}
String contentHash = HashUtils.computeSha256(baseHash + ":" + role.name() + ":" + text);
DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand(
document.getId(),
role,
StorageType.DB_TEXT,
detection.mimeType(),
StandardCharsets.UTF_8.name(),
text,
null,
null,
contentHash,
(long) text.length()
));
result.put(role, content);
});
return result;
}
/**
 * Persists the representation drafts, registers the embedding model on demand, queues
 * pending embeddings for eligible drafts, and finally advances the document to REPRESENTED.
 * No-op when there are no drafts (the caller then advances the status itself).
 */
private void persistRepresentationsAndEmbeddings(Document document,
DocumentContent originalContent,
Map<ContentRole, DocumentContent> derivedContent,
List<TextRepresentationDraft> drafts) {
if (drafts == null || drafts.isEmpty()) {
return;
}
DocumentEmbeddingModel model = null;
if (properties.getVectorization().isEnabled() && properties.getVectorization().isGenericPipelineEnabled()) {
model = documentEmbeddingService.registerModel(new RegisterEmbeddingModelCommand(
properties.getVectorization().getModelName(),
properties.getVectorization().getEmbeddingProvider(),
properties.getVectorization().getModelName(),
properties.getVectorization().getDimensions(),
null,
false,
true
));
}
for (TextRepresentationDraft draft : drafts) {
if (!StringUtils.hasText(draft.textBody())) {
continue;
}
// Every representation type currently links to the normalized text when present,
// falling back to the original content row.
DocumentContent linkedContent = switch (draft.representationType()) {
case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK ->
derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
};
var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
document.getId(),
linkedContent == null ? null : linkedContent.getId(),
draft.representationType(),
"phase4-generic-builder",
draft.languageCode(),
null,
draft.chunkIndex(),
null,
null,
draft.primary(),
draft.textBody()
));
if (model != null && shouldQueueEmbedding(draft)) {
documentEmbeddingService.ensurePendingEmbedding(document.getId(), representation.getId(), model.getId());
}
}
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
}
// When the "primary only" flag is set, queue only primary drafts; otherwise queue all.
private boolean shouldQueueEmbedding(TextRepresentationDraft draft) {
return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true;
}
/**
 * Replaces a missing/placeholder title (blank, or equal to the document-type name) with the
 * first non-blank "title" attribute found in the structured extraction payloads.
 */
private void applyStructuredTitleIfMissing(Document document, ExtractionResult extractionResult) {
boolean missingTitle = !StringUtils.hasText(document.getTitle()) || document.getTitle().equals(document.getDocumentType().name());
if (!missingTitle) {
return;
}
for (var payload : extractionResult.structuredPayloads()) {
if (payload.attributes() == null) {
continue;
}
Object title = payload.attributes().get("title");
if (title instanceof String titleValue && StringUtils.hasText(titleValue)) {
document.setTitle(titleValue);
documentService.save(document);
return;
}
}
}
// Null-safe string equality used for source matching in ensureSource.
private boolean equalsNullable(String left, String right) {
return java.util.Objects.equals(left, right);
}
/**
 * Internal carrier for the resolved payload: raw bytes, an optional text view,
 * and the (possibly inferred) media type.
 */
private record ResolvedPayload(byte[] binaryContent, String textContent, String mediaType) {
}
}

@ -2,10 +2,14 @@ package at.procon.dip.ingestion.spi;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.document.SourceType;
import java.time.OffsetDateTime;
import java.util.Map;
/**
* Describes a source object that should be ingested into the canonical document model.
* <p>
* Phase 4 extends the descriptor with optional inline payload fields so adapters can support
* both file-backed imports and direct upload/API ingestion without introducing separate DTO trees.
*/
public record SourceDescriptor(
DocumentAccessContext accessContext,
@ -14,6 +18,17 @@ public record SourceDescriptor(
String sourceUri,
String fileName,
String mediaType,
byte[] binaryContent,
String textContent,
OffsetDateTime receivedAt,
Map<String, String> attributes
) {
public boolean hasInlineBinaryContent() {
return binaryContent != null && binaryContent.length > 0;
}
public boolean hasInlineTextContent() {
return textContent != null && !textContent.isBlank();
}
}

@ -0,0 +1,84 @@
package at.procon.dip.ingestion.util;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Map;
import org.springframework.util.StringUtils;
/**
 * Shared utility methods for Phase 4 generic ingestion: file-extension and media-type
 * helpers, document-family mapping, and small string utilities. Stateless and thread-safe.
 */
public final class DocumentImportSupport {

    private DocumentImportSupport() {
        // Static utility holder - no instances.
    }

    /**
     * Returns the lower-cased extension of {@code fileName} (text after the last dot),
     * or an empty string when the name is blank or contains no dot.
     *
     * @param fileName file name, may be {@code null}
     * @return lower-case extension without the dot, never {@code null}
     */
    public static String extensionOf(String fileName) {
        if (!StringUtils.hasText(fileName) || !fileName.contains(".")) {
            return "";
        }
        return fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase(Locale.ROOT);
    }

    /**
     * Heuristically decides whether a media type denotes textual content
     * (text/*, JSON, XML, JavaScript, or XHTML).
     *
     * @param mediaType raw media type, possibly with parameters; may be {@code null}
     * @return {@code true} when the normalized type looks text-like
     */
    public static boolean isLikelyTextMime(String mediaType) {
        if (!StringUtils.hasText(mediaType)) {
            return false;
        }
        String normalized = normalizeMediaType(mediaType);
        return normalized.startsWith("text/")
                || normalized.contains("json")
                || normalized.contains("xml")
                || normalized.contains("javascript")
                || normalized.equals("application/xhtml+xml");
    }

    /**
     * Strips media-type parameters (anything after ';'), trims, and lower-cases.
     *
     * @param mediaType raw media type; may be {@code null} or blank
     * @return normalized type, or {@code null} when the input is blank
     */
    public static String normalizeMediaType(String mediaType) {
        if (!StringUtils.hasText(mediaType)) {
            return null;
        }
        int idx = mediaType.indexOf(';');
        String result = idx >= 0 ? mediaType.substring(0, idx) : mediaType;
        return result.trim().toLowerCase(Locale.ROOT);
    }

    /**
     * Maps a document type to its family; exhaustive over {@link DocumentType},
     * so adding a new type is a compile error until it is classified here.
     */
    public static DocumentFamily familyFor(DocumentType documentType) {
        return switch (documentType) {
            case TED_NOTICE -> DocumentFamily.PROCUREMENT;
            case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL;
            case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN ->
                    DocumentFamily.GENERIC;
        };
    }

    /**
     * Decodes bytes as UTF-8 text; returns {@code null} for {@code null} input.
     * Invalid sequences are replaced per the UTF-8 charset decoder, never thrown.
     */
    public static String safeUtf8(byte[] bytes) {
        if (bytes == null) {
            return null;
        }
        return new String(bytes, StandardCharsets.UTF_8);
    }

    /**
     * Returns the first non-blank value found under the given keys, or {@code null}
     * when the map is {@code null} or no key yields a non-blank value.
     */
    public static String firstNonBlank(Map<String, String> values, String... keys) {
        if (values == null) {
            // Hoisted out of the loop: a null map can never produce a value.
            return null;
        }
        for (String key : keys) {
            String value = values.get(key);
            if (StringUtils.hasText(value)) {
                return value;
            }
        }
        return null;
    }

    /**
     * Truncates {@code text} to at most {@code maxLength} characters, appending "..."
     * when room allows. For {@code maxLength <= 3} the text is hard-truncated without
     * an ellipsis. Callers must pass a non-negative {@code maxLength}.
     *
     * @param text input text; may be {@code null} (returned as-is)
     * @param maxLength maximum result length in characters, expected {@code >= 0}
     */
    public static String ellipsize(String text, int maxLength) {
        if (text == null || text.length() <= maxLength) {
            return text;
        }
        if (maxLength <= 3) {
            return text.substring(0, maxLength);
        }
        return text.substring(0, maxLength - 3) + "...";
    }
}

@ -0,0 +1,86 @@
package at.procon.dip.normalization.impl;
import at.procon.dip.domain.document.ContentRole;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
/**
 * Fallback text-representation builder for all non-TED document types.
 * Emits a FULLTEXT draft, a primary SEMANTIC_TEXT draft, and — when a title is
 * available — a TITLE_ABSTRACT draft, all derived from the extracted text.
 */
@Component
public class DefaultGenericTextRepresentationBuilder implements TextRepresentationBuilder {

    @Override
    public boolean supports(DocumentType documentType) {
        // Everything except TED notices, which have their own dedicated builder.
        return DocumentType.TED_NOTICE != documentType;
    }

    @Override
    public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
        var derived = request.extractionResult().derivedTextByRole();
        String normalized = derived.get(ContentRole.NORMALIZED_TEXT);
        String cleanedHtml = derived.get(ContentRole.HTML_CLEAN);
        // Prefer the normalized text; fall back to the cleaned HTML text.
        String body = StringUtils.hasText(normalized) ? normalized : cleanedHtml;
        if (!StringUtils.hasText(body)) {
            return List.of();
        }

        String title = findStructuredTitle(request).orElse(request.sourceDescriptor().fileName());
        String summary = DocumentImportSupport.ellipsize(body.replace('\n', ' ').trim(), 1200);
        String language = request.detectionResult().languageCode();

        List<TextRepresentationDraft> result = new ArrayList<>();
        result.add(new TextRepresentationDraft(RepresentationType.FULLTEXT, language, body, false, null));
        result.add(new TextRepresentationDraft(
                RepresentationType.SEMANTIC_TEXT,
                language,
                buildSemanticText(title, summary, request.detectionResult().documentType()),
                true,
                null));
        if (StringUtils.hasText(title)) {
            result.add(new TextRepresentationDraft(
                    RepresentationType.TITLE_ABSTRACT,
                    language,
                    title + "\n\n" + summary,
                    false,
                    null));
        }
        return result;
    }

    /** First non-blank String "title" attribute across the structured payloads, if any. */
    private Optional<String> findStructuredTitle(RepresentationBuildRequest request) {
        for (ExtractedStructuredPayload payload : request.extractionResult().structuredPayloads()) {
            var attributes = payload.attributes();
            if (attributes == null) {
                continue;
            }
            Object candidate = attributes.get("title");
            if (candidate instanceof String text && StringUtils.hasText(text)) {
                return Optional.of(text);
            }
        }
        return Optional.empty();
    }

    /** Assembles the semantic text: type line, optional title line, then the summary. */
    private String buildSemanticText(String title, String summary, DocumentType documentType) {
        StringBuilder text = new StringBuilder("Document type: ").append(documentType.name()).append('\n');
        if (StringUtils.hasText(title)) {
            text.append("Title: ").append(title.trim()).append("\n\n");
        }
        return text.append(summary).toString().trim();
    }
}

@ -0,0 +1,24 @@
package at.procon.dip.normalization.service;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Dispatches representation building to the first registered builder that supports
 * the detected document type (builders are consulted in injection order).
 */
@Service
@RequiredArgsConstructor
public class TextRepresentationBuildService {

    private final List<TextRepresentationBuilder> builders;

    /**
     * Builds the text-representation drafts for the request.
     *
     * @throws IllegalStateException when no builder supports the detected document type
     */
    public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
        var documentType = request.detectionResult().documentType();
        for (TextRepresentationBuilder candidate : builders) {
            if (candidate.supports(documentType)) {
                return candidate.build(request);
            }
        }
        throw new IllegalStateException(
                "No text representation builder registered for " + documentType);
    }
}

@ -28,6 +28,7 @@ public class TedProcessorProperties {
private MailProperties mail = new MailProperties();
private SolutionBriefProperties solutionBrief = new SolutionBriefProperties();
private ProjectionProperties projection = new ProjectionProperties();
private GenericIngestionProperties genericIngestion = new GenericIngestionProperties();
/**
* Input directory configuration for Apache Camel file consumer.
@ -484,4 +485,101 @@ public class TedProcessorProperties {
*/
private String idempotentRepository = "./solution-brief-processed.dat";
}
/**
 * Phase 4 generic ingestion configuration.
 * <p>
 * Bound under the {@code generic-ingestion} section of the processor properties
 * (presumably prefix {@code ted.generic-ingestion} — see application config).
 */
@Data
public static class GenericIngestionProperties {
/**
 * Master switch for the generic ingestion pipeline. Disabled by default so the
 * feature is opt-in per environment.
 */
private boolean enabled = false;
/**
 * Enable/disable filesystem import route for arbitrary documents.
 */
private boolean fileSystemEnabled = false;
/**
 * Enable/disable REST/API upload endpoints for arbitrary documents.
 */
private boolean restUploadEnabled = true;
/**
 * Input directory for the generic filesystem importer.
 * NOTE(review): Windows-specific default path — confirm for non-Windows deployments.
 */
private String inputDirectory = "D:/ted.europe/generic-input";
/**
 * Regular-expression file pattern used by the Camel file route to select input files.
 */
private String filePattern = ".*\\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$";
/**
 * Directory where successfully imported files are moved (relative to the input directory).
 */
private String processedDirectory = ".dip-processed";
/**
 * Directory where failed files are moved (relative to the input directory).
 */
private String errorDirectory = ".dip-error";
/**
 * Polling interval in milliseconds for the filesystem route.
 */
@Positive
private long pollInterval = 15000;
/**
 * Maximum number of files consumed per poll.
 */
@Positive
private int maxMessagesPerPoll = 10;
/**
 * Default owner tenant for files imported through the generic route.
 * Leave unset to import documents without an owner tenant (public-style documents).
 */
private String defaultOwnerTenantKey;
/**
 * Default visibility for generic imports when not supplied explicitly.
 */
private at.procon.dip.domain.access.DocumentVisibility defaultVisibility = at.procon.dip.domain.access.DocumentVisibility.PUBLIC;
/**
 * Optional default language code applied to filesystem imports.
 */
private String defaultLanguageCode;
/**
 * Persist original binary payloads in DB when they are small enough
 * (see {@code maxBinaryBytesInDb}); otherwise an external reference is stored.
 */
private boolean storeOriginalBinaryInDb = true;
/**
 * Maximum binary size (bytes) stored inline in DOC.doc_content.binary_content.
 * Default is 5 MiB.
 */
@Positive
private int maxBinaryBytesInDb = 5242880;
/**
 * Whether an already imported content hash should resolve to the existing document
 * (the new source is then linked instead of creating a duplicate document).
 */
private boolean deduplicateByContentHash = true;
/**
 * Queue only the primary text representation for embedding; when false, all
 * non-blank representations are queued.
 */
private boolean vectorizePrimaryRepresentationOnly = true;
/**
 * Import batch identifier written to DOC.doc_source.import_batch_id.
 */
@NotBlank
private String importBatchId = "phase4-generic";
}
}

@ -204,6 +204,43 @@ ted:
# Maximum number of legacy TED documents to backfill during startup
startup-backfill-limit: 250
# Phase 4 generic ingestion configuration
generic-ingestion:
# Master switch for arbitrary document ingestion into the DOC model
enabled: false
# Enable file-system polling for non-TED documents
file-system-enabled: false
# Allow REST/API upload endpoints for arbitrary documents
rest-upload-enabled: true
# Input directory for the generic Camel file route
input-directory: D:/ted.europe/generic-input
# Regex for files accepted by the generic file route
file-pattern: .*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$
# Move successfully processed files here
processed-directory: .dip-processed
# Move failed files here
error-directory: .dip-error
# Polling interval for the generic route
poll-interval: 15000
# Maximum files per poll
max-messages-per-poll: 10
# Optional default owner tenant key; leave empty for public documents (e.g. TED notices or shared knowledge documents)
default-owner-tenant-key:
# Default visibility when no explicit access context is provided
default-visibility: PUBLIC
# Optional default language for filesystem imports
default-language-code:
# Store small binary originals in DOC.doc_content.binary_content
store-original-binary-in-db: true
# Maximum binary payload size persisted inline in DB
max-binary-bytes-in-db: 5242880
# Deduplicate by content hash and attach additional sources to the same canonical document
deduplicate-by-content-hash: true
# Queue only the primary text representation for vectorization
vectorize-primary-representation-only: true
# Import batch marker written to DOC.doc_source.import_batch_id
import-batch-id: phase4-generic
# Solution Brief processing configuration
solution-brief:
# Enable/disable Solution Brief processing

@ -0,0 +1,9 @@
-- Phase 4: Generic ingestion support for arbitrary document types.
-- Add binary payload storage to the generic DOC content table so non-text originals
-- such as PDFs can be stored directly in the database when configured.
-- Idempotent: the IF NOT EXISTS guards make re-running this migration safe.
ALTER TABLE DOC.doc_content
ADD COLUMN IF NOT EXISTS binary_content BYTEA;
-- Composite index supporting lookups of a document's content rows filtered by role,
-- ordered newest-first via created_at DESC.
CREATE INDEX IF NOT EXISTS idx_doc_content_document_role
ON DOC.doc_content(document_id, content_role, created_at DESC);
Loading…
Cancel
Save