attributes
+) {
+}
diff --git a/src/main/java/at/procon/dip/classification/spi/DocumentTypeDetector.java b/src/main/java/at/procon/dip/classification/spi/DocumentTypeDetector.java
new file mode 100644
index 0000000..712938a
--- /dev/null
+++ b/src/main/java/at/procon/dip/classification/spi/DocumentTypeDetector.java
@@ -0,0 +1,13 @@
+package at.procon.dip.classification.spi;
+
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+
+/**
+ * Determines a canonical type/family before extraction starts.
+ */
+public interface DocumentTypeDetector {
+
+ boolean supports(SourceDescriptor sourceDescriptor);
+
+ DetectionResult detect(SourceDescriptor sourceDescriptor);
+}
diff --git a/src/main/java/at/procon/dip/domain/access/DocumentAccessContext.java b/src/main/java/at/procon/dip/domain/access/DocumentAccessContext.java
new file mode 100644
index 0000000..1e79063
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/access/DocumentAccessContext.java
@@ -0,0 +1,31 @@
+package at.procon.dip.domain.access;
+
+import at.procon.dip.domain.tenant.TenantRef;
+import java.util.Objects;
+
+/**
+ * Canonical ownership and visibility descriptor for a document.
+ *
+ * A document may have no owner tenant, for example public TED notices.
+ * Visibility is always mandatory and defines who may search/read the document.
+ */
+public record DocumentAccessContext(
+ TenantRef ownerTenant,
+ DocumentVisibility visibility
+) {
+
+ public DocumentAccessContext {
+ Objects.requireNonNull(visibility, "visibility must not be null");
+ }
+
+ public static DocumentAccessContext publicDocument() {
+ return new DocumentAccessContext(null, DocumentVisibility.PUBLIC);
+ }
+
+ public static DocumentAccessContext tenantOwned(TenantRef ownerTenant) {
+ return new DocumentAccessContext(
+ Objects.requireNonNull(ownerTenant, "ownerTenant must not be null"),
+ DocumentVisibility.TENANT
+ );
+ }
+}
diff --git a/src/main/java/at/procon/dip/domain/access/DocumentVisibility.java b/src/main/java/at/procon/dip/domain/access/DocumentVisibility.java
new file mode 100644
index 0000000..a8ecb27
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/access/DocumentVisibility.java
@@ -0,0 +1,11 @@
+package at.procon.dip.domain.access;
+
+/**
+ * Describes who may access a document independently from ownership.
+ */
+public enum DocumentVisibility {
+ PUBLIC,
+ TENANT,
+ SHARED,
+ RESTRICTED
+}
diff --git a/src/main/java/at/procon/dip/domain/document/CanonicalDocumentMetadata.java b/src/main/java/at/procon/dip/domain/document/CanonicalDocumentMetadata.java
new file mode 100644
index 0000000..7cad293
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/CanonicalDocumentMetadata.java
@@ -0,0 +1,23 @@
+package at.procon.dip.domain.document;
+
+import at.procon.dip.domain.access.DocumentAccessContext;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+
+/**
+ * Minimal canonical document descriptor used by Phase 0 SPI contracts.
+ */
+public record CanonicalDocumentMetadata(
+ UUID documentId,
+ DocumentAccessContext accessContext,
+ DocumentType documentType,
+ DocumentFamily documentFamily,
+ DocumentStatus status,
+ String title,
+ String languageCode,
+ String mimeType,
+ String dedupHash,
+ OffsetDateTime createdAt,
+ OffsetDateTime updatedAt
+) {
+}
diff --git a/src/main/java/at/procon/dip/domain/document/ContentRole.java b/src/main/java/at/procon/dip/domain/document/ContentRole.java
new file mode 100644
index 0000000..dcdfba6
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/ContentRole.java
@@ -0,0 +1,14 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Role of a stored content version.
+ */
+public enum ContentRole {
+ ORIGINAL,
+ NORMALIZED_TEXT,
+ OCR_TEXT,
+ HTML_CLEAN,
+ EXTRACTED_METADATA_JSON,
+ THUMBNAIL,
+ DERIVED_BINARY
+}
diff --git a/src/main/java/at/procon/dip/domain/document/DistanceMetric.java b/src/main/java/at/procon/dip/domain/document/DistanceMetric.java
new file mode 100644
index 0000000..0bb8d68
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/DistanceMetric.java
@@ -0,0 +1,10 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Distance metric used by an embedding model.
+ */
+public enum DistanceMetric {
+ COSINE,
+ L2,
+ INNER_PRODUCT
+}
diff --git a/src/main/java/at/procon/dip/domain/document/DocumentFamily.java b/src/main/java/at/procon/dip/domain/document/DocumentFamily.java
new file mode 100644
index 0000000..790fe33
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/DocumentFamily.java
@@ -0,0 +1,12 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Functional grouping used for broad search and routing decisions.
+ */
+public enum DocumentFamily {
+ PROCUREMENT,
+ MAIL,
+ ATTACHMENT,
+ KNOWLEDGE,
+ GENERIC
+}
diff --git a/src/main/java/at/procon/dip/domain/document/DocumentStatus.java b/src/main/java/at/procon/dip/domain/document/DocumentStatus.java
new file mode 100644
index 0000000..b6ddddd
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/DocumentStatus.java
@@ -0,0 +1,14 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Generic lifecycle state for a canonical document.
+ */
+public enum DocumentStatus {
+ RECEIVED,
+ CLASSIFIED,
+ EXTRACTED,
+ REPRESENTED,
+ INDEXED,
+ FAILED,
+ ARCHIVED
+}
diff --git a/src/main/java/at/procon/dip/domain/document/DocumentType.java b/src/main/java/at/procon/dip/domain/document/DocumentType.java
new file mode 100644
index 0000000..f6a651b
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/DocumentType.java
@@ -0,0 +1,19 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Canonical technical document type.
+ */
+public enum DocumentType {
+ TED_NOTICE,
+ EMAIL,
+ MIME_MESSAGE,
+ PDF,
+ DOCX,
+ HTML,
+ XML_GENERIC,
+ TEXT,
+ MARKDOWN,
+ ZIP_ARCHIVE,
+ GENERIC_BINARY,
+ UNKNOWN
+}
diff --git a/src/main/java/at/procon/dip/domain/document/EmbeddingStatus.java b/src/main/java/at/procon/dip/domain/document/EmbeddingStatus.java
new file mode 100644
index 0000000..894ce3a
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/EmbeddingStatus.java
@@ -0,0 +1,12 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Generic lifecycle state of an embedding record in the DOC schema.
+ */
+public enum EmbeddingStatus {
+ PENDING,
+ PROCESSING,
+ COMPLETED,
+ FAILED,
+ SKIPPED
+}
diff --git a/src/main/java/at/procon/dip/domain/document/RelationType.java b/src/main/java/at/procon/dip/domain/document/RelationType.java
new file mode 100644
index 0000000..1759192
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/RelationType.java
@@ -0,0 +1,14 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Logical relationship between canonical documents.
+ */
+public enum RelationType {
+ CONTAINS,
+ ATTACHMENT_OF,
+ EXTRACTED_FROM,
+ DERIVED_FROM,
+ PART_OF,
+ VERSION_OF,
+ RELATED_TO
+}
diff --git a/src/main/java/at/procon/dip/domain/document/RepresentationType.java b/src/main/java/at/procon/dip/domain/document/RepresentationType.java
new file mode 100644
index 0000000..0cc7b3e
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/RepresentationType.java
@@ -0,0 +1,13 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Search-oriented text representation that can be embedded independently.
+ */
+public enum RepresentationType {
+ FULLTEXT,
+ SEMANTIC_TEXT,
+ SUMMARY,
+ TITLE_ABSTRACT,
+ CHUNK,
+ METADATA_ENRICHED
+}
diff --git a/src/main/java/at/procon/dip/domain/document/SourceType.java b/src/main/java/at/procon/dip/domain/document/SourceType.java
new file mode 100644
index 0000000..d53b841
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/SourceType.java
@@ -0,0 +1,15 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Provenance of an imported document.
+ */
+public enum SourceType {
+ TED_PACKAGE,
+ MAIL,
+ FILE_SYSTEM,
+ REST_UPLOAD,
+ MANUAL_UPLOAD,
+ ZIP_CHILD,
+ API,
+ MIGRATION
+}
diff --git a/src/main/java/at/procon/dip/domain/document/StorageType.java b/src/main/java/at/procon/dip/domain/document/StorageType.java
new file mode 100644
index 0000000..0ee68a0
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/StorageType.java
@@ -0,0 +1,12 @@
+package at.procon.dip.domain.document;
+
+/**
+ * Physical storage strategy for content.
+ */
+public enum StorageType {
+ DB_TEXT,
+ DB_BINARY,
+ FILE_PATH,
+ OBJECT_STORAGE,
+ EXTERNAL_REFERENCE
+}
diff --git a/src/main/java/at/procon/dip/domain/document/entity/Document.java b/src/main/java/at/procon/dip/domain/document/entity/Document.java
new file mode 100644
index 0000000..2d71e94
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/entity/Document.java
@@ -0,0 +1,133 @@
+package at.procon.dip.domain.document.entity;
+
+import at.procon.dip.architecture.SchemaNames;
+import at.procon.dip.domain.access.DocumentAccessContext;
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.CanonicalDocumentMetadata;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentStatus;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.tenant.entity.DocumentTenant;
+import jakarta.persistence.Column;
+import jakarta.persistence.Entity;
+import jakarta.persistence.EnumType;
+import jakarta.persistence.Enumerated;
+import jakarta.persistence.FetchType;
+import jakarta.persistence.GeneratedValue;
+import jakarta.persistence.GenerationType;
+import jakarta.persistence.Id;
+import jakarta.persistence.Index;
+import jakarta.persistence.JoinColumn;
+import jakarta.persistence.ManyToOne;
+import jakarta.persistence.PrePersist;
+import jakarta.persistence.PreUpdate;
+import jakarta.persistence.Table;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Canonical document root entity for the generalized DOC schema.
+ */
+@Entity
+@Table(schema = SchemaNames.DOC, name = "doc_document", indexes = {
+ @Index(name = "idx_doc_document_type", columnList = "document_type"),
+ @Index(name = "idx_doc_document_family", columnList = "document_family"),
+ @Index(name = "idx_doc_document_status", columnList = "status"),
+ @Index(name = "idx_doc_document_visibility", columnList = "visibility"),
+ @Index(name = "idx_doc_document_owner_tenant", columnList = "owner_tenant_id"),
+ @Index(name = "idx_doc_document_dedup_hash", columnList = "dedup_hash"),
+ @Index(name = "idx_doc_document_business_key", columnList = "business_key"),
+ @Index(name = "idx_doc_document_created_at", columnList = "created_at")
+})
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class Document {
+
+ @Id
+ @GeneratedValue(strategy = GenerationType.UUID)
+ private UUID id;
+
+ @ManyToOne(fetch = FetchType.LAZY)
+ @JoinColumn(name = "owner_tenant_id")
+ private DocumentTenant ownerTenant;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "visibility", nullable = false, length = 32)
+ @Builder.Default
+ private DocumentVisibility visibility = DocumentVisibility.PUBLIC;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "document_type", nullable = false, length = 64)
+ private DocumentType documentType;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "document_family", nullable = false, length = 64)
+ private DocumentFamily documentFamily;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "status", nullable = false, length = 32)
+ @Builder.Default
+ private DocumentStatus status = DocumentStatus.RECEIVED;
+
+ @Column(name = "title", length = 1000)
+ private String title;
+
+ @Column(name = "summary", columnDefinition = "TEXT")
+ private String summary;
+
+ @Column(name = "language_code", length = 16)
+ private String languageCode;
+
+ @Column(name = "mime_type", length = 255)
+ private String mimeType;
+
+ @Column(name = "business_key", length = 255)
+ private String businessKey;
+
+ @Column(name = "dedup_hash", length = 64)
+ private String dedupHash;
+
+ @Builder.Default
+ @Column(name = "created_at", nullable = false, updatable = false)
+ private OffsetDateTime createdAt = OffsetDateTime.now();
+
+ @Builder.Default
+ @Column(name = "updated_at", nullable = false)
+ private OffsetDateTime updatedAt = OffsetDateTime.now();
+
+ @PrePersist
+ protected void onCreate() {
+ createdAt = OffsetDateTime.now();
+ updatedAt = OffsetDateTime.now();
+ }
+
+ @PreUpdate
+ protected void onUpdate() {
+ updatedAt = OffsetDateTime.now();
+ }
+
+ public CanonicalDocumentMetadata toCanonicalMetadata() {
+ return new CanonicalDocumentMetadata(
+ id,
+ new DocumentAccessContext(ownerTenant == null ? null : new at.procon.dip.domain.tenant.TenantRef(
+ ownerTenant.getId().toString(), ownerTenant.getTenantKey(), ownerTenant.getDisplayName()), visibility),
+ documentType,
+ documentFamily,
+ status,
+ title,
+ languageCode,
+ mimeType,
+ dedupHash,
+ createdAt,
+ updatedAt
+ );
+ }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java
new file mode 100644
index 0000000..af746bf
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java
@@ -0,0 +1,86 @@
+package at.procon.dip.domain.document.entity;
+
+import at.procon.dip.architecture.SchemaNames;
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.StorageType;
+import jakarta.persistence.Column;
+import jakarta.persistence.Entity;
+import jakarta.persistence.EnumType;
+import jakarta.persistence.Enumerated;
+import jakarta.persistence.FetchType;
+import jakarta.persistence.GeneratedValue;
+import jakarta.persistence.GenerationType;
+import jakarta.persistence.Id;
+import jakarta.persistence.Index;
+import jakarta.persistence.JoinColumn;
+import jakarta.persistence.ManyToOne;
+import jakarta.persistence.PrePersist;
+import jakarta.persistence.Table;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Stored payload variant for a canonical document.
+ */
+@Entity
+@Table(schema = SchemaNames.DOC, name = "doc_content", indexes = {
+ @Index(name = "idx_doc_content_document", columnList = "document_id"),
+ @Index(name = "idx_doc_content_role", columnList = "content_role"),
+ @Index(name = "idx_doc_content_hash", columnList = "content_hash"),
+ @Index(name = "idx_doc_content_storage_type", columnList = "storage_type")
+})
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class DocumentContent {
+
+ @Id
+ @GeneratedValue(strategy = GenerationType.UUID)
+ private UUID id;
+
+ @ManyToOne(fetch = FetchType.LAZY, optional = false)
+ @JoinColumn(name = "document_id", nullable = false)
+ private Document document;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "content_role", nullable = false, length = 64)
+ private ContentRole contentRole;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "storage_type", nullable = false, length = 64)
+ private StorageType storageType;
+
+ @Column(name = "mime_type", length = 255)
+ private String mimeType;
+
+ @Column(name = "charset_name", length = 120)
+ private String charsetName;
+
+ @Column(name = "text_content", columnDefinition = "TEXT")
+ private String textContent;
+
+ @Column(name = "binary_ref", columnDefinition = "TEXT")
+ private String binaryRef;
+
+ @Column(name = "content_hash", length = 64)
+ private String contentHash;
+
+ @Column(name = "size_bytes")
+ private Long sizeBytes;
+
+ @Builder.Default
+ @Column(name = "created_at", nullable = false, updatable = false)
+ private OffsetDateTime createdAt = OffsetDateTime.now();
+
+ @PrePersist
+ protected void onCreate() {
+ createdAt = OffsetDateTime.now();
+ }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java
new file mode 100644
index 0000000..07797d1
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java
@@ -0,0 +1,103 @@
+package at.procon.dip.domain.document.entity;
+
+import at.procon.dip.architecture.SchemaNames;
+import at.procon.dip.domain.document.EmbeddingStatus;
+import jakarta.persistence.Column;
+import jakarta.persistence.Entity;
+import jakarta.persistence.EnumType;
+import jakarta.persistence.Enumerated;
+import jakarta.persistence.FetchType;
+import jakarta.persistence.GeneratedValue;
+import jakarta.persistence.GenerationType;
+import jakarta.persistence.Id;
+import jakarta.persistence.Index;
+import jakarta.persistence.JoinColumn;
+import jakarta.persistence.ManyToOne;
+import jakarta.persistence.PrePersist;
+import jakarta.persistence.PreUpdate;
+import jakarta.persistence.Table;
+import jakarta.persistence.Transient;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Generic vectorization record separated from the canonical document structure.
+ *
+ * The actual pgvector payload is persisted in the {@code embedding_vector} column via native SQL
+ * in later phases. The transient field exists only as a convenient in-memory carrier.
+ */
+@Entity
+@Table(schema = SchemaNames.DOC, name = "doc_embedding", indexes = {
+ @Index(name = "idx_doc_embedding_document", columnList = "document_id"),
+ @Index(name = "idx_doc_embedding_repr", columnList = "representation_id"),
+ @Index(name = "idx_doc_embedding_model", columnList = "model_id"),
+ @Index(name = "idx_doc_embedding_status", columnList = "embedding_status"),
+ @Index(name = "idx_doc_embedding_embedded_at", columnList = "embedded_at")
+})
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class DocumentEmbedding {
+
+ @Id
+ @GeneratedValue(strategy = GenerationType.UUID)
+ private UUID id;
+
+ @ManyToOne(fetch = FetchType.LAZY, optional = false)
+ @JoinColumn(name = "document_id", nullable = false)
+ private Document document;
+
+ @ManyToOne(fetch = FetchType.LAZY, optional = false)
+ @JoinColumn(name = "representation_id", nullable = false)
+ private DocumentTextRepresentation representation;
+
+ @ManyToOne(fetch = FetchType.LAZY, optional = false)
+ @JoinColumn(name = "model_id", nullable = false)
+ private DocumentEmbeddingModel model;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "embedding_status", nullable = false, length = 32)
+ @Builder.Default
+ private EmbeddingStatus embeddingStatus = EmbeddingStatus.PENDING;
+
+ @Column(name = "token_count")
+ private Integer tokenCount;
+
+ @Column(name = "embedding_dimensions")
+ private Integer embeddingDimensions;
+
+ @Column(name = "error_message", columnDefinition = "TEXT")
+ private String errorMessage;
+
+ @Column(name = "embedded_at")
+ private OffsetDateTime embeddedAt;
+
+ @Builder.Default
+ @Column(name = "created_at", nullable = false, updatable = false)
+ private OffsetDateTime createdAt = OffsetDateTime.now();
+
+ @Builder.Default
+ @Column(name = "updated_at", nullable = false)
+ private OffsetDateTime updatedAt = OffsetDateTime.now();
+
+ @Transient
+ private float[] embeddingVector;
+
+ @PrePersist
+ protected void onCreate() {
+ createdAt = OffsetDateTime.now();
+ updatedAt = OffsetDateTime.now();
+ }
+
+ @PreUpdate
+ protected void onUpdate() {
+ updatedAt = OffsetDateTime.now();
+ }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingModel.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingModel.java
new file mode 100644
index 0000000..9f49f35
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingModel.java
@@ -0,0 +1,86 @@
+package at.procon.dip.domain.document.entity;
+
+import at.procon.dip.architecture.SchemaNames;
+import at.procon.dip.domain.document.DistanceMetric;
+import jakarta.persistence.Column;
+import jakarta.persistence.Entity;
+import jakarta.persistence.EnumType;
+import jakarta.persistence.Enumerated;
+import jakarta.persistence.GeneratedValue;
+import jakarta.persistence.GenerationType;
+import jakarta.persistence.Id;
+import jakarta.persistence.Index;
+import jakarta.persistence.PrePersist;
+import jakarta.persistence.PreUpdate;
+import jakarta.persistence.Table;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Embedding model catalog row used by generic vectorization.
+ */
+@Entity
+@Table(schema = SchemaNames.DOC, name = "doc_embedding_model", indexes = {
+ @Index(name = "idx_doc_embedding_model_key", columnList = "model_key", unique = true),
+ @Index(name = "idx_doc_embedding_model_active", columnList = "active")
+})
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class DocumentEmbeddingModel {
+
+ @Id
+ @GeneratedValue(strategy = GenerationType.UUID)
+ private UUID id;
+
+ @Column(name = "model_key", nullable = false, unique = true, length = 255)
+ private String modelKey;
+
+ @Column(name = "provider", nullable = false, length = 120)
+ private String provider;
+
+ @Column(name = "display_name", length = 255)
+ private String displayName;
+
+ @Column(name = "dimensions", nullable = false)
+ private Integer dimensions;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "distance_metric", nullable = false, length = 32)
+ @Builder.Default
+ private DistanceMetric distanceMetric = DistanceMetric.COSINE;
+
+ @Builder.Default
+ @Column(name = "query_prefix_required", nullable = false)
+ private boolean queryPrefixRequired = false;
+
+ @Builder.Default
+ @Column(name = "active", nullable = false)
+ private boolean active = true;
+
+ @Builder.Default
+ @Column(name = "created_at", nullable = false, updatable = false)
+ private OffsetDateTime createdAt = OffsetDateTime.now();
+
+ @Builder.Default
+ @Column(name = "updated_at", nullable = false)
+ private OffsetDateTime updatedAt = OffsetDateTime.now();
+
+ @PrePersist
+ protected void onCreate() {
+ createdAt = OffsetDateTime.now();
+ updatedAt = OffsetDateTime.now();
+ }
+
+ @PreUpdate
+ protected void onUpdate() {
+ updatedAt = OffsetDateTime.now();
+ }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentRelation.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentRelation.java
new file mode 100644
index 0000000..dfa6a9c
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentRelation.java
@@ -0,0 +1,72 @@
+package at.procon.dip.domain.document.entity;
+
+import at.procon.dip.architecture.SchemaNames;
+import at.procon.dip.domain.document.RelationType;
+import jakarta.persistence.Column;
+import jakarta.persistence.Entity;
+import jakarta.persistence.EnumType;
+import jakarta.persistence.Enumerated;
+import jakarta.persistence.FetchType;
+import jakarta.persistence.GeneratedValue;
+import jakarta.persistence.GenerationType;
+import jakarta.persistence.Id;
+import jakarta.persistence.Index;
+import jakarta.persistence.JoinColumn;
+import jakarta.persistence.ManyToOne;
+import jakarta.persistence.PrePersist;
+import jakarta.persistence.Table;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Directed relationship between two canonical documents.
+ */
+@Entity
+@Table(schema = SchemaNames.DOC, name = "doc_relation", indexes = {
+ @Index(name = "idx_doc_relation_parent", columnList = "parent_document_id"),
+ @Index(name = "idx_doc_relation_child", columnList = "child_document_id"),
+ @Index(name = "idx_doc_relation_type", columnList = "relation_type")
+})
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class DocumentRelation {
+
+ @Id
+ @GeneratedValue(strategy = GenerationType.UUID)
+ private UUID id;
+
+ @ManyToOne(fetch = FetchType.LAZY, optional = false)
+ @JoinColumn(name = "parent_document_id", nullable = false)
+ private Document parentDocument;
+
+ @ManyToOne(fetch = FetchType.LAZY, optional = false)
+ @JoinColumn(name = "child_document_id", nullable = false)
+ private Document childDocument;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "relation_type", nullable = false, length = 64)
+ private RelationType relationType;
+
+ @Column(name = "sort_order")
+ private Integer sortOrder;
+
+ @Column(name = "relation_metadata", columnDefinition = "TEXT")
+ private String relationMetadata;
+
+ @Builder.Default
+ @Column(name = "created_at", nullable = false, updatable = false)
+ private OffsetDateTime createdAt = OffsetDateTime.now();
+
+ @PrePersist
+ protected void onCreate() {
+ createdAt = OffsetDateTime.now();
+ }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentSource.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentSource.java
new file mode 100644
index 0000000..fff1e52
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentSource.java
@@ -0,0 +1,85 @@
+package at.procon.dip.domain.document.entity;
+
+import at.procon.dip.architecture.SchemaNames;
+import at.procon.dip.domain.document.SourceType;
+import jakarta.persistence.Column;
+import jakarta.persistence.Entity;
+import jakarta.persistence.EnumType;
+import jakarta.persistence.Enumerated;
+import jakarta.persistence.FetchType;
+import jakarta.persistence.GeneratedValue;
+import jakarta.persistence.GenerationType;
+import jakarta.persistence.Id;
+import jakarta.persistence.Index;
+import jakarta.persistence.JoinColumn;
+import jakarta.persistence.ManyToOne;
+import jakarta.persistence.PrePersist;
+import jakarta.persistence.Table;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Provenance row for a canonical document.
+ */
+@Entity
+@Table(schema = SchemaNames.DOC, name = "doc_source", indexes = {
+ @Index(name = "idx_doc_source_document", columnList = "document_id"),
+ @Index(name = "idx_doc_source_type", columnList = "source_type"),
+ @Index(name = "idx_doc_source_external_id", columnList = "external_source_id"),
+ @Index(name = "idx_doc_source_received_at", columnList = "received_at"),
+ @Index(name = "idx_doc_source_parent_source", columnList = "parent_source_id")
+})
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class DocumentSource {
+
+ @Id
+ @GeneratedValue(strategy = GenerationType.UUID)
+ private UUID id;
+
+ @ManyToOne(fetch = FetchType.LAZY, optional = false)
+ @JoinColumn(name = "document_id", nullable = false)
+ private Document document;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "source_type", nullable = false, length = 64)
+ private SourceType sourceType;
+
+ @Column(name = "external_source_id", length = 500)
+ private String externalSourceId;
+
+ @Column(name = "source_uri", columnDefinition = "TEXT")
+ private String sourceUri;
+
+ @Column(name = "source_filename", length = 1000)
+ private String sourceFilename;
+
+ @Column(name = "parent_source_id")
+ private UUID parentSourceId;
+
+ @Column(name = "import_batch_id", length = 255)
+ private String importBatchId;
+
+ @Column(name = "received_at")
+ private OffsetDateTime receivedAt;
+
+ @Builder.Default
+ @Column(name = "created_at", nullable = false, updatable = false)
+ private OffsetDateTime createdAt = OffsetDateTime.now();
+
+ @PrePersist
+ protected void onCreate() {
+ createdAt = OffsetDateTime.now();
+ if (receivedAt == null) {
+ receivedAt = createdAt;
+ }
+ }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentTextRepresentation.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentTextRepresentation.java
new file mode 100644
index 0000000..cfb4774
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentTextRepresentation.java
@@ -0,0 +1,98 @@
+package at.procon.dip.domain.document.entity;
+
+import at.procon.dip.architecture.SchemaNames;
+import at.procon.dip.domain.document.RepresentationType;
+import jakarta.persistence.Column;
+import jakarta.persistence.Entity;
+import jakarta.persistence.EnumType;
+import jakarta.persistence.Enumerated;
+import jakarta.persistence.FetchType;
+import jakarta.persistence.GeneratedValue;
+import jakarta.persistence.GenerationType;
+import jakarta.persistence.Id;
+import jakarta.persistence.Index;
+import jakarta.persistence.JoinColumn;
+import jakarta.persistence.ManyToOne;
+import jakarta.persistence.PrePersist;
+import jakarta.persistence.Table;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Search-oriented text derived from a canonical document.
+ */
+@Entity
+@Table(schema = SchemaNames.DOC, name = "doc_text_representation", indexes = {
+ @Index(name = "idx_doc_text_repr_document", columnList = "document_id"),
+ @Index(name = "idx_doc_text_repr_content", columnList = "content_id"),
+ @Index(name = "idx_doc_text_repr_type", columnList = "representation_type"),
+ @Index(name = "idx_doc_text_repr_primary", columnList = "is_primary")
+})
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class DocumentTextRepresentation {
+
+ @Id
+ @GeneratedValue(strategy = GenerationType.UUID)
+ private UUID id;
+
+ @ManyToOne(fetch = FetchType.LAZY, optional = false)
+ @JoinColumn(name = "document_id", nullable = false)
+ private Document document;
+
+ @ManyToOne(fetch = FetchType.LAZY)
+ @JoinColumn(name = "content_id")
+ private DocumentContent content;
+
+ @Enumerated(EnumType.STRING)
+ @Column(name = "representation_type", nullable = false, length = 64)
+ private RepresentationType representationType;
+
+ @Column(name = "builder_key", length = 255)
+ private String builderKey;
+
+ @Column(name = "language_code", length = 16)
+ private String languageCode;
+
+ @Column(name = "token_count")
+ private Integer tokenCount;
+
+ @Column(name = "char_count")
+ private Integer charCount;
+
+ @Column(name = "chunk_index")
+ private Integer chunkIndex;
+
+ @Column(name = "chunk_start_offset")
+ private Integer chunkStartOffset;
+
+ @Column(name = "chunk_end_offset")
+ private Integer chunkEndOffset;
+
+ @Builder.Default
+ @Column(name = "is_primary", nullable = false)
+ private boolean primaryRepresentation = false;
+
+ @Column(name = "text_body", columnDefinition = "TEXT", nullable = false)
+ private String textBody;
+
+ @Builder.Default
+ @Column(name = "created_at", nullable = false, updatable = false)
+ private OffsetDateTime createdAt = OffsetDateTime.now();
+
+ @PrePersist
+ protected void onCreate() {
+ createdAt = OffsetDateTime.now();
+ if (charCount == null && textBody != null) {
+ charCount = textBody.length();
+ }
+ }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentContentRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentContentRepository.java
new file mode 100644
index 0000000..3f67cf2
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentContentRepository.java
@@ -0,0 +1,17 @@
+package at.procon.dip.domain.document.repository;
+
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.entity.DocumentContent;
+import java.util.List;
+import java.util.Optional;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+public interface DocumentContentRepository extends JpaRepository<DocumentContent, UUID> {
+
+    List<DocumentContent> findByDocument_Id(UUID documentId);
+
+    List<DocumentContent> findByDocument_IdAndContentRole(UUID documentId, ContentRole contentRole);
+
+    Optional<DocumentContent> findByContentHash(String contentHash);
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingModelRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingModelRepository.java
new file mode 100644
index 0000000..833c859
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingModelRepository.java
@@ -0,0 +1,11 @@
+package at.procon.dip.domain.document.repository;
+
+import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
+import java.util.Optional;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+public interface DocumentEmbeddingModelRepository extends JpaRepository<DocumentEmbeddingModel, UUID> {
+
+    Optional<DocumentEmbeddingModel> findByModelKey(String modelKey);
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java
new file mode 100644
index 0000000..e5e1b99
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java
@@ -0,0 +1,55 @@
+package at.procon.dip.domain.document.repository;
+
+import at.procon.dip.domain.document.EmbeddingStatus;
+import at.procon.dip.domain.document.entity.DocumentEmbedding;
+import java.time.OffsetDateTime;
+import java.util.List;
+import java.util.Optional;
+import java.util.UUID;
+import org.springframework.data.domain.Pageable;
+import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.data.jpa.repository.Modifying;
+import org.springframework.data.jpa.repository.Query;
+import org.springframework.data.repository.query.Param;
+
+public interface DocumentEmbeddingRepository extends JpaRepository<DocumentEmbedding, UUID> {
+
+    List<DocumentEmbedding> findByDocument_Id(UUID documentId);
+
+    List<DocumentEmbedding> findByRepresentation_Id(UUID representationId);
+
+    List<DocumentEmbedding> findByEmbeddingStatus(EmbeddingStatus embeddingStatus);
+
+    Optional<DocumentEmbedding> findByRepresentation_IdAndModel_Id(UUID representationId, UUID modelId);
+
+    @Query("SELECT e.id FROM DocumentEmbedding e WHERE e.embeddingStatus = :status ORDER BY e.createdAt ASC")
+    List<UUID> findIdsByEmbeddingStatus(@Param("status") EmbeddingStatus status, Pageable pageable);
+
+    @Query("SELECT e FROM DocumentEmbedding e " +
+            "JOIN FETCH e.document d " +
+            "JOIN FETCH e.representation r " +
+            "JOIN FETCH e.model m " +
+            "WHERE e.id = :embeddingId")
+    Optional<DocumentEmbedding> findDetailedById(@Param("embeddingId") UUID embeddingId);
+
+    @Modifying
+    @Query(value = "UPDATE doc.doc_embedding SET embedding_vector = CAST(:vectorData AS vector), " +
+            "embedding_status = 'COMPLETED', embedded_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP, " +
+            "error_message = NULL, token_count = :tokenCount, embedding_dimensions = :dimensions WHERE id = :id",
+            nativeQuery = true)
+    int updateEmbeddingVector(@Param("id") UUID id,
+            @Param("vectorData") String vectorData,
+            @Param("tokenCount") Integer tokenCount,
+            @Param("dimensions") Integer dimensions);
+
+    @Modifying
+    @Query("UPDATE DocumentEmbedding e SET e.embeddingStatus = :status, e.errorMessage = :errorMessage, " +
+            "e.embeddedAt = :embeddedAt, e.updatedAt = CURRENT_TIMESTAMP WHERE e.id = :embeddingId")
+    int updateEmbeddingStatus(@Param("embeddingId") UUID embeddingId,
+            @Param("status") EmbeddingStatus status,
+            @Param("errorMessage") String errorMessage,
+            @Param("embeddedAt") OffsetDateTime embeddedAt);
+
+    @Query("SELECT e.embeddingStatus, COUNT(e) FROM DocumentEmbedding e GROUP BY e.embeddingStatus")
+    List<Object[]> countByEmbeddingStatus();
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java
new file mode 100644
index 0000000..a039470
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java
@@ -0,0 +1,16 @@
+package at.procon.dip.domain.document.repository;
+
+import at.procon.dip.domain.document.RelationType;
+import at.procon.dip.domain.document.entity.DocumentRelation;
+import java.util.List;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+public interface DocumentRelationRepository extends JpaRepository<DocumentRelation, UUID> {
+
+    List<DocumentRelation> findByParentDocument_Id(UUID parentDocumentId);
+
+    List<DocumentRelation> findByChildDocument_Id(UUID childDocumentId);
+
+    List<DocumentRelation> findByParentDocument_IdAndRelationType(UUID parentDocumentId, RelationType relationType);
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java
new file mode 100644
index 0000000..6746b75
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java
@@ -0,0 +1,31 @@
+package at.procon.dip.domain.document.repository;
+
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentStatus;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.entity.Document;
+import java.util.Collection;
+import java.util.List;
+import java.util.Optional;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+public interface DocumentRepository extends JpaRepository<Document, UUID> {
+
+    Optional<Document> findByDedupHash(String dedupHash);
+
+    boolean existsByDedupHash(String dedupHash);
+
+    List<Document> findByDocumentType(DocumentType documentType);
+
+    List<Document> findByDocumentFamily(DocumentFamily documentFamily);
+
+    List<Document> findByStatus(DocumentStatus status);
+
+    List<Document> findByVisibility(DocumentVisibility visibility);
+
+    List<Document> findByOwnerTenant_TenantKey(String tenantKey);
+
+    List<Document> findByOwnerTenant_TenantKeyIn(Collection<String> tenantKeys);
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentSourceRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentSourceRepository.java
new file mode 100644
index 0000000..31e100d
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentSourceRepository.java
@@ -0,0 +1,17 @@
+package at.procon.dip.domain.document.repository;
+
+import at.procon.dip.domain.document.SourceType;
+import at.procon.dip.domain.document.entity.DocumentSource;
+import java.util.List;
+import java.util.Optional;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+public interface DocumentSourceRepository extends JpaRepository<DocumentSource, UUID> {
+
+    List<DocumentSource> findByDocument_Id(UUID documentId);
+
+    List<DocumentSource> findBySourceType(SourceType sourceType);
+
+    Optional<DocumentSource> findByExternalSourceId(String externalSourceId);
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java
new file mode 100644
index 0000000..8dcbf34
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java
@@ -0,0 +1,19 @@
+package at.procon.dip.domain.document.repository;
+
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
+import java.util.List;
+import java.util.Optional;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+public interface DocumentTextRepresentationRepository extends JpaRepository<DocumentTextRepresentation, UUID> {
+
+    List<DocumentTextRepresentation> findByDocument_Id(UUID documentId);
+
+    List<DocumentTextRepresentation> findByDocument_IdAndRepresentationType(UUID documentId, RepresentationType representationType);
+
+    List<DocumentTextRepresentation> findByPrimaryRepresentationTrue();
+
+    Optional<DocumentTextRepresentation> findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
+}
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java
new file mode 100644
index 0000000..27ee608
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java
@@ -0,0 +1,45 @@
+package at.procon.dip.domain.document.service;
+
+import at.procon.dip.domain.document.entity.DocumentContent;
+import at.procon.dip.domain.document.repository.DocumentContentRepository;
+import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+@Service
+@RequiredArgsConstructor
+@Transactional
+public class DocumentContentService {
+
+    private final DocumentService documentService;
+    private final DocumentContentRepository contentRepository;
+
+    public DocumentContent addContent(AddDocumentContentCommand command) {
+        DocumentContent content = DocumentContent.builder()
+                .document(documentService.getRequired(command.documentId()))
+                .contentRole(command.contentRole())
+                .storageType(command.storageType())
+                .mimeType(command.mimeType())
+                .charsetName(command.charsetName())
+                .textContent(command.textContent())
+                .binaryRef(command.binaryRef())
+                .contentHash(command.contentHash())
+                .sizeBytes(command.sizeBytes())
+                .build();
+        return contentRepository.save(content);
+    }
+
+    @Transactional(readOnly = true)
+    public DocumentContent getRequired(UUID contentId) {
+        return contentRepository.findById(contentId)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown content id: " + contentId));
+    }
+
+    @Transactional(readOnly = true)
+    public List<DocumentContent> findByDocument(UUID documentId) {
+        return contentRepository.findByDocument_Id(documentId);
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentEmbeddingService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentEmbeddingService.java
new file mode 100644
index 0000000..6caf4be
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentEmbeddingService.java
@@ -0,0 +1,125 @@
+package at.procon.dip.domain.document.service;
+
+import at.procon.dip.domain.document.DistanceMetric;
+import at.procon.dip.domain.document.EmbeddingStatus;
+import at.procon.dip.domain.document.entity.DocumentEmbedding;
+import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
+import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
+import java.time.OffsetDateTime;
+import java.util.List;
+import java.util.Optional;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+@Service
+@RequiredArgsConstructor
+@Transactional
+public class DocumentEmbeddingService {
+
+    private final DocumentService documentService;
+    private final DocumentRepresentationService representationService;
+    private final DocumentEmbeddingRepository embeddingRepository;
+    private final DocumentEmbeddingModelRepository modelRepository;
+
+    public DocumentEmbeddingModel registerModel(RegisterEmbeddingModelCommand command) {
+        DocumentEmbeddingModel model = modelRepository.findByModelKey(command.modelKey())
+                .orElseGet(DocumentEmbeddingModel::new);
+        model.setModelKey(command.modelKey());
+        model.setProvider(command.provider());
+        model.setDisplayName(command.displayName());
+        model.setDimensions(command.dimensions());
+        model.setDistanceMetric(command.distanceMetric() == null ? DistanceMetric.COSINE : command.distanceMetric());
+        model.setQueryPrefixRequired(command.queryPrefixRequired());
+        model.setActive(command.active());
+        return modelRepository.save(model);
+    }
+
+    public DocumentEmbedding createPendingEmbedding(UUID documentId, UUID representationId, UUID modelId) {
+        DocumentEmbeddingModel model = getRequiredModel(modelId);
+        DocumentEmbedding embedding = DocumentEmbedding.builder()
+                .document(documentService.getRequired(documentId))
+                .representation(representationService.getRequired(representationId))
+                .model(model)
+                .embeddingDimensions(model.getDimensions())
+                .embeddingStatus(EmbeddingStatus.PENDING)
+                .build();
+        return embeddingRepository.save(embedding);
+    }
+
+    public DocumentEmbedding ensurePendingEmbedding(UUID documentId, UUID representationId, UUID modelId) {
+        Optional<DocumentEmbedding> existing = embeddingRepository.findByRepresentation_IdAndModel_Id(representationId, modelId);
+        if (existing.isPresent()) {
+            DocumentEmbedding embedding = existing.get();
+            embedding.setDocument(documentService.getRequired(documentId));
+            embedding.setRepresentation(representationService.getRequired(representationId));
+            embedding.setModel(getRequiredModel(modelId));
+            embedding.setEmbeddingDimensions(embedding.getModel().getDimensions());
+            embedding.setEmbeddingStatus(EmbeddingStatus.PENDING);
+            embedding.setErrorMessage(null);
+            embedding.setEmbeddedAt(null);
+            return embeddingRepository.save(embedding);
+        }
+        return createPendingEmbedding(documentId, representationId, modelId);
+    }
+
+    public DocumentEmbedding markCompleted(UUID embeddingId, Integer tokenCount) {
+        DocumentEmbedding embedding = getRequired(embeddingId);
+        embedding.setEmbeddingStatus(EmbeddingStatus.COMPLETED);
+        embedding.setTokenCount(tokenCount);
+        embedding.setEmbeddedAt(OffsetDateTime.now());
+        embedding.setErrorMessage(null);
+        return embeddingRepository.save(embedding);
+    }
+
+    public DocumentEmbedding markFailed(UUID embeddingId, String errorMessage) {
+        DocumentEmbedding embedding = getRequired(embeddingId);
+        embedding.setEmbeddingStatus(EmbeddingStatus.FAILED);
+        embedding.setErrorMessage(errorMessage);
+        embedding.setEmbeddedAt(null);
+        return embeddingRepository.save(embedding);
+    }
+
+    public DocumentEmbedding markProcessing(UUID embeddingId) {
+        DocumentEmbedding embedding = getRequired(embeddingId);
+        embedding.setEmbeddingStatus(EmbeddingStatus.PROCESSING);
+        embedding.setErrorMessage(null);
+        return embeddingRepository.save(embedding);
+    }
+
+    public DocumentEmbedding markSkipped(UUID embeddingId, String reason) {
+        DocumentEmbedding embedding = getRequired(embeddingId);
+        embedding.setEmbeddingStatus(EmbeddingStatus.SKIPPED);
+        embedding.setErrorMessage(reason);
+        embedding.setEmbeddedAt(OffsetDateTime.now());
+        return embeddingRepository.save(embedding);
+    }
+
+    @Transactional(readOnly = true)
+    public DocumentEmbedding getRequired(UUID embeddingId) {
+        return embeddingRepository.findById(embeddingId)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
+    }
+
+    @Transactional(readOnly = true)
+    public DocumentEmbeddingModel getRequiredModel(UUID modelId) {
+        return modelRepository.findById(modelId)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown embedding model id: " + modelId));
+    }
+
+    // NOTE(review): despite its name, this lookup does not filter on the model's active flag — confirm intent.
+    @Transactional(readOnly = true)
+    public DocumentEmbeddingModel findActiveModelByKey(String modelKey) {
+        return modelRepository.findByModelKey(modelKey)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown embedding model key: " + modelKey));
+    }
+
+    @Transactional(readOnly = true)
+    public List<DocumentEmbedding> findPendingEmbeddings() {
+        return embeddingRepository.findByEmbeddingStatus(EmbeddingStatus.PENDING);
+    }
+}
+
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java
new file mode 100644
index 0000000..f9c1bb1
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java
@@ -0,0 +1,35 @@
+package at.procon.dip.domain.document.service;
+
+import at.procon.dip.domain.document.entity.DocumentRelation;
+import at.procon.dip.domain.document.repository.DocumentRelationRepository;
+import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+@Service
+@RequiredArgsConstructor
+@Transactional
+public class DocumentRelationService {
+
+    private final DocumentService documentService;
+    private final DocumentRelationRepository relationRepository;
+
+    public DocumentRelation createRelation(CreateDocumentRelationCommand command) {
+        DocumentRelation relation = DocumentRelation.builder()
+                .parentDocument(documentService.getRequired(command.parentDocumentId()))
+                .childDocument(documentService.getRequired(command.childDocumentId()))
+                .relationType(command.relationType())
+                .sortOrder(command.sortOrder())
+                .relationMetadata(command.relationMetadata())
+                .build();
+        return relationRepository.save(relation);
+    }
+
+    @Transactional(readOnly = true)
+    public List<DocumentRelation> findChildren(UUID parentDocumentId) {
+        return relationRepository.findByParentDocument_Id(parentDocumentId);
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java
new file mode 100644
index 0000000..8111e08
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java
@@ -0,0 +1,50 @@
+package at.procon.dip.domain.document.service;
+
+import at.procon.dip.domain.document.entity.DocumentContent;
+import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
+import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
+import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+@Service
+@RequiredArgsConstructor
+@Transactional
+public class DocumentRepresentationService {
+
+    private final DocumentService documentService;
+    private final DocumentContentService contentService;
+    private final DocumentTextRepresentationRepository representationRepository;
+
+    public DocumentTextRepresentation addRepresentation(AddDocumentTextRepresentationCommand command) {
+        DocumentContent content = command.contentId() == null ? null : contentService.getRequired(command.contentId());
+        DocumentTextRepresentation representation = DocumentTextRepresentation.builder()
+                .document(documentService.getRequired(command.documentId()))
+                .content(content)
+                .representationType(command.representationType())
+                .builderKey(command.builderKey())
+                .languageCode(command.languageCode())
+                .tokenCount(command.tokenCount())
+                .chunkIndex(command.chunkIndex())
+                .chunkStartOffset(command.chunkStartOffset())
+                .chunkEndOffset(command.chunkEndOffset())
+                .primaryRepresentation(command.primaryRepresentation())
+                .textBody(command.textBody())
+                .build();
+        return representationRepository.save(representation);
+    }
+
+    @Transactional(readOnly = true)
+    public DocumentTextRepresentation getRequired(UUID representationId) {
+        return representationRepository.findById(representationId)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown representation id: " + representationId));
+    }
+
+    @Transactional(readOnly = true)
+    public List<DocumentTextRepresentation> findByDocument(UUID documentId) {
+        return representationRepository.findByDocument_Id(documentId);
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentService.java
new file mode 100644
index 0000000..22bfe22
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentService.java
@@ -0,0 +1,75 @@
+package at.procon.dip.domain.document.service;
+
+import at.procon.dip.domain.document.CanonicalDocumentMetadata;
+import at.procon.dip.domain.document.DocumentStatus;
+import at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.repository.DocumentRepository;
+import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
+import at.procon.dip.domain.tenant.entity.DocumentTenant;
+import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+@Service
+@RequiredArgsConstructor
+@Transactional
+public class DocumentService {
+
+    private final DocumentRepository documentRepository;
+    private final DocumentTenantRepository tenantRepository;
+
+    public Document create(CreateDocumentCommand command) {
+        DocumentTenant ownerTenant = resolveOwnerTenant(command.ownerTenantKey());
+        Document document = Document.builder()
+                .ownerTenant(ownerTenant)
+                .visibility(command.visibility())
+                .documentType(command.documentType())
+                .documentFamily(command.documentFamily())
+                .status(command.status() == null ? DocumentStatus.RECEIVED : command.status())
+                .title(command.title())
+                .summary(command.summary())
+                .languageCode(command.languageCode())
+                .mimeType(command.mimeType())
+                .businessKey(command.businessKey())
+                .dedupHash(command.dedupHash())
+                .build();
+        return documentRepository.save(document);
+    }
+
+    public Document save(Document document) {
+        return documentRepository.save(document);
+    }
+
+    public Document updateStatus(UUID documentId, DocumentStatus status) {
+        Document document = getRequired(documentId);
+        document.setStatus(status);
+        return documentRepository.save(document);
+    }
+
+    @Transactional(readOnly = true)
+    public Document getRequired(UUID documentId) {
+        return documentRepository.findById(documentId)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown document id: " + documentId));
+    }
+
+    @Transactional(readOnly = true)
+    public List<Document> findAll() {
+        return documentRepository.findAll();
+    }
+
+    @Transactional(readOnly = true)
+    public CanonicalDocumentMetadata getMetadata(UUID documentId) {
+        return getRequired(documentId).toCanonicalMetadata();
+    }
+
+    private DocumentTenant resolveOwnerTenant(String ownerTenantKey) {
+        if (ownerTenantKey == null || ownerTenantKey.isBlank()) {
+            return null;
+        }
+        return tenantRepository.findByTenantKey(ownerTenantKey)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown tenant key: " + ownerTenantKey));
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentSourceService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentSourceService.java
new file mode 100644
index 0000000..c26e226
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentSourceService.java
@@ -0,0 +1,38 @@
+package at.procon.dip.domain.document.service;
+
+import at.procon.dip.domain.document.entity.DocumentSource;
+import at.procon.dip.domain.document.repository.DocumentSourceRepository;
+import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+@Service
+@RequiredArgsConstructor
+@Transactional
+public class DocumentSourceService {
+
+    private final DocumentService documentService;
+    private final DocumentSourceRepository sourceRepository;
+
+    public DocumentSource addSource(AddDocumentSourceCommand command) {
+        DocumentSource source = DocumentSource.builder()
+                .document(documentService.getRequired(command.documentId()))
+                .sourceType(command.sourceType())
+                .externalSourceId(command.externalSourceId())
+                .sourceUri(command.sourceUri())
+                .sourceFilename(command.sourceFilename())
+                .parentSourceId(command.parentSourceId())
+                .importBatchId(command.importBatchId())
+                .receivedAt(command.receivedAt())
+                .build();
+        return sourceRepository.save(source);
+    }
+
+    @Transactional(readOnly = true)
+    public List<DocumentSource> findByDocument(UUID documentId) {
+        return sourceRepository.findByDocument_Id(documentId);
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java
new file mode 100644
index 0000000..284f9be
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java
@@ -0,0 +1,18 @@
+package at.procon.dip.domain.document.service.command;
+
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.StorageType;
+import java.util.UUID;
+
+/** Input for attaching a raw or derived content payload to an existing document. */
+public record AddDocumentContentCommand(
+    UUID documentId,
+    ContentRole contentRole,
+    StorageType storageType,
+    String mimeType,
+    String charsetName,
+    String textContent,
+    String binaryRef,
+    String contentHash,
+    Long sizeBytes
+) {}
diff --git a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentSourceCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentSourceCommand.java
new file mode 100644
index 0000000..75961b8
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentSourceCommand.java
@@ -0,0 +1,17 @@
+package at.procon.dip.domain.document.service.command;
+
+import at.procon.dip.domain.document.SourceType;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+
+/** Input for recording the provenance (origin) of an existing document. */
+public record AddDocumentSourceCommand(
+    UUID documentId,
+    SourceType sourceType,
+    String externalSourceId,
+    String sourceUri,
+    String sourceFilename,
+    UUID parentSourceId,
+    String importBatchId,
+    OffsetDateTime receivedAt
+) {}
diff --git a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java
new file mode 100644
index 0000000..3106218
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java
@@ -0,0 +1,19 @@
+package at.procon.dip.domain.document.service.command;
+
+import at.procon.dip.domain.document.RepresentationType;
+import java.util.UUID;
+
+/** Input for adding a text representation (optionally a chunk) of a document. */
+public record AddDocumentTextRepresentationCommand(
+    UUID documentId,
+    UUID contentId,
+    RepresentationType representationType,
+    String builderKey,
+    String languageCode,
+    Integer tokenCount,
+    Integer chunkIndex,
+    Integer chunkStartOffset,
+    Integer chunkEndOffset,
+    boolean primaryRepresentation,
+    String textBody
+) {}
diff --git a/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentCommand.java
new file mode 100644
index 0000000..77345d3
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentCommand.java
@@ -0,0 +1,24 @@
+package at.procon.dip.domain.document.service.command;
+
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentStatus;
+import at.procon.dip.domain.document.DocumentType;
+
+/**
+ * Minimal Phase 1 command for creating the canonical document root.
+ * {@code ownerTenantKey} may be null or blank for documents without an owner tenant.
+ */
+public record CreateDocumentCommand(
+    String ownerTenantKey,
+    DocumentVisibility visibility,
+    DocumentType documentType,
+    DocumentFamily documentFamily,
+    DocumentStatus status,
+    String title,
+    String summary,
+    String languageCode,
+    String mimeType,
+    String businessKey,
+    String dedupHash
+) {}
diff --git a/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentRelationCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentRelationCommand.java
new file mode 100644
index 0000000..8bc0d39
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentRelationCommand.java
@@ -0,0 +1,13 @@
+package at.procon.dip.domain.document.service.command;
+
+import at.procon.dip.domain.document.RelationType;
+import java.util.UUID;
+
+/** Input for linking two existing documents in a typed parent/child relation. */
+public record CreateDocumentRelationCommand(
+    UUID parentDocumentId,
+    UUID childDocumentId,
+    RelationType relationType,
+    Integer sortOrder,
+    String relationMetadata
+) {}
diff --git a/src/main/java/at/procon/dip/domain/document/service/command/RegisterEmbeddingModelCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/RegisterEmbeddingModelCommand.java
new file mode 100644
index 0000000..fbb52b9
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/service/command/RegisterEmbeddingModelCommand.java
@@ -0,0 +1,14 @@
+package at.procon.dip.domain.document.service.command;
+
+import at.procon.dip.domain.document.DistanceMetric;
+
+/** Input for registering (or updating, keyed by modelKey) an embedding model. */
+public record RegisterEmbeddingModelCommand(
+    String modelKey,
+    String provider,
+    String displayName,
+    Integer dimensions,
+    DistanceMetric distanceMetric,
+    boolean queryPrefixRequired,
+    boolean active
+) {}
diff --git a/src/main/java/at/procon/dip/domain/tenant/TenantRef.java b/src/main/java/at/procon/dip/domain/tenant/TenantRef.java
new file mode 100644
index 0000000..0e56fc1
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/tenant/TenantRef.java
@@ -0,0 +1,11 @@
+package at.procon.dip.domain.tenant;
+
+/**
+ * Canonical tenant reference used to express document ownership.
+ * Carries identifiers and display name only; it is not a persistent entity.
+ */
+public record TenantRef(
+    String tenantId,
+    String tenantKey,
+    String displayName
+) {}
diff --git a/src/main/java/at/procon/dip/domain/tenant/entity/DocumentTenant.java b/src/main/java/at/procon/dip/domain/tenant/entity/DocumentTenant.java
new file mode 100644
index 0000000..fffc368
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/tenant/entity/DocumentTenant.java
@@ -0,0 +1,71 @@
+package at.procon.dip.domain.tenant.entity;
+
+import at.procon.dip.architecture.SchemaNames;
+import jakarta.persistence.Column;
+import jakarta.persistence.Entity;
+import jakarta.persistence.GeneratedValue;
+import jakarta.persistence.GenerationType;
+import jakarta.persistence.Id;
+import jakarta.persistence.Index;
+import jakarta.persistence.PrePersist;
+import jakarta.persistence.PreUpdate;
+import jakarta.persistence.Table;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Catalog entry for a canonical owner tenant in the generalized DOC schema.
+ */
+@Entity
+@Table(schema = SchemaNames.DOC, name = "doc_tenant", indexes = {
+    @Index(name = "idx_doc_tenant_key", columnList = "tenant_key", unique = true),
+    @Index(name = "idx_doc_tenant_active", columnList = "active")
+})
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class DocumentTenant {
+
+    @Id
+    @GeneratedValue(strategy = GenerationType.UUID)
+    private UUID id;
+
+    @Column(name = "tenant_key", nullable = false, unique = true, length = 120)
+    private String tenantKey;
+
+    @Column(name = "display_name", nullable = false, length = 255)
+    private String displayName;
+
+    @Column(name = "description", columnDefinition = "TEXT")
+    private String description;
+
+    @Builder.Default
+    @Column(name = "active", nullable = false)
+    private boolean active = true;
+
+    @Builder.Default
+    @Column(name = "created_at", nullable = false, updatable = false)
+    private OffsetDateTime createdAt = OffsetDateTime.now();
+
+    @Builder.Default
+    @Column(name = "updated_at", nullable = false)
+    private OffsetDateTime updatedAt = OffsetDateTime.now();
+
+    @PrePersist
+    protected void onCreate() {
+        this.createdAt = OffsetDateTime.now();
+        this.updatedAt = OffsetDateTime.now();
+    }
+
+    @PreUpdate
+    protected void onUpdate() {
+        this.updatedAt = OffsetDateTime.now();
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/tenant/repository/DocumentTenantRepository.java b/src/main/java/at/procon/dip/domain/tenant/repository/DocumentTenantRepository.java
new file mode 100644
index 0000000..7bd8299
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/tenant/repository/DocumentTenantRepository.java
@@ -0,0 +1,13 @@
+package at.procon.dip.domain.tenant.repository;
+
+import at.procon.dip.domain.tenant.entity.DocumentTenant;
+import java.util.Optional;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+/**
+ * Spring Data repository for {@link DocumentTenant} rows (DOC.doc_tenant).
+ *
+ * Restores the generic type arguments ({@code <DocumentTenant, UUID>} and
+ * {@code Optional<DocumentTenant>}) that were missing from the raw declarations;
+ * the otherwise-unused {@code UUID} import confirms the intended id type.
+ */
+public interface DocumentTenantRepository extends JpaRepository<DocumentTenant, UUID> {
+
+    /** Looks up a tenant by its unique business key. */
+    Optional<DocumentTenant> findByTenantKey(String tenantKey);
+
+    /** Returns true when a tenant with the given business key already exists. */
+    boolean existsByTenantKey(String tenantKey);
+}
diff --git a/src/main/java/at/procon/dip/domain/tenant/service/DocumentTenantService.java b/src/main/java/at/procon/dip/domain/tenant/service/DocumentTenantService.java
new file mode 100644
index 0000000..811e114
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/tenant/service/DocumentTenantService.java
@@ -0,0 +1,45 @@
+package at.procon.dip.domain.tenant.service;
+
+import at.procon.dip.domain.tenant.entity.DocumentTenant;
+import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
+import at.procon.dip.domain.tenant.service.command.CreateTenantCommand;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+/**
+ * Application service for the DOC tenant catalog.
+ *
+ * Write methods run in the class-level read-write transaction; lookups are
+ * explicitly marked read-only.
+ */
+@Service
+@RequiredArgsConstructor
+@Transactional
+public class DocumentTenantService {
+
+    private final DocumentTenantRepository tenantRepository;
+
+    /**
+     * Upserts a tenant by its business key: updates the existing row when the
+     * key is already known, otherwise creates a new tenant.
+     */
+    public DocumentTenant createOrUpdate(CreateTenantCommand command) {
+        DocumentTenant tenant = tenantRepository.findByTenantKey(command.tenantKey())
+            .orElseGet(DocumentTenant::new);
+        tenant.setTenantKey(command.tenantKey());
+        tenant.setDisplayName(command.displayName());
+        tenant.setDescription(command.description());
+        tenant.setActive(command.active());
+        return tenantRepository.save(tenant);
+    }
+
+    /** Loads a tenant by primary key or throws {@link IllegalArgumentException}. */
+    @Transactional(readOnly = true)
+    public DocumentTenant getRequiredById(UUID id) {
+        return tenantRepository.findById(id)
+            .orElseThrow(() -> new IllegalArgumentException("Unknown tenant id: " + id));
+    }
+
+    /** Loads a tenant by business key or throws {@link IllegalArgumentException}. */
+    @Transactional(readOnly = true)
+    public DocumentTenant getRequiredByTenantKey(String tenantKey) {
+        return tenantRepository.findByTenantKey(tenantKey)
+            .orElseThrow(() -> new IllegalArgumentException("Unknown tenant key: " + tenantKey));
+    }
+
+    /** Returns all tenants (typed list restores the element type lost in the raw declaration). */
+    @Transactional(readOnly = true)
+    public List<DocumentTenant> findAll() {
+        return tenantRepository.findAll();
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/tenant/service/command/CreateTenantCommand.java b/src/main/java/at/procon/dip/domain/tenant/service/command/CreateTenantCommand.java
new file mode 100644
index 0000000..ea6fe20
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/tenant/service/command/CreateTenantCommand.java
@@ -0,0 +1,9 @@
+package at.procon.dip.domain.tenant.service.command;
+
+/**
+ * Input for {@code DocumentTenantService#createOrUpdate}.
+ *
+ * Despite the name, the command drives an upsert keyed by {@code tenantKey}
+ * (the service updates an existing row when the key is already known).
+ */
+public record CreateTenantCommand(
+ String tenantKey,
+ String displayName,
+ String description,
+ boolean active
+) {
+}
diff --git a/src/main/java/at/procon/dip/extraction/spi/DocumentExtractor.java b/src/main/java/at/procon/dip/extraction/spi/DocumentExtractor.java
new file mode 100644
index 0000000..934837a
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/spi/DocumentExtractor.java
@@ -0,0 +1,13 @@
+package at.procon.dip.extraction.spi;
+
+import at.procon.dip.domain.document.DocumentType;
+
+/**
+ * Type-specific extraction contract.
+ */
+public interface DocumentExtractor {
+
+ /** True when this extractor handles the given canonical document type / MIME type pair. */
+ boolean supports(DocumentType documentType, String mimeType);
+
+ /** Runs extraction for one request; callers presumably check {@link #supports} first — TODO confirm. */
+ ExtractionResult extract(ExtractionRequest extractionRequest);
+}
diff --git a/src/main/java/at/procon/dip/extraction/spi/ExtractedStructuredPayload.java b/src/main/java/at/procon/dip/extraction/spi/ExtractedStructuredPayload.java
new file mode 100644
index 0000000..5c74505
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/spi/ExtractedStructuredPayload.java
@@ -0,0 +1,12 @@
+package at.procon.dip.extraction.spi;
+
+import java.util.Map;
+
+/**
+ * Type-specific structured payload produced by an extractor.
+ */
+public record ExtractedStructuredPayload(
+ // Logical projection name this payload belongs to.
+ String projectionName,
+ // NOTE(review): raw Map — the generic type parameters appear to have been
+ // stripped (likely Map<String, Object>); restore before merging.
+ Map attributes
+) {
+}
diff --git a/src/main/java/at/procon/dip/extraction/spi/ExtractionRequest.java b/src/main/java/at/procon/dip/extraction/spi/ExtractionRequest.java
new file mode 100644
index 0000000..e6a3e54
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/spi/ExtractionRequest.java
@@ -0,0 +1,15 @@
+package at.procon.dip.extraction.spi;
+
+import at.procon.dip.classification.spi.DetectionResult;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+
+/**
+ * Input to a document extractor.
+ *
+ * NOTE(review): a record's generated equals/hashCode compare the {@code byte[]}
+ * component by identity, not content — do not use this record as a map key.
+ */
+public record ExtractionRequest(
+ SourceDescriptor sourceDescriptor,
+ DetectionResult detectionResult,
+ // NOTE(review): which of textContent/binaryContent may be null is not
+ // specified here — document the contract at the call sites.
+ String textContent,
+ byte[] binaryContent
+) {
+}
diff --git a/src/main/java/at/procon/dip/extraction/spi/ExtractionResult.java b/src/main/java/at/procon/dip/extraction/spi/ExtractionResult.java
new file mode 100644
index 0000000..c5c0ac1
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/spi/ExtractionResult.java
@@ -0,0 +1,15 @@
+package at.procon.dip.extraction.spi;
+
+import at.procon.dip.domain.document.ContentRole;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Output of a document extractor before normalization and persistence.
+ *
+ * Restores the generic type arguments missing from the raw declarations:
+ * the {@code ContentRole} import and the "ByRole" naming pin the map key,
+ * and {@code ExtractedStructuredPayload} lives in this package.
+ */
+public record ExtractionResult(
+    Map<ContentRole, String> derivedTextByRole,
+    List<ExtractedStructuredPayload> structuredPayloads,
+    List<String> warnings
+) {
+}
diff --git a/src/main/java/at/procon/dip/ingestion/spi/DocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/spi/DocumentIngestionAdapter.java
new file mode 100644
index 0000000..4c8be89
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/spi/DocumentIngestionAdapter.java
@@ -0,0 +1,11 @@
+package at.procon.dip.ingestion.spi;
+
+/**
+ * Extension point for source-specific import adapters.
+ */
+public interface DocumentIngestionAdapter {
+
+ /** True when this adapter can ingest the described source. */
+ boolean supports(SourceDescriptor sourceDescriptor);
+
+ /** Imports the described source into the canonical document model. */
+ IngestionResult ingest(SourceDescriptor sourceDescriptor);
+}
diff --git a/src/main/java/at/procon/dip/ingestion/spi/IngestionResult.java b/src/main/java/at/procon/dip/ingestion/spi/IngestionResult.java
new file mode 100644
index 0000000..fff5bc4
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/spi/IngestionResult.java
@@ -0,0 +1,13 @@
+package at.procon.dip.ingestion.spi;
+
+import at.procon.dip.domain.document.CanonicalDocumentMetadata;
+import java.util.List;
+
+/**
+ * Result of an ingestion adapter execution.
+ *
+ * Restores the generic element types missing from the raw declarations; the
+ * otherwise-unused {@code CanonicalDocumentMetadata} import pins the document
+ * list's element type.
+ */
+public record IngestionResult(
+    List<CanonicalDocumentMetadata> documents,
+    List<String> warnings
+) {
+}
diff --git a/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java
new file mode 100644
index 0000000..644bd5e
--- /dev/null
+++ b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java
@@ -0,0 +1,19 @@
+package at.procon.dip.ingestion.spi;
+
+import at.procon.dip.domain.access.DocumentAccessContext;
+import at.procon.dip.domain.document.SourceType;
+import java.util.Map;
+
+/**
+ * Describes a source object that should be ingested into the canonical document model.
+ */
+public record SourceDescriptor(
+ // Ownership and visibility of the resulting document.
+ DocumentAccessContext accessContext,
+ SourceType sourceType,
+ // Source-native identifier; exact format depends on the adapter — not visible here.
+ String sourceIdentifier,
+ String sourceUri,
+ String fileName,
+ String mediaType,
+ // NOTE(review): raw Map — generic type parameters appear to have been
+ // stripped (presumably Map<String, String> or Map<String, Object>); restore.
+ Map attributes
+) {
+}
diff --git a/src/main/java/at/procon/dip/migration/MigrationStrategyMode.java b/src/main/java/at/procon/dip/migration/MigrationStrategyMode.java
new file mode 100644
index 0000000..57e5b53
--- /dev/null
+++ b/src/main/java/at/procon/dip/migration/MigrationStrategyMode.java
@@ -0,0 +1,12 @@
+package at.procon.dip.migration;
+
+/**
+ * Phase 0 decision for introducing the generalized model incrementally.
+ *
+ * Constants are declared in the order the migration phases would be applied
+ * (additive schema first, legacy retirement last).
+ */
+public enum MigrationStrategyMode {
+ /** Create the new DOC schema alongside the legacy one; no behavior change yet. */
+ ADDITIVE_SCHEMA,
+ /** Write to both the legacy and the new schema. */
+ DUAL_WRITE,
+ /** Copy historical data into the new schema. */
+ BACKFILL,
+ /** Switch reads and writes over to the new schema. */
+ CUTOVER,
+ /** Remove the legacy schema once the cutover has settled. */
+ RETIRE_LEGACY
+}
diff --git a/src/main/java/at/procon/dip/normalization/spi/RepresentationBuildRequest.java b/src/main/java/at/procon/dip/normalization/spi/RepresentationBuildRequest.java
new file mode 100644
index 0000000..13358e4
--- /dev/null
+++ b/src/main/java/at/procon/dip/normalization/spi/RepresentationBuildRequest.java
@@ -0,0 +1,15 @@
+package at.procon.dip.normalization.spi;
+
+import at.procon.dip.classification.spi.DetectionResult;
+import at.procon.dip.extraction.spi.ExtractionResult;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+
+/**
+ * Input for text-representation builders: the original source descriptor, its
+ * detection result, and the raw extraction output to derive search text from.
+ */
+public record RepresentationBuildRequest(
+ SourceDescriptor sourceDescriptor,
+ DetectionResult detectionResult,
+ ExtractionResult extractionResult
+) {
+}
diff --git a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationBuilder.java
new file mode 100644
index 0000000..c7fa594
--- /dev/null
+++ b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationBuilder.java
@@ -0,0 +1,14 @@
+package at.procon.dip.normalization.spi;
+
+import at.procon.dip.domain.document.DocumentType;
+import java.util.List;
+
+/**
+ * Builds search-oriented text representations independently from raw extraction.
+ *
+ * Restores the {@code List} element type missing from the raw declaration;
+ * {@link TextRepresentationDraft} is this package's candidate-representation type.
+ */
+public interface TextRepresentationBuilder {
+
+    /** True when this builder can produce representations for the given document type. */
+    boolean supports(DocumentType documentType);
+
+    /** Produces zero or more candidate text representations for the request. */
+    List<TextRepresentationDraft> build(RepresentationBuildRequest request);
+}
diff --git a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java
new file mode 100644
index 0000000..af1f49a
--- /dev/null
+++ b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java
@@ -0,0 +1,15 @@
+package at.procon.dip.normalization.spi;
+
+import at.procon.dip.domain.document.RepresentationType;
+
+/**
+ * Candidate text representation for semantic indexing.
+ */
+public record TextRepresentationDraft(
+ RepresentationType representationType,
+ // NOTE(review): language code format (ISO 639 vs BCP-47) not visible here — confirm.
+ String languageCode,
+ String textBody,
+ // Presumably marks the preferred representation among siblings — TODO confirm.
+ boolean primary,
+ // Presumably the chunk ordinal when text was split, null when unchunked — TODO confirm.
+ Integer chunkIndex
+) {
+}
diff --git a/src/main/java/at/procon/dip/processing/spi/ProcessingStage.java b/src/main/java/at/procon/dip/processing/spi/ProcessingStage.java
new file mode 100644
index 0000000..474a259
--- /dev/null
+++ b/src/main/java/at/procon/dip/processing/spi/ProcessingStage.java
@@ -0,0 +1,14 @@
+package at.procon.dip.processing.spi;
+
+/**
+ * Cross-cutting processing stages for generic document orchestration.
+ *
+ * Constants are declared in pipeline order, from ingestion through search.
+ */
+public enum ProcessingStage {
+ INGESTION,
+ CLASSIFICATION,
+ EXTRACTION,
+ NORMALIZATION,
+ VECTORIZATION,
+ INDEXING,
+ SEARCH
+}
diff --git a/src/main/java/at/procon/dip/search/spi/SearchDocumentScope.java b/src/main/java/at/procon/dip/search/spi/SearchDocumentScope.java
new file mode 100644
index 0000000..fd2a373
--- /dev/null
+++ b/src/main/java/at/procon/dip/search/spi/SearchDocumentScope.java
@@ -0,0 +1,18 @@
+package at.procon.dip.search.spi;
+
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentType;
+import java.util.Set;
+
+/**
+ * Minimal generic search scope for future hybrid/semantic search services.
+ *
+ * Restores the {@code Set} element types missing from the raw declarations;
+ * each component name maps one-to-one onto an imported domain type. How empty
+ * or null sets are interpreted is left to the consuming service.
+ */
+public record SearchDocumentScope(
+    Set<String> ownerTenantKeys,
+    Set<DocumentType> documentTypes,
+    Set<DocumentFamily> documentFamilies,
+    Set<DocumentVisibility> visibilities,
+    String languageCode
+) {
+}
diff --git a/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java b/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java
new file mode 100644
index 0000000..8330325
--- /dev/null
+++ b/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java
@@ -0,0 +1,211 @@
+package at.procon.dip.vectorization.camel;
+
+import at.procon.dip.domain.document.EmbeddingStatus;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
+import at.procon.dip.vectorization.service.DocumentEmbeddingProcessingService;
+import at.procon.ted.config.TedProcessorProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.camel.Exchange;
+import org.apache.camel.LoggingLevel;
+import org.apache.camel.builder.RouteBuilder;
+import org.apache.camel.model.dataformat.JsonLibrary;
+import org.springframework.data.domain.PageRequest;
+import org.springframework.stereotype.Component;
+
+/**
+ * Phase 2 generic vectorization route.
+ * Uses DOC.doc_text_representation as the source text and DOC.doc_embedding as the write target.
+ *
+ * Three routes are registered when the generic pipeline is enabled:
+ * a direct trigger that buffers work on a SEDA queue, an async processor that
+ * calls the embedding HTTP endpoint with bounded retries, and a timer
+ * scheduler that drains PENDING (then FAILED) embeddings in batches.
+ * Also restores the {@code List<UUID>} element types lost in the raw scheduler
+ * declarations (the split body is read back as {@code UUID.class}).
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class GenericVectorizationRoute extends RouteBuilder {
+
+    private static final String ROUTE_ID_TRIGGER = "generic-vectorization-trigger";
+    private static final String ROUTE_ID_PROCESSOR = "generic-vectorization-processor";
+    private static final String ROUTE_ID_SCHEDULER = "generic-vectorization-scheduler";
+
+    private final TedProcessorProperties properties;
+    private final DocumentEmbeddingRepository embeddingRepository;
+    private final DocumentEmbeddingProcessingService processingService;
+
+    /**
+     * Single daemon worker used by the SEDA consumer so embedding calls are
+     * serialized and never keep the JVM alive on shutdown.
+     */
+    private java.util.concurrent.ExecutorService executorService() {
+        return java.util.concurrent.Executors.newFixedThreadPool(
+            1,
+            r -> {
+                Thread thread = new Thread(r);
+                thread.setName("doc-vectorization-" + thread.getId());
+                thread.setDaemon(true);
+                // NOTE(review): MAX_PRIORITY for a background batch worker is unusual — confirm intent.
+                thread.setPriority(Thread.MAX_PRIORITY);
+                return thread;
+            }
+        );
+    }
+
+    @Override
+    public void configure() {
+        if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
+            log.info("Phase 2 generic vectorization route disabled");
+            return;
+        }
+
+        log.info("Configuring generic vectorization routes (phase2=true, apiUrl={}, scheduler={}ms)",
+            properties.getVectorization().getApiUrl(),
+            properties.getVectorization().getGenericSchedulerPeriodMs());
+
+        // Safety net: any unhandled error marks the embedding FAILED instead of losing it.
+        onException(Exception.class)
+            .handled(true)
+            .process(exchange -> {
+                UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
+                Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class);
+                String error = exception != null ? exception.getMessage() : "Unknown vectorization error";
+                if (embeddingId != null) {
+                    try {
+                        processingService.markAsFailed(embeddingId, error);
+                    } catch (Exception nested) {
+                        log.warn("Failed to mark embedding {} as failed: {}", embeddingId, nested.getMessage());
+                    }
+                }
+            })
+            .to("log:generic-vectorization-error?level=WARN");
+
+        // Trigger: hand the embedding id to the SEDA buffer without blocking the caller.
+        from("direct:vectorize-embedding")
+            .routeId(ROUTE_ID_TRIGGER)
+            .doTry()
+                .to("seda:vectorize-embedding-async?waitForTaskToComplete=Never&size=1000&blockWhenFull=true&timeout=5000")
+            .doCatch(Exception.class)
+                .log(LoggingLevel.WARN, "Failed to queue embedding ${header.embeddingId}: ${exception.message}")
+            .end();
+
+        // Processor: prepare the payload, call the embedding service with retries, persist the vector.
+        from("seda:vectorize-embedding-async?size=1000")
+            .routeId(ROUTE_ID_PROCESSOR)
+            .threads().executorService(executorService())
+            .process(exchange -> {
+                UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
+                DocumentEmbeddingProcessingService.EmbeddingPayload payload =
+                    processingService.prepareEmbeddingForVectorization(embeddingId);
+                if (payload == null) {
+                    // Already PROCESSING, or no usable text — skip quietly.
+                    exchange.setProperty("skipVectorization", true);
+                    return;
+                }
+
+                EmbedRequest request = new EmbedRequest();
+                request.text = payload.textContent();
+                request.isQuery = false;
+
+                exchange.getIn().setHeader("embeddingId", payload.embeddingId());
+                exchange.getIn().setHeader("documentId", payload.documentId());
+                exchange.getIn().setHeader(Exchange.HTTP_METHOD, "POST");
+                exchange.getIn().setHeader(Exchange.CONTENT_TYPE, "application/json");
+                exchange.getIn().setBody(request);
+            })
+            .choice()
+                .when(exchangeProperty("skipVectorization").isEqualTo(true))
+                    .log(LoggingLevel.DEBUG, "Skipping generic vectorization for ${header.embeddingId}")
+                .otherwise()
+                    .marshal().json(JsonLibrary.Jackson)
+                    .setProperty("retryCount", constant(0))
+                    .setProperty("maxRetries", constant(properties.getVectorization().getMaxRetries()))
+                    .setProperty("vectorizationSuccess", constant(false))
+                    // Bounded retry loop; exponential backoff (2^n seconds) from the second attempt on.
+                    .loopDoWhile(simple("${exchangeProperty.vectorizationSuccess} == false && ${exchangeProperty.retryCount} < ${exchangeProperty.maxRetries}"))
+                        .process(exchange -> {
+                            Integer retryCount = exchange.getProperty("retryCount", Integer.class);
+                            exchange.setProperty("retryCount", retryCount + 1);
+                            if (retryCount > 0) {
+                                long backoffMs = (long) Math.pow(2, retryCount) * 1000L;
+                                Thread.sleep(backoffMs);
+                            }
+                        })
+                        .doTry()
+                            .toD(properties.getVectorization().getApiUrl() + "/embed?bridgeEndpoint=true&throwExceptionOnFailure=false&connectTimeout=" +
+                                properties.getVectorization().getConnectTimeout() + "&socketTimeout=" +
+                                properties.getVectorization().getSocketTimeout())
+                            .process(exchange -> {
+                                // throwExceptionOnFailure=false above means we must check the status ourselves.
+                                Integer statusCode = exchange.getIn().getHeader(Exchange.HTTP_RESPONSE_CODE, Integer.class);
+                                if (statusCode == null || statusCode != 200) {
+                                    String body = exchange.getIn().getBody(String.class);
+                                    throw new RuntimeException("Embedding service returned HTTP " + statusCode + ": " + body);
+                                }
+                            })
+                            .unmarshal().json(JsonLibrary.Jackson, EmbedResponse.class)
+                            .process(exchange -> {
+                                UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
+                                EmbedResponse response = exchange.getIn().getBody(EmbedResponse.class);
+                                if (response == null || response.embedding == null) {
+                                    throw new RuntimeException("Embedding service returned null embedding response");
+                                }
+                                processingService.saveEmbedding(embeddingId, response.embedding, response.tokenCount);
+                                exchange.setProperty("vectorizationSuccess", true);
+                            })
+                        .doCatch(Exception.class)
+                            .process(exchange -> {
+                                UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
+                                Integer retryCount = exchange.getProperty("retryCount", Integer.class);
+                                Integer maxRetries = exchange.getProperty("maxRetries", Integer.class);
+                                Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class);
+                                String errorMsg = exception != null ? exception.getMessage() : "Unknown error";
+                                if (errorMsg != null && errorMsg.contains("Connection pool shut down")) {
+                                    // Shutdown in progress: stop retrying without marking the embedding FAILED.
+                                    log.warn("Generic vectorization aborted for {} because the application is shutting down", embeddingId);
+                                    exchange.setProperty("vectorizationSuccess", true);
+                                    return;
+                                }
+                                if (retryCount >= maxRetries) {
+                                    processingService.markAsFailed(embeddingId, errorMsg);
+                                } else {
+                                    log.warn("Generic vectorization attempt #{} failed for {}: {}", retryCount, embeddingId, errorMsg);
+                                }
+                            })
+                        .end()
+                    .end()
+            .end();
+
+        // Scheduler: periodically sweep PENDING embeddings, falling back to FAILED retries.
+        from("timer:generic-vectorization-scheduler?period=" + properties.getVectorization().getGenericSchedulerPeriodMs() + "&delay=500")
+            .routeId(ROUTE_ID_SCHEDULER)
+            .process(exchange -> {
+                int batchSize = properties.getVectorization().getBatchSize();
+                List<UUID> pending = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.PENDING, PageRequest.of(0, batchSize));
+                List<UUID> failed = List.of();
+                if (pending.isEmpty()) {
+                    failed = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.FAILED, PageRequest.of(0, batchSize));
+                }
+                List<UUID> toProcess = !pending.isEmpty() ? pending : failed;
+                if (toProcess.isEmpty()) {
+                    exchange.setProperty("noPendingEmbeddings", true);
+                } else {
+                    exchange.getIn().setBody(toProcess);
+                }
+            })
+            .choice()
+                .when(exchangeProperty("noPendingEmbeddings").isEqualTo(true))
+                    .log(LoggingLevel.DEBUG, "Generic vectorization scheduler: nothing pending")
+                .otherwise()
+                    .split(body())
+                        .process(exchange -> {
+                            UUID embeddingId = exchange.getIn().getBody(UUID.class);
+                            exchange.getIn().setHeader("embeddingId", embeddingId);
+                        })
+                        .to("direct:vectorize-embedding")
+                    .end()
+            .end();
+    }
+
+    /** JSON request body sent to the embedding service. */
+    public static class EmbedRequest {
+        @JsonProperty("text")
+        public String text;
+
+        @JsonProperty("is_query")
+        public boolean isQuery;
+    }
+
+    /** JSON response body returned by the embedding service. */
+    public static class EmbedResponse {
+        public float[] embedding;
+        public int dimensions;
+        @JsonProperty("token_count")
+        public int tokenCount;
+    }
+}
diff --git a/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java b/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java
new file mode 100644
index 0000000..b81c022
--- /dev/null
+++ b/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java
@@ -0,0 +1,142 @@
+package at.procon.dip.vectorization.service;
+
+import at.procon.dip.domain.document.DocumentStatus;
+import at.procon.dip.domain.document.EmbeddingStatus;
+import at.procon.dip.domain.document.entity.DocumentEmbedding;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
+import at.procon.dip.domain.document.service.DocumentService;
+import at.procon.ted.config.TedProcessorProperties;
+import at.procon.ted.model.entity.VectorizationStatus;
+import at.procon.ted.repository.ProcurementDocumentRepository;
+import at.procon.ted.service.VectorizationService;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Propagation;
+import org.springframework.transaction.annotation.Transactional;
+
+/**
+ * Phase 2 generic vectorization processor that works on DOC text representations and DOC embeddings.
+ *
+ * The service keeps the existing TED semantic search operational by optionally dual-writing completed
+ * embeddings back into the legacy TED procurement_document vector columns, resolved by document hash.
+ *
+ * All public methods run in REQUIRES_NEW transactions, independent of any caller transaction.
+ */
+@Service
+@RequiredArgsConstructor
+@Slf4j
+public class DocumentEmbeddingProcessingService {
+
+    private final DocumentEmbeddingRepository embeddingRepository;
+    private final DocumentService documentService;
+    private final VectorizationService vectorizationService;
+    private final TedProcessorProperties properties;
+    private final ProcurementDocumentRepository procurementDocumentRepository;
+
+    /**
+     * Marks the embedding PROCESSING and extracts everything the Camel route needs.
+     *
+     * @return the payload to send to the embedding service, or {@code null} when the
+     *         embedding is already PROCESSING or has no usable text (then it is SKIPPED).
+     * @throws IllegalArgumentException when the embedding id is unknown
+     */
+    @Transactional(propagation = Propagation.REQUIRES_NEW)
+    public EmbeddingPayload prepareEmbeddingForVectorization(UUID embeddingId) {
+        DocumentEmbedding embedding = embeddingRepository.findDetailedById(embeddingId)
+            .orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
+
+        if (embedding.getEmbeddingStatus() == EmbeddingStatus.PROCESSING) {
+            log.debug("Embedding {} is already PROCESSING, skipping duplicate queue entry", embeddingId);
+            return null;
+        }
+
+        embedding.setEmbeddingStatus(EmbeddingStatus.PROCESSING);
+        embedding.setErrorMessage(null);
+        embeddingRepository.save(embedding);
+
+        String textBody = embedding.getRepresentation().getTextBody();
+        if (textBody == null || textBody.isBlank()) {
+            // Nothing to vectorize: mark SKIPPED and leave the document REPRESENTED.
+            embedding.setEmbeddingStatus(EmbeddingStatus.SKIPPED);
+            embedding.setErrorMessage("No text representation available");
+            embedding.setEmbeddedAt(OffsetDateTime.now());
+            embeddingRepository.save(embedding);
+            documentService.updateStatus(embedding.getDocument().getId(), DocumentStatus.REPRESENTED);
+            return null;
+        }
+
+        // Hard cap so the embedding service never receives oversized input.
+        int maxLength = properties.getVectorization().getMaxTextLength();
+        if (textBody.length() > maxLength) {
+            log.debug("Truncating representation {} for embedding {} from {} to {} chars",
+                embedding.getRepresentation().getId(), embeddingId, textBody.length(), maxLength);
+            textBody = textBody.substring(0, maxLength);
+        }
+
+        return new EmbeddingPayload(
+            embedding.getId(),
+            embedding.getDocument().getId(),
+            embedding.getDocument().getDedupHash(),
+            textBody,
+            embedding.getModel().getDimensions(),
+            embedding.getModel().isQueryPrefixRequired(),
+            embedding.getRepresentation().getId()
+        );
+    }
+
+    /**
+     * Persists a successful embedding vector and advances the document to INDEXED.
+     *
+     * @throws IllegalArgumentException when the embedding id is unknown, the vector is
+     *         missing, or its dimension does not match the registered model
+     */
+    @Transactional(propagation = Propagation.REQUIRES_NEW)
+    public void saveEmbedding(UUID embeddingId, float[] embedding, Integer tokenCount) {
+        DocumentEmbedding loaded = embeddingRepository.findDetailedById(embeddingId)
+            .orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
+
+        int expectedDimensions = loaded.getModel().getDimensions();
+        if (embedding == null || embedding.length != expectedDimensions) {
+            throw new IllegalArgumentException("Invalid embedding dimension for " + embeddingId +
+                ": expected " + expectedDimensions + ", got " + (embedding == null ? 0 : embedding.length));
+        }
+
+        String vectorString = vectorizationService.floatArrayToVectorString(embedding);
+        embeddingRepository.updateEmbeddingVector(embeddingId, vectorString, tokenCount, embedding.length);
+        documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.INDEXED);
+
+        if (properties.getVectorization().isDualWriteLegacyTedVectors()) {
+            dualWriteLegacyTedVector(loaded, vectorString, tokenCount);
+        }
+    }
+
+    /**
+     * Marks the embedding FAILED, fails the document, and mirrors the failure into
+     * the legacy TED table when dual-write is enabled.
+     *
+     * Removed a no-op statement that called getDedupHash() and discarded its result,
+     * and added the same null/blank hash guard dualWriteLegacyTedVector uses.
+     */
+    @Transactional(propagation = Propagation.REQUIRES_NEW)
+    public void markAsFailed(UUID embeddingId, String errorMessage) {
+        DocumentEmbedding loaded = embeddingRepository.findDetailedById(embeddingId)
+            .orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
+
+        embeddingRepository.updateEmbeddingStatus(embeddingId, EmbeddingStatus.FAILED, errorMessage, null);
+        documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.FAILED);
+
+        if (properties.getVectorization().isDualWriteLegacyTedVectors()) {
+            String dedupHash = loaded.getDocument().getDedupHash();
+            if (dedupHash != null && !dedupHash.isBlank()) {
+                procurementDocumentRepository.findByDocumentHash(dedupHash)
+                    .ifPresent(doc -> procurementDocumentRepository.updateVectorizationStatus(
+                        doc.getId(), VectorizationStatus.FAILED, errorMessage, null));
+            }
+        }
+    }
+
+    /** Mirrors a completed DOC embedding into the legacy TED vector columns, matched by dedup hash. */
+    private void dualWriteLegacyTedVector(DocumentEmbedding embedding, String vectorString, Integer tokenCount) {
+        String dedupHash = embedding.getDocument().getDedupHash();
+        if (dedupHash == null || dedupHash.isBlank()) {
+            return;
+        }
+
+        procurementDocumentRepository.findByDocumentHash(dedupHash)
+            .ifPresentOrElse(
+                legacy -> {
+                    procurementDocumentRepository.updateContentVector(legacy.getId(), vectorString, tokenCount);
+                    log.debug("Dual-wrote embedding {} back to legacy TED document {}", embedding.getId(), legacy.getId());
+                },
+                () -> log.debug("No legacy TED document found for DOC embedding {} with dedup hash {}",
+                    embedding.getId(), dedupHash)
+            );
+    }
+
+    /** Immutable snapshot of everything the Camel route needs for one embedding call. */
+    public record EmbeddingPayload(
+        UUID embeddingId,
+        UUID documentId,
+        String dedupHash,
+        String textContent,
+        Integer expectedDimensions,
+        boolean queryPrefixRequired,
+        UUID representationId
+    ) {
+    }
+}
diff --git a/src/main/java/at/procon/dip/vectorization/spi/EmbeddingModelDescriptor.java b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingModelDescriptor.java
new file mode 100644
index 0000000..9e31809
--- /dev/null
+++ b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingModelDescriptor.java
@@ -0,0 +1,13 @@
+package at.procon.dip.vectorization.spi;
+
+/**
+ * Describes one embedding model registered in the platform.
+ */
+public record EmbeddingModelDescriptor(
+ String modelKey,
+ String provider,
+ // Expected vector dimensionality (validated against produced vectors by
+ // DocumentEmbeddingProcessingService.saveEmbedding).
+ int dimensions,
+ String distanceMetric,
+ // True when the model needs a prefix on query-mode texts (cf. the is_query flag
+ // in the embedding service request).
+ boolean queryPrefixRequired
+) {
+}
diff --git a/src/main/java/at/procon/dip/vectorization/spi/EmbeddingProvider.java b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingProvider.java
new file mode 100644
index 0000000..5d1cbff
--- /dev/null
+++ b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingProvider.java
@@ -0,0 +1,13 @@
+package at.procon.dip.vectorization.spi;
+
+import java.util.List;
+
+/**
+ * Provider abstraction for vectorization backends.
+ *
+ * Restores the {@code List<String>} parameter type missing from the raw
+ * declaration (the parameter carries raw text bodies to embed).
+ */
+public interface EmbeddingProvider {
+
+    /** Static description of the model this provider serves. */
+    EmbeddingModelDescriptor model();
+
+    /**
+     * Embeds a batch of texts.
+     *
+     * @param texts raw text bodies to vectorize
+     * @param queryMode true when embedding a search query rather than a document
+     */
+    EmbeddingResult embed(List<String> texts, boolean queryMode);
+}
diff --git a/src/main/java/at/procon/dip/vectorization/spi/EmbeddingResult.java b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingResult.java
new file mode 100644
index 0000000..eea6b65
--- /dev/null
+++ b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingResult.java
@@ -0,0 +1,13 @@
+package at.procon.dip.vectorization.spi;
+
+import java.util.List;
+
+/**
+ * Embedding output for one or more representations.
+ */
+public record EmbeddingResult(
+ EmbeddingModelDescriptor model,
+ // NOTE(review): raw List — element type lost; likely List<float[]> given the
+ // float[] vectors used elsewhere in the vectorization pipeline — confirm and restore.
+ List vectors,
+ // NOTE(review): raw List — presumably List<String>.
+ List warnings
+) {
+}
diff --git a/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java b/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java
new file mode 100644
index 0000000..6aff6e8
--- /dev/null
+++ b/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java
@@ -0,0 +1,41 @@
+package at.procon.dip.vectorization.startup;
+
+import at.procon.dip.domain.document.service.DocumentEmbeddingService;
+import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
+import at.procon.ted.config.TedProcessorProperties;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.boot.ApplicationArguments;
+import org.springframework.boot.ApplicationRunner;
+import org.springframework.stereotype.Component;
+
+/**
+ * Ensures the configured embedding model exists in DOC.doc_embedding_model.
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class ConfiguredEmbeddingModelStartupRunner implements ApplicationRunner {
+
+ private final TedProcessorProperties properties;
+ private final DocumentEmbeddingService embeddingService;
+
+ @Override
+ public void run(ApplicationArguments args) {
+ if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
+ return;
+ }
+
+ embeddingService.registerModel(new RegisterEmbeddingModelCommand(
+ properties.getVectorization().getModelName(),
+ properties.getVectorization().getEmbeddingProvider(),
+ properties.getVectorization().getModelName(),
+ properties.getVectorization().getDimensions(),
+ null,
+ false,
+ true
+ ));
+
+ log.info("Phase 2 embedding model ensured: {}", properties.getVectorization().getModelName());
+ }
+}
diff --git a/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java b/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java
new file mode 100644
index 0000000..5266c62
--- /dev/null
+++ b/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java
@@ -0,0 +1,60 @@
+package at.procon.dip.vectorization.startup;
+
+import at.procon.dip.domain.document.EmbeddingStatus;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
+import at.procon.ted.config.TedProcessorProperties;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.camel.ProducerTemplate;
+import org.springframework.boot.ApplicationArguments;
+import org.springframework.boot.ApplicationRunner;
+import org.springframework.data.domain.PageRequest;
+import org.springframework.stereotype.Component;
+
+/**
+ * Queues pending and failed DOC embeddings immediately on startup.
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class GenericVectorizationStartupRunner implements ApplicationRunner {
+
+    private static final int BATCH_SIZE = 1000;
+
+    private final TedProcessorProperties properties;
+    private final DocumentEmbeddingRepository embeddingRepository;
+    private final ProducerTemplate producerTemplate;
+
+    @Override
+    public void run(ApplicationArguments args) {
+        // Only relevant when the Phase 2 generic pipeline is active.
+        if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
+            return;
+        }
+
+        int queued = 0;
+        queued += queueByStatus(EmbeddingStatus.PENDING, "PENDING");
+        queued += queueByStatus(EmbeddingStatus.FAILED, "FAILED");
+        log.info("Generic vectorization startup runner queued {} embedding jobs", queued);
+    }
+
+    /**
+     * Pages through all embeddings in the given status and sends each id to the
+     * Camel trigger route. Queue failures are logged and skipped so one bad id
+     * cannot abort the whole sweep. Restores the {@code List<UUID>} element type
+     * lost in the raw declaration (required by the enhanced-for over UUID).
+     *
+     * @return number of successfully queued embedding ids
+     */
+    private int queueByStatus(EmbeddingStatus status, String label) {
+        int queued = 0;
+        int page = 0;
+        List<UUID> ids;
+        do {
+            ids = embeddingRepository.findIdsByEmbeddingStatus(status, PageRequest.of(page, BATCH_SIZE));
+            for (UUID id : ids) {
+                try {
+                    producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", null, "embeddingId", id);
+                    queued++;
+                } catch (Exception e) {
+                    log.warn("Failed to queue {} embedding {}: {}", label, id, e.getMessage());
+                }
+            }
+            page++;
+        } while (ids.size() == BATCH_SIZE);
+        return queued;
+    }
+}
diff --git a/src/main/java/at/procon/ted/TedProcurementProcessorApplication.java b/src/main/java/at/procon/ted/TedProcurementProcessorApplication.java
index c86a461..b8409c9 100644
--- a/src/main/java/at/procon/ted/TedProcurementProcessorApplication.java
+++ b/src/main/java/at/procon/ted/TedProcurementProcessorApplication.java
@@ -1,26 +1,20 @@
package at.procon.ted;
-import org.springframework.boot.SpringApplication;
-import org.springframework.boot.autoconfigure.SpringBootApplication;
-import org.springframework.scheduling.annotation.EnableAsync;
+import at.procon.dip.DocumentIntelligencePlatformApplication;
/**
- * TED Procurement Document Processor Application.
- *
- * Processes EU eForms public procurement notices from TED (Tenders Electronic Daily).
- * Features:
- * - Directory watching with Apache Camel for automated XML processing
- * - PostgreSQL storage with native XML support and pgvector for semantic search
- * - Asynchronous document vectorization using multilingual-e5-large model
- * - REST API for structured and semantic search
- *
- * @author Martin.Schweitzer@procon.co.at and claude.ai
+ * Legacy entry point kept for backward compatibility.
+ *
+ * The platform is being generalized beyond TED-specific procurement documents.
+ * New runtime packaging should use {@link DocumentIntelligencePlatformApplication}.
*/
-@SpringBootApplication
-@EnableAsync
-public class TedProcurementProcessorApplication {
+@Deprecated(forRemoval = false, since = "1.1.0")
+public final class TedProcurementProcessorApplication {
+
+ private TedProcurementProcessorApplication() {
+ }
public static void main(String[] args) {
- SpringApplication.run(TedProcurementProcessorApplication.class, args);
+ DocumentIntelligencePlatformApplication.main(args);
}
}
diff --git a/src/main/java/at/procon/ted/camel/VectorizationRoute.java b/src/main/java/at/procon/ted/camel/VectorizationRoute.java
index 40203af..84865ee 100644
--- a/src/main/java/at/procon/ted/camel/VectorizationRoute.java
+++ b/src/main/java/at/procon/ted/camel/VectorizationRoute.java
@@ -68,6 +68,10 @@ public class VectorizationRoute extends RouteBuilder {
log.info("Vectorization is disabled, skipping route configuration");
return;
}
+ if (properties.getVectorization().isGenericPipelineEnabled()) {
+ log.info("Legacy vectorization route disabled because Phase 2 generic pipeline is enabled");
+ return;
+ }
log.info("Configuring vectorization routes (enabled=true, apiUrl={}, connectTimeout={}ms, socketTimeout={}ms, maxRetries={}, scheduler every 6s)",
properties.getVectorization().getApiUrl(),
diff --git a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java
index 0e307cd..aa434c0 100644
--- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java
+++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java
@@ -152,6 +152,37 @@ public class TedProcessorProperties {
*/
@Min(0)
private int maxRetries = 5;
+
+ /**
+ * Enable the Phase 2 generic vectorization pipeline based on DOC text representations
+ * and DOC embeddings instead of the legacy TED document vector columns as the primary
+ * write target.
+ */
+ private boolean genericPipelineEnabled = true;
+
+ /**
+ * Keep writing completed TED embeddings back to the legacy ted.procurement_document
+ * vector columns so the existing semantic search stays operational during migration.
+ */
+ private boolean dualWriteLegacyTedVectors = true;
+
+ /**
+ * Scheduler interval for generic embedding polling (milliseconds).
+ */
+ @Positive
+ private long genericSchedulerPeriodMs = 6000;
+
+ /**
+ * Builder key for the primary TED semantic representation created during Phase 2 dual-write.
+ */
+ @NotBlank
+ private String primaryRepresentationBuilderKey = "ted-phase2-primary-representation";
+
+ /**
+ * Provider key used when registering the configured embedding model in DOC.doc_embedding_model.
+ */
+ @NotBlank
+ private String embeddingProvider = "http-embedding-service";
}
/**
diff --git a/src/main/java/at/procon/ted/controller/AdminController.java b/src/main/java/at/procon/ted/controller/AdminController.java
index acf5c3f..6434142 100644
--- a/src/main/java/at/procon/ted/controller/AdminController.java
+++ b/src/main/java/at/procon/ted/controller/AdminController.java
@@ -1,5 +1,8 @@
package at.procon.ted.controller;
+import at.procon.dip.domain.document.EmbeddingStatus;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
+import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.model.entity.ProcessingLog;
import at.procon.ted.model.entity.VectorizationStatus;
import at.procon.ted.repository.ProcurementDocumentRepository;
@@ -41,6 +44,9 @@ public class AdminController {
private final VectorizationService vectorizationService;
private final DocumentProcessingService documentProcessingService;
private final ProcurementDocumentRepository documentRepository;
+ private final DocumentEmbeddingRepository documentEmbeddingRepository;
+ private final TedProcessorProperties properties;
+ private final at.procon.ted.service.TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
private final at.procon.ted.repository.ProcessingLogRepository logRepository;
private final ProducerTemplate producerTemplate;
private final at.procon.ted.service.DataCleanupService dataCleanupService;
@@ -68,10 +74,17 @@ public class AdminController {
public ResponseEntity> getVectorizationStatus() {
Map status = new HashMap<>();
- List counts = documentRepository.countByVectorizationStatus();
Map statusCounts = new HashMap<>();
- for (Object[] row : counts) {
- statusCounts.put(((VectorizationStatus) row[0]).name(), (Long) row[1]);
+ if (properties.getVectorization().isGenericPipelineEnabled()) {
+ List counts = documentEmbeddingRepository.countByEmbeddingStatus();
+ for (Object[] row : counts) {
+ statusCounts.put(((EmbeddingStatus) row[0]).name(), (Long) row[1]);
+ }
+ } else {
+ List counts = documentRepository.countByVectorizationStatus();
+ for (Object[] row : counts) {
+ statusCounts.put(((VectorizationStatus) row[0]).name(), (Long) row[1]);
+ }
}
status.put("counts", statusCounts);
@@ -102,8 +115,14 @@ public class AdminController {
return ResponseEntity.badRequest().body(result);
}
- // Trigger vectorization via Camel route
- producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", documentId);
+ if (properties.getVectorization().isGenericPipelineEnabled()) {
+ var document = documentRepository.findById(documentId).orElseThrow();
+ UUID embeddingId = tedPhase2GenericDocumentService.registerOrRefreshTedDocument(document);
+ producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", null, "embeddingId", embeddingId);
+ result.put("embeddingId", embeddingId);
+ } else {
+ producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", documentId);
+ }
result.put("success", true);
result.put("message", "Vectorization triggered for document " + documentId);
@@ -127,15 +146,24 @@ public class AdminController {
return ResponseEntity.badRequest().body(result);
}
- var pending = documentRepository.findByVectorizationStatus(
- VectorizationStatus.PENDING,
- PageRequest.of(0, Math.min(batchSize, 500)));
-
int count = 0;
- for (var doc : pending) {
- // Trigger vectorization via Camel route
- producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", doc.getId());
- count++;
+ if (properties.getVectorization().isGenericPipelineEnabled()) {
+ var pending = documentEmbeddingRepository.findIdsByEmbeddingStatus(
+ EmbeddingStatus.PENDING,
+ PageRequest.of(0, Math.min(batchSize, 500)));
+ for (UUID embeddingId : pending) {
+ producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", null, "embeddingId", embeddingId);
+ count++;
+ }
+ } else {
+ var pending = documentRepository.findByVectorizationStatus(
+ VectorizationStatus.PENDING,
+ PageRequest.of(0, Math.min(batchSize, 500)));
+
+ for (var doc : pending) {
+ producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", doc.getId());
+ count++;
+ }
}
result.put("success", true);
diff --git a/src/main/java/at/procon/ted/event/VectorizationEventListener.java b/src/main/java/at/procon/ted/event/VectorizationEventListener.java
index 6823dac..0c2efd7 100644
--- a/src/main/java/at/procon/ted/event/VectorizationEventListener.java
+++ b/src/main/java/at/procon/ted/event/VectorizationEventListener.java
@@ -28,7 +28,7 @@ public class VectorizationEventListener {
*/
@TransactionalEventListener(phase = TransactionPhase.AFTER_COMMIT)
public void onDocumentSaved(DocumentSavedEvent event) {
- if (!properties.getVectorization().isEnabled()) {
+ if (!properties.getVectorization().isEnabled() || properties.getVectorization().isGenericPipelineEnabled()) {
return;
}
diff --git a/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java b/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java
index 0412b15..1526192 100644
--- a/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java
+++ b/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java
@@ -38,6 +38,7 @@ public class BatchDocumentProcessingService {
private final XmlParserService xmlParserService;
private final ProcurementDocumentRepository documentRepository;
private final ProcessingLogService processingLogService;
+ private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
/**
* Process a batch of XML files from a Daily Package.
@@ -129,6 +130,10 @@ public class BatchDocumentProcessingService {
ProcessingLog.EventStatus.SUCCESS,
"Document parsed and stored successfully (batch)", null,
doc.getSourceFilename(), 0);
+
+ if (doc.getDocumentHash() != null) {
+ tedPhase2GenericDocumentService.registerOrRefreshTedDocument(doc);
+ }
}
log.info("Successfully inserted {} documents in batch", savedDocuments.size());
diff --git a/src/main/java/at/procon/ted/service/DocumentProcessingService.java b/src/main/java/at/procon/ted/service/DocumentProcessingService.java
index 0d480fc..dd04db1 100644
--- a/src/main/java/at/procon/ted/service/DocumentProcessingService.java
+++ b/src/main/java/at/procon/ted/service/DocumentProcessingService.java
@@ -36,6 +36,7 @@ public class DocumentProcessingService {
private final ProcessingLogService processingLogService;
private final TedProcessorProperties properties;
private final ApplicationEventPublisher eventPublisher;
+ private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
/**
* Process an XML document from the file system.
@@ -87,10 +88,15 @@ public class DocumentProcessingService {
"Document parsed and stored successfully", null, filename,
(int) (System.currentTimeMillis() - startTime));
- // Publish event to trigger vectorization AFTER transaction commit
- // This ensures document is visible in DB and avoids transaction isolation issues
- eventPublisher.publishEvent(new DocumentSavedEvent(document.getId(), document.getPublicationId()));
- log.debug("Document saved successfully, vectorization event published: {}", document.getId());
+ if (properties.getVectorization().isGenericPipelineEnabled()) {
+ tedPhase2GenericDocumentService.registerOrRefreshTedDocument(document);
+ log.debug("Document saved successfully, Phase 2 generic vectorization record ensured: {}", document.getId());
+ } else {
+ // Publish event to trigger vectorization AFTER transaction commit
+ // This ensures document is visible in DB and avoids transaction isolation issues
+ eventPublisher.publishEvent(new DocumentSavedEvent(document.getId(), document.getPublicationId()));
+ log.debug("Document saved successfully, vectorization event published: {}", document.getId());
+ }
return ProcessingResult.success(document.getId(), documentHash, document.getPublicationId());
@@ -141,9 +147,11 @@ public class DocumentProcessingService {
documentRepository.save(updated);
- // Note: Re-vectorization will be triggered automatically by
- // VectorizationRoute scheduler (checks for PENDING documents every 60s)
+ if (properties.getVectorization().isGenericPipelineEnabled()) {
+ tedPhase2GenericDocumentService.registerOrRefreshTedDocument(updated);
+ }
+ // Note: Re-vectorization will be triggered automatically by the active scheduler
return updated;
} catch (Exception e) {
log.error("Failed to reprocess document {}: {}", publicationId, e.getMessage());
diff --git a/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java b/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java
new file mode 100644
index 0000000..9563578
--- /dev/null
+++ b/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java
@@ -0,0 +1,197 @@
+package at.procon.ted.service;
+
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentStatus;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.domain.document.SourceType;
+import at.procon.dip.domain.document.StorageType;
+import at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.entity.DocumentContent;
+import at.procon.dip.domain.document.entity.DocumentEmbedding;
+import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
+import at.procon.dip.domain.document.entity.DocumentSource;
+import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
+import at.procon.dip.domain.document.repository.DocumentContentRepository;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
+import at.procon.dip.domain.document.repository.DocumentRepository;
+import at.procon.dip.domain.document.repository.DocumentSourceRepository;
+import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
+import at.procon.dip.domain.document.service.DocumentEmbeddingService;
+import at.procon.dip.domain.document.service.DocumentService;
+import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
+import at.procon.ted.config.TedProcessorProperties;
+import at.procon.ted.model.entity.ProcurementDocument;
+import java.time.OffsetDateTime;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+/**
+ * Phase 2 bridge that dual-writes TED documents into the generic DOC persistence backbone.
+ *
+ * For each TED ProcurementDocument this service ensures, idempotently: the generic
+ * DOC document (deduplicated by document hash), a provenance source record, the
+ * ORIGINAL XML content, a single primary SEMANTIC_TEXT representation, and a
+ * PENDING embedding row that the generic vectorization pipeline later consumes.
+ */
+@Service
+@RequiredArgsConstructor
+@Slf4j
+public class TedPhase2GenericDocumentService {
+
+ private final TedProcessorProperties properties;
+ private final DocumentRepository documentRepository;
+ private final DocumentContentRepository contentRepository;
+ private final DocumentSourceRepository sourceRepository;
+ private final DocumentTextRepresentationRepository representationRepository;
+ private final DocumentEmbeddingRepository embeddingRepository;
+ private final DocumentService documentService;
+ private final DocumentEmbeddingService embeddingService;
+
+ /**
+  * Creates or refreshes the DOC mirror of the given TED document.
+  *
+  * @param tedDocument source TED entity; assumed to carry a non-null document hash
+  *                    (callers in this patch guard on getDocumentHash() != null — confirm)
+  * @return id of the PENDING embedding to vectorize, or null when the generic
+  *         pipeline is disabled
+  */
+ @Transactional
+ public UUID registerOrRefreshTedDocument(ProcurementDocument tedDocument) {
+ if (!properties.getVectorization().isGenericPipelineEnabled()) {
+ return null;
+ }
+
+ // Dedup by content hash: re-imports of the same notice update the existing DOC row in place.
+ Document document = documentRepository.findByDedupHash(tedDocument.getDocumentHash())
+ .orElseGet(() -> createGenericDocument(tedDocument));
+
+ // Field values are (re-)applied unconditionally so refreshed TED data always wins.
+ document.setDocumentType(DocumentType.TED_NOTICE);
+ document.setDocumentFamily(DocumentFamily.PROCUREMENT);
+ document.setVisibility(DocumentVisibility.PUBLIC);
+ document.setStatus(DocumentStatus.REPRESENTED);
+ document.setTitle(tedDocument.getProjectTitle());
+ document.setSummary(tedDocument.getProjectDescription());
+ document.setLanguageCode(tedDocument.getLanguageCode());
+ document.setMimeType("application/xml");
+ document.setBusinessKey(buildBusinessKey(tedDocument));
+ document.setDedupHash(tedDocument.getDocumentHash());
+ document = documentRepository.save(document);
+
+ // Order matters: content must exist before the representation referencing it,
+ // and the representation before the embedding referencing it.
+ ensureTedSource(document, tedDocument);
+ DocumentContent originalContent = ensureOriginalContent(document, tedDocument);
+ DocumentTextRepresentation representation = ensurePrimaryRepresentation(document, originalContent, tedDocument);
+ DocumentEmbedding embedding = ensurePendingEmbedding(document, representation);
+
+ log.debug("Phase 2 DOC bridge ensured generic TED document {} -> embedding {}", document.getId(), embedding.getId());
+ return embedding.getId();
+ }
+
+ // Creates the DOC document shell via the generic command API; the caller
+ // re-applies all field values on every refresh, so only initial state matters here.
+ private Document createGenericDocument(ProcurementDocument tedDocument) {
+ return documentService.create(new at.procon.dip.domain.document.service.command.CreateDocumentCommand(
+ null,
+ DocumentVisibility.PUBLIC,
+ DocumentType.TED_NOTICE,
+ DocumentFamily.PROCUREMENT,
+ DocumentStatus.REPRESENTED,
+ tedDocument.getProjectTitle(),
+ tedDocument.getProjectDescription(),
+ tedDocument.getLanguageCode(),
+ "application/xml",
+ buildBusinessKey(tedDocument),
+ tedDocument.getDocumentHash()
+ ));
+ }
+
+ // Records provenance once per external id (publication id, falling back to the
+ // TED row id); no-op when a source with the same external id already exists.
+ private void ensureTedSource(Document document, ProcurementDocument tedDocument) {
+ String externalId = tedDocument.getPublicationId() != null ? tedDocument.getPublicationId() : tedDocument.getId().toString();
+ boolean sourceExists = sourceRepository.findByDocument_Id(document.getId()).stream()
+ .anyMatch(existing -> externalId.equals(existing.getExternalSourceId()));
+ if (sourceExists) {
+ return;
+ }
+
+ DocumentSource source = DocumentSource.builder()
+ .document(document)
+ .sourceType(SourceType.FILE_SYSTEM)
+ .externalSourceId(externalId)
+ .sourceUri(tedDocument.getSourcePath())
+ .sourceFilename(tedDocument.getSourceFilename())
+ .importBatchId("ted-phase2")
+ .receivedAt(OffsetDateTime.now())
+ .build();
+ sourceRepository.save(source);
+ }
+
+ // Stores or refreshes the raw notice XML as the in-database ORIGINAL content.
+ // Only the first existing ORIGINAL row is refreshed; duplicates, if any, are ignored.
+ private DocumentContent ensureOriginalContent(Document document, ProcurementDocument tedDocument) {
+ List existing = contentRepository.findByDocument_IdAndContentRole(document.getId(), ContentRole.ORIGINAL);
+ if (!existing.isEmpty()) {
+ DocumentContent content = existing.get(0);
+ content.setMimeType("application/xml");
+ content.setStorageType(StorageType.DB_TEXT);
+ content.setTextContent(tedDocument.getXmlDocument());
+ content.setContentHash(tedDocument.getDocumentHash());
+ content.setSizeBytes(tedDocument.getFileSizeBytes());
+ return contentRepository.save(content);
+ }
+
+ DocumentContent content = DocumentContent.builder()
+ .document(document)
+ .contentRole(ContentRole.ORIGINAL)
+ .storageType(StorageType.DB_TEXT)
+ .mimeType("application/xml")
+ .charsetName("UTF-8")
+ .textContent(tedDocument.getXmlDocument())
+ .contentHash(tedDocument.getDocumentHash())
+ .sizeBytes(tedDocument.getFileSizeBytes())
+ .build();
+ return contentRepository.save(content);
+ }
+
+ // Upserts the single primary SEMANTIC_TEXT representation for the document,
+ // falling back to the project description when no extracted text exists.
+ // Chunking fields are reset to null because this is a whole-document representation.
+ private DocumentTextRepresentation ensurePrimaryRepresentation(Document document,
+ DocumentContent originalContent,
+ ProcurementDocument tedDocument) {
+ DocumentTextRepresentation representation = representationRepository
+ .findFirstByDocument_IdAndPrimaryRepresentationTrue(document.getId())
+ .orElseGet(DocumentTextRepresentation::new)
+
+ representation.setDocument(document);
+ representation.setContent(originalContent);
+ representation.setRepresentationType(RepresentationType.SEMANTIC_TEXT);
+ representation.setBuilderKey(properties.getVectorization().getPrimaryRepresentationBuilderKey());
+ representation.setLanguageCode(tedDocument.getLanguageCode());
+ representation.setPrimaryRepresentation(true);
+ representation.setTextBody(tedDocument.getTextContent() != null ? tedDocument.getTextContent() : tedDocument.getProjectDescription());
+ representation.setTokenCount(null);
+ representation.setChunkIndex(null);
+ representation.setChunkStartOffset(null);
+ representation.setChunkEndOffset(null);
+ return representationRepository.save(representation);
+ }
+
+ // Ensures a PENDING embedding row for (representation, model). An existing row is
+ // reset to PENDING so refreshed content is re-vectorized by the generic pipeline.
+ private DocumentEmbedding ensurePendingEmbedding(Document document, DocumentTextRepresentation representation) {
+ // NOTE(review): registerModel runs for every document; assumed to be an
+ // idempotent upsert keyed by model name — confirm in DocumentEmbeddingService.
+ DocumentEmbeddingModel model = embeddingService.registerModel(new RegisterEmbeddingModelCommand(
+ properties.getVectorization().getModelName(),
+ properties.getVectorization().getEmbeddingProvider(),
+ properties.getVectorization().getModelName(),
+ properties.getVectorization().getDimensions(),
+ null,
+ false,
+ true
+ ));
+
+ return embeddingRepository.findByRepresentation_IdAndModel_Id(representation.getId(), model.getId())
+ .map(existing -> {
+ existing.setDocument(document);
+ existing.setRepresentation(representation);
+ existing.setModel(model);
+ existing.setEmbeddingStatus(at.procon.dip.domain.document.EmbeddingStatus.PENDING);
+ existing.setErrorMessage(null);
+ existing.setEmbeddedAt(null);
+ return embeddingRepository.save(existing);
+ })
+ .orElseGet(() -> embeddingService.createPendingEmbedding(document.getId(), representation.getId(), model.getId()));
+ }
+
+ // Stable business key, preferring publication id over notice URL over content hash.
+ private String buildBusinessKey(ProcurementDocument tedDocument) {
+ if (tedDocument.getPublicationId() != null && !tedDocument.getPublicationId().isBlank()) {
+ return "TED:publication:" + tedDocument.getPublicationId();
+ }
+ if (tedDocument.getNoticeUrl() != null && !tedDocument.getNoticeUrl().isBlank()) {
+ return "TED:url:" + tedDocument.getNoticeUrl();
+ }
+ return "TED:hash:" + tedDocument.getDocumentHash();
+ }
+}
diff --git a/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java b/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java
index 2048f3d..b75c2be 100644
--- a/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java
+++ b/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java
@@ -44,6 +44,10 @@ public class VectorizationStartupRunner implements ApplicationRunner {
log.info("Vectorization is disabled, skipping startup processing");
return;
}
+ if (properties.getVectorization().isGenericPipelineEnabled()) {
+ log.info("Legacy vectorization startup runner disabled because Phase 2 generic pipeline is enabled");
+ return;
+ }
log.info("Checking for pending and failed vectorizations on startup...");
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index 8edd412..15dcad8 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -1,19 +1,19 @@
-# TED Procurement Document Processor Configuration
+# Document Intelligence Platform Configuration
# Author: Martin.Schweitzer@procon.co.at and claude.ai
server:
- port: 8888
+ port: 8889
servlet:
context-path: /api
spring:
application:
- name: ted-procurement-processor
+ name: document-intelligence-platform
datasource:
- url: jdbc:postgresql://94.130.218.54:32333/RELM
+ url: jdbc:postgresql://localhost:5432/RELM
username: ${DB_USERNAME:postgres}
- password: ${DB_PASSWORD:PDmXRx0Rbk9OFOn9qO5Gm/mPCfqW8zwbZ+/YIU1lySc=}
+ password: ${DB_PASSWORD:} # no default committed; supply via DB_PASSWORD env var
driver-class-name: org.postgresql.Driver
hikari:
maximum-pool-size: 5
@@ -25,11 +25,12 @@ spring:
jpa:
hibernate:
- ddl-auto: none
+ ddl-auto: validate # Flyway owns DDL (see V4 migration); hibernate auto-update conflicts with versioned migrations
show-sql: false
open-in-view: false
properties:
hibernate:
+ dialect: org.hibernate.dialect.PostgreSQLDialect
format_sql: true
default_schema: TED
jdbc:
@@ -42,7 +43,9 @@ spring:
locations: classpath:db/migration
baseline-on-migrate: true
create-schemas: true
- schemas: TED
+ schemas:
+ - TED
+ - DOC
default-schema: TED
# Apache Camel Configuration
@@ -102,6 +105,16 @@ ted:
socket-timeout: 60000
# Maximum retries on connection failure
max-retries: 5
+ # Phase 2: use generic DOC representation/embedding pipeline as primary vectorization path
+ generic-pipeline-enabled: true
+ # Keep legacy TED vector columns updated until semantic search is migrated
+ dual-write-legacy-ted-vectors: true
+ # Scheduler interval for generic embedding polling
+ generic-scheduler-period-ms: 6000
+ # Builder identifier for primary TED semantic representations in DOC
+ primary-representation-builder-key: ted-phase2-primary-representation
+ # Provider key stored in DOC.doc_embedding_model
+ embedding-provider: http-embedding-service
# Search configuration
search:
@@ -115,7 +128,7 @@ ted:
# TED Daily Package Download configuration
download:
# Enable/disable automatic package download
- enabled: true
+ enabled: false
# Base URL for TED Daily Packages
base-url: https://ted.europa.eu/packages/daily/
# Download directory for tar.gz files
@@ -148,7 +161,7 @@ ted:
# IMAP Mail configuration
mail:
# Enable/disable mail processing
- enabled: true
+ enabled: false
# IMAP server hostname
host: mail.mymagenta.business
# IMAP server port (993 for IMAPS)
@@ -172,11 +185,11 @@ ted:
# Max messages per poll
max-messages-per-poll: 10
# Output directory for processed attachments
- attachment-output-directory: D:/ted.europe/mail-attachments
+ attachment-output-directory: /ted.europe/mail-attachments
# Enable/disable MIME file input processing
mime-input-enabled: true
# Input directory for MIME files (.eml)
- mime-input-directory: D:/ted.europe/mime-input
+ mime-input-directory: /ted.europe/mime-input
# File pattern for MIME files (regex)
mime-input-pattern: .*\\.eml
# Polling interval for MIME input directory (milliseconds)
@@ -185,7 +198,7 @@ ted:
# Solution Brief processing configuration
solution-brief:
# Enable/disable Solution Brief processing
- enabled: true
+ enabled: false
# Input directory for Solution Brief PDF files
input-directory: C:/work/SolutionBrief
# Output directory for Excel result files (relative to input or absolute)
diff --git a/src/main/resources/db/migration/V4__add_doc_generic_persistence_backbone.sql b/src/main/resources/db/migration/V4__add_doc_generic_persistence_backbone.sql
new file mode 100644
index 0000000..4e81c62
--- /dev/null
+++ b/src/main/resources/db/migration/V4__add_doc_generic_persistence_backbone.sql
@@ -0,0 +1,281 @@
+-- Phase 1: Generic DOC persistence backbone for the Procon Document Intelligence Platform
+-- This migration is additive and intentionally does not modify the existing TED runtime tables.
+
+CREATE SCHEMA IF NOT EXISTS DOC;
+
+SET search_path TO TED, DOC, public;
+
+DO $$
+BEGIN
+ CREATE EXTENSION IF NOT EXISTS pgcrypto SCHEMA public;
+EXCEPTION
+ WHEN insufficient_privilege THEN
+ RAISE NOTICE 'Skipping pgcrypto extension creation (insufficient privileges)';
+ WHEN duplicate_object THEN
+ RAISE NOTICE 'Extension pgcrypto already exists';
+END
+$$;
+
+DO $$
+BEGIN
+ CREATE EXTENSION IF NOT EXISTS vector SCHEMA public;
+EXCEPTION
+ WHEN insufficient_privilege THEN
+ RAISE NOTICE 'Skipping vector extension creation (insufficient privileges)';
+ WHEN duplicate_object THEN
+ RAISE NOTICE 'Extension vector already exists';
+ WHEN undefined_file THEN
+ RAISE WARNING 'Extension vector not available - install pgvector on the database server';
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_document_visibility') THEN
+ CREATE TYPE DOC.doc_document_visibility AS ENUM ('PUBLIC', 'TENANT', 'SHARED', 'RESTRICTED');
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_document_type') THEN
+ CREATE TYPE DOC.doc_document_type AS ENUM (
+ 'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
+ 'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN'
+ );
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_document_family') THEN
+ CREATE TYPE DOC.doc_document_family AS ENUM ('PROCUREMENT', 'MAIL', 'ATTACHMENT', 'KNOWLEDGE', 'GENERIC');
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_document_status') THEN
+ CREATE TYPE DOC.doc_document_status AS ENUM ('RECEIVED', 'CLASSIFIED', 'EXTRACTED', 'REPRESENTED', 'INDEXED', 'FAILED', 'ARCHIVED');
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_source_type') THEN
+ CREATE TYPE DOC.doc_source_type AS ENUM ('TED_PACKAGE', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD', 'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION');
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_content_role') THEN
+ CREATE TYPE DOC.doc_content_role AS ENUM (
+ 'ORIGINAL', 'NORMALIZED_TEXT', 'OCR_TEXT', 'HTML_CLEAN',
+ 'EXTRACTED_METADATA_JSON', 'THUMBNAIL', 'DERIVED_BINARY'
+ );
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_storage_type') THEN
+ CREATE TYPE DOC.doc_storage_type AS ENUM ('DB_TEXT', 'DB_BINARY', 'FILE_PATH', 'OBJECT_STORAGE', 'EXTERNAL_REFERENCE');
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_representation_type') THEN
+ CREATE TYPE DOC.doc_representation_type AS ENUM ('FULLTEXT', 'SEMANTIC_TEXT', 'SUMMARY', 'TITLE_ABSTRACT', 'CHUNK', 'METADATA_ENRICHED');
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_embedding_status') THEN
+ CREATE TYPE DOC.doc_embedding_status AS ENUM ('PENDING', 'PROCESSING', 'COMPLETED', 'FAILED', 'SKIPPED');
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_distance_metric') THEN
+ CREATE TYPE DOC.doc_distance_metric AS ENUM ('COSINE', 'L2', 'INNER_PRODUCT');
+ END IF;
+END
+$$;
+
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_relation_type') THEN
+ CREATE TYPE DOC.doc_relation_type AS ENUM ('CONTAINS', 'ATTACHMENT_OF', 'EXTRACTED_FROM', 'DERIVED_FROM', 'PART_OF', 'VERSION_OF', 'RELATED_TO');
+ END IF;
+END
+$$;
+
+CREATE TABLE IF NOT EXISTS DOC.doc_tenant (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ tenant_key VARCHAR(120) NOT NULL UNIQUE,
+ display_name VARCHAR(255) NOT NULL,
+ description TEXT,
+ active BOOLEAN NOT NULL DEFAULT TRUE,
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS DOC.doc_document (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ owner_tenant_id UUID REFERENCES DOC.doc_tenant(id),
+ visibility DOC.doc_document_visibility NOT NULL,
+ document_type DOC.doc_document_type NOT NULL,
+ document_family DOC.doc_document_family NOT NULL,
+ status DOC.doc_document_status NOT NULL DEFAULT 'RECEIVED',
+ title VARCHAR(1000),
+ summary TEXT,
+ language_code VARCHAR(16),
+ mime_type VARCHAR(255),
+ business_key VARCHAR(255),
+ dedup_hash VARCHAR(64),
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS DOC.doc_source (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
+ source_type DOC.doc_source_type NOT NULL,
+ external_source_id VARCHAR(500),
+ source_uri TEXT,
+ source_filename VARCHAR(1000),
+ parent_source_id UUID,
+ import_batch_id VARCHAR(255),
+ received_at TIMESTAMP WITH TIME ZONE,
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS DOC.doc_content (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
+ content_role DOC.doc_content_role NOT NULL,
+ storage_type DOC.doc_storage_type NOT NULL,
+ mime_type VARCHAR(255),
+ charset_name VARCHAR(120),
+ text_content TEXT,
+ binary_ref TEXT,
+ content_hash VARCHAR(64),
+ size_bytes BIGINT,
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS DOC.doc_text_representation (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
+ content_id UUID REFERENCES DOC.doc_content(id) ON DELETE SET NULL,
+ representation_type DOC.doc_representation_type NOT NULL,
+ builder_key VARCHAR(255),
+ language_code VARCHAR(16),
+ token_count INTEGER,
+ char_count INTEGER,
+ chunk_index INTEGER,
+ chunk_start_offset INTEGER,
+ chunk_end_offset INTEGER,
+ is_primary BOOLEAN NOT NULL DEFAULT FALSE,
+ text_body TEXT NOT NULL,
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS DOC.doc_embedding_model ( -- registry of embedding models and their vector geometry
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ model_key VARCHAR(255) NOT NULL UNIQUE, -- UNIQUE already creates a backing index in PostgreSQL
+ provider VARCHAR(120) NOT NULL,
+ display_name VARCHAR(255),
+ dimensions INTEGER NOT NULL,
+ distance_metric DOC.doc_distance_metric NOT NULL DEFAULT 'COSINE',
+ query_prefix_required BOOLEAN NOT NULL DEFAULT FALSE, -- presumably some models need a query-side prefix string — confirm
+ active BOOLEAN NOT NULL DEFAULT TRUE,
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP -- no trigger visible to maintain this — TODO confirm update path
+);
+
+CREATE TABLE IF NOT EXISTS DOC.doc_embedding ( -- embedding lifecycle, kept separate from document structure
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE, -- denormalized; must agree with the representation's document — confirm
+ representation_id UUID NOT NULL REFERENCES DOC.doc_text_representation(id) ON DELETE CASCADE,
+ model_id UUID NOT NULL REFERENCES DOC.doc_embedding_model(id), -- no ON DELETE action: model rows with embeddings cannot be deleted
+ embedding_status DOC.doc_embedding_status NOT NULL DEFAULT 'PENDING',
+ token_count INTEGER,
+ embedding_dimensions INTEGER,
+ error_message TEXT,
+ embedded_at TIMESTAMP WITH TIME ZONE,
+ embedding_vector public.vector, -- dimensionless: pgvector ANN indexes require a fixed dimension per column — TODO confirm indexing plan
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS DOC.doc_relation ( -- typed parent/child edges between documents
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ parent_document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
+ child_document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
+ relation_type DOC.doc_relation_type NOT NULL,
+ sort_order INTEGER,
+ relation_metadata TEXT, -- free-form; presumably JSON — confirm against application code
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ CONSTRAINT chk_doc_relation_no_self CHECK (parent_document_id <> child_document_id) -- NOTE(review): no uniqueness on (parent, child, type); duplicate edges possible — confirm intent
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_tenant_key ON DOC.doc_tenant(tenant_key); -- redundant if tenant_key already carries UNIQUE — verify doc_tenant DDL
+CREATE INDEX IF NOT EXISTS idx_doc_tenant_active ON DOC.doc_tenant(active);
+
+CREATE INDEX IF NOT EXISTS idx_doc_document_type ON DOC.doc_document(document_type);
+CREATE INDEX IF NOT EXISTS idx_doc_document_family ON DOC.doc_document(document_family);
+CREATE INDEX IF NOT EXISTS idx_doc_document_status ON DOC.doc_document(status);
+CREATE INDEX IF NOT EXISTS idx_doc_document_visibility ON DOC.doc_document(visibility);
+CREATE INDEX IF NOT EXISTS idx_doc_document_owner_tenant ON DOC.doc_document(owner_tenant_id);
+CREATE INDEX IF NOT EXISTS idx_doc_document_dedup_hash ON DOC.doc_document(dedup_hash);
+CREATE INDEX IF NOT EXISTS idx_doc_document_business_key ON DOC.doc_document(business_key);
+CREATE INDEX IF NOT EXISTS idx_doc_document_created_at ON DOC.doc_document(created_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_doc_source_document ON DOC.doc_source(document_id);
+CREATE INDEX IF NOT EXISTS idx_doc_source_type ON DOC.doc_source(source_type);
+CREATE INDEX IF NOT EXISTS idx_doc_source_external_id ON DOC.doc_source(external_source_id);
+CREATE INDEX IF NOT EXISTS idx_doc_source_received_at ON DOC.doc_source(received_at DESC);
+CREATE INDEX IF NOT EXISTS idx_doc_source_parent_source ON DOC.doc_source(parent_source_id);
+
+CREATE INDEX IF NOT EXISTS idx_doc_content_document ON DOC.doc_content(document_id);
+CREATE INDEX IF NOT EXISTS idx_doc_content_role ON DOC.doc_content(content_role);
+CREATE INDEX IF NOT EXISTS idx_doc_content_hash ON DOC.doc_content(content_hash);
+CREATE INDEX IF NOT EXISTS idx_doc_content_storage_type ON DOC.doc_content(storage_type);
+
+CREATE INDEX IF NOT EXISTS idx_doc_text_repr_document ON DOC.doc_text_representation(document_id);
+CREATE INDEX IF NOT EXISTS idx_doc_text_repr_content ON DOC.doc_text_representation(content_id);
+CREATE INDEX IF NOT EXISTS idx_doc_text_repr_type ON DOC.doc_text_representation(representation_type);
+CREATE INDEX IF NOT EXISTS idx_doc_text_repr_primary ON DOC.doc_text_representation(is_primary);
+
+-- Removed idx_doc_embedding_model_key: it duplicated the implicit index PostgreSQL creates for the UNIQUE constraint on doc_embedding_model.model_key.
+CREATE INDEX IF NOT EXISTS idx_doc_embedding_model_active ON DOC.doc_embedding_model(active);
+
+CREATE INDEX IF NOT EXISTS idx_doc_embedding_document ON DOC.doc_embedding(document_id);
+CREATE INDEX IF NOT EXISTS idx_doc_embedding_repr ON DOC.doc_embedding(representation_id);
+CREATE INDEX IF NOT EXISTS idx_doc_embedding_model ON DOC.doc_embedding(model_id);
+CREATE INDEX IF NOT EXISTS idx_doc_embedding_status ON DOC.doc_embedding(embedding_status);
+CREATE INDEX IF NOT EXISTS idx_doc_embedding_embedded_at ON DOC.doc_embedding(embedded_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_doc_relation_parent ON DOC.doc_relation(parent_document_id);
+CREATE INDEX IF NOT EXISTS idx_doc_relation_child ON DOC.doc_relation(child_document_id);
+CREATE INDEX IF NOT EXISTS idx_doc_relation_type ON DOC.doc_relation(relation_type);
+
+COMMENT ON SCHEMA DOC IS 'Generic document platform schema introduced in Phase 1'; -- catalog documentation only; no runtime effect
+COMMENT ON TABLE DOC.doc_document IS 'Canonical document root with optional owner tenant and mandatory visibility';
+COMMENT ON TABLE DOC.doc_content IS 'Stored payload variants for a canonical document';
+COMMENT ON TABLE DOC.doc_text_representation IS 'Search-oriented text representations derived from document content';
+COMMENT ON TABLE DOC.doc_embedding IS 'Embedding lifecycle separated from document structure'; -- doc_source/doc_relation/doc_embedding_model have no COMMENT — intentional? confirm
diff --git a/src/main/resources/db/migration/V5__doc_phase2_vectorization_support.sql b/src/main/resources/db/migration/V5__doc_phase2_vectorization_support.sql
new file mode 100644
index 0000000..cce7b19
--- /dev/null
+++ b/src/main/resources/db/migration/V5__doc_phase2_vectorization_support.sql
@@ -0,0 +1,14 @@
+-- Phase 2: Vectorization decoupling support in the generic DOC schema
+-- Adds safety constraints and indexes for representation-based embedding processing.
+
+CREATE UNIQUE INDEX IF NOT EXISTS uq_doc_embedding_representation_model -- enforces at most one embedding per (representation, model) pair
+ ON DOC.doc_embedding(representation_id, model_id);
+
+CREATE INDEX IF NOT EXISTS idx_doc_embedding_status_created -- presumably supports queue-style scans (oldest PENDING first) — confirm query shape
+ ON DOC.doc_embedding(embedding_status, created_at);
+
+CREATE INDEX IF NOT EXISTS idx_doc_embedding_status_updated
+ ON DOC.doc_embedding(embedding_status, updated_at);
+
+CREATE INDEX IF NOT EXISTS idx_doc_text_repr_document_primary -- NOTE(review): a partial index (WHERE is_primary) could be smaller — consider
+ ON DOC.doc_text_representation(document_id, is_primary);