Refactor phases 0-2
parent
21edbc35a2
commit
71fb43a5ea
@ -0,0 +1,48 @@
|
||||
# Phase 2 - Representation-based vectorization and dual-write compatibility
|
||||
|
||||
## Goal
|
||||
|
||||
Decouple vectorization from the TED document entity so arbitrary document types can use a shared
|
||||
representation-to-embedding pipeline.
|
||||
|
||||
## Primary changes
|
||||
|
||||
1. **Primary vectorization source**
|
||||
- before: `TED.procurement_document.text_content`
|
||||
- now: `DOC.doc_text_representation.text_body`
|
||||
|
||||
2. **Primary vectorization target**
|
||||
- before: `TED.procurement_document.content_vector`
|
||||
- now: `DOC.doc_embedding.embedding_vector`
|
||||
|
||||
3. **Compatibility during migration**
|
||||
- completed embeddings are optionally mirrored back to the legacy TED vector columns using the
|
||||
shared TED document hash (`document_hash` / `dedup_hash`)
|
||||
|
||||
4. **TED dual-write bridge**
|
||||
- fresh TED documents are projected into the generic `DOC` model immediately after persistence
|
||||
|
||||
## Key services introduced
|
||||
|
||||
- `TedPhase2GenericDocumentService`
|
||||
- creates/refreshes generic DOC records for TED notices
|
||||
- `DocumentEmbeddingProcessingService`
|
||||
- processes DOC embedding lifecycle records
|
||||
- `GenericVectorizationRoute`
|
||||
- scheduler + worker route for asynchronous DOC embedding generation
|
||||
- `ConfiguredEmbeddingModelStartupRunner`
|
||||
- ensures the configured embedding model exists in `DOC.doc_embedding_model`
|
||||
- `GenericVectorizationStartupRunner`
|
||||
- queues pending/failed DOC embeddings on startup
|
||||
|
||||
## Behavior when Phase 2 is enabled
|
||||
|
||||
- legacy `VectorizationRoute` is disabled
|
||||
- legacy startup queueing is disabled
|
||||
- legacy event-based vectorization queueing is disabled
|
||||
- generic scheduler and startup runner handle DOC embeddings instead
|
||||
|
||||
## Compatibility intent
|
||||
|
||||
This phase keeps the existing TED search endpoints working while the new generic indexing layer becomes
|
||||
operational. The next phase can migrate search reads from the TED table to `DOC.doc_embedding`.
|
||||
@ -0,0 +1,28 @@
|
||||
package at.procon.dip;
|
||||
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.boot.autoconfigure.domain.EntityScan;
|
||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||
import org.springframework.scheduling.annotation.EnableAsync;
|
||||
|
||||
/**
|
||||
* Procon Document Intelligence Platform (DIP).
|
||||
*
|
||||
* <p>Phase 0 introduces a generic platform root namespace and architecture contracts
|
||||
* while keeping the existing TED-specific runtime intact. Subsequent phases can move
|
||||
* modules incrementally from {@code at.procon.ted} into the broader document platform.</p>
|
||||
*/
|
||||
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
||||
@EnableAsync
|
||||
//@EnableConfigurationProperties(TedProcessorProperties.class)
|
||||
@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity"})
|
||||
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository"})
|
||||
public class DocumentIntelligencePlatformApplication {
|
||||
|
||||
public static void main(String[] args) {
|
||||
SpringApplication.run(DocumentIntelligencePlatformApplication.class, args);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,28 @@
|
||||
package at.procon.dip;
|
||||
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.boot.autoconfigure.domain.EntityScan;
|
||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||
import org.springframework.scheduling.annotation.EnableAsync;
|
||||
|
||||
/**
|
||||
* Procon Document Intelligence Platform (DIP).
|
||||
*
|
||||
* <p>Phase 0 introduces a generic platform root namespace and architecture contracts
|
||||
* while keeping the existing TED-specific runtime intact. Subsequent phases can move
|
||||
* modules incrementally from {@code at.procon.ted} into the broader document platform.</p>
|
||||
*/
|
||||
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
||||
@EnableAsync
|
||||
//@EnableConfigurationProperties(TedProcessorProperties.class)
|
||||
@EntityScan(basePackages = {"at.procon.ted.model.entity"})
|
||||
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository"})
|
||||
public class DocumentIntelligencePlatformApplication {
|
||||
|
||||
public static void main(String[] args) {
|
||||
SpringApplication.run(DocumentIntelligencePlatformApplication.class, args);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,18 @@
|
||||
# Phase 2 - Vectorization decoupling
|
||||
|
||||
Phase 2 moves the primary vectorization pipeline from `TED.procurement_document` to the generic `DOC`
|
||||
representation and embedding model introduced in Phase 1.
|
||||
|
||||
Implemented in this phase:
|
||||
- `DOC.doc_text_representation` is now the primary text source for embeddings
|
||||
- `DOC.doc_embedding` is the primary persistence target for embedding lifecycle and vectors
|
||||
- a generic Camel route processes pending/failed embeddings asynchronously
|
||||
- TED imports dual-write into the generic model by creating:
|
||||
- canonical `DOC.doc_document`
|
||||
- original `DOC.doc_content`
|
||||
- primary `DOC.doc_text_representation`
|
||||
- pending `DOC.doc_embedding`
|
||||
- compatibility mode keeps writing completed TED embeddings back into
|
||||
`TED.procurement_document.content_vector` so the legacy semantic search continues to work
|
||||
|
||||
This phase is intentionally additive and does not yet migrate TED semantic search reads away from the legacy table.
|
||||
@ -0,0 +1,45 @@
|
||||
package at.procon.dip.architecture;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Central architecture constants for the generalized platform.
 *
 * <p>Bundles the platform names, the generic and legacy root namespaces, the database schema
 * names and the list of package areas that belong to the generic platform. Phase 1 extended
 * the package map with the additive generic persistence backbone; the map also lists the
 * vectorization service, Camel and startup packages.</p>
 */
public final class PlatformArchitecture {

    // --- Platform identity -------------------------------------------------------------------
    public static final String PLATFORM_NAME = "Procon Document Intelligence Platform";
    public static final String PLATFORM_SHORT_NAME = "DIP";

    // --- Root namespaces ---------------------------------------------------------------------
    public static final String BASE_NAMESPACE = "at.procon.dip";
    public static final String LEGACY_NAMESPACE = "at.procon.ted";

    // --- Database schemas (same literal values as SchemaNames.DOC / SchemaNames.TED) ---------
    public static final String GENERIC_SCHEMA = "DOC";
    public static final String TED_SCHEMA = "TED";

    /** Immutable list of generic package areas; the declaration order is part of the value. */
    public static final List<String> GENERIC_PACKAGE_AREAS = List.of(
            // architecture and domain roots
            "at.procon.dip.architecture",
            "at.procon.dip.domain.access",
            "at.procon.dip.domain.document",
            "at.procon.dip.domain.tenant",
            // document persistence
            "at.procon.dip.domain.document.entity",
            "at.procon.dip.domain.document.repository",
            "at.procon.dip.domain.document.service",
            // tenant persistence
            "at.procon.dip.domain.tenant.entity",
            "at.procon.dip.domain.tenant.repository",
            "at.procon.dip.domain.tenant.service",
            // pipeline SPIs
            "at.procon.dip.ingestion.spi",
            "at.procon.dip.classification.spi",
            "at.procon.dip.extraction.spi",
            "at.procon.dip.normalization.spi",
            // vectorization
            "at.procon.dip.vectorization.spi",
            "at.procon.dip.vectorization.service",
            "at.procon.dip.vectorization.camel",
            "at.procon.dip.vectorization.startup",
            // search, processing, migration
            "at.procon.dip.search.spi",
            "at.procon.dip.processing.spi",
            "at.procon.dip.migration"
    );

    /** Constants holder; not meant to be instantiated. */
    private PlatformArchitecture() {
    }
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.architecture;
|
||||
|
||||
/**
 * Target database schema names for the generalized model.
 *
 * <p>The values are compile-time constants so they can be used inside annotations such as
 * {@code @Table(schema = SchemaNames.DOC, ...)}.</p>
 */
public final class SchemaNames {

    /** Schema of the generic document model. */
    public static final String DOC = "DOC";

    /** Schema of the TED-specific model. */
    public static final String TED = "TED";

    /** Constants holder; not meant to be instantiated. */
    private SchemaNames() {
    }
}
|
||||
@ -0,0 +1,17 @@
|
||||
package at.procon.dip.classification.spi;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Result of document type detection/classification.
|
||||
*/
|
||||
public record DetectionResult(
|
||||
DocumentType documentType,
|
||||
DocumentFamily documentFamily,
|
||||
String mimeType,
|
||||
String languageCode,
|
||||
Map<String, String> attributes
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.classification.spi;
|
||||
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
|
||||
/**
|
||||
* Determines a canonical type/family before extraction starts.
|
||||
*/
|
||||
public interface DocumentTypeDetector {
|
||||
|
||||
boolean supports(SourceDescriptor sourceDescriptor);
|
||||
|
||||
DetectionResult detect(SourceDescriptor sourceDescriptor);
|
||||
}
|
||||
@ -0,0 +1,31 @@
|
||||
package at.procon.dip.domain.access;
|
||||
|
||||
import at.procon.dip.domain.tenant.TenantRef;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Canonical ownership and visibility descriptor for a document.
|
||||
* <p>
|
||||
* A document may have no owner tenant, for example public TED notices.
|
||||
* Visibility is always mandatory and defines who may search/read the document.
|
||||
*/
|
||||
public record DocumentAccessContext(
|
||||
TenantRef ownerTenant,
|
||||
DocumentVisibility visibility
|
||||
) {
|
||||
|
||||
public DocumentAccessContext {
|
||||
Objects.requireNonNull(visibility, "visibility must not be null");
|
||||
}
|
||||
|
||||
public static DocumentAccessContext publicDocument() {
|
||||
return new DocumentAccessContext(null, DocumentVisibility.PUBLIC);
|
||||
}
|
||||
|
||||
public static DocumentAccessContext tenantOwned(TenantRef ownerTenant) {
|
||||
return new DocumentAccessContext(
|
||||
Objects.requireNonNull(ownerTenant, "ownerTenant must not be null"),
|
||||
DocumentVisibility.TENANT
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,11 @@
|
||||
package at.procon.dip.domain.access;
|
||||
|
||||
/**
 * Describes who may access a document, independently from who owns it.
 *
 * <p>The {@code Document} entity stores this enum by name in a column of length 32, so
 * renaming a constant changes the persisted value.</p>
 */
public enum DocumentVisibility {

    PUBLIC,
    TENANT,
    SHARED,
    RESTRICTED
}
|
||||
@ -0,0 +1,23 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Minimal canonical document descriptor used by Phase 0 SPI contracts.
|
||||
*/
|
||||
public record CanonicalDocumentMetadata(
|
||||
UUID documentId,
|
||||
DocumentAccessContext accessContext,
|
||||
DocumentType documentType,
|
||||
DocumentFamily documentFamily,
|
||||
DocumentStatus status,
|
||||
String title,
|
||||
String languageCode,
|
||||
String mimeType,
|
||||
String dedupHash,
|
||||
OffsetDateTime createdAt,
|
||||
OffsetDateTime updatedAt
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Role of a stored content version.
 *
 * <p>The {@code DocumentContent} entity stores this enum by name in a column of length 64,
 * so renaming a constant changes the persisted value.</p>
 */
public enum ContentRole {

    ORIGINAL,
    NORMALIZED_TEXT,
    OCR_TEXT,
    HTML_CLEAN,
    EXTRACTED_METADATA_JSON,
    THUMBNAIL,
    DERIVED_BINARY
}
|
||||
@ -0,0 +1,10 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Distance metric used by an embedding model.
 *
 * <p>The {@code DocumentEmbeddingModel} entity stores this enum by name in a column of
 * length 32 and defaults to {@link #COSINE}.</p>
 */
public enum DistanceMetric {

    COSINE,
    L2,
    INNER_PRODUCT
}
|
||||
@ -0,0 +1,12 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Functional grouping used for broad search and routing decisions.
 *
 * <p>The {@code Document} entity stores this enum by name in a column of length 64, so
 * renaming a constant changes the persisted value.</p>
 */
public enum DocumentFamily {

    PROCUREMENT,
    MAIL,
    ATTACHMENT,
    KNOWLEDGE,
    GENERIC
}
|
||||
@ -0,0 +1,14 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Generic lifecycle state for a canonical document.
 *
 * <p>The {@code Document} entity stores this enum by name in a column of length 32 and
 * starts new documents in {@link #RECEIVED}.</p>
 */
public enum DocumentStatus {

    // NOTE(review): the declaration order presumably mirrors the processing pipeline - confirm.
    RECEIVED,
    CLASSIFIED,
    EXTRACTED,
    REPRESENTED,
    INDEXED,
    FAILED,
    ARCHIVED
}
|
||||
@ -0,0 +1,19 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Canonical technical document type.
 *
 * <p>The {@code Document} entity stores this enum by name in a column of length 64, so
 * renaming a constant changes the persisted value.</p>
 */
public enum DocumentType {

    TED_NOTICE,
    EMAIL,
    MIME_MESSAGE,
    PDF,
    DOCX,
    HTML,
    XML_GENERIC,
    TEXT,
    MARKDOWN,
    ZIP_ARCHIVE,
    GENERIC_BINARY,
    UNKNOWN
}
|
||||
@ -0,0 +1,12 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Generic lifecycle state of an embedding record in the DOC schema.
 *
 * <p>The {@code DocumentEmbedding} entity stores this enum by name in a column of length 32
 * and starts new records in {@link #PENDING}.</p>
 */
public enum EmbeddingStatus {

    PENDING,
    PROCESSING,
    COMPLETED,
    FAILED,
    SKIPPED
}
|
||||
@ -0,0 +1,14 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Logical relationship between canonical documents.
 *
 * <p>The {@code DocumentRelation} entity stores this enum by name in a column of length 64,
 * so renaming a constant changes the persisted value.</p>
 */
public enum RelationType {

    CONTAINS,
    ATTACHMENT_OF,
    EXTRACTED_FROM,
    DERIVED_FROM,
    PART_OF,
    VERSION_OF,
    RELATED_TO
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Kind of search-oriented text representation; each representation can be embedded
 * independently of the others.
 */
public enum RepresentationType {

    FULLTEXT,
    SEMANTIC_TEXT,
    SUMMARY,
    TITLE_ABSTRACT,
    CHUNK,
    METADATA_ENRICHED
}
|
||||
@ -0,0 +1,15 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Provenance of an imported document, i.e. the channel through which it entered the platform.
 */
public enum SourceType {

    // NOTE(review): presumably persisted by name in DOC.doc_source.source_type - confirm
    // against the DocumentSource mapping before renaming constants.
    TED_PACKAGE,
    MAIL,
    FILE_SYSTEM,
    REST_UPLOAD,
    MANUAL_UPLOAD,
    ZIP_CHILD,
    API,
    MIGRATION
}
|
||||
@ -0,0 +1,12 @@
|
||||
package at.procon.dip.domain.document;
|
||||
|
||||
/**
 * Physical storage strategy for content.
 *
 * <p>The {@code DocumentContent} entity stores this enum by name in a column of length 64,
 * so renaming a constant changes the persisted value.</p>
 */
public enum StorageType {

    DB_TEXT,
    DB_BINARY,
    FILE_PATH,
    OBJECT_STORAGE,
    EXTERNAL_REFERENCE
}
|
||||
@ -0,0 +1,133 @@
|
||||
package at.procon.dip.domain.document.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentStatus;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.tenant.entity.DocumentTenant;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
import jakarta.persistence.Enumerated;
|
||||
import jakarta.persistence.FetchType;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.JoinColumn;
|
||||
import jakarta.persistence.ManyToOne;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.PreUpdate;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Canonical document root entity for the generalized DOC schema.
|
||||
*/
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_document", indexes = {
|
||||
@Index(name = "idx_doc_document_type", columnList = "document_type"),
|
||||
@Index(name = "idx_doc_document_family", columnList = "document_family"),
|
||||
@Index(name = "idx_doc_document_status", columnList = "status"),
|
||||
@Index(name = "idx_doc_document_visibility", columnList = "visibility"),
|
||||
@Index(name = "idx_doc_document_owner_tenant", columnList = "owner_tenant_id"),
|
||||
@Index(name = "idx_doc_document_dedup_hash", columnList = "dedup_hash"),
|
||||
@Index(name = "idx_doc_document_business_key", columnList = "business_key"),
|
||||
@Index(name = "idx_doc_document_created_at", columnList = "created_at")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class Document {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY)
|
||||
@JoinColumn(name = "owner_tenant_id")
|
||||
private DocumentTenant ownerTenant;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "visibility", nullable = false, length = 32)
|
||||
@Builder.Default
|
||||
private DocumentVisibility visibility = DocumentVisibility.PUBLIC;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "document_type", nullable = false, length = 64)
|
||||
private DocumentType documentType;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "document_family", nullable = false, length = 64)
|
||||
private DocumentFamily documentFamily;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "status", nullable = false, length = 32)
|
||||
@Builder.Default
|
||||
private DocumentStatus status = DocumentStatus.RECEIVED;
|
||||
|
||||
@Column(name = "title", length = 1000)
|
||||
private String title;
|
||||
|
||||
@Column(name = "summary", columnDefinition = "TEXT")
|
||||
private String summary;
|
||||
|
||||
@Column(name = "language_code", length = 16)
|
||||
private String languageCode;
|
||||
|
||||
@Column(name = "mime_type", length = 255)
|
||||
private String mimeType;
|
||||
|
||||
@Column(name = "business_key", length = 255)
|
||||
private String businessKey;
|
||||
|
||||
@Column(name = "dedup_hash", length = 64)
|
||||
private String dedupHash;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "updated_at", nullable = false)
|
||||
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
|
||||
@PreUpdate
|
||||
protected void onUpdate() {
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
|
||||
public CanonicalDocumentMetadata toCanonicalMetadata() {
|
||||
return new CanonicalDocumentMetadata(
|
||||
id,
|
||||
new DocumentAccessContext(ownerTenant == null ? null : new at.procon.dip.domain.tenant.TenantRef(
|
||||
ownerTenant.getId().toString(), ownerTenant.getTenantKey(), ownerTenant.getDisplayName()), visibility),
|
||||
documentType,
|
||||
documentFamily,
|
||||
status,
|
||||
title,
|
||||
languageCode,
|
||||
mimeType,
|
||||
dedupHash,
|
||||
createdAt,
|
||||
updatedAt
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,86 @@
|
||||
package at.procon.dip.domain.document.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import at.procon.dip.domain.document.StorageType;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
import jakarta.persistence.Enumerated;
|
||||
import jakarta.persistence.FetchType;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.JoinColumn;
|
||||
import jakarta.persistence.ManyToOne;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Stored payload variant for a canonical document.
|
||||
*/
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_content", indexes = {
|
||||
@Index(name = "idx_doc_content_document", columnList = "document_id"),
|
||||
@Index(name = "idx_doc_content_role", columnList = "content_role"),
|
||||
@Index(name = "idx_doc_content_hash", columnList = "content_hash"),
|
||||
@Index(name = "idx_doc_content_storage_type", columnList = "storage_type")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class DocumentContent {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "document_id", nullable = false)
|
||||
private Document document;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "content_role", nullable = false, length = 64)
|
||||
private ContentRole contentRole;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "storage_type", nullable = false, length = 64)
|
||||
private StorageType storageType;
|
||||
|
||||
@Column(name = "mime_type", length = 255)
|
||||
private String mimeType;
|
||||
|
||||
@Column(name = "charset_name", length = 120)
|
||||
private String charsetName;
|
||||
|
||||
@Column(name = "text_content", columnDefinition = "TEXT")
|
||||
private String textContent;
|
||||
|
||||
@Column(name = "binary_ref", columnDefinition = "TEXT")
|
||||
private String binaryRef;
|
||||
|
||||
@Column(name = "content_hash", length = 64)
|
||||
private String contentHash;
|
||||
|
||||
@Column(name = "size_bytes")
|
||||
private Long sizeBytes;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,103 @@
|
||||
package at.procon.dip.domain.document.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
import jakarta.persistence.Enumerated;
|
||||
import jakarta.persistence.FetchType;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.JoinColumn;
|
||||
import jakarta.persistence.ManyToOne;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.PreUpdate;
|
||||
import jakarta.persistence.Table;
|
||||
import jakarta.persistence.Transient;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Generic vectorization record separated from the canonical document structure.
|
||||
* <p>
|
||||
* The actual pgvector payload is persisted in the {@code embedding_vector} column via native SQL
|
||||
* in later phases. The transient field exists only as a convenient in-memory carrier.
|
||||
*/
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_embedding", indexes = {
|
||||
@Index(name = "idx_doc_embedding_document", columnList = "document_id"),
|
||||
@Index(name = "idx_doc_embedding_repr", columnList = "representation_id"),
|
||||
@Index(name = "idx_doc_embedding_model", columnList = "model_id"),
|
||||
@Index(name = "idx_doc_embedding_status", columnList = "embedding_status"),
|
||||
@Index(name = "idx_doc_embedding_embedded_at", columnList = "embedded_at")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class DocumentEmbedding {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "document_id", nullable = false)
|
||||
private Document document;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "representation_id", nullable = false)
|
||||
private DocumentTextRepresentation representation;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "model_id", nullable = false)
|
||||
private DocumentEmbeddingModel model;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "embedding_status", nullable = false, length = 32)
|
||||
@Builder.Default
|
||||
private EmbeddingStatus embeddingStatus = EmbeddingStatus.PENDING;
|
||||
|
||||
@Column(name = "token_count")
|
||||
private Integer tokenCount;
|
||||
|
||||
@Column(name = "embedding_dimensions")
|
||||
private Integer embeddingDimensions;
|
||||
|
||||
@Column(name = "error_message", columnDefinition = "TEXT")
|
||||
private String errorMessage;
|
||||
|
||||
@Column(name = "embedded_at")
|
||||
private OffsetDateTime embeddedAt;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "updated_at", nullable = false)
|
||||
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||
|
||||
@Transient
|
||||
private float[] embeddingVector;
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
|
||||
@PreUpdate
|
||||
protected void onUpdate() {
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,86 @@
|
||||
package at.procon.dip.domain.document.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.domain.document.DistanceMetric;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
import jakarta.persistence.Enumerated;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.PreUpdate;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Embedding model catalog row used by generic vectorization.
|
||||
*/
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_embedding_model", indexes = {
|
||||
@Index(name = "idx_doc_embedding_model_key", columnList = "model_key", unique = true),
|
||||
@Index(name = "idx_doc_embedding_model_active", columnList = "active")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class DocumentEmbeddingModel {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@Column(name = "model_key", nullable = false, unique = true, length = 255)
|
||||
private String modelKey;
|
||||
|
||||
@Column(name = "provider", nullable = false, length = 120)
|
||||
private String provider;
|
||||
|
||||
@Column(name = "display_name", length = 255)
|
||||
private String displayName;
|
||||
|
||||
@Column(name = "dimensions", nullable = false)
|
||||
private Integer dimensions;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "distance_metric", nullable = false, length = 32)
|
||||
@Builder.Default
|
||||
private DistanceMetric distanceMetric = DistanceMetric.COSINE;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "query_prefix_required", nullable = false)
|
||||
private boolean queryPrefixRequired = false;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "active", nullable = false)
|
||||
private boolean active = true;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "updated_at", nullable = false)
|
||||
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
|
||||
@PreUpdate
|
||||
protected void onUpdate() {
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,72 @@
|
||||
package at.procon.dip.domain.document.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.domain.document.RelationType;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
import jakarta.persistence.Enumerated;
|
||||
import jakarta.persistence.FetchType;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.JoinColumn;
|
||||
import jakarta.persistence.ManyToOne;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Directed relationship between two canonical documents.
|
||||
*/
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_relation", indexes = {
|
||||
@Index(name = "idx_doc_relation_parent", columnList = "parent_document_id"),
|
||||
@Index(name = "idx_doc_relation_child", columnList = "child_document_id"),
|
||||
@Index(name = "idx_doc_relation_type", columnList = "relation_type")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class DocumentRelation {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "parent_document_id", nullable = false)
|
||||
private Document parentDocument;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "child_document_id", nullable = false)
|
||||
private Document childDocument;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "relation_type", nullable = false, length = 64)
|
||||
private RelationType relationType;
|
||||
|
||||
@Column(name = "sort_order")
|
||||
private Integer sortOrder;
|
||||
|
||||
@Column(name = "relation_metadata", columnDefinition = "TEXT")
|
||||
private String relationMetadata;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,85 @@
|
||||
package at.procon.dip.domain.document.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
import jakarta.persistence.Enumerated;
|
||||
import jakarta.persistence.FetchType;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.JoinColumn;
|
||||
import jakarta.persistence.ManyToOne;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Provenance row for a canonical document.
|
||||
*/
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_source", indexes = {
|
||||
@Index(name = "idx_doc_source_document", columnList = "document_id"),
|
||||
@Index(name = "idx_doc_source_type", columnList = "source_type"),
|
||||
@Index(name = "idx_doc_source_external_id", columnList = "external_source_id"),
|
||||
@Index(name = "idx_doc_source_received_at", columnList = "received_at"),
|
||||
@Index(name = "idx_doc_source_parent_source", columnList = "parent_source_id")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class DocumentSource {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "document_id", nullable = false)
|
||||
private Document document;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "source_type", nullable = false, length = 64)
|
||||
private SourceType sourceType;
|
||||
|
||||
@Column(name = "external_source_id", length = 500)
|
||||
private String externalSourceId;
|
||||
|
||||
@Column(name = "source_uri", columnDefinition = "TEXT")
|
||||
private String sourceUri;
|
||||
|
||||
@Column(name = "source_filename", length = 1000)
|
||||
private String sourceFilename;
|
||||
|
||||
@Column(name = "parent_source_id")
|
||||
private UUID parentSourceId;
|
||||
|
||||
@Column(name = "import_batch_id", length = 255)
|
||||
private String importBatchId;
|
||||
|
||||
@Column(name = "received_at")
|
||||
private OffsetDateTime receivedAt;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
if (receivedAt == null) {
|
||||
receivedAt = createdAt;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,98 @@
|
||||
package at.procon.dip.domain.document.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
import jakarta.persistence.Enumerated;
|
||||
import jakarta.persistence.FetchType;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.JoinColumn;
|
||||
import jakarta.persistence.ManyToOne;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Search-oriented text derived from a canonical document.
|
||||
*/
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_text_representation", indexes = {
|
||||
@Index(name = "idx_doc_text_repr_document", columnList = "document_id"),
|
||||
@Index(name = "idx_doc_text_repr_content", columnList = "content_id"),
|
||||
@Index(name = "idx_doc_text_repr_type", columnList = "representation_type"),
|
||||
@Index(name = "idx_doc_text_repr_primary", columnList = "is_primary")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class DocumentTextRepresentation {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "document_id", nullable = false)
|
||||
private Document document;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY)
|
||||
@JoinColumn(name = "content_id")
|
||||
private DocumentContent content;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "representation_type", nullable = false, length = 64)
|
||||
private RepresentationType representationType;
|
||||
|
||||
@Column(name = "builder_key", length = 255)
|
||||
private String builderKey;
|
||||
|
||||
@Column(name = "language_code", length = 16)
|
||||
private String languageCode;
|
||||
|
||||
@Column(name = "token_count")
|
||||
private Integer tokenCount;
|
||||
|
||||
@Column(name = "char_count")
|
||||
private Integer charCount;
|
||||
|
||||
@Column(name = "chunk_index")
|
||||
private Integer chunkIndex;
|
||||
|
||||
@Column(name = "chunk_start_offset")
|
||||
private Integer chunkStartOffset;
|
||||
|
||||
@Column(name = "chunk_end_offset")
|
||||
private Integer chunkEndOffset;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "is_primary", nullable = false)
|
||||
private boolean primaryRepresentation = false;
|
||||
|
||||
@Column(name = "text_body", columnDefinition = "TEXT", nullable = false)
|
||||
private String textBody;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
if (charCount == null && textBody != null) {
|
||||
charCount = textBody.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package at.procon.dip.domain.document.repository;
|
||||
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface DocumentContentRepository extends JpaRepository<DocumentContent, UUID> {
|
||||
|
||||
List<DocumentContent> findByDocument_Id(UUID documentId);
|
||||
|
||||
List<DocumentContent> findByDocument_IdAndContentRole(UUID documentId, ContentRole contentRole);
|
||||
|
||||
Optional<DocumentContent> findByContentHash(String contentHash);
|
||||
}
|
||||
@ -0,0 +1,11 @@
|
||||
package at.procon.dip.domain.document.repository;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface DocumentEmbeddingModelRepository extends JpaRepository<DocumentEmbeddingModel, UUID> {
|
||||
|
||||
Optional<DocumentEmbeddingModel> findByModelKey(String modelKey);
|
||||
}
|
||||
@ -0,0 +1,55 @@
|
||||
package at.procon.dip.domain.document.repository;
|
||||
|
||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.domain.Pageable;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.data.jpa.repository.Modifying;
|
||||
import org.springframework.data.jpa.repository.Query;
|
||||
import org.springframework.data.repository.query.Param;
|
||||
|
||||
public interface DocumentEmbeddingRepository extends JpaRepository<DocumentEmbedding, UUID> {
|
||||
|
||||
List<DocumentEmbedding> findByDocument_Id(UUID documentId);
|
||||
|
||||
List<DocumentEmbedding> findByRepresentation_Id(UUID representationId);
|
||||
|
||||
List<DocumentEmbedding> findByEmbeddingStatus(EmbeddingStatus embeddingStatus);
|
||||
|
||||
Optional<DocumentEmbedding> findByRepresentation_IdAndModel_Id(UUID representationId, UUID modelId);
|
||||
|
||||
@Query("SELECT e.id FROM DocumentEmbedding e WHERE e.embeddingStatus = :status ORDER BY e.createdAt ASC")
|
||||
List<UUID> findIdsByEmbeddingStatus(@Param("status") EmbeddingStatus status, Pageable pageable);
|
||||
|
||||
@Query("SELECT e FROM DocumentEmbedding e " +
|
||||
"JOIN FETCH e.document d " +
|
||||
"JOIN FETCH e.representation r " +
|
||||
"JOIN FETCH e.model m " +
|
||||
"WHERE e.id = :embeddingId")
|
||||
Optional<DocumentEmbedding> findDetailedById(@Param("embeddingId") UUID embeddingId);
|
||||
|
||||
@Modifying
|
||||
@Query(value = "UPDATE doc.doc_embedding SET embedding_vector = CAST(:vectorData AS vector), " +
|
||||
"embedding_status = 'COMPLETED', embedded_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP, " +
|
||||
"error_message = NULL, token_count = :tokenCount, embedding_dimensions = :dimensions WHERE id = :id",
|
||||
nativeQuery = true)
|
||||
int updateEmbeddingVector(@Param("id") UUID id,
|
||||
@Param("vectorData") String vectorData,
|
||||
@Param("tokenCount") Integer tokenCount,
|
||||
@Param("dimensions") Integer dimensions);
|
||||
|
||||
@Modifying
|
||||
@Query("UPDATE DocumentEmbedding e SET e.embeddingStatus = :status, e.errorMessage = :errorMessage, " +
|
||||
"e.embeddedAt = :embeddedAt, e.updatedAt = CURRENT_TIMESTAMP WHERE e.id = :embeddingId")
|
||||
int updateEmbeddingStatus(@Param("embeddingId") UUID embeddingId,
|
||||
@Param("status") EmbeddingStatus status,
|
||||
@Param("errorMessage") String errorMessage,
|
||||
@Param("embeddedAt") OffsetDateTime embeddedAt);
|
||||
|
||||
@Query("SELECT e.embeddingStatus, COUNT(e) FROM DocumentEmbedding e GROUP BY e.embeddingStatus")
|
||||
List<Object[]> countByEmbeddingStatus();
|
||||
}
|
||||
@ -0,0 +1,16 @@
|
||||
package at.procon.dip.domain.document.repository;
|
||||
|
||||
import at.procon.dip.domain.document.RelationType;
|
||||
import at.procon.dip.domain.document.entity.DocumentRelation;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface DocumentRelationRepository extends JpaRepository<DocumentRelation, UUID> {
|
||||
|
||||
List<DocumentRelation> findByParentDocument_Id(UUID parentDocumentId);
|
||||
|
||||
List<DocumentRelation> findByChildDocument_Id(UUID childDocumentId);
|
||||
|
||||
List<DocumentRelation> findByParentDocument_IdAndRelationType(UUID parentDocumentId, RelationType relationType);
|
||||
}
|
||||
@ -0,0 +1,31 @@
|
||||
package at.procon.dip.domain.document.repository;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentStatus;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface DocumentRepository extends JpaRepository<Document, UUID> {
|
||||
|
||||
Optional<Document> findByDedupHash(String dedupHash);
|
||||
|
||||
boolean existsByDedupHash(String dedupHash);
|
||||
|
||||
List<Document> findByDocumentType(DocumentType documentType);
|
||||
|
||||
List<Document> findByDocumentFamily(DocumentFamily documentFamily);
|
||||
|
||||
List<Document> findByStatus(DocumentStatus status);
|
||||
|
||||
List<Document> findByVisibility(DocumentVisibility visibility);
|
||||
|
||||
List<Document> findByOwnerTenant_TenantKey(String tenantKey);
|
||||
|
||||
List<Document> findByOwnerTenant_TenantKeyIn(Collection<String> tenantKeys);
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package at.procon.dip.domain.document.repository;
|
||||
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import at.procon.dip.domain.document.entity.DocumentSource;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface DocumentSourceRepository extends JpaRepository<DocumentSource, UUID> {
|
||||
|
||||
List<DocumentSource> findByDocument_Id(UUID documentId);
|
||||
|
||||
List<DocumentSource> findBySourceType(SourceType sourceType);
|
||||
|
||||
Optional<DocumentSource> findByExternalSourceId(String externalSourceId);
|
||||
}
|
||||
@ -0,0 +1,19 @@
|
||||
package at.procon.dip.domain.document.repository;
|
||||
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface DocumentTextRepresentationRepository extends JpaRepository<DocumentTextRepresentation, UUID> {
|
||||
|
||||
List<DocumentTextRepresentation> findByDocument_Id(UUID documentId);
|
||||
|
||||
List<DocumentTextRepresentation> findByDocument_IdAndRepresentationType(UUID documentId, RepresentationType representationType);
|
||||
|
||||
List<DocumentTextRepresentation> findByPrimaryRepresentationTrue();
|
||||
|
||||
Optional<DocumentTextRepresentation> findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
|
||||
}
|
||||
@ -0,0 +1,45 @@
|
||||
package at.procon.dip.domain.document.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||
import at.procon.dip.domain.document.repository.DocumentContentRepository;
|
||||
import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Transactional
|
||||
public class DocumentContentService {
|
||||
|
||||
private final DocumentService documentService;
|
||||
private final DocumentContentRepository contentRepository;
|
||||
|
||||
public DocumentContent addContent(AddDocumentContentCommand command) {
|
||||
DocumentContent content = DocumentContent.builder()
|
||||
.document(documentService.getRequired(command.documentId()))
|
||||
.contentRole(command.contentRole())
|
||||
.storageType(command.storageType())
|
||||
.mimeType(command.mimeType())
|
||||
.charsetName(command.charsetName())
|
||||
.textContent(command.textContent())
|
||||
.binaryRef(command.binaryRef())
|
||||
.contentHash(command.contentHash())
|
||||
.sizeBytes(command.sizeBytes())
|
||||
.build();
|
||||
return contentRepository.save(content);
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public DocumentContent getRequired(UUID contentId) {
|
||||
return contentRepository.findById(contentId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown content id: " + contentId));
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public List<DocumentContent> findByDocument(UUID documentId) {
|
||||
return contentRepository.findByDocument_Id(documentId);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,125 @@
|
||||
package at.procon.dip.domain.document.service;
|
||||
|
||||
import at.procon.dip.domain.document.DistanceMetric;
|
||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||
import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Transactional
|
||||
public class DocumentEmbeddingService {
|
||||
|
||||
private final DocumentService documentService;
|
||||
private final DocumentRepresentationService representationService;
|
||||
private final DocumentEmbeddingRepository embeddingRepository;
|
||||
private final DocumentEmbeddingModelRepository modelRepository;
|
||||
|
||||
public DocumentEmbeddingModel registerModel(RegisterEmbeddingModelCommand command) {
|
||||
DocumentEmbeddingModel model = modelRepository.findByModelKey(command.modelKey())
|
||||
.orElseGet(DocumentEmbeddingModel::new);
|
||||
model.setModelKey(command.modelKey());
|
||||
model.setProvider(command.provider());
|
||||
model.setDisplayName(command.displayName());
|
||||
model.setDimensions(command.dimensions());
|
||||
model.setDistanceMetric(command.distanceMetric() == null ? DistanceMetric.COSINE : command.distanceMetric());
|
||||
model.setQueryPrefixRequired(command.queryPrefixRequired());
|
||||
model.setActive(command.active());
|
||||
return modelRepository.save(model);
|
||||
}
|
||||
|
||||
public DocumentEmbedding createPendingEmbedding(UUID documentId, UUID representationId, UUID modelId) {
|
||||
DocumentEmbeddingModel model = getRequiredModel(modelId);
|
||||
DocumentEmbedding embedding = DocumentEmbedding.builder()
|
||||
.document(documentService.getRequired(documentId))
|
||||
.representation(representationService.getRequired(representationId))
|
||||
.model(model)
|
||||
.embeddingDimensions(model.getDimensions())
|
||||
.embeddingStatus(EmbeddingStatus.PENDING)
|
||||
.build();
|
||||
return embeddingRepository.save(embedding);
|
||||
}
|
||||
|
||||
public DocumentEmbedding ensurePendingEmbedding(UUID documentId, UUID representationId, UUID modelId) {
|
||||
Optional<DocumentEmbedding> existing = embeddingRepository.findByRepresentation_IdAndModel_Id(representationId, modelId);
|
||||
if (existing.isPresent()) {
|
||||
DocumentEmbedding embedding = existing.get();
|
||||
embedding.setDocument(documentService.getRequired(documentId));
|
||||
embedding.setRepresentation(representationService.getRequired(representationId));
|
||||
embedding.setModel(getRequiredModel(modelId));
|
||||
embedding.setEmbeddingDimensions(embedding.getModel().getDimensions());
|
||||
embedding.setEmbeddingStatus(EmbeddingStatus.PENDING);
|
||||
embedding.setErrorMessage(null);
|
||||
embedding.setEmbeddedAt(null);
|
||||
return embeddingRepository.save(embedding);
|
||||
}
|
||||
return createPendingEmbedding(documentId, representationId, modelId);
|
||||
}
|
||||
|
||||
public DocumentEmbedding markCompleted(UUID embeddingId, Integer tokenCount) {
|
||||
DocumentEmbedding embedding = getRequired(embeddingId);
|
||||
embedding.setEmbeddingStatus(EmbeddingStatus.COMPLETED);
|
||||
embedding.setTokenCount(tokenCount);
|
||||
embedding.setEmbeddedAt(OffsetDateTime.now());
|
||||
embedding.setErrorMessage(null);
|
||||
return embeddingRepository.save(embedding);
|
||||
}
|
||||
|
||||
public DocumentEmbedding markFailed(UUID embeddingId, String errorMessage) {
|
||||
DocumentEmbedding embedding = getRequired(embeddingId);
|
||||
embedding.setEmbeddingStatus(EmbeddingStatus.FAILED);
|
||||
embedding.setErrorMessage(errorMessage);
|
||||
embedding.setEmbeddedAt(null);
|
||||
return embeddingRepository.save(embedding);
|
||||
}
|
||||
|
||||
public DocumentEmbedding markProcessing(UUID embeddingId) {
|
||||
DocumentEmbedding embedding = getRequired(embeddingId);
|
||||
embedding.setEmbeddingStatus(EmbeddingStatus.PROCESSING);
|
||||
embedding.setErrorMessage(null);
|
||||
return embeddingRepository.save(embedding);
|
||||
}
|
||||
|
||||
public DocumentEmbedding markSkipped(UUID embeddingId, String reason) {
|
||||
DocumentEmbedding embedding = getRequired(embeddingId);
|
||||
embedding.setEmbeddingStatus(EmbeddingStatus.SKIPPED);
|
||||
embedding.setErrorMessage(reason);
|
||||
embedding.setEmbeddedAt(OffsetDateTime.now());
|
||||
return embeddingRepository.save(embedding);
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public DocumentEmbedding getRequired(UUID embeddingId) {
|
||||
return embeddingRepository.findById(embeddingId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public DocumentEmbeddingModel getRequiredModel(UUID modelId) {
|
||||
return modelRepository.findById(modelId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding model id: " + modelId));
|
||||
}
|
||||
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public DocumentEmbeddingModel findActiveModelByKey(String modelKey) {
|
||||
return modelRepository.findByModelKey(modelKey)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding model key: " + modelKey));
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public List<DocumentEmbedding> findPendingEmbeddings() {
|
||||
return embeddingRepository.findByEmbeddingStatus(EmbeddingStatus.PENDING);
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,35 @@
|
||||
package at.procon.dip.domain.document.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentRelation;
|
||||
import at.procon.dip.domain.document.repository.DocumentRelationRepository;
|
||||
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Transactional
|
||||
public class DocumentRelationService {
|
||||
|
||||
private final DocumentService documentService;
|
||||
private final DocumentRelationRepository relationRepository;
|
||||
|
||||
public DocumentRelation createRelation(CreateDocumentRelationCommand command) {
|
||||
DocumentRelation relation = DocumentRelation.builder()
|
||||
.parentDocument(documentService.getRequired(command.parentDocumentId()))
|
||||
.childDocument(documentService.getRequired(command.childDocumentId()))
|
||||
.relationType(command.relationType())
|
||||
.sortOrder(command.sortOrder())
|
||||
.relationMetadata(command.relationMetadata())
|
||||
.build();
|
||||
return relationRepository.save(relation);
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public List<DocumentRelation> findChildren(UUID parentDocumentId) {
|
||||
return relationRepository.findByParentDocument_Id(parentDocumentId);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,50 @@
|
||||
package at.procon.dip.domain.document.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Transactional
|
||||
public class DocumentRepresentationService {
|
||||
|
||||
private final DocumentService documentService;
|
||||
private final DocumentContentService contentService;
|
||||
private final DocumentTextRepresentationRepository representationRepository;
|
||||
|
||||
public DocumentTextRepresentation addRepresentation(AddDocumentTextRepresentationCommand command) {
|
||||
DocumentContent content = command.contentId() == null ? null : contentService.getRequired(command.contentId());
|
||||
DocumentTextRepresentation representation = DocumentTextRepresentation.builder()
|
||||
.document(documentService.getRequired(command.documentId()))
|
||||
.content(content)
|
||||
.representationType(command.representationType())
|
||||
.builderKey(command.builderKey())
|
||||
.languageCode(command.languageCode())
|
||||
.tokenCount(command.tokenCount())
|
||||
.chunkIndex(command.chunkIndex())
|
||||
.chunkStartOffset(command.chunkStartOffset())
|
||||
.chunkEndOffset(command.chunkEndOffset())
|
||||
.primaryRepresentation(command.primaryRepresentation())
|
||||
.textBody(command.textBody())
|
||||
.build();
|
||||
return representationRepository.save(representation);
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public DocumentTextRepresentation getRequired(UUID representationId) {
|
||||
return representationRepository.findById(representationId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown representation id: " + representationId));
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public List<DocumentTextRepresentation> findByDocument(UUID documentId) {
|
||||
return representationRepository.findByDocument_Id(documentId);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,75 @@
|
||||
package at.procon.dip.domain.document.service;
|
||||
|
||||
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
|
||||
import at.procon.dip.domain.document.DocumentStatus;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
|
||||
import at.procon.dip.domain.tenant.entity.DocumentTenant;
|
||||
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Transactional
|
||||
public class DocumentService {
|
||||
|
||||
private final DocumentRepository documentRepository;
|
||||
private final DocumentTenantRepository tenantRepository;
|
||||
|
||||
public Document create(CreateDocumentCommand command) {
|
||||
DocumentTenant ownerTenant = resolveOwnerTenant(command.ownerTenantKey());
|
||||
Document document = Document.builder()
|
||||
.ownerTenant(ownerTenant)
|
||||
.visibility(command.visibility())
|
||||
.documentType(command.documentType())
|
||||
.documentFamily(command.documentFamily())
|
||||
.status(command.status() == null ? DocumentStatus.RECEIVED : command.status())
|
||||
.title(command.title())
|
||||
.summary(command.summary())
|
||||
.languageCode(command.languageCode())
|
||||
.mimeType(command.mimeType())
|
||||
.businessKey(command.businessKey())
|
||||
.dedupHash(command.dedupHash())
|
||||
.build();
|
||||
return documentRepository.save(document);
|
||||
}
|
||||
|
||||
public Document save(Document document) {
|
||||
return documentRepository.save(document);
|
||||
}
|
||||
|
||||
public Document updateStatus(UUID documentId, DocumentStatus status) {
|
||||
Document document = getRequired(documentId);
|
||||
document.setStatus(status);
|
||||
return documentRepository.save(document);
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public Document getRequired(UUID documentId) {
|
||||
return documentRepository.findById(documentId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown document id: " + documentId));
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public List<Document> findAll() {
|
||||
return documentRepository.findAll();
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public CanonicalDocumentMetadata getMetadata(UUID documentId) {
|
||||
return getRequired(documentId).toCanonicalMetadata();
|
||||
}
|
||||
|
||||
private DocumentTenant resolveOwnerTenant(String ownerTenantKey) {
|
||||
if (ownerTenantKey == null || ownerTenantKey.isBlank()) {
|
||||
return null;
|
||||
}
|
||||
return tenantRepository.findByTenantKey(ownerTenantKey)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown tenant key: " + ownerTenantKey));
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,38 @@
|
||||
package at.procon.dip.domain.document.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentSource;
|
||||
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
||||
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Transactional
|
||||
public class DocumentSourceService {
|
||||
|
||||
private final DocumentService documentService;
|
||||
private final DocumentSourceRepository sourceRepository;
|
||||
|
||||
public DocumentSource addSource(AddDocumentSourceCommand command) {
|
||||
DocumentSource source = DocumentSource.builder()
|
||||
.document(documentService.getRequired(command.documentId()))
|
||||
.sourceType(command.sourceType())
|
||||
.externalSourceId(command.externalSourceId())
|
||||
.sourceUri(command.sourceUri())
|
||||
.sourceFilename(command.sourceFilename())
|
||||
.parentSourceId(command.parentSourceId())
|
||||
.importBatchId(command.importBatchId())
|
||||
.receivedAt(command.receivedAt())
|
||||
.build();
|
||||
return sourceRepository.save(source);
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public List<DocumentSource> findByDocument(UUID documentId) {
|
||||
return sourceRepository.findByDocument_Id(documentId);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,18 @@
|
||||
package at.procon.dip.domain.document.service.command;
|
||||
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import at.procon.dip.domain.document.StorageType;
|
||||
import java.util.UUID;
|
||||
|
||||
public record AddDocumentContentCommand(
|
||||
UUID documentId,
|
||||
ContentRole contentRole,
|
||||
StorageType storageType,
|
||||
String mimeType,
|
||||
String charsetName,
|
||||
String textContent,
|
||||
String binaryRef,
|
||||
String contentHash,
|
||||
Long sizeBytes
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package at.procon.dip.domain.document.service.command;
|
||||
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
|
||||
public record AddDocumentSourceCommand(
|
||||
UUID documentId,
|
||||
SourceType sourceType,
|
||||
String externalSourceId,
|
||||
String sourceUri,
|
||||
String sourceFilename,
|
||||
UUID parentSourceId,
|
||||
String importBatchId,
|
||||
OffsetDateTime receivedAt
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,19 @@
|
||||
package at.procon.dip.domain.document.service.command;
|
||||
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.util.UUID;
|
||||
|
||||
public record AddDocumentTextRepresentationCommand(
|
||||
UUID documentId,
|
||||
UUID contentId,
|
||||
RepresentationType representationType,
|
||||
String builderKey,
|
||||
String languageCode,
|
||||
Integer tokenCount,
|
||||
Integer chunkIndex,
|
||||
Integer chunkStartOffset,
|
||||
Integer chunkEndOffset,
|
||||
boolean primaryRepresentation,
|
||||
String textBody
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
package at.procon.dip.domain.document.service.command;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentStatus;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
|
||||
/**
|
||||
* Minimal Phase 1 command for creating the canonical document root.
|
||||
*/
|
||||
public record CreateDocumentCommand(
|
||||
String ownerTenantKey,
|
||||
DocumentVisibility visibility,
|
||||
DocumentType documentType,
|
||||
DocumentFamily documentFamily,
|
||||
DocumentStatus status,
|
||||
String title,
|
||||
String summary,
|
||||
String languageCode,
|
||||
String mimeType,
|
||||
String businessKey,
|
||||
String dedupHash
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.domain.document.service.command;
|
||||
|
||||
import at.procon.dip.domain.document.RelationType;
|
||||
import java.util.UUID;
|
||||
|
||||
public record CreateDocumentRelationCommand(
|
||||
UUID parentDocumentId,
|
||||
UUID childDocumentId,
|
||||
RelationType relationType,
|
||||
Integer sortOrder,
|
||||
String relationMetadata
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
package at.procon.dip.domain.document.service.command;
|
||||
|
||||
import at.procon.dip.domain.document.DistanceMetric;
|
||||
|
||||
public record RegisterEmbeddingModelCommand(
|
||||
String modelKey,
|
||||
String provider,
|
||||
String displayName,
|
||||
Integer dimensions,
|
||||
DistanceMetric distanceMetric,
|
||||
boolean queryPrefixRequired,
|
||||
boolean active
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,11 @@
|
||||
package at.procon.dip.domain.tenant;
|
||||
|
||||
/**
 * Canonical reference to a tenant, used to express document ownership.
 *
 * @param tenantId    tenant id
 * @param tenantKey   tenant key
 * @param displayName display name of the tenant
 */
public record TenantRef(
        String tenantId,
        String tenantKey,
        String displayName
) {
}
|
||||
@ -0,0 +1,71 @@
|
||||
package at.procon.dip.domain.tenant.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.PreUpdate;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Canonical owner tenant catalog for the generalized DOC schema.
|
||||
*/
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_tenant", indexes = {
|
||||
@Index(name = "idx_doc_tenant_key", columnList = "tenant_key", unique = true),
|
||||
@Index(name = "idx_doc_tenant_active", columnList = "active")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class DocumentTenant {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@Column(name = "tenant_key", nullable = false, unique = true, length = 120)
|
||||
private String tenantKey;
|
||||
|
||||
@Column(name = "display_name", nullable = false, length = 255)
|
||||
private String displayName;
|
||||
|
||||
@Column(name = "description", columnDefinition = "TEXT")
|
||||
private String description;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "active", nullable = false)
|
||||
private boolean active = true;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "updated_at", nullable = false)
|
||||
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
|
||||
@PreUpdate
|
||||
protected void onUpdate() {
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.domain.tenant.repository;
|
||||
|
||||
import at.procon.dip.domain.tenant.entity.DocumentTenant;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface DocumentTenantRepository extends JpaRepository<DocumentTenant, UUID> {
|
||||
|
||||
Optional<DocumentTenant> findByTenantKey(String tenantKey);
|
||||
|
||||
boolean existsByTenantKey(String tenantKey);
|
||||
}
|
||||
@ -0,0 +1,45 @@
|
||||
package at.procon.dip.domain.tenant.service;
|
||||
|
||||
import at.procon.dip.domain.tenant.entity.DocumentTenant;
|
||||
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
|
||||
import at.procon.dip.domain.tenant.service.command.CreateTenantCommand;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Transactional
|
||||
public class DocumentTenantService {
|
||||
|
||||
private final DocumentTenantRepository tenantRepository;
|
||||
|
||||
public DocumentTenant createOrUpdate(CreateTenantCommand command) {
|
||||
DocumentTenant tenant = tenantRepository.findByTenantKey(command.tenantKey())
|
||||
.orElseGet(DocumentTenant::new);
|
||||
tenant.setTenantKey(command.tenantKey());
|
||||
tenant.setDisplayName(command.displayName());
|
||||
tenant.setDescription(command.description());
|
||||
tenant.setActive(command.active());
|
||||
return tenantRepository.save(tenant);
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public DocumentTenant getRequiredById(UUID id) {
|
||||
return tenantRepository.findById(id)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown tenant id: " + id));
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public DocumentTenant getRequiredByTenantKey(String tenantKey) {
|
||||
return tenantRepository.findByTenantKey(tenantKey)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown tenant key: " + tenantKey));
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public List<DocumentTenant> findAll() {
|
||||
return tenantRepository.findAll();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,9 @@
|
||||
package at.procon.dip.domain.tenant.service.command;
|
||||
|
||||
/**
 * Immutable command consumed by {@code DocumentTenantService.createOrUpdate}.
 * <p>
 * {@code tenantKey} is the lookup key for an existing tenant; all four values are
 * written onto the tenant entity.
 *
 * @param tenantKey   tenant key
 * @param displayName display name to store
 * @param description description to store
 * @param active      active flag to store
 */
public record CreateTenantCommand(
        String tenantKey,
        String displayName,
        String description,
        boolean active
) {
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.extraction.spi;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
|
||||
/**
|
||||
* Type-specific extraction contract.
|
||||
*/
|
||||
public interface DocumentExtractor {
|
||||
|
||||
boolean supports(DocumentType documentType, String mimeType);
|
||||
|
||||
ExtractionResult extract(ExtractionRequest extractionRequest);
|
||||
}
|
||||
@ -0,0 +1,12 @@
|
||||
package at.procon.dip.extraction.spi;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Structured, type-specific payload produced by an extractor.
 * <p>
 * The attribute map is stored as passed in; no defensive copy is made.
 *
 * @param projectionName name of the projection
 * @param attributes     attribute values keyed by name
 */
public record ExtractedStructuredPayload(
        String projectionName,
        Map<String, Object> attributes
) {
}
|
||||
@ -0,0 +1,15 @@
|
||||
package at.procon.dip.extraction.spi;
|
||||
|
||||
import at.procon.dip.classification.spi.DetectionResult;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
|
||||
/**
|
||||
* Input to a document extractor.
|
||||
*/
|
||||
public record ExtractionRequest(
|
||||
SourceDescriptor sourceDescriptor,
|
||||
DetectionResult detectionResult,
|
||||
String textContent,
|
||||
byte[] binaryContent
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,15 @@
|
||||
package at.procon.dip.extraction.spi;
|
||||
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Output of a document extractor before normalization and persistence.
|
||||
*/
|
||||
public record ExtractionResult(
|
||||
Map<ContentRole, String> derivedTextByRole,
|
||||
List<ExtractedStructuredPayload> structuredPayloads,
|
||||
List<String> warnings
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,11 @@
|
||||
package at.procon.dip.ingestion.spi;
|
||||
|
||||
/**
|
||||
* Extension point for source-specific import adapters.
|
||||
*/
|
||||
public interface DocumentIngestionAdapter {
|
||||
|
||||
boolean supports(SourceDescriptor sourceDescriptor);
|
||||
|
||||
IngestionResult ingest(SourceDescriptor sourceDescriptor);
|
||||
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.ingestion.spi;
|
||||
|
||||
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Result of an ingestion adapter execution.
|
||||
*/
|
||||
public record IngestionResult(
|
||||
List<CanonicalDocumentMetadata> documents,
|
||||
List<String> warnings
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,19 @@
|
||||
package at.procon.dip.ingestion.spi;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Describes a source object that should be ingested into the canonical document model.
|
||||
*/
|
||||
public record SourceDescriptor(
|
||||
DocumentAccessContext accessContext,
|
||||
SourceType sourceType,
|
||||
String sourceIdentifier,
|
||||
String sourceUri,
|
||||
String fileName,
|
||||
String mediaType,
|
||||
Map<String, String> attributes
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,12 @@
|
||||
package at.procon.dip.migration;
|
||||
|
||||
/**
 * Phase 0 decision on how the generalized model is introduced incrementally.
 * <p>
 * Five modes are declared; their order is part of the enum's observable behaviour
 * ({@code ordinal()}, {@code values()}) and must not be changed casually.
 */
public enum MigrationStrategyMode {
    ADDITIVE_SCHEMA,
    DUAL_WRITE,
    BACKFILL,
    CUTOVER,
    RETIRE_LEGACY
}
|
||||
@ -0,0 +1,15 @@
|
||||
package at.procon.dip.normalization.spi;
|
||||
|
||||
import at.procon.dip.classification.spi.DetectionResult;
|
||||
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
|
||||
/**
|
||||
* Input for text-representation builders.
|
||||
*/
|
||||
public record RepresentationBuildRequest(
|
||||
SourceDescriptor sourceDescriptor,
|
||||
DetectionResult detectionResult,
|
||||
ExtractionResult extractionResult
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
package at.procon.dip.normalization.spi;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Builds search-oriented text representations independently from raw extraction.
|
||||
*/
|
||||
public interface TextRepresentationBuilder {
|
||||
|
||||
boolean supports(DocumentType documentType);
|
||||
|
||||
List<TextRepresentationDraft> build(RepresentationBuildRequest request);
|
||||
}
|
||||
@ -0,0 +1,15 @@
|
||||
package at.procon.dip.normalization.spi;
|
||||
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
|
||||
/**
|
||||
* Candidate text representation for semantic indexing.
|
||||
*/
|
||||
public record TextRepresentationDraft(
|
||||
RepresentationType representationType,
|
||||
String languageCode,
|
||||
String textBody,
|
||||
boolean primary,
|
||||
Integer chunkIndex
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
package at.procon.dip.processing.spi;
|
||||
|
||||
/**
 * Cross-cutting processing stages used by generic document orchestration.
 * <p>
 * Seven stages are declared; their order is part of the enum's observable behaviour
 * ({@code ordinal()}, {@code values()}).
 */
public enum ProcessingStage {
    INGESTION,
    CLASSIFICATION,
    EXTRACTION,
    NORMALIZATION,
    VECTORIZATION,
    INDEXING,
    SEARCH
}
|
||||
@ -0,0 +1,18 @@
|
||||
package at.procon.dip.search.spi;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Minimal generic search scope for future hybrid/semantic search services.
|
||||
*/
|
||||
public record SearchDocumentScope(
|
||||
Set<String> ownerTenantKeys,
|
||||
Set<DocumentType> documentTypes,
|
||||
Set<DocumentFamily> documentFamilies,
|
||||
Set<DocumentVisibility> visibilities,
|
||||
String languageCode
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,211 @@
|
||||
package at.procon.dip.vectorization.camel;
|
||||
|
||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||
import at.procon.dip.vectorization.service.DocumentEmbeddingProcessingService;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.camel.Exchange;
|
||||
import org.apache.camel.LoggingLevel;
|
||||
import org.apache.camel.builder.RouteBuilder;
|
||||
import org.apache.camel.model.dataformat.JsonLibrary;
|
||||
import org.springframework.data.domain.PageRequest;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Phase 2 generic vectorization route.
|
||||
* Uses DOC.doc_text_representation as the source text and DOC.doc_embedding as the write target.
|
||||
*/
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class GenericVectorizationRoute extends RouteBuilder {
|
||||
|
||||
private static final String ROUTE_ID_TRIGGER = "generic-vectorization-trigger";
|
||||
private static final String ROUTE_ID_PROCESSOR = "generic-vectorization-processor";
|
||||
private static final String ROUTE_ID_SCHEDULER = "generic-vectorization-scheduler";
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final DocumentEmbeddingRepository embeddingRepository;
|
||||
private final DocumentEmbeddingProcessingService processingService;
|
||||
|
||||
private java.util.concurrent.ExecutorService executorService() {
|
||||
return java.util.concurrent.Executors.newFixedThreadPool(
|
||||
1,
|
||||
r -> {
|
||||
Thread thread = new Thread(r);
|
||||
thread.setName("doc-vectorization-" + thread.getId());
|
||||
thread.setDaemon(true);
|
||||
thread.setPriority(Thread.MAX_PRIORITY);
|
||||
return thread;
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void configure() {
|
||||
if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
|
||||
log.info("Phase 2 generic vectorization route disabled");
|
||||
return;
|
||||
}
|
||||
|
||||
log.info("Configuring generic vectorization routes (phase2=true, apiUrl={}, scheduler={}ms)",
|
||||
properties.getVectorization().getApiUrl(),
|
||||
properties.getVectorization().getGenericSchedulerPeriodMs());
|
||||
|
||||
onException(Exception.class)
|
||||
.handled(true)
|
||||
.process(exchange -> {
|
||||
UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
|
||||
Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class);
|
||||
String error = exception != null ? exception.getMessage() : "Unknown vectorization error";
|
||||
if (embeddingId != null) {
|
||||
try {
|
||||
processingService.markAsFailed(embeddingId, error);
|
||||
} catch (Exception nested) {
|
||||
log.warn("Failed to mark embedding {} as failed: {}", embeddingId, nested.getMessage());
|
||||
}
|
||||
}
|
||||
})
|
||||
.to("log:generic-vectorization-error?level=WARN");
|
||||
|
||||
from("direct:vectorize-embedding")
|
||||
.routeId(ROUTE_ID_TRIGGER)
|
||||
.doTry()
|
||||
.to("seda:vectorize-embedding-async?waitForTaskToComplete=Never&size=1000&blockWhenFull=true&timeout=5000")
|
||||
.doCatch(Exception.class)
|
||||
.log(LoggingLevel.WARN, "Failed to queue embedding ${header.embeddingId}: ${exception.message}")
|
||||
.end();
|
||||
|
||||
from("seda:vectorize-embedding-async?size=1000")
|
||||
.routeId(ROUTE_ID_PROCESSOR)
|
||||
.threads().executorService(executorService())
|
||||
.process(exchange -> {
|
||||
UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
|
||||
DocumentEmbeddingProcessingService.EmbeddingPayload payload =
|
||||
processingService.prepareEmbeddingForVectorization(embeddingId);
|
||||
if (payload == null) {
|
||||
exchange.setProperty("skipVectorization", true);
|
||||
return;
|
||||
}
|
||||
|
||||
EmbedRequest request = new EmbedRequest();
|
||||
request.text = payload.textContent();
|
||||
request.isQuery = false;
|
||||
|
||||
exchange.getIn().setHeader("embeddingId", payload.embeddingId());
|
||||
exchange.getIn().setHeader("documentId", payload.documentId());
|
||||
exchange.getIn().setHeader(Exchange.HTTP_METHOD, "POST");
|
||||
exchange.getIn().setHeader(Exchange.CONTENT_TYPE, "application/json");
|
||||
exchange.getIn().setBody(request);
|
||||
})
|
||||
.choice()
|
||||
.when(exchangeProperty("skipVectorization").isEqualTo(true))
|
||||
.log(LoggingLevel.DEBUG, "Skipping generic vectorization for ${header.embeddingId}")
|
||||
.otherwise()
|
||||
.marshal().json(JsonLibrary.Jackson)
|
||||
.setProperty("retryCount", constant(0))
|
||||
.setProperty("maxRetries", constant(properties.getVectorization().getMaxRetries()))
|
||||
.setProperty("vectorizationSuccess", constant(false))
|
||||
.loopDoWhile(simple("${exchangeProperty.vectorizationSuccess} == false && ${exchangeProperty.retryCount} < ${exchangeProperty.maxRetries}"))
|
||||
.process(exchange -> {
|
||||
Integer retryCount = exchange.getProperty("retryCount", Integer.class);
|
||||
exchange.setProperty("retryCount", retryCount + 1);
|
||||
if (retryCount > 0) {
|
||||
long backoffMs = (long) Math.pow(2, retryCount) * 1000L;
|
||||
Thread.sleep(backoffMs);
|
||||
}
|
||||
})
|
||||
.doTry()
|
||||
.toD(properties.getVectorization().getApiUrl() + "/embed?bridgeEndpoint=true&throwExceptionOnFailure=false&connectTimeout=" +
|
||||
properties.getVectorization().getConnectTimeout() + "&socketTimeout=" +
|
||||
properties.getVectorization().getSocketTimeout())
|
||||
.process(exchange -> {
|
||||
Integer statusCode = exchange.getIn().getHeader(Exchange.HTTP_RESPONSE_CODE, Integer.class);
|
||||
if (statusCode == null || statusCode != 200) {
|
||||
String body = exchange.getIn().getBody(String.class);
|
||||
throw new RuntimeException("Embedding service returned HTTP " + statusCode + ": " + body);
|
||||
}
|
||||
})
|
||||
.unmarshal().json(JsonLibrary.Jackson, EmbedResponse.class)
|
||||
.process(exchange -> {
|
||||
UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
|
||||
EmbedResponse response = exchange.getIn().getBody(EmbedResponse.class);
|
||||
if (response == null || response.embedding == null) {
|
||||
throw new RuntimeException("Embedding service returned null embedding response");
|
||||
}
|
||||
processingService.saveEmbedding(embeddingId, response.embedding, response.tokenCount);
|
||||
exchange.setProperty("vectorizationSuccess", true);
|
||||
})
|
||||
.doCatch(Exception.class)
|
||||
.process(exchange -> {
|
||||
UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
|
||||
Integer retryCount = exchange.getProperty("retryCount", Integer.class);
|
||||
Integer maxRetries = exchange.getProperty("maxRetries", Integer.class);
|
||||
Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class);
|
||||
String errorMsg = exception != null ? exception.getMessage() : "Unknown error";
|
||||
if (errorMsg != null && errorMsg.contains("Connection pool shut down")) {
|
||||
log.warn("Generic vectorization aborted for {} because the application is shutting down", embeddingId);
|
||||
exchange.setProperty("vectorizationSuccess", true);
|
||||
return;
|
||||
}
|
||||
if (retryCount >= maxRetries) {
|
||||
processingService.markAsFailed(embeddingId, errorMsg);
|
||||
} else {
|
||||
log.warn("Generic vectorization attempt #{} failed for {}: {}", retryCount, embeddingId, errorMsg);
|
||||
}
|
||||
})
|
||||
.end()
|
||||
.end()
|
||||
.end();
|
||||
|
||||
from("timer:generic-vectorization-scheduler?period=" + properties.getVectorization().getGenericSchedulerPeriodMs() + "&delay=500")
|
||||
.routeId(ROUTE_ID_SCHEDULER)
|
||||
.process(exchange -> {
|
||||
int batchSize = properties.getVectorization().getBatchSize();
|
||||
List<UUID> pending = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.PENDING, PageRequest.of(0, batchSize));
|
||||
List<UUID> failed = List.of();
|
||||
if (pending.isEmpty()) {
|
||||
failed = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.FAILED, PageRequest.of(0, batchSize));
|
||||
}
|
||||
List<UUID> toProcess = !pending.isEmpty() ? pending : failed;
|
||||
if (toProcess.isEmpty()) {
|
||||
exchange.setProperty("noPendingEmbeddings", true);
|
||||
} else {
|
||||
exchange.getIn().setBody(toProcess);
|
||||
}
|
||||
})
|
||||
.choice()
|
||||
.when(exchangeProperty("noPendingEmbeddings").isEqualTo(true))
|
||||
.log(LoggingLevel.DEBUG, "Generic vectorization scheduler: nothing pending")
|
||||
.otherwise()
|
||||
.split(body())
|
||||
.process(exchange -> {
|
||||
UUID embeddingId = exchange.getIn().getBody(UUID.class);
|
||||
exchange.getIn().setHeader("embeddingId", embeddingId);
|
||||
})
|
||||
.to("direct:vectorize-embedding")
|
||||
.end()
|
||||
.end();
|
||||
}
|
||||
|
||||
public static class EmbedRequest {
|
||||
@JsonProperty("text")
|
||||
public String text;
|
||||
|
||||
@JsonProperty("is_query")
|
||||
public boolean isQuery;
|
||||
}
|
||||
|
||||
public static class EmbedResponse {
|
||||
public float[] embedding;
|
||||
public int dimensions;
|
||||
@JsonProperty("token_count")
|
||||
public int tokenCount;
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,142 @@
|
||||
package at.procon.dip.vectorization.service;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentStatus;
|
||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||
import at.procon.dip.domain.document.service.DocumentService;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import at.procon.ted.model.entity.VectorizationStatus;
|
||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||
import at.procon.ted.service.VectorizationService;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Propagation;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
/**
|
||||
* Phase 2 generic vectorization processor that works on DOC text representations and DOC embeddings.
|
||||
* <p>
|
||||
* The service keeps the existing TED semantic search operational by optionally dual-writing completed
|
||||
* embeddings back into the legacy TED procurement_document vector columns, resolved by document hash.
|
||||
*/
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class DocumentEmbeddingProcessingService {
|
||||
|
||||
private final DocumentEmbeddingRepository embeddingRepository;
|
||||
private final DocumentService documentService;
|
||||
private final VectorizationService vectorizationService;
|
||||
private final TedProcessorProperties properties;
|
||||
private final ProcurementDocumentRepository procurementDocumentRepository;
|
||||
|
||||
@Transactional(propagation = Propagation.REQUIRES_NEW)
|
||||
public EmbeddingPayload prepareEmbeddingForVectorization(UUID embeddingId) {
|
||||
DocumentEmbedding embedding = embeddingRepository.findDetailedById(embeddingId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
|
||||
|
||||
if (embedding.getEmbeddingStatus() == EmbeddingStatus.PROCESSING) {
|
||||
log.debug("Embedding {} is already PROCESSING, skipping duplicate queue entry", embeddingId);
|
||||
return null;
|
||||
}
|
||||
|
||||
embedding.setEmbeddingStatus(EmbeddingStatus.PROCESSING);
|
||||
embedding.setErrorMessage(null);
|
||||
embeddingRepository.save(embedding);
|
||||
|
||||
String textBody = embedding.getRepresentation().getTextBody();
|
||||
if (textBody == null || textBody.isBlank()) {
|
||||
embedding.setEmbeddingStatus(EmbeddingStatus.SKIPPED);
|
||||
embedding.setErrorMessage("No text representation available");
|
||||
embedding.setEmbeddedAt(OffsetDateTime.now());
|
||||
embeddingRepository.save(embedding);
|
||||
documentService.updateStatus(embedding.getDocument().getId(), DocumentStatus.REPRESENTED);
|
||||
return null;
|
||||
}
|
||||
|
||||
int maxLength = properties.getVectorization().getMaxTextLength();
|
||||
if (textBody.length() > maxLength) {
|
||||
log.debug("Truncating representation {} for embedding {} from {} to {} chars",
|
||||
embedding.getRepresentation().getId(), embeddingId, textBody.length(), maxLength);
|
||||
textBody = textBody.substring(0, maxLength);
|
||||
}
|
||||
|
||||
return new EmbeddingPayload(
|
||||
embedding.getId(),
|
||||
embedding.getDocument().getId(),
|
||||
embedding.getDocument().getDedupHash(),
|
||||
textBody,
|
||||
embedding.getModel().getDimensions(),
|
||||
embedding.getModel().isQueryPrefixRequired(),
|
||||
embedding.getRepresentation().getId()
|
||||
);
|
||||
}
|
||||
|
||||
@Transactional(propagation = Propagation.REQUIRES_NEW)
|
||||
public void saveEmbedding(UUID embeddingId, float[] embedding, Integer tokenCount) {
|
||||
DocumentEmbedding loaded = embeddingRepository.findDetailedById(embeddingId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
|
||||
|
||||
int expectedDimensions = loaded.getModel().getDimensions();
|
||||
if (embedding == null || embedding.length != expectedDimensions) {
|
||||
throw new IllegalArgumentException("Invalid embedding dimension for " + embeddingId +
|
||||
": expected " + expectedDimensions + ", got " + (embedding == null ? 0 : embedding.length));
|
||||
}
|
||||
|
||||
String vectorString = vectorizationService.floatArrayToVectorString(embedding);
|
||||
embeddingRepository.updateEmbeddingVector(embeddingId, vectorString, tokenCount, embedding.length);
|
||||
documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.INDEXED);
|
||||
|
||||
if (properties.getVectorization().isDualWriteLegacyTedVectors()) {
|
||||
dualWriteLegacyTedVector(loaded, vectorString, tokenCount);
|
||||
}
|
||||
}
|
||||
|
||||
@Transactional(propagation = Propagation.REQUIRES_NEW)
|
||||
public void markAsFailed(UUID embeddingId, String errorMessage) {
|
||||
DocumentEmbedding loaded = embeddingRepository.findDetailedById(embeddingId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
|
||||
|
||||
embeddingRepository.updateEmbeddingStatus(embeddingId, EmbeddingStatus.FAILED, errorMessage, null);
|
||||
documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.FAILED);
|
||||
|
||||
if (properties.getVectorization().isDualWriteLegacyTedVectors()) {
|
||||
loaded.getDocument().getDedupHash();
|
||||
procurementDocumentRepository.findByDocumentHash(loaded.getDocument().getDedupHash())
|
||||
.ifPresent(doc -> procurementDocumentRepository.updateVectorizationStatus(
|
||||
doc.getId(), VectorizationStatus.FAILED, errorMessage, null));
|
||||
}
|
||||
}
|
||||
|
||||
private void dualWriteLegacyTedVector(DocumentEmbedding embedding, String vectorString, Integer tokenCount) {
|
||||
String dedupHash = embedding.getDocument().getDedupHash();
|
||||
if (dedupHash == null || dedupHash.isBlank()) {
|
||||
return;
|
||||
}
|
||||
|
||||
procurementDocumentRepository.findByDocumentHash(dedupHash)
|
||||
.ifPresentOrElse(
|
||||
legacy -> {
|
||||
procurementDocumentRepository.updateContentVector(legacy.getId(), vectorString, tokenCount);
|
||||
log.debug("Dual-wrote embedding {} back to legacy TED document {}", embedding.getId(), legacy.getId());
|
||||
},
|
||||
() -> log.debug("No legacy TED document found for DOC embedding {} with dedup hash {}",
|
||||
embedding.getId(), dedupHash)
|
||||
);
|
||||
}
|
||||
|
||||
public record EmbeddingPayload(
|
||||
UUID embeddingId,
|
||||
UUID documentId,
|
||||
String dedupHash,
|
||||
String textContent,
|
||||
Integer expectedDimensions,
|
||||
boolean queryPrefixRequired,
|
||||
UUID representationId
|
||||
) {
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.vectorization.spi;
|
||||
|
||||
/**
 * Descriptor of a single embedding model registered in the platform.
 *
 * @param modelKey            key of the model
 * @param provider            provider of the model
 * @param dimensions          vector dimensions
 * @param distanceMetric      distance metric, as a string
 * @param queryPrefixRequired query-prefix flag
 */
public record EmbeddingModelDescriptor(
        String modelKey,
        String provider,
        int dimensions,
        String distanceMetric,
        boolean queryPrefixRequired
) {
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.vectorization.spi;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Provider abstraction for vectorization backends.
|
||||
*/
|
||||
public interface EmbeddingProvider {
|
||||
|
||||
EmbeddingModelDescriptor model();
|
||||
|
||||
EmbeddingResult embed(List<String> texts, boolean queryMode);
|
||||
}
|
||||
@ -0,0 +1,13 @@
|
||||
package at.procon.dip.vectorization.spi;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Embedding output for one or more representations.
|
||||
*/
|
||||
public record EmbeddingResult(
|
||||
EmbeddingModelDescriptor model,
|
||||
List<float[]> vectors,
|
||||
List<String> warnings
|
||||
) {
|
||||
}
|
||||
@ -0,0 +1,41 @@
|
||||
package at.procon.dip.vectorization.startup;
|
||||
|
||||
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
||||
import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.boot.ApplicationArguments;
|
||||
import org.springframework.boot.ApplicationRunner;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
/**
|
||||
* Ensures the configured embedding model exists in DOC.doc_embedding_model.
|
||||
*/
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class ConfiguredEmbeddingModelStartupRunner implements ApplicationRunner {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final DocumentEmbeddingService embeddingService;
|
||||
|
||||
@Override
|
||||
public void run(ApplicationArguments args) {
|
||||
if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
|
||||
return;
|
||||
}
|
||||
|
||||
embeddingService.registerModel(new RegisterEmbeddingModelCommand(
|
||||
properties.getVectorization().getModelName(),
|
||||
properties.getVectorization().getEmbeddingProvider(),
|
||||
properties.getVectorization().getModelName(),
|
||||
properties.getVectorization().getDimensions(),
|
||||
null,
|
||||
false,
|
||||
true
|
||||
));
|
||||
|
||||
log.info("Phase 2 embedding model ensured: {}", properties.getVectorization().getModelName());
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,60 @@
|
||||
package at.procon.dip.vectorization.startup;
|
||||
|
||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.camel.ProducerTemplate;
|
||||
import org.springframework.boot.ApplicationArguments;
|
||||
import org.springframework.boot.ApplicationRunner;
|
||||
import org.springframework.data.domain.PageRequest;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
/**
|
||||
* Queues pending and failed DOC embeddings immediately on startup.
|
||||
*/
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class GenericVectorizationStartupRunner implements ApplicationRunner {
|
||||
|
||||
private static final int BATCH_SIZE = 1000;
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final DocumentEmbeddingRepository embeddingRepository;
|
||||
private final ProducerTemplate producerTemplate;
|
||||
|
||||
@Override
|
||||
public void run(ApplicationArguments args) {
|
||||
if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
|
||||
return;
|
||||
}
|
||||
|
||||
int queued = 0;
|
||||
queued += queueByStatus(EmbeddingStatus.PENDING, "PENDING");
|
||||
queued += queueByStatus(EmbeddingStatus.FAILED, "FAILED");
|
||||
log.info("Generic vectorization startup runner queued {} embedding jobs", queued);
|
||||
}
|
||||
|
||||
private int queueByStatus(EmbeddingStatus status, String label) {
|
||||
int queued = 0;
|
||||
int page = 0;
|
||||
List<UUID> ids;
|
||||
do {
|
||||
ids = embeddingRepository.findIdsByEmbeddingStatus(status, PageRequest.of(page, BATCH_SIZE));
|
||||
for (UUID id : ids) {
|
||||
try {
|
||||
producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", null, "embeddingId", id);
|
||||
queued++;
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to queue {} embedding {}: {}", label, id, e.getMessage());
|
||||
}
|
||||
}
|
||||
page++;
|
||||
} while (ids.size() == BATCH_SIZE);
|
||||
return queued;
|
||||
}
|
||||
}
|
||||
@ -1,26 +1,20 @@
|
||||
package at.procon.ted;
|
||||
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.scheduling.annotation.EnableAsync;
|
||||
import at.procon.dip.DocumentIntelligencePlatformApplication;
|
||||
|
||||
/**
|
||||
* TED Procurement Document Processor Application.
|
||||
*
|
||||
* Processes EU eForms public procurement notices from TED (Tenders Electronic Daily).
|
||||
* Features:
|
||||
* - Directory watching with Apache Camel for automated XML processing
|
||||
* - PostgreSQL storage with native XML support and pgvector for semantic search
|
||||
* - Asynchronous document vectorization using multilingual-e5-large model
|
||||
* - REST API for structured and semantic search
|
||||
*
|
||||
* @author Martin.Schweitzer@procon.co.at and claude.ai
|
||||
* Legacy entry point kept for backward compatibility.
|
||||
*
|
||||
* <p>The platform is being generalized beyond TED-specific procurement documents.
|
||||
* New runtime packaging should use {@link DocumentIntelligencePlatformApplication}.</p>
|
||||
*/
|
||||
@SpringBootApplication
|
||||
@EnableAsync
|
||||
public class TedProcurementProcessorApplication {
|
||||
@Deprecated(forRemoval = false, since = "1.1.0")
|
||||
public final class TedProcurementProcessorApplication {
|
||||
|
||||
private TedProcurementProcessorApplication() {
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
SpringApplication.run(TedProcurementProcessorApplication.class, args);
|
||||
DocumentIntelligencePlatformApplication.main(args);
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,197 @@
|
||||
package at.procon.ted.service;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentStatus;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import at.procon.dip.domain.document.StorageType;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||
import at.procon.dip.domain.document.entity.DocumentSource;
|
||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||
import at.procon.dip.domain.document.repository.DocumentContentRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
||||
import at.procon.dip.domain.document.service.DocumentService;
|
||||
import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import at.procon.ted.model.entity.ProcurementDocument;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
/**
|
||||
* Phase 2 bridge that dual-writes TED documents into the generic DOC persistence backbone.
|
||||
*/
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class TedPhase2GenericDocumentService {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final DocumentRepository documentRepository;
|
||||
private final DocumentContentRepository contentRepository;
|
||||
private final DocumentSourceRepository sourceRepository;
|
||||
private final DocumentTextRepresentationRepository representationRepository;
|
||||
private final DocumentEmbeddingRepository embeddingRepository;
|
||||
private final DocumentService documentService;
|
||||
private final DocumentEmbeddingService embeddingService;
|
||||
|
||||
@Transactional
|
||||
public UUID registerOrRefreshTedDocument(ProcurementDocument tedDocument) {
|
||||
if (!properties.getVectorization().isGenericPipelineEnabled()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Document document = documentRepository.findByDedupHash(tedDocument.getDocumentHash())
|
||||
.orElseGet(() -> createGenericDocument(tedDocument));
|
||||
|
||||
document.setDocumentType(DocumentType.TED_NOTICE);
|
||||
document.setDocumentFamily(DocumentFamily.PROCUREMENT);
|
||||
document.setVisibility(DocumentVisibility.PUBLIC);
|
||||
document.setStatus(DocumentStatus.REPRESENTED);
|
||||
document.setTitle(tedDocument.getProjectTitle());
|
||||
document.setSummary(tedDocument.getProjectDescription());
|
||||
document.setLanguageCode(tedDocument.getLanguageCode());
|
||||
document.setMimeType("application/xml");
|
||||
document.setBusinessKey(buildBusinessKey(tedDocument));
|
||||
document.setDedupHash(tedDocument.getDocumentHash());
|
||||
document = documentRepository.save(document);
|
||||
|
||||
ensureTedSource(document, tedDocument);
|
||||
DocumentContent originalContent = ensureOriginalContent(document, tedDocument);
|
||||
DocumentTextRepresentation representation = ensurePrimaryRepresentation(document, originalContent, tedDocument);
|
||||
DocumentEmbedding embedding = ensurePendingEmbedding(document, representation);
|
||||
|
||||
log.debug("Phase 2 DOC bridge ensured generic TED document {} -> embedding {}", document.getId(), embedding.getId());
|
||||
return embedding.getId();
|
||||
}
|
||||
|
||||
private Document createGenericDocument(ProcurementDocument tedDocument) {
|
||||
return documentService.create(new at.procon.dip.domain.document.service.command.CreateDocumentCommand(
|
||||
null,
|
||||
DocumentVisibility.PUBLIC,
|
||||
DocumentType.TED_NOTICE,
|
||||
DocumentFamily.PROCUREMENT,
|
||||
DocumentStatus.REPRESENTED,
|
||||
tedDocument.getProjectTitle(),
|
||||
tedDocument.getProjectDescription(),
|
||||
tedDocument.getLanguageCode(),
|
||||
"application/xml",
|
||||
buildBusinessKey(tedDocument),
|
||||
tedDocument.getDocumentHash()
|
||||
));
|
||||
}
|
||||
|
||||
private void ensureTedSource(Document document, ProcurementDocument tedDocument) {
|
||||
String externalId = tedDocument.getPublicationId() != null ? tedDocument.getPublicationId() : tedDocument.getId().toString();
|
||||
boolean sourceExists = sourceRepository.findByDocument_Id(document.getId()).stream()
|
||||
.anyMatch(existing -> externalId.equals(existing.getExternalSourceId()));
|
||||
if (sourceExists) {
|
||||
return;
|
||||
}
|
||||
|
||||
DocumentSource source = DocumentSource.builder()
|
||||
.document(document)
|
||||
.sourceType(SourceType.FILE_SYSTEM)
|
||||
.externalSourceId(externalId)
|
||||
.sourceUri(tedDocument.getSourcePath())
|
||||
.sourceFilename(tedDocument.getSourceFilename())
|
||||
.importBatchId("ted-phase2")
|
||||
.receivedAt(OffsetDateTime.now())
|
||||
.build();
|
||||
sourceRepository.save(source);
|
||||
}
|
||||
|
||||
private DocumentContent ensureOriginalContent(Document document, ProcurementDocument tedDocument) {
|
||||
List<DocumentContent> existing = contentRepository.findByDocument_IdAndContentRole(document.getId(), ContentRole.ORIGINAL);
|
||||
if (!existing.isEmpty()) {
|
||||
DocumentContent content = existing.get(0);
|
||||
content.setMimeType("application/xml");
|
||||
content.setStorageType(StorageType.DB_TEXT);
|
||||
content.setTextContent(tedDocument.getXmlDocument());
|
||||
content.setContentHash(tedDocument.getDocumentHash());
|
||||
content.setSizeBytes(tedDocument.getFileSizeBytes());
|
||||
return contentRepository.save(content);
|
||||
}
|
||||
|
||||
DocumentContent content = DocumentContent.builder()
|
||||
.document(document)
|
||||
.contentRole(ContentRole.ORIGINAL)
|
||||
.storageType(StorageType.DB_TEXT)
|
||||
.mimeType("application/xml")
|
||||
.charsetName("UTF-8")
|
||||
.textContent(tedDocument.getXmlDocument())
|
||||
.contentHash(tedDocument.getDocumentHash())
|
||||
.sizeBytes(tedDocument.getFileSizeBytes())
|
||||
.build();
|
||||
return contentRepository.save(content);
|
||||
}
|
||||
|
||||
private DocumentTextRepresentation ensurePrimaryRepresentation(Document document,
|
||||
DocumentContent originalContent,
|
||||
ProcurementDocument tedDocument) {
|
||||
DocumentTextRepresentation representation = representationRepository
|
||||
.findFirstByDocument_IdAndPrimaryRepresentationTrue(document.getId())
|
||||
.orElseGet(DocumentTextRepresentation::new);
|
||||
|
||||
representation.setDocument(document);
|
||||
representation.setContent(originalContent);
|
||||
representation.setRepresentationType(RepresentationType.SEMANTIC_TEXT);
|
||||
representation.setBuilderKey(properties.getVectorization().getPrimaryRepresentationBuilderKey());
|
||||
representation.setLanguageCode(tedDocument.getLanguageCode());
|
||||
representation.setPrimaryRepresentation(true);
|
||||
representation.setTextBody(tedDocument.getTextContent() != null ? tedDocument.getTextContent() : tedDocument.getProjectDescription());
|
||||
representation.setTokenCount(null);
|
||||
representation.setChunkIndex(null);
|
||||
representation.setChunkStartOffset(null);
|
||||
representation.setChunkEndOffset(null);
|
||||
return representationRepository.save(representation);
|
||||
}
|
||||
|
||||
private DocumentEmbedding ensurePendingEmbedding(Document document, DocumentTextRepresentation representation) {
|
||||
DocumentEmbeddingModel model = embeddingService.registerModel(new RegisterEmbeddingModelCommand(
|
||||
properties.getVectorization().getModelName(),
|
||||
properties.getVectorization().getEmbeddingProvider(),
|
||||
properties.getVectorization().getModelName(),
|
||||
properties.getVectorization().getDimensions(),
|
||||
null,
|
||||
false,
|
||||
true
|
||||
));
|
||||
|
||||
return embeddingRepository.findByRepresentation_IdAndModel_Id(representation.getId(), model.getId())
|
||||
.map(existing -> {
|
||||
existing.setDocument(document);
|
||||
existing.setRepresentation(representation);
|
||||
existing.setModel(model);
|
||||
existing.setEmbeddingStatus(at.procon.dip.domain.document.EmbeddingStatus.PENDING);
|
||||
existing.setErrorMessage(null);
|
||||
existing.setEmbeddedAt(null);
|
||||
return embeddingRepository.save(existing);
|
||||
})
|
||||
.orElseGet(() -> embeddingService.createPendingEmbedding(document.getId(), representation.getId(), model.getId()));
|
||||
}
|
||||
|
||||
private String buildBusinessKey(ProcurementDocument tedDocument) {
|
||||
if (tedDocument.getPublicationId() != null && !tedDocument.getPublicationId().isBlank()) {
|
||||
return "TED:publication:" + tedDocument.getPublicationId();
|
||||
}
|
||||
if (tedDocument.getNoticeUrl() != null && !tedDocument.getNoticeUrl().isBlank()) {
|
||||
return "TED:url:" + tedDocument.getNoticeUrl();
|
||||
}
|
||||
return "TED:hash:" + tedDocument.getDocumentHash();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,281 @@
|
||||
-- Phase 1: Generic DOC persistence backbone for the Procon Document Intelligence Platform
|
||||
-- This migration is additive and intentionally does not modify the existing TED runtime tables.
|
||||
|
||||
CREATE SCHEMA IF NOT EXISTS DOC;
|
||||
|
||||
SET search_path TO TED, DOC, public;
|
||||
|
||||
DO $$
|
||||
BEGIN
|
||||
CREATE EXTENSION IF NOT EXISTS pgcrypto SCHEMA public;
|
||||
EXCEPTION
|
||||
WHEN insufficient_privilege THEN
|
||||
RAISE NOTICE 'Skipping pgcrypto extension creation (insufficient privileges)';
|
||||
WHEN duplicate_object THEN
|
||||
RAISE NOTICE 'Extension pgcrypto already exists';
|
||||
END
|
||||
$$;
|
||||
|
||||
DO $$
|
||||
BEGIN
|
||||
CREATE EXTENSION IF NOT EXISTS vector SCHEMA public;
|
||||
EXCEPTION
|
||||
WHEN insufficient_privilege THEN
|
||||
RAISE NOTICE 'Skipping vector extension creation (insufficient privileges)';
|
||||
WHEN duplicate_object THEN
|
||||
RAISE NOTICE 'Extension vector already exists';
|
||||
WHEN undefined_file THEN
|
||||
RAISE WARNING 'Extension vector not available - install pgvector on the database server';
|
||||
END
|
||||
$$;
|
||||
|
||||
-- Enum types of the DOC schema.
-- Each type is created only if it does not yet exist IN SCHEMA DOC. The existence check joins
-- pg_namespace because pg_type.typname alone is not unique across schemas: a same-named type in
-- any other schema would otherwise suppress creation of the DOC type referenced by the tables below.
-- The unquoted identifier DOC is folded to lower case by PostgreSQL, hence nspname = 'doc'.

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_document_visibility'
    ) THEN
        CREATE TYPE DOC.doc_document_visibility AS ENUM ('PUBLIC', 'TENANT', 'SHARED', 'RESTRICTED');
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_document_type'
    ) THEN
        CREATE TYPE DOC.doc_document_type AS ENUM (
            'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
            'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN'
        );
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_document_family'
    ) THEN
        CREATE TYPE DOC.doc_document_family AS ENUM ('PROCUREMENT', 'MAIL', 'ATTACHMENT', 'KNOWLEDGE', 'GENERIC');
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_document_status'
    ) THEN
        CREATE TYPE DOC.doc_document_status AS ENUM ('RECEIVED', 'CLASSIFIED', 'EXTRACTED', 'REPRESENTED', 'INDEXED', 'FAILED', 'ARCHIVED');
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_source_type'
    ) THEN
        CREATE TYPE DOC.doc_source_type AS ENUM ('TED_PACKAGE', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD', 'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION');
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_content_role'
    ) THEN
        CREATE TYPE DOC.doc_content_role AS ENUM (
            'ORIGINAL', 'NORMALIZED_TEXT', 'OCR_TEXT', 'HTML_CLEAN',
            'EXTRACTED_METADATA_JSON', 'THUMBNAIL', 'DERIVED_BINARY'
        );
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_storage_type'
    ) THEN
        CREATE TYPE DOC.doc_storage_type AS ENUM ('DB_TEXT', 'DB_BINARY', 'FILE_PATH', 'OBJECT_STORAGE', 'EXTERNAL_REFERENCE');
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_representation_type'
    ) THEN
        CREATE TYPE DOC.doc_representation_type AS ENUM ('FULLTEXT', 'SEMANTIC_TEXT', 'SUMMARY', 'TITLE_ABSTRACT', 'CHUNK', 'METADATA_ENRICHED');
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_embedding_status'
    ) THEN
        CREATE TYPE DOC.doc_embedding_status AS ENUM ('PENDING', 'PROCESSING', 'COMPLETED', 'FAILED', 'SKIPPED');
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_distance_metric'
    ) THEN
        CREATE TYPE DOC.doc_distance_metric AS ENUM ('COSINE', 'L2', 'INNER_PRODUCT');
    END IF;
END
$$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_type t JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc' AND t.typname = 'doc_relation_type'
    ) THEN
        CREATE TYPE DOC.doc_relation_type AS ENUM ('CONTAINS', 'ATTACHMENT_OF', 'EXTRACTED_FROM', 'DERIVED_FROM', 'PART_OF', 'VERSION_OF', 'RELATED_TO');
    END IF;
END
$$;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOC.doc_tenant (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_key VARCHAR(120) NOT NULL UNIQUE,
|
||||
display_name VARCHAR(255) NOT NULL,
|
||||
description TEXT,
|
||||
active BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOC.doc_document (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
owner_tenant_id UUID REFERENCES DOC.doc_tenant(id),
|
||||
visibility DOC.doc_document_visibility NOT NULL,
|
||||
document_type DOC.doc_document_type NOT NULL,
|
||||
document_family DOC.doc_document_family NOT NULL,
|
||||
status DOC.doc_document_status NOT NULL DEFAULT 'RECEIVED',
|
||||
title VARCHAR(1000),
|
||||
summary TEXT,
|
||||
language_code VARCHAR(16),
|
||||
mime_type VARCHAR(255),
|
||||
business_key VARCHAR(255),
|
||||
dedup_hash VARCHAR(64),
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOC.doc_source (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
|
||||
source_type DOC.doc_source_type NOT NULL,
|
||||
external_source_id VARCHAR(500),
|
||||
source_uri TEXT,
|
||||
source_filename VARCHAR(1000),
|
||||
parent_source_id UUID,
|
||||
import_batch_id VARCHAR(255),
|
||||
received_at TIMESTAMP WITH TIME ZONE,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOC.doc_content (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
|
||||
content_role DOC.doc_content_role NOT NULL,
|
||||
storage_type DOC.doc_storage_type NOT NULL,
|
||||
mime_type VARCHAR(255),
|
||||
charset_name VARCHAR(120),
|
||||
text_content TEXT,
|
||||
binary_ref TEXT,
|
||||
content_hash VARCHAR(64),
|
||||
size_bytes BIGINT,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOC.doc_text_representation (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
|
||||
content_id UUID REFERENCES DOC.doc_content(id) ON DELETE SET NULL,
|
||||
representation_type DOC.doc_representation_type NOT NULL,
|
||||
builder_key VARCHAR(255),
|
||||
language_code VARCHAR(16),
|
||||
token_count INTEGER,
|
||||
char_count INTEGER,
|
||||
chunk_index INTEGER,
|
||||
chunk_start_offset INTEGER,
|
||||
chunk_end_offset INTEGER,
|
||||
is_primary BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
text_body TEXT NOT NULL,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOC.doc_embedding_model (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
model_key VARCHAR(255) NOT NULL UNIQUE,
|
||||
provider VARCHAR(120) NOT NULL,
|
||||
display_name VARCHAR(255),
|
||||
dimensions INTEGER NOT NULL,
|
||||
distance_metric DOC.doc_distance_metric NOT NULL DEFAULT 'COSINE',
|
||||
query_prefix_required BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
active BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOC.doc_embedding (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
|
||||
representation_id UUID NOT NULL REFERENCES DOC.doc_text_representation(id) ON DELETE CASCADE,
|
||||
model_id UUID NOT NULL REFERENCES DOC.doc_embedding_model(id),
|
||||
embedding_status DOC.doc_embedding_status NOT NULL DEFAULT 'PENDING',
|
||||
token_count INTEGER,
|
||||
embedding_dimensions INTEGER,
|
||||
error_message TEXT,
|
||||
embedded_at TIMESTAMP WITH TIME ZONE,
|
||||
embedding_vector public.vector,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOC.doc_relation (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
parent_document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
|
||||
child_document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
|
||||
relation_type DOC.doc_relation_type NOT NULL,
|
||||
sort_order INTEGER,
|
||||
relation_metadata TEXT,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
CONSTRAINT chk_doc_relation_no_self CHECK (parent_document_id <> child_document_id)
|
||||
);
|
||||
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_tenant_key ON DOC.doc_tenant(tenant_key);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_tenant_active ON DOC.doc_tenant(active);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_type ON DOC.doc_document(document_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_family ON DOC.doc_document(document_family);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_status ON DOC.doc_document(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_visibility ON DOC.doc_document(visibility);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_owner_tenant ON DOC.doc_document(owner_tenant_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_dedup_hash ON DOC.doc_document(dedup_hash);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_business_key ON DOC.doc_document(business_key);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_created_at ON DOC.doc_document(created_at DESC);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_source_document ON DOC.doc_source(document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_source_type ON DOC.doc_source(source_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_source_external_id ON DOC.doc_source(external_source_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_source_received_at ON DOC.doc_source(received_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_source_parent_source ON DOC.doc_source(parent_source_id);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_content_document ON DOC.doc_content(document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_content_role ON DOC.doc_content(content_role);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_content_hash ON DOC.doc_content(content_hash);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_content_storage_type ON DOC.doc_content(storage_type);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_document ON DOC.doc_text_representation(document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_content ON DOC.doc_text_representation(content_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_type ON DOC.doc_text_representation(representation_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_primary ON DOC.doc_text_representation(is_primary);
|
||||
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_embedding_model_key ON DOC.doc_embedding_model(model_key);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_embedding_model_active ON DOC.doc_embedding_model(active);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_embedding_document ON DOC.doc_embedding(document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_embedding_repr ON DOC.doc_embedding(representation_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_embedding_model ON DOC.doc_embedding(model_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_embedding_status ON DOC.doc_embedding(embedding_status);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_embedding_embedded_at ON DOC.doc_embedding(embedded_at DESC);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_relation_parent ON DOC.doc_relation(parent_document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_relation_child ON DOC.doc_relation(child_document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_relation_type ON DOC.doc_relation(relation_type);
|
||||
|
||||
COMMENT ON SCHEMA DOC IS 'Generic document platform schema introduced in Phase 1';
|
||||
COMMENT ON TABLE DOC.doc_document IS 'Canonical document root with optional owner tenant and mandatory visibility';
|
||||
COMMENT ON TABLE DOC.doc_content IS 'Stored payload variants for a canonical document';
|
||||
COMMENT ON TABLE DOC.doc_text_representation IS 'Search-oriented text representations derived from document content';
|
||||
COMMENT ON TABLE DOC.doc_embedding IS 'Embedding lifecycle separated from document structure';
|
||||
@ -0,0 +1,14 @@
|
||||
-- Phase 2: Vectorization decoupling support in the generic DOC schema
|
||||
-- Adds safety constraints and indexes for representation-based embedding processing.
|
||||
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS uq_doc_embedding_representation_model
|
||||
ON DOC.doc_embedding(representation_id, model_id);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_embedding_status_created
|
||||
ON DOC.doc_embedding(embedding_status, created_at);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_embedding_status_updated
|
||||
ON DOC.doc_embedding(embedding_status, updated_at);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_document_primary
|
||||
ON DOC.doc_text_representation(document_id, is_primary);
|
||||
Loading…
Reference in New Issue