Refactor phases 0-2
parent
21edbc35a2
commit
71fb43a5ea
@ -0,0 +1,48 @@
|
|||||||
|
# Phase 2 - Representation-based vectorization and dual-write compatibility
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Decouple vectorization from the TED document entity so arbitrary document types can use a shared
representation-to-embedding pipeline.
|
||||||
|
|
||||||
|
## Primary changes
|
||||||
|
|
||||||
|
1. **Primary vectorization source**
|
||||||
|
- before: `TED.procurement_document.text_content`
|
||||||
|
- now: `DOC.doc_text_representation.text_body`
|
||||||
|
|
||||||
|
2. **Primary vectorization target**
|
||||||
|
- before: `TED.procurement_document.content_vector`
|
||||||
|
- now: `DOC.doc_embedding.embedding_vector`
|
||||||
|
|
||||||
|
3. **Compatibility during migration**
|
||||||
|
- completed embeddings are optionally mirrored back to the legacy TED vector columns using the
  shared TED document hash (`document_hash` / `dedup_hash`)
|
||||||
|
|
||||||
|
4. **TED dual-write bridge**
|
||||||
|
- fresh TED documents are projected into the generic `DOC` model immediately after persistence
|
||||||
|
|
||||||
|
## Key services introduced
|
||||||
|
|
||||||
|
- `TedPhase2GenericDocumentService`
|
||||||
|
- creates/refreshes generic DOC records for TED notices
|
||||||
|
- `DocumentEmbeddingProcessingService`
|
||||||
|
- processes DOC embedding lifecycle records
|
||||||
|
- `GenericVectorizationRoute`
|
||||||
|
- scheduler + worker route for asynchronous DOC embedding generation
|
||||||
|
- `ConfiguredEmbeddingModelStartupRunner`
|
||||||
|
- ensures the configured embedding model exists in `DOC.doc_embedding_model`
|
||||||
|
- `GenericVectorizationStartupRunner`
|
||||||
|
- queues pending/failed DOC embeddings on startup
|
||||||
|
|
||||||
|
## Behavior when Phase 2 is enabled
|
||||||
|
|
||||||
|
- legacy `VectorizationRoute` is disabled
|
||||||
|
- legacy startup queueing is disabled
|
||||||
|
- legacy event-based vectorization queueing is disabled
|
||||||
|
- generic scheduler and startup runner handle DOC embeddings instead
|
||||||
|
|
||||||
|
## Compatibility intent
|
||||||
|
|
||||||
|
This phase keeps the existing TED search endpoints working while the new generic indexing layer becomes
operational. The next phase can migrate search reads from the TED table to `DOC.doc_embedding`.
|
||||||
@ -0,0 +1,28 @@
|
|||||||
|
package at.procon.dip;
|
||||||
|
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import org.springframework.boot.SpringApplication;
|
||||||
|
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||||
|
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||||
|
import org.springframework.boot.autoconfigure.domain.EntityScan;
|
||||||
|
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||||
|
import org.springframework.scheduling.annotation.EnableAsync;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Procon Document Intelligence Platform (DIP).
|
||||||
|
*
|
||||||
|
* <p>Phase 0 introduces a generic platform root namespace and architecture contracts
|
||||||
|
* while keeping the existing TED-specific runtime intact. Subsequent phases can move
|
||||||
|
* modules incrementally from {@code at.procon.ted} into the broader document platform.</p>
|
||||||
|
*/
|
||||||
|
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
||||||
|
@EnableAsync
|
||||||
|
//@EnableConfigurationProperties(TedProcessorProperties.class)
|
||||||
|
@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity"})
|
||||||
|
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository"})
|
||||||
|
public class DocumentIntelligencePlatformApplication {
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
SpringApplication.run(DocumentIntelligencePlatformApplication.class, args);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,28 @@
|
|||||||
|
package at.procon.dip;
|
||||||
|
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import org.springframework.boot.SpringApplication;
|
||||||
|
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||||
|
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||||
|
import org.springframework.boot.autoconfigure.domain.EntityScan;
|
||||||
|
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||||
|
import org.springframework.scheduling.annotation.EnableAsync;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Procon Document Intelligence Platform (DIP).
|
||||||
|
*
|
||||||
|
* <p>Phase 0 introduces a generic platform root namespace and architecture contracts
|
||||||
|
* while keeping the existing TED-specific runtime intact. Subsequent phases can move
|
||||||
|
* modules incrementally from {@code at.procon.ted} into the broader document platform.</p>
|
||||||
|
*/
|
||||||
|
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
||||||
|
@EnableAsync
|
||||||
|
//@EnableConfigurationProperties(TedProcessorProperties.class)
|
||||||
|
@EntityScan(basePackages = {"at.procon.ted.model.entity"})
|
||||||
|
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository"})
|
||||||
|
public class DocumentIntelligencePlatformApplication {
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
SpringApplication.run(DocumentIntelligencePlatformApplication.class, args);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
# Phase 2 - Vectorization decoupling
|
||||||
|
|
||||||
|
Phase 2 moves the primary vectorization pipeline from `TED.procurement_document` to the generic `DOC`
representation and embedding model introduced in Phase 1.
|
||||||
|
|
||||||
|
Implemented in this phase:
|
||||||
|
- `DOC.doc_text_representation` is now the primary text source for embeddings
|
||||||
|
- `DOC.doc_embedding` is the primary persistence target for embedding lifecycle and vectors
|
||||||
|
- a generic Camel route processes pending/failed embeddings asynchronously
|
||||||
|
- TED imports dual-write into the generic model by creating:
|
||||||
|
- canonical `DOC.doc_document`
|
||||||
|
- original `DOC.doc_content`
|
||||||
|
- primary `DOC.doc_text_representation`
|
||||||
|
- pending `DOC.doc_embedding`
|
||||||
|
- compatibility mode keeps writing completed TED embeddings back into
  `TED.procurement_document.content_vector` so the legacy semantic search continues to work
|
||||||
|
|
||||||
|
This phase is intentionally additive and does not yet migrate TED semantic search reads away from the legacy table.
|
||||||
@ -0,0 +1,45 @@
|
|||||||
|
package at.procon.dip.architecture;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Central architecture constants for the generalized platform.
 *
 * <p>Phase 1 extends the package map with the additive generic persistence backbone. The map
 * also lists the vectorization service, Camel route and startup packages.</p>
 *
 * <p>This is a constants holder and cannot be instantiated.</p>
 */
public final class PlatformArchitecture {

    /** Full product name of the platform. */
    public static final String PLATFORM_NAME = "Procon Document Intelligence Platform";

    /** Short product name of the platform. */
    public static final String PLATFORM_SHORT_NAME = "DIP";

    /** Root Java package of the generic platform code. */
    public static final String BASE_NAMESPACE = "at.procon.dip";

    /** Root Java package of the pre-existing TED-specific code. */
    public static final String LEGACY_NAMESPACE = "at.procon.ted";

    /** Name of the generic database schema; holds the same value as {@code SchemaNames.DOC}. */
    public static final String GENERIC_SCHEMA = "DOC";

    /** Name of the TED database schema; holds the same value as {@code SchemaNames.TED}. */
    public static final String TED_SCHEMA = "TED";

    /**
     * Package areas that belong to the generic platform, as an unmodifiable list.
     * Every entry is located below {@link #BASE_NAMESPACE}.
     */
    public static final List<String> GENERIC_PACKAGE_AREAS = List.of(
            "at.procon.dip.architecture",
            "at.procon.dip.domain.access",
            "at.procon.dip.domain.document",
            "at.procon.dip.domain.tenant",
            "at.procon.dip.domain.document.entity",
            "at.procon.dip.domain.document.repository",
            "at.procon.dip.domain.document.service",
            "at.procon.dip.domain.tenant.entity",
            "at.procon.dip.domain.tenant.repository",
            "at.procon.dip.domain.tenant.service",
            "at.procon.dip.ingestion.spi",
            "at.procon.dip.classification.spi",
            "at.procon.dip.extraction.spi",
            "at.procon.dip.normalization.spi",
            "at.procon.dip.vectorization.spi",
            "at.procon.dip.vectorization.service",
            "at.procon.dip.vectorization.camel",
            "at.procon.dip.vectorization.startup",
            "at.procon.dip.search.spi",
            "at.procon.dip.processing.spi",
            "at.procon.dip.migration"
    );

    private PlatformArchitecture() {
    }
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.architecture;
|
||||||
|
|
||||||
|
/**
 * Target schema names for the generalized model.
 *
 * <p>The constants are referenced from JPA {@code @Table(schema = ...)} annotations, so they
 * must remain compile-time constant strings. This class cannot be instantiated.</p>
 */
public final class SchemaNames {

    /** Schema of the generic document model. */
    public static final String DOC = "DOC";

    /** Schema of the TED-specific model. */
    public static final String TED = "TED";

    private SchemaNames() {
    }
}
|
||||||
@ -0,0 +1,17 @@
|
|||||||
|
package at.procon.dip.classification.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Result of document type detection/classification.
|
||||||
|
*/
|
||||||
|
public record DetectionResult(
|
||||||
|
DocumentType documentType,
|
||||||
|
DocumentFamily documentFamily,
|
||||||
|
String mimeType,
|
||||||
|
String languageCode,
|
||||||
|
Map<String, String> attributes
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.classification.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines a canonical type/family before extraction starts.
|
||||||
|
*/
|
||||||
|
public interface DocumentTypeDetector {
|
||||||
|
|
||||||
|
boolean supports(SourceDescriptor sourceDescriptor);
|
||||||
|
|
||||||
|
DetectionResult detect(SourceDescriptor sourceDescriptor);
|
||||||
|
}
|
||||||
@ -0,0 +1,31 @@
|
|||||||
|
package at.procon.dip.domain.access;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.tenant.TenantRef;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Canonical ownership and visibility descriptor for a document.
|
||||||
|
* <p>
|
||||||
|
* A document may have no owner tenant, for example public TED notices.
|
||||||
|
* Visibility is always mandatory and defines who may search/read the document.
|
||||||
|
*/
|
||||||
|
public record DocumentAccessContext(
|
||||||
|
TenantRef ownerTenant,
|
||||||
|
DocumentVisibility visibility
|
||||||
|
) {
|
||||||
|
|
||||||
|
public DocumentAccessContext {
|
||||||
|
Objects.requireNonNull(visibility, "visibility must not be null");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static DocumentAccessContext publicDocument() {
|
||||||
|
return new DocumentAccessContext(null, DocumentVisibility.PUBLIC);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static DocumentAccessContext tenantOwned(TenantRef ownerTenant) {
|
||||||
|
return new DocumentAccessContext(
|
||||||
|
Objects.requireNonNull(ownerTenant, "ownerTenant must not be null"),
|
||||||
|
DocumentVisibility.TENANT
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
package at.procon.dip.domain.access;
|
||||||
|
|
||||||
|
/**
 * Describes who may access a document independently from ownership.
 *
 * <p>The {@code Document} entity stores this value by constant name in its {@code visibility}
 * column (length 32), so renaming a constant affects persisted rows.</p>
 */
public enum DocumentVisibility {

    /** Used for owner-less documents such as public TED notices; default of the document entity. */
    PUBLIC,

    /** Used by {@code DocumentAccessContext.tenantOwned(...)} for documents owned by a tenant. */
    TENANT,

    // NOTE(review): the exact access rules for SHARED and RESTRICTED are not defined in the
    // code shown with this enum - document them once enforcement is implemented.
    SHARED,
    RESTRICTED
}
|
||||||
@ -0,0 +1,23 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimal canonical document descriptor used by Phase 0 SPI contracts.
|
||||||
|
*/
|
||||||
|
public record CanonicalDocumentMetadata(
|
||||||
|
UUID documentId,
|
||||||
|
DocumentAccessContext accessContext,
|
||||||
|
DocumentType documentType,
|
||||||
|
DocumentFamily documentFamily,
|
||||||
|
DocumentStatus status,
|
||||||
|
String title,
|
||||||
|
String languageCode,
|
||||||
|
String mimeType,
|
||||||
|
String dedupHash,
|
||||||
|
OffsetDateTime createdAt,
|
||||||
|
OffsetDateTime updatedAt
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Role of a stored content version.
 *
 * <p>The {@code DocumentContent} entity stores this value by constant name in its
 * {@code content_role} column (length 64), so renaming a constant affects persisted rows.</p>
 */
public enum ContentRole {
    ORIGINAL,
    NORMALIZED_TEXT,
    OCR_TEXT,
    HTML_CLEAN,
    EXTRACTED_METADATA_JSON,
    THUMBNAIL,
    DERIVED_BINARY
}
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Distance metric used by an embedding model.
 *
 * <p>The {@code DocumentEmbeddingModel} entity stores this value by constant name in its
 * {@code distance_metric} column (length 32) and defaults to {@link #COSINE}.</p>
 */
public enum DistanceMetric {
    COSINE,
    L2,
    INNER_PRODUCT
}
|
||||||
@ -0,0 +1,12 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Functional grouping used for broad search and routing decisions.
 *
 * <p>The {@code Document} entity stores this value by constant name in its
 * {@code document_family} column (length 64), so renaming a constant affects persisted rows.</p>
 */
public enum DocumentFamily {
    PROCUREMENT,
    MAIL,
    ATTACHMENT,
    KNOWLEDGE,
    GENERIC
}
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Generic lifecycle state for a canonical document.
 *
 * <p>The {@code Document} entity stores this value by constant name in its {@code status}
 * column (length 32) and starts new documents in {@link #RECEIVED}.</p>
 */
public enum DocumentStatus {
    RECEIVED,
    CLASSIFIED,
    EXTRACTED,
    REPRESENTED,
    INDEXED,
    FAILED,
    ARCHIVED
}
|
||||||
@ -0,0 +1,19 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Canonical technical document type.
 *
 * <p>The {@code Document} entity stores this value by constant name in its
 * {@code document_type} column (length 64), so renaming a constant affects persisted rows.</p>
 */
public enum DocumentType {
    TED_NOTICE,
    EMAIL,
    MIME_MESSAGE,
    PDF,
    DOCX,
    HTML,
    XML_GENERIC,
    TEXT,
    MARKDOWN,
    ZIP_ARCHIVE,
    GENERIC_BINARY,
    UNKNOWN
}
|
||||||
@ -0,0 +1,12 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Generic lifecycle state of an embedding record in the DOC schema.
 *
 * <p>The {@code DocumentEmbedding} entity stores this value by constant name in its
 * {@code embedding_status} column (length 32) and starts new records in {@link #PENDING}.</p>
 */
public enum EmbeddingStatus {
    PENDING,
    PROCESSING,
    COMPLETED,
    FAILED,
    SKIPPED
}
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Logical relationship between canonical documents.
 */
public enum RelationType {
    CONTAINS,
    ATTACHMENT_OF,
    EXTRACTED_FROM,
    DERIVED_FROM,
    PART_OF,
    VERSION_OF,
    RELATED_TO
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Search-oriented text representation that can be embedded independently.
 */
public enum RepresentationType {
    FULLTEXT,
    SEMANTIC_TEXT,
    SUMMARY,
    TITLE_ABSTRACT,
    CHUNK,
    METADATA_ENRICHED
}
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Provenance of an imported document.
 */
public enum SourceType {
    TED_PACKAGE,
    MAIL,
    FILE_SYSTEM,
    REST_UPLOAD,
    MANUAL_UPLOAD,
    ZIP_CHILD,
    API,
    MIGRATION
}
|
||||||
@ -0,0 +1,12 @@
|
|||||||
|
package at.procon.dip.domain.document;
|
||||||
|
|
||||||
|
/**
 * Physical storage strategy for content.
 *
 * <p>The {@code DocumentContent} entity stores this value by constant name in its
 * {@code storage_type} column (length 64), so renaming a constant affects persisted rows.</p>
 */
public enum StorageType {
    DB_TEXT,
    DB_BINARY,
    FILE_PATH,
    OBJECT_STORAGE,
    EXTERNAL_REFERENCE
}
|
||||||
@ -0,0 +1,133 @@
|
|||||||
|
package at.procon.dip.domain.document.entity;
|
||||||
|
|
||||||
|
import at.procon.dip.architecture.SchemaNames;
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.tenant.entity.DocumentTenant;
|
||||||
|
import jakarta.persistence.Column;
|
||||||
|
import jakarta.persistence.Entity;
|
||||||
|
import jakarta.persistence.EnumType;
|
||||||
|
import jakarta.persistence.Enumerated;
|
||||||
|
import jakarta.persistence.FetchType;
|
||||||
|
import jakarta.persistence.GeneratedValue;
|
||||||
|
import jakarta.persistence.GenerationType;
|
||||||
|
import jakarta.persistence.Id;
|
||||||
|
import jakarta.persistence.Index;
|
||||||
|
import jakarta.persistence.JoinColumn;
|
||||||
|
import jakarta.persistence.ManyToOne;
|
||||||
|
import jakarta.persistence.PrePersist;
|
||||||
|
import jakarta.persistence.PreUpdate;
|
||||||
|
import jakarta.persistence.Table;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Canonical document root entity for the generalized DOC schema.
|
||||||
|
*/
|
||||||
|
@Entity
|
||||||
|
@Table(schema = SchemaNames.DOC, name = "doc_document", indexes = {
|
||||||
|
@Index(name = "idx_doc_document_type", columnList = "document_type"),
|
||||||
|
@Index(name = "idx_doc_document_family", columnList = "document_family"),
|
||||||
|
@Index(name = "idx_doc_document_status", columnList = "status"),
|
||||||
|
@Index(name = "idx_doc_document_visibility", columnList = "visibility"),
|
||||||
|
@Index(name = "idx_doc_document_owner_tenant", columnList = "owner_tenant_id"),
|
||||||
|
@Index(name = "idx_doc_document_dedup_hash", columnList = "dedup_hash"),
|
||||||
|
@Index(name = "idx_doc_document_business_key", columnList = "business_key"),
|
||||||
|
@Index(name = "idx_doc_document_created_at", columnList = "created_at")
|
||||||
|
})
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class Document {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY)
|
||||||
|
@JoinColumn(name = "owner_tenant_id")
|
||||||
|
private DocumentTenant ownerTenant;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "visibility", nullable = false, length = 32)
|
||||||
|
@Builder.Default
|
||||||
|
private DocumentVisibility visibility = DocumentVisibility.PUBLIC;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "document_type", nullable = false, length = 64)
|
||||||
|
private DocumentType documentType;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "document_family", nullable = false, length = 64)
|
||||||
|
private DocumentFamily documentFamily;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "status", nullable = false, length = 32)
|
||||||
|
@Builder.Default
|
||||||
|
private DocumentStatus status = DocumentStatus.RECEIVED;
|
||||||
|
|
||||||
|
@Column(name = "title", length = 1000)
|
||||||
|
private String title;
|
||||||
|
|
||||||
|
@Column(name = "summary", columnDefinition = "TEXT")
|
||||||
|
private String summary;
|
||||||
|
|
||||||
|
@Column(name = "language_code", length = 16)
|
||||||
|
private String languageCode;
|
||||||
|
|
||||||
|
@Column(name = "mime_type", length = 255)
|
||||||
|
private String mimeType;
|
||||||
|
|
||||||
|
@Column(name = "business_key", length = 255)
|
||||||
|
private String businessKey;
|
||||||
|
|
||||||
|
@Column(name = "dedup_hash", length = 64)
|
||||||
|
private String dedupHash;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "updated_at", nullable = false)
|
||||||
|
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@PrePersist
|
||||||
|
protected void onCreate() {
|
||||||
|
createdAt = OffsetDateTime.now();
|
||||||
|
updatedAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
|
||||||
|
@PreUpdate
|
||||||
|
protected void onUpdate() {
|
||||||
|
updatedAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
|
||||||
|
public CanonicalDocumentMetadata toCanonicalMetadata() {
|
||||||
|
return new CanonicalDocumentMetadata(
|
||||||
|
id,
|
||||||
|
new DocumentAccessContext(ownerTenant == null ? null : new at.procon.dip.domain.tenant.TenantRef(
|
||||||
|
ownerTenant.getId().toString(), ownerTenant.getTenantKey(), ownerTenant.getDisplayName()), visibility),
|
||||||
|
documentType,
|
||||||
|
documentFamily,
|
||||||
|
status,
|
||||||
|
title,
|
||||||
|
languageCode,
|
||||||
|
mimeType,
|
||||||
|
dedupHash,
|
||||||
|
createdAt,
|
||||||
|
updatedAt
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,86 @@
|
|||||||
|
package at.procon.dip.domain.document.entity;
|
||||||
|
|
||||||
|
import at.procon.dip.architecture.SchemaNames;
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.StorageType;
|
||||||
|
import jakarta.persistence.Column;
|
||||||
|
import jakarta.persistence.Entity;
|
||||||
|
import jakarta.persistence.EnumType;
|
||||||
|
import jakarta.persistence.Enumerated;
|
||||||
|
import jakarta.persistence.FetchType;
|
||||||
|
import jakarta.persistence.GeneratedValue;
|
||||||
|
import jakarta.persistence.GenerationType;
|
||||||
|
import jakarta.persistence.Id;
|
||||||
|
import jakarta.persistence.Index;
|
||||||
|
import jakarta.persistence.JoinColumn;
|
||||||
|
import jakarta.persistence.ManyToOne;
|
||||||
|
import jakarta.persistence.PrePersist;
|
||||||
|
import jakarta.persistence.Table;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stored payload variant for a canonical document.
|
||||||
|
*/
|
||||||
|
@Entity
|
||||||
|
@Table(schema = SchemaNames.DOC, name = "doc_content", indexes = {
|
||||||
|
@Index(name = "idx_doc_content_document", columnList = "document_id"),
|
||||||
|
@Index(name = "idx_doc_content_role", columnList = "content_role"),
|
||||||
|
@Index(name = "idx_doc_content_hash", columnList = "content_hash"),
|
||||||
|
@Index(name = "idx_doc_content_storage_type", columnList = "storage_type")
|
||||||
|
})
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class DocumentContent {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||||
|
@JoinColumn(name = "document_id", nullable = false)
|
||||||
|
private Document document;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "content_role", nullable = false, length = 64)
|
||||||
|
private ContentRole contentRole;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "storage_type", nullable = false, length = 64)
|
||||||
|
private StorageType storageType;
|
||||||
|
|
||||||
|
@Column(name = "mime_type", length = 255)
|
||||||
|
private String mimeType;
|
||||||
|
|
||||||
|
@Column(name = "charset_name", length = 120)
|
||||||
|
private String charsetName;
|
||||||
|
|
||||||
|
@Column(name = "text_content", columnDefinition = "TEXT")
|
||||||
|
private String textContent;
|
||||||
|
|
||||||
|
@Column(name = "binary_ref", columnDefinition = "TEXT")
|
||||||
|
private String binaryRef;
|
||||||
|
|
||||||
|
@Column(name = "content_hash", length = 64)
|
||||||
|
private String contentHash;
|
||||||
|
|
||||||
|
@Column(name = "size_bytes")
|
||||||
|
private Long sizeBytes;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@PrePersist
|
||||||
|
protected void onCreate() {
|
||||||
|
createdAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,103 @@
|
|||||||
|
package at.procon.dip.domain.document.entity;
|
||||||
|
|
||||||
|
import at.procon.dip.architecture.SchemaNames;
|
||||||
|
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||||
|
import jakarta.persistence.Column;
|
||||||
|
import jakarta.persistence.Entity;
|
||||||
|
import jakarta.persistence.EnumType;
|
||||||
|
import jakarta.persistence.Enumerated;
|
||||||
|
import jakarta.persistence.FetchType;
|
||||||
|
import jakarta.persistence.GeneratedValue;
|
||||||
|
import jakarta.persistence.GenerationType;
|
||||||
|
import jakarta.persistence.Id;
|
||||||
|
import jakarta.persistence.Index;
|
||||||
|
import jakarta.persistence.JoinColumn;
|
||||||
|
import jakarta.persistence.ManyToOne;
|
||||||
|
import jakarta.persistence.PrePersist;
|
||||||
|
import jakarta.persistence.PreUpdate;
|
||||||
|
import jakarta.persistence.Table;
|
||||||
|
import jakarta.persistence.Transient;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generic vectorization record separated from the canonical document structure.
|
||||||
|
* <p>
|
||||||
|
* The actual pgvector payload is persisted in the {@code embedding_vector} column via native SQL
|
||||||
|
* in later phases. The transient field exists only as a convenient in-memory carrier.
|
||||||
|
*/
|
||||||
|
@Entity
|
||||||
|
@Table(schema = SchemaNames.DOC, name = "doc_embedding", indexes = {
|
||||||
|
@Index(name = "idx_doc_embedding_document", columnList = "document_id"),
|
||||||
|
@Index(name = "idx_doc_embedding_repr", columnList = "representation_id"),
|
||||||
|
@Index(name = "idx_doc_embedding_model", columnList = "model_id"),
|
||||||
|
@Index(name = "idx_doc_embedding_status", columnList = "embedding_status"),
|
||||||
|
@Index(name = "idx_doc_embedding_embedded_at", columnList = "embedded_at")
|
||||||
|
})
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class DocumentEmbedding {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||||
|
@JoinColumn(name = "document_id", nullable = false)
|
||||||
|
private Document document;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||||
|
@JoinColumn(name = "representation_id", nullable = false)
|
||||||
|
private DocumentTextRepresentation representation;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||||
|
@JoinColumn(name = "model_id", nullable = false)
|
||||||
|
private DocumentEmbeddingModel model;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "embedding_status", nullable = false, length = 32)
|
||||||
|
@Builder.Default
|
||||||
|
private EmbeddingStatus embeddingStatus = EmbeddingStatus.PENDING;
|
||||||
|
|
||||||
|
@Column(name = "token_count")
|
||||||
|
private Integer tokenCount;
|
||||||
|
|
||||||
|
@Column(name = "embedding_dimensions")
|
||||||
|
private Integer embeddingDimensions;
|
||||||
|
|
||||||
|
@Column(name = "error_message", columnDefinition = "TEXT")
|
||||||
|
private String errorMessage;
|
||||||
|
|
||||||
|
@Column(name = "embedded_at")
|
||||||
|
private OffsetDateTime embeddedAt;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "updated_at", nullable = false)
|
||||||
|
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@Transient
|
||||||
|
private float[] embeddingVector;
|
||||||
|
|
||||||
|
@PrePersist
|
||||||
|
protected void onCreate() {
|
||||||
|
createdAt = OffsetDateTime.now();
|
||||||
|
updatedAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
|
||||||
|
@PreUpdate
|
||||||
|
protected void onUpdate() {
|
||||||
|
updatedAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,86 @@
|
|||||||
|
package at.procon.dip.domain.document.entity;
|
||||||
|
|
||||||
|
import at.procon.dip.architecture.SchemaNames;
|
||||||
|
import at.procon.dip.domain.document.DistanceMetric;
|
||||||
|
import jakarta.persistence.Column;
|
||||||
|
import jakarta.persistence.Entity;
|
||||||
|
import jakarta.persistence.EnumType;
|
||||||
|
import jakarta.persistence.Enumerated;
|
||||||
|
import jakarta.persistence.GeneratedValue;
|
||||||
|
import jakarta.persistence.GenerationType;
|
||||||
|
import jakarta.persistence.Id;
|
||||||
|
import jakarta.persistence.Index;
|
||||||
|
import jakarta.persistence.PrePersist;
|
||||||
|
import jakarta.persistence.PreUpdate;
|
||||||
|
import jakarta.persistence.Table;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Embedding model catalog row used by generic vectorization.
|
||||||
|
*/
|
||||||
|
@Entity
|
||||||
|
@Table(schema = SchemaNames.DOC, name = "doc_embedding_model", indexes = {
|
||||||
|
@Index(name = "idx_doc_embedding_model_key", columnList = "model_key", unique = true),
|
||||||
|
@Index(name = "idx_doc_embedding_model_active", columnList = "active")
|
||||||
|
})
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class DocumentEmbeddingModel {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@Column(name = "model_key", nullable = false, unique = true, length = 255)
|
||||||
|
private String modelKey;
|
||||||
|
|
||||||
|
@Column(name = "provider", nullable = false, length = 120)
|
||||||
|
private String provider;
|
||||||
|
|
||||||
|
@Column(name = "display_name", length = 255)
|
||||||
|
private String displayName;
|
||||||
|
|
||||||
|
@Column(name = "dimensions", nullable = false)
|
||||||
|
private Integer dimensions;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "distance_metric", nullable = false, length = 32)
|
||||||
|
@Builder.Default
|
||||||
|
private DistanceMetric distanceMetric = DistanceMetric.COSINE;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "query_prefix_required", nullable = false)
|
||||||
|
private boolean queryPrefixRequired = false;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "active", nullable = false)
|
||||||
|
private boolean active = true;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "updated_at", nullable = false)
|
||||||
|
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@PrePersist
|
||||||
|
protected void onCreate() {
|
||||||
|
createdAt = OffsetDateTime.now();
|
||||||
|
updatedAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
|
||||||
|
@PreUpdate
|
||||||
|
protected void onUpdate() {
|
||||||
|
updatedAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,72 @@
|
|||||||
|
package at.procon.dip.domain.document.entity;
|
||||||
|
|
||||||
|
import at.procon.dip.architecture.SchemaNames;
|
||||||
|
import at.procon.dip.domain.document.RelationType;
|
||||||
|
import jakarta.persistence.Column;
|
||||||
|
import jakarta.persistence.Entity;
|
||||||
|
import jakarta.persistence.EnumType;
|
||||||
|
import jakarta.persistence.Enumerated;
|
||||||
|
import jakarta.persistence.FetchType;
|
||||||
|
import jakarta.persistence.GeneratedValue;
|
||||||
|
import jakarta.persistence.GenerationType;
|
||||||
|
import jakarta.persistence.Id;
|
||||||
|
import jakarta.persistence.Index;
|
||||||
|
import jakarta.persistence.JoinColumn;
|
||||||
|
import jakarta.persistence.ManyToOne;
|
||||||
|
import jakarta.persistence.PrePersist;
|
||||||
|
import jakarta.persistence.Table;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Directed relationship between two canonical documents.
|
||||||
|
*/
|
||||||
|
@Entity
|
||||||
|
@Table(schema = SchemaNames.DOC, name = "doc_relation", indexes = {
|
||||||
|
@Index(name = "idx_doc_relation_parent", columnList = "parent_document_id"),
|
||||||
|
@Index(name = "idx_doc_relation_child", columnList = "child_document_id"),
|
||||||
|
@Index(name = "idx_doc_relation_type", columnList = "relation_type")
|
||||||
|
})
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class DocumentRelation {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||||
|
@JoinColumn(name = "parent_document_id", nullable = false)
|
||||||
|
private Document parentDocument;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||||
|
@JoinColumn(name = "child_document_id", nullable = false)
|
||||||
|
private Document childDocument;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "relation_type", nullable = false, length = 64)
|
||||||
|
private RelationType relationType;
|
||||||
|
|
||||||
|
@Column(name = "sort_order")
|
||||||
|
private Integer sortOrder;
|
||||||
|
|
||||||
|
@Column(name = "relation_metadata", columnDefinition = "TEXT")
|
||||||
|
private String relationMetadata;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@PrePersist
|
||||||
|
protected void onCreate() {
|
||||||
|
createdAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,85 @@
|
|||||||
|
package at.procon.dip.domain.document.entity;
|
||||||
|
|
||||||
|
import at.procon.dip.architecture.SchemaNames;
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import jakarta.persistence.Column;
|
||||||
|
import jakarta.persistence.Entity;
|
||||||
|
import jakarta.persistence.EnumType;
|
||||||
|
import jakarta.persistence.Enumerated;
|
||||||
|
import jakarta.persistence.FetchType;
|
||||||
|
import jakarta.persistence.GeneratedValue;
|
||||||
|
import jakarta.persistence.GenerationType;
|
||||||
|
import jakarta.persistence.Id;
|
||||||
|
import jakarta.persistence.Index;
|
||||||
|
import jakarta.persistence.JoinColumn;
|
||||||
|
import jakarta.persistence.ManyToOne;
|
||||||
|
import jakarta.persistence.PrePersist;
|
||||||
|
import jakarta.persistence.Table;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provenance row for a canonical document.
|
||||||
|
*/
|
||||||
|
@Entity
|
||||||
|
@Table(schema = SchemaNames.DOC, name = "doc_source", indexes = {
|
||||||
|
@Index(name = "idx_doc_source_document", columnList = "document_id"),
|
||||||
|
@Index(name = "idx_doc_source_type", columnList = "source_type"),
|
||||||
|
@Index(name = "idx_doc_source_external_id", columnList = "external_source_id"),
|
||||||
|
@Index(name = "idx_doc_source_received_at", columnList = "received_at"),
|
||||||
|
@Index(name = "idx_doc_source_parent_source", columnList = "parent_source_id")
|
||||||
|
})
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class DocumentSource {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||||
|
@JoinColumn(name = "document_id", nullable = false)
|
||||||
|
private Document document;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "source_type", nullable = false, length = 64)
|
||||||
|
private SourceType sourceType;
|
||||||
|
|
||||||
|
@Column(name = "external_source_id", length = 500)
|
||||||
|
private String externalSourceId;
|
||||||
|
|
||||||
|
@Column(name = "source_uri", columnDefinition = "TEXT")
|
||||||
|
private String sourceUri;
|
||||||
|
|
||||||
|
@Column(name = "source_filename", length = 1000)
|
||||||
|
private String sourceFilename;
|
||||||
|
|
||||||
|
@Column(name = "parent_source_id")
|
||||||
|
private UUID parentSourceId;
|
||||||
|
|
||||||
|
@Column(name = "import_batch_id", length = 255)
|
||||||
|
private String importBatchId;
|
||||||
|
|
||||||
|
@Column(name = "received_at")
|
||||||
|
private OffsetDateTime receivedAt;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@PrePersist
|
||||||
|
protected void onCreate() {
|
||||||
|
createdAt = OffsetDateTime.now();
|
||||||
|
if (receivedAt == null) {
|
||||||
|
receivedAt = createdAt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
package at.procon.dip.domain.document.entity;
|
||||||
|
|
||||||
|
import at.procon.dip.architecture.SchemaNames;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import jakarta.persistence.Column;
|
||||||
|
import jakarta.persistence.Entity;
|
||||||
|
import jakarta.persistence.EnumType;
|
||||||
|
import jakarta.persistence.Enumerated;
|
||||||
|
import jakarta.persistence.FetchType;
|
||||||
|
import jakarta.persistence.GeneratedValue;
|
||||||
|
import jakarta.persistence.GenerationType;
|
||||||
|
import jakarta.persistence.Id;
|
||||||
|
import jakarta.persistence.Index;
|
||||||
|
import jakarta.persistence.JoinColumn;
|
||||||
|
import jakarta.persistence.ManyToOne;
|
||||||
|
import jakarta.persistence.PrePersist;
|
||||||
|
import jakarta.persistence.Table;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search-oriented text derived from a canonical document.
|
||||||
|
*/
|
||||||
|
@Entity
|
||||||
|
@Table(schema = SchemaNames.DOC, name = "doc_text_representation", indexes = {
|
||||||
|
@Index(name = "idx_doc_text_repr_document", columnList = "document_id"),
|
||||||
|
@Index(name = "idx_doc_text_repr_content", columnList = "content_id"),
|
||||||
|
@Index(name = "idx_doc_text_repr_type", columnList = "representation_type"),
|
||||||
|
@Index(name = "idx_doc_text_repr_primary", columnList = "is_primary")
|
||||||
|
})
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class DocumentTextRepresentation {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||||
|
@JoinColumn(name = "document_id", nullable = false)
|
||||||
|
private Document document;
|
||||||
|
|
||||||
|
@ManyToOne(fetch = FetchType.LAZY)
|
||||||
|
@JoinColumn(name = "content_id")
|
||||||
|
private DocumentContent content;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(name = "representation_type", nullable = false, length = 64)
|
||||||
|
private RepresentationType representationType;
|
||||||
|
|
||||||
|
@Column(name = "builder_key", length = 255)
|
||||||
|
private String builderKey;
|
||||||
|
|
||||||
|
@Column(name = "language_code", length = 16)
|
||||||
|
private String languageCode;
|
||||||
|
|
||||||
|
@Column(name = "token_count")
|
||||||
|
private Integer tokenCount;
|
||||||
|
|
||||||
|
@Column(name = "char_count")
|
||||||
|
private Integer charCount;
|
||||||
|
|
||||||
|
@Column(name = "chunk_index")
|
||||||
|
private Integer chunkIndex;
|
||||||
|
|
||||||
|
@Column(name = "chunk_start_offset")
|
||||||
|
private Integer chunkStartOffset;
|
||||||
|
|
||||||
|
@Column(name = "chunk_end_offset")
|
||||||
|
private Integer chunkEndOffset;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "is_primary", nullable = false)
|
||||||
|
private boolean primaryRepresentation = false;
|
||||||
|
|
||||||
|
@Column(name = "text_body", columnDefinition = "TEXT", nullable = false)
|
||||||
|
private String textBody;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@PrePersist
|
||||||
|
protected void onCreate() {
|
||||||
|
createdAt = OffsetDateTime.now();
|
||||||
|
if (charCount == null && textBody != null) {
|
||||||
|
charCount = textBody.length();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,17 @@
|
|||||||
|
package at.procon.dip.domain.document.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
|
public interface DocumentContentRepository extends JpaRepository<DocumentContent, UUID> {
|
||||||
|
|
||||||
|
List<DocumentContent> findByDocument_Id(UUID documentId);
|
||||||
|
|
||||||
|
List<DocumentContent> findByDocument_IdAndContentRole(UUID documentId, ContentRole contentRole);
|
||||||
|
|
||||||
|
Optional<DocumentContent> findByContentHash(String contentHash);
|
||||||
|
}
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
package at.procon.dip.domain.document.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
|
public interface DocumentEmbeddingModelRepository extends JpaRepository<DocumentEmbeddingModel, UUID> {
|
||||||
|
|
||||||
|
Optional<DocumentEmbeddingModel> findByModelKey(String modelKey);
|
||||||
|
}
|
||||||
@ -0,0 +1,55 @@
|
|||||||
|
package at.procon.dip.domain.document.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.springframework.data.domain.Pageable;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
import org.springframework.data.jpa.repository.Modifying;
|
||||||
|
import org.springframework.data.jpa.repository.Query;
|
||||||
|
import org.springframework.data.repository.query.Param;
|
||||||
|
|
||||||
|
public interface DocumentEmbeddingRepository extends JpaRepository<DocumentEmbedding, UUID> {
|
||||||
|
|
||||||
|
List<DocumentEmbedding> findByDocument_Id(UUID documentId);
|
||||||
|
|
||||||
|
List<DocumentEmbedding> findByRepresentation_Id(UUID representationId);
|
||||||
|
|
||||||
|
List<DocumentEmbedding> findByEmbeddingStatus(EmbeddingStatus embeddingStatus);
|
||||||
|
|
||||||
|
Optional<DocumentEmbedding> findByRepresentation_IdAndModel_Id(UUID representationId, UUID modelId);
|
||||||
|
|
||||||
|
@Query("SELECT e.id FROM DocumentEmbedding e WHERE e.embeddingStatus = :status ORDER BY e.createdAt ASC")
|
||||||
|
List<UUID> findIdsByEmbeddingStatus(@Param("status") EmbeddingStatus status, Pageable pageable);
|
||||||
|
|
||||||
|
@Query("SELECT e FROM DocumentEmbedding e " +
|
||||||
|
"JOIN FETCH e.document d " +
|
||||||
|
"JOIN FETCH e.representation r " +
|
||||||
|
"JOIN FETCH e.model m " +
|
||||||
|
"WHERE e.id = :embeddingId")
|
||||||
|
Optional<DocumentEmbedding> findDetailedById(@Param("embeddingId") UUID embeddingId);
|
||||||
|
|
||||||
|
@Modifying
|
||||||
|
@Query(value = "UPDATE doc.doc_embedding SET embedding_vector = CAST(:vectorData AS vector), " +
|
||||||
|
"embedding_status = 'COMPLETED', embedded_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP, " +
|
||||||
|
"error_message = NULL, token_count = :tokenCount, embedding_dimensions = :dimensions WHERE id = :id",
|
||||||
|
nativeQuery = true)
|
||||||
|
int updateEmbeddingVector(@Param("id") UUID id,
|
||||||
|
@Param("vectorData") String vectorData,
|
||||||
|
@Param("tokenCount") Integer tokenCount,
|
||||||
|
@Param("dimensions") Integer dimensions);
|
||||||
|
|
||||||
|
@Modifying
|
||||||
|
@Query("UPDATE DocumentEmbedding e SET e.embeddingStatus = :status, e.errorMessage = :errorMessage, " +
|
||||||
|
"e.embeddedAt = :embeddedAt, e.updatedAt = CURRENT_TIMESTAMP WHERE e.id = :embeddingId")
|
||||||
|
int updateEmbeddingStatus(@Param("embeddingId") UUID embeddingId,
|
||||||
|
@Param("status") EmbeddingStatus status,
|
||||||
|
@Param("errorMessage") String errorMessage,
|
||||||
|
@Param("embeddedAt") OffsetDateTime embeddedAt);
|
||||||
|
|
||||||
|
@Query("SELECT e.embeddingStatus, COUNT(e) FROM DocumentEmbedding e GROUP BY e.embeddingStatus")
|
||||||
|
List<Object[]> countByEmbeddingStatus();
|
||||||
|
}
|
||||||
@ -0,0 +1,16 @@
|
|||||||
|
package at.procon.dip.domain.document.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RelationType;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentRelation;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
|
public interface DocumentRelationRepository extends JpaRepository<DocumentRelation, UUID> {
|
||||||
|
|
||||||
|
List<DocumentRelation> findByParentDocument_Id(UUID parentDocumentId);
|
||||||
|
|
||||||
|
List<DocumentRelation> findByChildDocument_Id(UUID childDocumentId);
|
||||||
|
|
||||||
|
List<DocumentRelation> findByParentDocument_IdAndRelationType(UUID parentDocumentId, RelationType relationType);
|
||||||
|
}
|
||||||
@ -0,0 +1,31 @@
|
|||||||
|
package at.procon.dip.domain.document.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
|
public interface DocumentRepository extends JpaRepository<Document, UUID> {
|
||||||
|
|
||||||
|
Optional<Document> findByDedupHash(String dedupHash);
|
||||||
|
|
||||||
|
boolean existsByDedupHash(String dedupHash);
|
||||||
|
|
||||||
|
List<Document> findByDocumentType(DocumentType documentType);
|
||||||
|
|
||||||
|
List<Document> findByDocumentFamily(DocumentFamily documentFamily);
|
||||||
|
|
||||||
|
List<Document> findByStatus(DocumentStatus status);
|
||||||
|
|
||||||
|
List<Document> findByVisibility(DocumentVisibility visibility);
|
||||||
|
|
||||||
|
List<Document> findByOwnerTenant_TenantKey(String tenantKey);
|
||||||
|
|
||||||
|
List<Document> findByOwnerTenant_TenantKeyIn(Collection<String> tenantKeys);
|
||||||
|
}
|
||||||
@ -0,0 +1,17 @@
|
|||||||
|
package at.procon.dip.domain.document.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentSource;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
|
public interface DocumentSourceRepository extends JpaRepository<DocumentSource, UUID> {
|
||||||
|
|
||||||
|
List<DocumentSource> findByDocument_Id(UUID documentId);
|
||||||
|
|
||||||
|
List<DocumentSource> findBySourceType(SourceType sourceType);
|
||||||
|
|
||||||
|
Optional<DocumentSource> findByExternalSourceId(String externalSourceId);
|
||||||
|
}
|
||||||
@ -0,0 +1,19 @@
|
|||||||
|
package at.procon.dip.domain.document.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
|
public interface DocumentTextRepresentationRepository extends JpaRepository<DocumentTextRepresentation, UUID> {
|
||||||
|
|
||||||
|
List<DocumentTextRepresentation> findByDocument_Id(UUID documentId);
|
||||||
|
|
||||||
|
List<DocumentTextRepresentation> findByDocument_IdAndRepresentationType(UUID documentId, RepresentationType representationType);
|
||||||
|
|
||||||
|
List<DocumentTextRepresentation> findByPrimaryRepresentationTrue();
|
||||||
|
|
||||||
|
Optional<DocumentTextRepresentation> findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
|
||||||
|
}
|
||||||
@ -0,0 +1,45 @@
|
|||||||
|
package at.procon.dip.domain.document.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentContentRepository;
|
||||||
|
import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Transactional
|
||||||
|
public class DocumentContentService {
|
||||||
|
|
||||||
|
private final DocumentService documentService;
|
||||||
|
private final DocumentContentRepository contentRepository;
|
||||||
|
|
||||||
|
public DocumentContent addContent(AddDocumentContentCommand command) {
|
||||||
|
DocumentContent content = DocumentContent.builder()
|
||||||
|
.document(documentService.getRequired(command.documentId()))
|
||||||
|
.contentRole(command.contentRole())
|
||||||
|
.storageType(command.storageType())
|
||||||
|
.mimeType(command.mimeType())
|
||||||
|
.charsetName(command.charsetName())
|
||||||
|
.textContent(command.textContent())
|
||||||
|
.binaryRef(command.binaryRef())
|
||||||
|
.contentHash(command.contentHash())
|
||||||
|
.sizeBytes(command.sizeBytes())
|
||||||
|
.build();
|
||||||
|
return contentRepository.save(content);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public DocumentContent getRequired(UUID contentId) {
|
||||||
|
return contentRepository.findById(contentId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown content id: " + contentId));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public List<DocumentContent> findByDocument(UUID documentId) {
|
||||||
|
return contentRepository.findByDocument_Id(documentId);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,125 @@
|
|||||||
|
package at.procon.dip.domain.document.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DistanceMetric;
|
||||||
|
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||||
|
import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Transactional
|
||||||
|
public class DocumentEmbeddingService {
|
||||||
|
|
||||||
|
private final DocumentService documentService;
|
||||||
|
private final DocumentRepresentationService representationService;
|
||||||
|
private final DocumentEmbeddingRepository embeddingRepository;
|
||||||
|
private final DocumentEmbeddingModelRepository modelRepository;
|
||||||
|
|
||||||
|
public DocumentEmbeddingModel registerModel(RegisterEmbeddingModelCommand command) {
|
||||||
|
DocumentEmbeddingModel model = modelRepository.findByModelKey(command.modelKey())
|
||||||
|
.orElseGet(DocumentEmbeddingModel::new);
|
||||||
|
model.setModelKey(command.modelKey());
|
||||||
|
model.setProvider(command.provider());
|
||||||
|
model.setDisplayName(command.displayName());
|
||||||
|
model.setDimensions(command.dimensions());
|
||||||
|
model.setDistanceMetric(command.distanceMetric() == null ? DistanceMetric.COSINE : command.distanceMetric());
|
||||||
|
model.setQueryPrefixRequired(command.queryPrefixRequired());
|
||||||
|
model.setActive(command.active());
|
||||||
|
return modelRepository.save(model);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DocumentEmbedding createPendingEmbedding(UUID documentId, UUID representationId, UUID modelId) {
|
||||||
|
DocumentEmbeddingModel model = getRequiredModel(modelId);
|
||||||
|
DocumentEmbedding embedding = DocumentEmbedding.builder()
|
||||||
|
.document(documentService.getRequired(documentId))
|
||||||
|
.representation(representationService.getRequired(representationId))
|
||||||
|
.model(model)
|
||||||
|
.embeddingDimensions(model.getDimensions())
|
||||||
|
.embeddingStatus(EmbeddingStatus.PENDING)
|
||||||
|
.build();
|
||||||
|
return embeddingRepository.save(embedding);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DocumentEmbedding ensurePendingEmbedding(UUID documentId, UUID representationId, UUID modelId) {
|
||||||
|
Optional<DocumentEmbedding> existing = embeddingRepository.findByRepresentation_IdAndModel_Id(representationId, modelId);
|
||||||
|
if (existing.isPresent()) {
|
||||||
|
DocumentEmbedding embedding = existing.get();
|
||||||
|
embedding.setDocument(documentService.getRequired(documentId));
|
||||||
|
embedding.setRepresentation(representationService.getRequired(representationId));
|
||||||
|
embedding.setModel(getRequiredModel(modelId));
|
||||||
|
embedding.setEmbeddingDimensions(embedding.getModel().getDimensions());
|
||||||
|
embedding.setEmbeddingStatus(EmbeddingStatus.PENDING);
|
||||||
|
embedding.setErrorMessage(null);
|
||||||
|
embedding.setEmbeddedAt(null);
|
||||||
|
return embeddingRepository.save(embedding);
|
||||||
|
}
|
||||||
|
return createPendingEmbedding(documentId, representationId, modelId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DocumentEmbedding markCompleted(UUID embeddingId, Integer tokenCount) {
|
||||||
|
DocumentEmbedding embedding = getRequired(embeddingId);
|
||||||
|
embedding.setEmbeddingStatus(EmbeddingStatus.COMPLETED);
|
||||||
|
embedding.setTokenCount(tokenCount);
|
||||||
|
embedding.setEmbeddedAt(OffsetDateTime.now());
|
||||||
|
embedding.setErrorMessage(null);
|
||||||
|
return embeddingRepository.save(embedding);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DocumentEmbedding markFailed(UUID embeddingId, String errorMessage) {
|
||||||
|
DocumentEmbedding embedding = getRequired(embeddingId);
|
||||||
|
embedding.setEmbeddingStatus(EmbeddingStatus.FAILED);
|
||||||
|
embedding.setErrorMessage(errorMessage);
|
||||||
|
embedding.setEmbeddedAt(null);
|
||||||
|
return embeddingRepository.save(embedding);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DocumentEmbedding markProcessing(UUID embeddingId) {
|
||||||
|
DocumentEmbedding embedding = getRequired(embeddingId);
|
||||||
|
embedding.setEmbeddingStatus(EmbeddingStatus.PROCESSING);
|
||||||
|
embedding.setErrorMessage(null);
|
||||||
|
return embeddingRepository.save(embedding);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DocumentEmbedding markSkipped(UUID embeddingId, String reason) {
|
||||||
|
DocumentEmbedding embedding = getRequired(embeddingId);
|
||||||
|
embedding.setEmbeddingStatus(EmbeddingStatus.SKIPPED);
|
||||||
|
embedding.setErrorMessage(reason);
|
||||||
|
embedding.setEmbeddedAt(OffsetDateTime.now());
|
||||||
|
return embeddingRepository.save(embedding);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public DocumentEmbedding getRequired(UUID embeddingId) {
|
||||||
|
return embeddingRepository.findById(embeddingId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public DocumentEmbeddingModel getRequiredModel(UUID modelId) {
|
||||||
|
return modelRepository.findById(modelId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding model id: " + modelId));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public DocumentEmbeddingModel findActiveModelByKey(String modelKey) {
|
||||||
|
return modelRepository.findByModelKey(modelKey)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding model key: " + modelKey));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public List<DocumentEmbedding> findPendingEmbeddings() {
|
||||||
|
return embeddingRepository.findByEmbeddingStatus(EmbeddingStatus.PENDING);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@ -0,0 +1,35 @@
|
|||||||
|
package at.procon.dip.domain.document.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentRelation;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentRelationRepository;
|
||||||
|
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Transactional
|
||||||
|
public class DocumentRelationService {
|
||||||
|
|
||||||
|
private final DocumentService documentService;
|
||||||
|
private final DocumentRelationRepository relationRepository;
|
||||||
|
|
||||||
|
public DocumentRelation createRelation(CreateDocumentRelationCommand command) {
|
||||||
|
DocumentRelation relation = DocumentRelation.builder()
|
||||||
|
.parentDocument(documentService.getRequired(command.parentDocumentId()))
|
||||||
|
.childDocument(documentService.getRequired(command.childDocumentId()))
|
||||||
|
.relationType(command.relationType())
|
||||||
|
.sortOrder(command.sortOrder())
|
||||||
|
.relationMetadata(command.relationMetadata())
|
||||||
|
.build();
|
||||||
|
return relationRepository.save(relation);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public List<DocumentRelation> findChildren(UUID parentDocumentId) {
|
||||||
|
return relationRepository.findByParentDocument_Id(parentDocumentId);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,50 @@
|
|||||||
|
package at.procon.dip.domain.document.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||||
|
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Transactional
|
||||||
|
public class DocumentRepresentationService {
|
||||||
|
|
||||||
|
private final DocumentService documentService;
|
||||||
|
private final DocumentContentService contentService;
|
||||||
|
private final DocumentTextRepresentationRepository representationRepository;
|
||||||
|
|
||||||
|
public DocumentTextRepresentation addRepresentation(AddDocumentTextRepresentationCommand command) {
|
||||||
|
DocumentContent content = command.contentId() == null ? null : contentService.getRequired(command.contentId());
|
||||||
|
DocumentTextRepresentation representation = DocumentTextRepresentation.builder()
|
||||||
|
.document(documentService.getRequired(command.documentId()))
|
||||||
|
.content(content)
|
||||||
|
.representationType(command.representationType())
|
||||||
|
.builderKey(command.builderKey())
|
||||||
|
.languageCode(command.languageCode())
|
||||||
|
.tokenCount(command.tokenCount())
|
||||||
|
.chunkIndex(command.chunkIndex())
|
||||||
|
.chunkStartOffset(command.chunkStartOffset())
|
||||||
|
.chunkEndOffset(command.chunkEndOffset())
|
||||||
|
.primaryRepresentation(command.primaryRepresentation())
|
||||||
|
.textBody(command.textBody())
|
||||||
|
.build();
|
||||||
|
return representationRepository.save(representation);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public DocumentTextRepresentation getRequired(UUID representationId) {
|
||||||
|
return representationRepository.findById(representationId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown representation id: " + representationId));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public List<DocumentTextRepresentation> findByDocument(UUID documentId) {
|
||||||
|
return representationRepository.findByDocument_Id(documentId);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,75 @@
|
|||||||
|
package at.procon.dip.domain.document.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||||
|
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
|
||||||
|
import at.procon.dip.domain.tenant.entity.DocumentTenant;
|
||||||
|
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Transactional
|
||||||
|
public class DocumentService {
|
||||||
|
|
||||||
|
private final DocumentRepository documentRepository;
|
||||||
|
private final DocumentTenantRepository tenantRepository;
|
||||||
|
|
||||||
|
public Document create(CreateDocumentCommand command) {
|
||||||
|
DocumentTenant ownerTenant = resolveOwnerTenant(command.ownerTenantKey());
|
||||||
|
Document document = Document.builder()
|
||||||
|
.ownerTenant(ownerTenant)
|
||||||
|
.visibility(command.visibility())
|
||||||
|
.documentType(command.documentType())
|
||||||
|
.documentFamily(command.documentFamily())
|
||||||
|
.status(command.status() == null ? DocumentStatus.RECEIVED : command.status())
|
||||||
|
.title(command.title())
|
||||||
|
.summary(command.summary())
|
||||||
|
.languageCode(command.languageCode())
|
||||||
|
.mimeType(command.mimeType())
|
||||||
|
.businessKey(command.businessKey())
|
||||||
|
.dedupHash(command.dedupHash())
|
||||||
|
.build();
|
||||||
|
return documentRepository.save(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Document save(Document document) {
|
||||||
|
return documentRepository.save(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Document updateStatus(UUID documentId, DocumentStatus status) {
|
||||||
|
Document document = getRequired(documentId);
|
||||||
|
document.setStatus(status);
|
||||||
|
return documentRepository.save(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public Document getRequired(UUID documentId) {
|
||||||
|
return documentRepository.findById(documentId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown document id: " + documentId));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public List<Document> findAll() {
|
||||||
|
return documentRepository.findAll();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public CanonicalDocumentMetadata getMetadata(UUID documentId) {
|
||||||
|
return getRequired(documentId).toCanonicalMetadata();
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentTenant resolveOwnerTenant(String ownerTenantKey) {
|
||||||
|
if (ownerTenantKey == null || ownerTenantKey.isBlank()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return tenantRepository.findByTenantKey(ownerTenantKey)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown tenant key: " + ownerTenantKey));
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,38 @@
|
|||||||
|
package at.procon.dip.domain.document.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentSource;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
||||||
|
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Transactional
|
||||||
|
public class DocumentSourceService {
|
||||||
|
|
||||||
|
private final DocumentService documentService;
|
||||||
|
private final DocumentSourceRepository sourceRepository;
|
||||||
|
|
||||||
|
public DocumentSource addSource(AddDocumentSourceCommand command) {
|
||||||
|
DocumentSource source = DocumentSource.builder()
|
||||||
|
.document(documentService.getRequired(command.documentId()))
|
||||||
|
.sourceType(command.sourceType())
|
||||||
|
.externalSourceId(command.externalSourceId())
|
||||||
|
.sourceUri(command.sourceUri())
|
||||||
|
.sourceFilename(command.sourceFilename())
|
||||||
|
.parentSourceId(command.parentSourceId())
|
||||||
|
.importBatchId(command.importBatchId())
|
||||||
|
.receivedAt(command.receivedAt())
|
||||||
|
.build();
|
||||||
|
return sourceRepository.save(source);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public List<DocumentSource> findByDocument(UUID documentId) {
|
||||||
|
return sourceRepository.findByDocument_Id(documentId);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
package at.procon.dip.domain.document.service.command;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.StorageType;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
public record AddDocumentContentCommand(
|
||||||
|
UUID documentId,
|
||||||
|
ContentRole contentRole,
|
||||||
|
StorageType storageType,
|
||||||
|
String mimeType,
|
||||||
|
String charsetName,
|
||||||
|
String textContent,
|
||||||
|
String binaryRef,
|
||||||
|
String contentHash,
|
||||||
|
Long sizeBytes
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,17 @@
|
|||||||
|
package at.procon.dip.domain.document.service.command;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
public record AddDocumentSourceCommand(
|
||||||
|
UUID documentId,
|
||||||
|
SourceType sourceType,
|
||||||
|
String externalSourceId,
|
||||||
|
String sourceUri,
|
||||||
|
String sourceFilename,
|
||||||
|
UUID parentSourceId,
|
||||||
|
String importBatchId,
|
||||||
|
OffsetDateTime receivedAt
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,19 @@
|
|||||||
|
package at.procon.dip.domain.document.service.command;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
public record AddDocumentTextRepresentationCommand(
|
||||||
|
UUID documentId,
|
||||||
|
UUID contentId,
|
||||||
|
RepresentationType representationType,
|
||||||
|
String builderKey,
|
||||||
|
String languageCode,
|
||||||
|
Integer tokenCount,
|
||||||
|
Integer chunkIndex,
|
||||||
|
Integer chunkStartOffset,
|
||||||
|
Integer chunkEndOffset,
|
||||||
|
boolean primaryRepresentation,
|
||||||
|
String textBody
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,24 @@
|
|||||||
|
package at.procon.dip.domain.document.service.command;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimal Phase 1 command for creating the canonical document root.
|
||||||
|
*/
|
||||||
|
public record CreateDocumentCommand(
|
||||||
|
String ownerTenantKey,
|
||||||
|
DocumentVisibility visibility,
|
||||||
|
DocumentType documentType,
|
||||||
|
DocumentFamily documentFamily,
|
||||||
|
DocumentStatus status,
|
||||||
|
String title,
|
||||||
|
String summary,
|
||||||
|
String languageCode,
|
||||||
|
String mimeType,
|
||||||
|
String businessKey,
|
||||||
|
String dedupHash
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.domain.document.service.command;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RelationType;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
public record CreateDocumentRelationCommand(
|
||||||
|
UUID parentDocumentId,
|
||||||
|
UUID childDocumentId,
|
||||||
|
RelationType relationType,
|
||||||
|
Integer sortOrder,
|
||||||
|
String relationMetadata
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
package at.procon.dip.domain.document.service.command;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DistanceMetric;
|
||||||
|
|
||||||
|
public record RegisterEmbeddingModelCommand(
|
||||||
|
String modelKey,
|
||||||
|
String provider,
|
||||||
|
String displayName,
|
||||||
|
Integer dimensions,
|
||||||
|
DistanceMetric distanceMetric,
|
||||||
|
boolean queryPrefixRequired,
|
||||||
|
boolean active
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
package at.procon.dip.domain.tenant;
|
||||||
|
|
||||||
|
/**
 * Canonical reference to a tenant, used to express who owns a document.
 *
 * @param tenantId    identifier of the tenant
 * @param tenantKey   key of the tenant
 * @param displayName display name of the tenant
 */
public record TenantRef(
        String tenantId,
        String tenantKey,
        String displayName
) {
}
|
||||||
@ -0,0 +1,71 @@
|
|||||||
|
package at.procon.dip.domain.tenant.entity;
|
||||||
|
|
||||||
|
import at.procon.dip.architecture.SchemaNames;
|
||||||
|
import jakarta.persistence.Column;
|
||||||
|
import jakarta.persistence.Entity;
|
||||||
|
import jakarta.persistence.GeneratedValue;
|
||||||
|
import jakarta.persistence.GenerationType;
|
||||||
|
import jakarta.persistence.Id;
|
||||||
|
import jakarta.persistence.Index;
|
||||||
|
import jakarta.persistence.PrePersist;
|
||||||
|
import jakarta.persistence.PreUpdate;
|
||||||
|
import jakarta.persistence.Table;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Canonical owner tenant catalog for the generalized DOC schema.
|
||||||
|
*/
|
||||||
|
@Entity
|
||||||
|
@Table(schema = SchemaNames.DOC, name = "doc_tenant", indexes = {
|
||||||
|
@Index(name = "idx_doc_tenant_key", columnList = "tenant_key", unique = true),
|
||||||
|
@Index(name = "idx_doc_tenant_active", columnList = "active")
|
||||||
|
})
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class DocumentTenant {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@Column(name = "tenant_key", nullable = false, unique = true, length = 120)
|
||||||
|
private String tenantKey;
|
||||||
|
|
||||||
|
@Column(name = "display_name", nullable = false, length = 255)
|
||||||
|
private String displayName;
|
||||||
|
|
||||||
|
@Column(name = "description", columnDefinition = "TEXT")
|
||||||
|
private String description;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "active", nullable = false)
|
||||||
|
private boolean active = true;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@Column(name = "updated_at", nullable = false)
|
||||||
|
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||||
|
|
||||||
|
@PrePersist
|
||||||
|
protected void onCreate() {
|
||||||
|
createdAt = OffsetDateTime.now();
|
||||||
|
updatedAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
|
||||||
|
@PreUpdate
|
||||||
|
protected void onUpdate() {
|
||||||
|
updatedAt = OffsetDateTime.now();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.domain.tenant.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.tenant.entity.DocumentTenant;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
|
public interface DocumentTenantRepository extends JpaRepository<DocumentTenant, UUID> {
|
||||||
|
|
||||||
|
Optional<DocumentTenant> findByTenantKey(String tenantKey);
|
||||||
|
|
||||||
|
boolean existsByTenantKey(String tenantKey);
|
||||||
|
}
|
||||||
@ -0,0 +1,45 @@
|
|||||||
|
package at.procon.dip.domain.tenant.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.tenant.entity.DocumentTenant;
|
||||||
|
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
|
||||||
|
import at.procon.dip.domain.tenant.service.command.CreateTenantCommand;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Transactional
|
||||||
|
public class DocumentTenantService {
|
||||||
|
|
||||||
|
private final DocumentTenantRepository tenantRepository;
|
||||||
|
|
||||||
|
public DocumentTenant createOrUpdate(CreateTenantCommand command) {
|
||||||
|
DocumentTenant tenant = tenantRepository.findByTenantKey(command.tenantKey())
|
||||||
|
.orElseGet(DocumentTenant::new);
|
||||||
|
tenant.setTenantKey(command.tenantKey());
|
||||||
|
tenant.setDisplayName(command.displayName());
|
||||||
|
tenant.setDescription(command.description());
|
||||||
|
tenant.setActive(command.active());
|
||||||
|
return tenantRepository.save(tenant);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public DocumentTenant getRequiredById(UUID id) {
|
||||||
|
return tenantRepository.findById(id)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown tenant id: " + id));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public DocumentTenant getRequiredByTenantKey(String tenantKey) {
|
||||||
|
return tenantRepository.findByTenantKey(tenantKey)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown tenant key: " + tenantKey));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(readOnly = true)
|
||||||
|
public List<DocumentTenant> findAll() {
|
||||||
|
return tenantRepository.findAll();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,9 @@
|
|||||||
|
package at.procon.dip.domain.tenant.service.command;
|
||||||
|
|
||||||
|
/**
 * Command carrying the values for creating or updating a tenant.
 *
 * @param tenantKey   key identifying the tenant
 * @param displayName display name of the tenant
 * @param description description of the tenant
 * @param active      whether the tenant is active
 */
public record CreateTenantCommand(
        String tenantKey,
        String displayName,
        String description,
        boolean active
) {
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.extraction.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Type-specific extraction contract.
|
||||||
|
*/
|
||||||
|
public interface DocumentExtractor {
|
||||||
|
|
||||||
|
boolean supports(DocumentType documentType, String mimeType);
|
||||||
|
|
||||||
|
ExtractionResult extract(ExtractionRequest extractionRequest);
|
||||||
|
}
|
||||||
@ -0,0 +1,12 @@
|
|||||||
|
package at.procon.dip.extraction.spi;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
 * Structured, type-specific payload that an extractor produces.
 * <p>
 * The attribute map is stored as passed in; no defensive copy is made.
 *
 * @param projectionName name of the projection the payload belongs to
 * @param attributes     attribute values keyed by name
 */
public record ExtractedStructuredPayload(
        String projectionName,
        Map<String, Object> attributes
) {
}
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
package at.procon.dip.extraction.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Input to a document extractor.
|
||||||
|
*/
|
||||||
|
public record ExtractionRequest(
|
||||||
|
SourceDescriptor sourceDescriptor,
|
||||||
|
DetectionResult detectionResult,
|
||||||
|
String textContent,
|
||||||
|
byte[] binaryContent
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
package at.procon.dip.extraction.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Output of a document extractor before normalization and persistence.
|
||||||
|
*/
|
||||||
|
public record ExtractionResult(
|
||||||
|
Map<ContentRole, String> derivedTextByRole,
|
||||||
|
List<ExtractedStructuredPayload> structuredPayloads,
|
||||||
|
List<String> warnings
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
package at.procon.dip.ingestion.spi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extension point for source-specific import adapters.
|
||||||
|
*/
|
||||||
|
public interface DocumentIngestionAdapter {
|
||||||
|
|
||||||
|
boolean supports(SourceDescriptor sourceDescriptor);
|
||||||
|
|
||||||
|
IngestionResult ingest(SourceDescriptor sourceDescriptor);
|
||||||
|
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.ingestion.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Result of an ingestion adapter execution.
|
||||||
|
*/
|
||||||
|
public record IngestionResult(
|
||||||
|
List<CanonicalDocumentMetadata> documents,
|
||||||
|
List<String> warnings
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,19 @@
|
|||||||
|
package at.procon.dip.ingestion.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Describes a source object that should be ingested into the canonical document model.
|
||||||
|
*/
|
||||||
|
public record SourceDescriptor(
|
||||||
|
DocumentAccessContext accessContext,
|
||||||
|
SourceType sourceType,
|
||||||
|
String sourceIdentifier,
|
||||||
|
String sourceUri,
|
||||||
|
String fileName,
|
||||||
|
String mediaType,
|
||||||
|
Map<String, String> attributes
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,12 @@
|
|||||||
|
package at.procon.dip.migration;
|
||||||
|
|
||||||
|
/**
 * Phase 0 decision: the modes available for introducing the generalized model step by step.
 * <p>
 * Constant order is part of the enum's observable behavior ({@code ordinal()}, {@code values()})
 * and is kept as declared.
 */
public enum MigrationStrategyMode {
    ADDITIVE_SCHEMA,
    DUAL_WRITE,
    BACKFILL,
    CUTOVER,
    RETIRE_LEGACY
}
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
package at.procon.dip.normalization.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Input for text-representation builders.
|
||||||
|
*/
|
||||||
|
public record RepresentationBuildRequest(
|
||||||
|
SourceDescriptor sourceDescriptor,
|
||||||
|
DetectionResult detectionResult,
|
||||||
|
ExtractionResult extractionResult
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
package at.procon.dip.normalization.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds search-oriented text representations independently from raw extraction.
|
||||||
|
*/
|
||||||
|
public interface TextRepresentationBuilder {
|
||||||
|
|
||||||
|
boolean supports(DocumentType documentType);
|
||||||
|
|
||||||
|
List<TextRepresentationDraft> build(RepresentationBuildRequest request);
|
||||||
|
}
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
package at.procon.dip.normalization.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Candidate text representation for semantic indexing.
|
||||||
|
*/
|
||||||
|
public record TextRepresentationDraft(
|
||||||
|
RepresentationType representationType,
|
||||||
|
String languageCode,
|
||||||
|
String textBody,
|
||||||
|
boolean primary,
|
||||||
|
Integer chunkIndex
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
package at.procon.dip.processing.spi;
|
||||||
|
|
||||||
|
/**
 * The cross-cutting stages used when orchestrating generic document processing.
 * <p>
 * Constant order is part of the enum's observable behavior ({@code ordinal()}, {@code values()})
 * and is kept as declared.
 */
public enum ProcessingStage {
    INGESTION,
    CLASSIFICATION,
    EXTRACTION,
    NORMALIZATION,
    VECTORIZATION,
    INDEXING,
    SEARCH
}
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
package at.procon.dip.search.spi;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimal generic search scope for future hybrid/semantic search services.
|
||||||
|
*/
|
||||||
|
public record SearchDocumentScope(
|
||||||
|
Set<String> ownerTenantKeys,
|
||||||
|
Set<DocumentType> documentTypes,
|
||||||
|
Set<DocumentFamily> documentFamilies,
|
||||||
|
Set<DocumentVisibility> visibilities,
|
||||||
|
String languageCode
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,211 @@
|
|||||||
|
package at.procon.dip.vectorization.camel;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||||
|
import at.procon.dip.vectorization.service.DocumentEmbeddingProcessingService;
|
||||||
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.camel.Exchange;
|
||||||
|
import org.apache.camel.LoggingLevel;
|
||||||
|
import org.apache.camel.builder.RouteBuilder;
|
||||||
|
import org.apache.camel.model.dataformat.JsonLibrary;
|
||||||
|
import org.springframework.data.domain.PageRequest;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Phase 2 generic vectorization route.
|
||||||
|
* Uses DOC.doc_text_representation as the source text and DOC.doc_embedding as the write target.
|
||||||
|
*/
|
||||||
|
@Component
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class GenericVectorizationRoute extends RouteBuilder {
|
||||||
|
|
||||||
|
private static final String ROUTE_ID_TRIGGER = "generic-vectorization-trigger";
|
||||||
|
private static final String ROUTE_ID_PROCESSOR = "generic-vectorization-processor";
|
||||||
|
private static final String ROUTE_ID_SCHEDULER = "generic-vectorization-scheduler";
|
||||||
|
|
||||||
|
private final TedProcessorProperties properties;
|
||||||
|
private final DocumentEmbeddingRepository embeddingRepository;
|
||||||
|
private final DocumentEmbeddingProcessingService processingService;
|
||||||
|
|
||||||
|
private java.util.concurrent.ExecutorService executorService() {
|
||||||
|
return java.util.concurrent.Executors.newFixedThreadPool(
|
||||||
|
1,
|
||||||
|
r -> {
|
||||||
|
Thread thread = new Thread(r);
|
||||||
|
thread.setName("doc-vectorization-" + thread.getId());
|
||||||
|
thread.setDaemon(true);
|
||||||
|
thread.setPriority(Thread.MAX_PRIORITY);
|
||||||
|
return thread;
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void configure() {
|
||||||
|
if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
|
||||||
|
log.info("Phase 2 generic vectorization route disabled");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Configuring generic vectorization routes (phase2=true, apiUrl={}, scheduler={}ms)",
|
||||||
|
properties.getVectorization().getApiUrl(),
|
||||||
|
properties.getVectorization().getGenericSchedulerPeriodMs());
|
||||||
|
|
||||||
|
onException(Exception.class)
|
||||||
|
.handled(true)
|
||||||
|
.process(exchange -> {
|
||||||
|
UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
|
||||||
|
Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class);
|
||||||
|
String error = exception != null ? exception.getMessage() : "Unknown vectorization error";
|
||||||
|
if (embeddingId != null) {
|
||||||
|
try {
|
||||||
|
processingService.markAsFailed(embeddingId, error);
|
||||||
|
} catch (Exception nested) {
|
||||||
|
log.warn("Failed to mark embedding {} as failed: {}", embeddingId, nested.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.to("log:generic-vectorization-error?level=WARN");
|
||||||
|
|
||||||
|
from("direct:vectorize-embedding")
|
||||||
|
.routeId(ROUTE_ID_TRIGGER)
|
||||||
|
.doTry()
|
||||||
|
.to("seda:vectorize-embedding-async?waitForTaskToComplete=Never&size=1000&blockWhenFull=true&timeout=5000")
|
||||||
|
.doCatch(Exception.class)
|
||||||
|
.log(LoggingLevel.WARN, "Failed to queue embedding ${header.embeddingId}: ${exception.message}")
|
||||||
|
.end();
|
||||||
|
|
||||||
|
from("seda:vectorize-embedding-async?size=1000")
|
||||||
|
.routeId(ROUTE_ID_PROCESSOR)
|
||||||
|
.threads().executorService(executorService())
|
||||||
|
.process(exchange -> {
|
||||||
|
UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
|
||||||
|
DocumentEmbeddingProcessingService.EmbeddingPayload payload =
|
||||||
|
processingService.prepareEmbeddingForVectorization(embeddingId);
|
||||||
|
if (payload == null) {
|
||||||
|
exchange.setProperty("skipVectorization", true);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
EmbedRequest request = new EmbedRequest();
|
||||||
|
request.text = payload.textContent();
|
||||||
|
request.isQuery = false;
|
||||||
|
|
||||||
|
exchange.getIn().setHeader("embeddingId", payload.embeddingId());
|
||||||
|
exchange.getIn().setHeader("documentId", payload.documentId());
|
||||||
|
exchange.getIn().setHeader(Exchange.HTTP_METHOD, "POST");
|
||||||
|
exchange.getIn().setHeader(Exchange.CONTENT_TYPE, "application/json");
|
||||||
|
exchange.getIn().setBody(request);
|
||||||
|
})
|
||||||
|
.choice()
|
||||||
|
.when(exchangeProperty("skipVectorization").isEqualTo(true))
|
||||||
|
.log(LoggingLevel.DEBUG, "Skipping generic vectorization for ${header.embeddingId}")
|
||||||
|
.otherwise()
|
||||||
|
.marshal().json(JsonLibrary.Jackson)
|
||||||
|
.setProperty("retryCount", constant(0))
|
||||||
|
.setProperty("maxRetries", constant(properties.getVectorization().getMaxRetries()))
|
||||||
|
.setProperty("vectorizationSuccess", constant(false))
|
||||||
|
.loopDoWhile(simple("${exchangeProperty.vectorizationSuccess} == false && ${exchangeProperty.retryCount} < ${exchangeProperty.maxRetries}"))
|
||||||
|
.process(exchange -> {
|
||||||
|
Integer retryCount = exchange.getProperty("retryCount", Integer.class);
|
||||||
|
exchange.setProperty("retryCount", retryCount + 1);
|
||||||
|
if (retryCount > 0) {
|
||||||
|
long backoffMs = (long) Math.pow(2, retryCount) * 1000L;
|
||||||
|
Thread.sleep(backoffMs);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.doTry()
|
||||||
|
.toD(properties.getVectorization().getApiUrl() + "/embed?bridgeEndpoint=true&throwExceptionOnFailure=false&connectTimeout=" +
|
||||||
|
properties.getVectorization().getConnectTimeout() + "&socketTimeout=" +
|
||||||
|
properties.getVectorization().getSocketTimeout())
|
||||||
|
.process(exchange -> {
|
||||||
|
Integer statusCode = exchange.getIn().getHeader(Exchange.HTTP_RESPONSE_CODE, Integer.class);
|
||||||
|
if (statusCode == null || statusCode != 200) {
|
||||||
|
String body = exchange.getIn().getBody(String.class);
|
||||||
|
throw new RuntimeException("Embedding service returned HTTP " + statusCode + ": " + body);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.unmarshal().json(JsonLibrary.Jackson, EmbedResponse.class)
|
||||||
|
.process(exchange -> {
|
||||||
|
UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
|
||||||
|
EmbedResponse response = exchange.getIn().getBody(EmbedResponse.class);
|
||||||
|
if (response == null || response.embedding == null) {
|
||||||
|
throw new RuntimeException("Embedding service returned null embedding response");
|
||||||
|
}
|
||||||
|
processingService.saveEmbedding(embeddingId, response.embedding, response.tokenCount);
|
||||||
|
exchange.setProperty("vectorizationSuccess", true);
|
||||||
|
})
|
||||||
|
.doCatch(Exception.class)
|
||||||
|
.process(exchange -> {
|
||||||
|
UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class);
|
||||||
|
Integer retryCount = exchange.getProperty("retryCount", Integer.class);
|
||||||
|
Integer maxRetries = exchange.getProperty("maxRetries", Integer.class);
|
||||||
|
Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class);
|
||||||
|
String errorMsg = exception != null ? exception.getMessage() : "Unknown error";
|
||||||
|
if (errorMsg != null && errorMsg.contains("Connection pool shut down")) {
|
||||||
|
log.warn("Generic vectorization aborted for {} because the application is shutting down", embeddingId);
|
||||||
|
exchange.setProperty("vectorizationSuccess", true);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (retryCount >= maxRetries) {
|
||||||
|
processingService.markAsFailed(embeddingId, errorMsg);
|
||||||
|
} else {
|
||||||
|
log.warn("Generic vectorization attempt #{} failed for {}: {}", retryCount, embeddingId, errorMsg);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.end()
|
||||||
|
.end()
|
||||||
|
.end();
|
||||||
|
|
||||||
|
from("timer:generic-vectorization-scheduler?period=" + properties.getVectorization().getGenericSchedulerPeriodMs() + "&delay=500")
|
||||||
|
.routeId(ROUTE_ID_SCHEDULER)
|
||||||
|
.process(exchange -> {
|
||||||
|
int batchSize = properties.getVectorization().getBatchSize();
|
||||||
|
List<UUID> pending = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.PENDING, PageRequest.of(0, batchSize));
|
||||||
|
List<UUID> failed = List.of();
|
||||||
|
if (pending.isEmpty()) {
|
||||||
|
failed = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.FAILED, PageRequest.of(0, batchSize));
|
||||||
|
}
|
||||||
|
List<UUID> toProcess = !pending.isEmpty() ? pending : failed;
|
||||||
|
if (toProcess.isEmpty()) {
|
||||||
|
exchange.setProperty("noPendingEmbeddings", true);
|
||||||
|
} else {
|
||||||
|
exchange.getIn().setBody(toProcess);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.choice()
|
||||||
|
.when(exchangeProperty("noPendingEmbeddings").isEqualTo(true))
|
||||||
|
.log(LoggingLevel.DEBUG, "Generic vectorization scheduler: nothing pending")
|
||||||
|
.otherwise()
|
||||||
|
.split(body())
|
||||||
|
.process(exchange -> {
|
||||||
|
UUID embeddingId = exchange.getIn().getBody(UUID.class);
|
||||||
|
exchange.getIn().setHeader("embeddingId", embeddingId);
|
||||||
|
})
|
||||||
|
.to("direct:vectorize-embedding")
|
||||||
|
.end()
|
||||||
|
.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class EmbedRequest {
|
||||||
|
@JsonProperty("text")
|
||||||
|
public String text;
|
||||||
|
|
||||||
|
@JsonProperty("is_query")
|
||||||
|
public boolean isQuery;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class EmbedResponse {
|
||||||
|
public float[] embedding;
|
||||||
|
public int dimensions;
|
||||||
|
@JsonProperty("token_count")
|
||||||
|
public int tokenCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,142 @@
|
|||||||
|
package at.procon.dip.vectorization.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentService;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import at.procon.ted.model.entity.VectorizationStatus;
|
||||||
|
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||||
|
import at.procon.ted.service.VectorizationService;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Propagation;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Phase 2 generic vectorization processor that works on DOC text representations and DOC embeddings.
|
||||||
|
* <p>
|
||||||
|
* The service keeps the existing TED semantic search operational by optionally dual-writing completed
|
||||||
|
* embeddings back into the legacy TED procurement_document vector columns, resolved by document hash.
|
||||||
|
*/
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class DocumentEmbeddingProcessingService {
|
||||||
|
|
||||||
|
private final DocumentEmbeddingRepository embeddingRepository;
|
||||||
|
private final DocumentService documentService;
|
||||||
|
private final VectorizationService vectorizationService;
|
||||||
|
private final TedProcessorProperties properties;
|
||||||
|
private final ProcurementDocumentRepository procurementDocumentRepository;
|
||||||
|
|
||||||
|
@Transactional(propagation = Propagation.REQUIRES_NEW)
|
||||||
|
public EmbeddingPayload prepareEmbeddingForVectorization(UUID embeddingId) {
|
||||||
|
DocumentEmbedding embedding = embeddingRepository.findDetailedById(embeddingId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
|
||||||
|
|
||||||
|
if (embedding.getEmbeddingStatus() == EmbeddingStatus.PROCESSING) {
|
||||||
|
log.debug("Embedding {} is already PROCESSING, skipping duplicate queue entry", embeddingId);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
embedding.setEmbeddingStatus(EmbeddingStatus.PROCESSING);
|
||||||
|
embedding.setErrorMessage(null);
|
||||||
|
embeddingRepository.save(embedding);
|
||||||
|
|
||||||
|
String textBody = embedding.getRepresentation().getTextBody();
|
||||||
|
if (textBody == null || textBody.isBlank()) {
|
||||||
|
embedding.setEmbeddingStatus(EmbeddingStatus.SKIPPED);
|
||||||
|
embedding.setErrorMessage("No text representation available");
|
||||||
|
embedding.setEmbeddedAt(OffsetDateTime.now());
|
||||||
|
embeddingRepository.save(embedding);
|
||||||
|
documentService.updateStatus(embedding.getDocument().getId(), DocumentStatus.REPRESENTED);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
int maxLength = properties.getVectorization().getMaxTextLength();
|
||||||
|
if (textBody.length() > maxLength) {
|
||||||
|
log.debug("Truncating representation {} for embedding {} from {} to {} chars",
|
||||||
|
embedding.getRepresentation().getId(), embeddingId, textBody.length(), maxLength);
|
||||||
|
textBody = textBody.substring(0, maxLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new EmbeddingPayload(
|
||||||
|
embedding.getId(),
|
||||||
|
embedding.getDocument().getId(),
|
||||||
|
embedding.getDocument().getDedupHash(),
|
||||||
|
textBody,
|
||||||
|
embedding.getModel().getDimensions(),
|
||||||
|
embedding.getModel().isQueryPrefixRequired(),
|
||||||
|
embedding.getRepresentation().getId()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(propagation = Propagation.REQUIRES_NEW)
|
||||||
|
public void saveEmbedding(UUID embeddingId, float[] embedding, Integer tokenCount) {
|
||||||
|
DocumentEmbedding loaded = embeddingRepository.findDetailedById(embeddingId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
|
||||||
|
|
||||||
|
int expectedDimensions = loaded.getModel().getDimensions();
|
||||||
|
if (embedding == null || embedding.length != expectedDimensions) {
|
||||||
|
throw new IllegalArgumentException("Invalid embedding dimension for " + embeddingId +
|
||||||
|
": expected " + expectedDimensions + ", got " + (embedding == null ? 0 : embedding.length));
|
||||||
|
}
|
||||||
|
|
||||||
|
String vectorString = vectorizationService.floatArrayToVectorString(embedding);
|
||||||
|
embeddingRepository.updateEmbeddingVector(embeddingId, vectorString, tokenCount, embedding.length);
|
||||||
|
documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.INDEXED);
|
||||||
|
|
||||||
|
if (properties.getVectorization().isDualWriteLegacyTedVectors()) {
|
||||||
|
dualWriteLegacyTedVector(loaded, vectorString, tokenCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional(propagation = Propagation.REQUIRES_NEW)
|
||||||
|
public void markAsFailed(UUID embeddingId, String errorMessage) {
|
||||||
|
DocumentEmbedding loaded = embeddingRepository.findDetailedById(embeddingId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
|
||||||
|
|
||||||
|
embeddingRepository.updateEmbeddingStatus(embeddingId, EmbeddingStatus.FAILED, errorMessage, null);
|
||||||
|
documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.FAILED);
|
||||||
|
|
||||||
|
if (properties.getVectorization().isDualWriteLegacyTedVectors()) {
|
||||||
|
loaded.getDocument().getDedupHash();
|
||||||
|
procurementDocumentRepository.findByDocumentHash(loaded.getDocument().getDedupHash())
|
||||||
|
.ifPresent(doc -> procurementDocumentRepository.updateVectorizationStatus(
|
||||||
|
doc.getId(), VectorizationStatus.FAILED, errorMessage, null));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void dualWriteLegacyTedVector(DocumentEmbedding embedding, String vectorString, Integer tokenCount) {
|
||||||
|
String dedupHash = embedding.getDocument().getDedupHash();
|
||||||
|
if (dedupHash == null || dedupHash.isBlank()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
procurementDocumentRepository.findByDocumentHash(dedupHash)
|
||||||
|
.ifPresentOrElse(
|
||||||
|
legacy -> {
|
||||||
|
procurementDocumentRepository.updateContentVector(legacy.getId(), vectorString, tokenCount);
|
||||||
|
log.debug("Dual-wrote embedding {} back to legacy TED document {}", embedding.getId(), legacy.getId());
|
||||||
|
},
|
||||||
|
() -> log.debug("No legacy TED document found for DOC embedding {} with dedup hash {}",
|
||||||
|
embedding.getId(), dedupHash)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public record EmbeddingPayload(
|
||||||
|
UUID embeddingId,
|
||||||
|
UUID documentId,
|
||||||
|
String dedupHash,
|
||||||
|
String textContent,
|
||||||
|
Integer expectedDimensions,
|
||||||
|
boolean queryPrefixRequired,
|
||||||
|
UUID representationId
|
||||||
|
) {
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.vectorization.spi;
|
||||||
|
|
||||||
|
/**
 * Immutable description of a single embedding model known to the platform.
 *
 * @param modelKey            key identifying the model
 * @param provider            name of the provider serving the model
 * @param dimensions          vector dimensions of the model
 * @param distanceMetric      distance metric associated with the model
 * @param queryPrefixRequired query-prefix flag of the model
 */
public record EmbeddingModelDescriptor(
        String modelKey,
        String provider,
        int dimensions,
        String distanceMetric,
        boolean queryPrefixRequired) {
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.vectorization.spi;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provider abstraction for vectorization backends.
|
||||||
|
*/
|
||||||
|
public interface EmbeddingProvider {
|
||||||
|
|
||||||
|
EmbeddingModelDescriptor model();
|
||||||
|
|
||||||
|
EmbeddingResult embed(List<String> texts, boolean queryMode);
|
||||||
|
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package at.procon.dip.vectorization.spi;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Embedding output for one or more representations.
|
||||||
|
*/
|
||||||
|
public record EmbeddingResult(
|
||||||
|
EmbeddingModelDescriptor model,
|
||||||
|
List<float[]> vectors,
|
||||||
|
List<String> warnings
|
||||||
|
) {
|
||||||
|
}
|
||||||
@ -0,0 +1,41 @@
|
|||||||
|
package at.procon.dip.vectorization.startup;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
||||||
|
import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.boot.ApplicationArguments;
|
||||||
|
import org.springframework.boot.ApplicationRunner;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ensures the configured embedding model exists in DOC.doc_embedding_model.
|
||||||
|
*/
|
||||||
|
@Component
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class ConfiguredEmbeddingModelStartupRunner implements ApplicationRunner {
|
||||||
|
|
||||||
|
private final TedProcessorProperties properties;
|
||||||
|
private final DocumentEmbeddingService embeddingService;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run(ApplicationArguments args) {
|
||||||
|
if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
embeddingService.registerModel(new RegisterEmbeddingModelCommand(
|
||||||
|
properties.getVectorization().getModelName(),
|
||||||
|
properties.getVectorization().getEmbeddingProvider(),
|
||||||
|
properties.getVectorization().getModelName(),
|
||||||
|
properties.getVectorization().getDimensions(),
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
true
|
||||||
|
));
|
||||||
|
|
||||||
|
log.info("Phase 2 embedding model ensured: {}", properties.getVectorization().getModelName());
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,60 @@
|
|||||||
|
package at.procon.dip.vectorization.startup;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.camel.ProducerTemplate;
|
||||||
|
import org.springframework.boot.ApplicationArguments;
|
||||||
|
import org.springframework.boot.ApplicationRunner;
|
||||||
|
import org.springframework.data.domain.PageRequest;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Queues pending and failed DOC embeddings immediately on startup.
|
||||||
|
*/
|
||||||
|
@Component
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class GenericVectorizationStartupRunner implements ApplicationRunner {
|
||||||
|
|
||||||
|
private static final int BATCH_SIZE = 1000;
|
||||||
|
|
||||||
|
private final TedProcessorProperties properties;
|
||||||
|
private final DocumentEmbeddingRepository embeddingRepository;
|
||||||
|
private final ProducerTemplate producerTemplate;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run(ApplicationArguments args) {
|
||||||
|
if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int queued = 0;
|
||||||
|
queued += queueByStatus(EmbeddingStatus.PENDING, "PENDING");
|
||||||
|
queued += queueByStatus(EmbeddingStatus.FAILED, "FAILED");
|
||||||
|
log.info("Generic vectorization startup runner queued {} embedding jobs", queued);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int queueByStatus(EmbeddingStatus status, String label) {
|
||||||
|
int queued = 0;
|
||||||
|
int page = 0;
|
||||||
|
List<UUID> ids;
|
||||||
|
do {
|
||||||
|
ids = embeddingRepository.findIdsByEmbeddingStatus(status, PageRequest.of(page, BATCH_SIZE));
|
||||||
|
for (UUID id : ids) {
|
||||||
|
try {
|
||||||
|
producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", null, "embeddingId", id);
|
||||||
|
queued++;
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Failed to queue {} embedding {}: {}", label, id, e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
page++;
|
||||||
|
} while (ids.size() == BATCH_SIZE);
|
||||||
|
return queued;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -1,26 +1,20 @@
|
|||||||
package at.procon.ted;
|
package at.procon.ted;
|
||||||
|
|
||||||
import org.springframework.boot.SpringApplication;
|
import at.procon.dip.DocumentIntelligencePlatformApplication;
|
||||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
|
||||||
import org.springframework.scheduling.annotation.EnableAsync;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TED Procurement Document Processor Application.
|
* Legacy entry point kept for backward compatibility.
|
||||||
*
|
*
|
||||||
* Processes EU eForms public procurement notices from TED (Tenders Electronic Daily).
|
* <p>The platform is being generalized beyond TED-specific procurement documents.
|
||||||
* Features:
|
* New runtime packaging should use {@link DocumentIntelligencePlatformApplication}.</p>
|
||||||
* - Directory watching with Apache Camel for automated XML processing
|
|
||||||
* - PostgreSQL storage with native XML support and pgvector for semantic search
|
|
||||||
* - Asynchronous document vectorization using multilingual-e5-large model
|
|
||||||
* - REST API for structured and semantic search
|
|
||||||
*
|
|
||||||
* @author Martin.Schweitzer@procon.co.at and claude.ai
|
|
||||||
*/
|
*/
|
||||||
@SpringBootApplication
|
@Deprecated(forRemoval = false, since = "1.1.0")
|
||||||
@EnableAsync
|
public final class TedProcurementProcessorApplication {
|
||||||
public class TedProcurementProcessorApplication {
|
|
||||||
|
private TedProcurementProcessorApplication() {
|
||||||
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
SpringApplication.run(TedProcurementProcessorApplication.class, args);
|
DocumentIntelligencePlatformApplication.main(args);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,197 @@
|
|||||||
|
package at.procon.ted.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import at.procon.dip.domain.document.StorageType;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentSource;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentContentRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentService;
|
||||||
|
import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import at.procon.ted.model.entity.ProcurementDocument;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Phase 2 bridge that dual-writes TED documents into the generic DOC persistence backbone.
|
||||||
|
*/
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class TedPhase2GenericDocumentService {
|
||||||
|
|
||||||
|
private final TedProcessorProperties properties;
|
||||||
|
private final DocumentRepository documentRepository;
|
||||||
|
private final DocumentContentRepository contentRepository;
|
||||||
|
private final DocumentSourceRepository sourceRepository;
|
||||||
|
private final DocumentTextRepresentationRepository representationRepository;
|
||||||
|
private final DocumentEmbeddingRepository embeddingRepository;
|
||||||
|
private final DocumentService documentService;
|
||||||
|
private final DocumentEmbeddingService embeddingService;
|
||||||
|
|
||||||
|
@Transactional
|
||||||
|
public UUID registerOrRefreshTedDocument(ProcurementDocument tedDocument) {
|
||||||
|
if (!properties.getVectorization().isGenericPipelineEnabled()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
Document document = documentRepository.findByDedupHash(tedDocument.getDocumentHash())
|
||||||
|
.orElseGet(() -> createGenericDocument(tedDocument));
|
||||||
|
|
||||||
|
document.setDocumentType(DocumentType.TED_NOTICE);
|
||||||
|
document.setDocumentFamily(DocumentFamily.PROCUREMENT);
|
||||||
|
document.setVisibility(DocumentVisibility.PUBLIC);
|
||||||
|
document.setStatus(DocumentStatus.REPRESENTED);
|
||||||
|
document.setTitle(tedDocument.getProjectTitle());
|
||||||
|
document.setSummary(tedDocument.getProjectDescription());
|
||||||
|
document.setLanguageCode(tedDocument.getLanguageCode());
|
||||||
|
document.setMimeType("application/xml");
|
||||||
|
document.setBusinessKey(buildBusinessKey(tedDocument));
|
||||||
|
document.setDedupHash(tedDocument.getDocumentHash());
|
||||||
|
document = documentRepository.save(document);
|
||||||
|
|
||||||
|
ensureTedSource(document, tedDocument);
|
||||||
|
DocumentContent originalContent = ensureOriginalContent(document, tedDocument);
|
||||||
|
DocumentTextRepresentation representation = ensurePrimaryRepresentation(document, originalContent, tedDocument);
|
||||||
|
DocumentEmbedding embedding = ensurePendingEmbedding(document, representation);
|
||||||
|
|
||||||
|
log.debug("Phase 2 DOC bridge ensured generic TED document {} -> embedding {}", document.getId(), embedding.getId());
|
||||||
|
return embedding.getId();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Document createGenericDocument(ProcurementDocument tedDocument) {
|
||||||
|
return documentService.create(new at.procon.dip.domain.document.service.command.CreateDocumentCommand(
|
||||||
|
null,
|
||||||
|
DocumentVisibility.PUBLIC,
|
||||||
|
DocumentType.TED_NOTICE,
|
||||||
|
DocumentFamily.PROCUREMENT,
|
||||||
|
DocumentStatus.REPRESENTED,
|
||||||
|
tedDocument.getProjectTitle(),
|
||||||
|
tedDocument.getProjectDescription(),
|
||||||
|
tedDocument.getLanguageCode(),
|
||||||
|
"application/xml",
|
||||||
|
buildBusinessKey(tedDocument),
|
||||||
|
tedDocument.getDocumentHash()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void ensureTedSource(Document document, ProcurementDocument tedDocument) {
|
||||||
|
String externalId = tedDocument.getPublicationId() != null ? tedDocument.getPublicationId() : tedDocument.getId().toString();
|
||||||
|
boolean sourceExists = sourceRepository.findByDocument_Id(document.getId()).stream()
|
||||||
|
.anyMatch(existing -> externalId.equals(existing.getExternalSourceId()));
|
||||||
|
if (sourceExists) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
DocumentSource source = DocumentSource.builder()
|
||||||
|
.document(document)
|
||||||
|
.sourceType(SourceType.FILE_SYSTEM)
|
||||||
|
.externalSourceId(externalId)
|
||||||
|
.sourceUri(tedDocument.getSourcePath())
|
||||||
|
.sourceFilename(tedDocument.getSourceFilename())
|
||||||
|
.importBatchId("ted-phase2")
|
||||||
|
.receivedAt(OffsetDateTime.now())
|
||||||
|
.build();
|
||||||
|
sourceRepository.save(source);
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentContent ensureOriginalContent(Document document, ProcurementDocument tedDocument) {
|
||||||
|
List<DocumentContent> existing = contentRepository.findByDocument_IdAndContentRole(document.getId(), ContentRole.ORIGINAL);
|
||||||
|
if (!existing.isEmpty()) {
|
||||||
|
DocumentContent content = existing.get(0);
|
||||||
|
content.setMimeType("application/xml");
|
||||||
|
content.setStorageType(StorageType.DB_TEXT);
|
||||||
|
content.setTextContent(tedDocument.getXmlDocument());
|
||||||
|
content.setContentHash(tedDocument.getDocumentHash());
|
||||||
|
content.setSizeBytes(tedDocument.getFileSizeBytes());
|
||||||
|
return contentRepository.save(content);
|
||||||
|
}
|
||||||
|
|
||||||
|
DocumentContent content = DocumentContent.builder()
|
||||||
|
.document(document)
|
||||||
|
.contentRole(ContentRole.ORIGINAL)
|
||||||
|
.storageType(StorageType.DB_TEXT)
|
||||||
|
.mimeType("application/xml")
|
||||||
|
.charsetName("UTF-8")
|
||||||
|
.textContent(tedDocument.getXmlDocument())
|
||||||
|
.contentHash(tedDocument.getDocumentHash())
|
||||||
|
.sizeBytes(tedDocument.getFileSizeBytes())
|
||||||
|
.build();
|
||||||
|
return contentRepository.save(content);
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentTextRepresentation ensurePrimaryRepresentation(Document document,
|
||||||
|
DocumentContent originalContent,
|
||||||
|
ProcurementDocument tedDocument) {
|
||||||
|
DocumentTextRepresentation representation = representationRepository
|
||||||
|
.findFirstByDocument_IdAndPrimaryRepresentationTrue(document.getId())
|
||||||
|
.orElseGet(DocumentTextRepresentation::new);
|
||||||
|
|
||||||
|
representation.setDocument(document);
|
||||||
|
representation.setContent(originalContent);
|
||||||
|
representation.setRepresentationType(RepresentationType.SEMANTIC_TEXT);
|
||||||
|
representation.setBuilderKey(properties.getVectorization().getPrimaryRepresentationBuilderKey());
|
||||||
|
representation.setLanguageCode(tedDocument.getLanguageCode());
|
||||||
|
representation.setPrimaryRepresentation(true);
|
||||||
|
representation.setTextBody(tedDocument.getTextContent() != null ? tedDocument.getTextContent() : tedDocument.getProjectDescription());
|
||||||
|
representation.setTokenCount(null);
|
||||||
|
representation.setChunkIndex(null);
|
||||||
|
representation.setChunkStartOffset(null);
|
||||||
|
representation.setChunkEndOffset(null);
|
||||||
|
return representationRepository.save(representation);
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentEmbedding ensurePendingEmbedding(Document document, DocumentTextRepresentation representation) {
|
||||||
|
DocumentEmbeddingModel model = embeddingService.registerModel(new RegisterEmbeddingModelCommand(
|
||||||
|
properties.getVectorization().getModelName(),
|
||||||
|
properties.getVectorization().getEmbeddingProvider(),
|
||||||
|
properties.getVectorization().getModelName(),
|
||||||
|
properties.getVectorization().getDimensions(),
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
true
|
||||||
|
));
|
||||||
|
|
||||||
|
return embeddingRepository.findByRepresentation_IdAndModel_Id(representation.getId(), model.getId())
|
||||||
|
.map(existing -> {
|
||||||
|
existing.setDocument(document);
|
||||||
|
existing.setRepresentation(representation);
|
||||||
|
existing.setModel(model);
|
||||||
|
existing.setEmbeddingStatus(at.procon.dip.domain.document.EmbeddingStatus.PENDING);
|
||||||
|
existing.setErrorMessage(null);
|
||||||
|
existing.setEmbeddedAt(null);
|
||||||
|
return embeddingRepository.save(existing);
|
||||||
|
})
|
||||||
|
.orElseGet(() -> embeddingService.createPendingEmbedding(document.getId(), representation.getId(), model.getId()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildBusinessKey(ProcurementDocument tedDocument) {
|
||||||
|
if (tedDocument.getPublicationId() != null && !tedDocument.getPublicationId().isBlank()) {
|
||||||
|
return "TED:publication:" + tedDocument.getPublicationId();
|
||||||
|
}
|
||||||
|
if (tedDocument.getNoticeUrl() != null && !tedDocument.getNoticeUrl().isBlank()) {
|
||||||
|
return "TED:url:" + tedDocument.getNoticeUrl();
|
||||||
|
}
|
||||||
|
return "TED:hash:" + tedDocument.getDocumentHash();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,281 @@
|
|||||||
|
-- Phase 1: Generic DOC persistence backbone for the Procon Document Intelligence Platform
|
||||||
|
-- This migration is additive and intentionally does not modify the existing TED runtime tables.
|
||||||
|
|
||||||
|
-- Target schema for the generic document model.
CREATE SCHEMA IF NOT EXISTS DOC;

SET search_path TO TED, DOC, public;

-- pgcrypto: created in schema public; missing privileges or an existing extension only raise a notice.
DO $$
BEGIN
    CREATE EXTENSION IF NOT EXISTS pgcrypto SCHEMA public;
EXCEPTION
    WHEN insufficient_privilege THEN
        RAISE NOTICE 'Skipping pgcrypto extension creation (insufficient privileges)';
    WHEN duplicate_object THEN
        RAISE NOTICE 'Extension pgcrypto already exists';
END
$$;

-- pgvector: same handling as above, plus a warning when the extension is not installed on the server.
DO $$
BEGIN
    CREATE EXTENSION IF NOT EXISTS vector SCHEMA public;
EXCEPTION
    WHEN insufficient_privilege THEN
        RAISE NOTICE 'Skipping vector extension creation (insufficient privileges)';
    WHEN duplicate_object THEN
        RAISE NOTICE 'Extension vector already exists';
    WHEN undefined_file THEN
        RAISE WARNING 'Extension vector not available - install pgvector on the database server';
END
$$;
|
||||||
|
|
||||||
|
-- Enum types of the DOC schema.
-- Each existence check is schema-qualified (to_regtype returns NULL when the named type does not
-- exist), so a same-named type in another schema cannot suppress creation of the DOC type.

DO $$
BEGIN
    IF to_regtype('doc.doc_document_visibility') IS NULL THEN
        CREATE TYPE DOC.doc_document_visibility AS ENUM ('PUBLIC', 'TENANT', 'SHARED', 'RESTRICTED');
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_document_type') IS NULL THEN
        CREATE TYPE DOC.doc_document_type AS ENUM (
            'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
            'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN'
        );
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_document_family') IS NULL THEN
        CREATE TYPE DOC.doc_document_family AS ENUM ('PROCUREMENT', 'MAIL', 'ATTACHMENT', 'KNOWLEDGE', 'GENERIC');
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_document_status') IS NULL THEN
        CREATE TYPE DOC.doc_document_status AS ENUM ('RECEIVED', 'CLASSIFIED', 'EXTRACTED', 'REPRESENTED', 'INDEXED', 'FAILED', 'ARCHIVED');
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_source_type') IS NULL THEN
        CREATE TYPE DOC.doc_source_type AS ENUM ('TED_PACKAGE', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD', 'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION');
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_content_role') IS NULL THEN
        CREATE TYPE DOC.doc_content_role AS ENUM (
            'ORIGINAL', 'NORMALIZED_TEXT', 'OCR_TEXT', 'HTML_CLEAN',
            'EXTRACTED_METADATA_JSON', 'THUMBNAIL', 'DERIVED_BINARY'
        );
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_storage_type') IS NULL THEN
        CREATE TYPE DOC.doc_storage_type AS ENUM ('DB_TEXT', 'DB_BINARY', 'FILE_PATH', 'OBJECT_STORAGE', 'EXTERNAL_REFERENCE');
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_representation_type') IS NULL THEN
        CREATE TYPE DOC.doc_representation_type AS ENUM ('FULLTEXT', 'SEMANTIC_TEXT', 'SUMMARY', 'TITLE_ABSTRACT', 'CHUNK', 'METADATA_ENRICHED');
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_embedding_status') IS NULL THEN
        CREATE TYPE DOC.doc_embedding_status AS ENUM ('PENDING', 'PROCESSING', 'COMPLETED', 'FAILED', 'SKIPPED');
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_distance_metric') IS NULL THEN
        CREATE TYPE DOC.doc_distance_metric AS ENUM ('COSINE', 'L2', 'INNER_PRODUCT');
    END IF;
END
$$;

DO $$
BEGIN
    IF to_regtype('doc.doc_relation_type') IS NULL THEN
        CREATE TYPE DOC.doc_relation_type AS ENUM ('CONTAINS', 'ATTACHMENT_OF', 'EXTRACTED_FROM', 'DERIVED_FROM', 'PART_OF', 'VERSION_OF', 'RELATED_TO');
    END IF;
END
$$;
|
||||||
|
|
||||||
|
-- Tenants; tenant_key is unique.
CREATE TABLE IF NOT EXISTS DOC.doc_tenant (
    id            UUID                     PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_key    VARCHAR(120)             NOT NULL UNIQUE,
    display_name  VARCHAR(255)             NOT NULL,
    description   TEXT,
    active        BOOLEAN                  NOT NULL DEFAULT TRUE,
    created_at    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);
|
||||||
|
|
||||||
|
-- Canonical document root; owner_tenant_id is optional and references DOC.doc_tenant.
CREATE TABLE IF NOT EXISTS DOC.doc_document (
    id               UUID                        PRIMARY KEY DEFAULT gen_random_uuid(),
    owner_tenant_id  UUID                        REFERENCES DOC.doc_tenant(id),
    visibility       DOC.doc_document_visibility NOT NULL,
    document_type    DOC.doc_document_type       NOT NULL,
    document_family  DOC.doc_document_family     NOT NULL,
    status           DOC.doc_document_status     NOT NULL DEFAULT 'RECEIVED',
    title            VARCHAR(1000),
    summary          TEXT,
    language_code    VARCHAR(16),
    mime_type        VARCHAR(255),
    business_key     VARCHAR(255),
    dedup_hash       VARCHAR(64),
    created_at       TIMESTAMP WITH TIME ZONE    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at       TIMESTAMP WITH TIME ZONE    NOT NULL DEFAULT CURRENT_TIMESTAMP
);
|
||||||
|
|
||||||
|
-- Source/provenance rows attached to a document; deleted together with the
-- owning document (ON DELETE CASCADE).
-- NOTE(review): parent_source_id carries no foreign key constraint; confirm
-- whether it is meant to reference DOC.doc_source(id).
CREATE TABLE IF NOT EXISTS DOC.doc_source (
    id                  UUID                DEFAULT gen_random_uuid(),
    document_id         UUID                NOT NULL,
    source_type         DOC.doc_source_type NOT NULL,
    external_source_id  VARCHAR(500),
    source_uri          TEXT,
    source_filename     VARCHAR(1000),
    parent_source_id    UUID,
    import_batch_id     VARCHAR(255),
    received_at         TIMESTAMPTZ,
    created_at          TIMESTAMPTZ         NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (document_id) REFERENCES DOC.doc_document (id) ON DELETE CASCADE
);
|
||||||
|
|
||||||
|
-- Content payload rows for a document; deleted together with the owning
-- document (ON DELETE CASCADE). Both text_content and binary_ref are nullable.
CREATE TABLE IF NOT EXISTS DOC.doc_content (
    id            UUID                 DEFAULT gen_random_uuid(),
    document_id   UUID                 NOT NULL,
    content_role  DOC.doc_content_role NOT NULL,
    storage_type  DOC.doc_storage_type NOT NULL,
    mime_type     VARCHAR(255),
    charset_name  VARCHAR(120),
    text_content  TEXT,
    binary_ref    TEXT,
    content_hash  VARCHAR(64),
    size_bytes    BIGINT,
    created_at    TIMESTAMPTZ          NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (document_id) REFERENCES DOC.doc_document (id) ON DELETE CASCADE
);
|
||||||
|
|
||||||
|
-- Text representations of a document. Rows are removed with the owning
-- document; when the referenced content row is deleted, content_id is set to
-- NULL and the representation itself is kept. text_body is mandatory.
CREATE TABLE IF NOT EXISTS DOC.doc_text_representation (
    id                   UUID                        DEFAULT gen_random_uuid(),
    document_id          UUID                        NOT NULL,
    content_id           UUID,
    representation_type  DOC.doc_representation_type NOT NULL,
    builder_key          VARCHAR(255),
    language_code        VARCHAR(16),
    token_count          INTEGER,
    char_count           INTEGER,
    chunk_index          INTEGER,
    chunk_start_offset   INTEGER,
    chunk_end_offset     INTEGER,
    is_primary           BOOLEAN                     NOT NULL DEFAULT FALSE,
    text_body            TEXT                        NOT NULL,
    created_at           TIMESTAMPTZ                 NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (document_id) REFERENCES DOC.doc_document (id) ON DELETE CASCADE,
    FOREIGN KEY (content_id)  REFERENCES DOC.doc_content (id)  ON DELETE SET NULL
);
|
||||||
|
|
||||||
|
-- Embedding model catalogue: unique model_key, mandatory provider and
-- dimensions; distance_metric defaults to 'COSINE'.
CREATE TABLE IF NOT EXISTS DOC.doc_embedding_model (
    id                     UUID                    DEFAULT gen_random_uuid(),
    model_key              VARCHAR(255)            NOT NULL,
    provider               VARCHAR(120)            NOT NULL,
    display_name           VARCHAR(255),
    dimensions             INTEGER                 NOT NULL,
    distance_metric        DOC.doc_distance_metric NOT NULL DEFAULT 'COSINE',
    query_prefix_required  BOOLEAN                 NOT NULL DEFAULT FALSE,
    active                 BOOLEAN                 NOT NULL DEFAULT TRUE,
    created_at             TIMESTAMPTZ             NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at             TIMESTAMPTZ             NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    UNIQUE (model_key)
);
|
||||||
|
|
||||||
|
-- Embedding rows: one row links a document, a text representation and a model.
-- Rows are removed when the document or the representation is deleted; the
-- model reference has no delete action. New rows start in status 'PENDING'.
-- embedding_vector is declared without a fixed dimension.
CREATE TABLE IF NOT EXISTS DOC.doc_embedding (
    id                    UUID                     DEFAULT gen_random_uuid(),
    document_id           UUID                     NOT NULL,
    representation_id     UUID                     NOT NULL,
    model_id              UUID                     NOT NULL,
    embedding_status      DOC.doc_embedding_status NOT NULL DEFAULT 'PENDING',
    token_count           INTEGER,
    embedding_dimensions  INTEGER,
    error_message         TEXT,
    embedded_at           TIMESTAMPTZ,
    embedding_vector      public.vector,
    created_at            TIMESTAMPTZ              NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at            TIMESTAMPTZ              NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (document_id)       REFERENCES DOC.doc_document (id)            ON DELETE CASCADE,
    FOREIGN KEY (representation_id) REFERENCES DOC.doc_text_representation (id) ON DELETE CASCADE,
    FOREIGN KEY (model_id)          REFERENCES DOC.doc_embedding_model (id)
);
|
||||||
|
|
||||||
|
-- Directed relation between two documents. Rows are removed when either
-- endpoint document is deleted, and a document may not be related to itself.
CREATE TABLE IF NOT EXISTS DOC.doc_relation (
    id                  UUID                  DEFAULT gen_random_uuid(),
    parent_document_id  UUID                  NOT NULL,
    child_document_id   UUID                  NOT NULL,
    relation_type       DOC.doc_relation_type NOT NULL,
    sort_order          INTEGER,
    relation_metadata   TEXT,
    created_at          TIMESTAMPTZ           NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (parent_document_id) REFERENCES DOC.doc_document (id) ON DELETE CASCADE,
    FOREIGN KEY (child_document_id)  REFERENCES DOC.doc_document (id) ON DELETE CASCADE,
    CONSTRAINT chk_doc_relation_no_self CHECK (parent_document_id <> child_document_id)
);
|
||||||
|
|
||||||
|
-- Indexes for DOC.doc_tenant.
-- No separate unique index on tenant_key is created here: the UNIQUE constraint
-- declared on DOC.doc_tenant.tenant_key already creates a unique index, so an
-- additional one would only duplicate it and add write overhead.
CREATE INDEX IF NOT EXISTS idx_doc_tenant_active
    ON DOC.doc_tenant (active);
|
||||||
|
|
||||||
|
-- Single-column lookup indexes for DOC.doc_document (btree is the default
-- access method and is spelled out here for clarity).
CREATE INDEX IF NOT EXISTS idx_doc_document_type
    ON DOC.doc_document USING btree (document_type);
CREATE INDEX IF NOT EXISTS idx_doc_document_family
    ON DOC.doc_document USING btree (document_family);
CREATE INDEX IF NOT EXISTS idx_doc_document_status
    ON DOC.doc_document USING btree (status);
CREATE INDEX IF NOT EXISTS idx_doc_document_visibility
    ON DOC.doc_document USING btree (visibility);
CREATE INDEX IF NOT EXISTS idx_doc_document_owner_tenant
    ON DOC.doc_document USING btree (owner_tenant_id);
CREATE INDEX IF NOT EXISTS idx_doc_document_dedup_hash
    ON DOC.doc_document USING btree (dedup_hash);
CREATE INDEX IF NOT EXISTS idx_doc_document_business_key
    ON DOC.doc_document USING btree (business_key);
-- Descending order on created_at.
CREATE INDEX IF NOT EXISTS idx_doc_document_created_at
    ON DOC.doc_document USING btree (created_at DESC);
|
||||||
|
|
||||||
|
-- Single-column lookup indexes for DOC.doc_source.
CREATE INDEX IF NOT EXISTS idx_doc_source_document
    ON DOC.doc_source USING btree (document_id);
CREATE INDEX IF NOT EXISTS idx_doc_source_type
    ON DOC.doc_source USING btree (source_type);
CREATE INDEX IF NOT EXISTS idx_doc_source_external_id
    ON DOC.doc_source USING btree (external_source_id);
-- Descending order on received_at.
CREATE INDEX IF NOT EXISTS idx_doc_source_received_at
    ON DOC.doc_source USING btree (received_at DESC);
CREATE INDEX IF NOT EXISTS idx_doc_source_parent_source
    ON DOC.doc_source USING btree (parent_source_id);
|
||||||
|
|
||||||
|
-- Single-column lookup indexes for DOC.doc_content.
CREATE INDEX IF NOT EXISTS idx_doc_content_document
    ON DOC.doc_content USING btree (document_id);
CREATE INDEX IF NOT EXISTS idx_doc_content_role
    ON DOC.doc_content USING btree (content_role);
CREATE INDEX IF NOT EXISTS idx_doc_content_hash
    ON DOC.doc_content USING btree (content_hash);
CREATE INDEX IF NOT EXISTS idx_doc_content_storage_type
    ON DOC.doc_content USING btree (storage_type);
|
||||||
|
|
||||||
|
-- Single-column lookup indexes for DOC.doc_text_representation.
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_document
    ON DOC.doc_text_representation USING btree (document_id);
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_content
    ON DOC.doc_text_representation USING btree (content_id);
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_type
    ON DOC.doc_text_representation USING btree (representation_type);
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_primary
    ON DOC.doc_text_representation USING btree (is_primary);
|
||||||
|
|
||||||
|
-- Indexes for DOC.doc_embedding_model.
-- No separate unique index on model_key is created here: the UNIQUE constraint
-- declared on DOC.doc_embedding_model.model_key already creates a unique index,
-- so an additional one would only duplicate it and add write overhead.
CREATE INDEX IF NOT EXISTS idx_doc_embedding_model_active
    ON DOC.doc_embedding_model (active);
|
||||||
|
|
||||||
|
-- Single-column lookup indexes for DOC.doc_embedding.
CREATE INDEX IF NOT EXISTS idx_doc_embedding_document
    ON DOC.doc_embedding USING btree (document_id);
CREATE INDEX IF NOT EXISTS idx_doc_embedding_repr
    ON DOC.doc_embedding USING btree (representation_id);
CREATE INDEX IF NOT EXISTS idx_doc_embedding_model
    ON DOC.doc_embedding USING btree (model_id);
CREATE INDEX IF NOT EXISTS idx_doc_embedding_status
    ON DOC.doc_embedding USING btree (embedding_status);
-- Descending order on embedded_at.
CREATE INDEX IF NOT EXISTS idx_doc_embedding_embedded_at
    ON DOC.doc_embedding USING btree (embedded_at DESC);
|
||||||
|
|
||||||
|
-- Single-column lookup indexes for DOC.doc_relation (both endpoints and type).
CREATE INDEX IF NOT EXISTS idx_doc_relation_parent
    ON DOC.doc_relation USING btree (parent_document_id);
CREATE INDEX IF NOT EXISTS idx_doc_relation_child
    ON DOC.doc_relation USING btree (child_document_id);
CREATE INDEX IF NOT EXISTS idx_doc_relation_type
    ON DOC.doc_relation USING btree (relation_type);
|
||||||
|
|
||||||
|
-- Catalog descriptions for the DOC schema and its main tables.
COMMENT ON SCHEMA DOC
    IS 'Generic document platform schema introduced in Phase 1';

COMMENT ON TABLE DOC.doc_document
    IS 'Canonical document root with optional owner tenant and mandatory visibility';

COMMENT ON TABLE DOC.doc_content
    IS 'Stored payload variants for a canonical document';

COMMENT ON TABLE DOC.doc_text_representation
    IS 'Search-oriented text representations derived from document content';

COMMENT ON TABLE DOC.doc_embedding
    IS 'Embedding lifecycle separated from document structure';
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
-- Phase 2: Vectorization decoupling support in the generic DOC schema
|
||||||
|
-- Adds safety constraints and indexes for representation-based embedding processing.
|
||||||
|
|
||||||
|
-- At most one embedding row per (representation, model) pair.
CREATE UNIQUE INDEX IF NOT EXISTS uq_doc_embedding_representation_model
    ON DOC.doc_embedding USING btree (representation_id, model_id);

-- Composite indexes on embedding status combined with each timestamp column.
CREATE INDEX IF NOT EXISTS idx_doc_embedding_status_created
    ON DOC.doc_embedding USING btree (embedding_status, created_at);

CREATE INDEX IF NOT EXISTS idx_doc_embedding_status_updated
    ON DOC.doc_embedding USING btree (embedding_status, updated_at);

-- Composite index on (document_id, is_primary) for text representations.
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_document_primary
    ON DOC.doc_text_representation USING btree (document_id, is_primary);
|
||||||
Loading…
Reference in New Issue