From 12f0b0604b328f41dd7925d8159435cd1c06ace8 Mon Sep 17 00:00:00 2001
From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com>
Date: Wed, 22 Apr 2026 12:36:03 +0200
Subject: [PATCH] improved embedding model prefix handling using profiles

---
 docs/embedding/VECTOR_SYNC_HTTP_PROVIDER.md   |  3 +-
 ...cumentIntelligencePlatformApplication.java |  4 +-
 .../document/entity/DocumentEmbedding.java    | 13 +---
 .../DocumentEmbeddingPrefixProfile.java       | 75 +++++++++++++++++++
 ...umentEmbeddingPrefixProfileRepository.java | 11 +++
 .../DocumentEmbeddingRepository.java          | 13 +---
 .../source/JdbcLeitstandTimeSourceClient.java |  6 ++
 ...yRepresentationMaterializationService.java |  3 +-
 .../service/EmbeddingPersistenceService.java  |  5 +-
 .../EmbeddingPrefixProfileService.java        | 61 +++++++++++++++
 .../DocumentEmbeddingProcessingService.java   |  2 +-
 .../V28__doc_embedding_prefix_profile.sql     | 65 ++++++++++++++++
 12 files changed, 233 insertions(+), 28 deletions(-)
 create mode 100644 src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingPrefixProfile.java
 create mode 100644 src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingPrefixProfileRepository.java
 create mode 100644 src/main/java/at/procon/dip/embedding/service/EmbeddingPrefixProfileService.java
 create mode 100644 src/main/resources/db/migration/V28__doc_embedding_prefix_profile.sql

diff --git a/docs/embedding/VECTOR_SYNC_HTTP_PROVIDER.md b/docs/embedding/VECTOR_SYNC_HTTP_PROVIDER.md
index ea40c3b..b0cba3c 100644
--- a/docs/embedding/VECTOR_SYNC_HTTP_PROVIDER.md
+++ b/docs/embedding/VECTOR_SYNC_HTTP_PROVIDER.md
@@ -93,7 +93,6 @@ Supported modes:
 - `EXTERNAL` - DIP assumes the external service applies the prefixing itself

 For persisted document embeddings, the produced prefix provenance is stored in `doc.doc_embedding`:
-- `prefix_mode`
-- `applied_prefix`
+- `prefix_profile_id` (resolved via `doc.doc_embedding_prefix_profile`)

 This makes it possible to identify whether indexed vectors were created with raw text, DIP-side prefixing, or externally handled prefixing before deciding on re-embedding.
diff --git a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java
index 1c397e0..f45bbce 100644
--- a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java
+++ b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java
@@ -16,8 +16,8 @@ import org.springframework.scheduling.annotation.EnableAsync;
  */
 @SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
 @EnableAsync
-@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity", "at.procon.dip.migration.entity", "at.procon.dip.domain.time.entity"})
-@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository", "at.procon.dip.migration.repository", "at.procon.dip.domain.time.repository"})
+@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity", "at.procon.dip.migration.entity"})
+@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository", "at.procon.dip.migration.repository"})
 public class DocumentIntelligencePlatformApplication {

     public static void main(String[] args) {
diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java
index 281a40e..64d69ab 100644
--- a/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java
+++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java
@@ -2,7 +2,6 @@ package at.procon.dip.domain.document.entity;

 import at.procon.dip.architecture.SchemaNames;
 import at.procon.dip.domain.document.EmbeddingStatus;
-import at.procon.dip.embedding.model.EmbeddingPrefixMode;
 import jakarta.persistence.Column;
 import jakarta.persistence.Entity;
 import jakarta.persistence.EnumType;
@@ -39,7 +38,7 @@ import lombok.Setter;
         @Index(name = "idx_doc_embedding_model", columnList = "model_id"),
         @Index(name = "idx_doc_embedding_status", columnList = "embedding_status"),
         @Index(name = "idx_doc_embedding_embedded_at", columnList = "embedded_at"),
-        @Index(name = "idx_doc_embedding_prefix_mode", columnList = "prefix_mode")
+        @Index(name = "idx_doc_embedding_prefix_profile", columnList = "prefix_profile_id")
 })
 @Getter
 @Setter
@@ -81,13 +80,9 @@ public class DocumentEmbedding {
     @Column(name = "embedded_at")
     private OffsetDateTime embeddedAt;

-    @Enumerated(EnumType.STRING)
-    @Column(name = "prefix_mode", nullable = false, length = 32)
-    @Builder.Default
-    private EmbeddingPrefixMode prefixMode = EmbeddingPrefixMode.OFF;
-
-    @Column(name = "applied_prefix", length = 64)
-    private String appliedPrefix;
+    @ManyToOne(fetch = FetchType.LAZY)
+    @JoinColumn(name = "prefix_profile_id")
+    private DocumentEmbeddingPrefixProfile prefixProfile;


     @Builder.Default
diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingPrefixProfile.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingPrefixProfile.java
new file mode 100644
index 0000000..a217dd5
--- /dev/null
+++
b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingPrefixProfile.java
@@ -0,0 +1,75 @@
+package at.procon.dip.domain.document.entity;
+
+import at.procon.dip.architecture.SchemaNames;
+import at.procon.dip.embedding.model.EmbeddingPrefixMode;
+import jakarta.persistence.Column;
+import jakarta.persistence.Entity;
+import jakarta.persistence.EnumType;
+import jakarta.persistence.Enumerated;
+import jakarta.persistence.GeneratedValue;
+import jakarta.persistence.GenerationType;
+import jakarta.persistence.Id;
+import jakarta.persistence.Index;
+import jakarta.persistence.PrePersist;
+import jakarta.persistence.PreUpdate;
+import jakarta.persistence.Table;
+import java.time.OffsetDateTime;
+import java.util.UUID;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+@Entity
+@Table(schema = SchemaNames.DOC, name = "doc_embedding_prefix_profile", indexes = {
+        @Index(name = "idx_doc_embedding_prefix_profile_code", columnList = "code", unique = true),
+        @Index(name = "idx_doc_embedding_prefix_profile_mode", columnList = "prefix_mode")
+})
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class DocumentEmbeddingPrefixProfile {
+
+    @Id
+    @GeneratedValue(strategy = GenerationType.UUID)
+    private UUID id;
+
+    @Column(name = "code", nullable = false, length = 128, unique = true)
+    private String code;
+
+    @Enumerated(EnumType.STRING)
+    @Column(name = "prefix_mode", nullable = false, length = 32)
+    private EmbeddingPrefixMode prefixMode;
+
+    @Column(name = "prefix_text", nullable = false, columnDefinition = "TEXT")
+    @Builder.Default
+    private String prefixText = "";
+
+    @Builder.Default
+    @Column(name = "created_at", nullable = false, updatable = false)
+    private OffsetDateTime createdAt = OffsetDateTime.now();
+
+    @Builder.Default
+    @Column(name = "updated_at", nullable = false)
+    private OffsetDateTime updatedAt = OffsetDateTime.now();
+
+    @PrePersist
+    protected void onCreate() {
+        createdAt = OffsetDateTime.now();
+        updatedAt = OffsetDateTime.now();
+        if (prefixText == null) {
+            prefixText = "";
+        }
+    }
+
+    @PreUpdate
+    protected void onUpdate() {
+        updatedAt = OffsetDateTime.now();
+        if (prefixText == null) {
+            prefixText = "";
+        }
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingPrefixProfileRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingPrefixProfileRepository.java
new file mode 100644
index 0000000..431a075
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingPrefixProfileRepository.java
@@ -0,0 +1,11 @@
+package at.procon.dip.domain.document.repository;
+
+import at.procon.dip.domain.document.entity.DocumentEmbeddingPrefixProfile;
+import java.util.Optional;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+public interface DocumentEmbeddingPrefixProfileRepository extends JpaRepository<DocumentEmbeddingPrefixProfile, UUID> {
+
+    Optional<DocumentEmbeddingPrefixProfile> findByCode(String code);
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java
index 977df13..7c8d1f7 100644
--- a/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java
@@ -2,7 +2,6 @@ package at.procon.dip.domain.document.repository;

 import at.procon.dip.domain.document.EmbeddingStatus;
 import at.procon.dip.domain.document.entity.DocumentEmbedding;
-import at.procon.dip.embedding.model.EmbeddingPrefixMode;
 import java.time.OffsetDateTime;
 import java.util.List;
 import java.util.Optional;
@@ -33,25 +32,17 @@ public interface DocumentEmbeddingRepository extends JpaRepository<DocumentEmbedding, UUID> {
     Optional<DocumentEmbedding> findDetailedById(@Param("embeddingId") UUID embeddingId);

-    default int
updateEmbeddingVector(@Param("id") UUID id,
-                              @Param("vectorData") float[] vectorData,
-                              @Param("tokenCount") Integer tokenCount,
-                              @Param("dimensions") Integer dimensions) {
-        return updateEmbeddingVector(id, vectorData, tokenCount, dimensions, EmbeddingPrefixMode.OFF.name(), null);
-    }
-
     @Modifying
     @Query(value = "UPDATE doc.doc_embedding SET embedding_vector = CAST(:vectorData AS vector), " +
             "embedding_status = 'COMPLETED', embedded_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP, " +
             "error_message = NULL, token_count = :tokenCount, embedding_dimensions = :dimensions, " +
-            "prefix_mode = :prefixMode, applied_prefix = :appliedPrefix WHERE id = :id",
+            "prefix_profile_id = :prefixProfileId WHERE id = :id",
             nativeQuery = true)
     int updateEmbeddingVector(@Param("id") UUID id,
                               @Param("vectorData") float[] vectorData,
                               @Param("tokenCount") Integer tokenCount,
                               @Param("dimensions") Integer dimensions,
-                              @Param("prefixMode") String prefixMode,
-                              @Param("appliedPrefix") String appliedPrefix);
+                              @Param("prefixProfileId") UUID prefixProfileId);

     @Modifying
     @Query("UPDATE DocumentEmbedding e SET e.embeddingStatus = :status, e.errorMessage = :errorMessage, " +
diff --git a/src/main/java/at/procon/dip/domain/time/leitstand/source/JdbcLeitstandTimeSourceClient.java b/src/main/java/at/procon/dip/domain/time/leitstand/source/JdbcLeitstandTimeSourceClient.java
index 35f9ca3..9b54cfa 100644
--- a/src/main/java/at/procon/dip/domain/time/leitstand/source/JdbcLeitstandTimeSourceClient.java
+++ b/src/main/java/at/procon/dip/domain/time/leitstand/source/JdbcLeitstandTimeSourceClient.java
@@ -9,12 +9,18 @@ import java.time.OffsetDateTime;
 import java.time.ZoneId;
 import java.util.List;
 import java.util.Map;
+
+import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
+import at.procon.dip.runtime.config.RuntimeMode;
 import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
 import org.springframework.jdbc.core.RowMapper;
 import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
 import org.springframework.stereotype.Component;

 @Component
+@ConditionalOnRuntimeMode(RuntimeMode.NEW)
+@ConditionalOnProperty(prefix = "dip.time.leitstand", name = "enabled", havingValue = "true")
 public class JdbcLeitstandTimeSourceClient implements LeitstandTimeSourceClient {

     private final TimeDomainProperties properties;
diff --git a/src/main/java/at/procon/dip/domain/time/service/TimeEntryRepresentationMaterializationService.java b/src/main/java/at/procon/dip/domain/time/service/TimeEntryRepresentationMaterializationService.java
index 1f18a4d..3d7a1b3 100644
--- a/src/main/java/at/procon/dip/domain/time/service/TimeEntryRepresentationMaterializationService.java
+++ b/src/main/java/at/procon/dip/domain/time/service/TimeEntryRepresentationMaterializationService.java
@@ -62,10 +62,11 @@ public class TimeEntryRepresentationMaterializationService {
                 || !equalsNullable(projection.getLanguageCode(), existing.get().getLanguageCode())
                 || !BUILDER_KEY.equals(existing.get().getBuilderKey());

+        Document finalDocument = document;
         DocumentTextRepresentation semantic = existing
                 .map(found -> changed ? updateRepresentation(found, projection) : found)
                 .orElseGet(() -> documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
-                        document.getId(),
+                        finalDocument.getId(),
                         null,
                         RepresentationType.SEMANTIC_TEXT,
                         BUILDER_KEY,
diff --git a/src/main/java/at/procon/dip/embedding/service/EmbeddingPersistenceService.java b/src/main/java/at/procon/dip/embedding/service/EmbeddingPersistenceService.java
index c8ac9af..f2bf3f2 100644
--- a/src/main/java/at/procon/dip/embedding/service/EmbeddingPersistenceService.java
+++ b/src/main/java/at/procon/dip/embedding/service/EmbeddingPersistenceService.java
@@ -24,6 +24,7 @@ public class EmbeddingPersistenceService {
     private final DocumentEmbeddingService documentEmbeddingService;
     private final DocumentEmbeddingRepository embeddingRepository;
     private final EmbeddingModelCatalogService modelCatalogService;
+    private final EmbeddingPrefixProfileService embeddingPrefixProfileService;

     public DocumentEmbedding ensurePending(UUID representationId, String modelKey) {
         DocumentTextRepresentation representation = representationRepository.findById(representationId)
@@ -59,13 +60,13 @@ public class EmbeddingPersistenceService {
         if (vector == null || vector.length == 0) {
             throw new IllegalArgumentException("Embedding vector must not be empty");
         }
+        UUID prefixProfileId = embeddingPrefixProfileService.resolveProfileId(prefixMode, appliedPrefix);
         embeddingRepository.updateEmbeddingVector(
                 embeddingId,
                 vector,
                 tokenCount,
                 vector.length,
-                (prefixMode == null ?
EmbeddingPrefixMode.OFF : prefixMode).name(),
-                appliedPrefix
+                prefixProfileId
         );
     }

diff --git a/src/main/java/at/procon/dip/embedding/service/EmbeddingPrefixProfileService.java b/src/main/java/at/procon/dip/embedding/service/EmbeddingPrefixProfileService.java
new file mode 100644
index 0000000..cb646f6
--- /dev/null
+++ b/src/main/java/at/procon/dip/embedding/service/EmbeddingPrefixProfileService.java
@@ -0,0 +1,96 @@
+package at.procon.dip.embedding.service;
+
+import at.procon.dip.domain.document.entity.DocumentEmbeddingPrefixProfile;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingPrefixProfileRepository;
+import at.procon.dip.embedding.model.EmbeddingPrefixMode;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.HexFormat;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+import org.springframework.transaction.support.TransactionSynchronization;
+import org.springframework.transaction.support.TransactionSynchronizationManager;
+
+/**
+ * Resolves the {@link DocumentEmbeddingPrefixProfile} for a prefix mode and prefix text,
+ * creating the profile row on first use and remembering its id in an in-memory cache.
+ */
+@Service
+@RequiredArgsConstructor
+public class EmbeddingPrefixProfileService {
+
+    private final DocumentEmbeddingPrefixProfileRepository repository;
+
+    /** Maps profile code to profile id; filled only after the resolving transaction commits. */
+    private final ConcurrentMap<String, UUID> idCache = new ConcurrentHashMap<>();
+
+    /**
+     * Returns the id of the profile matching the given mode and prefix, inserting it if missing.
+     *
+     * @param prefixMode    prefix mode; {@code null} is treated as {@link EmbeddingPrefixMode#OFF}
+     * @param appliedPrefix prefix text; {@code null} is treated as the empty string
+     * @return id of the existing or newly saved profile
+     */
+    @Transactional
+    public UUID resolveProfileId(EmbeddingPrefixMode prefixMode, String appliedPrefix) {
+        EmbeddingPrefixMode normalizedMode = prefixMode == null ? EmbeddingPrefixMode.OFF : prefixMode;
+        String normalizedPrefix = appliedPrefix == null ? "" : appliedPrefix;
+        String code = buildCode(normalizedMode, normalizedPrefix);
+
+        UUID cached = idCache.get(code);
+        if (cached != null) {
+            return cached;
+        }
+
+        UUID resolved = repository.findByCode(code)
+                .map(DocumentEmbeddingPrefixProfile::getId)
+                .orElseGet(() -> repository.save(
+                        DocumentEmbeddingPrefixProfile.builder()
+                                .code(code)
+                                .prefixMode(normalizedMode)
+                                .prefixText(normalizedPrefix)
+                                .build()
+                ).getId());
+        cacheAfterCommit(code, resolved);
+        return resolved;
+    }
+
+    /**
+     * Caches the id only once the surrounding transaction has committed. Caching earlier would
+     * keep the id of a rolled-back insert and hand out a dangling foreign key on later calls.
+     */
+    private void cacheAfterCommit(String code, UUID profileId) {
+        if (!TransactionSynchronizationManager.isSynchronizationActive()) {
+            // No transaction to wait for (e.g. invoked without the transactional proxy).
+            idCache.putIfAbsent(code, profileId);
+            return;
+        }
+        TransactionSynchronizationManager.registerSynchronization(new TransactionSynchronization() {
+            @Override
+            public void afterCommit() {
+                idCache.putIfAbsent(code, profileId);
+            }
+        });
+    }
+
+    /** Builds the profile code {@code <MODE>:<lower-case hex SHA-256 of the UTF-8 prefix text>}. */
+    static String buildCode(EmbeddingPrefixMode prefixMode, String prefixText) {
+        return prefixMode.name() + ":" + sha256Hex(prefixText == null ? "" : prefixText);
+    }
+
+    /** Returns the SHA-256 digest of {@code value} (UTF-8 encoded) as a hex string. */
+    private static String sha256Hex(String value) {
+        try {
+            MessageDigest digest = MessageDigest.getInstance("SHA-256");
+            byte[] bytes = digest.digest(value.getBytes(StandardCharsets.UTF_8));
+            return HexFormat.of().formatHex(bytes);
+        } catch (NoSuchAlgorithmException e) {
+            throw new IllegalStateException("SHA-256 digest not available", e);
+        }
+    }
+}
diff --git a/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java b/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java
index f2f646d..a575014 100644
--- a/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java
+++ b/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java
@@ -95,7 +95,7 @@ public class DocumentEmbeddingProcessingService {
         }

         String vectorString = vectorizationService.floatArrayToVectorString(embedding);
-        embeddingRepository.updateEmbeddingVector(embeddingId, embedding, tokenCount, embedding.length);
+        embeddingRepository.updateEmbeddingVector(embeddingId, embedding, tokenCount, embedding.length, null);
         documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.INDEXED);

         if (properties.isDualWriteLegacyTedVectors()) {
diff --git
a/src/main/resources/db/migration/V28__doc_embedding_prefix_profile.sql b/src/main/resources/db/migration/V28__doc_embedding_prefix_profile.sql
new file mode 100644
index 0000000..62108f7
--- /dev/null
+++ b/src/main/resources/db/migration/V28__doc_embedding_prefix_profile.sql
@@ -0,0 +1,71 @@
+-- Replaces the per-row prefix columns of doc.doc_embedding with a shared prefix profile table.
+CREATE TABLE IF NOT EXISTS doc.doc_embedding_prefix_profile (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    code VARCHAR(128) NOT NULL UNIQUE,
+    prefix_mode VARCHAR(32) NOT NULL,
+    prefix_text TEXT NOT NULL DEFAULT '',
+    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+ALTER TABLE doc.doc_embedding
+    ADD COLUMN IF NOT EXISTS prefix_profile_id UUID;
+
+-- Backfill profiles from the legacy columns. The code must be built exactly like
+-- EmbeddingPrefixProfileService.buildCode: <MODE>:<lower-case hex SHA-256 of the UTF-8 prefix text>,
+-- otherwise the application will not find the migrated rows and will insert duplicates.
+DO $$
+BEGIN
+    IF EXISTS (
+        SELECT 1
+        FROM information_schema.columns c
+        WHERE c.table_schema = 'doc'
+          AND c.table_name = 'doc_embedding'
+          AND c.column_name = 'prefix_mode'
+    ) THEN
+        EXECUTE $sql$
+            INSERT INTO doc.doc_embedding_prefix_profile (code, prefix_mode, prefix_text)
+            SELECT DISTINCT
+                COALESCE(de.prefix_mode, 'OFF') || ':' || encode(sha256(convert_to(COALESCE(de.applied_prefix, ''), 'UTF8')), 'hex'),
+                COALESCE(de.prefix_mode, 'OFF'),
+                COALESCE(de.applied_prefix, '')
+            FROM doc.doc_embedding de
+            ON CONFLICT (code) DO NOTHING
+        $sql$;
+
+        EXECUTE $sql$
+            UPDATE doc.doc_embedding de
+            SET prefix_profile_id = pp.id
+            FROM doc.doc_embedding_prefix_profile pp
+            WHERE de.prefix_profile_id IS NULL
+              AND pp.code = COALESCE(de.prefix_mode, 'OFF') || ':' || encode(sha256(convert_to(COALESCE(de.applied_prefix, ''), 'UTF8')), 'hex')
+        $sql$;
+    END IF;
+END $$;
+
+-- Add the foreign key only if it does not exist yet.
+DO $$
+BEGIN
+    IF NOT EXISTS (
+        SELECT 1
+        FROM pg_constraint
+        WHERE conname = 'fk_doc_embedding_prefix_profile'
+    ) THEN
+        ALTER TABLE doc.doc_embedding
+            ADD CONSTRAINT fk_doc_embedding_prefix_profile
+            FOREIGN KEY (prefix_profile_id)
+            REFERENCES doc.doc_embedding_prefix_profile(id);
+    END IF;
+END $$;
+
+CREATE INDEX IF NOT EXISTS idx_doc_embedding_prefix_profile
+    ON doc.doc_embedding(prefix_profile_id);
+
+-- The legacy prefix index and columns are superseded by prefix_profile_id.
+DROP INDEX IF EXISTS doc.idx_doc_embedding_prefix_mode;
+
+ALTER TABLE doc.doc_embedding
+    DROP COLUMN IF EXISTS applied_prefix;
+
+ALTER TABLE doc.doc_embedding
+    DROP COLUMN IF EXISTS prefix_mode;