improved embedding model prefix handling using profiles

This commit is contained in:
trifonovt 2026-04-22 12:36:03 +02:00
parent 6ca9936b87
commit 12f0b0604b
12 changed files with 233 additions and 28 deletions

View File

@ -93,7 +93,6 @@ Supported modes:
- `EXTERNAL` - DIP assumes the external service applies the prefixing itself
For persisted document embeddings, the produced prefix provenance is stored in `doc.doc_embedding`:
- `prefix_mode`
- `applied_prefix`
- `prefix_profile_id` (resolved via `doc.doc_embedding_prefix_profile`)
This makes it possible to identify whether indexed vectors were created with raw text, DIP-side prefixing, or externally handled prefixing before deciding on re-embedding.

View File

@ -16,8 +16,8 @@ import org.springframework.scheduling.annotation.EnableAsync;
*/
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
@EnableAsync
@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity", "at.procon.dip.migration.entity", "at.procon.dip.domain.time.entity"})
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository", "at.procon.dip.migration.repository", "at.procon.dip.domain.time.repository"})
@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity", "at.procon.dip.migration.entity"})
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository", "at.procon.dip.migration.repository"})
public class DocumentIntelligencePlatformApplication {
public static void main(String[] args) {

View File

@ -2,7 +2,6 @@ package at.procon.dip.domain.document.entity;
import at.procon.dip.architecture.SchemaNames;
import at.procon.dip.domain.document.EmbeddingStatus;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import jakarta.persistence.Column;
import jakarta.persistence.Entity;
import jakarta.persistence.EnumType;
@ -39,7 +38,7 @@ import lombok.Setter;
@Index(name = "idx_doc_embedding_model", columnList = "model_id"),
@Index(name = "idx_doc_embedding_status", columnList = "embedding_status"),
@Index(name = "idx_doc_embedding_embedded_at", columnList = "embedded_at"),
@Index(name = "idx_doc_embedding_prefix_mode", columnList = "prefix_mode")
@Index(name = "idx_doc_embedding_prefix_profile", columnList = "prefix_profile_id")
})
@Getter
@Setter
@ -81,13 +80,9 @@ public class DocumentEmbedding {
@Column(name = "embedded_at")
private OffsetDateTime embeddedAt;
@Enumerated(EnumType.STRING)
@Column(name = "prefix_mode", nullable = false, length = 32)
@Builder.Default
private EmbeddingPrefixMode prefixMode = EmbeddingPrefixMode.OFF;
@Column(name = "applied_prefix", length = 64)
private String appliedPrefix;
@ManyToOne(fetch = FetchType.LAZY)
@JoinColumn(name = "prefix_profile_id")
private DocumentEmbeddingPrefixProfile prefixProfile;
@Builder.Default

View File

@ -0,0 +1,75 @@
package at.procon.dip.domain.document.entity;
import at.procon.dip.architecture.SchemaNames;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import jakarta.persistence.Column;
import jakarta.persistence.Entity;
import jakarta.persistence.EnumType;
import jakarta.persistence.Enumerated;
import jakarta.persistence.GeneratedValue;
import jakarta.persistence.GenerationType;
import jakarta.persistence.Id;
import jakarta.persistence.Index;
import jakarta.persistence.PrePersist;
import jakarta.persistence.PreUpdate;
import jakarta.persistence.Table;
import java.time.OffsetDateTime;
import java.util.UUID;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
/**
 * JPA entity for one row of {@code doc_embedding_prefix_profile}: a distinct
 * combination of {@link EmbeddingPrefixMode} and prefix text, identified by a
 * unique {@code code}.
 *
 * <p>{@code prefixText} is never persisted as {@code null}; the lifecycle callbacks
 * replace a {@code null} value with the empty string to satisfy the NOT NULL column.
 */
@Entity
@Table(schema = SchemaNames.DOC, name = "doc_embedding_prefix_profile", indexes = {
        @Index(name = "idx_doc_embedding_prefix_profile_code", columnList = "code", unique = true),
        @Index(name = "idx_doc_embedding_prefix_profile_mode", columnList = "prefix_mode")
})
@Getter
@Setter
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class DocumentEmbeddingPrefixProfile {

    /** Surrogate primary key, generated as a UUID. */
    @Id
    @GeneratedValue(strategy = GenerationType.UUID)
    private UUID id;

    /** Unique business key of the profile (at most 128 characters). */
    @Column(name = "code", nullable = false, length = 128, unique = true)
    private String code;

    /** Prefix mode this profile stands for, stored by enum name. */
    @Enumerated(EnumType.STRING)
    @Column(name = "prefix_mode", nullable = false, length = 32)
    private EmbeddingPrefixMode prefixMode;

    /** Prefix text of this profile; empty string when there is none. */
    @Column(name = "prefix_text", nullable = false, columnDefinition = "TEXT")
    @Builder.Default
    private String prefixText = "";

    /** Creation timestamp; overwritten in {@link #onCreate()} and not updatable afterwards. */
    @Builder.Default
    @Column(name = "created_at", nullable = false, updatable = false)
    private OffsetDateTime createdAt = OffsetDateTime.now();

    /** Last-modification timestamp; refreshed on every persist and update. */
    @Builder.Default
    @Column(name = "updated_at", nullable = false)
    private OffsetDateTime updatedAt = OffsetDateTime.now();

    /**
     * Stamps both audit columns before the first insert and normalises the prefix text.
     */
    @PrePersist
    protected void onCreate() {
        // Read the clock once so a new row starts with created_at == updated_at.
        OffsetDateTime now = OffsetDateTime.now();
        createdAt = now;
        updatedAt = now;
        normalizePrefixText();
    }

    /**
     * Refreshes {@code updatedAt} before an update and normalises the prefix text.
     */
    @PreUpdate
    protected void onUpdate() {
        updatedAt = OffsetDateTime.now();
        normalizePrefixText();
    }

    /** Replaces a {@code null} prefix text with the empty string (column is NOT NULL). */
    private void normalizePrefixText() {
        if (prefixText == null) {
            prefixText = "";
        }
    }
}

View File

@ -0,0 +1,11 @@
package at.procon.dip.domain.document.repository;
import at.procon.dip.domain.document.entity.DocumentEmbeddingPrefixProfile;
import java.util.Optional;
import java.util.UUID;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link DocumentEmbeddingPrefixProfile} entities,
 * keyed by their UUID primary key.
 */
public interface DocumentEmbeddingPrefixProfileRepository extends JpaRepository<DocumentEmbeddingPrefixProfile, UUID> {
/**
 * Looks up a profile by the value of its {@code code} attribute.
 *
 * @param code the profile code to search for
 * @return the matching profile, or an empty {@link Optional} when none exists
 */
Optional<DocumentEmbeddingPrefixProfile> findByCode(String code);
}

View File

@ -2,7 +2,6 @@ package at.procon.dip.domain.document.repository;
import at.procon.dip.domain.document.EmbeddingStatus;
import at.procon.dip.domain.document.entity.DocumentEmbedding;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import java.time.OffsetDateTime;
import java.util.List;
import java.util.Optional;
@ -33,25 +32,17 @@ public interface DocumentEmbeddingRepository extends JpaRepository<DocumentEmbed
"WHERE e.id = :embeddingId")
Optional<DocumentEmbedding> findDetailedById(@Param("embeddingId") UUID embeddingId);
default int updateEmbeddingVector(@Param("id") UUID id,
@Param("vectorData") float[] vectorData,
@Param("tokenCount") Integer tokenCount,
@Param("dimensions") Integer dimensions) {
return updateEmbeddingVector(id, vectorData, tokenCount, dimensions, EmbeddingPrefixMode.OFF.name(), null);
}
@Modifying
@Query(value = "UPDATE doc.doc_embedding SET embedding_vector = CAST(:vectorData AS vector), " +
"embedding_status = 'COMPLETED', embedded_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP, " +
"error_message = NULL, token_count = :tokenCount, embedding_dimensions = :dimensions, " +
"prefix_mode = :prefixMode, applied_prefix = :appliedPrefix WHERE id = :id",
"prefix_profile_id = :prefixProfileId WHERE id = :id",
nativeQuery = true)
int updateEmbeddingVector(@Param("id") UUID id,
@Param("vectorData") float[] vectorData,
@Param("tokenCount") Integer tokenCount,
@Param("dimensions") Integer dimensions,
@Param("prefixMode") String prefixMode,
@Param("appliedPrefix") String appliedPrefix);
@Param("prefixProfileId") UUID prefixProfileId);
@Modifying
@Query("UPDATE DocumentEmbedding e SET e.embeddingStatus = :status, e.errorMessage = :errorMessage, " +

View File

@ -9,12 +9,18 @@ import java.time.OffsetDateTime;
import java.time.ZoneId;
import java.util.List;
import java.util.Map;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.jdbc.core.RowMapper;
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
import org.springframework.stereotype.Component;
@Component
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@ConditionalOnProperty(prefix = "dip.time.leitstand", name = "enabled", havingValue = "true")
public class JdbcLeitstandTimeSourceClient implements LeitstandTimeSourceClient {
private final TimeDomainProperties properties;

View File

@ -62,10 +62,11 @@ public class TimeEntryRepresentationMaterializationService {
|| !equalsNullable(projection.getLanguageCode(), existing.get().getLanguageCode())
|| !BUILDER_KEY.equals(existing.get().getBuilderKey());
Document finalDocument = document;
DocumentTextRepresentation semantic = existing
.map(found -> changed ? updateRepresentation(found, projection) : found)
.orElseGet(() -> documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
document.getId(),
finalDocument.getId(),
null,
RepresentationType.SEMANTIC_TEXT,
BUILDER_KEY,

View File

@ -24,6 +24,7 @@ public class EmbeddingPersistenceService {
private final DocumentEmbeddingService documentEmbeddingService;
private final DocumentEmbeddingRepository embeddingRepository;
private final EmbeddingModelCatalogService modelCatalogService;
private final EmbeddingPrefixProfileService embeddingPrefixProfileService;
public DocumentEmbedding ensurePending(UUID representationId, String modelKey) {
DocumentTextRepresentation representation = representationRepository.findById(representationId)
@ -59,13 +60,13 @@ public class EmbeddingPersistenceService {
if (vector == null || vector.length == 0) {
throw new IllegalArgumentException("Embedding vector must not be empty");
}
UUID prefixProfileId = embeddingPrefixProfileService.resolveProfileId(prefixMode, appliedPrefix);
embeddingRepository.updateEmbeddingVector(
embeddingId,
vector,
tokenCount,
vector.length,
(prefixMode == null ? EmbeddingPrefixMode.OFF : prefixMode).name(),
appliedPrefix
prefixProfileId
);
}

View File

@ -0,0 +1,61 @@
package at.procon.dip.embedding.service;
import at.procon.dip.domain.document.entity.DocumentEmbeddingPrefixProfile;
import at.procon.dip.domain.document.repository.DocumentEmbeddingPrefixProfileRepository;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HexFormat;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
/**
 * Resolves the {@link DocumentEmbeddingPrefixProfile} id that represents a given
 * combination of prefix mode and applied prefix, creating the profile row on first use.
 *
 * <p>Profiles are identified by a code of the form {@code <MODE>:<sha256-hex(prefix)>}.
 * Resolved ids are kept in a process-local cache so repeated lookups for the same
 * combination do not hit the database.
 */
@Service
@RequiredArgsConstructor
public class EmbeddingPrefixProfileService {

    private final DocumentEmbeddingPrefixProfileRepository repository;

    /** Process-local cache from profile code to profile id; entries are never evicted. */
    private final ConcurrentMap<String, UUID> idCache = new ConcurrentHashMap<>();

    /**
     * Returns the id of the profile for the given mode and prefix, inserting the
     * profile if it does not exist yet.
     *
     * @param prefixMode    prefix mode; {@code null} is treated as {@link EmbeddingPrefixMode#OFF}
     * @param appliedPrefix prefix text; {@code null} is treated as the empty string
     * @return id of the existing or newly created profile
     */
    @Transactional
    public UUID resolveProfileId(EmbeddingPrefixMode prefixMode, String appliedPrefix) {
        EmbeddingPrefixMode normalizedMode = prefixMode == null ? EmbeddingPrefixMode.OFF : prefixMode;
        String normalizedPrefix = appliedPrefix == null ? "" : appliedPrefix;
        String code = buildCode(normalizedMode, normalizedPrefix);

        UUID cached = idCache.get(code);
        if (cached != null) {
            return cached;
        }

        // saveAndFlush (instead of save) executes the INSERT right here, so a violation of
        // the unique constraint on "code" - e.g. a concurrent creator of the same profile -
        // is raised before the id is cached or handed to the caller. With a deferred insert
        // the failure would only show up at a later flush/commit, after the id of a row
        // that never got stored had already been cached.
        UUID resolved = repository.findByCode(code)
                .map(DocumentEmbeddingPrefixProfile::getId)
                .orElseGet(() -> repository.saveAndFlush(
                        DocumentEmbeddingPrefixProfile.builder()
                                .code(code)
                                .prefixMode(normalizedMode)
                                .prefixText(normalizedPrefix)
                                .build()
                ).getId());

        // NOTE(review): if this method joins an enclosing transaction that is rolled back
        // later, the cached id may reference a row that was never committed. Consider
        // populating the cache only after commit - confirm against the callers.
        idCache.putIfAbsent(code, resolved);
        return resolved;
    }

    /**
     * Builds the profile code {@code <MODE>:<sha256-hex(prefixText)>}.
     *
     * @param prefixMode prefix mode whose enum name forms the first part; must not be {@code null}
     * @param prefixText prefix text to hash; {@code null} is hashed as the empty string
     * @return the profile code
     */
    static String buildCode(EmbeddingPrefixMode prefixMode, String prefixText) {
        return prefixMode.name() + ":" + sha256Hex(prefixText == null ? "" : prefixText);
    }

    /**
     * Returns the lowercase hexadecimal SHA-256 digest of the UTF-8 bytes of {@code value}.
     *
     * @throws IllegalStateException if the JVM provides no SHA-256 implementation
     */
    private static String sha256Hex(String value) {
        try {
            MessageDigest digest = MessageDigest.getInstance("SHA-256");
            byte[] bytes = digest.digest(value.getBytes(StandardCharsets.UTF_8));
            return HexFormat.of().formatHex(bytes);
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("SHA-256 digest not available", e);
        }
    }
}

View File

@ -95,7 +95,7 @@ public class DocumentEmbeddingProcessingService {
}
String vectorString = vectorizationService.floatArrayToVectorString(embedding);
embeddingRepository.updateEmbeddingVector(embeddingId, embedding, tokenCount, embedding.length);
embeddingRepository.updateEmbeddingVector(embeddingId, embedding, tokenCount, embedding.length, null);
documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.INDEXED);
if (properties.isDualWriteLegacyTedVectors()) {

View File

@ -0,0 +1,65 @@
-- Lookup table holding one row per distinct (prefix mode, prefix text) combination.
-- "code" is the unique business key used to find a profile.
CREATE TABLE IF NOT EXISTS doc.doc_embedding_prefix_profile (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
code VARCHAR(128) NOT NULL UNIQUE,
prefix_mode VARCHAR(32) NOT NULL,
prefix_text TEXT NOT NULL DEFAULT '',
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Nullable reference from each embedding to its prefix profile; the foreign key
-- constraint is added further below, after the backfill.
ALTER TABLE doc.doc_embedding
ADD COLUMN IF NOT EXISTS prefix_profile_id UUID;
-- Backfill: only runs while the legacy prefix_mode column still exists on doc.doc_embedding.
-- 1) creates one profile per distinct (prefix_mode, applied_prefix) pair,
-- 2) links every embedding that has no profile yet to the matching profile.
--
-- The profile code must be built exactly like the application builds it
-- (<MODE>:<lowercase hex SHA-256 of the UTF-8 prefix text>); otherwise the
-- application would not find the backfilled rows by code and would insert
-- duplicate profiles for the same mode/prefix combination.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM information_schema.columns c
        WHERE c.table_schema = 'doc'
          AND c.table_name = 'doc_embedding'
          AND c.column_name = 'prefix_mode'
    ) THEN
        -- Dynamic SQL so the statements are only parsed when the legacy columns exist.
        EXECUTE $sql$
            INSERT INTO doc.doc_embedding_prefix_profile (code, prefix_mode, prefix_text)
            SELECT DISTINCT
                COALESCE(de.prefix_mode, 'OFF') || ':' ||
                    encode(sha256(convert_to(COALESCE(de.applied_prefix, ''), 'UTF8')), 'hex'),
                COALESCE(de.prefix_mode, 'OFF'),
                COALESCE(de.applied_prefix, '')
            FROM doc.doc_embedding de
            ON CONFLICT (code) DO NOTHING
        $sql$;
        EXECUTE $sql$
            UPDATE doc.doc_embedding de
            SET prefix_profile_id = pp.id
            FROM doc.doc_embedding_prefix_profile pp
            WHERE de.prefix_profile_id IS NULL
              AND pp.code = COALESCE(de.prefix_mode, 'OFF') || ':' ||
                    encode(sha256(convert_to(COALESCE(de.applied_prefix, ''), 'UTF8')), 'hex')
        $sql$;
    END IF;
END $$;
-- Adds the foreign key from doc.doc_embedding.prefix_profile_id to the profile table,
-- unless that table already carries a constraint of this name.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'fk_doc_embedding_prefix_profile'
          -- Constraint names are not unique across tables, so restrict the lookup to
          -- doc.doc_embedding; a same-named constraint elsewhere must not skip this step.
          AND conrelid = 'doc.doc_embedding'::regclass
    ) THEN
        ALTER TABLE doc.doc_embedding
            ADD CONSTRAINT fk_doc_embedding_prefix_profile
            FOREIGN KEY (prefix_profile_id)
            REFERENCES doc.doc_embedding_prefix_profile(id);
    END IF;
END $$;
-- Index supporting lookups of embeddings by their prefix profile.
CREATE INDEX IF NOT EXISTS idx_doc_embedding_prefix_profile
ON doc.doc_embedding(prefix_profile_id);
-- Remove the legacy per-row prefix columns and their index.
-- These drops are destructive: the values remain available only through the
-- profile rows created by the backfill above.
DROP INDEX IF EXISTS doc.idx_doc_embedding_prefix_mode;
ALTER TABLE doc.doc_embedding
DROP COLUMN IF EXISTS applied_prefix;
ALTER TABLE doc.doc_embedding
DROP COLUMN IF EXISTS prefix_mode;