improved embedding model prefix handling using profiles
This commit is contained in:
parent
6ca9936b87
commit
12f0b0604b
|
|
@ -93,7 +93,6 @@ Supported modes:
|
|||
- `EXTERNAL` - DIP assumes the external service applies the prefixing itself
|
||||
|
||||
For persisted document embeddings, the produced prefix provenance is stored in `doc.doc_embedding`:
|
||||
- `prefix_mode`
|
||||
- `applied_prefix`
|
||||
- `prefix_profile_id` (resolved via `DOC.doc_embedding_prefix_profile`)
|
||||
|
||||
This makes it possible to identify whether indexed vectors were created with raw text, DIP-side prefixing, or externally handled prefixing before deciding on re-embedding.
|
||||
|
|
|
|||
|
|
@ -16,8 +16,8 @@ import org.springframework.scheduling.annotation.EnableAsync;
|
|||
*/
|
||||
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
||||
@EnableAsync
|
||||
@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity", "at.procon.dip.migration.entity", "at.procon.dip.domain.time.entity"})
|
||||
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository", "at.procon.dip.migration.repository", "at.procon.dip.domain.time.repository"})
|
||||
@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity", "at.procon.dip.migration.entity"})
|
||||
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository", "at.procon.dip.migration.repository"})
|
||||
public class DocumentIntelligencePlatformApplication {
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ package at.procon.dip.domain.document.entity;
|
|||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
|
|
@ -39,7 +38,7 @@ import lombok.Setter;
|
|||
@Index(name = "idx_doc_embedding_model", columnList = "model_id"),
|
||||
@Index(name = "idx_doc_embedding_status", columnList = "embedding_status"),
|
||||
@Index(name = "idx_doc_embedding_embedded_at", columnList = "embedded_at"),
|
||||
@Index(name = "idx_doc_embedding_prefix_mode", columnList = "prefix_mode")
|
||||
@Index(name = "idx_doc_embedding_prefix_profile", columnList = "prefix_profile_id")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
|
|
@ -81,13 +80,9 @@ public class DocumentEmbedding {
|
|||
@Column(name = "embedded_at")
|
||||
private OffsetDateTime embeddedAt;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "prefix_mode", nullable = false, length = 32)
|
||||
@Builder.Default
|
||||
private EmbeddingPrefixMode prefixMode = EmbeddingPrefixMode.OFF;
|
||||
|
||||
@Column(name = "applied_prefix", length = 64)
|
||||
private String appliedPrefix;
|
||||
@ManyToOne(fetch = FetchType.LAZY)
|
||||
@JoinColumn(name = "prefix_profile_id")
|
||||
private DocumentEmbeddingPrefixProfile prefixProfile;
|
||||
|
||||
|
||||
@Builder.Default
|
||||
|
|
|
|||
|
|
@ -0,0 +1,75 @@
|
|||
package at.procon.dip.domain.document.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
import jakarta.persistence.Enumerated;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.PreUpdate;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_embedding_prefix_profile", indexes = {
|
||||
@Index(name = "idx_doc_embedding_prefix_profile_code", columnList = "code", unique = true),
|
||||
@Index(name = "idx_doc_embedding_prefix_profile_mode", columnList = "prefix_mode")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class DocumentEmbeddingPrefixProfile {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@Column(name = "code", nullable = false, length = 128, unique = true)
|
||||
private String code;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "prefix_mode", nullable = false, length = 32)
|
||||
private EmbeddingPrefixMode prefixMode;
|
||||
|
||||
@Column(name = "prefix_text", nullable = false, columnDefinition = "TEXT")
|
||||
@Builder.Default
|
||||
private String prefixText = "";
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "updated_at", nullable = false)
|
||||
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
updatedAt = OffsetDateTime.now();
|
||||
if (prefixText == null) {
|
||||
prefixText = "";
|
||||
}
|
||||
}
|
||||
|
||||
@PreUpdate
|
||||
protected void onUpdate() {
|
||||
updatedAt = OffsetDateTime.now();
|
||||
if (prefixText == null) {
|
||||
prefixText = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
package at.procon.dip.domain.document.repository;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingPrefixProfile;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface DocumentEmbeddingPrefixProfileRepository extends JpaRepository<DocumentEmbeddingPrefixProfile, UUID> {
|
||||
|
||||
Optional<DocumentEmbeddingPrefixProfile> findByCode(String code);
|
||||
}
|
||||
|
|
@ -2,7 +2,6 @@ package at.procon.dip.domain.document.repository;
|
|||
|
||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
|
@ -33,25 +32,17 @@ public interface DocumentEmbeddingRepository extends JpaRepository<DocumentEmbed
|
|||
"WHERE e.id = :embeddingId")
|
||||
Optional<DocumentEmbedding> findDetailedById(@Param("embeddingId") UUID embeddingId);
|
||||
|
||||
default int updateEmbeddingVector(@Param("id") UUID id,
|
||||
@Param("vectorData") float[] vectorData,
|
||||
@Param("tokenCount") Integer tokenCount,
|
||||
@Param("dimensions") Integer dimensions) {
|
||||
return updateEmbeddingVector(id, vectorData, tokenCount, dimensions, EmbeddingPrefixMode.OFF.name(), null);
|
||||
}
|
||||
|
||||
@Modifying
|
||||
@Query(value = "UPDATE doc.doc_embedding SET embedding_vector = CAST(:vectorData AS vector), " +
|
||||
"embedding_status = 'COMPLETED', embedded_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP, " +
|
||||
"error_message = NULL, token_count = :tokenCount, embedding_dimensions = :dimensions, " +
|
||||
"prefix_mode = :prefixMode, applied_prefix = :appliedPrefix WHERE id = :id",
|
||||
"prefix_profile_id = :prefixProfileId WHERE id = :id",
|
||||
nativeQuery = true)
|
||||
int updateEmbeddingVector(@Param("id") UUID id,
|
||||
@Param("vectorData") float[] vectorData,
|
||||
@Param("tokenCount") Integer tokenCount,
|
||||
@Param("dimensions") Integer dimensions,
|
||||
@Param("prefixMode") String prefixMode,
|
||||
@Param("appliedPrefix") String appliedPrefix);
|
||||
@Param("prefixProfileId") UUID prefixProfileId);
|
||||
|
||||
@Modifying
|
||||
@Query("UPDATE DocumentEmbedding e SET e.embeddingStatus = :status, e.errorMessage = :errorMessage, " +
|
||||
|
|
|
|||
|
|
@ -9,12 +9,18 @@ import java.time.OffsetDateTime;
|
|||
import java.time.ZoneId;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import org.springframework.beans.factory.annotation.Qualifier;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.jdbc.core.RowMapper;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
@ConditionalOnProperty(prefix = "dip.time.leitstand", name = "enabled", havingValue = "true")
|
||||
public class JdbcLeitstandTimeSourceClient implements LeitstandTimeSourceClient {
|
||||
|
||||
private final TimeDomainProperties properties;
|
||||
|
|
|
|||
|
|
@ -62,10 +62,11 @@ public class TimeEntryRepresentationMaterializationService {
|
|||
|| !equalsNullable(projection.getLanguageCode(), existing.get().getLanguageCode())
|
||||
|| !BUILDER_KEY.equals(existing.get().getBuilderKey());
|
||||
|
||||
Document finalDocument = document;
|
||||
DocumentTextRepresentation semantic = existing
|
||||
.map(found -> changed ? updateRepresentation(found, projection) : found)
|
||||
.orElseGet(() -> documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
|
||||
document.getId(),
|
||||
finalDocument.getId(),
|
||||
null,
|
||||
RepresentationType.SEMANTIC_TEXT,
|
||||
BUILDER_KEY,
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ public class EmbeddingPersistenceService {
|
|||
private final DocumentEmbeddingService documentEmbeddingService;
|
||||
private final DocumentEmbeddingRepository embeddingRepository;
|
||||
private final EmbeddingModelCatalogService modelCatalogService;
|
||||
private final EmbeddingPrefixProfileService embeddingPrefixProfileService;
|
||||
|
||||
public DocumentEmbedding ensurePending(UUID representationId, String modelKey) {
|
||||
DocumentTextRepresentation representation = representationRepository.findById(representationId)
|
||||
|
|
@ -59,13 +60,13 @@ public class EmbeddingPersistenceService {
|
|||
if (vector == null || vector.length == 0) {
|
||||
throw new IllegalArgumentException("Embedding vector must not be empty");
|
||||
}
|
||||
UUID prefixProfileId = embeddingPrefixProfileService.resolveProfileId(prefixMode, appliedPrefix);
|
||||
embeddingRepository.updateEmbeddingVector(
|
||||
embeddingId,
|
||||
vector,
|
||||
tokenCount,
|
||||
vector.length,
|
||||
(prefixMode == null ? EmbeddingPrefixMode.OFF : prefixMode).name(),
|
||||
appliedPrefix
|
||||
prefixProfileId
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,61 @@
|
|||
package at.procon.dip.embedding.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingPrefixProfile;
import at.procon.dip.domain.document.repository.DocumentEmbeddingPrefixProfileRepository;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HexFormat;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.transaction.support.TransactionSynchronization;
import org.springframework.transaction.support.TransactionSynchronizationManager;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class EmbeddingPrefixProfileService {
|
||||
|
||||
private final DocumentEmbeddingPrefixProfileRepository repository;
|
||||
private final ConcurrentMap<String, UUID> idCache = new ConcurrentHashMap<>();
|
||||
|
||||
@Transactional
|
||||
public UUID resolveProfileId(EmbeddingPrefixMode prefixMode, String appliedPrefix) {
|
||||
EmbeddingPrefixMode normalizedMode = prefixMode == null ? EmbeddingPrefixMode.OFF : prefixMode;
|
||||
String normalizedPrefix = appliedPrefix == null ? "" : appliedPrefix;
|
||||
String code = buildCode(normalizedMode, normalizedPrefix);
|
||||
|
||||
UUID cached = idCache.get(code);
|
||||
if (cached != null) {
|
||||
return cached;
|
||||
}
|
||||
|
||||
UUID resolved = repository.findByCode(code)
|
||||
.map(DocumentEmbeddingPrefixProfile::getId)
|
||||
.orElseGet(() -> repository.save(
|
||||
DocumentEmbeddingPrefixProfile.builder()
|
||||
.code(code)
|
||||
.prefixMode(normalizedMode)
|
||||
.prefixText(normalizedPrefix)
|
||||
.build()
|
||||
).getId());
|
||||
idCache.putIfAbsent(code, resolved);
|
||||
return resolved;
|
||||
}
|
||||
|
||||
static String buildCode(EmbeddingPrefixMode prefixMode, String prefixText) {
|
||||
return prefixMode.name() + ":" + sha256Hex(prefixText == null ? "" : prefixText);
|
||||
}
|
||||
|
||||
private static String sha256Hex(String value) {
|
||||
try {
|
||||
MessageDigest digest = MessageDigest.getInstance("SHA-256");
|
||||
byte[] bytes = digest.digest(value.getBytes(StandardCharsets.UTF_8));
|
||||
return HexFormat.of().formatHex(bytes);
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new IllegalStateException("SHA-256 digest not available", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -95,7 +95,7 @@ public class DocumentEmbeddingProcessingService {
|
|||
}
|
||||
|
||||
String vectorString = vectorizationService.floatArrayToVectorString(embedding);
|
||||
embeddingRepository.updateEmbeddingVector(embeddingId, embedding, tokenCount, embedding.length);
|
||||
embeddingRepository.updateEmbeddingVector(embeddingId, embedding, tokenCount, embedding.length, null);
|
||||
documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.INDEXED);
|
||||
|
||||
if (properties.isDualWriteLegacyTedVectors()) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,65 @@
|
|||
-- Moves embedding prefix provenance out of doc.doc_embedding (prefix_mode, applied_prefix)
-- into the shared lookup table doc.doc_embedding_prefix_profile, referenced through
-- doc.doc_embedding.prefix_profile_id. Every statement is guarded so the script can be re-run.

CREATE TABLE IF NOT EXISTS doc.doc_embedding_prefix_profile (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    code VARCHAR(128) NOT NULL UNIQUE,
    prefix_mode VARCHAR(32) NOT NULL,
    prefix_text TEXT NOT NULL DEFAULT '',
    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);

ALTER TABLE doc.doc_embedding
    ADD COLUMN IF NOT EXISTS prefix_profile_id UUID;

-- Backfill: create one profile per distinct (prefix_mode, applied_prefix) pair and link the
-- existing embeddings to it. Runs only while the legacy columns still exist; dynamic SQL is
-- used so the block still parses after the columns have been dropped.
--
-- The code is '<MODE>:<lowercase hex SHA-256 of the UTF-8 prefix text>', the same format the
-- application's EmbeddingPrefixProfileService.buildCode produces. Using a different hash here
-- (e.g. md5) would leave the backfilled rows unreachable by the application's code lookup.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM information_schema.columns c
        WHERE c.table_schema = 'doc'
          AND c.table_name = 'doc_embedding'
          AND c.column_name = 'prefix_mode'
    ) THEN
        EXECUTE $sql$
            INSERT INTO doc.doc_embedding_prefix_profile (code, prefix_mode, prefix_text)
            SELECT DISTINCT
                COALESCE(de.prefix_mode, 'OFF') || ':' ||
                    encode(sha256(convert_to(COALESCE(de.applied_prefix, ''), 'UTF8')), 'hex'),
                COALESCE(de.prefix_mode, 'OFF'),
                COALESCE(de.applied_prefix, '')
            FROM doc.doc_embedding de
            ON CONFLICT (code) DO NOTHING
        $sql$;

        EXECUTE $sql$
            UPDATE doc.doc_embedding de
            SET prefix_profile_id = pp.id
            FROM doc.doc_embedding_prefix_profile pp
            WHERE de.prefix_profile_id IS NULL
              AND pp.code = COALESCE(de.prefix_mode, 'OFF') || ':' ||
                    encode(sha256(convert_to(COALESCE(de.applied_prefix, ''), 'UTF8')), 'hex')
        $sql$;
    END IF;
END $$;

-- Add the foreign key once. The lookup is scoped to doc.doc_embedding so that a constraint
-- with the same name on another table or schema does not suppress its creation.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'fk_doc_embedding_prefix_profile'
          AND conrelid = 'doc.doc_embedding'::regclass
    ) THEN
        ALTER TABLE doc.doc_embedding
            ADD CONSTRAINT fk_doc_embedding_prefix_profile
            FOREIGN KEY (prefix_profile_id)
            REFERENCES doc.doc_embedding_prefix_profile(id);
    END IF;
END $$;

CREATE INDEX IF NOT EXISTS idx_doc_embedding_prefix_profile
    ON doc.doc_embedding(prefix_profile_id);

-- Remove the legacy per-row provenance columns and their index now that the data is linked.
DROP INDEX IF EXISTS doc.idx_doc_embedding_prefix_mode;

ALTER TABLE doc.doc_embedding
    DROP COLUMN IF EXISTS applied_prefix;

ALTER TABLE doc.doc_embedding
    DROP COLUMN IF EXISTS prefix_mode;
|
||||
Loading…
Reference in New Issue