embedding model prefixes support

This commit is contained in:
trifonovt 2026-04-21 14:51:53 +02:00
parent 1cd8ebe066
commit 439a06d633
22 changed files with 323 additions and 58 deletions

View File

@ -67,3 +67,33 @@ Notes:
- non-batch-capable models still fall back to single-item execution - non-batch-capable models still fall back to single-item execution
- `parallel-batch-count` controls how many claimed job batches may be started in parallel - `parallel-batch-count` controls how many claimed job batches may be started in parallel
- `execution-batch-size` controls how many texts are sent in one `/vectorize-batch` request inside each claimed job batch - `execution-batch-size` controls how many texts are sent in one `/vectorize-batch` request inside each claimed job batch
## E5 prefix handling
For models such as `intfloat/multilingual-e5-large`, configure prefix handling on the model:
```yaml
dip:
embedding:
models:
e5-default:
provider-config-key: vector-sync-e5
provider-model-key: intfloat/multilingual-e5-large
dimensions: 1024
supports-batch: true
prefix-mode: CLIENT
query-prefix: "query: "
document-prefix: "passage: "
```
Supported modes:
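- `OFF` (default) - DIP sends the raw text unchanged
- `CLIENT` - DIP prepends the configured `query-prefix` (queries) or `document-prefix` (documents) before calling the provider; both prefixes must be non-blank, otherwise startup validation fails
- `EXTERNAL` - DIP sends the raw text and assumes the external service applies the prefixing itself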
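For persisted document embeddings, the prefix provenance is stored in `doc.doc_embedding`:
- `prefix_mode` - the mode that was in effect when the vector was produced (`OFF`, `CLIENT` or `EXTERNAL`)
- `applied_prefix` - the literal prefix DIP prepended; only set in `CLIENT` mode, otherwise `NULL`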
This makes it possible to identify whether indexed vectors were created with raw text, DIP-side prefixing, or externally handled prefixing before deciding on re-embedding.

View File

@ -2,6 +2,7 @@ package at.procon.dip.domain.document.entity;
import at.procon.dip.architecture.SchemaNames; import at.procon.dip.architecture.SchemaNames;
import at.procon.dip.domain.document.EmbeddingStatus; import at.procon.dip.domain.document.EmbeddingStatus;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import jakarta.persistence.Column; import jakarta.persistence.Column;
import jakarta.persistence.Entity; import jakarta.persistence.Entity;
import jakarta.persistence.EnumType; import jakarta.persistence.EnumType;
@ -37,7 +38,8 @@ import lombok.Setter;
@Index(name = "idx_doc_embedding_repr", columnList = "representation_id"), @Index(name = "idx_doc_embedding_repr", columnList = "representation_id"),
@Index(name = "idx_doc_embedding_model", columnList = "model_id"), @Index(name = "idx_doc_embedding_model", columnList = "model_id"),
@Index(name = "idx_doc_embedding_status", columnList = "embedding_status"), @Index(name = "idx_doc_embedding_status", columnList = "embedding_status"),
@Index(name = "idx_doc_embedding_embedded_at", columnList = "embedded_at") @Index(name = "idx_doc_embedding_embedded_at", columnList = "embedded_at"),
@Index(name = "idx_doc_embedding_prefix_mode", columnList = "prefix_mode")
}) })
@Getter @Getter
@Setter @Setter
@ -79,6 +81,15 @@ public class DocumentEmbedding {
@Column(name = "embedded_at") @Column(name = "embedded_at")
private OffsetDateTime embeddedAt; private OffsetDateTime embeddedAt;
@Enumerated(EnumType.STRING)
@Column(name = "prefix_mode", nullable = false, length = 32)
@Builder.Default
private EmbeddingPrefixMode prefixMode = EmbeddingPrefixMode.OFF;
@Column(name = "applied_prefix", length = 64)
private String appliedPrefix;
@Builder.Default @Builder.Default
@Column(name = "created_at", nullable = false, updatable = false) @Column(name = "created_at", nullable = false, updatable = false)
private OffsetDateTime createdAt = OffsetDateTime.now(); private OffsetDateTime createdAt = OffsetDateTime.now();

View File

@ -2,6 +2,7 @@ package at.procon.dip.domain.document.repository;
import at.procon.dip.domain.document.EmbeddingStatus; import at.procon.dip.domain.document.EmbeddingStatus;
import at.procon.dip.domain.document.entity.DocumentEmbedding; import at.procon.dip.domain.document.entity.DocumentEmbedding;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import java.time.OffsetDateTime; import java.time.OffsetDateTime;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
@ -32,15 +33,25 @@ public interface DocumentEmbeddingRepository extends JpaRepository<DocumentEmbed
"WHERE e.id = :embeddingId") "WHERE e.id = :embeddingId")
Optional<DocumentEmbedding> findDetailedById(@Param("embeddingId") UUID embeddingId); Optional<DocumentEmbedding> findDetailedById(@Param("embeddingId") UUID embeddingId);
/**
 * Four-argument overload that stores a completed vector without prefix information.
 *
 * <p>Delegates to the six-argument variant with prefix mode {@code OFF} and no applied prefix,
 * so any prefix provenance already stored on the row is overwritten with those values.
 *
 * @param id         primary key of the embedding row to update
 * @param vectorData embedding vector to store
 * @param tokenCount token count to store, may be {@code null}
 * @param dimensions number of dimensions to store
 * @return the value returned by the six-argument update
 */
default int updateEmbeddingVector(@Param("id") UUID id,
                                  @Param("vectorData") float[] vectorData,
                                  @Param("tokenCount") Integer tokenCount,
                                  @Param("dimensions") Integer dimensions) {
    final String rawTextMode = EmbeddingPrefixMode.OFF.name();
    final String noAppliedPrefix = null;
    return updateEmbeddingVector(id, vectorData, tokenCount, dimensions, rawTextMode, noAppliedPrefix);
}
@Modifying @Modifying
@Query(value = "UPDATE doc.doc_embedding SET embedding_vector = CAST(:vectorData AS vector), " + @Query(value = "UPDATE doc.doc_embedding SET embedding_vector = CAST(:vectorData AS vector), " +
"embedding_status = 'COMPLETED', embedded_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP, " + "embedding_status = 'COMPLETED', embedded_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP, " +
"error_message = NULL, token_count = :tokenCount, embedding_dimensions = :dimensions WHERE id = :id", "error_message = NULL, token_count = :tokenCount, embedding_dimensions = :dimensions, " +
"prefix_mode = :prefixMode, applied_prefix = :appliedPrefix WHERE id = :id",
nativeQuery = true) nativeQuery = true)
int updateEmbeddingVector(@Param("id") UUID id, int updateEmbeddingVector(@Param("id") UUID id,
@Param("vectorData") float[] vectorData, @Param("vectorData") float[] vectorData,
@Param("tokenCount") Integer tokenCount, @Param("tokenCount") Integer tokenCount,
@Param("dimensions") Integer dimensions); @Param("dimensions") Integer dimensions,
@Param("prefixMode") String prefixMode,
@Param("appliedPrefix") String appliedPrefix);
@Modifying @Modifying
@Query("UPDATE DocumentEmbedding e SET e.embeddingStatus = :status, e.errorMessage = :errorMessage, " + @Query("UPDATE DocumentEmbedding e SET e.embeddingStatus = :status, e.errorMessage = :errorMessage, " +

View File

@ -1,6 +1,7 @@
package at.procon.dip.embedding.config; package at.procon.dip.embedding.config;
import at.procon.dip.domain.document.DistanceMetric; import at.procon.dip.domain.document.DistanceMetric;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import java.time.Duration; import java.time.Duration;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
@ -50,6 +51,9 @@ public class EmbeddingProperties {
private boolean supportsBatch = false; private boolean supportsBatch = false;
private Integer maxInputChars; private Integer maxInputChars;
private boolean active = true; private boolean active = true;
private EmbeddingPrefixMode prefixMode = EmbeddingPrefixMode.OFF;
private String queryPrefix = "query: ";
private String documentPrefix = "passage: ";
} }
@Data @Data

View File

@ -0,0 +1,35 @@
package at.procon.dip.embedding.model;
import java.util.List;
/**
 * Applies the input prefix configured on an embedding model (for example the
 * {@code "query: "} / {@code "passage: "} convention) to texts before they are sent to a provider,
 * and reports which prefix mode and prefix were actually used.
 */
public final class EmbeddingInputPrefixing {

    private EmbeddingInputPrefixing() {
    }

    /**
     * Resolves the effective prefix mode of {@code model} and prepares {@code texts} accordingly.
     *
     * <ul>
     *   <li>{@code OFF} / {@code EXTERNAL}: the given list is returned as-is, no prefix is reported.</li>
     *   <li>{@code CLIENT}: every text is prefixed with the model's query prefix (for
     *       {@link EmbeddingUseCase#QUERY}) or document prefix (any other use case).</li>
     * </ul>
     * A {@code null} prefix mode on the model is treated as {@code OFF}.
     *
     * @param model   descriptor carrying the prefix mode and the configured prefixes
     * @param useCase decides whether the query or the document prefix is used in CLIENT mode
     * @param texts   texts to prepare
     * @return the texts to send plus the prefix mode and prefix that were applied
     * @throws IllegalStateException    if the mode is CLIENT and the selected prefix is null or blank
     * @throws IllegalArgumentException if the mode is CLIENT and {@code texts} or one of its elements is null
     */
    public static PrefixedTexts apply(EmbeddingModelDescriptor model, EmbeddingUseCase useCase, List<String> texts) {
        EmbeddingPrefixMode prefixMode = model.prefixMode() == null ? EmbeddingPrefixMode.OFF : model.prefixMode();
        return switch (prefixMode) {
            case OFF -> new PrefixedTexts(texts, EmbeddingPrefixMode.OFF, null);
            case EXTERNAL -> new PrefixedTexts(texts, EmbeddingPrefixMode.EXTERNAL, null);
            case CLIENT -> {
                String prefix = useCase == EmbeddingUseCase.QUERY ? model.queryPrefix() : model.documentPrefix();
                if (prefix == null || prefix.isBlank()) {
                    throw new IllegalStateException("Prefix mode CLIENT requires a non-blank prefix for use case " + useCase);
                }
                yield new PrefixedTexts(prependPrefix(prefix, texts), EmbeddingPrefixMode.CLIENT, prefix);
            }
        };
    }

    /** Prepends {@code prefix} to every text, rejecting null input instead of embedding the word "null". */
    private static List<String> prependPrefix(String prefix, List<String> texts) {
        if (texts == null) {
            throw new IllegalArgumentException("Embedding input texts must not be null");
        }
        return texts.stream()
                .map(text -> {
                    // String concatenation would turn a null text into "<prefix>null" and embed that literally.
                    if (text == null) {
                        throw new IllegalArgumentException("Embedding input text must not be null");
                    }
                    return prefix + text;
                })
                .toList();
    }

    /**
     * Result of prefix preparation.
     *
     * @param texts         texts to send to the provider
     * @param prefixMode    prefix mode that was in effect
     * @param appliedPrefix prefix that was prepended, or {@code null} when none was added
     */
    public record PrefixedTexts(
            List<String> texts,
            EmbeddingPrefixMode prefixMode,
            String appliedPrefix
    ) {
    }
}

View File

@ -11,6 +11,33 @@ public record EmbeddingModelDescriptor(
boolean supportsQueryEmbeddingMode, boolean supportsQueryEmbeddingMode,
boolean supportsBatch, boolean supportsBatch,
Integer maxInputChars, Integer maxInputChars,
boolean active boolean active,
EmbeddingPrefixMode prefixMode,
String queryPrefix,
String documentPrefix
) { ) {
/**
 * Convenience constructor without prefix settings.
 *
 * <p>Delegates to the canonical constructor with prefix mode {@code OFF}, query prefix
 * {@code "query: "} and document prefix {@code "passage: "}.
 * NOTE(review): presumably kept so existing call sites compile unchanged - confirm before removing.
 */
public EmbeddingModelDescriptor(String modelKey,
String providerConfigKey,
String providerModelKey,
int dimensions,
DistanceMetric distanceMetric,
boolean supportsQueryEmbeddingMode,
boolean supportsBatch,
Integer maxInputChars,
boolean active) {
this(
modelKey,
providerConfigKey,
providerModelKey,
dimensions,
distanceMetric,
supportsQueryEmbeddingMode,
supportsBatch,
maxInputChars,
active,
// Defaults for the three prefix-related components: mode, query prefix, document prefix.
EmbeddingPrefixMode.OFF,
"query: ",
"passage: "
);
}
} }

View File

@ -0,0 +1,7 @@
package at.procon.dip.embedding.model;
/**
 * Determines who, if anyone, adds a model-specific prefix to embedding input texts.
 *
 * <p>The constant names are stored as strings in the {@code prefix_mode} column of document
 * embeddings, so renaming a constant affects already persisted rows.
 */
public enum EmbeddingPrefixMode {
/** Texts are passed to the provider unchanged; no prefix is recorded. */
OFF,
/** The configured query or document prefix is prepended to every text before the provider call. */
CLIENT,
/** Texts are passed to the provider unchanged; the external service is assumed to apply the prefix itself. */
EXTERNAL
}

View File

@ -7,6 +7,15 @@ public record EmbeddingProviderResult(
List<float[]> vectors, List<float[]> vectors,
List<String> warnings, List<String> warnings,
String providerRequestId, String providerRequestId,
Integer tokenCount Integer tokenCount,
EmbeddingPrefixMode prefixMode,
String appliedPrefix
) { ) {
/**
 * Convenience constructor without prefix provenance.
 *
 * <p>Delegates to the canonical constructor with prefix mode {@code OFF} and a {@code null}
 * applied prefix.
 */
public EmbeddingProviderResult(EmbeddingModelDescriptor model,
List<float[]> vectors,
List<String> warnings,
String providerRequestId,
Integer tokenCount) {
// No prefix information available here: report OFF and no applied prefix.
this(model, vectors, warnings, providerRequestId, tokenCount, EmbeddingPrefixMode.OFF, null);
}
} }

View File

@ -1,5 +1,6 @@
package at.procon.dip.embedding.provider.http; package at.procon.dip.embedding.provider.http;
import at.procon.dip.embedding.model.EmbeddingInputPrefixing;
import at.procon.dip.embedding.model.EmbeddingModelDescriptor; import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
import at.procon.dip.embedding.model.EmbeddingProviderResult; import at.procon.dip.embedding.model.EmbeddingProviderResult;
import at.procon.dip.embedding.model.EmbeddingRequest; import at.procon.dip.embedding.model.EmbeddingRequest;
@ -12,17 +13,19 @@ import java.io.IOException;
import java.net.http.HttpResponse; import java.net.http.HttpResponse;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
/** /**
* Existing HTTP/JSON embedding provider using the /embed contract. * Compatibility provider for the currently running external Python vectorization service.
*
* It calls /embed with the legacy payload shape { text, isQuery }.
*/ */
@Component @Component
public class ExternalHttpEmbeddingProvider extends AbstractHttpEmbeddingProviderSupport implements EmbeddingProvider { public class ExternalHttpEmbeddingProvider extends AbstractHttpEmbeddingProviderSupport implements EmbeddingProvider {
private static final String PROVIDER_TYPE = "http-json"; private static final String PROVIDER_TYPE = "http-json";
public ExternalHttpEmbeddingProvider(ObjectMapper objectMapper, ObjectMapper mapper) { public ExternalHttpEmbeddingProvider(ObjectMapper objectMapper) {
super(objectMapper); super(objectMapper);
} }
@ -40,29 +43,32 @@ public class ExternalHttpEmbeddingProvider extends AbstractHttpEmbeddingProvider
public EmbeddingProviderResult embedDocuments(ResolvedEmbeddingProviderConfig providerConfig, public EmbeddingProviderResult embedDocuments(ResolvedEmbeddingProviderConfig providerConfig,
EmbeddingModelDescriptor model, EmbeddingModelDescriptor model,
EmbeddingRequest request) { EmbeddingRequest request) {
return execute(providerConfig, request, EmbeddingUseCase.DOCUMENT); return execute(providerConfig, model, request, EmbeddingUseCase.DOCUMENT);
} }
@Override @Override
public EmbeddingProviderResult embedQuery(ResolvedEmbeddingProviderConfig providerConfig, public EmbeddingProviderResult embedQuery(ResolvedEmbeddingProviderConfig providerConfig,
EmbeddingModelDescriptor model, EmbeddingModelDescriptor model,
EmbeddingRequest request) { EmbeddingRequest request) {
return execute(providerConfig, request, EmbeddingUseCase.QUERY); return execute(providerConfig, model, request, EmbeddingUseCase.QUERY);
} }
private EmbeddingProviderResult execute(ResolvedEmbeddingProviderConfig providerConfig, private EmbeddingProviderResult execute(ResolvedEmbeddingProviderConfig providerConfig,
EmbeddingModelDescriptor model,
EmbeddingRequest request, EmbeddingRequest request,
EmbeddingUseCase useCase) { EmbeddingUseCase useCase) {
if (request.texts() == null || request.texts().isEmpty()) { if (request.texts() == null || request.texts().isEmpty()) {
throw new IllegalArgumentException("Embedding request texts must not be empty"); throw new IllegalArgumentException("Embedding request texts must not be empty");
} }
EmbeddingInputPrefixing.PrefixedTexts prefixedTexts = EmbeddingInputPrefixing.apply(model, useCase, request.texts());
try { try {
HttpResponse<String> response = postJson( HttpResponse<String> response = postJson(
providerConfig, providerConfig,
"/embed", "/embed",
Map.of( Map.of(
"text", request.texts().getFirst(), "text", prefixedTexts.texts().getFirst(),
"isQuery", useCase == EmbeddingUseCase.QUERY "isQuery", useCase == EmbeddingUseCase.QUERY
) )
); );
@ -73,11 +79,13 @@ public class ExternalHttpEmbeddingProvider extends AbstractHttpEmbeddingProvider
} }
return new EmbeddingProviderResult( return new EmbeddingProviderResult(
null, model,
List.of(parsed.embedding), List.of(parsed.embedding),
List.of(), List.of(),
null, null,
parsed.tokenCount parsed.tokenCount,
prefixedTexts.prefixMode(),
prefixedTexts.appliedPrefix()
); );
} catch (InterruptedException e) { } catch (InterruptedException e) {
Thread.currentThread().interrupt(); Thread.currentThread().interrupt();

View File

@ -1,8 +1,10 @@
package at.procon.dip.embedding.provider.http; package at.procon.dip.embedding.provider.http;
import at.procon.dip.embedding.model.EmbeddingInputPrefixing;
import at.procon.dip.embedding.model.EmbeddingModelDescriptor; import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
import at.procon.dip.embedding.model.EmbeddingProviderResult; import at.procon.dip.embedding.model.EmbeddingProviderResult;
import at.procon.dip.embedding.model.EmbeddingRequest; import at.procon.dip.embedding.model.EmbeddingRequest;
import at.procon.dip.embedding.model.EmbeddingUseCase;
import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig; import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig;
import at.procon.dip.embedding.provider.EmbeddingProvider; import at.procon.dip.embedding.provider.EmbeddingProvider;
import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonProperty;
@ -73,27 +75,30 @@ public class VectorSyncHttpEmbeddingProvider extends AbstractHttpEmbeddingProvid
public EmbeddingProviderResult embedDocuments(ResolvedEmbeddingProviderConfig providerConfig, public EmbeddingProviderResult embedDocuments(ResolvedEmbeddingProviderConfig providerConfig,
EmbeddingModelDescriptor model, EmbeddingModelDescriptor model,
EmbeddingRequest request) { EmbeddingRequest request) {
return execute(providerConfig, model, request); return execute(providerConfig, model, request, EmbeddingUseCase.DOCUMENT);
} }
@Override @Override
public EmbeddingProviderResult embedQuery(ResolvedEmbeddingProviderConfig providerConfig, public EmbeddingProviderResult embedQuery(ResolvedEmbeddingProviderConfig providerConfig,
EmbeddingModelDescriptor model, EmbeddingModelDescriptor model,
EmbeddingRequest request) { EmbeddingRequest request) {
return execute(providerConfig, model, request); return execute(providerConfig, model, request, EmbeddingUseCase.QUERY);
} }
private EmbeddingProviderResult execute(ResolvedEmbeddingProviderConfig providerConfig, private EmbeddingProviderResult execute(ResolvedEmbeddingProviderConfig providerConfig,
EmbeddingModelDescriptor model, EmbeddingModelDescriptor model,
EmbeddingRequest request) { EmbeddingRequest request,
EmbeddingUseCase useCase) {
if (request.texts() == null || request.texts().isEmpty()) { if (request.texts() == null || request.texts().isEmpty()) {
throw new IllegalArgumentException("Embedding request texts must not be empty"); throw new IllegalArgumentException("Embedding request texts must not be empty");
} }
EmbeddingInputPrefixing.PrefixedTexts prefixedTexts = EmbeddingInputPrefixing.apply(model, useCase, request.texts());
try { try {
return request.texts().size() == 1 return prefixedTexts.texts().size() == 1
? executeSingle(providerConfig, model, request.texts().getFirst()) ? executeSingle(providerConfig, model, prefixedTexts)
: executeBatch(providerConfig, model, request); : executeBatch(providerConfig, model, request, prefixedTexts);
} catch (InterruptedException e) { } catch (InterruptedException e) {
Thread.currentThread().interrupt(); Thread.currentThread().interrupt();
throw new IllegalStateException("Embedding provider call interrupted", e); throw new IllegalStateException("Embedding provider call interrupted", e);
@ -104,11 +109,11 @@ public class VectorSyncHttpEmbeddingProvider extends AbstractHttpEmbeddingProvid
private EmbeddingProviderResult executeSingle(ResolvedEmbeddingProviderConfig providerConfig, private EmbeddingProviderResult executeSingle(ResolvedEmbeddingProviderConfig providerConfig,
EmbeddingModelDescriptor model, EmbeddingModelDescriptor model,
String text) throws IOException, InterruptedException { EmbeddingInputPrefixing.PrefixedTexts prefixedTexts) throws IOException, InterruptedException {
HttpResponse<String> response = postJson( HttpResponse<String> response = postJson(
providerConfig, providerConfig,
"/vector-sync", "/vector-sync",
new VectorSyncRequest(model.providerModelKey(), text) new VectorSyncRequest(model.providerModelKey(), prefixedTexts.texts().getFirst())
); );
VectorSyncResponse parsed = objectMapper.readValue(response.body(), VectorSyncResponse.class); VectorSyncResponse parsed = objectMapper.readValue(response.body(), VectorSyncResponse.class);
@ -119,13 +124,16 @@ public class VectorSyncHttpEmbeddingProvider extends AbstractHttpEmbeddingProvid
List.of(vector), List.of(vector),
List.of(), List.of(),
null, null,
parsed.tokenCount parsed.tokenCount,
prefixedTexts.prefixMode(),
prefixedTexts.appliedPrefix()
); );
} }
private EmbeddingProviderResult executeBatch(ResolvedEmbeddingProviderConfig providerConfig, private EmbeddingProviderResult executeBatch(ResolvedEmbeddingProviderConfig providerConfig,
EmbeddingModelDescriptor model, EmbeddingModelDescriptor model,
EmbeddingRequest request) throws IOException, InterruptedException { EmbeddingRequest request,
EmbeddingInputPrefixing.PrefixedTexts prefixedTexts) throws IOException, InterruptedException {
BatchRequestSettings settings = resolveBatchRequestSettings(providerConfig, request.providerOptions()); BatchRequestSettings settings = resolveBatchRequestSettings(providerConfig, request.providerOptions());
if (settings.truncateLength() <= 0) { if (settings.truncateLength() <= 0) {
@ -135,10 +143,10 @@ public class VectorSyncHttpEmbeddingProvider extends AbstractHttpEmbeddingProvid
throw new IllegalArgumentException("Batch chunk size must be > 0"); throw new IllegalArgumentException("Batch chunk size must be > 0");
} }
List<String> requestOrder = new ArrayList<>(request.texts().size()); List<String> requestOrder = new ArrayList<>(prefixedTexts.texts().size());
List<VectorizeBatchItemRequest> items = new ArrayList<>(request.texts().size()); List<VectorizeBatchItemRequest> items = new ArrayList<>(prefixedTexts.texts().size());
for (String text : request.texts()) { for (String text : prefixedTexts.texts()) {
String id = UUID.randomUUID().toString(); String id = UUID.randomUUID().toString();
requestOrder.add(id); requestOrder.add(id);
items.add(new VectorizeBatchItemRequest(id, text)); items.add(new VectorizeBatchItemRequest(id, text));
@ -167,7 +175,7 @@ public class VectorSyncHttpEmbeddingProvider extends AbstractHttpEmbeddingProvid
resultById.put(result.id, result); resultById.put(result.id, result);
} }
List<float[]> vectors = new ArrayList<>(request.texts().size()); List<float[]> vectors = new ArrayList<>(prefixedTexts.texts().size());
int totalTokenCount = 0; int totalTokenCount = 0;
boolean hasAnyTokenCount = false; boolean hasAnyTokenCount = false;
@ -190,7 +198,9 @@ public class VectorSyncHttpEmbeddingProvider extends AbstractHttpEmbeddingProvid
vectors, vectors,
List.of(), List.of(),
null, null,
hasAnyTokenCount ? totalTokenCount : null hasAnyTokenCount ? totalTokenCount : null,
prefixedTexts.prefixMode(),
prefixedTexts.appliedPrefix()
); );
} }

View File

@ -57,7 +57,10 @@ public class EmbeddingModelRegistry {
model.isSupportsQueryEmbeddingMode(), model.isSupportsQueryEmbeddingMode(),
model.isSupportsBatch(), model.isSupportsBatch(),
model.getMaxInputChars(), model.getMaxInputChars(),
model.isActive() model.isActive(),
model.getPrefixMode(),
model.getQueryPrefix(),
model.getDocumentPrefix()
); );
} }
} }

View File

@ -1,6 +1,7 @@
package at.procon.dip.embedding.service; package at.procon.dip.embedding.service;
import at.procon.dip.embedding.job.service.EmbeddingJobService; import at.procon.dip.embedding.job.service.EmbeddingJobService;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import at.procon.dip.embedding.model.EmbeddingProviderResult; import at.procon.dip.embedding.model.EmbeddingProviderResult;
import java.util.UUID; import java.util.UUID;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -37,7 +38,18 @@ public class EmbeddingJobExecutionPersistenceService {
Integer tokenCount, Integer tokenCount,
UUID jobId, UUID jobId,
String providerRequestId) { String providerRequestId) {
embeddingPersistenceService.saveCompleted(embeddingId, vector, tokenCount); completeJob(embeddingId, vector, tokenCount, jobId, providerRequestId, EmbeddingPrefixMode.OFF, null);
}
@Transactional(propagation = Propagation.REQUIRES_NEW)
public void completeJob(UUID embeddingId,
float[] vector,
Integer tokenCount,
UUID jobId,
String providerRequestId,
EmbeddingPrefixMode prefixMode,
String appliedPrefix) {
embeddingPersistenceService.saveCompleted(embeddingId, vector, tokenCount, prefixMode, appliedPrefix);
jobService.markCompleted(jobId, providerRequestId); jobService.markCompleted(jobId, providerRequestId);
} }

View File

@ -7,6 +7,7 @@ import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository; import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.document.service.DocumentEmbeddingService; import at.procon.dip.domain.document.service.DocumentEmbeddingService;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import at.procon.dip.embedding.model.EmbeddingProviderResult; import at.procon.dip.embedding.model.EmbeddingProviderResult;
import java.time.OffsetDateTime; import java.time.OffsetDateTime;
import java.util.UUID; import java.util.UUID;
@ -43,10 +44,18 @@ public class EmbeddingPersistenceService {
if (result.vectors() == null || result.vectors().isEmpty()) { if (result.vectors() == null || result.vectors().isEmpty()) {
throw new IllegalArgumentException("Embedding provider result contains no vectors"); throw new IllegalArgumentException("Embedding provider result contains no vectors");
} }
saveCompleted(embeddingId, result.vectors().getFirst(), result.tokenCount()); saveCompleted(embeddingId, result.vectors().getFirst(), result.tokenCount(), result.prefixMode(), result.appliedPrefix());
} }
public void saveCompleted(UUID embeddingId, float[] vector, Integer tokenCount) { public void saveCompleted(UUID embeddingId, float[] vector, Integer tokenCount) {
saveCompleted(embeddingId, vector, tokenCount, EmbeddingPrefixMode.OFF, null);
}
public void saveCompleted(UUID embeddingId,
float[] vector,
Integer tokenCount,
EmbeddingPrefixMode prefixMode,
String appliedPrefix) {
if (vector == null || vector.length == 0) { if (vector == null || vector.length == 0) {
throw new IllegalArgumentException("Embedding vector must not be empty"); throw new IllegalArgumentException("Embedding vector must not be empty");
} }
@ -54,7 +63,9 @@ public class EmbeddingPersistenceService {
embeddingId, embeddingId,
vector, vector,
tokenCount, tokenCount,
vector.length vector.length,
(prefixMode == null ? EmbeddingPrefixMode.OFF : prefixMode).name(),
appliedPrefix
); );
} }

View File

@ -285,7 +285,9 @@ public class RepresentationEmbeddingOrchestrator {
result.vectors().get(i), result.vectors().get(i),
null, null,
prepared.job().getId(), prepared.job().getId(),
result.providerRequestId() result.providerRequestId(),
result.prefixMode(),
result.appliedPrefix()
); );
} }
} catch (RuntimeException ex) { } catch (RuntimeException ex) {

View File

@ -1,6 +1,7 @@
package at.procon.dip.embedding.startup; package at.procon.dip.embedding.startup;
import at.procon.dip.embedding.config.EmbeddingProperties; import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import at.procon.dip.embedding.registry.EmbeddingModelRegistry; import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import at.procon.dip.embedding.registry.EmbeddingProviderConfigResolver; import at.procon.dip.embedding.registry.EmbeddingProviderConfigResolver;
import at.procon.dip.embedding.registry.EmbeddingProviderRegistry; import at.procon.dip.embedding.registry.EmbeddingProviderRegistry;
@ -34,8 +35,16 @@ public class EmbeddingSubsystemStartupValidator implements ApplicationRunner {
modelRegistry.getActiveModels().forEach(model -> { modelRegistry.getActiveModels().forEach(model -> {
var providerConfig = providerConfigResolver.resolve(model.providerConfigKey()); var providerConfig = providerConfigResolver.resolve(model.providerConfigKey());
providerRegistry.getRequired(providerConfig.providerType()); providerRegistry.getRequired(providerConfig.providerType());
log.info("Validated embedding model {} -> provider {} ({})", if (model.prefixMode() == EmbeddingPrefixMode.CLIENT) {
model.modelKey(), model.providerConfigKey(), providerConfig.providerType()); if (model.queryPrefix() == null || model.queryPrefix().isBlank()) {
throw new IllegalStateException("Embedding model " + model.modelKey() + " uses CLIENT prefix mode but query-prefix is blank");
}
if (model.documentPrefix() == null || model.documentPrefix().isBlank()) {
throw new IllegalStateException("Embedding model " + model.modelKey() + " uses CLIENT prefix mode but document-prefix is blank");
}
}
log.info("Validated embedding model {} -> provider {} ({}, prefixMode={})",
model.modelKey(), model.providerConfigKey(), providerConfig.providerType(), model.prefixMode());
}); });
if (properties.getDefaultDocumentModel() != null && !properties.getDefaultDocumentModel().isBlank()) { if (properties.getDefaultDocumentModel() != null && !properties.getDefaultDocumentModel().isBlank()) {

View File

@ -133,7 +133,7 @@ ted:
# Polling delay in milliseconds (1 minute) # Polling delay in milliseconds (1 minute)
delay: 60000 delay: 60000
# Max messages per poll # Max messages per poll
max-messages-per-poll: 100 max-messages-per-poll: 10
# Output directory for processed attachments # Output directory for processed attachments
attachment-output-directory: /ted.europe/mail-attachments attachment-output-directory: /ted.europe/mail-attachments
# Enable/disable MIME file input processing # Enable/disable MIME file input processing

View File

@ -39,11 +39,11 @@ dip:
embedding: embedding:
enabled: true enabled: true
jobs: jobs:
enabled: true enabled: false
parallel-batch-count: 2 parallel-batch-count: 1
process-in-batches: true process-in-batches: true
batch-size: 48 batch-size: 16
execution-batch-size: 48 execution-batch-size: 16
default-document-model: e5-default default-document-model: e5-default
default-query-model: e5-default default-query-model: e5-default
@ -56,7 +56,7 @@ dip:
external-e5: external-e5:
type: http-json type: http-json
base-url: http://172.20.20.6:8001 base-url: http://172.20.241.55:8001
connect-timeout: 5s connect-timeout: 5s
read-timeout: 60s read-timeout: 60s
batch-request: batch-request:
@ -66,7 +66,7 @@ dip:
vector-sync-e5: vector-sync-e5:
type: http-vector-sync type: http-vector-sync
base-url: http://172.20.20.6:8001 base-url: http://172.20.241.55:8001
connect-timeout: 30s connect-timeout: 30s
read-timeout: 300s read-timeout: 300s
headers: headers:
@ -93,6 +93,9 @@ dip:
distance-metric: COSINE distance-metric: COSINE
supports-query-embedding-mode: true supports-query-embedding-mode: true
supports-batch: true supports-batch: true
prefix-mode: CLIENT
query-prefix: "query: "
document-prefix: "passage: "
active: true active: true
profiles: profiles:
@ -290,6 +293,18 @@ dip:
# Delete tar.gz after ingestion # Delete tar.gz after ingestion
delete-after-ingestion: true delete-after-ingestion: true
time:
enabled: false
leitstand:
enabled: false
import-batch-id: time-leitstand
reconcile-lookback-days: 7
toggl-track:
enabled: false
import-batch-id: time-toggl
reconcile-lookback-days: 7
ted: # Phase 3 TED projection configuration ted: # Phase 3 TED projection configuration
projection: projection:
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document # Enable/disable dual-write into the TED projection model on top of DOC.doc_document

View File

@ -14,9 +14,12 @@ spring:
name: document-intelligence-platform name: document-intelligence-platform
datasource: datasource:
# Connection settings are environment-driven. Never commit a real password as a placeholder
# default or in commented-out lines: anything in this file ends up in version control.
# DB_URL overrides the target database; DB_PASSWORD has no fallback on purpose, so the
# application fails fast at startup when it is not provided.
url: ${DB_URL:jdbc:postgresql://94.130.218.54:32333/RELM}
username: ${DB_USERNAME:postgres}
password: ${DB_PASSWORD}
driver-class-name: org.postgresql.Driver driver-class-name: org.postgresql.Driver
hikari: hikari:
maximum-pool-size: 5 maximum-pool-size: 5
@ -28,7 +31,7 @@ spring:
jpa: jpa:
hibernate: hibernate:
ddl-auto: update ddl-auto: validate
show-sql: false show-sql: false
open-in-view: false open-in-view: false
properties: properties:

View File

@ -0,0 +1,8 @@
-- Prefix provenance for stored embeddings.
--   prefix_mode    : how input prefixing was handled; rows that already exist receive 'OFF'
--                    through the column default.
--   applied_prefix : optional prefix text, nullable.
ALTER TABLE doc.doc_embedding
    ADD COLUMN IF NOT EXISTS prefix_mode VARCHAR(32) NOT NULL DEFAULT 'OFF',
    ADD COLUMN IF NOT EXISTS applied_prefix VARCHAR(64);

-- Supports selecting embeddings by prefix mode.
CREATE INDEX IF NOT EXISTS idx_doc_embedding_prefix_mode
    ON doc.doc_embedding (prefix_mode);

View File

@ -2,6 +2,7 @@ package at.procon.dip.architecture;
import at.procon.dip.domain.ted.config.TedProjectionProperties; import at.procon.dip.domain.ted.config.TedProjectionProperties;
import at.procon.dip.ingestion.config.DipIngestionProperties; import at.procon.dip.ingestion.config.DipIngestionProperties;
import at.procon.dip.domain.time.config.TimeDomainProperties;
import at.procon.dip.search.config.DipSearchProperties; import at.procon.dip.search.config.DipSearchProperties;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
import java.lang.reflect.Constructor; import java.lang.reflect.Constructor;
@ -22,11 +23,8 @@ class NewRuntimeMustNotDependOnTedProcessorPropertiesTest {
List<Class<?>> newRuntimeClasses = List.of( List<Class<?>> newRuntimeClasses = List.of(
at.procon.dip.ingestion.service.GenericDocumentImportService.class, at.procon.dip.ingestion.service.GenericDocumentImportService.class,
at.procon.dip.ingestion.camel.GenericFileSystemIngestionRoute.class, at.procon.dip.ingestion.camel.GenericFileSystemIngestionRoute.class,
at.procon.dip.ingestion.camel.GenericMailIngestionRoute.class,
at.procon.dip.ingestion.controller.GenericDocumentImportController.class, at.procon.dip.ingestion.controller.GenericDocumentImportController.class,
at.procon.dip.ingestion.adapter.MailDocumentIngestionAdapter.class, at.procon.dip.ingestion.adapter.MailDocumentIngestionAdapter.class,
at.procon.dip.ingestion.service.MailMetadataPersistenceService.class,
at.procon.dip.ingestion.mail.MailImportIdentityResolver.class,
at.procon.dip.ingestion.adapter.TedPackageDocumentIngestionAdapter.class, at.procon.dip.ingestion.adapter.TedPackageDocumentIngestionAdapter.class,
at.procon.dip.ingestion.service.TedPackageChildImportProcessor.class, at.procon.dip.ingestion.service.TedPackageChildImportProcessor.class,
at.procon.dip.domain.ted.service.TedNoticeProjectionService.class, at.procon.dip.domain.ted.service.TedNoticeProjectionService.class,
@ -52,6 +50,7 @@ class NewRuntimeMustNotDependOnTedProcessorPropertiesTest {
assertThat(DipSearchProperties.class).isNotNull(); assertThat(DipSearchProperties.class).isNotNull();
assertThat(DipIngestionProperties.class).isNotNull(); assertThat(DipIngestionProperties.class).isNotNull();
assertThat(TedProjectionProperties.class).isNotNull(); assertThat(TedProjectionProperties.class).isNotNull();
assertThat(TimeDomainProperties.class).isNotNull();
} }
private boolean hasDependency(Class<?> owner, Class<?> dependency) { private boolean hasDependency(Class<?> owner, Class<?> dependency) {

View File

@ -4,6 +4,7 @@ import static org.assertj.core.api.Assertions.assertThat;
import at.procon.dip.domain.document.DistanceMetric; import at.procon.dip.domain.document.DistanceMetric;
import at.procon.dip.embedding.model.EmbeddingModelDescriptor; import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import at.procon.dip.embedding.model.EmbeddingRequest; import at.procon.dip.embedding.model.EmbeddingRequest;
import at.procon.dip.embedding.model.EmbeddingUseCase; import at.procon.dip.embedding.model.EmbeddingUseCase;
import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig; import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig;
@ -27,6 +28,7 @@ class VectorSyncHttpEmbeddingProviderTest {
private final ObjectMapper objectMapper = new ObjectMapper(); private final ObjectMapper objectMapper = new ObjectMapper();
private HttpServer server; private HttpServer server;
private final AtomicReference<String> lastBatchBody = new AtomicReference<>(); private final AtomicReference<String> lastBatchBody = new AtomicReference<>();
private final AtomicReference<String> lastSingleBody = new AtomicReference<>();
@AfterEach @AfterEach
void tearDown() { void tearDown() {
@ -75,6 +77,53 @@ class VectorSyncHttpEmbeddingProviderTest {
assertThat(result.vectors()).hasSize(1); assertThat(result.vectors()).hasSize(1);
assertThat(result.vectors().getFirst()).containsExactly(0.1f, 0.2f, 0.3f); assertThat(result.vectors().getFirst()).containsExactly(0.1f, 0.2f, 0.3f);
assertThat(result.tokenCount()).isEqualTo(9); assertThat(result.tokenCount()).isEqualTo(9);
assertThat(result.prefixMode()).isEqualTo(EmbeddingPrefixMode.OFF);
assertThat(result.appliedPrefix()).isNull();
}
/**
 * Checks document embedding with a model configured for {@code EmbeddingPrefixMode.CLIENT}.
 *
 * <p>The test starts a local HTTP server, embeds one text with the DOCUMENT use case and
 * then asserts that the result reports {@code CLIENT} as prefix mode and {@code "passage: "}
 * as applied prefix, and that the captured request body contains the prefixed text.
 *
 * @throws Exception if the local HTTP server cannot be created or the provider call fails
 */
@Test
void shouldPrefixTextsInClientModeForDocuments() throws Exception {
// Port 0 lets the OS choose a free port; the actual port is read back below for the base URL.
server = HttpServer.create(new InetSocketAddress(0), 0);
server.createContext("/vector-sync", this::handleVectorSync);
server.start();
var provider = new VectorSyncHttpEmbeddingProvider(objectMapper);
var config = ResolvedEmbeddingProviderConfig.builder()
.key("vector-sync-local")
.providerType("http-vector-sync")
.baseUrl("http://localhost:" + server.getAddress().getPort())
.readTimeout(Duration.ofSeconds(5))
.headers(Map.of("X-Client", "dip-test"))
.batchTruncateText(false)
.batchTruncateLength(512)
.batchChunkSize(20)
.build();
// Positional constructor arguments. NOTE(review): the last three look like prefix mode,
// query prefix and document prefix; confirm the order against EmbeddingModelDescriptor.
var model = new EmbeddingModelDescriptor(
"e5-default",
"vector-sync-local",
"intfloat/multilingual-e5-large",
3,
DistanceMetric.COSINE,
true,
false,
8192,
true,
EmbeddingPrefixMode.CLIENT,
"query: ",
"passage: "
);
// A single text with the DOCUMENT use case; the assertions below expect the "passage: " prefix.
var request = EmbeddingRequest.builder()
.modelKey("e5-default")
.useCase(EmbeddingUseCase.DOCUMENT)
.texts(List.of("This is a sample text to vectorize"))
.providerOptions(Map.of())
.build();
var result = provider.embedDocuments(config, model, request);
// The result must carry the prefix provenance ...
assertThat(result.prefixMode()).isEqualTo(EmbeddingPrefixMode.CLIENT);
assertThat(result.appliedPrefix()).isEqualTo("passage: ");
// ... and the request body must contain the prefixed text
// (lastSingleBody is presumably filled by the /vector-sync handler; verify in handleVectorSync).
assertThat(lastSingleBody.get()).contains("passage: This is a sample text to vectorize");
} }
@Test @Test
@ -118,6 +167,7 @@ class VectorSyncHttpEmbeddingProviderTest {
assertThat(result.vectors().get(0)).containsExactly(0.1f, 0.2f, 0.3f); assertThat(result.vectors().get(0)).containsExactly(0.1f, 0.2f, 0.3f);
assertThat(result.vectors().get(1)).containsExactly(0.4f, 0.5f, 0.6f); assertThat(result.vectors().get(1)).containsExactly(0.4f, 0.5f, 0.6f);
assertThat(result.tokenCount()).isEqualTo(12); assertThat(result.tokenCount()).isEqualTo(12);
assertThat(result.prefixMode()).isEqualTo(EmbeddingPrefixMode.OFF);
JsonNode requestBody = objectMapper.readTree(lastBatchBody.get()); JsonNode requestBody = objectMapper.readTree(lastBatchBody.get());
assertThat(requestBody.get("truncate_text").asBoolean()).isTrue(); assertThat(requestBody.get("truncate_text").asBoolean()).isTrue();
@ -177,9 +227,10 @@ class VectorSyncHttpEmbeddingProviderTest {
body = new String(in.readAllBytes(), StandardCharsets.UTF_8); body = new String(in.readAllBytes(), StandardCharsets.UTF_8);
} }
lastSingleBody.set(body);
assertThat(exchange.getRequestMethod()).isEqualTo("POST"); assertThat(exchange.getRequestMethod()).isEqualTo("POST");
assertThat(body).contains("\"model\":\"intfloat/multilingual-e5-large\""); assertThat(body).contains("\"model\":\"intfloat/multilingual-e5-large\"");
assertThat(body).contains("\"text\":\"This is a sample text to vectorize\""); assertThat(body).contains("This is a sample text to vectorize");
assertThat(exchange.getRequestHeaders().getFirst("X-Client")).isEqualTo("dip-test"); assertThat(exchange.getRequestHeaders().getFirst("X-Client")).isEqualTo("dip-test");
respondJson(exchange, """ respondJson(exchange, """

View File

@ -175,8 +175,8 @@ class RepresentationEmbeddingOrchestratorTest {
ArgumentCaptor<List<String>> textsCaptor = ArgumentCaptor.forClass(List.class); ArgumentCaptor<List<String>> textsCaptor = ArgumentCaptor.forClass(List.class);
verify(executionService, times(1)).embedTexts(eq("e5-default"), eq(EmbeddingUseCase.DOCUMENT), textsCaptor.capture()); verify(executionService, times(1)).embedTexts(eq("e5-default"), eq(EmbeddingUseCase.DOCUMENT), textsCaptor.capture());
assertThat(textsCaptor.getValue()).containsExactly("alpha", "beta"); assertThat(textsCaptor.getValue()).containsExactly("alpha", "beta");
verify(executionPersistenceService).completeJob(eq(embeddingId1), aryEq(new float[]{0.1f, 0.2f, 0.3f}), eq(null), eq(job1.getId()), eq("batch-req-1")); verify(executionPersistenceService).completeJob(eq(embeddingId1), aryEq(new float[]{0.1f, 0.2f, 0.3f}), eq(null), eq(job1.getId()), eq("batch-req-1"), eq(EmbeddingPrefixMode.OFF), eq(null));
verify(executionPersistenceService).completeJob(eq(embeddingId2), aryEq(new float[]{0.4f, 0.5f, 0.6f}), eq(null), eq(job2.getId()), eq("batch-req-1")); verify(executionPersistenceService).completeJob(eq(embeddingId2), aryEq(new float[]{0.4f, 0.5f, 0.6f}), eq(null), eq(job2.getId()), eq("batch-req-1"), eq(EmbeddingPrefixMode.OFF), eq(null));
} }
@Test @Test
@ -220,7 +220,7 @@ class RepresentationEmbeddingOrchestratorTest {
orchestrator.processNextReadyBatch(); orchestrator.processNextReadyBatch();
verify(executionService, times(1)).embedTexts(eq("mock-search"), eq(EmbeddingUseCase.DOCUMENT), eq(List.of("gamma"))); verify(executionService, times(1)).embedTexts(eq("mock-search"), eq(EmbeddingUseCase.DOCUMENT), eq(List.of("gamma")));
verify(executionPersistenceService, never()).completeJob(eq(embeddingId), any(float[].class), eq(null), eq(job.getId()), anyString()); verify(executionPersistenceService, never()).completeJob(eq(embeddingId), any(float[].class), eq(null), eq(job.getId()), anyString(), any(), any());
verify(executionPersistenceService).completeJob(eq(embeddingId), any(EmbeddingProviderResult.class), eq(job.getId()), eq("req-2")); verify(executionPersistenceService).completeJob(eq(embeddingId), any(EmbeddingProviderResult.class), eq(job.getId()), eq("req-2"));
} }
@ -305,11 +305,11 @@ class RepresentationEmbeddingOrchestratorTest {
assertThat(processed).isEqualTo(6); assertThat(processed).isEqualTo(6);
verify(jobService, times(4)).claimNextReadyJobs(2); verify(jobService, times(4)).claimNextReadyJobs(2);
verify(executionService, times(3)).embedTexts(eq("e5-default"), eq(EmbeddingUseCase.DOCUMENT), any()); verify(executionService, times(3)).embedTexts(eq("e5-default"), eq(EmbeddingUseCase.DOCUMENT), any());
verify(executionPersistenceService).completeJob(eq(embeddingId1), aryEq(new float[]{0.1f, 0.2f, 0.3f}), eq(null), eq(job1.getId()), eq("batch-req-1")); verify(executionPersistenceService).completeJob(eq(embeddingId1), aryEq(new float[]{0.1f, 0.2f, 0.3f}), eq(null), eq(job1.getId()), eq("batch-req-1"), eq(EmbeddingPrefixMode.OFF), eq(null));
verify(executionPersistenceService).completeJob(eq(embeddingId2), aryEq(new float[]{0.4f, 0.5f, 0.6f}), eq(null), eq(job2.getId()), eq("batch-req-1")); verify(executionPersistenceService).completeJob(eq(embeddingId2), aryEq(new float[]{0.4f, 0.5f, 0.6f}), eq(null), eq(job2.getId()), eq("batch-req-1"), eq(EmbeddingPrefixMode.OFF), eq(null));
verify(executionPersistenceService).completeJob(eq(embeddingId3), aryEq(new float[]{0.7f, 0.8f, 0.9f}), eq(null), eq(job3.getId()), eq("batch-req-2")); verify(executionPersistenceService).completeJob(eq(embeddingId3), aryEq(new float[]{0.7f, 0.8f, 0.9f}), eq(null), eq(job3.getId()), eq("batch-req-2"), eq(EmbeddingPrefixMode.OFF), eq(null));
verify(executionPersistenceService).completeJob(eq(embeddingId4), aryEq(new float[]{1.0f, 1.1f, 1.2f}), eq(null), eq(job4.getId()), eq("batch-req-2")); verify(executionPersistenceService).completeJob(eq(embeddingId4), aryEq(new float[]{1.0f, 1.1f, 1.2f}), eq(null), eq(job4.getId()), eq("batch-req-2"), eq(EmbeddingPrefixMode.OFF), eq(null));
verify(executionPersistenceService).completeJob(eq(embeddingId5), aryEq(new float[]{1.3f, 1.4f, 1.5f}), eq(null), eq(job5.getId()), eq("batch-req-3")); verify(executionPersistenceService).completeJob(eq(embeddingId5), aryEq(new float[]{1.3f, 1.4f, 1.5f}), eq(null), eq(job5.getId()), eq("batch-req-3"), eq(EmbeddingPrefixMode.OFF), eq(null));
verify(executionPersistenceService).completeJob(eq(embeddingId6), aryEq(new float[]{1.6f, 1.7f, 1.8f}), eq(null), eq(job6.getId()), eq("batch-req-3")); verify(executionPersistenceService).completeJob(eq(embeddingId6), aryEq(new float[]{1.6f, 1.7f, 1.8f}), eq(null), eq(job6.getId()), eq("batch-req-3"), eq(EmbeddingPrefixMode.OFF), eq(null));
} }
} }