From 2687d4ba177e85c24e69cd607c6dd1b84830ecd4 Mon Sep 17 00:00:00 2001 From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com> Date: Fri, 20 Mar 2026 17:37:26 +0100 Subject: [PATCH] embedding nv1 + search tests --- .../at/procon/dip/config/JacksonConfig.java | 18 +++ .../TedGenericDocumentRootService.java | 89 +++++++++++ .../embedding/config/EmbeddingProperties.java | 44 +++++ .../model/EmbeddingModelDescriptor.java | 16 ++ .../model/EmbeddingProviderResult.java | 12 ++ .../dip/embedding/model/EmbeddingRequest.java | 14 ++ .../dip/embedding/model/EmbeddingUseCase.java | 6 + .../ResolvedEmbeddingProviderConfig.java | 18 +++ .../embedding/provider/EmbeddingProvider.java | 25 +++ .../http/ExternalHttpEmbeddingProvider.java | 151 ++++++++++++++++++ .../provider/mock/MockEmbeddingProvider.java | 72 +++++++++ .../registry/EmbeddingModelRegistry.java | 63 ++++++++ .../EmbeddingProviderConfigResolver.java | 32 ++++ .../registry/EmbeddingProviderRegistry.java | 20 +++ .../service/DefaultQueryEmbeddingService.java | 36 +++++ .../service/EmbeddingExecutionService.java | 51 ++++++ .../service/QueryEmbeddingService.java | 8 + .../EmbeddingSubsystemStartupValidator.java | 45 ++++++ src/main/resources/application.yml | 34 +++- .../sql/create-doc-search-test-schemas.sql | 3 + 20 files changed, 756 insertions(+), 1 deletion(-) create mode 100644 src/main/java/at/procon/dip/config/JacksonConfig.java create mode 100644 src/main/java/at/procon/dip/domain/ted/service/TedGenericDocumentRootService.java create mode 100644 src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java create mode 100644 src/main/java/at/procon/dip/embedding/model/EmbeddingModelDescriptor.java create mode 100644 src/main/java/at/procon/dip/embedding/model/EmbeddingProviderResult.java create mode 100644 src/main/java/at/procon/dip/embedding/model/EmbeddingRequest.java create mode 100644 src/main/java/at/procon/dip/embedding/model/EmbeddingUseCase.java create mode 100644 
src/main/java/at/procon/dip/embedding/model/ResolvedEmbeddingProviderConfig.java create mode 100644 src/main/java/at/procon/dip/embedding/provider/EmbeddingProvider.java create mode 100644 src/main/java/at/procon/dip/embedding/provider/http/ExternalHttpEmbeddingProvider.java create mode 100644 src/main/java/at/procon/dip/embedding/provider/mock/MockEmbeddingProvider.java create mode 100644 src/main/java/at/procon/dip/embedding/registry/EmbeddingModelRegistry.java create mode 100644 src/main/java/at/procon/dip/embedding/registry/EmbeddingProviderConfigResolver.java create mode 100644 src/main/java/at/procon/dip/embedding/registry/EmbeddingProviderRegistry.java create mode 100644 src/main/java/at/procon/dip/embedding/service/DefaultQueryEmbeddingService.java create mode 100644 src/main/java/at/procon/dip/embedding/service/EmbeddingExecutionService.java create mode 100644 src/main/java/at/procon/dip/embedding/service/QueryEmbeddingService.java create mode 100644 src/main/java/at/procon/dip/embedding/startup/EmbeddingSubsystemStartupValidator.java create mode 100644 src/test/resources/sql/create-doc-search-test-schemas.sql diff --git a/src/main/java/at/procon/dip/config/JacksonConfig.java b/src/main/java/at/procon/dip/config/JacksonConfig.java new file mode 100644 index 0000000..538b232 --- /dev/null +++ b/src/main/java/at/procon/dip/config/JacksonConfig.java @@ -0,0 +1,18 @@ +package at.procon.dip.config; + +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import org.springframework.boot.autoconfigure.jackson.Jackson2ObjectMapperBuilderCustomizer; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +public class JacksonConfig { + + @Bean + public Jackson2ObjectMapperBuilderCustomizer jsonCustomizer() { + return builder -> builder + .modules(new JavaTimeModule()) + 
.featuresToDisable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); + } +} \ No newline at end of file diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedGenericDocumentRootService.java b/src/main/java/at/procon/dip/domain/ted/service/TedGenericDocumentRootService.java new file mode 100644 index 0000000..d377e4f --- /dev/null +++ b/src/main/java/at/procon/dip/domain/ted/service/TedGenericDocumentRootService.java @@ -0,0 +1,89 @@ +package at.procon.dip.domain.ted.service; + +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.domain.document.repository.DocumentRepository; +import at.procon.dip.domain.document.service.DocumentService; +import at.procon.dip.domain.document.service.command.CreateDocumentCommand; +import at.procon.ted.model.entity.ProcurementDocument; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; +import org.springframework.util.StringUtils; + +/** + * Side-effect-free helper for TED projection flows. + *

+ * Unlike the legacy Phase 2 bridge, this service only ensures that the canonical + * DOC document root exists and is refreshed with TED metadata. It intentionally + * does not create/update sources, contents, representations, or embeddings. + */ +@Service +@RequiredArgsConstructor +@Slf4j +public class TedGenericDocumentRootService { + + private final DocumentRepository documentRepository; + private final DocumentService documentService; + + @Transactional + public UUID ensureGenericTedDocumentRoot(ProcurementDocument tedDocument) { + return ensureGenericTedDocument(tedDocument).getId(); + } + + @Transactional + public Document ensureGenericTedDocument(ProcurementDocument tedDocument) { + Document document = documentRepository.findByDedupHash(tedDocument.getDocumentHash()) + .orElseGet(() -> createGenericDocument(tedDocument)); + + document.setDocumentType(DocumentType.TED_NOTICE); + document.setDocumentFamily(DocumentFamily.PROCUREMENT); + document.setVisibility(DocumentVisibility.PUBLIC); + document.setStatus(DocumentStatus.CLASSIFIED); + document.setTitle(tedDocument.getProjectTitle()); + document.setSummary(tedDocument.getProjectDescription()); + document.setLanguageCode(tedDocument.getLanguageCode()); + document.setMimeType("application/xml"); + document.setBusinessKey(buildBusinessKey(tedDocument)); + document.setDedupHash(tedDocument.getDocumentHash()); + + Document saved = documentService.save(document); + log.debug("Ensured side-effect-free generic TED document root {} for legacy TED document {}", + saved.getId(), tedDocument.getId()); + return saved; + } + + private Document createGenericDocument(ProcurementDocument tedDocument) { + return documentService.create(new CreateDocumentCommand( + null, + DocumentVisibility.PUBLIC, + DocumentType.TED_NOTICE, + DocumentFamily.PROCUREMENT, + DocumentStatus.CLASSIFIED, + tedDocument.getProjectTitle(), + tedDocument.getProjectDescription(), + tedDocument.getLanguageCode(), + "application/xml", + 
buildBusinessKey(tedDocument), + tedDocument.getDocumentHash() + )); + } + + private String buildBusinessKey(ProcurementDocument tedDocument) { + if (StringUtils.hasText(tedDocument.getPublicationId())) { + return "TED:publication:" + tedDocument.getPublicationId(); + } + if (StringUtils.hasText(tedDocument.getNoticeId())) { + return "TED:notice:" + tedDocument.getNoticeId(); + } + if (StringUtils.hasText(tedDocument.getNoticeUrl())) { + return "TED:url:" + tedDocument.getNoticeUrl(); + } + return "TED:hash:" + tedDocument.getDocumentHash(); + } +} diff --git a/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java b/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java new file mode 100644 index 0000000..3a34c91 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java @@ -0,0 +1,44 @@ +package at.procon.dip.embedding.config; + +import at.procon.dip.domain.document.DistanceMetric; +import java.time.Duration; +import java.util.LinkedHashMap; +import java.util.Map; +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Configuration; + +@Configuration +@ConfigurationProperties(prefix = "dip.embedding") +@Data +public class EmbeddingProperties { + + private boolean enabled = false; + private String defaultDocumentModel; + private String defaultQueryModel; + private Map providers = new LinkedHashMap<>(); + private Map models = new LinkedHashMap<>(); + + @Data + public static class ProviderProperties { + private String type; + private String baseUrl; + private String apiKey; + private Duration connectTimeout = Duration.ofSeconds(5); + private Duration readTimeout = Duration.ofSeconds(60); + private Map headers = new LinkedHashMap<>(); + private Integer dimensions; + } + + @Data + public static class ModelProperties { + private String providerConfigKey; + private String providerModelKey; + private Integer dimensions; + 
private DistanceMetric distanceMetric = DistanceMetric.COSINE; + private boolean supportsQueryEmbeddingMode = true; + private boolean supportsBatch = false; + private Integer maxInputChars; + private boolean active = true; + } +} diff --git a/src/main/java/at/procon/dip/embedding/model/EmbeddingModelDescriptor.java b/src/main/java/at/procon/dip/embedding/model/EmbeddingModelDescriptor.java new file mode 100644 index 0000000..8982901 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/model/EmbeddingModelDescriptor.java @@ -0,0 +1,16 @@ +package at.procon.dip.embedding.model; + +import at.procon.dip.domain.document.DistanceMetric; + +public record EmbeddingModelDescriptor( + String modelKey, + String providerConfigKey, + String providerModelKey, + int dimensions, + DistanceMetric distanceMetric, + boolean supportsQueryEmbeddingMode, + boolean supportsBatch, + Integer maxInputChars, + boolean active +) { +} diff --git a/src/main/java/at/procon/dip/embedding/model/EmbeddingProviderResult.java b/src/main/java/at/procon/dip/embedding/model/EmbeddingProviderResult.java new file mode 100644 index 0000000..d6ae13d --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/model/EmbeddingProviderResult.java @@ -0,0 +1,12 @@ +package at.procon.dip.embedding.model; + +import java.util.List; + +public record EmbeddingProviderResult( + EmbeddingModelDescriptor model, + List vectors, + List warnings, + String providerRequestId, + Integer tokenCount +) { +} diff --git a/src/main/java/at/procon/dip/embedding/model/EmbeddingRequest.java b/src/main/java/at/procon/dip/embedding/model/EmbeddingRequest.java new file mode 100644 index 0000000..3ceaa2a --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/model/EmbeddingRequest.java @@ -0,0 +1,14 @@ +package at.procon.dip.embedding.model; + +import java.util.List; +import java.util.Map; +import lombok.Builder; + +@Builder +public record EmbeddingRequest( + String modelKey, + EmbeddingUseCase useCase, + List texts, + Map 
providerOptions +) { +} diff --git a/src/main/java/at/procon/dip/embedding/model/EmbeddingUseCase.java b/src/main/java/at/procon/dip/embedding/model/EmbeddingUseCase.java new file mode 100644 index 0000000..787dee6 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/model/EmbeddingUseCase.java @@ -0,0 +1,6 @@ +package at.procon.dip.embedding.model; + +public enum EmbeddingUseCase { + DOCUMENT, + QUERY +} diff --git a/src/main/java/at/procon/dip/embedding/model/ResolvedEmbeddingProviderConfig.java b/src/main/java/at/procon/dip/embedding/model/ResolvedEmbeddingProviderConfig.java new file mode 100644 index 0000000..4d0e8e6 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/model/ResolvedEmbeddingProviderConfig.java @@ -0,0 +1,18 @@ +package at.procon.dip.embedding.model; + +import java.time.Duration; +import java.util.Map; +import lombok.Builder; + +@Builder +public record ResolvedEmbeddingProviderConfig( + String key, + String providerType, + String baseUrl, + String apiKey, + Duration connectTimeout, + Duration readTimeout, + Map headers, + Integer dimensions +) { +} diff --git a/src/main/java/at/procon/dip/embedding/provider/EmbeddingProvider.java b/src/main/java/at/procon/dip/embedding/provider/EmbeddingProvider.java new file mode 100644 index 0000000..39de59c --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/provider/EmbeddingProvider.java @@ -0,0 +1,25 @@ +package at.procon.dip.embedding.provider; + +import at.procon.dip.embedding.model.EmbeddingModelDescriptor; +import at.procon.dip.embedding.model.EmbeddingProviderResult; +import at.procon.dip.embedding.model.EmbeddingRequest; +import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig; + +public interface EmbeddingProvider { + + String providerType(); + + boolean supports(EmbeddingModelDescriptor model, ResolvedEmbeddingProviderConfig providerConfig); + + EmbeddingProviderResult embedDocuments( + ResolvedEmbeddingProviderConfig providerConfig, + EmbeddingModelDescriptor model, 
+ EmbeddingRequest request + ); + + EmbeddingProviderResult embedQuery( + ResolvedEmbeddingProviderConfig providerConfig, + EmbeddingModelDescriptor model, + EmbeddingRequest request + ); +} diff --git a/src/main/java/at/procon/dip/embedding/provider/http/ExternalHttpEmbeddingProvider.java b/src/main/java/at/procon/dip/embedding/provider/http/ExternalHttpEmbeddingProvider.java new file mode 100644 index 0000000..7fb44db --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/provider/http/ExternalHttpEmbeddingProvider.java @@ -0,0 +1,151 @@ +package at.procon.dip.embedding.provider.http; + +import at.procon.dip.embedding.model.EmbeddingModelDescriptor; +import at.procon.dip.embedding.model.EmbeddingProviderResult; +import at.procon.dip.embedding.model.EmbeddingRequest; +import at.procon.dip.embedding.model.EmbeddingUseCase; +import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig; +import at.procon.dip.embedding.provider.EmbeddingProvider; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; + +@Component +@RequiredArgsConstructor +public class ExternalHttpEmbeddingProvider implements EmbeddingProvider { + + private static final String PROVIDER_TYPE = "http-json"; + + private final ObjectMapper objectMapper; + private final HttpClient httpClient = HttpClient.newBuilder().version(HttpClient.Version.HTTP_1_1).build(); + + @Override + public String providerType() { + return PROVIDER_TYPE; + } + + @Override + public boolean supports(EmbeddingModelDescriptor model, ResolvedEmbeddingProviderConfig providerConfig) { + 
return PROVIDER_TYPE.equalsIgnoreCase(providerConfig.providerType()); + } + + @Override + public EmbeddingProviderResult embedDocuments(ResolvedEmbeddingProviderConfig providerConfig, + EmbeddingModelDescriptor model, + EmbeddingRequest request) { + return execute(providerConfig, model, request, EmbeddingUseCase.DOCUMENT); + } + + @Override + public EmbeddingProviderResult embedQuery(ResolvedEmbeddingProviderConfig providerConfig, + EmbeddingModelDescriptor model, + EmbeddingRequest request) { + return execute(providerConfig, model, request, EmbeddingUseCase.QUERY); + } + + private EmbeddingProviderResult execute(ResolvedEmbeddingProviderConfig providerConfig, + EmbeddingModelDescriptor model, + EmbeddingRequest request, + EmbeddingUseCase useCase) { + try { + var payload = new ProviderRequest( + model.providerModelKey(), + request.texts(), + useCase == EmbeddingUseCase.QUERY, + request.providerOptions() == null ? Map.of() : request.providerOptions() + ); + + HttpRequest.Builder builder = HttpRequest.newBuilder() + .uri(URI.create(trimTrailingSlash(providerConfig.baseUrl()) + "/embed")) + .timeout(providerConfig.readTimeout() == null ? 
Duration.ofSeconds(60) : providerConfig.readTimeout()) + .header("Content-Type", "application/json") + .POST(HttpRequest.BodyPublishers.ofString(objectMapper.writeValueAsString(payload), StandardCharsets.UTF_8)); + + if (providerConfig.apiKey() != null && !providerConfig.apiKey().isBlank()) { + builder.header("Authorization", "Bearer " + providerConfig.apiKey()); + } + if (providerConfig.headers() != null) { + providerConfig.headers().forEach(builder::header); + } + + HttpResponse response = httpClient.send(builder.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + if (response.statusCode() / 100 != 2) { + throw new IllegalStateException("Embedding provider returned status %d: %s".formatted(response.statusCode(), response.body())); + } + + ProviderResponse parsed = objectMapper.readValue(response.body(), ProviderResponse.class); + List vectors = new ArrayList<>(); + if (parsed.embeddings != null) { + for (List embedding : parsed.embeddings) { + vectors.add(toArray(embedding)); + } + } else if (parsed.embedding != null) { + vectors.add(toArray(parsed.embedding)); + } + + return new EmbeddingProviderResult( + model, + vectors, + parsed.warnings == null ? List.of() : parsed.warnings, + parsed.requestId, + parsed.tokenCount + ); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IllegalStateException("Embedding provider call interrupted", e); + } catch (IOException e) { + throw new IllegalStateException("Failed to call external embedding provider", e); + } + } + + private float[] toArray(List embedding) { + float[] result = new float[embedding.size()]; + for (int i = 0; i < embedding.size(); i++) { + result[i] = embedding.get(i); + } + return result; + } + + private String trimTrailingSlash(String value) { + if (value == null || value.isBlank()) { + throw new IllegalArgumentException("Embedding provider baseUrl must be configured"); + } + return value.endsWith("/") ? 
value.substring(0, value.length() - 1) : value; + } + + private record ProviderRequest( + @JsonProperty("model") String model, + @JsonProperty("texts") List texts, + @JsonProperty("is_query") boolean query, + @JsonProperty("options") Map options + ) { + } + + private static class ProviderResponse { + @JsonProperty("embedding") + public List embedding; + + @JsonProperty("embeddings") + public List> embeddings; + + @JsonProperty("warnings") + public List warnings; + + @JsonProperty("request_id") + public String requestId; + + @JsonProperty("token_count") + public Integer tokenCount; + } +} diff --git a/src/main/java/at/procon/dip/embedding/provider/mock/MockEmbeddingProvider.java b/src/main/java/at/procon/dip/embedding/provider/mock/MockEmbeddingProvider.java new file mode 100644 index 0000000..ee323f6 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/provider/mock/MockEmbeddingProvider.java @@ -0,0 +1,72 @@ +package at.procon.dip.embedding.provider.mock; + +import at.procon.dip.embedding.model.EmbeddingModelDescriptor; +import at.procon.dip.embedding.model.EmbeddingProviderResult; +import at.procon.dip.embedding.model.EmbeddingRequest; +import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig; +import at.procon.dip.embedding.provider.EmbeddingProvider; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; +import org.springframework.stereotype.Component; + +@Component +public class MockEmbeddingProvider implements EmbeddingProvider { + + private static final String PROVIDER_TYPE = "mock"; + + @Override + public String providerType() { + return PROVIDER_TYPE; + } + + @Override + public boolean supports(EmbeddingModelDescriptor model, ResolvedEmbeddingProviderConfig providerConfig) { + return PROVIDER_TYPE.equalsIgnoreCase(providerConfig.providerType()); + } + + @Override + public EmbeddingProviderResult embedDocuments(ResolvedEmbeddingProviderConfig providerConfig, + EmbeddingModelDescriptor model, + EmbeddingRequest 
request) { + return execute(providerConfig, model, request); + } + + @Override + public EmbeddingProviderResult embedQuery(ResolvedEmbeddingProviderConfig providerConfig, + EmbeddingModelDescriptor model, + EmbeddingRequest request) { + return execute(providerConfig, model, request); + } + + private EmbeddingProviderResult execute(ResolvedEmbeddingProviderConfig providerConfig, + EmbeddingModelDescriptor model, + EmbeddingRequest request) { + int dimensions = model.dimensions() > 0 + ? model.dimensions() + : providerConfig.dimensions() == null ? 16 : providerConfig.dimensions(); + + List vectors = new ArrayList<>(); + for (String text : request.texts()) { + vectors.add(embedDeterministically(text, dimensions)); + } + + return new EmbeddingProviderResult( + model, + vectors, + List.of(), + "mock-" + UUID.randomUUID(), + request.texts().stream().mapToInt(text -> text == null ? 0 : text.length()).sum() + ); + } + + private float[] embedDeterministically(String text, int dimensions) { + float[] vector = new float[dimensions]; + String value = text == null ? 
"" : text; + for (int i = 0; i < value.length(); i++) { + int bucket = i % dimensions; + vector[bucket] += ((value.charAt(i) % 31) + 1) / 31.0f; + } + return vector; + } +} diff --git a/src/main/java/at/procon/dip/embedding/registry/EmbeddingModelRegistry.java b/src/main/java/at/procon/dip/embedding/registry/EmbeddingModelRegistry.java new file mode 100644 index 0000000..7b1a508 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/registry/EmbeddingModelRegistry.java @@ -0,0 +1,63 @@ +package at.procon.dip.embedding.registry; + +import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.model.EmbeddingModelDescriptor; +import java.util.List; +import java.util.Optional; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; + +@Component +@RequiredArgsConstructor +public class EmbeddingModelRegistry { + + private final EmbeddingProperties properties; + + public EmbeddingModelDescriptor getRequired(String modelKey) { + return find(modelKey) + .orElseThrow(() -> new IllegalArgumentException("Unknown embedding model key: " + modelKey)); + } + + public Optional find(String modelKey) { + EmbeddingProperties.ModelProperties model = properties.getModels().get(modelKey); + if (model == null) { + return Optional.empty(); + } + return Optional.of(toDescriptor(modelKey, model)); + } + + public List getActiveModels() { + return properties.getModels().entrySet().stream() + .filter(entry -> entry.getValue().isActive()) + .map(entry -> toDescriptor(entry.getKey(), entry.getValue())) + .toList(); + } + + public String getRequiredDefaultQueryModelKey() { + if (properties.getDefaultQueryModel() == null || properties.getDefaultQueryModel().isBlank()) { + throw new IllegalStateException("dip.embedding.default-query-model is not configured"); + } + return properties.getDefaultQueryModel(); + } + + public String getRequiredDefaultDocumentModelKey() { + if (properties.getDefaultDocumentModel() == null || 
properties.getDefaultDocumentModel().isBlank()) { + throw new IllegalStateException("dip.embedding.default-document-model is not configured"); + } + return properties.getDefaultDocumentModel(); + } + + private EmbeddingModelDescriptor toDescriptor(String modelKey, EmbeddingProperties.ModelProperties model) { + return new EmbeddingModelDescriptor( + modelKey, + model.getProviderConfigKey(), + model.getProviderModelKey() == null || model.getProviderModelKey().isBlank() ? modelKey : model.getProviderModelKey(), + model.getDimensions() == null ? 0 : model.getDimensions(), + model.getDistanceMetric(), + model.isSupportsQueryEmbeddingMode(), + model.isSupportsBatch(), + model.getMaxInputChars(), + model.isActive() + ); + } +} diff --git a/src/main/java/at/procon/dip/embedding/registry/EmbeddingProviderConfigResolver.java b/src/main/java/at/procon/dip/embedding/registry/EmbeddingProviderConfigResolver.java new file mode 100644 index 0000000..7c3ece8 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/registry/EmbeddingProviderConfigResolver.java @@ -0,0 +1,32 @@ +package at.procon.dip.embedding.registry; + +import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig; +import java.util.Map; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; + +@Component +@RequiredArgsConstructor +public class EmbeddingProviderConfigResolver { + + private final EmbeddingProperties properties; + + public ResolvedEmbeddingProviderConfig resolve(String providerConfigKey) { + EmbeddingProperties.ProviderProperties provider = properties.getProviders().get(providerConfigKey); + if (provider == null) { + throw new IllegalArgumentException("Unknown embedding provider config key: " + providerConfigKey); + } + + return ResolvedEmbeddingProviderConfig.builder() + .key(providerConfigKey) + .providerType(provider.getType()) + .baseUrl(provider.getBaseUrl()) + .apiKey(provider.getApiKey()) + 
.connectTimeout(provider.getConnectTimeout()) + .readTimeout(provider.getReadTimeout()) + .headers(provider.getHeaders() == null ? Map.of() : Map.copyOf(provider.getHeaders())) + .dimensions(provider.getDimensions()) + .build(); + } +} diff --git a/src/main/java/at/procon/dip/embedding/registry/EmbeddingProviderRegistry.java b/src/main/java/at/procon/dip/embedding/registry/EmbeddingProviderRegistry.java new file mode 100644 index 0000000..0993946 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/registry/EmbeddingProviderRegistry.java @@ -0,0 +1,20 @@ +package at.procon.dip.embedding.registry; + +import at.procon.dip.embedding.provider.EmbeddingProvider; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; + +@Component +@RequiredArgsConstructor +public class EmbeddingProviderRegistry { + + private final List providers; + + public EmbeddingProvider getRequired(String providerType) { + return providers.stream() + .filter(provider -> provider.providerType().equalsIgnoreCase(providerType)) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("No embedding provider registered for type: " + providerType)); + } +} diff --git a/src/main/java/at/procon/dip/embedding/service/DefaultQueryEmbeddingService.java b/src/main/java/at/procon/dip/embedding/service/DefaultQueryEmbeddingService.java new file mode 100644 index 0000000..545d1e5 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/service/DefaultQueryEmbeddingService.java @@ -0,0 +1,36 @@ +package at.procon.dip.embedding.service; + +import at.procon.dip.embedding.model.EmbeddingProviderResult; +import at.procon.dip.embedding.model.EmbeddingUseCase; +import at.procon.dip.embedding.registry.EmbeddingModelRegistry; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +public class DefaultQueryEmbeddingService implements QueryEmbeddingService { 
+ + private final EmbeddingExecutionService executionService; + private final EmbeddingModelRegistry modelRegistry; + + @Override + public float[] embedQuery(String queryText) { + return embedQuery(queryText, modelRegistry.getRequiredDefaultQueryModelKey()); + } + + @Override + public float[] embedQuery(String queryText, String modelKey) { + EmbeddingProviderResult result = executionService.embedTexts( + modelKey, + EmbeddingUseCase.QUERY, + List.of(queryText) + ); + + if (result.vectors() == null || result.vectors().isEmpty()) { + throw new IllegalStateException("Embedding provider returned no query vector for model " + modelKey); + } + + return result.vectors().getFirst(); + } +} diff --git a/src/main/java/at/procon/dip/embedding/service/EmbeddingExecutionService.java b/src/main/java/at/procon/dip/embedding/service/EmbeddingExecutionService.java new file mode 100644 index 0000000..4e4f87c --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/service/EmbeddingExecutionService.java @@ -0,0 +1,51 @@ +package at.procon.dip.embedding.service; + +import at.procon.dip.embedding.model.EmbeddingProviderResult; +import at.procon.dip.embedding.model.EmbeddingRequest; +import at.procon.dip.embedding.model.EmbeddingUseCase; +import at.procon.dip.embedding.provider.EmbeddingProvider; +import at.procon.dip.embedding.registry.EmbeddingModelRegistry; +import at.procon.dip.embedding.registry.EmbeddingProviderConfigResolver; +import at.procon.dip.embedding.registry.EmbeddingProviderRegistry; +import java.util.List; +import java.util.Map; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +public class EmbeddingExecutionService { + + private final EmbeddingModelRegistry modelRegistry; + private final EmbeddingProviderConfigResolver providerConfigResolver; + private final EmbeddingProviderRegistry providerRegistry; + + public EmbeddingProviderResult embedTexts(String modelKey, EmbeddingUseCase useCase, 
List texts) { + return embedTexts(modelKey, useCase, texts, Map.of()); + } + + public EmbeddingProviderResult embedTexts(String modelKey, + EmbeddingUseCase useCase, + List texts, + Map providerOptions) { + var model = modelRegistry.getRequired(modelKey); + var providerConfig = providerConfigResolver.resolve(model.providerConfigKey()); + EmbeddingProvider provider = providerRegistry.getRequired(providerConfig.providerType()); + + if (!provider.supports(model, providerConfig)) { + throw new IllegalStateException("Provider %s does not support model %s".formatted( + provider.providerType(), model.modelKey())); + } + + EmbeddingRequest request = EmbeddingRequest.builder() + .modelKey(model.modelKey()) + .useCase(useCase) + .texts(texts) + .providerOptions(providerOptions) + .build(); + + return useCase == EmbeddingUseCase.QUERY + ? provider.embedQuery(providerConfig, model, request) + : provider.embedDocuments(providerConfig, model, request); + } +} diff --git a/src/main/java/at/procon/dip/embedding/service/QueryEmbeddingService.java b/src/main/java/at/procon/dip/embedding/service/QueryEmbeddingService.java new file mode 100644 index 0000000..3f44ca6 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/service/QueryEmbeddingService.java @@ -0,0 +1,8 @@ +package at.procon.dip.embedding.service; + +public interface QueryEmbeddingService { + + float[] embedQuery(String queryText); + + float[] embedQuery(String queryText, String modelKey); +} diff --git a/src/main/java/at/procon/dip/embedding/startup/EmbeddingSubsystemStartupValidator.java b/src/main/java/at/procon/dip/embedding/startup/EmbeddingSubsystemStartupValidator.java new file mode 100644 index 0000000..16f1871 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/startup/EmbeddingSubsystemStartupValidator.java @@ -0,0 +1,45 @@ +package at.procon.dip.embedding.startup; + +import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.registry.EmbeddingModelRegistry; +import 
at.procon.dip.embedding.registry.EmbeddingProviderConfigResolver; +import at.procon.dip.embedding.registry.EmbeddingProviderRegistry; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Component; + +@Component +@RequiredArgsConstructor +@ConditionalOnProperty(prefix = "dip.embedding", name = "enabled", havingValue = "true") +@Slf4j +public class EmbeddingSubsystemStartupValidator implements ApplicationRunner { + + private final EmbeddingProperties properties; + private final EmbeddingModelRegistry modelRegistry; + private final EmbeddingProviderConfigResolver providerConfigResolver; + private final EmbeddingProviderRegistry providerRegistry; + + @Override + public void run(ApplicationArguments args) { + if (properties.getModels().isEmpty()) { + throw new IllegalStateException("dip.embedding.enabled=true but no models are configured"); + } + + modelRegistry.getActiveModels().forEach(model -> { + var providerConfig = providerConfigResolver.resolve(model.providerConfigKey()); + providerRegistry.getRequired(providerConfig.providerType()); + log.info("Validated embedding model {} -> provider {} ({})", + model.modelKey(), model.providerConfigKey(), providerConfig.providerType()); + }); + + if (properties.getDefaultDocumentModel() != null && !properties.getDefaultDocumentModel().isBlank()) { + modelRegistry.getRequired(properties.getDefaultDocumentModel()); + } + if (properties.getDefaultQueryModel() != null && !properties.getDefaultQueryModel().isBlank()) { + modelRegistry.getRequired(properties.getDefaultQueryModel()); + } + } +} diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 7e0fa09..ad4f212 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml 
@@ -154,7 +154,7 @@ ted: # TED Daily Package Download configuration download: # Enable/disable automatic package download - enabled: true + enabled: false # User service-based camel route use-service-based: false # Base URL for TED Daily Packages @@ -344,3 +344,35 @@ logging: org.apache.camel: INFO org.hibernate.SQL: WARN org.hibernate.type.descriptor.sql: WARN + + +# Parallel generic embedding subsystem, built alongside the legacy path (off unless dip.embedding.enabled=true; enabled below with the mock provider as the default model) +dip: + embedding: + enabled: true + default-document-model: mock-search + default-query-model: mock-search + providers: + mock-default: + type: mock + dimensions: 16 + external-e5: + type: http-json + base-url: http://localhost:8001 + connect-timeout: 5s + read-timeout: 60s + models: + mock-search: + provider-config-key: mock-default + provider-model-key: mock-search + dimensions: 16 + distance-metric: COSINE + supports-query-embedding-mode: true + active: true + e5-default: + provider-config-key: external-e5 + provider-model-key: intfloat/multilingual-e5-large + dimensions: 1024 + distance-metric: COSINE + supports-query-embedding-mode: true + active: true diff --git a/src/test/resources/sql/create-doc-search-test-schemas.sql b/src/test/resources/sql/create-doc-search-test-schemas.sql new file mode 100644 index 0000000..dcff347 --- /dev/null +++ b/src/test/resources/sql/create-doc-search-test-schemas.sql @@ -0,0 +1,3 @@ +CREATE SCHEMA IF NOT EXISTS DOC; +CREATE SCHEMA IF NOT EXISTS TED; +CREATE EXTENSION IF NOT EXISTS pg_trgm;