diff --git a/docs/embedding/NV3_IMPLEMENTATION_NOTES.md b/docs/embedding/NV3_IMPLEMENTATION_NOTES.md new file mode 100644 index 0000000..9f6c2c9 --- /dev/null +++ b/docs/embedding/NV3_IMPLEMENTATION_NOTES.md @@ -0,0 +1,43 @@ +# NV3 — Generic semantic search on the new embedding subsystem + +This phase keeps the new embedding subsystem parallel to the legacy flow and plugs semantic retrieval +into the generic search architecture. + +## Scope + +- query embeddings are generated through `at.procon.dip.embedding.service.QueryEmbeddingService` +- semantic search uses `DOC.doc_embedding` +- retrieval joins `DOC.doc_text_representation` and `DOC.doc_document` +- chunk-aware document collapse remains in the generic search fusion layer +- no structured TED/mail search in this phase +- no legacy cutover in this phase + +## Main classes + +- `at.procon.dip.search.service.SemanticQueryEmbeddingService` +- `at.procon.dip.search.engine.semantic.PgVectorSemanticSearchEngine` +- `at.procon.dip.search.repository.DocumentSemanticSearchRepository` + +## Query model selection + +Order of precedence: + +1. `SearchRequest.semanticModelKey` +2. `dip.embedding.default-query-model` + +The selected model is ensured to be registered in `DOC.doc_embedding_model` through +`EmbeddingModelCatalogService` before the query runs. + +## Search flow + +1. planner includes `PGVECTOR_SEMANTIC` for `SEMANTIC` or `HYBRID` +2. `SemanticQueryEmbeddingService` builds a query vector +3. `DocumentSemanticSearchRepository` searches `DOC.doc_embedding` +4. generic fusion/collapse merges semantic hits with lexical hits + +## Notes + +- the SQL uses `public.vector` explicitly in casts to avoid search-path-related surprises +- the repository returns representation metadata (`representation_type`, chunk offsets, etc.) 
+- `SearchRequest.semanticModelKey` is optional and keeps the API model-aware without forcing users + to choose a model for every request diff --git a/src/main/java/at/procon/dip/search/dto/SearchRequest.java b/src/main/java/at/procon/dip/search/dto/SearchRequest.java index 0b6becb..583686d 100644 --- a/src/main/java/at/procon/dip/search/dto/SearchRequest.java +++ b/src/main/java/at/procon/dip/search/dto/SearchRequest.java @@ -44,4 +44,10 @@ public class SearchRequest { @Builder.Default private SearchRepresentationSelectionMode representationSelectionMode = SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS; + + /** + * Optional semantic model key used only by the semantic search engine. + * When omitted, the new embedding subsystem default query model is used. + */ + private String semanticModelKey; } diff --git a/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java b/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java index f73bdda..a238f73 100644 --- a/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java +++ b/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java @@ -1,5 +1,6 @@ package at.procon.dip.search.engine.semantic; +import at.procon.dip.embedding.config.EmbeddingProperties; import at.procon.dip.search.api.SearchExecutionContext; import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; @@ -15,6 +16,7 @@ import org.springframework.stereotype.Component; @RequiredArgsConstructor public class PgVectorSemanticSearchEngine implements SearchEngine { + private final EmbeddingProperties embeddingProperties; private final TedProcessorProperties properties; private final SemanticQueryEmbeddingService queryEmbeddingService; private final DocumentSemanticSearchRepository repository; @@ -26,14 +28,16 @@ public class PgVectorSemanticSearchEngine implements SearchEngine { @Override public boolean 
supports(SearchExecutionContext context) { - return properties.getVectorization().isEnabled() + return embeddingProperties.isEnabled() && context.getRequest().getQueryText() != null && !context.getRequest().getQueryText().isBlank(); } @Override public List execute(SearchExecutionContext context) { - return queryEmbeddingService.buildQueryEmbedding(context.getRequest().getQueryText()) + return queryEmbeddingService.buildQueryEmbedding( + context.getRequest().getQueryText(), + context.getRequest().getSemanticModelKey()) .map(query -> repository.search( context, query.modelId(), diff --git a/src/main/java/at/procon/dip/search/plan/DefaultSearchPlanner.java b/src/main/java/at/procon/dip/search/plan/DefaultSearchPlanner.java index 99769ab..08fd995 100644 --- a/src/main/java/at/procon/dip/search/plan/DefaultSearchPlanner.java +++ b/src/main/java/at/procon/dip/search/plan/DefaultSearchPlanner.java @@ -6,7 +6,6 @@ import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchMode; import java.util.ArrayList; import java.util.LinkedHashSet; -import java.util.List; import java.util.Set; import org.springframework.stereotype.Component; diff --git a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java index e6d2b9f..fb9659c 100644 --- a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java +++ b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java @@ -26,6 +26,7 @@ public class DocumentSemanticSearchRepository { SELECT d.id AS document_id, dtr.id AS representation_id, + CAST(dtr.representation_type AS text) AS representation_type, CAST(d.document_type AS text) AS document_type, CAST(d.document_family AS text) AS document_family, CAST(d.visibility AS text) AS visibility, @@ -33,7 +34,6 @@ public class DocumentSemanticSearchRepository { d.summary AS summary, 
COALESCE(dtr.language_code, d.language_code) AS language_code, d.mime_type AS mime_type, - CAST(dtr.representation_type AS text) AS representation_type, dtr.is_primary AS is_primary, dtr.chunk_index AS chunk_index, dtr.chunk_start_offset AS chunk_start_offset, @@ -41,7 +41,7 @@ public class DocumentSemanticSearchRepository { d.created_at AS created_at, d.updated_at AS updated_at, LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, - (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) AS score + (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) AS score FROM doc.doc_embedding de JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id JOIN doc.doc_document d ON d.id = de.document_id @@ -49,7 +49,7 @@ public class DocumentSemanticSearchRepository { WHERE de.embedding_status = 'COMPLETED' AND de.embedding_vector IS NOT NULL AND de.model_id = :modelId - AND (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) >= :threshold + AND (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) >= :threshold """); MapSqlParameterSource params = new MapSqlParameterSource(); diff --git a/src/main/java/at/procon/dip/search/service/SemanticQueryEmbeddingService.java b/src/main/java/at/procon/dip/search/service/SemanticQueryEmbeddingService.java index 2117771..a338b2d 100644 --- a/src/main/java/at/procon/dip/search/service/SemanticQueryEmbeddingService.java +++ b/src/main/java/at/procon/dip/search/service/SemanticQueryEmbeddingService.java @@ -1,9 +1,11 @@ package at.procon.dip.search.service; import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; -import at.procon.dip.domain.document.service.DocumentEmbeddingService; -import at.procon.ted.config.TedProcessorProperties; -import at.procon.ted.service.VectorizationService; +import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.registry.EmbeddingModelRegistry; +import 
at.procon.dip.embedding.service.EmbeddingModelCatalogService; +import at.procon.dip.embedding.service.QueryEmbeddingService; +import at.procon.dip.embedding.support.EmbeddingVectorCodec; import java.util.Optional; import java.util.UUID; import lombok.RequiredArgsConstructor; @@ -15,25 +17,47 @@ import org.springframework.stereotype.Service; @Slf4j public class SemanticQueryEmbeddingService { - private final TedProcessorProperties properties; - private final DocumentEmbeddingService documentEmbeddingService; - private final VectorizationService vectorizationService; + private final EmbeddingProperties embeddingProperties; + private final EmbeddingModelRegistry embeddingModelRegistry; + private final EmbeddingModelCatalogService embeddingModelCatalogService; + private final QueryEmbeddingService queryEmbeddingService; public Optional buildQueryEmbedding(String queryText) { - if (!properties.getVectorization().isEnabled()) { + return buildQueryEmbedding(queryText, null); + } + + public Optional buildQueryEmbedding(String queryText, String requestedModelKey) { + if (!embeddingProperties.isEnabled()) { + return Optional.empty(); + } + if (queryText == null || queryText.isBlank()) { return Optional.empty(); } + + String modelKey = resolveModelKey(requestedModelKey); try { - DocumentEmbeddingModel model = documentEmbeddingService.findActiveModelByKey( - properties.getVectorization().getModelName()); - float[] vector = vectorizationService.generateQueryEmbedding(queryText); - return Optional.of(new QueryEmbedding(model.getId(), vectorizationService.floatArrayToVectorString(vector))); + DocumentEmbeddingModel model = embeddingModelCatalogService.ensureRegistered(modelKey); + float[] vector = queryEmbeddingService.embedQuery(queryText, modelKey); + return Optional.of(new QueryEmbedding( + model.getId(), + modelKey, + EmbeddingVectorCodec.toPgVector(vector), + vector.length + )); } catch (Exception e) { - log.warn("Failed to generate semantic query embedding: {}", 
e.getMessage()); + log.warn("Failed to generate semantic query embedding with model {}: {}", modelKey, e.getMessage()); return Optional.empty(); } } - public record QueryEmbedding(UUID modelId, String vectorString) { + private String resolveModelKey(String requestedModelKey) { + if (requestedModelKey != null && !requestedModelKey.isBlank()) { + embeddingModelRegistry.getRequired(requestedModelKey); + return requestedModelKey; + } + return embeddingModelRegistry.getRequiredDefaultQueryModelKey(); + } + + public record QueryEmbedding(UUID modelId, String modelKey, String vectorString, int dimensions) { } } diff --git a/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchOrchestratorIntegrationTest.java b/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchOrchestratorIntegrationTest.java new file mode 100644 index 0000000..19e879e --- /dev/null +++ b/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchOrchestratorIntegrationTest.java @@ -0,0 +1,81 @@ +package at.procon.dip.search.integration; + +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator; +import at.procon.dip.search.dto.SearchEngineType; +import at.procon.dip.search.dto.SearchMode; +import at.procon.dip.search.dto.SearchRepresentationSelectionMode; +import at.procon.dip.search.dto.SearchRequest; +import at.procon.dip.search.dto.SearchResponse; +import at.procon.dip.search.service.SearchOrchestrator; +import at.procon.dip.search.spi.SearchDocumentScope; +import at.procon.dip.testsupport.AbstractSearchIntegrationTest; +import at.procon.dip.testsupport.SearchTestDataFactory; +import java.util.List; +import java.util.Set; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.test.context.TestPropertySource; 
+ +import static org.assertj.core.api.Assertions.assertThat; + +@TestPropertySource(properties = { + "dip.embedding.enabled=true", + "dip.embedding.default-document-model=mock-search", + "dip.embedding.default-query-model=mock-search", + "dip.embedding.providers.mock-default.type=mock", + "dip.embedding.providers.mock-default.dimensions=16", + "dip.embedding.models.mock-search.provider-config-key=mock-default", + "dip.embedding.models.mock-search.provider-model-key=mock-search", + "dip.embedding.models.mock-search.dimensions=16", + "dip.embedding.models.mock-search.active=true", + "dip.embedding.jobs.enabled=true", + "ted.search.similarity-threshold=0.10", + "ted.search.semantic-candidate-limit=50" +}) +class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest { + + @Autowired + private SearchTestDataFactory dataFactory; + + @Autowired + private RepresentationEmbeddingOrchestrator embeddingOrchestrator; + + @Autowired + private SearchOrchestrator searchOrchestrator; + + @Test + void semanticMode_should_return_document_from_chunk_embeddings() { + var created = dataFactory.createDocumentWithPrimaryAndChunks( + "Energy optimization strategy", + "Strategy overview", + "This primary representation only contains a high level overview.", + "en", + List.of( + "Chunk one is introductory and does not contain the target phrase.", + "District heating optimization strategy for municipal energy systems is described here." 
+ )); + + embeddingOrchestrator.enqueueDocument(created.document().getId(), "mock-search"); + int processed = embeddingOrchestrator.processNextReadyBatch(); + assertThat(processed).isGreaterThan(0); + + SearchRequest request = SearchRequest.builder() + .queryText("district heating optimization strategy") + .modes(Set.of(SearchMode.SEMANTIC)) + .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS) + .semanticModelKey("mock-search") + .build(); + + SearchResponse response = searchOrchestrator.search( + request, + new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)); + + assertThat(response.getHits()).isNotEmpty(); + assertThat(response.getHits().getFirst().getTitle()).isEqualTo("Energy optimization strategy"); + assertThat(response.getHits().getFirst().getPrimaryEngine()).isEqualTo(SearchEngineType.PGVECTOR_SEMANTIC); + assertThat(response.getHits().getFirst().getRepresentationType()).isEqualTo(RepresentationType.CHUNK); + } +}