embedding nv3

master
trifonovt 1 month ago
parent 87fdae9f21
commit 19a02cdcf7

@ -0,0 +1,43 @@
# NV3 — Generic semantic search on the new embedding subsystem
This phase keeps the new embedding subsystem running in parallel with the legacy flow and plugs
semantic retrieval into the generic search architecture.
## Scope
- query embeddings are generated through `at.procon.dip.embedding.service.QueryEmbeddingService`
- semantic search uses `DOC.doc_embedding`
- retrieval joins `DOC.doc_text_representation` and `DOC.doc_document`
- chunk-aware document collapse remains in the generic search fusion layer
- no structured TED/mail search in this phase
- no legacy cutover in this phase
## Main classes
- `at.procon.dip.search.service.SemanticQueryEmbeddingService`
- `at.procon.dip.search.engine.semantic.PgVectorSemanticSearchEngine`
- `at.procon.dip.search.repository.DocumentSemanticSearchRepository`
## Query model selection
Order of precedence:
1. `SearchRequest.semanticModelKey`
2. `dip.embedding.default-query-model`
The selected model is ensured to exist in `DOC.doc_embedding_model` through
`EmbeddingModelCatalogService` before the query runs.
## Search flow
1. planner includes `PGVECTOR_SEMANTIC` for `SEMANTIC` or `HYBRID`
2. `SemanticQueryEmbeddingService` builds a query vector
3. `DocumentSemanticSearchRepository` searches `DOC.doc_embedding`
4. generic fusion/collapse merges semantic hits with lexical hits
## Notes
- the SQL uses `public.vector` explicitly in casts to avoid search-path-related surprises
- the repository returns representation metadata (`representation_type`, chunk offsets, etc.)
- `SearchRequest.semanticModelKey` is optional and keeps the API model-aware without forcing users
to choose a model for every request

@ -44,4 +44,10 @@ public class SearchRequest {
@Builder.Default @Builder.Default
private SearchRepresentationSelectionMode representationSelectionMode = private SearchRepresentationSelectionMode representationSelectionMode =
SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS; SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
/**
* Optional semantic model key used only by the semantic search engine.
* When omitted, the new embedding subsystem default query model is used.
*/
private String semanticModelKey;
} }

@ -1,5 +1,6 @@
package at.procon.dip.search.engine.semantic; package at.procon.dip.search.engine.semantic;
import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.search.api.SearchExecutionContext; import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit; import at.procon.dip.search.dto.SearchHit;
@ -15,6 +16,7 @@ import org.springframework.stereotype.Component;
@RequiredArgsConstructor @RequiredArgsConstructor
public class PgVectorSemanticSearchEngine implements SearchEngine { public class PgVectorSemanticSearchEngine implements SearchEngine {
private final EmbeddingProperties embeddingProperties;
private final TedProcessorProperties properties; private final TedProcessorProperties properties;
private final SemanticQueryEmbeddingService queryEmbeddingService; private final SemanticQueryEmbeddingService queryEmbeddingService;
private final DocumentSemanticSearchRepository repository; private final DocumentSemanticSearchRepository repository;
@ -26,14 +28,16 @@ public class PgVectorSemanticSearchEngine implements SearchEngine {
@Override @Override
public boolean supports(SearchExecutionContext context) { public boolean supports(SearchExecutionContext context) {
return properties.getVectorization().isEnabled() return embeddingProperties.isEnabled()
&& context.getRequest().getQueryText() != null && context.getRequest().getQueryText() != null
&& !context.getRequest().getQueryText().isBlank(); && !context.getRequest().getQueryText().isBlank();
} }
@Override @Override
public List<SearchHit> execute(SearchExecutionContext context) { public List<SearchHit> execute(SearchExecutionContext context) {
return queryEmbeddingService.buildQueryEmbedding(context.getRequest().getQueryText()) return queryEmbeddingService.buildQueryEmbedding(
context.getRequest().getQueryText(),
context.getRequest().getSemanticModelKey())
.map(query -> repository.search( .map(query -> repository.search(
context, context,
query.modelId(), query.modelId(),

@ -6,7 +6,6 @@ import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchMode; import at.procon.dip.search.dto.SearchMode;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set; import java.util.Set;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;

@ -26,6 +26,7 @@ public class DocumentSemanticSearchRepository {
SELECT SELECT
d.id AS document_id, d.id AS document_id,
dtr.id AS representation_id, dtr.id AS representation_id,
CAST(dtr.representation_type AS text) AS representation_type,
CAST(d.document_type AS text) AS document_type, CAST(d.document_type AS text) AS document_type,
CAST(d.document_family AS text) AS document_family, CAST(d.document_family AS text) AS document_family,
CAST(d.visibility AS text) AS visibility, CAST(d.visibility AS text) AS visibility,
@ -33,7 +34,6 @@ public class DocumentSemanticSearchRepository {
d.summary AS summary, d.summary AS summary,
COALESCE(dtr.language_code, d.language_code) AS language_code, COALESCE(dtr.language_code, d.language_code) AS language_code,
d.mime_type AS mime_type, d.mime_type AS mime_type,
CAST(dtr.representation_type AS text) AS representation_type,
dtr.is_primary AS is_primary, dtr.is_primary AS is_primary,
dtr.chunk_index AS chunk_index, dtr.chunk_index AS chunk_index,
dtr.chunk_start_offset AS chunk_start_offset, dtr.chunk_start_offset AS chunk_start_offset,
@ -41,7 +41,7 @@ public class DocumentSemanticSearchRepository {
d.created_at AS created_at, d.created_at AS created_at,
d.updated_at AS updated_at, d.updated_at AS updated_at,
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
(1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) AS score (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) AS score
FROM doc.doc_embedding de FROM doc.doc_embedding de
JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
JOIN doc.doc_document d ON d.id = de.document_id JOIN doc.doc_document d ON d.id = de.document_id
@ -49,7 +49,7 @@ public class DocumentSemanticSearchRepository {
WHERE de.embedding_status = 'COMPLETED' WHERE de.embedding_status = 'COMPLETED'
AND de.embedding_vector IS NOT NULL AND de.embedding_vector IS NOT NULL
AND de.model_id = :modelId AND de.model_id = :modelId
AND (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) >= :threshold AND (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) >= :threshold
"""); """);
MapSqlParameterSource params = new MapSqlParameterSource(); MapSqlParameterSource params = new MapSqlParameterSource();

@ -1,9 +1,11 @@
package at.procon.dip.search.service; package at.procon.dip.search.service;
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
import at.procon.dip.domain.document.service.DocumentEmbeddingService; import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.ted.config.TedProcessorProperties; import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import at.procon.ted.service.VectorizationService; import at.procon.dip.embedding.service.EmbeddingModelCatalogService;
import at.procon.dip.embedding.service.QueryEmbeddingService;
import at.procon.dip.embedding.support.EmbeddingVectorCodec;
import java.util.Optional; import java.util.Optional;
import java.util.UUID; import java.util.UUID;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -15,25 +17,47 @@ import org.springframework.stereotype.Service;
@Slf4j @Slf4j
public class SemanticQueryEmbeddingService { public class SemanticQueryEmbeddingService {
private final TedProcessorProperties properties; private final EmbeddingProperties embeddingProperties;
private final DocumentEmbeddingService documentEmbeddingService; private final EmbeddingModelRegistry embeddingModelRegistry;
private final VectorizationService vectorizationService; private final EmbeddingModelCatalogService embeddingModelCatalogService;
private final QueryEmbeddingService queryEmbeddingService;
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText) { public Optional<QueryEmbedding> buildQueryEmbedding(String queryText) {
if (!properties.getVectorization().isEnabled()) { return buildQueryEmbedding(queryText, null);
}
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText, String requestedModelKey) {
if (!embeddingProperties.isEnabled()) {
return Optional.empty();
}
if (queryText == null || queryText.isBlank()) {
return Optional.empty(); return Optional.empty();
} }
String modelKey = resolveModelKey(requestedModelKey);
try { try {
DocumentEmbeddingModel model = documentEmbeddingService.findActiveModelByKey( DocumentEmbeddingModel model = embeddingModelCatalogService.ensureRegistered(modelKey);
properties.getVectorization().getModelName()); float[] vector = queryEmbeddingService.embedQuery(queryText, modelKey);
float[] vector = vectorizationService.generateQueryEmbedding(queryText); return Optional.of(new QueryEmbedding(
return Optional.of(new QueryEmbedding(model.getId(), vectorizationService.floatArrayToVectorString(vector))); model.getId(),
modelKey,
EmbeddingVectorCodec.toPgVector(vector),
vector.length
));
} catch (Exception e) { } catch (Exception e) {
log.warn("Failed to generate semantic query embedding: {}", e.getMessage()); log.warn("Failed to generate semantic query embedding with model {}: {}", modelKey, e.getMessage());
return Optional.empty(); return Optional.empty();
} }
} }
public record QueryEmbedding(UUID modelId, String vectorString) { private String resolveModelKey(String requestedModelKey) {
if (requestedModelKey != null && !requestedModelKey.isBlank()) {
embeddingModelRegistry.getRequired(requestedModelKey);
return requestedModelKey;
}
return embeddingModelRegistry.getRequiredDefaultQueryModelKey();
}
public record QueryEmbedding(UUID modelId, String modelKey, String vectorString, int dimensions) {
} }
} }

@ -0,0 +1,81 @@
package at.procon.dip.search.integration;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchMode;
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.service.SearchOrchestrator;
import at.procon.dip.search.spi.SearchDocumentScope;
import at.procon.dip.testsupport.AbstractSearchIntegrationTest;
import at.procon.dip.testsupport.SearchTestDataFactory;
import java.util.List;
import java.util.Set;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.test.context.TestPropertySource;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Integration test that drives a semantic-only search through {@code SearchOrchestrator}
 * after embedding a document with the {@code mock-search} model.
 *
 * <p>The property overrides below enable the embedding subsystem, declare a {@code mock}
 * provider with 16 dimensions, bind the {@code mock-search} model to it as both the default
 * document and query model, and set the similarity threshold and candidate limit used by
 * the search.
 */
@TestPropertySource(properties = {
        "dip.embedding.enabled=true",
        "dip.embedding.default-document-model=mock-search",
        "dip.embedding.default-query-model=mock-search",
        "dip.embedding.providers.mock-default.type=mock",
        "dip.embedding.providers.mock-default.dimensions=16",
        "dip.embedding.models.mock-search.provider-config-key=mock-default",
        "dip.embedding.models.mock-search.provider-model-key=mock-search",
        "dip.embedding.models.mock-search.dimensions=16",
        "dip.embedding.models.mock-search.active=true",
        "dip.embedding.jobs.enabled=true",
        "ted.search.similarity-threshold=0.10",
        "ted.search.semantic-candidate-limit=50"
})
class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest {

    @Autowired
    private SearchTestDataFactory dataFactory;

    @Autowired
    private RepresentationEmbeddingOrchestrator embeddingOrchestrator;

    @Autowired
    private SearchOrchestrator searchOrchestrator;

    /**
     * Creates a document whose second chunk contains the query phrase, embeds it, and
     * expects the top semantic hit to be that document, reported by the pgvector engine
     * with a {@code CHUNK} representation.
     */
    @Test
    void semanticMode_should_return_document_from_chunk_embeddings() {
        // Arrange: one document with a generic primary text and two chunks; only the
        // second chunk mentions the phrase that is searched for below.
        var created = dataFactory.createDocumentWithPrimaryAndChunks(
                "Energy optimization strategy",
                "Strategy overview",
                "This primary representation only contains a high level overview.",
                "en",
                List.of(
                        "Chunk one is introductory and does not contain the target phrase.",
                        "District heating optimization strategy for municipal energy systems is described here."
                ));

        // Embed the document with the mock model and make sure the batch did some work.
        embeddingOrchestrator.enqueueDocument(created.document().getId(), "mock-search");
        int processedCount = embeddingOrchestrator.processNextReadyBatch();
        assertThat(processedCount).isGreaterThan(0);

        // Act: semantic-only search, with the model key given explicitly on the request.
        SearchRequest semanticRequest = SearchRequest.builder()
                .modes(Set.of(SearchMode.SEMANTIC))
                .queryText("district heating optimization strategy")
                .semanticModelKey("mock-search")
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS)
                .build();
        SearchDocumentScope scope = new SearchDocumentScope(
                Set.of(),
                Set.of(DocumentType.TEXT),
                Set.of(DocumentFamily.GENERIC),
                null,
                null);
        SearchResponse response = searchOrchestrator.search(semanticRequest, scope);

        // Assert: the best hit is the created document, found via a chunk embedding.
        assertThat(response.getHits()).isNotEmpty();
        var topHit = response.getHits().getFirst();
        assertThat(topHit.getTitle()).isEqualTo("Energy optimization strategy");
        assertThat(topHit.getPrimaryEngine()).isEqualTo(SearchEngineType.PGVECTOR_SEMANTIC);
        assertThat(topHit.getRepresentationType()).isEqualTo(RepresentationType.CHUNK);
    }
}
Loading…
Cancel
Save