embedding nv3
This commit is contained in:
parent
87fdae9f21
commit
19a02cdcf7
|
|
@ -0,0 +1,43 @@
|
|||
# NV3 — Generic semantic search on the new embedding subsystem
|
||||
|
||||
This phase keeps the new embedding subsystem parallel to the legacy flow and plugs semantic retrieval
|
||||
into the generic search architecture.
|
||||
|
||||
## Scope
|
||||
|
||||
- query embeddings are generated through `at.procon.dip.embedding.service.QueryEmbeddingService`
|
||||
- semantic search uses `DOC.doc_embedding`
|
||||
- retrieval joins `DOC.doc_text_representation` and `DOC.doc_document`
|
||||
- chunk-aware document collapse remains in the generic search fusion layer
|
||||
- no structured TED/mail search in this phase
|
||||
- no legacy cutover in this phase
|
||||
|
||||
## Main classes
|
||||
|
||||
- `at.procon.dip.search.service.SemanticQueryEmbeddingService`
|
||||
- `at.procon.dip.search.engine.semantic.PgVectorSemanticSearchEngine`
|
||||
- `at.procon.dip.search.repository.DocumentSemanticSearchRepository`
|
||||
|
||||
## Query model selection
|
||||
|
||||
Order of precedence:
|
||||
|
||||
1. `SearchRequest.semanticModelKey`
|
||||
2. `dip.embedding.default-query-model`
|
||||
|
||||
The selected model is guaranteed to be registered in `DOC.doc_embedding_model` through
|
||||
`EmbeddingModelCatalogService` before the query runs.
|
||||
|
||||
## Search flow
|
||||
|
||||
1. the planner includes `PGVECTOR_SEMANTIC` for the `SEMANTIC` or `HYBRID` search modes
|
||||
2. `SemanticQueryEmbeddingService` builds a query vector
|
||||
3. `DocumentSemanticSearchRepository` searches `DOC.doc_embedding`
|
||||
4. generic fusion/collapse merges semantic hits with lexical hits
|
||||
|
||||
## Notes
|
||||
|
||||
- the SQL uses `public.vector` explicitly in casts to avoid search-path-related surprises
|
||||
- the repository returns representation metadata (`representation_type`, chunk offsets, etc.)
|
||||
- `SearchRequest.semanticModelKey` is optional and keeps the API model-aware without forcing users
|
||||
to choose a model for every request
|
||||
|
|
@ -44,4 +44,10 @@ public class SearchRequest {
|
|||
@Builder.Default
|
||||
private SearchRepresentationSelectionMode representationSelectionMode =
|
||||
SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
|
||||
|
||||
/**
|
||||
* Optional semantic model key used only by the semantic search engine.
|
||||
* When omitted, the new embedding subsystem default query model is used.
|
||||
*/
|
||||
private String semanticModelKey;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package at.procon.dip.search.engine.semantic;
|
||||
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
|
|
@ -15,6 +16,7 @@ import org.springframework.stereotype.Component;
|
|||
@RequiredArgsConstructor
|
||||
public class PgVectorSemanticSearchEngine implements SearchEngine {
|
||||
|
||||
private final EmbeddingProperties embeddingProperties;
|
||||
private final TedProcessorProperties properties;
|
||||
private final SemanticQueryEmbeddingService queryEmbeddingService;
|
||||
private final DocumentSemanticSearchRepository repository;
|
||||
|
|
@ -26,14 +28,16 @@ public class PgVectorSemanticSearchEngine implements SearchEngine {
|
|||
|
||||
@Override
|
||||
public boolean supports(SearchExecutionContext context) {
|
||||
return properties.getVectorization().isEnabled()
|
||||
return embeddingProperties.isEnabled()
|
||||
&& context.getRequest().getQueryText() != null
|
||||
&& !context.getRequest().getQueryText().isBlank();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<SearchHit> execute(SearchExecutionContext context) {
|
||||
return queryEmbeddingService.buildQueryEmbedding(context.getRequest().getQueryText())
|
||||
return queryEmbeddingService.buildQueryEmbedding(
|
||||
context.getRequest().getQueryText(),
|
||||
context.getRequest().getSemanticModelKey())
|
||||
.map(query -> repository.search(
|
||||
context,
|
||||
query.modelId(),
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ import at.procon.dip.search.dto.SearchEngineType;
|
|||
import at.procon.dip.search.dto.SearchMode;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ public class DocumentSemanticSearchRepository {
|
|||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
CAST(dtr.representation_type AS text) AS representation_type,
|
||||
CAST(d.document_type AS text) AS document_type,
|
||||
CAST(d.document_family AS text) AS document_family,
|
||||
CAST(d.visibility AS text) AS visibility,
|
||||
|
|
@ -33,7 +34,6 @@ public class DocumentSemanticSearchRepository {
|
|||
d.summary AS summary,
|
||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||
d.mime_type AS mime_type,
|
||||
CAST(dtr.representation_type AS text) AS representation_type,
|
||||
dtr.is_primary AS is_primary,
|
||||
dtr.chunk_index AS chunk_index,
|
||||
dtr.chunk_start_offset AS chunk_start_offset,
|
||||
|
|
@ -41,7 +41,7 @@ public class DocumentSemanticSearchRepository {
|
|||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
||||
(1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) AS score
|
||||
(1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) AS score
|
||||
FROM doc.doc_embedding de
|
||||
JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
|
||||
JOIN doc.doc_document d ON d.id = de.document_id
|
||||
|
|
@ -49,7 +49,7 @@ public class DocumentSemanticSearchRepository {
|
|||
WHERE de.embedding_status = 'COMPLETED'
|
||||
AND de.embedding_vector IS NOT NULL
|
||||
AND de.model_id = :modelId
|
||||
AND (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) >= :threshold
|
||||
AND (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) >= :threshold
|
||||
""");
|
||||
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
|
|
|
|||
|
|
@ -1,9 +1,11 @@
|
|||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import at.procon.ted.service.VectorizationService;
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
||||
import at.procon.dip.embedding.service.EmbeddingModelCatalogService;
|
||||
import at.procon.dip.embedding.service.QueryEmbeddingService;
|
||||
import at.procon.dip.embedding.support.EmbeddingVectorCodec;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
|
@ -15,25 +17,47 @@ import org.springframework.stereotype.Service;
|
|||
@Slf4j
|
||||
public class SemanticQueryEmbeddingService {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final DocumentEmbeddingService documentEmbeddingService;
|
||||
private final VectorizationService vectorizationService;
|
||||
private final EmbeddingProperties embeddingProperties;
|
||||
private final EmbeddingModelRegistry embeddingModelRegistry;
|
||||
private final EmbeddingModelCatalogService embeddingModelCatalogService;
|
||||
private final QueryEmbeddingService queryEmbeddingService;
|
||||
|
||||
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText) {
|
||||
if (!properties.getVectorization().isEnabled()) {
|
||||
return buildQueryEmbedding(queryText, null);
|
||||
}
|
||||
|
||||
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText, String requestedModelKey) {
|
||||
if (!embeddingProperties.isEnabled()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
if (queryText == null || queryText.isBlank()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
String modelKey = resolveModelKey(requestedModelKey);
|
||||
try {
|
||||
DocumentEmbeddingModel model = documentEmbeddingService.findActiveModelByKey(
|
||||
properties.getVectorization().getModelName());
|
||||
float[] vector = vectorizationService.generateQueryEmbedding(queryText);
|
||||
return Optional.of(new QueryEmbedding(model.getId(), vectorizationService.floatArrayToVectorString(vector)));
|
||||
DocumentEmbeddingModel model = embeddingModelCatalogService.ensureRegistered(modelKey);
|
||||
float[] vector = queryEmbeddingService.embedQuery(queryText, modelKey);
|
||||
return Optional.of(new QueryEmbedding(
|
||||
model.getId(),
|
||||
modelKey,
|
||||
EmbeddingVectorCodec.toPgVector(vector),
|
||||
vector.length
|
||||
));
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to generate semantic query embedding: {}", e.getMessage());
|
||||
log.warn("Failed to generate semantic query embedding with model {}: {}", modelKey, e.getMessage());
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public record QueryEmbedding(UUID modelId, String vectorString) {
|
||||
/**
 * Picks the embedding model key to use for a query.
 *
 * A non-blank {@code requestedModelKey} takes precedence and is returned unchanged;
 * otherwise the registry's default query model key is returned.
 *
 * @param requestedModelKey model key supplied by the caller; may be {@code null} or blank
 * @return the requested key when one was given, else the default query model key
 */
private String resolveModelKey(String requestedModelKey) {
|
||||
if (requestedModelKey != null && !requestedModelKey.isBlank()) {
|
||||
// Result is discarded: the lookup is made only to validate the key
// (presumably it throws for an unknown key — confirm in EmbeddingModelRegistry).
embeddingModelRegistry.getRequired(requestedModelKey);
|
||||
return requestedModelKey;
|
||||
}
|
||||
// No explicit key requested: fall back to the registry's default query model.
return embeddingModelRegistry.getRequiredDefaultQueryModelKey();
|
||||
}
|
||||
|
||||
/**
 * Result of building a query embedding.
 *
 * @param modelId id of the embedding model entity returned by the model catalog
 * @param modelKey model key that was resolved for this query
 * @param vectorString query vector as produced by {@code EmbeddingVectorCodec.toPgVector}
 * @param dimensions length of the generated query vector
 */
public record QueryEmbedding(UUID modelId, String modelKey, String vectorString, int dimensions) {
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,81 @@
|
|||
package at.procon.dip.search.integration;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchMode;
|
||||
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
|
||||
import at.procon.dip.search.dto.SearchRequest;
|
||||
import at.procon.dip.search.dto.SearchResponse;
|
||||
import at.procon.dip.search.service.SearchOrchestrator;
|
||||
import at.procon.dip.search.spi.SearchDocumentScope;
|
||||
import at.procon.dip.testsupport.AbstractSearchIntegrationTest;
|
||||
import at.procon.dip.testsupport.SearchTestDataFactory;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.test.context.TestPropertySource;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
 * Integration test for semantic search through {@code SearchOrchestrator}.
 *
 * The test properties enable the embedding subsystem with a 16-dimension mock provider and
 * register {@code mock-search} as both the default document model and the default query model.
 */
@TestPropertySource(properties = {
|
||||
"dip.embedding.enabled=true",
|
||||
"dip.embedding.default-document-model=mock-search",
|
||||
"dip.embedding.default-query-model=mock-search",
|
||||
"dip.embedding.providers.mock-default.type=mock",
|
||||
"dip.embedding.providers.mock-default.dimensions=16",
|
||||
"dip.embedding.models.mock-search.provider-config-key=mock-default",
|
||||
"dip.embedding.models.mock-search.provider-model-key=mock-search",
|
||||
"dip.embedding.models.mock-search.dimensions=16",
|
||||
"dip.embedding.models.mock-search.active=true",
|
||||
"dip.embedding.jobs.enabled=true",
|
||||
"ted.search.similarity-threshold=0.10",
|
||||
"ted.search.semantic-candidate-limit=50"
|
||||
})
|
||||
class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest {
|
||||
|
||||
@Autowired
|
||||
private SearchTestDataFactory dataFactory;
|
||||
|
||||
@Autowired
|
||||
private RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
||||
|
||||
@Autowired
|
||||
private SearchOrchestrator searchOrchestrator;
|
||||
|
||||
/**
 * A semantic-only search must surface the document through one of its chunk
 * representations and attribute the hit to the pgvector semantic engine.
 */
@Test
|
||||
void semanticMode_should_return_document_from_chunk_embeddings() {
|
||||
// Arrange: the primary text is only a generic overview; the full query phrase
// appears in the second chunk alone.
var created = dataFactory.createDocumentWithPrimaryAndChunks(
|
||||
"Energy optimization strategy",
|
||||
"Strategy overview",
|
||||
"This primary representation only contains a high level overview.",
|
||||
"en",
|
||||
List.of(
|
||||
"Chunk one is introductory and does not contain the target phrase.",
|
||||
"District heating optimization strategy for municipal energy systems is described here."
|
||||
));
|
||||
|
||||
// Queue the document for embedding with the mock model and run one batch;
// the batch must report a positive processed count.
embeddingOrchestrator.enqueueDocument(created.document().getId(), "mock-search");
|
||||
int processed = embeddingOrchestrator.processNextReadyBatch();
|
||||
assertThat(processed).isGreaterThan(0);
|
||||
|
||||
// Semantic-only request that explicitly selects the model used for the document embeddings.
SearchRequest request = SearchRequest.builder()
|
||||
.queryText("district heating optimization strategy")
|
||||
.modes(Set.of(SearchMode.SEMANTIC))
|
||||
.representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS)
|
||||
.semanticModelKey("mock-search")
|
||||
.build();
|
||||
|
||||
SearchResponse response = searchOrchestrator.search(
|
||||
request,
|
||||
new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null));
|
||||
|
||||
// The top hit must be the created document, produced by the pgvector engine
// from a CHUNK representation rather than the primary one.
assertThat(response.getHits()).isNotEmpty();
|
||||
assertThat(response.getHits().getFirst().getTitle()).isEqualTo("Energy optimization strategy");
|
||||
assertThat(response.getHits().getFirst().getPrimaryEngine()).isEqualTo(SearchEngineType.PGVECTOR_SEMANTIC);
|
||||
assertThat(response.getHits().getFirst().getRepresentationType()).isEqualTo(RepresentationType.CHUNK);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue