embedding nv3
This commit is contained in:
parent
87fdae9f21
commit
19a02cdcf7
|
|
@ -0,0 +1,43 @@
|
||||||
|
# NV3 — Generic semantic search on the new embedding subsystem
|
||||||
|
|
||||||
|
This phase keeps the new embedding subsystem running in parallel with the legacy flow and plugs semantic retrieval
|
||||||
|
into the generic search architecture.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
- query embeddings are generated through `at.procon.dip.embedding.service.QueryEmbeddingService`
|
||||||
|
- semantic search uses `DOC.doc_embedding`
|
||||||
|
- retrieval joins `DOC.doc_text_representation` and `DOC.doc_document`
|
||||||
|
- chunk-aware document collapse remains in the generic search fusion layer
|
||||||
|
- no structured TED/mail search in this phase
|
||||||
|
- no legacy cutover in this phase
|
||||||
|
|
||||||
|
## Main classes
|
||||||
|
|
||||||
|
- `at.procon.dip.search.service.SemanticQueryEmbeddingService`
|
||||||
|
- `at.procon.dip.search.engine.semantic.PgVectorSemanticSearchEngine`
|
||||||
|
- `at.procon.dip.search.repository.DocumentSemanticSearchRepository`
|
||||||
|
|
||||||
|
## Query model selection
|
||||||
|
|
||||||
|
Order of precedence:
|
||||||
|
|
||||||
|
1. `SearchRequest.semanticModelKey`
|
||||||
|
2. `dip.embedding.default-query-model`
|
||||||
|
|
||||||
|
The selected model is registered in `DOC.doc_embedding_model` (if it is not already present) through
|
||||||
|
`EmbeddingModelCatalogService` before the query runs.
|
||||||
|
|
||||||
|
## Search flow
|
||||||
|
|
||||||
|
1. the planner includes the `PGVECTOR_SEMANTIC` engine when the search mode is `SEMANTIC` or `HYBRID`
|
||||||
|
2. `SemanticQueryEmbeddingService` builds a query vector
|
||||||
|
3. `DocumentSemanticSearchRepository` searches `DOC.doc_embedding`
|
||||||
|
4. generic fusion/collapse merges semantic hits with lexical hits
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- the SQL uses the schema-qualified type `public.vector` explicitly in casts to avoid surprises caused by the session's `search_path`
|
||||||
|
- the repository returns representation metadata (`representation_type`, chunk offsets, etc.)
|
||||||
|
- `SearchRequest.semanticModelKey` is optional and keeps the API model-aware without forcing users
|
||||||
|
to choose a model for every request
|
||||||
|
|
@ -44,4 +44,10 @@ public class SearchRequest {
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
private SearchRepresentationSelectionMode representationSelectionMode =
|
private SearchRepresentationSelectionMode representationSelectionMode =
|
||||||
SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
|
SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optional semantic model key used only by the semantic search engine.
|
||||||
|
* When omitted, the new embedding subsystem default query model is used.
|
||||||
|
*/
|
||||||
|
private String semanticModelKey;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
package at.procon.dip.search.engine.semantic;
|
package at.procon.dip.search.engine.semantic;
|
||||||
|
|
||||||
|
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||||
import at.procon.dip.search.api.SearchExecutionContext;
|
import at.procon.dip.search.api.SearchExecutionContext;
|
||||||
import at.procon.dip.search.dto.SearchEngineType;
|
import at.procon.dip.search.dto.SearchEngineType;
|
||||||
import at.procon.dip.search.dto.SearchHit;
|
import at.procon.dip.search.dto.SearchHit;
|
||||||
|
|
@ -15,6 +16,7 @@ import org.springframework.stereotype.Component;
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class PgVectorSemanticSearchEngine implements SearchEngine {
|
public class PgVectorSemanticSearchEngine implements SearchEngine {
|
||||||
|
|
||||||
|
private final EmbeddingProperties embeddingProperties;
|
||||||
private final TedProcessorProperties properties;
|
private final TedProcessorProperties properties;
|
||||||
private final SemanticQueryEmbeddingService queryEmbeddingService;
|
private final SemanticQueryEmbeddingService queryEmbeddingService;
|
||||||
private final DocumentSemanticSearchRepository repository;
|
private final DocumentSemanticSearchRepository repository;
|
||||||
|
|
@ -26,14 +28,16 @@ public class PgVectorSemanticSearchEngine implements SearchEngine {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean supports(SearchExecutionContext context) {
|
public boolean supports(SearchExecutionContext context) {
|
||||||
return properties.getVectorization().isEnabled()
|
return embeddingProperties.isEnabled()
|
||||||
&& context.getRequest().getQueryText() != null
|
&& context.getRequest().getQueryText() != null
|
||||||
&& !context.getRequest().getQueryText().isBlank();
|
&& !context.getRequest().getQueryText().isBlank();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<SearchHit> execute(SearchExecutionContext context) {
|
public List<SearchHit> execute(SearchExecutionContext context) {
|
||||||
return queryEmbeddingService.buildQueryEmbedding(context.getRequest().getQueryText())
|
return queryEmbeddingService.buildQueryEmbedding(
|
||||||
|
context.getRequest().getQueryText(),
|
||||||
|
context.getRequest().getSemanticModelKey())
|
||||||
.map(query -> repository.search(
|
.map(query -> repository.search(
|
||||||
context,
|
context,
|
||||||
query.modelId(),
|
query.modelId(),
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,6 @@ import at.procon.dip.search.dto.SearchEngineType;
|
||||||
import at.procon.dip.search.dto.SearchMode;
|
import at.procon.dip.search.dto.SearchMode;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,7 @@ public class DocumentSemanticSearchRepository {
|
||||||
SELECT
|
SELECT
|
||||||
d.id AS document_id,
|
d.id AS document_id,
|
||||||
dtr.id AS representation_id,
|
dtr.id AS representation_id,
|
||||||
|
CAST(dtr.representation_type AS text) AS representation_type,
|
||||||
CAST(d.document_type AS text) AS document_type,
|
CAST(d.document_type AS text) AS document_type,
|
||||||
CAST(d.document_family AS text) AS document_family,
|
CAST(d.document_family AS text) AS document_family,
|
||||||
CAST(d.visibility AS text) AS visibility,
|
CAST(d.visibility AS text) AS visibility,
|
||||||
|
|
@ -33,7 +34,6 @@ public class DocumentSemanticSearchRepository {
|
||||||
d.summary AS summary,
|
d.summary AS summary,
|
||||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||||
d.mime_type AS mime_type,
|
d.mime_type AS mime_type,
|
||||||
CAST(dtr.representation_type AS text) AS representation_type,
|
|
||||||
dtr.is_primary AS is_primary,
|
dtr.is_primary AS is_primary,
|
||||||
dtr.chunk_index AS chunk_index,
|
dtr.chunk_index AS chunk_index,
|
||||||
dtr.chunk_start_offset AS chunk_start_offset,
|
dtr.chunk_start_offset AS chunk_start_offset,
|
||||||
|
|
@ -41,7 +41,7 @@ public class DocumentSemanticSearchRepository {
|
||||||
d.created_at AS created_at,
|
d.created_at AS created_at,
|
||||||
d.updated_at AS updated_at,
|
d.updated_at AS updated_at,
|
||||||
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
||||||
(1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) AS score
|
(1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) AS score
|
||||||
FROM doc.doc_embedding de
|
FROM doc.doc_embedding de
|
||||||
JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
|
JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
|
||||||
JOIN doc.doc_document d ON d.id = de.document_id
|
JOIN doc.doc_document d ON d.id = de.document_id
|
||||||
|
|
@ -49,7 +49,7 @@ public class DocumentSemanticSearchRepository {
|
||||||
WHERE de.embedding_status = 'COMPLETED'
|
WHERE de.embedding_status = 'COMPLETED'
|
||||||
AND de.embedding_vector IS NOT NULL
|
AND de.embedding_vector IS NOT NULL
|
||||||
AND de.model_id = :modelId
|
AND de.model_id = :modelId
|
||||||
AND (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) >= :threshold
|
AND (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) >= :threshold
|
||||||
""");
|
""");
|
||||||
|
|
||||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,11 @@
|
||||||
package at.procon.dip.search.service;
|
package at.procon.dip.search.service;
|
||||||
|
|
||||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||||
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||||
import at.procon.ted.config.TedProcessorProperties;
|
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
||||||
import at.procon.ted.service.VectorizationService;
|
import at.procon.dip.embedding.service.EmbeddingModelCatalogService;
|
||||||
|
import at.procon.dip.embedding.service.QueryEmbeddingService;
|
||||||
|
import at.procon.dip.embedding.support.EmbeddingVectorCodec;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
@ -15,25 +17,47 @@ import org.springframework.stereotype.Service;
|
||||||
@Slf4j
|
@Slf4j
|
||||||
public class SemanticQueryEmbeddingService {
|
public class SemanticQueryEmbeddingService {
|
||||||
|
|
||||||
private final TedProcessorProperties properties;
|
private final EmbeddingProperties embeddingProperties;
|
||||||
private final DocumentEmbeddingService documentEmbeddingService;
|
private final EmbeddingModelRegistry embeddingModelRegistry;
|
||||||
private final VectorizationService vectorizationService;
|
private final EmbeddingModelCatalogService embeddingModelCatalogService;
|
||||||
|
private final QueryEmbeddingService queryEmbeddingService;
|
||||||
|
|
||||||
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText) {
|
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText) {
|
||||||
if (!properties.getVectorization().isEnabled()) {
|
return buildQueryEmbedding(queryText, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText, String requestedModelKey) {
|
||||||
|
if (!embeddingProperties.isEnabled()) {
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
if (queryText == null || queryText.isBlank()) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
String modelKey = resolveModelKey(requestedModelKey);
|
||||||
try {
|
try {
|
||||||
DocumentEmbeddingModel model = documentEmbeddingService.findActiveModelByKey(
|
DocumentEmbeddingModel model = embeddingModelCatalogService.ensureRegistered(modelKey);
|
||||||
properties.getVectorization().getModelName());
|
float[] vector = queryEmbeddingService.embedQuery(queryText, modelKey);
|
||||||
float[] vector = vectorizationService.generateQueryEmbedding(queryText);
|
return Optional.of(new QueryEmbedding(
|
||||||
return Optional.of(new QueryEmbedding(model.getId(), vectorizationService.floatArrayToVectorString(vector)));
|
model.getId(),
|
||||||
|
modelKey,
|
||||||
|
EmbeddingVectorCodec.toPgVector(vector),
|
||||||
|
vector.length
|
||||||
|
));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.warn("Failed to generate semantic query embedding: {}", e.getMessage());
|
log.warn("Failed to generate semantic query embedding with model {}: {}", modelKey, e.getMessage());
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public record QueryEmbedding(UUID modelId, String vectorString) {
|
private String resolveModelKey(String requestedModelKey) {
|
||||||
|
if (requestedModelKey != null && !requestedModelKey.isBlank()) {
|
||||||
|
embeddingModelRegistry.getRequired(requestedModelKey);
|
||||||
|
return requestedModelKey;
|
||||||
|
}
|
||||||
|
return embeddingModelRegistry.getRequiredDefaultQueryModelKey();
|
||||||
|
}
|
||||||
|
|
||||||
|
public record QueryEmbedding(UUID modelId, String modelKey, String vectorString, int dimensions) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,81 @@
|
||||||
|
package at.procon.dip.search.integration;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||||
|
import at.procon.dip.search.dto.SearchEngineType;
|
||||||
|
import at.procon.dip.search.dto.SearchMode;
|
||||||
|
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
|
||||||
|
import at.procon.dip.search.dto.SearchRequest;
|
||||||
|
import at.procon.dip.search.dto.SearchResponse;
|
||||||
|
import at.procon.dip.search.service.SearchOrchestrator;
|
||||||
|
import at.procon.dip.search.spi.SearchDocumentScope;
|
||||||
|
import at.procon.dip.testsupport.AbstractSearchIntegrationTest;
|
||||||
|
import at.procon.dip.testsupport.SearchTestDataFactory;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.test.context.TestPropertySource;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
@TestPropertySource(properties = {
|
||||||
|
"dip.embedding.enabled=true",
|
||||||
|
"dip.embedding.default-document-model=mock-search",
|
||||||
|
"dip.embedding.default-query-model=mock-search",
|
||||||
|
"dip.embedding.providers.mock-default.type=mock",
|
||||||
|
"dip.embedding.providers.mock-default.dimensions=16",
|
||||||
|
"dip.embedding.models.mock-search.provider-config-key=mock-default",
|
||||||
|
"dip.embedding.models.mock-search.provider-model-key=mock-search",
|
||||||
|
"dip.embedding.models.mock-search.dimensions=16",
|
||||||
|
"dip.embedding.models.mock-search.active=true",
|
||||||
|
"dip.embedding.jobs.enabled=true",
|
||||||
|
"ted.search.similarity-threshold=0.10",
|
||||||
|
"ted.search.semantic-candidate-limit=50"
|
||||||
|
})
|
||||||
|
class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private SearchTestDataFactory dataFactory;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private SearchOrchestrator searchOrchestrator;
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void semanticMode_should_return_document_from_chunk_embeddings() {
|
||||||
|
var created = dataFactory.createDocumentWithPrimaryAndChunks(
|
||||||
|
"Energy optimization strategy",
|
||||||
|
"Strategy overview",
|
||||||
|
"This primary representation only contains a high level overview.",
|
||||||
|
"en",
|
||||||
|
List.of(
|
||||||
|
"Chunk one is introductory and does not contain the target phrase.",
|
||||||
|
"District heating optimization strategy for municipal energy systems is described here."
|
||||||
|
));
|
||||||
|
|
||||||
|
embeddingOrchestrator.enqueueDocument(created.document().getId(), "mock-search");
|
||||||
|
int processed = embeddingOrchestrator.processNextReadyBatch();
|
||||||
|
assertThat(processed).isGreaterThan(0);
|
||||||
|
|
||||||
|
SearchRequest request = SearchRequest.builder()
|
||||||
|
.queryText("district heating optimization strategy")
|
||||||
|
.modes(Set.of(SearchMode.SEMANTIC))
|
||||||
|
.representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS)
|
||||||
|
.semanticModelKey("mock-search")
|
||||||
|
.build();
|
||||||
|
|
||||||
|
SearchResponse response = searchOrchestrator.search(
|
||||||
|
request,
|
||||||
|
new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null));
|
||||||
|
|
||||||
|
assertThat(response.getHits()).isNotEmpty();
|
||||||
|
assertThat(response.getHits().getFirst().getTitle()).isEqualTo("Energy optimization strategy");
|
||||||
|
assertThat(response.getHits().getFirst().getPrimaryEngine()).isEqualTo(SearchEngineType.PGVECTOR_SEMANTIC);
|
||||||
|
assertThat(response.getHits().getFirst().getRepresentationType()).isEqualTo(RepresentationType.CHUNK);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue