embedding nv3.1
parent
19a02cdcf7
commit
ca502cb369
@ -0,0 +1,25 @@
|
||||
# NV3.1 hardening notes
|
||||
|
||||
This slice hardens the new parallel semantic search path introduced in NV3 by adding integration-test coverage and a dedicated, pgvector-backed test setup.
|
||||
|
||||
## Scope
|
||||
|
||||
- dedicated semantic search test application context
|
||||
- semantic endpoint integration test with MockMvc
|
||||
- semantic + hybrid orchestrator integration tests
|
||||
- semantic model selection test
|
||||
- pgvector-aware test database setup
|
||||
|
||||
## Test focus
|
||||
|
||||
1. semantic-only search returns hits when embeddings were created through the new subsystem
|
||||
2. hybrid search includes lexical and semantic engines together
|
||||
3. semantic model selection is honored (`semanticModelKey`)
|
||||
4. debug endpoint exposes semantic engine participation
|
||||
|
||||
## Notes
|
||||
|
||||
- tests use the new parallel embedding subsystem only
|
||||
- the legacy vectorization flow is not used
|
||||
- tests rely on the mock embedding provider for deterministic embeddings
|
||||
- the semantic test base uses a pgvector-enabled PostgreSQL image
|
||||
@ -0,0 +1,101 @@
|
||||
package at.procon.dip.search.integration;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.search.dto.SearchMode;
|
||||
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
|
||||
import at.procon.dip.search.dto.SearchRequest;
|
||||
import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
|
||||
import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
|
||||
import at.procon.dip.testsupport.config.SearchTestJacksonConfig;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import java.util.Set;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.test.context.TestPropertySource;
|
||||
import org.springframework.test.web.servlet.MockMvc;
|
||||
import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
|
||||
|
||||
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post;
|
||||
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath;
|
||||
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
|
||||
|
||||
@AutoConfigureMockMvc
|
||||
@Import(SearchTestJacksonConfig.class)
|
||||
@TestPropertySource(properties = {
|
||||
"spring.mvc.converters.preferred-json-mapper=jackson"
|
||||
})
|
||||
class GenericSemanticSearchEndpointIntegrationTest extends AbstractSemanticSearchIntegrationTest {
|
||||
|
||||
@Autowired
|
||||
private SemanticSearchTestDataFactory dataFactory;
|
||||
|
||||
@Autowired
|
||||
private MockMvc mockMvc;
|
||||
|
||||
@Autowired
|
||||
private ObjectMapper objectMapper;
|
||||
|
||||
@Test
|
||||
void searchEndpoint_should_return_hits_for_semantic_request() throws Exception {
|
||||
dataFactory.createAndEmbedPrimaryRepresentation(
|
||||
"District heating modernization strategy",
|
||||
"Municipal energy planning",
|
||||
"District heating optimization strategy for municipal energy systems.",
|
||||
DocumentType.TEXT,
|
||||
DocumentFamily.GENERIC,
|
||||
"en",
|
||||
RepresentationType.SEMANTIC_TEXT,
|
||||
"mock-search"
|
||||
);
|
||||
|
||||
SearchRequest request = SearchRequest.builder()
|
||||
.queryText("district heating optimization")
|
||||
.modes(Set.of(SearchMode.SEMANTIC))
|
||||
.representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
|
||||
.semanticModelKey("mock-search")
|
||||
.build();
|
||||
|
||||
mockMvc.perform(post("/search")
|
||||
.contentType(MediaType.APPLICATION_JSON)
|
||||
.accept(MediaType.APPLICATION_JSON)
|
||||
.characterEncoding("UTF-8")
|
||||
.content(objectMapper.writeValueAsString(request)))
|
||||
.andExpect(status().isOk())
|
||||
.andExpect(jsonPath("$.hits[0].title").value("District heating modernization strategy"))
|
||||
.andExpect(jsonPath("$.enginesUsed").isArray());
|
||||
}
|
||||
|
||||
@Test
|
||||
void debugEndpoint_should_show_semantic_engine_in_plan() throws Exception {
|
||||
dataFactory.createAndEmbedPrimaryRepresentation(
|
||||
"Heat network planning",
|
||||
"Municipal energy planning",
|
||||
"District heating optimization strategy for municipal energy systems.",
|
||||
DocumentType.TEXT,
|
||||
DocumentFamily.GENERIC,
|
||||
"en",
|
||||
RepresentationType.SEMANTIC_TEXT,
|
||||
"mock-search"
|
||||
);
|
||||
|
||||
SearchRequest request = SearchRequest.builder()
|
||||
.queryText("district heating optimization")
|
||||
.modes(Set.of(SearchMode.HYBRID))
|
||||
.representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
|
||||
.semanticModelKey("mock-search")
|
||||
.build();
|
||||
|
||||
mockMvc.perform(post("/search/debug")
|
||||
.contentType(MediaType.APPLICATION_JSON)
|
||||
.accept(MediaType.APPLICATION_JSON)
|
||||
.characterEncoding("UTF-8")
|
||||
.content(objectMapper.writeValueAsString(request)))
|
||||
.andExpect(status().isOk())
|
||||
.andExpect(jsonPath("$.plan.engines").isArray())
|
||||
.andExpect(jsonPath("$.fusedResponse.hits[0].title").value("Heat network planning"));
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,68 @@
|
||||
package at.procon.dip.search.integration;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.search.dto.SearchMode;
|
||||
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
|
||||
import at.procon.dip.search.dto.SearchRequest;
|
||||
import at.procon.dip.search.dto.SearchResponse;
|
||||
import at.procon.dip.search.service.SearchOrchestrator;
|
||||
import at.procon.dip.search.spi.SearchDocumentScope;
|
||||
import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
|
||||
import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
|
||||
import java.util.Set;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
class SemanticModelSelectionIntegrationTest extends AbstractSemanticSearchIntegrationTest {
|
||||
|
||||
@Autowired
|
||||
private SemanticSearchTestDataFactory dataFactory;
|
||||
|
||||
@Autowired
|
||||
private SearchOrchestrator searchOrchestrator;
|
||||
|
||||
@Test
|
||||
void semanticModelKey_should_control_which_embeddings_are_used() {
|
||||
dataFactory.createAndEmbedPrimaryRepresentation(
|
||||
"Heat network planning",
|
||||
"Alt-model semantic document",
|
||||
"District heating optimization strategy for municipal networks.",
|
||||
DocumentType.TEXT,
|
||||
DocumentFamily.GENERIC,
|
||||
"en",
|
||||
RepresentationType.SEMANTIC_TEXT,
|
||||
"mock-search-alt"
|
||||
);
|
||||
|
||||
SearchRequest defaultModelRequest = SearchRequest.builder()
|
||||
.queryText("district heating optimization")
|
||||
.modes(Set.of(SearchMode.SEMANTIC))
|
||||
.representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
|
||||
.build();
|
||||
|
||||
SearchRequest alternateModelRequest = SearchRequest.builder()
|
||||
.queryText("district heating optimization")
|
||||
.modes(Set.of(SearchMode.SEMANTIC))
|
||||
.representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
|
||||
.semanticModelKey("mock-search-alt")
|
||||
.build();
|
||||
|
||||
SearchResponse defaultModelResponse = searchOrchestrator.search(
|
||||
defaultModelRequest,
|
||||
new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)
|
||||
);
|
||||
|
||||
SearchResponse alternateModelResponse = searchOrchestrator.search(
|
||||
alternateModelRequest,
|
||||
new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)
|
||||
);
|
||||
|
||||
assertThat(defaultModelResponse.getHits()).isEmpty();
|
||||
assertThat(alternateModelResponse.getHits()).hasSize(1);
|
||||
assertThat(alternateModelResponse.getHits().getFirst().getTitle()).isEqualTo("Heat network planning");
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,130 @@
|
||||
package at.procon.dip.testsupport;
|
||||
|
||||
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
|
||||
import java.time.OffsetDateTime;
|
||||
import javax.sql.DataSource;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.TestInstance;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.jdbc.core.JdbcTemplate;
|
||||
import org.springframework.test.context.DynamicPropertyRegistry;
|
||||
import org.springframework.test.context.DynamicPropertySource;
|
||||
import org.springframework.test.context.TestPropertySource;
|
||||
import org.testcontainers.containers.PostgreSQLContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
@SpringBootTest(classes = SearchSemanticTestApplication.class, webEnvironment = SpringBootTest.WebEnvironment.MOCK)
|
||||
@Testcontainers
|
||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||
@TestPropertySource(properties = {
|
||||
"spring.jpa.hibernate.ddl-auto=create-drop",
|
||||
"spring.jpa.show-sql=false",
|
||||
"spring.jpa.open-in-view=false",
|
||||
"spring.jpa.properties.hibernate.default_schema=DOC",
|
||||
"spring.main.lazy-initialization=true",
|
||||
"server.servlet.context-path=/api",
|
||||
|
||||
"ted.search.default-page-size=20",
|
||||
"ted.search.max-page-size=100",
|
||||
"ted.search.fulltext-weight=0.35",
|
||||
"ted.search.trigram-weight=0.20",
|
||||
"ted.search.semantic-weight=0.45",
|
||||
"ted.search.recency-boost-weight=0.05",
|
||||
"ted.search.trigram-threshold=0.10",
|
||||
"ted.search.semantic-candidate-limit=50",
|
||||
"ted.search.similarity-threshold=0.01",
|
||||
|
||||
"dip.embedding.enabled=true",
|
||||
"dip.embedding.default-document-model=mock-search",
|
||||
"dip.embedding.default-query-model=mock-search",
|
||||
"dip.embedding.jobs.enabled=true",
|
||||
"dip.embedding.jobs.batch-size=8",
|
||||
"dip.embedding.providers.mock-default.type=mock",
|
||||
"dip.embedding.providers.mock-default.dimensions=16",
|
||||
"dip.embedding.providers.mock-alt.type=mock",
|
||||
"dip.embedding.providers.mock-alt.dimensions=16",
|
||||
"dip.embedding.models.mock-search.provider-config-key=mock-default",
|
||||
"dip.embedding.models.mock-search.provider-model-key=mock-search",
|
||||
"dip.embedding.models.mock-search.dimensions=16",
|
||||
"dip.embedding.models.mock-search.supports-query-embedding-mode=true",
|
||||
"dip.embedding.models.mock-search.active=true",
|
||||
"dip.embedding.models.mock-search-alt.provider-config-key=mock-alt",
|
||||
"dip.embedding.models.mock-search-alt.provider-model-key=mock-search-alt",
|
||||
"dip.embedding.models.mock-search-alt.dimensions=16",
|
||||
"dip.embedding.models.mock-search-alt.supports-query-embedding-mode=true",
|
||||
"dip.embedding.models.mock-search-alt.active=true"
|
||||
})
|
||||
public abstract class AbstractSemanticSearchIntegrationTest {
|
||||
|
||||
@Container
|
||||
static PostgreSQLContainer<?> postgres = new PostgreSQLContainer<>("pgvector/pgvector:pg16")
|
||||
.withDatabaseName("dip_semantic_search_test")
|
||||
.withUsername("test")
|
||||
.withPassword("test")
|
||||
.withInitScript("sql/create-doc-search-test-schemas.sql");
|
||||
|
||||
static {
|
||||
postgres.start();
|
||||
}
|
||||
|
||||
@DynamicPropertySource
|
||||
static void registerProperties(DynamicPropertyRegistry registry) {
|
||||
registry.add("spring.datasource.url", postgres::getJdbcUrl);
|
||||
registry.add("spring.datasource.username", postgres::getUsername);
|
||||
registry.add("spring.datasource.password", postgres::getPassword);
|
||||
registry.add("spring.datasource.driver-class-name", postgres::getDriverClassName);
|
||||
}
|
||||
|
||||
@Autowired
|
||||
protected JdbcTemplate jdbcTemplate;
|
||||
|
||||
@Autowired
|
||||
protected DataSource dataSource;
|
||||
|
||||
@Autowired
|
||||
protected DocumentRepository documentRepository;
|
||||
|
||||
@Autowired
|
||||
protected DocumentTextRepresentationRepository representationRepository;
|
||||
|
||||
@Autowired
|
||||
protected DocumentEmbeddingRepository embeddingRepository;
|
||||
|
||||
@Autowired
|
||||
protected DocumentEmbeddingModelRepository embeddingModelRepository;
|
||||
|
||||
@Autowired
|
||||
protected DocumentTenantRepository tenantRepository;
|
||||
|
||||
@BeforeEach
|
||||
void resetSemanticSearchTestDatabase() {
|
||||
ensureSearchColumnsAndIndexes();
|
||||
cleanupDatabase();
|
||||
}
|
||||
|
||||
protected void ensureSearchColumnsAndIndexes() {
|
||||
jdbcTemplate.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm WITH SCHEMA doc");
|
||||
jdbcTemplate.execute("CREATE EXTENSION IF NOT EXISTS vector WITH SCHEMA public");
|
||||
jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_config VARCHAR(64)");
|
||||
jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_vector tsvector");
|
||||
jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector_test ON doc.doc_text_representation USING GIN (search_vector)");
|
||||
jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm_test ON doc.doc_document USING GIN (title doc.gin_trgm_ops)");
|
||||
jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm_test ON doc.doc_document USING GIN (summary doc.gin_trgm_ops)");
|
||||
jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm_test ON doc.doc_text_representation USING GIN (text_body doc.gin_trgm_ops)");
|
||||
jdbcTemplate.execute("ALTER TABLE doc.doc_embedding ADD COLUMN IF NOT EXISTS embedding_vector public.vector");
|
||||
}
|
||||
|
||||
protected void cleanupDatabase() {
|
||||
jdbcTemplate.execute("TRUNCATE TABLE doc.doc_embedding_job, doc.doc_embedding, doc.doc_embedding_model, doc.doc_text_representation, doc.doc_document, doc.doc_tenant RESTART IDENTITY CASCADE");
|
||||
}
|
||||
|
||||
protected void setDocumentCreatedAt(java.util.UUID documentId, OffsetDateTime createdAt) {
|
||||
jdbcTemplate.update("UPDATE doc.doc_document SET created_at = ?, updated_at = ? WHERE id = ?", createdAt, createdAt, documentId);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,39 @@
|
||||
package at.procon.dip.testsupport;
|
||||
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import org.springframework.boot.SpringBootConfiguration;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.domain.EntityScan;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||
|
||||
/**
|
||||
* Narrow semantic-search test application that loads the new generic search subsystem
|
||||
* plus the new parallel embedding subsystem.
|
||||
*/
|
||||
@SpringBootConfiguration
|
||||
@EnableAutoConfiguration(excludeName = {
|
||||
"org.apache.camel.spring.boot.CamelAutoConfiguration",
|
||||
"org.springframework.boot.autoconfigure.task.TaskSchedulingAutoConfiguration"
|
||||
})
|
||||
@EnableConfigurationProperties({TedProcessorProperties.class, EmbeddingProperties.class})
|
||||
@EntityScan(basePackages = {
|
||||
"at.procon.dip.domain.document.entity",
|
||||
"at.procon.dip.domain.tenant.entity",
|
||||
"at.procon.dip.embedding.job.entity"
|
||||
})
|
||||
@EnableJpaRepositories(basePackages = {
|
||||
"at.procon.dip.domain.document.repository",
|
||||
"at.procon.dip.domain.tenant.repository",
|
||||
"at.procon.dip.embedding.job.repository"
|
||||
})
|
||||
@ComponentScan(basePackages = {
|
||||
"at.procon.dip.domain.document.service",
|
||||
"at.procon.dip.embedding",
|
||||
"at.procon.dip.search",
|
||||
"at.procon.dip.testsupport"
|
||||
})
|
||||
public class SearchSemanticTestApplication {
|
||||
}
|
||||
@ -0,0 +1,68 @@
|
||||
package at.procon.dip.testsupport;
|
||||
|
||||
import at.procon.dip.testsupport.SearchTestDataFactory.CreatedDocument;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||
import java.util.List;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@Transactional
|
||||
public class SemanticSearchTestDataFactory {
|
||||
|
||||
private final SearchTestDataFactory lexicalFactory;
|
||||
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
||||
|
||||
public CreatedDocument createAndEmbedPrimaryRepresentation(
|
||||
String title,
|
||||
String summary,
|
||||
String body,
|
||||
DocumentType documentType,
|
||||
DocumentFamily documentFamily,
|
||||
String languageCode,
|
||||
RepresentationType primaryType,
|
||||
String modelKey) {
|
||||
|
||||
CreatedDocument created = lexicalFactory.createDocumentWithPrimaryRepresentation(
|
||||
title,
|
||||
summary,
|
||||
body,
|
||||
documentType,
|
||||
documentFamily,
|
||||
languageCode,
|
||||
primaryType
|
||||
);
|
||||
embedDocument(created.document().getId(), modelKey);
|
||||
return created;
|
||||
}
|
||||
|
||||
public CreatedDocument createAndEmbedPrimaryAndChunks(
|
||||
String title,
|
||||
String summary,
|
||||
String primaryBody,
|
||||
String languageCode,
|
||||
List<String> chunkBodies,
|
||||
String modelKey) {
|
||||
CreatedDocument created = lexicalFactory.createDocumentWithPrimaryAndChunks(
|
||||
title,
|
||||
summary,
|
||||
primaryBody,
|
||||
languageCode,
|
||||
chunkBodies
|
||||
);
|
||||
embedDocument(created.document().getId(), modelKey);
|
||||
return created;
|
||||
}
|
||||
|
||||
private void embedDocument(java.util.UUID documentId, String modelKey) {
|
||||
embeddingOrchestrator.enqueueDocument(documentId, modelKey);
|
||||
while (embeddingOrchestrator.processNextReadyBatch() > 0) {
|
||||
// drain the ready queue synchronously for deterministic tests
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,18 @@
|
||||
package at.procon.dip.testsupport.config;
|
||||
|
||||
import com.fasterxml.jackson.databind.SerializationFeature;
|
||||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
|
||||
import org.springframework.boot.autoconfigure.jackson.Jackson2ObjectMapperBuilderCustomizer;
|
||||
import org.springframework.boot.test.context.TestConfiguration;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
|
||||
@TestConfiguration
|
||||
public class SearchTestJacksonConfig {
|
||||
|
||||
@Bean
|
||||
Jackson2ObjectMapperBuilderCustomizer searchTestJacksonCustomizer() {
|
||||
return builder -> builder
|
||||
.modules(new JavaTimeModule())
|
||||
.featuresToDisable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue