embedding nv3.1

master
trifonovt 4 weeks ago
parent 19a02cdcf7
commit ca502cb369

@ -0,0 +1,25 @@
# NV3.1 hardening notes
This slice hardens the new parallel semantic search path introduced in NV3.
## Scope
- dedicated semantic search test application context
- semantic endpoint integration test with MockMvc
- semantic + hybrid orchestrator integration tests
- semantic model selection test
- pgvector-aware test database setup
## Test focus
1. semantic-only search returns hits when embeddings were created through the new subsystem
2. hybrid search includes lexical and semantic engines together
3. semantic model selection is honored (`semanticModelKey`)
4. debug endpoint exposes semantic engine participation
## Notes
- tests use the new parallel embedding subsystem only
- the legacy vectorization flow is not used
- tests rely on the mock embedding provider for deterministic embeddings
- the semantic test base uses a pgvector-enabled PostgreSQL image

@ -0,0 +1,101 @@
package at.procon.dip.search.integration;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.search.dto.SearchMode;
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
import at.procon.dip.testsupport.config.SearchTestJacksonConfig;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Set;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Import;
import org.springframework.http.MediaType;
import org.springframework.test.context.TestPropertySource;
import org.springframework.test.web.servlet.MockMvc;
import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
/**
 * MockMvc integration tests for the search endpoints when the semantic engine is requested.
 *
 * <p>Each test creates one document, embeds its primary representation with the
 * {@code mock-search} model key through {@link SemanticSearchTestDataFactory}, and then posts a
 * {@link SearchRequest} as JSON to {@code /search} or {@code /search/debug}.
 */
@AutoConfigureMockMvc
@Import(SearchTestJacksonConfig.class)
@TestPropertySource(properties = {
        "spring.mvc.converters.preferred-json-mapper=jackson"
})
class GenericSemanticSearchEndpointIntegrationTest extends AbstractSemanticSearchIntegrationTest {

    @Autowired
    private SemanticSearchTestDataFactory dataFactory;

    @Autowired
    private MockMvc mockMvc;

    @Autowired
    private ObjectMapper objectMapper;

    /**
     * A semantic-only request must return the embedded document as first hit and must report
     * at least one engine in {@code enginesUsed}.
     */
    @Test
    void searchEndpoint_should_return_hits_for_semantic_request() throws Exception {
        dataFactory.createAndEmbedPrimaryRepresentation(
                "District heating modernization strategy",
                "Municipal energy planning",
                "District heating optimization strategy for municipal energy systems.",
                DocumentType.TEXT,
                DocumentFamily.GENERIC,
                "en",
                RepresentationType.SEMANTIC_TEXT,
                "mock-search"
        );

        SearchRequest request = SearchRequest.builder()
                .queryText("district heating optimization")
                .modes(Set.of(SearchMode.SEMANTIC))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                .semanticModelKey("mock-search")
                .build();

        mockMvc.perform(post("/search")
                        .contentType(MediaType.APPLICATION_JSON)
                        .accept(MediaType.APPLICATION_JSON)
                        .characterEncoding("UTF-8")
                        .content(objectMapper.writeValueAsString(request)))
                .andExpect(status().isOk())
                .andExpect(jsonPath("$.hits[0].title").value("District heating modernization strategy"))
                .andExpect(jsonPath("$.enginesUsed").isArray())
                // isArray() alone also passes for []; a response with hits must name an engine.
                .andExpect(jsonPath("$.enginesUsed").isNotEmpty());
    }

    /**
     * A hybrid request against the debug endpoint must expose a non-empty engine plan and the
     * embedded document as first fused hit.
     */
    @Test
    void debugEndpoint_should_show_semantic_engine_in_plan() throws Exception {
        dataFactory.createAndEmbedPrimaryRepresentation(
                "Heat network planning",
                "Municipal energy planning",
                "District heating optimization strategy for municipal energy systems.",
                DocumentType.TEXT,
                DocumentFamily.GENERIC,
                "en",
                RepresentationType.SEMANTIC_TEXT,
                "mock-search"
        );

        SearchRequest request = SearchRequest.builder()
                .queryText("district heating optimization")
                .modes(Set.of(SearchMode.HYBRID))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                .semanticModelKey("mock-search")
                .build();

        mockMvc.perform(post("/search/debug")
                        .contentType(MediaType.APPLICATION_JSON)
                        .accept(MediaType.APPLICATION_JSON)
                        .characterEncoding("UTF-8")
                        .content(objectMapper.writeValueAsString(request)))
                .andExpect(status().isOk())
                .andExpect(jsonPath("$.plan.engines").isArray())
                // An empty plan would satisfy isArray(); require at least one planned engine.
                // NOTE(review): the element shape of $.plan.engines is not visible here, so the
                // semantic engine itself is not matched by name - tighten once the shape is confirmed.
                .andExpect(jsonPath("$.plan.engines").isNotEmpty())
                .andExpect(jsonPath("$.fusedResponse.hits[0].title").value("Heat network planning"));
    }
}

@ -3,7 +3,6 @@ package at.procon.dip.search.integration;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchMode;
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
@ -11,71 +10,80 @@ import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.service.SearchOrchestrator;
import at.procon.dip.search.spi.SearchDocumentScope;
import at.procon.dip.testsupport.AbstractSearchIntegrationTest;
import at.procon.dip.testsupport.SearchTestDataFactory;
import java.util.List;
import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
import java.util.Set;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.test.context.TestPropertySource;
import static org.assertj.core.api.Assertions.assertThat;
@TestPropertySource(properties = {
"dip.embedding.enabled=true",
"dip.embedding.default-document-model=mock-search",
"dip.embedding.default-query-model=mock-search",
"dip.embedding.providers.mock-default.type=mock",
"dip.embedding.providers.mock-default.dimensions=16",
"dip.embedding.models.mock-search.provider-config-key=mock-default",
"dip.embedding.models.mock-search.provider-model-key=mock-search",
"dip.embedding.models.mock-search.dimensions=16",
"dip.embedding.models.mock-search.active=true",
"dip.embedding.jobs.enabled=true",
"ted.search.similarity-threshold=0.10",
"ted.search.semantic-candidate-limit=50"
})
class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest {
class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSemanticSearchIntegrationTest {
@Autowired
private SearchTestDataFactory dataFactory;
@Autowired
private RepresentationEmbeddingOrchestrator embeddingOrchestrator;
private SemanticSearchTestDataFactory dataFactory;
@Autowired
private SearchOrchestrator searchOrchestrator;
@Test
void semanticMode_should_return_document_from_chunk_embeddings() {
var created = dataFactory.createDocumentWithPrimaryAndChunks(
"Energy optimization strategy",
"Strategy overview",
"This primary representation only contains a high level overview.",
void semanticMode_should_return_embedded_document() {
dataFactory.createAndEmbedPrimaryRepresentation(
"District heating modernization strategy",
"Municipal energy planning",
"District heating optimization strategy for municipal energy systems.",
DocumentType.TEXT,
DocumentFamily.GENERIC,
"en",
List.of(
"Chunk one is introductory and does not contain the target phrase.",
"District heating optimization strategy for municipal energy systems is described here."
));
embeddingOrchestrator.enqueueDocument(created.document().getId(), "mock-search");
int processed = embeddingOrchestrator.processNextReadyBatch();
assertThat(processed).isGreaterThan(0);
RepresentationType.SEMANTIC_TEXT,
"mock-search"
);
SearchRequest request = SearchRequest.builder()
.queryText("district heating optimization strategy")
.queryText("district heating optimization")
.modes(Set.of(SearchMode.SEMANTIC))
.representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS)
.representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
.semanticModelKey("mock-search")
.build();
SearchResponse response = searchOrchestrator.search(
request,
new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null));
new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)
);
assertThat(response.getHits()).isNotEmpty();
assertThat(response.getHits().getFirst().getTitle()).isEqualTo("Energy optimization strategy");
assertThat(response.getHits().getFirst().getTitle()).isEqualTo("District heating modernization strategy");
assertThat(response.getEnginesUsed()).contains(SearchEngineType.PGVECTOR_SEMANTIC);
assertThat(response.getHits().getFirst().getPrimaryEngine()).isEqualTo(SearchEngineType.PGVECTOR_SEMANTIC);
assertThat(response.getHits().getFirst().getRepresentationType()).isEqualTo(RepresentationType.CHUNK);
}
/**
 * Runs a HYBRID search over one embedded TEXT/GENERIC document and checks that both the
 * PostgreSQL full-text engine and the pgvector semantic engine are reported as used, and that
 * the first hit carries a positive final score.
 */
@Test
void hybridMode_should_include_semantic_engine_alongside_lexical_engines() {
    // Arrange: one document whose primary representation is embedded with the mock model.
    dataFactory.createAndEmbedPrimaryRepresentation(
            "Energy transition framework",
            "Framework for district heating rollout",
            "District heating optimization framework for urban energy transition.",
            DocumentType.TEXT,
            DocumentFamily.GENERIC,
            "en",
            RepresentationType.SEMANTIC_TEXT,
            "mock-search"
    );

    SearchRequest hybridRequest = SearchRequest.builder()
            .queryText("district heating optimization")
            .modes(Set.of(SearchMode.HYBRID))
            .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
            .semanticModelKey("mock-search")
            .build();

    // Act: restrict the scope to TEXT documents of the GENERIC family.
    SearchDocumentScope textGenericScope = new SearchDocumentScope(
            Set.of(),
            Set.of(DocumentType.TEXT),
            Set.of(DocumentFamily.GENERIC),
            null,
            null
    );
    SearchResponse hybridResponse = searchOrchestrator.search(hybridRequest, textGenericScope);

    // Assert
    assertThat(hybridResponse.getHits()).isNotEmpty();
    assertThat(hybridResponse.getEnginesUsed())
            .contains(SearchEngineType.POSTGRES_FULLTEXT, SearchEngineType.PGVECTOR_SEMANTIC);
    assertThat(hybridResponse.getHits().getFirst().getFinalScore()).isGreaterThan(0.0d);
}
}

@ -0,0 +1,68 @@
package at.procon.dip.search.integration;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.search.dto.SearchMode;
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.service.SearchOrchestrator;
import at.procon.dip.search.spi.SearchDocumentScope;
import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
import java.util.Set;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Verifies that the {@code semanticModelKey} of a {@link SearchRequest} decides which stored
 * embeddings a semantic search runs against.
 */
class SemanticModelSelectionIntegrationTest extends AbstractSemanticSearchIntegrationTest {

    @Autowired
    private SemanticSearchTestDataFactory dataFactory;

    @Autowired
    private SearchOrchestrator searchOrchestrator;

    /**
     * A document embedded only with {@code mock-search-alt} must not be returned when no model
     * key is given, and must be the single hit when {@code mock-search-alt} is requested.
     */
    @Test
    void semanticModelKey_should_control_which_embeddings_are_used() {
        dataFactory.createAndEmbedPrimaryRepresentation(
                "Heat network planning",
                "Alt-model semantic document",
                "District heating optimization strategy for municipal networks.",
                DocumentType.TEXT,
                DocumentFamily.GENERIC,
                "en",
                RepresentationType.SEMANTIC_TEXT,
                "mock-search-alt"
        );

        // Same query twice; only the second request names the alternate model.
        SearchRequest withoutModelKey = SearchRequest.builder()
                .queryText("district heating optimization")
                .modes(Set.of(SearchMode.SEMANTIC))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                .build();
        SearchRequest withAltModelKey = SearchRequest.builder()
                .queryText("district heating optimization")
                .modes(Set.of(SearchMode.SEMANTIC))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                .semanticModelKey("mock-search-alt")
                .build();

        SearchResponse resultWithoutModelKey = searchOrchestrator.search(withoutModelKey, textGenericScope());
        SearchResponse resultWithAltModelKey = searchOrchestrator.search(withAltModelKey, textGenericScope());

        assertThat(resultWithoutModelKey.getHits()).isEmpty();
        assertThat(resultWithAltModelKey.getHits()).hasSize(1);
        assertThat(resultWithAltModelKey.getHits().getFirst().getTitle()).isEqualTo("Heat network planning");
    }

    /** Builds a fresh scope limited to TEXT documents of the GENERIC family. */
    private static SearchDocumentScope textGenericScope() {
        return new SearchDocumentScope(
                Set.of(),
                Set.of(DocumentType.TEXT),
                Set.of(DocumentFamily.GENERIC),
                null,
                null
        );
    }
}

@ -0,0 +1,130 @@
package at.procon.dip.testsupport;
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
import java.time.OffsetDateTime;
import javax.sql.DataSource;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.TestInstance;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.test.context.DynamicPropertyRegistry;
import org.springframework.test.context.DynamicPropertySource;
import org.springframework.test.context.TestPropertySource;
import org.testcontainers.containers.PostgreSQLContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
/**
 * Base class for semantic-search integration tests.
 *
 * <p>Boots {@link SearchSemanticTestApplication} in a mock web environment against a
 * PostgreSQL container started from the {@code pgvector/pgvector:pg16} image, configures two
 * mock embedding models ({@code mock-search} and {@code mock-search-alt}) and resets the
 * database before every test.
 *
 * <p>The container follows the singleton-container pattern: it is started exactly once in a
 * static initializer and is deliberately <em>not</em> managed by the Testcontainers JUnit
 * extension. The extension stops a static {@code @Container} after each test class and starts
 * it again (on a new port) for the next one, while a Spring application context cached from
 * the earlier class would still point at the old JDBC URL.
 */
@SpringBootTest(classes = SearchSemanticTestApplication.class, webEnvironment = SpringBootTest.WebEnvironment.MOCK)
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@TestPropertySource(properties = {
        "spring.jpa.hibernate.ddl-auto=create-drop",
        "spring.jpa.show-sql=false",
        "spring.jpa.open-in-view=false",
        "spring.jpa.properties.hibernate.default_schema=DOC",
        "spring.main.lazy-initialization=true",
        "server.servlet.context-path=/api",
        "ted.search.default-page-size=20",
        "ted.search.max-page-size=100",
        "ted.search.fulltext-weight=0.35",
        "ted.search.trigram-weight=0.20",
        "ted.search.semantic-weight=0.45",
        "ted.search.recency-boost-weight=0.05",
        "ted.search.trigram-threshold=0.10",
        "ted.search.semantic-candidate-limit=50",
        "ted.search.similarity-threshold=0.01",
        "dip.embedding.enabled=true",
        "dip.embedding.default-document-model=mock-search",
        "dip.embedding.default-query-model=mock-search",
        "dip.embedding.jobs.enabled=true",
        "dip.embedding.jobs.batch-size=8",
        "dip.embedding.providers.mock-default.type=mock",
        "dip.embedding.providers.mock-default.dimensions=16",
        "dip.embedding.providers.mock-alt.type=mock",
        "dip.embedding.providers.mock-alt.dimensions=16",
        "dip.embedding.models.mock-search.provider-config-key=mock-default",
        "dip.embedding.models.mock-search.provider-model-key=mock-search",
        "dip.embedding.models.mock-search.dimensions=16",
        "dip.embedding.models.mock-search.supports-query-embedding-mode=true",
        "dip.embedding.models.mock-search.active=true",
        "dip.embedding.models.mock-search-alt.provider-config-key=mock-alt",
        "dip.embedding.models.mock-search-alt.provider-model-key=mock-search-alt",
        "dip.embedding.models.mock-search-alt.dimensions=16",
        "dip.embedding.models.mock-search-alt.supports-query-embedding-mode=true",
        "dip.embedding.models.mock-search-alt.active=true"
})
public abstract class AbstractSemanticSearchIntegrationTest {

    /**
     * Shared PostgreSQL container for all subclasses. It is intentionally never stopped by the
     * tests themselves, so it stays available for every test class in the JVM.
     */
    static final PostgreSQLContainer<?> postgres = new PostgreSQLContainer<>("pgvector/pgvector:pg16")
            .withDatabaseName("dip_semantic_search_test")
            .withUsername("test")
            .withPassword("test")
            .withInitScript("sql/create-doc-search-test-schemas.sql");

    static {
        // Start once, before Spring resolves the dynamic datasource properties below.
        postgres.start();
    }

    /** Points the Spring datasource at the running container. */
    @DynamicPropertySource
    static void registerProperties(DynamicPropertyRegistry registry) {
        registry.add("spring.datasource.url", postgres::getJdbcUrl);
        registry.add("spring.datasource.username", postgres::getUsername);
        registry.add("spring.datasource.password", postgres::getPassword);
        registry.add("spring.datasource.driver-class-name", postgres::getDriverClassName);
    }

    @Autowired
    protected JdbcTemplate jdbcTemplate;

    @Autowired
    protected DataSource dataSource;

    @Autowired
    protected DocumentRepository documentRepository;

    @Autowired
    protected DocumentTextRepresentationRepository representationRepository;

    @Autowired
    protected DocumentEmbeddingRepository embeddingRepository;

    @Autowired
    protected DocumentEmbeddingModelRepository embeddingModelRepository;

    @Autowired
    protected DocumentTenantRepository tenantRepository;

    /** Ensures the search-specific schema objects exist, then empties all document tables. */
    @BeforeEach
    void resetSemanticSearchTestDatabase() {
        ensureSearchColumnsAndIndexes();
        cleanupDatabase();
    }

    /**
     * Creates the {@code pg_trgm} and {@code vector} extensions plus the search columns and
     * indexes used by the tests. Every statement uses {@code IF NOT EXISTS}, so the method can
     * safely run before each test.
     */
    protected void ensureSearchColumnsAndIndexes() {
        jdbcTemplate.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm WITH SCHEMA doc");
        jdbcTemplate.execute("CREATE EXTENSION IF NOT EXISTS vector WITH SCHEMA public");
        jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_config VARCHAR(64)");
        jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_vector tsvector");
        jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector_test ON doc.doc_text_representation USING GIN (search_vector)");
        jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm_test ON doc.doc_document USING GIN (title doc.gin_trgm_ops)");
        jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm_test ON doc.doc_document USING GIN (summary doc.gin_trgm_ops)");
        jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm_test ON doc.doc_text_representation USING GIN (text_body doc.gin_trgm_ops)");
        jdbcTemplate.execute("ALTER TABLE doc.doc_embedding ADD COLUMN IF NOT EXISTS embedding_vector public.vector");
    }

    /** Truncates the embedding, representation, document and tenant tables and resets their identities. */
    protected void cleanupDatabase() {
        jdbcTemplate.execute("TRUNCATE TABLE doc.doc_embedding_job, doc.doc_embedding, doc.doc_embedding_model, doc.doc_text_representation, doc.doc_document, doc.doc_tenant RESTART IDENTITY CASCADE");
    }

    /**
     * Overwrites both {@code created_at} and {@code updated_at} of a document row.
     *
     * @param documentId id of the document row to update
     * @param createdAt timestamp written to both columns
     */
    protected void setDocumentCreatedAt(java.util.UUID documentId, OffsetDateTime createdAt) {
        jdbcTemplate.update("UPDATE doc.doc_document SET created_at = ?, updated_at = ? WHERE id = ?", createdAt, createdAt, documentId);
    }
}

@ -0,0 +1,39 @@
package at.procon.dip.testsupport;
import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.ted.config.TedProcessorProperties;
import org.springframework.boot.SpringBootConfiguration;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
/**
 * Minimal Spring Boot configuration for the semantic-search integration tests.
 *
 * <p>Component scanning is limited to the document services, the embedding subsystem, the
 * search subsystem and the test-support package; entity and repository scanning is limited to
 * the document, tenant and embedding-job packages. Camel and task-scheduling
 * auto-configuration are excluded by name.
 */
@SpringBootConfiguration
@EnableAutoConfiguration(
        excludeName = {
                "org.apache.camel.spring.boot.CamelAutoConfiguration",
                "org.springframework.boot.autoconfigure.task.TaskSchedulingAutoConfiguration"
        }
)
@ComponentScan(
        basePackages = {
                "at.procon.dip.domain.document.service",
                "at.procon.dip.embedding",
                "at.procon.dip.search",
                "at.procon.dip.testsupport"
        }
)
@EntityScan(
        basePackages = {
                "at.procon.dip.domain.document.entity",
                "at.procon.dip.domain.tenant.entity",
                "at.procon.dip.embedding.job.entity"
        }
)
@EnableJpaRepositories(
        basePackages = {
                "at.procon.dip.domain.document.repository",
                "at.procon.dip.domain.tenant.repository",
                "at.procon.dip.embedding.job.repository"
        }
)
@EnableConfigurationProperties({TedProcessorProperties.class, EmbeddingProperties.class})
public class SearchSemanticTestApplication {
}

@ -0,0 +1,68 @@
package at.procon.dip.testsupport;
import at.procon.dip.testsupport.SearchTestDataFactory.CreatedDocument;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
/**
 * Test data factory that creates documents through {@link SearchTestDataFactory} and then
 * embeds them synchronously via the {@link RepresentationEmbeddingOrchestrator}, so that a
 * test can run a semantic search immediately after the factory call returns.
 */
@Component
@RequiredArgsConstructor
@Transactional
public class SemanticSearchTestDataFactory {

    /**
     * Upper bound on drain iterations per document. A test document needs only a handful of
     * batches; hitting this limit means the queue never runs dry, and failing fast is better
     * than hanging the test run.
     */
    private static final int MAX_DRAIN_BATCHES = 1_000;

    private final SearchTestDataFactory lexicalFactory;
    private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;

    /**
     * Creates a document with a single primary representation and embeds it.
     *
     * @param title document title
     * @param summary document summary
     * @param body text of the primary representation
     * @param documentType document type passed to the lexical factory
     * @param documentFamily document family passed to the lexical factory
     * @param languageCode language code passed to the lexical factory
     * @param primaryType representation type of the primary representation
     * @param modelKey embedding model key the document is enqueued for
     * @return the document as created by the lexical factory
     * @throws IllegalStateException if the embedding queue does not drain
     */
    public CreatedDocument createAndEmbedPrimaryRepresentation(
            String title,
            String summary,
            String body,
            DocumentType documentType,
            DocumentFamily documentFamily,
            String languageCode,
            RepresentationType primaryType,
            String modelKey) {
        CreatedDocument created = lexicalFactory.createDocumentWithPrimaryRepresentation(
                title,
                summary,
                body,
                documentType,
                documentFamily,
                languageCode,
                primaryType
        );
        embedDocument(created.document().getId(), modelKey);
        return created;
    }

    /**
     * Creates a document with a primary representation plus chunk representations and embeds it.
     *
     * @param title document title
     * @param summary document summary
     * @param primaryBody text of the primary representation
     * @param languageCode language code passed to the lexical factory
     * @param chunkBodies texts of the chunk representations
     * @param modelKey embedding model key the document is enqueued for
     * @return the document as created by the lexical factory
     * @throws IllegalStateException if the embedding queue does not drain
     */
    public CreatedDocument createAndEmbedPrimaryAndChunks(
            String title,
            String summary,
            String primaryBody,
            String languageCode,
            List<String> chunkBodies,
            String modelKey) {
        CreatedDocument created = lexicalFactory.createDocumentWithPrimaryAndChunks(
                title,
                summary,
                primaryBody,
                languageCode,
                chunkBodies
        );
        embedDocument(created.document().getId(), modelKey);
        return created;
    }

    /**
     * Enqueues the document for the given model and processes ready batches until one reports
     * no processed jobs, so embeddings exist when the caller continues.
     */
    private void embedDocument(java.util.UUID documentId, String modelKey) {
        embeddingOrchestrator.enqueueDocument(documentId, modelKey);
        int drainedBatches = 0;
        // Drain the ready queue synchronously for deterministic tests.
        while (embeddingOrchestrator.processNextReadyBatch() > 0) {
            drainedBatches++;
            if (drainedBatches >= MAX_DRAIN_BATCHES) {
                throw new IllegalStateException(
                        "Embedding queue did not drain after " + MAX_DRAIN_BATCHES
                                + " batches for document " + documentId + " and model " + modelKey);
            }
        }
    }
}

@ -0,0 +1,18 @@
package at.procon.dip.testsupport.config;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import org.springframework.boot.autoconfigure.jackson.Jackson2ObjectMapperBuilderCustomizer;
import org.springframework.boot.test.context.TestConfiguration;
import org.springframework.context.annotation.Bean;
/**
 * Test-only Jackson customization: makes sure {@code java.time} types are supported and are
 * written as ISO-8601 strings instead of numeric timestamps.
 */
@TestConfiguration
public class SearchTestJacksonConfig {

    /**
     * Adds {@link JavaTimeModule} to the modules already collected on the builder and disables
     * {@link SerializationFeature#WRITE_DATES_AS_TIMESTAMPS}.
     *
     * <p>The module is appended with {@code modulesToInstall(Consumer)} rather than
     * {@code modules(...)}: the latter replaces the complete module list and switches off
     * module detection, which would drop modules contributed by earlier customizers.
     */
    @Bean
    Jackson2ObjectMapperBuilderCustomizer searchTestJacksonCustomizer() {
        return builder -> builder
                .modulesToInstall(installed -> installed.add(new JavaTimeModule()))
                .featuresToDisable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
    }
}
Loading…
Cancel
Save