From ca502cb36971f972363d7645ac7a9b620b9b78ae Mon Sep 17 00:00:00 2001
From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com>
Date: Mon, 23 Mar 2026 11:18:19 +0100
Subject: [PATCH] embedding nv3.1

---
 docs/embedding/NV3_1_HARDENING_NOTES.md       |  25 ++++
 ...SemanticSearchEndpointIntegrationTest.java | 101 ++++++++++++++
 ...nticSearchOrchestratorIntegrationTest.java |  92 +++++++------
 ...SemanticModelSelectionIntegrationTest.java |  68 +++++++++
 ...AbstractSemanticSearchIntegrationTest.java | 148 ++++++++++++++++++++
 .../SearchSemanticTestApplication.java        |  39 ++++++
 .../SemanticSearchTestDataFactory.java        |  68 +++++++++
 .../config/SearchTestJacksonConfig.java       |  18 +++
 8 files changed, 517 insertions(+), 42 deletions(-)
 create mode 100644 docs/embedding/NV3_1_HARDENING_NOTES.md
 create mode 100644 src/test/java/at/procon/dip/search/integration/GenericSemanticSearchEndpointIntegrationTest.java
 create mode 100644 src/test/java/at/procon/dip/search/integration/SemanticModelSelectionIntegrationTest.java
 create mode 100644 src/test/java/at/procon/dip/testsupport/AbstractSemanticSearchIntegrationTest.java
 create mode 100644 src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java
 create mode 100644 src/test/java/at/procon/dip/testsupport/SemanticSearchTestDataFactory.java
 create mode 100644 src/test/java/at/procon/dip/testsupport/config/SearchTestJacksonConfig.java

diff --git a/docs/embedding/NV3_1_HARDENING_NOTES.md b/docs/embedding/NV3_1_HARDENING_NOTES.md
new file mode 100644
index 0000000..178be5c
--- /dev/null
+++ b/docs/embedding/NV3_1_HARDENING_NOTES.md
@@ -0,0 +1,25 @@
+# NV3.1 hardening notes
+
+This slice hardens the new parallel semantic search path introduced in NV3.
+
+## Scope
+
+- dedicated semantic search test application context
+- semantic endpoint integration test with MockMvc
+- semantic + hybrid orchestrator integration tests
+- semantic model selection test
+- pgvector-aware test database setup
+
+## Test focus
+
+1. semantic-only search returns hits when embeddings were created through the new subsystem
+2. hybrid search includes lexical and semantic engines together
+3. semantic model selection is honored (`semanticModelKey`)
+4. debug endpoint exposes semantic engine participation
+
+## Notes
+
+- tests use the new parallel embedding subsystem only
+- the legacy vectorization flow is not used
+- tests rely on the mock embedding provider for deterministic embeddings
+- the semantic test base uses a pgvector-enabled PostgreSQL image
diff --git a/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchEndpointIntegrationTest.java b/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchEndpointIntegrationTest.java
new file mode 100644
index 0000000..db8644a
--- /dev/null
+++ b/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchEndpointIntegrationTest.java
@@ -0,0 +1,101 @@
+package at.procon.dip.search.integration;
+
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.search.dto.SearchMode;
+import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
+import at.procon.dip.search.dto.SearchRequest;
+import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
+import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
+import at.procon.dip.testsupport.config.SearchTestJacksonConfig;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.util.Set;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Import;
+import org.springframework.http.MediaType;
+import org.springframework.test.context.TestPropertySource;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+
+import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
+
+@AutoConfigureMockMvc
+@Import(SearchTestJacksonConfig.class)
+@TestPropertySource(properties = {
+        "spring.mvc.converters.preferred-json-mapper=jackson"
+})
+class GenericSemanticSearchEndpointIntegrationTest extends AbstractSemanticSearchIntegrationTest {
+
+    @Autowired
+    private SemanticSearchTestDataFactory dataFactory;
+
+    @Autowired
+    private MockMvc mockMvc;
+
+    @Autowired
+    private ObjectMapper objectMapper;
+
+    @Test
+    void searchEndpoint_should_return_hits_for_semantic_request() throws Exception {
+        dataFactory.createAndEmbedPrimaryRepresentation(
+                "District heating modernization strategy",
+                "Municipal energy planning",
+                "District heating optimization strategy for municipal energy systems.",
+                DocumentType.TEXT,
+                DocumentFamily.GENERIC,
+                "en",
+                RepresentationType.SEMANTIC_TEXT,
+                "mock-search"
+        );
+
+        SearchRequest request = SearchRequest.builder()
+                .queryText("district heating optimization")
+                .modes(Set.of(SearchMode.SEMANTIC))
+                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
+                .semanticModelKey("mock-search")
+                .build();
+
+        mockMvc.perform(post("/search")
+                        .contentType(MediaType.APPLICATION_JSON)
+                        .accept(MediaType.APPLICATION_JSON)
+                        .characterEncoding("UTF-8")
+                        .content(objectMapper.writeValueAsString(request)))
+                .andExpect(status().isOk())
+                .andExpect(jsonPath("$.hits[0].title").value("District heating modernization strategy"))
+                .andExpect(jsonPath("$.enginesUsed").isArray());
+    }
+
+    @Test
+    void debugEndpoint_should_show_semantic_engine_in_plan() throws Exception {
+        dataFactory.createAndEmbedPrimaryRepresentation(
+                "Heat network planning",
+                "Municipal energy planning",
+                "District heating optimization strategy for municipal energy systems.",
+                DocumentType.TEXT,
+                DocumentFamily.GENERIC,
+                "en",
+                RepresentationType.SEMANTIC_TEXT,
+                "mock-search"
+        );
+
+        SearchRequest request = SearchRequest.builder()
+                .queryText("district heating optimization")
+                .modes(Set.of(SearchMode.HYBRID))
+                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
+                .semanticModelKey("mock-search")
+                .build();
+
+        mockMvc.perform(post("/search/debug")
+                        .contentType(MediaType.APPLICATION_JSON)
+                        .accept(MediaType.APPLICATION_JSON)
+                        .characterEncoding("UTF-8")
+                        .content(objectMapper.writeValueAsString(request)))
+                .andExpect(status().isOk())
+                .andExpect(jsonPath("$.plan.engines").isArray())
+                .andExpect(jsonPath("$.fusedResponse.hits[0].title").value("Heat network planning"));
+    }
+}
diff --git a/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchOrchestratorIntegrationTest.java b/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchOrchestratorIntegrationTest.java
index 19e879e..d60182a 100644
--- a/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchOrchestratorIntegrationTest.java
+++ b/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchOrchestratorIntegrationTest.java
@@ -3,7 +3,6 @@ package at.procon.dip.search.integration;
 import at.procon.dip.domain.document.DocumentFamily;
 import at.procon.dip.domain.document.DocumentType;
 import at.procon.dip.domain.document.RepresentationType;
-import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
 import at.procon.dip.search.dto.SearchEngineType;
 import at.procon.dip.search.dto.SearchMode;
 import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
@@ -11,71 +10,80 @@ import at.procon.dip.search.dto.SearchRequest;
 import at.procon.dip.search.dto.SearchResponse;
 import at.procon.dip.search.service.SearchOrchestrator;
 import at.procon.dip.search.spi.SearchDocumentScope;
-import at.procon.dip.testsupport.AbstractSearchIntegrationTest;
-import at.procon.dip.testsupport.SearchTestDataFactory;
-import java.util.List;
+import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
+import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
 import java.util.Set;
 import org.junit.jupiter.api.Test;
 import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.test.context.TestPropertySource;
 
 import static org.assertj.core.api.Assertions.assertThat;
 
-@TestPropertySource(properties = {
-        "dip.embedding.enabled=true",
-        "dip.embedding.default-document-model=mock-search",
-        "dip.embedding.default-query-model=mock-search",
-        "dip.embedding.providers.mock-default.type=mock",
-        "dip.embedding.providers.mock-default.dimensions=16",
-        "dip.embedding.models.mock-search.provider-config-key=mock-default",
-        "dip.embedding.models.mock-search.provider-model-key=mock-search",
-        "dip.embedding.models.mock-search.dimensions=16",
-        "dip.embedding.models.mock-search.active=true",
-        "dip.embedding.jobs.enabled=true",
-        "ted.search.similarity-threshold=0.10",
-        "ted.search.semantic-candidate-limit=50"
-})
-class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest {
+class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSemanticSearchIntegrationTest {
 
     @Autowired
-    private SearchTestDataFactory dataFactory;
-
-    @Autowired
-    private RepresentationEmbeddingOrchestrator embeddingOrchestrator;
+    private SemanticSearchTestDataFactory dataFactory;
 
     @Autowired
     private SearchOrchestrator searchOrchestrator;
 
     @Test
-    void semanticMode_should_return_document_from_chunk_embeddings() {
-        var created = dataFactory.createDocumentWithPrimaryAndChunks(
-                "Energy optimization strategy",
-                "Strategy overview",
-                "This primary representation only contains a high level overview.",
+    void semanticMode_should_return_embedded_document() {
+        dataFactory.createAndEmbedPrimaryRepresentation(
+                "District heating modernization strategy",
+                "Municipal energy planning",
+                "District heating optimization strategy for municipal energy systems.",
+                DocumentType.TEXT,
+                DocumentFamily.GENERIC,
                 "en",
-                List.of(
-                        "Chunk one is introductory and does not contain the target phrase.",
-                        "District heating optimization strategy for municipal energy systems is described here."
-                ));
-
-        embeddingOrchestrator.enqueueDocument(created.document().getId(), "mock-search");
-        int processed = embeddingOrchestrator.processNextReadyBatch();
-        assertThat(processed).isGreaterThan(0);
+                RepresentationType.SEMANTIC_TEXT,
+                "mock-search"
+        );
 
         SearchRequest request = SearchRequest.builder()
-                .queryText("district heating optimization strategy")
+                .queryText("district heating optimization")
                 .modes(Set.of(SearchMode.SEMANTIC))
-                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS)
+                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                 .semanticModelKey("mock-search")
                 .build();
 
         SearchResponse response = searchOrchestrator.search(
                 request,
-                new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null));
+                new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)
+        );
 
         assertThat(response.getHits()).isNotEmpty();
-        assertThat(response.getHits().getFirst().getTitle()).isEqualTo("Energy optimization strategy");
+        assertThat(response.getHits().getFirst().getTitle()).isEqualTo("District heating modernization strategy");
+        assertThat(response.getEnginesUsed()).contains(SearchEngineType.PGVECTOR_SEMANTIC);
         assertThat(response.getHits().getFirst().getPrimaryEngine()).isEqualTo(SearchEngineType.PGVECTOR_SEMANTIC);
-        assertThat(response.getHits().getFirst().getRepresentationType()).isEqualTo(RepresentationType.CHUNK);
+    }
+
+    @Test
+    void hybridMode_should_include_semantic_engine_alongside_lexical_engines() {
+        dataFactory.createAndEmbedPrimaryRepresentation(
+                "Energy transition framework",
+                "Framework for district heating rollout",
+                "District heating optimization framework for urban energy transition.",
+                DocumentType.TEXT,
+                DocumentFamily.GENERIC,
+                "en",
+                RepresentationType.SEMANTIC_TEXT,
+                "mock-search"
+        );
+
+        SearchRequest request = SearchRequest.builder()
+                .queryText("district heating optimization")
+                .modes(Set.of(SearchMode.HYBRID))
+                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
+                .semanticModelKey("mock-search")
+                .build();
+
+        SearchResponse response = searchOrchestrator.search(
+                request,
+                new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)
+        );
+
+        assertThat(response.getHits()).isNotEmpty();
+        assertThat(response.getEnginesUsed()).contains(SearchEngineType.POSTGRES_FULLTEXT, SearchEngineType.PGVECTOR_SEMANTIC);
+        assertThat(response.getHits().getFirst().getFinalScore()).isGreaterThan(0.0d);
     }
 }
diff --git a/src/test/java/at/procon/dip/search/integration/SemanticModelSelectionIntegrationTest.java b/src/test/java/at/procon/dip/search/integration/SemanticModelSelectionIntegrationTest.java
new file mode 100644
index 0000000..f9deee4
--- /dev/null
+++ b/src/test/java/at/procon/dip/search/integration/SemanticModelSelectionIntegrationTest.java
@@ -0,0 +1,68 @@
+package at.procon.dip.search.integration;
+
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.search.dto.SearchMode;
+import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
+import at.procon.dip.search.dto.SearchRequest;
+import at.procon.dip.search.dto.SearchResponse;
+import at.procon.dip.search.service.SearchOrchestrator;
+import at.procon.dip.search.spi.SearchDocumentScope;
+import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
+import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
+import java.util.Set;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+class SemanticModelSelectionIntegrationTest extends AbstractSemanticSearchIntegrationTest {
+
+    @Autowired
+    private SemanticSearchTestDataFactory dataFactory;
+
+    @Autowired
+    private SearchOrchestrator searchOrchestrator;
+
+    @Test
+    void semanticModelKey_should_control_which_embeddings_are_used() {
+        dataFactory.createAndEmbedPrimaryRepresentation(
+                "Heat network planning",
+                "Alt-model semantic document",
+                "District heating optimization strategy for municipal networks.",
+                DocumentType.TEXT,
+                DocumentFamily.GENERIC,
+                "en",
+                RepresentationType.SEMANTIC_TEXT,
+                "mock-search-alt"
+        );
+
+        SearchRequest defaultModelRequest = SearchRequest.builder()
+                .queryText("district heating optimization")
+                .modes(Set.of(SearchMode.SEMANTIC))
+                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
+                .build();
+
+        SearchRequest alternateModelRequest = SearchRequest.builder()
+                .queryText("district heating optimization")
+                .modes(Set.of(SearchMode.SEMANTIC))
+                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
+                .semanticModelKey("mock-search-alt")
+                .build();
+
+        SearchResponse defaultModelResponse = searchOrchestrator.search(
+                defaultModelRequest,
+                new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)
+        );
+
+        SearchResponse alternateModelResponse = searchOrchestrator.search(
+                alternateModelRequest,
+                new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)
+        );
+
+        assertThat(defaultModelResponse.getHits()).isEmpty();
+        assertThat(alternateModelResponse.getHits()).hasSize(1);
+        assertThat(alternateModelResponse.getHits().getFirst().getTitle()).isEqualTo("Heat network planning");
+    }
+}
diff --git a/src/test/java/at/procon/dip/testsupport/AbstractSemanticSearchIntegrationTest.java b/src/test/java/at/procon/dip/testsupport/AbstractSemanticSearchIntegrationTest.java
new file mode 100644
index 0000000..93e593b
--- /dev/null
+++ b/src/test/java/at/procon/dip/testsupport/AbstractSemanticSearchIntegrationTest.java
@@ -0,0 +1,148 @@
+package at.procon.dip.testsupport;
+
+import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
+import at.procon.dip.domain.document.repository.DocumentRepository;
+import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
+import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
+import java.time.OffsetDateTime;
+import javax.sql.DataSource;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.TestInstance;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.test.context.DynamicPropertyRegistry;
+import org.springframework.test.context.DynamicPropertySource;
+import org.springframework.test.context.TestPropertySource;
+import org.testcontainers.containers.PostgreSQLContainer;
+
+/**
+ * Base class for semantic-search integration tests.
+ *
+ * <p>Boots {@link SearchSemanticTestApplication} against one shared pgvector PostgreSQL container,
+ * configures two mock embedding models ({@code mock-search}, {@code mock-search-alt}) and
+ * truncates the {@code doc} test tables before every test.
+ */
+@SpringBootTest(classes = SearchSemanticTestApplication.class, webEnvironment = SpringBootTest.WebEnvironment.MOCK)
+@TestInstance(TestInstance.Lifecycle.PER_CLASS)
+@TestPropertySource(properties = {
+        "spring.jpa.hibernate.ddl-auto=create-drop",
+        "spring.jpa.show-sql=false",
+        "spring.jpa.open-in-view=false",
+        "spring.jpa.properties.hibernate.default_schema=DOC",
+        "spring.main.lazy-initialization=true",
+        "server.servlet.context-path=/api",
+
+        "ted.search.default-page-size=20",
+        "ted.search.max-page-size=100",
+        "ted.search.fulltext-weight=0.35",
+        "ted.search.trigram-weight=0.20",
+        "ted.search.semantic-weight=0.45",
+        "ted.search.recency-boost-weight=0.05",
+        "ted.search.trigram-threshold=0.10",
+        "ted.search.semantic-candidate-limit=50",
+        "ted.search.similarity-threshold=0.01",
+
+        "dip.embedding.enabled=true",
+        "dip.embedding.default-document-model=mock-search",
+        "dip.embedding.default-query-model=mock-search",
+        "dip.embedding.jobs.enabled=true",
+        "dip.embedding.jobs.batch-size=8",
+        "dip.embedding.providers.mock-default.type=mock",
+        "dip.embedding.providers.mock-default.dimensions=16",
+        "dip.embedding.providers.mock-alt.type=mock",
+        "dip.embedding.providers.mock-alt.dimensions=16",
+        "dip.embedding.models.mock-search.provider-config-key=mock-default",
+        "dip.embedding.models.mock-search.provider-model-key=mock-search",
+        "dip.embedding.models.mock-search.dimensions=16",
+        "dip.embedding.models.mock-search.supports-query-embedding-mode=true",
+        "dip.embedding.models.mock-search.active=true",
+        "dip.embedding.models.mock-search-alt.provider-config-key=mock-alt",
+        "dip.embedding.models.mock-search-alt.provider-model-key=mock-search-alt",
+        "dip.embedding.models.mock-search-alt.dimensions=16",
+        "dip.embedding.models.mock-search-alt.supports-query-embedding-mode=true",
+        "dip.embedding.models.mock-search-alt.active=true"
+})
+public abstract class AbstractSemanticSearchIntegrationTest {
+
+    /**
+     * Singleton container shared by every subclass: started once in the static initializer below
+     * and deliberately not managed by {@code @Testcontainers}/{@code @Container}. The JUnit
+     * extension would stop it when the first test class finishes, while Spring's cached
+     * application context keeps the JDBC URL of that stopped container. Testcontainers' Ryuk
+     * sidecar removes the container when the JVM exits.
+     */
+    static PostgreSQLContainer<?> postgres = new PostgreSQLContainer<>("pgvector/pgvector:pg16")
+            .withDatabaseName("dip_semantic_search_test")
+            .withUsername("test")
+            .withPassword("test")
+            .withInitScript("sql/create-doc-search-test-schemas.sql");
+
+    static {
+        postgres.start();
+    }
+
+    /** Points the Spring datasource properties at the running container. */
+    @DynamicPropertySource
+    static void registerProperties(DynamicPropertyRegistry registry) {
+        registry.add("spring.datasource.url", postgres::getJdbcUrl);
+        registry.add("spring.datasource.username", postgres::getUsername);
+        registry.add("spring.datasource.password", postgres::getPassword);
+        registry.add("spring.datasource.driver-class-name", postgres::getDriverClassName);
+    }
+
+    @Autowired
+    protected JdbcTemplate jdbcTemplate;
+
+    @Autowired
+    protected DataSource dataSource;
+
+    @Autowired
+    protected DocumentRepository documentRepository;
+
+    @Autowired
+    protected DocumentTextRepresentationRepository representationRepository;
+
+    @Autowired
+    protected DocumentEmbeddingRepository embeddingRepository;
+
+    @Autowired
+    protected DocumentEmbeddingModelRepository embeddingModelRepository;
+
+    @Autowired
+    protected DocumentTenantRepository tenantRepository;
+
+    /** Re-applies the search DDL and empties the test tables before each test. */
+    @BeforeEach
+    void resetSemanticSearchTestDatabase() {
+        ensureSearchColumnsAndIndexes();
+        cleanupDatabase();
+    }
+
+    /**
+     * Adds the extensions, columns and indexes used for full-text, trigram and vector search.
+     * Every statement uses {@code IF NOT EXISTS}, so running it before each test is safe.
+     */
+    protected void ensureSearchColumnsAndIndexes() {
+        jdbcTemplate.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm WITH SCHEMA doc");
+        jdbcTemplate.execute("CREATE EXTENSION IF NOT EXISTS vector WITH SCHEMA public");
+        jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_config VARCHAR(64)");
+        jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_vector tsvector");
+        jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector_test ON doc.doc_text_representation USING GIN (search_vector)");
+        jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm_test ON doc.doc_document USING GIN (title doc.gin_trgm_ops)");
+        jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm_test ON doc.doc_document USING GIN (summary doc.gin_trgm_ops)");
+        jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm_test ON doc.doc_text_representation USING GIN (text_body doc.gin_trgm_ops)");
+        jdbcTemplate.execute("ALTER TABLE doc.doc_embedding ADD COLUMN IF NOT EXISTS embedding_vector public.vector");
+    }
+
+    /** Truncates the {@code doc} tables written by these tests and restarts their identities. */
+    protected void cleanupDatabase() {
+        jdbcTemplate.execute("TRUNCATE TABLE doc.doc_embedding_job, doc.doc_embedding, doc.doc_embedding_model, doc.doc_text_representation, doc.doc_document, doc.doc_tenant RESTART IDENTITY CASCADE");
+    }
+
+    /** Sets {@code created_at} and {@code updated_at} of the given document to {@code createdAt}. */
+    protected void setDocumentCreatedAt(java.util.UUID documentId, OffsetDateTime createdAt) {
+        jdbcTemplate.update("UPDATE doc.doc_document SET created_at = ?, updated_at = ? WHERE id = ?", createdAt, createdAt, documentId);
+    }
+}
diff --git a/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java b/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java
new file mode 100644
index 0000000..62f8bf4
--- /dev/null
+++ b/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java
@@ -0,0 +1,39 @@
+package at.procon.dip.testsupport;
+
+import at.procon.dip.embedding.config.EmbeddingProperties;
+import at.procon.ted.config.TedProcessorProperties;
+import org.springframework.boot.SpringBootConfiguration;
+import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
+import org.springframework.boot.autoconfigure.domain.EntityScan;
+import org.springframework.boot.context.properties.EnableConfigurationProperties;
+import org.springframework.context.annotation.ComponentScan;
+import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
+
+/**
+ * Narrow semantic-search test application that loads the new generic search subsystem
+ * plus the new parallel embedding subsystem.
+ */
+@SpringBootConfiguration
+@EnableAutoConfiguration(excludeName = {
+        "org.apache.camel.spring.boot.CamelAutoConfiguration",
+        "org.springframework.boot.autoconfigure.task.TaskSchedulingAutoConfiguration"
+})
+@EnableConfigurationProperties({TedProcessorProperties.class, EmbeddingProperties.class})
+@EntityScan(basePackages = {
+        "at.procon.dip.domain.document.entity",
+        "at.procon.dip.domain.tenant.entity",
+        "at.procon.dip.embedding.job.entity"
+})
+@EnableJpaRepositories(basePackages = {
+        "at.procon.dip.domain.document.repository",
+        "at.procon.dip.domain.tenant.repository",
+        "at.procon.dip.embedding.job.repository"
+})
+@ComponentScan(basePackages = {
+        "at.procon.dip.domain.document.service",
+        "at.procon.dip.embedding",
+        "at.procon.dip.search",
+        "at.procon.dip.testsupport"
+})
+public class SearchSemanticTestApplication {
+}
diff --git a/src/test/java/at/procon/dip/testsupport/SemanticSearchTestDataFactory.java b/src/test/java/at/procon/dip/testsupport/SemanticSearchTestDataFactory.java
new file mode 100644
index 0000000..3fc9d26
--- /dev/null
+++ b/src/test/java/at/procon/dip/testsupport/SemanticSearchTestDataFactory.java
@@ -0,0 +1,68 @@
+package at.procon.dip.testsupport;
+
+import at.procon.dip.testsupport.SearchTestDataFactory.CreatedDocument;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
+import java.util.List;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Component;
+import org.springframework.transaction.annotation.Transactional;
+
+@Component
+@RequiredArgsConstructor
+@Transactional
+public class SemanticSearchTestDataFactory {
+
+    private final SearchTestDataFactory lexicalFactory;
+    private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
+
+    public CreatedDocument createAndEmbedPrimaryRepresentation(
+            String title,
+            String summary,
+            String body,
+            DocumentType documentType,
+            DocumentFamily documentFamily,
+            String languageCode,
+            RepresentationType primaryType,
+            String modelKey) {
+
+        CreatedDocument created = lexicalFactory.createDocumentWithPrimaryRepresentation(
+                title,
+                summary,
+                body,
+                documentType,
+                documentFamily,
+                languageCode,
+                primaryType
+        );
+        embedDocument(created.document().getId(), modelKey);
+        return created;
+    }
+
+    public CreatedDocument createAndEmbedPrimaryAndChunks(
+            String title,
+            String summary,
+            String primaryBody,
+            String languageCode,
+            List<String> chunkBodies,
+            String modelKey) {
+        CreatedDocument created = lexicalFactory.createDocumentWithPrimaryAndChunks(
+                title,
+                summary,
+                primaryBody,
+                languageCode,
+                chunkBodies
+        );
+        embedDocument(created.document().getId(), modelKey);
+        return created;
+    }
+
+    private void embedDocument(java.util.UUID documentId, String modelKey) {
+        embeddingOrchestrator.enqueueDocument(documentId, modelKey);
+        while (embeddingOrchestrator.processNextReadyBatch() > 0) {
+            // intentionally empty: drain the ready queue synchronously for deterministic tests
+        }
+    }
+}
diff --git a/src/test/java/at/procon/dip/testsupport/config/SearchTestJacksonConfig.java b/src/test/java/at/procon/dip/testsupport/config/SearchTestJacksonConfig.java
new file mode 100644
index 0000000..f91ae56
--- /dev/null
+++ b/src/test/java/at/procon/dip/testsupport/config/SearchTestJacksonConfig.java
@@ -0,0 +1,18 @@
+package at.procon.dip.testsupport.config;
+
+import com.fasterxml.jackson.databind.SerializationFeature;
+import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
+import org.springframework.boot.autoconfigure.jackson.Jackson2ObjectMapperBuilderCustomizer;
+import org.springframework.boot.test.context.TestConfiguration;
+import org.springframework.context.annotation.Bean;
+
+@TestConfiguration
+public class SearchTestJacksonConfig {
+
+    @Bean
+    Jackson2ObjectMapperBuilderCustomizer searchTestJacksonCustomizer() {
+        return builder -> builder
+                .modules(new JavaTimeModule())
+                .featuresToDisable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
+    }
+}