diff --git a/docs/embedding/NV1-NV3_IMPLEMENTATION_PLAN.md b/docs/embedding/NV1-NV3_IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..3d9d322 --- /dev/null +++ b/docs/embedding/NV1-NV3_IMPLEMENTATION_PLAN.md @@ -0,0 +1,84 @@ +# Parallel embedding subsystem plan (NV1–NV3) + +This plan assumes the old vectorization subsystem remains in place temporarily, while a new generic embedding subsystem is built in parallel. + +## Principles + +- Build the new subsystem under `at.procon.dip.embedding.*`. +- Do not shape it around old TED-specific or legacy vectorization services. +- Operate on `DocumentTextRepresentation` and `DocumentEmbedding` as the core abstraction. +- Keep the new subsystem configurable and provider-based. +- Migrate and cut over later. + +## NV1 — provider/model/query foundation + +### Goal +Create a standalone embedding foundation that can: +- resolve configured providers +- resolve configured models +- embed arbitrary text lists +- embed search queries +- support deterministic testing + +### Deliverables +- `EmbeddingProperties` +- `EmbeddingUseCase` +- `EmbeddingRequest` +- `EmbeddingProviderResult` +- `EmbeddingModelDescriptor` +- `ResolvedEmbeddingProviderConfig` +- `EmbeddingProvider` +- `ExternalHttpEmbeddingProvider` +- `MockEmbeddingProvider` +- `EmbeddingProviderRegistry` +- `EmbeddingModelRegistry` +- `EmbeddingProviderConfigResolver` +- `EmbeddingExecutionService` +- `QueryEmbeddingService` +- startup validation of provider/model wiring + +### Notes +- No cutover to the old vectorization path yet. +- No persistence/job orchestration yet. +- New subsystem should be safe to include in the app while disabled by default. + +## NV2 — persistence and job orchestration + +### Goal +Make the new subsystem able to create and process embedding jobs against `DocumentTextRepresentation`. 
+ +### Deliverables +- `EmbeddingJob` entity/repository/service +- retry / backoff policy +- default `EmbeddingSelectionPolicy` +- representation-level embedding execution +- `DocumentEmbedding` persistence updates through the new subsystem + +## NV3 — generic semantic search engine + +### Goal +Add semantic search into the generic search platform using only the new subsystem. + +### Deliverables +- `PgVectorSemanticSearchEngine` +- `DocumentSemanticSearchRepository` +- query embedding through `QueryEmbeddingService` +- chunk-aware retrieval and collapse +- fusion with lexical search + +## Migration philosophy + +Because the app is still in development, prefer: + +1. migrate documents and text representations first +2. re-embed through the new subsystem +3. only preserve old raw vector data if there is a strong operational reason + +## Recommended implementation order + +1. NV1 foundation +2. NV1 tests with mock provider +3. NV2 jobs and selection policy +4. NV3 semantic search +5. migration/backfill +6. 
cutover diff --git a/src/test/java/at/procon/dip/embedding/provider/mock/MockEmbeddingProviderTest.java b/src/test/java/at/procon/dip/embedding/provider/mock/MockEmbeddingProviderTest.java new file mode 100644 index 0000000..f312b53 --- /dev/null +++ b/src/test/java/at/procon/dip/embedding/provider/mock/MockEmbeddingProviderTest.java @@ -0,0 +1,39 @@ +package at.procon.dip.embedding.provider.mock; + +import static org.assertj.core.api.Assertions.assertThat; + +import at.procon.dip.domain.document.DistanceMetric; +import at.procon.dip.embedding.model.EmbeddingModelDescriptor; +import at.procon.dip.embedding.model.EmbeddingRequest; +import at.procon.dip.embedding.model.EmbeddingUseCase; +import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig; +import java.util.List; +import org.junit.jupiter.api.Test; + +class MockEmbeddingProviderTest { + + private final MockEmbeddingProvider provider = new MockEmbeddingProvider(); + + @Test + void should_produce_deterministic_vectors() { + EmbeddingModelDescriptor model = new EmbeddingModelDescriptor( + "mock-search", "mock-default", "mock-search", 8, DistanceMetric.COSINE, true, true, null, true); + ResolvedEmbeddingProviderConfig config = ResolvedEmbeddingProviderConfig.builder() + .key("mock-default") + .providerType("mock") + .dimensions(8) + .build(); + EmbeddingRequest request = EmbeddingRequest.builder() + .modelKey("mock-search") + .useCase(EmbeddingUseCase.DOCUMENT) + .texts(List.of("district heating optimization")) + .build(); + + var first = provider.embedDocuments(config, model, request); + var second = provider.embedDocuments(config, model, request); + + assertThat(first.vectors()).hasSize(1); + assertThat(second.vectors()).hasSize(1); + assertThat(first.vectors().getFirst()).containsExactly(second.vectors().getFirst()); + } +} diff --git a/src/test/java/at/procon/dip/embedding/registry/EmbeddingModelRegistryTest.java b/src/test/java/at/procon/dip/embedding/registry/EmbeddingModelRegistryTest.java new 
file mode 100644 index 0000000..431231e --- /dev/null +++ b/src/test/java/at/procon/dip/embedding/registry/EmbeddingModelRegistryTest.java @@ -0,0 +1,29 @@ +package at.procon.dip.embedding.registry; + +import static org.assertj.core.api.Assertions.assertThat; + +import at.procon.dip.domain.document.DistanceMetric; +import at.procon.dip.embedding.config.EmbeddingProperties; +import org.junit.jupiter.api.Test; + +class EmbeddingModelRegistryTest { + + @Test + void should_resolve_active_model_from_properties() { + EmbeddingProperties properties = new EmbeddingProperties(); + properties.setDefaultDocumentModel("mock-search"); + EmbeddingProperties.ModelProperties model = new EmbeddingProperties.ModelProperties(); + model.setProviderConfigKey("mock-default"); + model.setProviderModelKey("mock-search"); + model.setDimensions(16); + model.setDistanceMetric(DistanceMetric.COSINE); + model.setSupportsQueryEmbeddingMode(true); + model.setActive(true); + properties.getModels().put("mock-search", model); + + EmbeddingModelRegistry registry = new EmbeddingModelRegistry(properties); + + assertThat(registry.getRequiredDefaultDocumentModelKey()).isEqualTo("mock-search"); + assertThat(registry.getRequired("mock-search").providerConfigKey()).isEqualTo("mock-default"); + } +} diff --git a/src/test/java/at/procon/dip/embedding/service/DefaultQueryEmbeddingServiceTest.java b/src/test/java/at/procon/dip/embedding/service/DefaultQueryEmbeddingServiceTest.java new file mode 100644 index 0000000..460c6c3 --- /dev/null +++ b/src/test/java/at/procon/dip/embedding/service/DefaultQueryEmbeddingServiceTest.java @@ -0,0 +1,38 @@ +package at.procon.dip.embedding.service; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import at.procon.dip.domain.document.DistanceMetric; +import at.procon.dip.embedding.model.EmbeddingModelDescriptor; +import at.procon.dip.embedding.model.EmbeddingProviderResult; 
+import at.procon.dip.embedding.model.EmbeddingUseCase; +import at.procon.dip.embedding.registry.EmbeddingModelRegistry; +import java.util.List; +import org.junit.jupiter.api.Test; + +class DefaultQueryEmbeddingServiceTest { + + @Test + void should_use_default_query_model() { + EmbeddingExecutionService executionService = mock(EmbeddingExecutionService.class); + EmbeddingModelRegistry modelRegistry = mock(EmbeddingModelRegistry.class); + + when(modelRegistry.getRequiredDefaultQueryModelKey()).thenReturn("mock-search"); + when(executionService.embedTexts("mock-search", EmbeddingUseCase.QUERY, List.of("framework agreement"))) + .thenReturn(new EmbeddingProviderResult( + new EmbeddingModelDescriptor("mock-search", "mock-default", "mock-search", 4, + DistanceMetric.COSINE, true, true, null, true), + List.of(new float[]{1f, 2f, 3f, 4f}), + List.of(), + "req-1", + 2 + )); + + DefaultQueryEmbeddingService service = new DefaultQueryEmbeddingService(executionService, modelRegistry); + float[] vector = service.embedQuery("framework agreement"); + + assertThat(vector).containsExactly(1f, 2f, 3f, 4f); + } +} diff --git a/src/test/java/at/procon/dip/search/integration/GenericSearchEndpointIntegrationTest.java b/src/test/java/at/procon/dip/search/integration/GenericSearchEndpointIntegrationTest.java new file mode 100644 index 0000000..93e0651 --- /dev/null +++ b/src/test/java/at/procon/dip/search/integration/GenericSearchEndpointIntegrationTest.java @@ -0,0 +1,138 @@ +package at.procon.dip.search.integration; + +import at.procon.dip.config.JacksonConfig; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.search.dto.SearchMode; +import at.procon.dip.search.dto.SearchRepresentationSelectionMode; +import at.procon.dip.search.dto.SearchRequest; +import at.procon.dip.testsupport.AbstractSearchIntegrationTest; +import 
at.procon.dip.testsupport.SearchTestDataFactory; +import at.procon.dip.testsupport.config.SearchTestConfig; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.util.List; +import java.util.Set; + +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.autoconfigure.ImportAutoConfiguration; +import org.springframework.boot.autoconfigure.http.HttpMessageConvertersAutoConfiguration; +import org.springframework.boot.autoconfigure.jackson.JacksonAutoConfiguration; +import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration; +import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.context.annotation.Import; +import org.springframework.http.MediaType; +import org.springframework.test.web.servlet.MockMvc; + +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.get; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status; + + + +@Import(SearchTestConfig.class) +@ImportAutoConfiguration({ + JacksonAutoConfiguration.class, + HttpMessageConvertersAutoConfiguration.class, + WebMvcAutoConfiguration.class +}) +class GenericSearchEndpointIntegrationTest extends AbstractSearchIntegrationTest { + + @Autowired + private SearchTestDataFactory dataFactory; + + @Autowired + private MockMvc mockMvc; + + @Autowired + private ObjectMapper objectMapper; + + @Test + void searchEndpoint_should_return_hits_for_fulltext_request() throws Exception { + 
dataFactory.createDocumentWithPrimaryRepresentation( + "Vienna school renovation framework", + "School roof framework agreement", + "Framework agreement for school roof renovation in Vienna.", + DocumentType.TED_NOTICE, + DocumentFamily.PROCUREMENT, + "en", + RepresentationType.SEMANTIC_TEXT); + + SearchRequest request = SearchRequest.builder() + .queryText("framework agreement") + .modes(Set.of(SearchMode.FULLTEXT)) + .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY) + .build(); + + mockMvc.perform(post("/search") + .contentType(MediaType.APPLICATION_JSON) + .accept(MediaType.APPLICATION_JSON) + .characterEncoding("UTF-8") + .content(objectMapper.writeValueAsString(request))) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.hits[0].title").value("Vienna school renovation framework")) + .andExpect(jsonPath("$.enginesUsed[0]").value("POSTGRES_FULLTEXT")); + } + + @Test + void debugEndpoint_should_return_plan_and_engine_results() throws Exception { + dataFactory.createDocumentWithPrimaryRepresentation( + "Maintenance manual", + "Factory maintenance manual", + "Maintenance manual for calibration and preventive checks.", + DocumentType.PDF, + DocumentFamily.KNOWLEDGE, + "en", + RepresentationType.FULLTEXT); + + SearchRequest request = SearchRequest.builder() + .queryText("maintenence manual") + .modes(Set.of(SearchMode.HYBRID)) + .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY) + .build(); + + mockMvc.perform(post("/search/debug") + .contentType(MediaType.APPLICATION_JSON) + .accept(MediaType.APPLICATION_JSON) + .characterEncoding("UTF-8") + .content(objectMapper.writeValueAsString(request))) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.plan.engines").isArray()) + .andExpect(jsonPath("$.engineResults").isArray()) + .andExpect(jsonPath("$.fusedResponse.hits[0].title").value("Maintenance manual")); + } + + @Test + void metricsEndpoint_should_return_search_metrics_snapshot() throws Exception { + 
dataFactory.createDocumentWithPrimaryAndChunks( + "Energy optimization strategy", + "Strategy overview", + "This primary representation only contains a high level overview.", + "en", + List.of("District heating optimization strategy for municipal energy systems is described here.")); + + SearchRequest request = SearchRequest.builder() + .queryText("district heating optimization") + .modes(Set.of(SearchMode.FULLTEXT)) + .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS) + .build(); + + mockMvc.perform(post("/search") + .contentType(MediaType.APPLICATION_JSON) + .accept(MediaType.APPLICATION_JSON) + .characterEncoding("UTF-8") + .content(objectMapper.writeValueAsString(request))) + .andExpect(status().isOk()); + + mockMvc.perform(get("/search/metrics")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.totalSearchRequests").isNumber()) + .andExpect(jsonPath("$.representationCounts").exists()); + } +} diff --git a/src/test/java/at/procon/dip/search/integration/GenericSearchOrchestratorIntegrationTest.java b/src/test/java/at/procon/dip/search/integration/GenericSearchOrchestratorIntegrationTest.java new file mode 100644 index 0000000..86b463e --- /dev/null +++ b/src/test/java/at/procon/dip/search/integration/GenericSearchOrchestratorIntegrationTest.java @@ -0,0 +1,174 @@ +package at.procon.dip.search.integration; + +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.search.api.SearchExecutionContext; +import at.procon.dip.search.dto.*; +import at.procon.dip.search.engine.trigram.PostgresTrigramSearchEngine; +import at.procon.dip.search.service.SearchOrchestrator; +import at.procon.dip.search.spi.SearchDocumentScope; +import at.procon.dip.testsupport.AbstractSearchIntegrationTest; +import at.procon.dip.testsupport.SearchTestDataFactory; +import java.util.List; +import java.util.Set; +import 
org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.test.annotation.DirtiesContext; + +import static org.assertj.core.api.Assertions.assertThat; + +@DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) +class GenericSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest { + + @Autowired + private SearchTestDataFactory dataFactory; + + @Autowired + private SearchOrchestrator searchOrchestrator; + + @Autowired + private PostgresTrigramSearchEngine trigramSearchEngine; + + @Test + void hybridSearch_should_collapse_document_hits_when_fulltext_and_trigram_match_same_document() { + dataFactory.createDocumentWithPrimaryRepresentation( + "Maintenance manual", + "Factory maintenance manual", + "Maintenance manual for calibration and preventive checks.", + DocumentType.PDF, + DocumentFamily.KNOWLEDGE, + "en", + RepresentationType.FULLTEXT); + + SearchRequest request = SearchRequest.builder() + .queryText("Maintenance manual") + .modes(Set.of(SearchMode.HYBRID)) + .collapseByDocument(true) + .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY) + .build(); + + SearchResponse response = searchOrchestrator.search( + request, + new SearchDocumentScope(Set.of(), null, null, null, null)); + + assertThat(response.getHits()).hasSize(1); + assertThat(response.getHits().getFirst().getTitle()).isEqualTo("Maintenance manual"); + assertThat(response.getEnginesUsed()).isNotEmpty(); + assertThat(response.getHits().getFirst().getFinalScore()).isGreaterThan(0.0d); + } + + @Test + void representationSelectionMode_should_control_chunk_visibility() { + dataFactory.createDocumentWithPrimaryAndChunks( + "Energy optimization strategy", + "Strategy overview", + "This primary representation only contains a high level overview.", + "en", + List.of( + "Chunk one is introductory and does not contain the target phrase.", + "District heating optimization strategy for municipal energy 
systems is described here." + )); + + SearchRequest primaryOnly = SearchRequest.builder() + .queryText("district heating optimization") + .modes(Set.of(SearchMode.FULLTEXT)) + .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY) + .build(); + + SearchRequest primaryAndChunks = SearchRequest.builder() + .queryText("district heating optimization") + .modes(Set.of(SearchMode.FULLTEXT)) + .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS) + .build(); + + SearchResponse primaryOnlyResponse = searchOrchestrator.search( + primaryOnly, + new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)); + + SearchResponse primaryAndChunksResponse = searchOrchestrator.search( + primaryAndChunks, + new SearchDocumentScope(Set.of(), Set.of(DocumentType.TEXT), Set.of(DocumentFamily.GENERIC), null, null)); + + assertThat(primaryOnlyResponse.getHits()).isEmpty(); + assertThat(primaryAndChunksResponse.getHits()).hasSize(1); + assertThat(primaryAndChunksResponse.getHits().getFirst().getTitle()).isEqualTo("Energy optimization strategy"); + assertThat(primaryAndChunksResponse.getHits().getFirst().getMatchedRepresentationCount()).isGreaterThanOrEqualTo(1); + assertThat(primaryAndChunksResponse.getHits().getFirst().getRepresentationType()).isEqualTo(RepresentationType.CHUNK); + } + + @Test + void trigramMode_should_find_document_by_fuzzy_title() { + dataFactory.createDocumentWithPrimaryAndChunks( + "Energy optimization strategy", + "Planning note", + "This primary representation contains only generic background information.", + "en", + List.of( + "This chunk talks about municipal utilities and operations.", + "This chunk contains unrelated technical background." 
+ )); + + SearchRequest request = SearchRequest.builder() + .queryText("Enegry optimiztion stratgy") + .modes(Set.of(SearchMode.TRIGRAM)) + .build(); + + SearchResponse response = searchOrchestrator.search( + request, + new SearchDocumentScope( + Set.of(), + Set.of(DocumentType.TEXT), + Set.of(DocumentFamily.GENERIC), + null, + null + ) + ); + + assertThat(response.getHits()).isNotEmpty(); + assertThat(response.getHits()).hasSize(1); + + SearchHit first = response.getHits().getFirst(); + assertThat(first.getTitle()).isEqualTo("Energy optimization strategy"); + assertThat(first.getPrimaryEngine()).isEqualTo(SearchEngineType.POSTGRES_TRIGRAM); + assertThat(first.getMatchedField()).isEqualTo(SearchMatchField.DOCUMENT_TITLE); + assertThat(first.getFinalScore()).isGreaterThan(0.0); + } + + @Test + void trigramRepository_should_find_document_by_fuzzy_title() { + dataFactory.createDocumentWithPrimaryAndChunks( + "Energy optimization strategy", + "Planning note", + "This primary representation contains only generic background information.", + "en", + List.of( + "This chunk talks about municipal utilities and operations.", + "This chunk contains unrelated technical background." 
+ )); + + SearchRequest request = SearchRequest.builder() + .queryText("Enegry optimiztion stratgy") + .modes(Set.of(SearchMode.TRIGRAM)) + .build(); + + SearchExecutionContext context = SearchExecutionContext.builder() + .request(request) + .scope(new SearchDocumentScope( + Set.of(), + Set.of(DocumentType.TEXT), + Set.of(DocumentFamily.GENERIC), + null, + null + )) + .page(0) + .size(10) + .build(); + + List hits = trigramSearchEngine.execute(context); + + assertThat(hits).isNotEmpty(); + assertThat(hits.getFirst().getTitle()).isEqualTo("Energy optimization strategy"); + assertThat(hits.getFirst().getPrimaryEngine()).isEqualTo(SearchEngineType.POSTGRES_TRIGRAM); + } +} diff --git a/src/test/java/at/procon/dip/search/integration/GenericSearchRepositoryIntegrationTest.java b/src/test/java/at/procon/dip/search/integration/GenericSearchRepositoryIntegrationTest.java new file mode 100644 index 0000000..45a8af3 --- /dev/null +++ b/src/test/java/at/procon/dip/search/integration/GenericSearchRepositoryIntegrationTest.java @@ -0,0 +1,108 @@ +package at.procon.dip.search.integration; + +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.search.api.SearchExecutionContext; +import at.procon.dip.search.dto.SearchHit; +import at.procon.dip.search.dto.SearchMode; +import at.procon.dip.search.dto.SearchRequest; +import at.procon.dip.search.dto.SearchRepresentationSelectionMode; +import at.procon.dip.search.repository.DocumentFullTextSearchRepository; +import at.procon.dip.search.repository.DocumentTrigramSearchRepository; +import at.procon.dip.search.spi.SearchDocumentScope; +import at.procon.dip.testsupport.AbstractSearchIntegrationTest; +import at.procon.dip.testsupport.SearchTestDataFactory; +import java.util.List; +import java.util.Set; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; + +import 
static org.assertj.core.api.Assertions.assertThat; + +class GenericSearchRepositoryIntegrationTest extends AbstractSearchIntegrationTest { + + @Autowired + private SearchTestDataFactory dataFactory; + + @Autowired + private DocumentFullTextSearchRepository fullTextRepository; + + @Autowired + private DocumentTrigramSearchRepository trigramRepository; + + @Test + void fullTextRepository_should_find_exact_keyword_in_primary_representation() { + dataFactory.createDocumentWithPrimaryRepresentation( + "Vienna school renovation framework", + "School roof framework agreement", + "Framework agreement for school roof renovation in Vienna.", + DocumentType.TED_NOTICE, + DocumentFamily.PROCUREMENT, + "en", + RepresentationType.SEMANTIC_TEXT); + + dataFactory.createDocumentWithPrimaryRepresentation( + "Pump maintenance manual", + "Maintenance procedures", + "Calibration procedure for pumps and gauges.", + DocumentType.PDF, + DocumentFamily.KNOWLEDGE, + "en", + RepresentationType.FULLTEXT); + + assertThat(jdbcTemplate.queryForObject( + "select count(*) from doc.doc_text_representation", + Integer.class + )).isGreaterThan(0); + + assertThat(jdbcTemplate.queryForObject( + "select count(*) from doc.doc_text_representation where search_vector is not null", + Integer.class + )).isGreaterThan(0); + + SearchExecutionContext context = SearchExecutionContext.builder() + .request(SearchRequest.builder() + .queryText("framework agreement") + .modes(Set.of(SearchMode.FULLTEXT)) + .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY) + .build()) + .scope(new SearchDocumentScope(Set.of(), null, null, null, null)) + .page(0) + .size(10) + .build(); + + List hits = fullTextRepository.search(context, 10); + assertThat(hits).isNotEmpty(); + assertThat(hits).extracting(SearchHit::getTitle) + .contains("Vienna school renovation framework") + .doesNotContain("Pump maintenance manual"); + } + + @Test + void trigramRepository_should_match_fuzzy_title() { + 
dataFactory.createDocumentWithPrimaryRepresentation( + "Vienna school renovation framework", + "School roof framework agreement", + "Framework agreement for school roof renovation in Vienna.", + DocumentType.TED_NOTICE, + DocumentFamily.PROCUREMENT, + "en", + RepresentationType.SEMANTIC_TEXT); + + SearchExecutionContext context = SearchExecutionContext.builder() + .request(SearchRequest.builder() + .queryText("Viena school renovtion") + .modes(Set.of(SearchMode.TRIGRAM)) + .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY) + .build()) + .scope(new SearchDocumentScope(Set.of(), null, null, null, null)) + .page(0) + .size(10) + .build(); + + List hits = trigramRepository.search(context, 10, 0.10d); + assertThat(hits).isNotEmpty(); + assertThat(hits.getFirst().getTitle()).isEqualTo("Vienna school renovation framework"); + } +} diff --git a/src/test/java/at/procon/dip/testsupport/AbstractSearchIntegrationTest.java b/src/test/java/at/procon/dip/testsupport/AbstractSearchIntegrationTest.java new file mode 100644 index 0000000..ad97fe5 --- /dev/null +++ b/src/test/java/at/procon/dip/testsupport/AbstractSearchIntegrationTest.java @@ -0,0 +1,119 @@ +package at.procon.dip.testsupport; + +import at.procon.dip.FixedPortPostgreSQLContainer; +import at.procon.dip.domain.document.repository.DocumentRepository; +import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository; +import at.procon.dip.domain.tenant.repository.DocumentTenantRepository; +import javax.sql.DataSource; + +import at.procon.dip.testsupport.config.SearchTestConfig; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestInstance; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.autoconfigure.ImportAutoConfiguration; +import org.springframework.boot.autoconfigure.http.HttpMessageConvertersAutoConfiguration; +import org.springframework.boot.autoconfigure.jackson.JacksonAutoConfiguration; +import 
org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.context.annotation.Import; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.test.context.DynamicPropertyRegistry; +import org.springframework.test.context.DynamicPropertySource; +import org.springframework.test.context.TestPropertySource; +import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +@SpringBootTest(classes = SearchTestApplication.class, webEnvironment = SpringBootTest.WebEnvironment.MOCK) +@Testcontainers +@TestInstance(TestInstance.Lifecycle.PER_CLASS) +@TestPropertySource(properties = { + "spring.jpa.hibernate.ddl-auto=create-drop", + "spring.jpa.show-sql=false", + "spring.jpa.open-in-view=false", + "spring.jpa.properties.hibernate.default_schema=DOC", + "spring.main.lazy-initialization=true", + "ted.vectorization.enabled=false", + "ted.search.default-page-size=20", + "ted.search.max-page-size=100", + "ted.search.fulltext-weight=0.60", + "ted.search.trigram-weight=0.40", + "ted.search.semantic-weight=0.45", + "ted.search.recency-boost-weight=0.05", + "ted.search.trigram-threshold=0.10", + "server.servlet.context-path=/api" +}) +public abstract class AbstractSearchIntegrationTest { + + private static final int HOST_PORT = 15433; + private static final String DB_NAME = "dip_search_test"; + private static final String DB_USER = "test"; + private static final String DB_PASSWORD = "test"; + private static final String JDBC_URL = "jdbc:postgresql://localhost:" + HOST_PORT + "/" + DB_NAME; + + @Container + static PostgreSQLContainer postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", HOST_PORT) + .withDatabaseName(DB_NAME) + .withUsername(DB_USER) + .withPassword(DB_PASSWORD) + .withInitScript("sql/create-doc-search-test-schemas.sql"); + 
+ + @DynamicPropertySource + static void registerProperties(DynamicPropertyRegistry registry) { + if (!postgres.isRunning()) { + postgres.start(); + } + registry.add("spring.datasource.url", () -> JDBC_URL); + registry.add("spring.datasource.username", () -> DB_USER); + registry.add("spring.datasource.password", () -> DB_PASSWORD); + registry.add("spring.datasource.driver-class-name", () -> "org.postgresql.Driver"); + } + + @Autowired + protected JdbcTemplate jdbcTemplate; + + @Autowired + protected DataSource dataSource; + + @Autowired + protected DocumentRepository documentRepository; + + @Autowired + protected DocumentTextRepresentationRepository representationRepository; + + @Autowired + protected DocumentTenantRepository tenantRepository; + + @BeforeEach + void resetSearchTestDatabase() { + ensureSearchColumnsAndIndexes(); + cleanupDatabase(); + } + + protected void ensureSearchColumnsAndIndexes() { + jdbcTemplate.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm"); + jdbcTemplate.execute("CREATE SCHEMA IF NOT EXISTS doc"); + jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_config VARCHAR(64)"); + jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_vector tsvector"); + jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector_test ON doc.doc_text_representation USING GIN (search_vector)"); + jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm_test ON doc.doc_document USING GIN (title gin_trgm_ops)"); + jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm_test ON doc.doc_document USING GIN (summary gin_trgm_ops)"); + jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm_test ON doc.doc_text_representation USING GIN (text_body gin_trgm_ops)"); + } + + protected void cleanupDatabase() { + jdbcTemplate.execute("TRUNCATE TABLE doc.doc_text_representation, doc.doc_document, 
doc.doc_tenant RESTART IDENTITY CASCADE"); + } + + protected void setDocumentCreatedAt(java.util.UUID documentId, java.time.OffsetDateTime createdAt) { + jdbcTemplate.update("UPDATE doc.doc_document SET created_at = ?, updated_at = ? WHERE id = ?", createdAt, createdAt, documentId); + } + + protected boolean columnExists(String schema, String table, String column) { + return Boolean.TRUE.equals(jdbcTemplate.queryForObject( + "SELECT COUNT(*) > 0 FROM information_schema.columns WHERE table_schema = ? AND table_name = ? AND column_name = ?", + Boolean.class, + schema.toLowerCase(), table.toLowerCase(), column.toLowerCase())); + } +} diff --git a/src/test/java/at/procon/dip/testsupport/SearchTestApplication.java b/src/test/java/at/procon/dip/testsupport/SearchTestApplication.java new file mode 100644 index 0000000..69ad9c4 --- /dev/null +++ b/src/test/java/at/procon/dip/testsupport/SearchTestApplication.java @@ -0,0 +1,80 @@ +package at.procon.dip.testsupport; + +import at.procon.dip.config.JacksonConfig; +import at.procon.dip.domain.document.service.DocumentContentService; + +import at.procon.dip.domain.document.service.DocumentRepresentationService; +import at.procon.dip.domain.document.service.DocumentService; +import at.procon.dip.search.engine.fulltext.PostgresFullTextSearchEngine; +import at.procon.dip.search.engine.trigram.PostgresTrigramSearchEngine; +import at.procon.dip.search.plan.DefaultSearchPlanner; +import at.procon.dip.search.rank.DefaultSearchResultFusionService; +import at.procon.dip.search.rank.DefaultSearchScoreNormalizer; +import at.procon.dip.search.repository.DocumentFullTextSearchRepositoryImpl; +import at.procon.dip.search.repository.DocumentTrigramSearchRepositoryImpl; +import at.procon.dip.search.service.DefaultSearchOrchestrator; +import at.procon.dip.search.service.DocumentLexicalIndexService; +import at.procon.dip.search.service.SearchMetricsService; +import at.procon.dip.search.web.GenericSearchController; +import 
at.procon.ted.config.TedProcessorProperties; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.springframework.boot.SpringBootConfiguration; +import org.springframework.boot.autoconfigure.EnableAutoConfiguration; +import org.springframework.boot.autoconfigure.ImportAutoConfiguration; +import org.springframework.boot.autoconfigure.domain.EntityScan; +import org.springframework.boot.autoconfigure.http.HttpMessageConvertersAutoConfiguration; +import org.springframework.boot.autoconfigure.jackson.JacksonAutoConfiguration; +import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration; +import org.springframework.boot.autoconfigure.jdbc.JdbcTemplateAutoConfiguration; +import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration; +import org.springframework.boot.autoconfigure.transaction.TransactionAutoConfiguration; +import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration; +import org.springframework.boot.context.properties.EnableConfigurationProperties; +import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc; +import org.springframework.context.annotation.Import; +import org.springframework.data.jpa.repository.config.EnableJpaRepositories; + +/** + * Narrow test application for generic lexical search slices. + * + * Important: this class does not component-scan the whole application. Every + * test support bean that should exist in the test context must therefore be + * imported explicitly. 
+ */ +@SpringBootConfiguration +@AutoConfigureMockMvc +@ImportAutoConfiguration({ + DataSourceAutoConfiguration.class, + HibernateJpaAutoConfiguration.class, + TransactionAutoConfiguration.class, + JdbcTemplateAutoConfiguration.class +}) +@EnableConfigurationProperties(TedProcessorProperties.class) +@EntityScan(basePackages = { + "at.procon.dip.domain.document.entity", + "at.procon.dip.domain.tenant.entity" +}) +@EnableJpaRepositories(basePackages = { + "at.procon.dip.domain.document.repository", + "at.procon.dip.domain.tenant.repository" +}) +@Import({ + DocumentService.class, + DocumentContentService.class, + DocumentRepresentationService.class, + DocumentLexicalIndexService.class, + SearchTestDataFactory.class, + DefaultSearchPlanner.class, + DocumentFullTextSearchRepositoryImpl.class, + DocumentTrigramSearchRepositoryImpl.class, + PostgresFullTextSearchEngine.class, + PostgresTrigramSearchEngine.class, + DefaultSearchScoreNormalizer.class, + DefaultSearchResultFusionService.class, + SearchMetricsService.class, + DefaultSearchOrchestrator.class, + GenericSearchController.class, + DocumentLexicalIndexService.class +}) +public class SearchTestApplication { +} diff --git a/src/test/java/at/procon/dip/testsupport/SearchTestDataFactory.java b/src/test/java/at/procon/dip/testsupport/SearchTestDataFactory.java new file mode 100644 index 0000000..1d5266a --- /dev/null +++ b/src/test/java/at/procon/dip/testsupport/SearchTestDataFactory.java @@ -0,0 +1,133 @@ +package at.procon.dip.testsupport; + +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.domain.document.entity.DocumentTextRepresentation; +import at.procon.dip.search.service.DocumentLexicalIndexService; +import 
at.procon.dip.domain.document.service.DocumentRepresentationService; +import at.procon.dip.domain.document.service.DocumentService; +import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand; +import at.procon.dip.domain.document.service.command.CreateDocumentCommand; +import java.util.ArrayList; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; +import org.springframework.transaction.annotation.Transactional; + +@Component +@RequiredArgsConstructor +@Transactional +public class SearchTestDataFactory { + + private final DocumentService documentService; + private final DocumentRepresentationService representationService; + private final DocumentLexicalIndexService lexicalIndexService; + + public CreatedDocument createDocumentWithPrimaryRepresentation( + String title, + String summary, + String body, + DocumentType documentType, + DocumentFamily documentFamily, + String languageCode, + RepresentationType primaryType) { + + Document document = documentService.create(new CreateDocumentCommand( + null, + DocumentVisibility.PUBLIC, + documentType, + documentFamily, + DocumentStatus.RECEIVED, + title, + summary, + languageCode, + "text/plain", + null, + Integer.toHexString((title + body).hashCode()) + )); + + DocumentTextRepresentation primary = addRepresentation(document, primaryType, languageCode, true, null, null, null, body); + return new CreatedDocument(document, primary, List.of(primary)); + } + + public CreatedDocument createDocumentWithPrimaryAndChunks( + String title, + String summary, + String primaryBody, + String languageCode, + List chunkBodies) { + + Document document = documentService.create(new CreateDocumentCommand( + null, + DocumentVisibility.PUBLIC, + DocumentType.TEXT, + DocumentFamily.GENERIC, + DocumentStatus.RECEIVED, + title, + summary, + languageCode, + "text/plain", + null, + Integer.toHexString((title + primaryBody + chunkBodies).hashCode()) + )); + 
+ List all = new ArrayList<>(); + DocumentTextRepresentation primary = addRepresentation(document, RepresentationType.SEMANTIC_TEXT, languageCode, true, null, null, null, primaryBody); + all.add(primary); + + int offset = 0; + for (int i = 0; i < chunkBodies.size(); i++) { + String chunk = chunkBodies.get(i); + DocumentTextRepresentation saved = addRepresentation( + document, + RepresentationType.CHUNK, + languageCode, + false, + i, + offset, + offset + chunk.length(), + chunk); + all.add(saved); + offset += chunk.length(); + } + + return new CreatedDocument(document, primary, all); + } + + private DocumentTextRepresentation addRepresentation( + Document document, + RepresentationType type, + String languageCode, + boolean primary, + Integer chunkIndex, + Integer chunkStartOffset, + Integer chunkEndOffset, + String text) { + DocumentTextRepresentation representation = representationService.addRepresentation(new AddDocumentTextRepresentationCommand( + document.getId(), + null, + type, + "search-test-factory", + languageCode, + null, + chunkIndex, + chunkStartOffset, + chunkEndOffset, + primary, + text + )); + lexicalIndexService.refreshRepresentationLexicalIndex(representation.getId()); + return representation; + } + + public record CreatedDocument( + Document document, + DocumentTextRepresentation primaryRepresentation, + List representations + ) { + } +} diff --git a/src/test/java/at/procon/dip/testsupport/config/SearchTestConfig.java b/src/test/java/at/procon/dip/testsupport/config/SearchTestConfig.java new file mode 100644 index 0000000..a93db95 --- /dev/null +++ b/src/test/java/at/procon/dip/testsupport/config/SearchTestConfig.java @@ -0,0 +1,20 @@ +package at.procon.dip.testsupport.config; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import org.springframework.boot.autoconfigure.jackson.Jackson2ObjectMapperBuilderCustomizer; 
+import org.springframework.boot.test.context.TestConfiguration;
+import org.springframework.context.annotation.Bean;
+
+@TestConfiguration
+public class SearchTestConfig {
+
+    @Bean
+    public ObjectMapper objectMapper() { // hand-built mapper for test slices that do not load the app's JacksonConfig
+        ObjectMapper mapper = new ObjectMapper();
+        mapper.registerModule(new JavaTimeModule()); // java.time support (OffsetDateTime etc.)
+        mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); // serialize dates as ISO-8601 strings, not epoch numbers
+        return mapper;
+    }
+}