embedding nv1 + search tests

master
trifonovt 1 month ago
parent 2687d4ba17
commit d7369c796c

@ -0,0 +1,84 @@
# Parallel embedding subsystem plan (NV1–NV3)
This plan assumes the old vectorization subsystem remains in place temporarily, while a new generic embedding subsystem is built in parallel.
## Principles
- Build the new subsystem under `at.procon.dip.embedding.*`.
- Do not shape it around old TED-specific or legacy vectorization services.
- Operate on `DocumentTextRepresentation` and `DocumentEmbedding` as the core abstraction.
- Keep the new subsystem configurable and provider-based.
- Migrate and cut over later.
## NV1 — provider/model/query foundation
### Goal
Create a standalone embedding foundation that can:
- resolve configured providers
- resolve configured models
- embed arbitrary text lists
- embed search queries
- support deterministic testing
### Deliverables
- `EmbeddingProperties`
- `EmbeddingUseCase`
- `EmbeddingRequest`
- `EmbeddingProviderResult`
- `EmbeddingModelDescriptor`
- `ResolvedEmbeddingProviderConfig`
- `EmbeddingProvider`
- `ExternalHttpEmbeddingProvider`
- `MockEmbeddingProvider`
- `EmbeddingProviderRegistry`
- `EmbeddingModelRegistry`
- `EmbeddingProviderConfigResolver`
- `EmbeddingExecutionService`
- `QueryEmbeddingService`
- startup validation of provider/model wiring
### Notes
- No cutover from the old vectorization path yet.
- No persistence/job orchestration yet.
- New subsystem should be safe to include in the app while disabled by default.
## NV2 — persistence and job orchestration
### Goal
Make the new subsystem able to create and process embedding jobs against `DocumentTextRepresentation`.
### Deliverables
- `EmbeddingJob` entity/repository/service
- retry / backoff policy
- default `EmbeddingSelectionPolicy`
- representation-level embedding execution
- `DocumentEmbedding` persistence updates through the new subsystem
## NV3 — generic semantic search engine
### Goal
Add semantic search into the generic search platform using only the new subsystem.
### Deliverables
- `PgVectorSemanticSearchEngine`
- `DocumentSemanticSearchRepository`
- query embedding through `QueryEmbeddingService`
- chunk-aware retrieval and collapse
- fusion with lexical search
## Migration philosophy
Because the app is still in development, prefer:
1. migrate documents and text representations first
2. re-embed through the new subsystem
3. only preserve old raw vector data if there is a strong operational reason
## Recommended implementation order
1. NV1 foundation
2. NV1 tests with mock provider
3. NV2 jobs and selection policy
4. NV3 semantic search
5. migration/backfill
6. cutover

@ -0,0 +1,39 @@
package at.procon.dip.embedding.provider.mock;
import static org.assertj.core.api.Assertions.assertThat;
import at.procon.dip.domain.document.DistanceMetric;
import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
import at.procon.dip.embedding.model.EmbeddingRequest;
import at.procon.dip.embedding.model.EmbeddingUseCase;
import at.procon.dip.embedding.model.ResolvedEmbeddingProviderConfig;
import java.util.List;
import org.junit.jupiter.api.Test;
/**
 * Checks that {@link MockEmbeddingProvider} yields the same vector when the
 * same request is embedded twice with the same config and model descriptor.
 */
class MockEmbeddingProviderTest {

    private final MockEmbeddingProvider provider = new MockEmbeddingProvider();

    @Test
    void should_produce_deterministic_vectors() {
        // Request, provider config and model descriptor all refer to the same mock keys.
        EmbeddingRequest embeddingRequest = EmbeddingRequest.builder()
                .modelKey("mock-search")
                .useCase(EmbeddingUseCase.DOCUMENT)
                .texts(List.of("district heating optimization"))
                .build();
        ResolvedEmbeddingProviderConfig providerConfig = ResolvedEmbeddingProviderConfig.builder()
                .key("mock-default")
                .providerType("mock")
                .dimensions(8)
                .build();
        EmbeddingModelDescriptor descriptor = new EmbeddingModelDescriptor(
                "mock-search", "mock-default", "mock-search", 8, DistanceMetric.COSINE, true, true, null, true);

        var firstRun = provider.embedDocuments(providerConfig, descriptor, embeddingRequest);
        var secondRun = provider.embedDocuments(providerConfig, descriptor, embeddingRequest);

        // One input text, so each run must return exactly one vector.
        assertThat(firstRun.vectors()).hasSize(1);
        assertThat(secondRun.vectors()).hasSize(1);

        // Both runs must agree element by element.
        var expectedVector = secondRun.vectors().getFirst();
        assertThat(firstRun.vectors().getFirst()).containsExactly(expectedVector);
    }
}

@ -0,0 +1,29 @@
package at.procon.dip.embedding.registry;
import static org.assertj.core.api.Assertions.assertThat;
import at.procon.dip.domain.document.DistanceMetric;
import at.procon.dip.embedding.config.EmbeddingProperties;
import org.junit.jupiter.api.Test;
/**
 * Checks that {@link EmbeddingModelRegistry} exposes the default document model key
 * and the model entry configured through {@link EmbeddingProperties}.
 */
class EmbeddingModelRegistryTest {

    @Test
    void should_resolve_active_model_from_properties() {
        // A single active model entry, registered below under the key "mock-search".
        EmbeddingProperties.ModelProperties modelProperties = new EmbeddingProperties.ModelProperties();
        modelProperties.setProviderConfigKey("mock-default");
        modelProperties.setProviderModelKey("mock-search");
        modelProperties.setDimensions(16);
        modelProperties.setDistanceMetric(DistanceMetric.COSINE);
        modelProperties.setSupportsQueryEmbeddingMode(true);
        modelProperties.setActive(true);

        EmbeddingProperties embeddingProperties = new EmbeddingProperties();
        embeddingProperties.setDefaultDocumentModel("mock-search");
        embeddingProperties.getModels().put("mock-search", modelProperties);

        EmbeddingModelRegistry registry = new EmbeddingModelRegistry(embeddingProperties);

        String defaultDocumentModelKey = registry.getRequiredDefaultDocumentModelKey();
        assertThat(defaultDocumentModelKey).isEqualTo("mock-search");

        String resolvedProviderConfigKey = registry.getRequired("mock-search").providerConfigKey();
        assertThat(resolvedProviderConfigKey).isEqualTo("mock-default");
    }
}

@ -0,0 +1,38 @@
package at.procon.dip.embedding.service;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import at.procon.dip.domain.document.DistanceMetric;
import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
import at.procon.dip.embedding.model.EmbeddingProviderResult;
import at.procon.dip.embedding.model.EmbeddingUseCase;
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import java.util.List;
import org.junit.jupiter.api.Test;
/**
 * Checks that {@link DefaultQueryEmbeddingService} embeds a query with the
 * registry's default query model and returns the first vector of the result.
 */
class DefaultQueryEmbeddingServiceTest {

    @Test
    void should_use_default_query_model() {
        EmbeddingModelRegistry modelRegistry = mock(EmbeddingModelRegistry.class);
        EmbeddingExecutionService executionService = mock(EmbeddingExecutionService.class);

        // Canned provider result carrying a single four-dimensional vector.
        EmbeddingModelDescriptor descriptor = new EmbeddingModelDescriptor(
                "mock-search", "mock-default", "mock-search", 4,
                DistanceMetric.COSINE, true, true, null, true);
        EmbeddingProviderResult cannedResult = new EmbeddingProviderResult(
                descriptor,
                List.of(new float[]{1f, 2f, 3f, 4f}),
                List.of(),
                "req-1",
                2);

        when(modelRegistry.getRequiredDefaultQueryModelKey()).thenReturn("mock-search");
        // The stub only matches the default model key combined with the QUERY use case.
        when(executionService.embedTexts("mock-search", EmbeddingUseCase.QUERY, List.of("framework agreement")))
                .thenReturn(cannedResult);

        DefaultQueryEmbeddingService queryEmbeddingService =
                new DefaultQueryEmbeddingService(executionService, modelRegistry);

        float[] queryVector = queryEmbeddingService.embedQuery("framework agreement");

        assertThat(queryVector).containsExactly(1f, 2f, 3f, 4f);
    }
}

@ -0,0 +1,138 @@
package at.procon.dip.search.integration;
import at.procon.dip.config.JacksonConfig;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.search.dto.SearchMode;
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.testsupport.AbstractSearchIntegrationTest;
import at.procon.dip.testsupport.SearchTestDataFactory;
import at.procon.dip.testsupport.config.SearchTestConfig;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.List;
import java.util.Set;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.http.HttpMessageConvertersAutoConfiguration;
import org.springframework.boot.autoconfigure.jackson.JacksonAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Import;
import org.springframework.http.MediaType;
import org.springframework.test.web.servlet.MockMvc;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.get;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
/**
 * MockMvc tests for the search HTTP endpoints: {@code POST /search},
 * {@code POST /search/debug} and {@code GET /search/metrics}.
 *
 * Each test creates its own documents through {@link SearchTestDataFactory}
 * and asserts on fields of the JSON response.
 */
@Import(SearchTestConfig.class)
@ImportAutoConfiguration({
        JacksonAutoConfiguration.class,
        HttpMessageConvertersAutoConfiguration.class,
        WebMvcAutoConfiguration.class
})
class GenericSearchEndpointIntegrationTest extends AbstractSearchIntegrationTest {

    @Autowired
    private SearchTestDataFactory dataFactory;

    @Autowired
    private MockMvc mockMvc;

    @Autowired
    private ObjectMapper objectMapper;

    @Test
    void searchEndpoint_should_return_hits_for_fulltext_request() throws Exception {
        dataFactory.createDocumentWithPrimaryRepresentation(
                "Vienna school renovation framework",
                "School roof framework agreement",
                "Framework agreement for school roof renovation in Vienna.",
                DocumentType.TED_NOTICE,
                DocumentFamily.PROCUREMENT,
                "en",
                RepresentationType.SEMANTIC_TEXT);

        SearchRequest fulltextRequest = SearchRequest.builder()
                .queryText("framework agreement")
                .modes(Set.of(SearchMode.FULLTEXT))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                .build();

        var searchCall = post("/search")
                .contentType(MediaType.APPLICATION_JSON)
                .accept(MediaType.APPLICATION_JSON)
                .characterEncoding("UTF-8")
                .content(toJson(fulltextRequest));

        mockMvc.perform(searchCall)
                .andExpect(status().isOk())
                .andExpect(jsonPath("$.hits[0].title").value("Vienna school renovation framework"))
                .andExpect(jsonPath("$.enginesUsed[0]").value("POSTGRES_FULLTEXT"));
    }

    @Test
    void debugEndpoint_should_return_plan_and_engine_results() throws Exception {
        dataFactory.createDocumentWithPrimaryRepresentation(
                "Maintenance manual",
                "Factory maintenance manual",
                "Maintenance manual for calibration and preventive checks.",
                DocumentType.PDF,
                DocumentFamily.KNOWLEDGE,
                "en",
                RepresentationType.FULLTEXT);

        // The query text is misspelled while the expected hit is still "Maintenance manual".
        SearchRequest hybridRequest = SearchRequest.builder()
                .queryText("maintenence manual")
                .modes(Set.of(SearchMode.HYBRID))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                .build();

        var debugCall = post("/search/debug")
                .contentType(MediaType.APPLICATION_JSON)
                .accept(MediaType.APPLICATION_JSON)
                .characterEncoding("UTF-8")
                .content(toJson(hybridRequest));

        mockMvc.perform(debugCall)
                .andExpect(status().isOk())
                .andExpect(jsonPath("$.plan.engines").isArray())
                .andExpect(jsonPath("$.engineResults").isArray())
                .andExpect(jsonPath("$.fusedResponse.hits[0].title").value("Maintenance manual"));
    }

    @Test
    void metricsEndpoint_should_return_search_metrics_snapshot() throws Exception {
        dataFactory.createDocumentWithPrimaryAndChunks(
                "Energy optimization strategy",
                "Strategy overview",
                "This primary representation only contains a high level overview.",
                "en",
                List.of("District heating optimization strategy for municipal energy systems is described here."));

        SearchRequest chunkAwareRequest = SearchRequest.builder()
                .queryText("district heating optimization")
                .modes(Set.of(SearchMode.FULLTEXT))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS)
                .build();

        // Issue one search first, then read the metrics snapshot.
        var searchCall = post("/search")
                .contentType(MediaType.APPLICATION_JSON)
                .accept(MediaType.APPLICATION_JSON)
                .characterEncoding("UTF-8")
                .content(toJson(chunkAwareRequest));
        mockMvc.perform(searchCall)
                .andExpect(status().isOk());

        mockMvc.perform(get("/search/metrics"))
                .andExpect(status().isOk())
                .andExpect(jsonPath("$.totalSearchRequests").isNumber())
                .andExpect(jsonPath("$.representationCounts").exists());
    }

    /** Serialises a search request to JSON with the injected {@link ObjectMapper}. */
    private String toJson(SearchRequest request) throws Exception {
        return objectMapper.writeValueAsString(request);
    }
}

@ -0,0 +1,174 @@
package at.procon.dip.search.integration;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.dto.*;
import at.procon.dip.search.engine.trigram.PostgresTrigramSearchEngine;
import at.procon.dip.search.service.SearchOrchestrator;
import at.procon.dip.search.spi.SearchDocumentScope;
import at.procon.dip.testsupport.AbstractSearchIntegrationTest;
import at.procon.dip.testsupport.SearchTestDataFactory;
import java.util.List;
import java.util.Set;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.test.annotation.DirtiesContext;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Integration tests that call {@link SearchOrchestrator} and
 * {@link PostgresTrigramSearchEngine} directly against documents created
 * through {@link SearchTestDataFactory}.
 */
@DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS)
class GenericSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest {

    @Autowired
    private SearchTestDataFactory dataFactory;

    @Autowired
    private SearchOrchestrator searchOrchestrator;

    @Autowired
    private PostgresTrigramSearchEngine trigramSearchEngine;

    @Test
    void hybridSearch_should_collapse_document_hits_when_fulltext_and_trigram_match_same_document() {
        dataFactory.createDocumentWithPrimaryRepresentation(
                "Maintenance manual",
                "Factory maintenance manual",
                "Maintenance manual for calibration and preventive checks.",
                DocumentType.PDF,
                DocumentFamily.KNOWLEDGE,
                "en",
                RepresentationType.FULLTEXT);

        SearchRequest hybridRequest = SearchRequest.builder()
                .queryText("Maintenance manual")
                .modes(Set.of(SearchMode.HYBRID))
                .collapseByDocument(true)
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                .build();

        SearchResponse response = searchOrchestrator.search(hybridRequest, unrestrictedScope());

        // With collapseByDocument enabled the single document must appear exactly once.
        assertThat(response.getHits()).hasSize(1);
        SearchHit onlyHit = response.getHits().getFirst();
        assertThat(onlyHit.getTitle()).isEqualTo("Maintenance manual");
        assertThat(response.getEnginesUsed()).isNotEmpty();
        assertThat(response.getHits().getFirst().getFinalScore()).isGreaterThan(0.0d);
    }

    @Test
    void representationSelectionMode_should_control_chunk_visibility() {
        // Only the second chunk contains the phrase being searched for.
        dataFactory.createDocumentWithPrimaryAndChunks(
                "Energy optimization strategy",
                "Strategy overview",
                "This primary representation only contains a high level overview.",
                "en",
                List.of(
                        "Chunk one is introductory and does not contain the target phrase.",
                        "District heating optimization strategy for municipal energy systems is described here."
                ));

        SearchRequest primaryOnly = SearchRequest.builder()
                .queryText("district heating optimization")
                .modes(Set.of(SearchMode.FULLTEXT))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                .build();
        SearchRequest primaryAndChunks = SearchRequest.builder()
                .queryText("district heating optimization")
                .modes(Set.of(SearchMode.FULLTEXT))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS)
                .build();

        SearchResponse primaryOnlyResponse = searchOrchestrator.search(primaryOnly, genericTextScope());
        SearchResponse primaryAndChunksResponse = searchOrchestrator.search(primaryAndChunks, genericTextScope());

        assertThat(primaryOnlyResponse.getHits()).isEmpty();

        assertThat(primaryAndChunksResponse.getHits()).hasSize(1);
        SearchHit chunkHit = primaryAndChunksResponse.getHits().getFirst();
        assertThat(chunkHit.getTitle()).isEqualTo("Energy optimization strategy");
        assertThat(primaryAndChunksResponse.getHits().getFirst().getMatchedRepresentationCount()).isGreaterThanOrEqualTo(1);
        assertThat(primaryAndChunksResponse.getHits().getFirst().getRepresentationType()).isEqualTo(RepresentationType.CHUNK);
    }

    @Test
    void trigramMode_should_find_document_by_fuzzy_title() {
        createStrategyDocumentWithUnrelatedChunks();

        SearchResponse response = searchOrchestrator.search(misspelledTitleTrigramRequest(), genericTextScope());

        assertThat(response.getHits()).isNotEmpty();
        assertThat(response.getHits()).hasSize(1);

        SearchHit first = response.getHits().getFirst();
        assertThat(first.getTitle()).isEqualTo("Energy optimization strategy");
        assertThat(first.getPrimaryEngine()).isEqualTo(SearchEngineType.POSTGRES_TRIGRAM);
        assertThat(first.getMatchedField()).isEqualTo(SearchMatchField.DOCUMENT_TITLE);
        assertThat(first.getFinalScore()).isGreaterThan(0.0);
    }

    // NOTE(review): despite its name, this test calls the trigram search engine rather than a repository.
    @Test
    void trigramRepository_should_find_document_by_fuzzy_title() {
        createStrategyDocumentWithUnrelatedChunks();

        SearchExecutionContext context = SearchExecutionContext.builder()
                .request(misspelledTitleTrigramRequest())
                .scope(genericTextScope())
                .page(0)
                .size(10)
                .build();

        List<SearchHit> hits = trigramSearchEngine.execute(context);

        assertThat(hits).isNotEmpty();
        SearchHit top = hits.getFirst();
        assertThat(top.getTitle()).isEqualTo("Energy optimization strategy");
        assertThat(hits.getFirst().getPrimaryEngine()).isEqualTo(SearchEngineType.POSTGRES_TRIGRAM);
    }

    /** Creates the document whose title is the only text resembling the misspelled trigram query. */
    private void createStrategyDocumentWithUnrelatedChunks() {
        dataFactory.createDocumentWithPrimaryAndChunks(
                "Energy optimization strategy",
                "Planning note",
                "This primary representation contains only generic background information.",
                "en",
                List.of(
                        "This chunk talks about municipal utilities and operations.",
                        "This chunk contains unrelated technical background."
                ));
    }

    /** Builds a TRIGRAM-mode request whose query text is a misspelled document title. */
    private static SearchRequest misspelledTitleTrigramRequest() {
        return SearchRequest.builder()
                .queryText("Enegry optimiztion stratgy")
                .modes(Set.of(SearchMode.TRIGRAM))
                .build();
    }

    /** Builds a scope with an empty first set and every other argument {@code null}. */
    private static SearchDocumentScope unrestrictedScope() {
        return new SearchDocumentScope(Set.of(), null, null, null, null);
    }

    /** Builds a scope naming {@code DocumentType.TEXT} and {@code DocumentFamily.GENERIC}. */
    private static SearchDocumentScope genericTextScope() {
        return new SearchDocumentScope(
                Set.of(),
                Set.of(DocumentType.TEXT),
                Set.of(DocumentFamily.GENERIC),
                null,
                null);
    }
}

@ -0,0 +1,108 @@
package at.procon.dip.search.integration;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.dto.SearchHit;
import at.procon.dip.search.dto.SearchMode;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
import at.procon.dip.search.repository.DocumentFullTextSearchRepository;
import at.procon.dip.search.repository.DocumentTrigramSearchRepository;
import at.procon.dip.search.spi.SearchDocumentScope;
import at.procon.dip.testsupport.AbstractSearchIntegrationTest;
import at.procon.dip.testsupport.SearchTestDataFactory;
import java.util.List;
import java.util.Set;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Integration tests that call {@link DocumentFullTextSearchRepository} and
 * {@link DocumentTrigramSearchRepository} directly, bypassing the orchestrator.
 */
class GenericSearchRepositoryIntegrationTest extends AbstractSearchIntegrationTest {

    @Autowired
    private SearchTestDataFactory dataFactory;

    @Autowired
    private DocumentFullTextSearchRepository fullTextRepository;

    @Autowired
    private DocumentTrigramSearchRepository trigramRepository;

    @Test
    void fullTextRepository_should_find_exact_keyword_in_primary_representation() {
        createViennaFrameworkNotice();
        dataFactory.createDocumentWithPrimaryRepresentation(
                "Pump maintenance manual",
                "Maintenance procedures",
                "Calibration procedure for pumps and gauges.",
                DocumentType.PDF,
                DocumentFamily.KNOWLEDGE,
                "en",
                RepresentationType.FULLTEXT);

        // Sanity checks: representations exist and at least one has a populated search_vector.
        Integer representationCount = jdbcTemplate.queryForObject(
                "select count(*) from doc.doc_text_representation",
                Integer.class
        );
        assertThat(representationCount).isGreaterThan(0);
        Integer indexedCount = jdbcTemplate.queryForObject(
                "select count(*) from doc.doc_text_representation where search_vector is not null",
                Integer.class
        );
        assertThat(indexedCount).isGreaterThan(0);

        SearchExecutionContext context = primaryOnlyContext("framework agreement", SearchMode.FULLTEXT);

        List<SearchHit> hits = fullTextRepository.search(context, 10);

        assertThat(hits).isNotEmpty();
        assertThat(hits).extracting(SearchHit::getTitle)
                .contains("Vienna school renovation framework")
                .doesNotContain("Pump maintenance manual");
    }

    @Test
    void trigramRepository_should_match_fuzzy_title() {
        createViennaFrameworkNotice();

        // The query misspells two words of the stored title.
        SearchExecutionContext context = primaryOnlyContext("Viena school renovtion", SearchMode.TRIGRAM);

        List<SearchHit> hits = trigramRepository.search(context, 10, 0.10d);

        assertThat(hits).isNotEmpty();
        SearchHit top = hits.getFirst();
        assertThat(top.getTitle()).isEqualTo("Vienna school renovation framework");
    }

    /** Creates the TED notice fixture that both tests search for. */
    private void createViennaFrameworkNotice() {
        dataFactory.createDocumentWithPrimaryRepresentation(
                "Vienna school renovation framework",
                "School roof framework agreement",
                "Framework agreement for school roof renovation in Vienna.",
                DocumentType.TED_NOTICE,
                DocumentFamily.PROCUREMENT,
                "en",
                RepresentationType.SEMANTIC_TEXT);
    }

    /**
     * Builds a first-page context (page 0, size 10) for a PRIMARY_ONLY request
     * in the given mode, with a scope whose only non-null argument is an empty set.
     */
    private static SearchExecutionContext primaryOnlyContext(String queryText, SearchMode mode) {
        SearchRequest request = SearchRequest.builder()
                .queryText(queryText)
                .modes(Set.of(mode))
                .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY)
                .build();
        SearchDocumentScope scope = new SearchDocumentScope(Set.of(), null, null, null, null);
        return SearchExecutionContext.builder()
                .request(request)
                .scope(scope)
                .page(0)
                .size(10)
                .build();
    }
}

@ -0,0 +1,119 @@
package at.procon.dip.testsupport;
import at.procon.dip.FixedPortPostgreSQLContainer;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
import javax.sql.DataSource;
import at.procon.dip.testsupport.config.SearchTestConfig;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.TestInstance;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.http.HttpMessageConvertersAutoConfiguration;
import org.springframework.boot.autoconfigure.jackson.JacksonAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Import;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.test.context.DynamicPropertyRegistry;
import org.springframework.test.context.DynamicPropertySource;
import org.springframework.test.context.TestPropertySource;
import org.testcontainers.containers.PostgreSQLContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
/**
 * Base class for search integration tests.
 *
 * Boots {@link SearchTestApplication} in a mock web environment against a
 * PostgreSQL container bound to a fixed host port. Before every test it
 * re-applies the search extension, columns and indexes, then truncates the
 * document tables.
 */
@SpringBootTest(classes = SearchTestApplication.class, webEnvironment = SpringBootTest.WebEnvironment.MOCK)
@Testcontainers
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@TestPropertySource(properties = {
        "spring.jpa.hibernate.ddl-auto=create-drop",
        "spring.jpa.show-sql=false",
        "spring.jpa.open-in-view=false",
        "spring.jpa.properties.hibernate.default_schema=DOC",
        "spring.main.lazy-initialization=true",
        "ted.vectorization.enabled=false",
        "ted.search.default-page-size=20",
        "ted.search.max-page-size=100",
        "ted.search.fulltext-weight=0.60",
        "ted.search.trigram-weight=0.40",
        "ted.search.semantic-weight=0.45",
        "ted.search.recency-boost-weight=0.05",
        "ted.search.trigram-threshold=0.10",
        "server.servlet.context-path=/api"
})
public abstract class AbstractSearchIntegrationTest {

    private static final int HOST_PORT = 15433;
    private static final String DB_NAME = "dip_search_test";
    private static final String DB_USER = "test";
    private static final String DB_PASSWORD = "test";
    private static final String JDBC_URL = "jdbc:postgresql://localhost:" + HOST_PORT + "/" + DB_NAME;

    @Container
    static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", HOST_PORT)
            .withDatabaseName(DB_NAME)
            .withUsername(DB_USER)
            .withPassword(DB_PASSWORD)
            .withInitScript("sql/create-doc-search-test-schemas.sql");

    /**
     * Registers the datasource properties for the fixed-port container,
     * starting the container first when it is not yet running.
     */
    @DynamicPropertySource
    static void registerProperties(DynamicPropertyRegistry registry) {
        boolean containerRunning = postgres.isRunning();
        if (!containerRunning) {
            postgres.start();
        }
        registry.add("spring.datasource.url", () -> JDBC_URL);
        registry.add("spring.datasource.username", () -> DB_USER);
        registry.add("spring.datasource.password", () -> DB_PASSWORD);
        registry.add("spring.datasource.driver-class-name", () -> "org.postgresql.Driver");
    }

    @Autowired
    protected JdbcTemplate jdbcTemplate;

    @Autowired
    protected DataSource dataSource;

    @Autowired
    protected DocumentRepository documentRepository;

    @Autowired
    protected DocumentTextRepresentationRepository representationRepository;

    @Autowired
    protected DocumentTenantRepository tenantRepository;

    /** Runs before each test: re-applies the search DDL, then empties the tables. */
    @BeforeEach
    void resetSearchTestDatabase() {
        ensureSearchColumnsAndIndexes();
        cleanupDatabase();
    }

    /**
     * Executes the search DDL in a fixed order. Every statement uses
     * {@code IF NOT EXISTS}, so running it before each test is repeatable.
     */
    protected void ensureSearchColumnsAndIndexes() {
        String[] ddlStatements = {
                "CREATE EXTENSION IF NOT EXISTS pg_trgm",
                "CREATE SCHEMA IF NOT EXISTS doc",
                "ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_config VARCHAR(64)",
                "ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_vector tsvector",
                "CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector_test ON doc.doc_text_representation USING GIN (search_vector)",
                "CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm_test ON doc.doc_document USING GIN (title gin_trgm_ops)",
                "CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm_test ON doc.doc_document USING GIN (summary gin_trgm_ops)",
                "CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm_test ON doc.doc_text_representation USING GIN (text_body gin_trgm_ops)"
        };
        for (String ddl : ddlStatements) {
            jdbcTemplate.execute(ddl);
        }
    }

    /** Truncates the representation, document and tenant tables and restarts their identities. */
    protected void cleanupDatabase() {
        jdbcTemplate.execute("TRUNCATE TABLE doc.doc_text_representation, doc.doc_document, doc.doc_tenant RESTART IDENTITY CASCADE");
    }

    /** Sets both {@code created_at} and {@code updated_at} of a document to the given timestamp. */
    protected void setDocumentCreatedAt(java.util.UUID documentId, java.time.OffsetDateTime createdAt) {
        String updateSql = "UPDATE doc.doc_document SET created_at = ?, updated_at = ? WHERE id = ?";
        jdbcTemplate.update(updateSql, createdAt, createdAt, documentId);
    }

    /**
     * Reports whether {@code information_schema.columns} lists the given column.
     * All three names are lower-cased before the lookup.
     */
    protected boolean columnExists(String schema, String table, String column) {
        Boolean present = jdbcTemplate.queryForObject(
                "SELECT COUNT(*) > 0 FROM information_schema.columns WHERE table_schema = ? AND table_name = ? AND column_name = ?",
                Boolean.class,
                schema.toLowerCase(), table.toLowerCase(), column.toLowerCase());
        // A null query result is treated as "column absent".
        return Boolean.TRUE.equals(present);
    }
}

@ -0,0 +1,80 @@
package at.procon.dip.testsupport;
import at.procon.dip.config.JacksonConfig;
import at.procon.dip.domain.document.service.DocumentContentService;
import at.procon.dip.domain.document.service.DocumentRepresentationService;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.search.engine.fulltext.PostgresFullTextSearchEngine;
import at.procon.dip.search.engine.trigram.PostgresTrigramSearchEngine;
import at.procon.dip.search.plan.DefaultSearchPlanner;
import at.procon.dip.search.rank.DefaultSearchResultFusionService;
import at.procon.dip.search.rank.DefaultSearchScoreNormalizer;
import at.procon.dip.search.repository.DocumentFullTextSearchRepositoryImpl;
import at.procon.dip.search.repository.DocumentTrigramSearchRepositoryImpl;
import at.procon.dip.search.service.DefaultSearchOrchestrator;
import at.procon.dip.search.service.DocumentLexicalIndexService;
import at.procon.dip.search.service.SearchMetricsService;
import at.procon.dip.search.web.GenericSearchController;
import at.procon.ted.config.TedProcessorProperties;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.boot.SpringBootConfiguration;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.boot.autoconfigure.http.HttpMessageConvertersAutoConfiguration;
import org.springframework.boot.autoconfigure.jackson.JacksonAutoConfiguration;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.boot.autoconfigure.jdbc.JdbcTemplateAutoConfiguration;
import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration;
import org.springframework.boot.autoconfigure.transaction.TransactionAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
import org.springframework.context.annotation.Import;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
/**
 * Narrow test application for generic lexical search slices.
 *
 * Important: this class does not component-scan the whole application. Every
 * test support bean that should exist in the test context must therefore be
 * imported explicitly in the {@code @Import} list below. Each class is listed
 * exactly once.
 */
@SpringBootConfiguration
@AutoConfigureMockMvc
@ImportAutoConfiguration({
        DataSourceAutoConfiguration.class,
        HibernateJpaAutoConfiguration.class,
        TransactionAutoConfiguration.class,
        JdbcTemplateAutoConfiguration.class
})
@EnableConfigurationProperties(TedProcessorProperties.class)
@EntityScan(basePackages = {
        "at.procon.dip.domain.document.entity",
        "at.procon.dip.domain.tenant.entity"
})
@EnableJpaRepositories(basePackages = {
        "at.procon.dip.domain.document.repository",
        "at.procon.dip.domain.tenant.repository"
})
@Import({
        // Document domain services
        DocumentService.class,
        DocumentContentService.class,
        DocumentRepresentationService.class,
        DocumentLexicalIndexService.class,
        // Test data support
        SearchTestDataFactory.class,
        // Search planning, repositories and engines
        DefaultSearchPlanner.class,
        DocumentFullTextSearchRepositoryImpl.class,
        DocumentTrigramSearchRepositoryImpl.class,
        PostgresFullTextSearchEngine.class,
        PostgresTrigramSearchEngine.class,
        // Ranking, metrics, orchestration and web layer
        DefaultSearchScoreNormalizer.class,
        DefaultSearchResultFusionService.class,
        SearchMetricsService.class,
        DefaultSearchOrchestrator.class,
        GenericSearchController.class
})
public class SearchTestApplication {
}

@ -0,0 +1,133 @@
package at.procon.dip.testsupport;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
import at.procon.dip.search.service.DocumentLexicalIndexService;
import at.procon.dip.domain.document.service.DocumentRepresentationService;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
import java.util.ArrayList;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
/**
 * Test helper that creates documents with their text representations and
 * refreshes the lexical index for every representation it adds.
 */
@Component
@RequiredArgsConstructor
@Transactional
public class SearchTestDataFactory {

    private final DocumentService documentService;
    private final DocumentRepresentationService representationService;
    private final DocumentLexicalIndexService lexicalIndexService;

    /**
     * Creates a document with a single primary representation of the given type.
     *
     * @return the document, its primary representation and a one-element list holding that representation
     */
    public CreatedDocument createDocumentWithPrimaryRepresentation(
            String title,
            String summary,
            String body,
            DocumentType documentType,
            DocumentFamily documentFamily,
            String languageCode,
            RepresentationType primaryType) {
        Document created = createDocument(documentType, documentFamily, title, summary, languageCode, title + body);
        DocumentTextRepresentation primaryRepresentation =
                addRepresentation(created, primaryType, languageCode, true, null, null, null, body);
        return new CreatedDocument(created, primaryRepresentation, List.of(primaryRepresentation));
    }

    /**
     * Creates a TEXT/GENERIC document with a primary SEMANTIC_TEXT representation
     * followed by one CHUNK representation per entry of {@code chunkBodies}.
     * Chunk offsets are consecutive: each chunk starts where the previous one ended.
     *
     * @return the document, its primary representation and all representations (primary first)
     */
    public CreatedDocument createDocumentWithPrimaryAndChunks(
            String title,
            String summary,
            String primaryBody,
            String languageCode,
            List<String> chunkBodies) {
        Document created = createDocument(
                DocumentType.TEXT,
                DocumentFamily.GENERIC,
                title,
                summary,
                languageCode,
                title + primaryBody + chunkBodies);

        List<DocumentTextRepresentation> representations = new ArrayList<>();
        DocumentTextRepresentation primaryRepresentation = addRepresentation(
                created, RepresentationType.SEMANTIC_TEXT, languageCode, true, null, null, null, primaryBody);
        representations.add(primaryRepresentation);

        int chunkIndex = 0;
        int startOffset = 0;
        for (String chunkBody : chunkBodies) {
            int endOffset = startOffset + chunkBody.length();
            representations.add(addRepresentation(
                    created,
                    RepresentationType.CHUNK,
                    languageCode,
                    false,
                    chunkIndex,
                    startOffset,
                    endOffset,
                    chunkBody));
            chunkIndex++;
            startOffset = endOffset;
        }
        return new CreatedDocument(created, primaryRepresentation, representations);
    }

    /**
     * Creates a PUBLIC, RECEIVED, text/plain document. The last command argument
     * is the hex form of {@code hashSource.hashCode()}.
     */
    private Document createDocument(
            DocumentType documentType,
            DocumentFamily documentFamily,
            String title,
            String summary,
            String languageCode,
            String hashSource) {
        CreateDocumentCommand command = new CreateDocumentCommand(
                null,
                DocumentVisibility.PUBLIC,
                documentType,
                documentFamily,
                DocumentStatus.RECEIVED,
                title,
                summary,
                languageCode,
                "text/plain",
                null,
                Integer.toHexString(hashSource.hashCode())
        );
        return documentService.create(command);
    }

    /** Adds one representation to the document and refreshes its lexical index entry. */
    private DocumentTextRepresentation addRepresentation(
            Document document,
            RepresentationType type,
            String languageCode,
            boolean primary,
            Integer chunkIndex,
            Integer chunkStartOffset,
            Integer chunkEndOffset,
            String text) {
        AddDocumentTextRepresentationCommand command = new AddDocumentTextRepresentationCommand(
                document.getId(),
                null,
                type,
                "search-test-factory",
                languageCode,
                null,
                chunkIndex,
                chunkStartOffset,
                chunkEndOffset,
                primary,
                text
        );
        DocumentTextRepresentation saved = representationService.addRepresentation(command);
        // Refresh the lexical index right away so the new text is searchable within the test.
        lexicalIndexService.refreshRepresentationLexicalIndex(saved.getId());
        return saved;
    }

    /** Result bundle: the created document, its primary representation and all its representations. */
    public record CreatedDocument(
            Document document,
            DocumentTextRepresentation primaryRepresentation,
            List<DocumentTextRepresentation> representations
    ) {
    }
}

@ -0,0 +1,20 @@
package at.procon.dip.testsupport.config;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import org.springframework.boot.autoconfigure.jackson.Jackson2ObjectMapperBuilderCustomizer;
import org.springframework.boot.test.context.TestConfiguration;
import org.springframework.context.annotation.Bean;
/**
 * Test configuration contributing the {@link ObjectMapper} used by the search tests.
 */
@TestConfiguration
public class SearchTestConfig {

    /**
     * Builds an {@link ObjectMapper} with the {@link JavaTimeModule} registered
     * and {@code WRITE_DATES_AS_TIMESTAMPS} disabled.
     */
    @Bean
    public ObjectMapper objectMapper() {
        return new ObjectMapper()
                .registerModule(new JavaTimeModule())
                .disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
    }
}
Loading…
Cancel
Save