embedding nv3.1 - test

master
trifonovt 4 weeks ago
parent ca502cb369
commit 847cb40f8a

@ -38,7 +38,7 @@ class BasicMimeAndExtensionDocumentTypeDetectorTest {
assertThat(result.documentType()).isEqualTo(DocumentType.GENERIC_BINARY); assertThat(result.documentType()).isEqualTo(DocumentType.GENERIC_BINARY);
assertThat(result.mimeType()).isEqualTo("application/vnd.ms-excel"); assertThat(result.mimeType()).isEqualTo("application/vnd.ms-excel");
assertThat(result.attributes()).containsEntry("detectedExtension", "xls"); assertThat(result.attributes()).containsEntry("detectedExtension", "xls");
assertThat(result.attributes()).containsEntry("effectiveMediaType", "application/vnd.ms-excel"); assertThat(result.attributes()).containsEntry("normalizedMediaType", "application/vnd.ms-excel");
} }
@Test @Test

@ -47,8 +47,8 @@ class SpreadsheetDocumentExtractorTest {
String text = result.derivedTextByRole().get(ContentRole.NORMALIZED_TEXT); String text = result.derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
assertNotNull(text); assertNotNull(text);
assertTrue(text.contains("Sheet: Sheet1")); assertTrue(text.contains("Sheet: Sheet1"));
assertTrue(text.contains("Name | Amount")); assertTrue(text.contains("Name\tAmount"));
assertTrue(text.contains("Alice | 42")); assertTrue(text.contains("Alice\t42"));
} }
@Test @Test
@ -74,7 +74,9 @@ class SpreadsheetDocumentExtractorTest {
ExtractionResult result = extractor.extract(new ExtractionRequest(source, detection, csv, data)); ExtractionResult result = extractor.extract(new ExtractionRequest(source, detection, csv, data));
String text = result.derivedTextByRole().get(ContentRole.NORMALIZED_TEXT); String text = result.derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
assertEquals("Name | Amount\nAlice | 42\nBob | 77", text); assertEquals("Name,Amount\n" +
"Alice,42\n" +
"Bob,77", text);
} }
private byte[] createLegacyXls() throws Exception { private byte[] createLegacyXls() throws Exception {

@ -63,7 +63,7 @@ class MailDocumentIngestionAdapterBundleTest {
properties.getGenericIngestion().setMailAdapterEnabled(true); properties.getGenericIngestion().setMailAdapterEnabled(true);
properties.getGenericIngestion().setExpandMailZipAttachments(false); properties.getGenericIngestion().setExpandMailZipAttachments(false);
properties.getGenericIngestion().setMailImportBatchId("test-mail-bundle"); properties.getGenericIngestion().setMailImportBatchId("test-mail-bundle");
when(zipExtractionService.canHandle(any(), any())).thenReturn(false); lenient().when(zipExtractionService.canHandle(any(), any())).thenReturn(false);
adapter = new MailDocumentIngestionAdapter(properties, importService, new MailMessageExtractionService(), relationService, zipExtractionService); adapter = new MailDocumentIngestionAdapter(properties, importService, new MailMessageExtractionService(), relationService, zipExtractionService);
} }

@ -35,6 +35,7 @@ import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.normalization.impl.DefaultGenericTextRepresentationBuilder; import at.procon.dip.normalization.impl.DefaultGenericTextRepresentationBuilder;
import at.procon.dip.normalization.service.TextRepresentationBuildService; import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.processing.service.StructuredDocumentProcessingService; import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.dip.search.service.DocumentLexicalIndexService;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.service.attachment.PdfExtractionService; import at.procon.ted.service.attachment.PdfExtractionService;
import at.procon.ted.service.attachment.ZipExtractionService; import at.procon.ted.service.attachment.ZipExtractionService;
@ -369,6 +370,7 @@ class MailBundleProcessingIntegrationTest {
DocumentExtractionService.class, DocumentExtractionService.class,
GenericDocumentImportService.class, GenericDocumentImportService.class,
StructuredDocumentProcessingService.class, StructuredDocumentProcessingService.class,
DocumentLexicalIndexService.class
}) })
static class TestApplication { static class TestApplication {

@ -18,7 +18,7 @@ import org.springframework.test.annotation.DirtiesContext;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
@DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) //@DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS)
class GenericSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest { class GenericSearchOrchestratorIntegrationTest extends AbstractSearchIntegrationTest {
@Autowired @Autowired

@ -8,11 +8,16 @@ import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
import at.procon.dip.search.dto.SearchRequest; import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest; import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
import at.procon.dip.testsupport.SemanticSearchTestDataFactory; import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
import at.procon.dip.testsupport.config.SearchTestConfig;
import at.procon.dip.testsupport.config.SearchTestJacksonConfig; import at.procon.dip.testsupport.config.SearchTestJacksonConfig;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Set; import java.util.Set;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.http.HttpMessageConvertersAutoConfiguration;
import org.springframework.boot.autoconfigure.jackson.JacksonAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
import org.springframework.context.annotation.Import; import org.springframework.context.annotation.Import;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.test.context.TestPropertySource; import org.springframework.test.context.TestPropertySource;
@ -24,9 +29,11 @@ import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status; import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
@AutoConfigureMockMvc @AutoConfigureMockMvc
@Import(SearchTestJacksonConfig.class) @Import(SearchTestConfig.class)
@TestPropertySource(properties = { @ImportAutoConfiguration({
"spring.mvc.converters.preferred-json-mapper=jackson" JacksonAutoConfiguration.class,
HttpMessageConvertersAutoConfiguration.class,
WebMvcAutoConfiguration.class
}) })
class GenericSemanticSearchEndpointIntegrationTest extends AbstractSemanticSearchIntegrationTest { class GenericSemanticSearchEndpointIntegrationTest extends AbstractSemanticSearchIntegrationTest {

@ -13,11 +13,15 @@ import at.procon.dip.search.spi.SearchDocumentScope;
import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest; import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
import at.procon.dip.testsupport.SemanticSearchTestDataFactory; import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
import java.util.Set; import java.util.Set;
import at.procon.dip.testsupport.config.SearchTestConfig;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Import;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
@Import(SearchTestConfig.class)
class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSemanticSearchIntegrationTest { class GenericSemanticSearchOrchestratorIntegrationTest extends AbstractSemanticSearchIntegrationTest {
@Autowired @Autowired

@ -12,11 +12,15 @@ import at.procon.dip.search.spi.SearchDocumentScope;
import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest; import at.procon.dip.testsupport.AbstractSemanticSearchIntegrationTest;
import at.procon.dip.testsupport.SemanticSearchTestDataFactory; import at.procon.dip.testsupport.SemanticSearchTestDataFactory;
import java.util.Set; import java.util.Set;
import at.procon.dip.testsupport.config.SearchTestConfig;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Import;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
@Import(SearchTestConfig.class)
class SemanticModelSelectionIntegrationTest extends AbstractSemanticSearchIntegrationTest { class SemanticModelSelectionIntegrationTest extends AbstractSemanticSearchIntegrationTest {
@Autowired @Autowired

@ -6,16 +6,10 @@ import at.procon.dip.domain.document.repository.DocumentTextRepresentationReposi
import at.procon.dip.domain.tenant.repository.DocumentTenantRepository; import at.procon.dip.domain.tenant.repository.DocumentTenantRepository;
import javax.sql.DataSource; import javax.sql.DataSource;
import at.procon.dip.testsupport.config.SearchTestConfig;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.TestInstance; import org.junit.jupiter.api.TestInstance;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.http.HttpMessageConvertersAutoConfiguration;
import org.springframework.boot.autoconfigure.jackson.JacksonAutoConfiguration;
import org.springframework.boot.autoconfigure.web.servlet.WebMvcAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest; import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Import;
import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.test.context.DynamicPropertyRegistry; import org.springframework.test.context.DynamicPropertyRegistry;
import org.springframework.test.context.DynamicPropertySource; import org.springframework.test.context.DynamicPropertySource;
@ -92,14 +86,14 @@ public abstract class AbstractSearchIntegrationTest {
} }
protected void ensureSearchColumnsAndIndexes() { protected void ensureSearchColumnsAndIndexes() {
jdbcTemplate.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm");
jdbcTemplate.execute("CREATE SCHEMA IF NOT EXISTS doc"); jdbcTemplate.execute("CREATE SCHEMA IF NOT EXISTS doc");
jdbcTemplate.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm with schema doc");
jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_config VARCHAR(64)"); jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_config VARCHAR(64)");
jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_vector tsvector"); jdbcTemplate.execute("ALTER TABLE doc.doc_text_representation ADD COLUMN IF NOT EXISTS search_vector tsvector");
jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector_test ON doc.doc_text_representation USING GIN (search_vector)"); jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector_test ON doc.doc_text_representation USING GIN (search_vector)");
jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm_test ON doc.doc_document USING GIN (title gin_trgm_ops)"); jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm_test ON doc.doc_document USING GIN (title doc.gin_trgm_ops)");
jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm_test ON doc.doc_document USING GIN (summary gin_trgm_ops)"); jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm_test ON doc.doc_document USING GIN (summary doc.gin_trgm_ops)");
jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm_test ON doc.doc_text_representation USING GIN (text_body gin_trgm_ops)"); jdbcTemplate.execute("CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm_test ON doc.doc_text_representation USING GIN (text_body doc.gin_trgm_ops)");
} }
protected void cleanupDatabase() { protected void cleanupDatabase() {

@ -1,5 +1,6 @@
package at.procon.dip.testsupport; package at.procon.dip.testsupport;
import at.procon.dip.FixedPortPostgreSQLContainer;
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository; import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRepository; import at.procon.dip.domain.document.repository.DocumentRepository;
@ -18,6 +19,7 @@ import org.springframework.test.context.TestPropertySource;
import org.testcontainers.containers.PostgreSQLContainer; import org.testcontainers.containers.PostgreSQLContainer;
import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers; import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName;
@SpringBootTest(classes = SearchSemanticTestApplication.class, webEnvironment = SpringBootTest.WebEnvironment.MOCK) @SpringBootTest(classes = SearchSemanticTestApplication.class, webEnvironment = SpringBootTest.WebEnvironment.MOCK)
@Testcontainers @Testcontainers
@ -62,23 +64,33 @@ import org.testcontainers.junit.jupiter.Testcontainers;
}) })
public abstract class AbstractSemanticSearchIntegrationTest { public abstract class AbstractSemanticSearchIntegrationTest {
private static final int HOST_PORT = 15433;
private static final String DB_NAME = "dip_semantic_search_test";
private static final String DB_USER = "test";
private static final String DB_PASSWORD = "test";
private static final String JDBC_URL = "jdbc:postgresql://localhost:" + HOST_PORT + "/" + DB_NAME;
@Container @Container
static PostgreSQLContainer<?> postgres = new PostgreSQLContainer<>("pgvector/pgvector:pg16") static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>(
.withDatabaseName("dip_semantic_search_test") DockerImageName.parse("pgvector/pgvector:pg16-trixie")
.withUsername("test") .asCompatibleSubstituteFor("postgres").toString()
.withPassword("test") , HOST_PORT)
.withDatabaseName(DB_NAME)
.withUsername(DB_USER)
.withPassword(DB_PASSWORD)
.withCommand("postgres", "-c", "fsync=off")
.withInitScript("sql/create-doc-search-test-schemas.sql"); .withInitScript("sql/create-doc-search-test-schemas.sql");
static {
postgres.start();
}
@DynamicPropertySource @DynamicPropertySource
static void registerProperties(DynamicPropertyRegistry registry) { static void registerProperties(DynamicPropertyRegistry registry) {
registry.add("spring.datasource.url", postgres::getJdbcUrl); if (!postgres.isRunning()) {
registry.add("spring.datasource.username", postgres::getUsername); postgres.start();
registry.add("spring.datasource.password", postgres::getPassword); }
registry.add("spring.datasource.driver-class-name", postgres::getDriverClassName); registry.add("spring.datasource.url", () -> JDBC_URL);
registry.add("spring.datasource.username", () -> DB_USER);
registry.add("spring.datasource.password", () -> DB_PASSWORD);
registry.add("spring.datasource.driver-class-name", () -> "org.postgresql.Driver");
} }
@Autowired @Autowired

@ -1,22 +1,52 @@
package at.procon.dip.testsupport; package at.procon.dip.testsupport;
import at.procon.dip.domain.document.repository.DocumentContentRepository;
import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.document.service.DocumentContentService;
import at.procon.dip.domain.document.service.DocumentRepresentationService;
import at.procon.dip.domain.document.service.DocumentService;
import at.procon.dip.embedding.config.EmbeddingProperties; import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.search.engine.fulltext.PostgresFullTextSearchEngine;
import at.procon.dip.search.engine.trigram.PostgresTrigramSearchEngine;
import at.procon.dip.search.plan.DefaultSearchPlanner;
import at.procon.dip.search.rank.DefaultSearchResultFusionService;
import at.procon.dip.search.rank.DefaultSearchScoreNormalizer;
import at.procon.dip.search.repository.DocumentFullTextSearchRepositoryImpl;
import at.procon.dip.search.repository.DocumentTrigramSearchRepositoryImpl;
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.search.service.DefaultSearchOrchestrator;
import at.procon.dip.search.service.DocumentLexicalIndexService;
import at.procon.dip.search.service.SearchMetricsService;
import at.procon.dip.search.web.GenericSearchController;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
import org.springframework.boot.SpringBootConfiguration; import org.springframework.boot.SpringBootConfiguration;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration; import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.domain.EntityScan; import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.boot.autoconfigure.jdbc.JdbcTemplateAutoConfiguration;
import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration;
import org.springframework.boot.autoconfigure.transaction.TransactionAutoConfiguration;
import org.springframework.boot.context.properties.EnableConfigurationProperties; import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Import;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories; import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
/** /**
* Narrow semantic-search test application that loads the new generic search subsystem * Narrow semantic-search test application that loads the new generic search subsystem
* plus the new parallel embedding subsystem. * plus the new parallel embedding subsystem.
*/ */
@SpringBootConfiguration @SpringBootConfiguration
@EnableAutoConfiguration(excludeName = { @AutoConfigureMockMvc
"org.apache.camel.spring.boot.CamelAutoConfiguration", @ImportAutoConfiguration({
"org.springframework.boot.autoconfigure.task.TaskSchedulingAutoConfiguration" DataSourceAutoConfiguration.class,
HibernateJpaAutoConfiguration.class,
TransactionAutoConfiguration.class,
JdbcTemplateAutoConfiguration.class
}) })
@EnableConfigurationProperties({TedProcessorProperties.class, EmbeddingProperties.class}) @EnableConfigurationProperties({TedProcessorProperties.class, EmbeddingProperties.class})
@EntityScan(basePackages = { @EntityScan(basePackages = {
@ -32,8 +62,25 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
@ComponentScan(basePackages = { @ComponentScan(basePackages = {
"at.procon.dip.domain.document.service", "at.procon.dip.domain.document.service",
"at.procon.dip.embedding", "at.procon.dip.embedding",
"at.procon.dip.search", "at.procon.dip.search"
"at.procon.dip.testsupport" })
@Import({
DocumentService.class,
DocumentContentService.class,
DocumentRepresentationService.class,
DocumentLexicalIndexService.class,
SearchTestDataFactory.class,
SemanticSearchTestDataFactory.class,
DefaultSearchPlanner.class,
DocumentFullTextSearchRepositoryImpl.class,
DocumentTrigramSearchRepositoryImpl.class,
PostgresFullTextSearchEngine.class,
PostgresTrigramSearchEngine.class,
DefaultSearchScoreNormalizer.class,
DefaultSearchResultFusionService.class,
SearchMetricsService.class,
DefaultSearchOrchestrator.class,
GenericSearchController.class
}) })
public class SearchSemanticTestApplication { public class SearchSemanticTestApplication {
} }

@ -1,3 +1,3 @@
CREATE SCHEMA IF NOT EXISTS DOC; CREATE SCHEMA IF NOT EXISTS DOC;
CREATE SCHEMA IF NOT EXISTS TED; CREATE SCHEMA IF NOT EXISTS TED;
CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE EXTENSION IF NOT EXISTS pg_trgm with schema doc;

Loading…
Cancel
Save