From d20652916251eb4ca4c7829f92f90dd459b36c39 Mon Sep 17 00:00:00 2001 From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com> Date: Mon, 23 Mar 2026 16:21:16 +0100 Subject: [PATCH] embedding nv3.3 --- .../NV3_3_OPTION_A_HARDENING_NOTES.md | 34 +++ .../dip/domain/document/DistanceMetric.java | 1 + .../embedding/config/EmbeddingProperties.java | 2 +- ...faultGenericTextRepresentationBuilder.java | 4 + .../PgVectorSemanticSearchEngine.java | 43 +++- .../DocumentSemanticSearchRepository.java | 45 +++- src/main/resources/application - Kopie.yml | 234 ------------------ src/main/resources/application.yml | 6 +- .../V11__doc_option_a_semantic_hardening.sql | 39 +++ ...__search_slice2_generic_search_support.sql | 8 +- .../SearchSemanticTestApplication.java | 2 +- 11 files changed, 166 insertions(+), 252 deletions(-) create mode 100644 docs/embedding/NV3_3_OPTION_A_HARDENING_NOTES.md delete mode 100644 src/main/resources/application - Kopie.yml create mode 100644 src/main/resources/db/migration/V11__doc_option_a_semantic_hardening.sql diff --git a/docs/embedding/NV3_3_OPTION_A_HARDENING_NOTES.md b/docs/embedding/NV3_3_OPTION_A_HARDENING_NOTES.md new file mode 100644 index 0000000..27f3b07 --- /dev/null +++ b/docs/embedding/NV3_3_OPTION_A_HARDENING_NOTES.md @@ -0,0 +1,34 @@ +# Option A semantic search hardening + +This patch hardens the multi-model semantic search implementation in three places: + +## 1. Semantic repository +- requires a positive model dimension +- requires a configured distance metric +- uses metric-aware SQL expressions: + - cosine -> `1 - distance` + - inner product -> `-1 * negative_inner_product` + - euclidean -> `1 / (1 + distance)` + +## 2. Semantic engine +- resolves one explicit model per request +- validates: + - model active + - dimensions > 0 + - distance metric configured + - query embedding mode supported + +## 3. 
Database +- check constraint for positive dimensions +- unique constraint on `(representation_id, model_id)` +- comments documenting the per-model partial ANN index strategy + +## Why this matters + +With Option A, multiple vector lengths live in one `DOC.doc_embedding.embedding_vector` column. That is safe only if: + +- every semantic query resolves exactly one model +- the query vector uses that same model +- the repository filters by `model_id` +- the vector cast uses the correct model dimension +- ANN indexes are created per active model \ No newline at end of file diff --git a/src/main/java/at/procon/dip/domain/document/DistanceMetric.java b/src/main/java/at/procon/dip/domain/document/DistanceMetric.java index 0bb8d68..d8bb9b7 100644 --- a/src/main/java/at/procon/dip/domain/document/DistanceMetric.java +++ b/src/main/java/at/procon/dip/domain/document/DistanceMetric.java @@ -6,5 +6,6 @@ package at.procon.dip.domain.document; public enum DistanceMetric { COSINE, L2, + EUCLIDEAN, INNER_PRODUCT } diff --git a/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java b/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java index 56d2ad6..bc47cc0 100644 --- a/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java +++ b/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java @@ -8,9 +8,9 @@ import lombok.Data; import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.context.annotation.Configuration; -@Configuration @ConfigurationProperties(prefix = "dip.embedding") @Data +@Configuration public class EmbeddingProperties { private boolean enabled = false; diff --git a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java index 93bea89..446ddf4 100644 --- 
a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java +++ b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java @@ -41,6 +41,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati String semantic = buildSemanticText(title, summary, request.detectionResult().documentType()); List drafts = new ArrayList<>(); + /* drafts.add(new TextRepresentationDraft( RepresentationType.FULLTEXT, BUILDER_KEY, @@ -53,6 +54,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati ContentRole.NORMALIZED_TEXT, Boolean.FALSE )); + */ drafts.add(new TextRepresentationDraft( RepresentationType.SEMANTIC_TEXT, BUILDER_KEY, @@ -65,6 +67,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati ContentRole.NORMALIZED_TEXT, Boolean.TRUE )); + /* if (StringUtils.hasText(title)) { drafts.add(new TextRepresentationDraft( RepresentationType.TITLE_ABSTRACT, @@ -91,6 +94,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati ContentRole.NORMALIZED_TEXT, Boolean.FALSE )); + */ return drafts; } diff --git a/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java b/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java index a238f73..15fdff4 100644 --- a/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java +++ b/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java @@ -1,6 +1,8 @@ package at.procon.dip.search.engine.semantic; import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.model.EmbeddingModelDescriptor; +import at.procon.dip.embedding.registry.EmbeddingModelRegistry; import at.procon.dip.search.api.SearchExecutionContext; import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; @@ -10,13 +12,16 @@ import 
at.procon.dip.search.service.SemanticQueryEmbeddingService; import at.procon.ted.config.TedProcessorProperties; import java.util.List; import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Component; @Component @RequiredArgsConstructor +@Slf4j public class PgVectorSemanticSearchEngine implements SearchEngine { private final EmbeddingProperties embeddingProperties; + private final EmbeddingModelRegistry embeddingModelRegistry; private final TedProcessorProperties properties; private final SemanticQueryEmbeddingService queryEmbeddingService; private final DocumentSemanticSearchRepository repository; @@ -35,15 +40,47 @@ public class PgVectorSemanticSearchEngine implements SearchEngine { @Override public List execute(SearchExecutionContext context) { + String requestedModelKey = context.getRequest().getSemanticModelKey(); + EmbeddingModelDescriptor model = resolveModel(requestedModelKey); + validateModel(model); + return queryEmbeddingService.buildQueryEmbedding( context.getRequest().getQueryText(), - context.getRequest().getSemanticModelKey()) + model.modelKey()) .map(query -> repository.search( context, query.modelId(), + model.dimensions(), + model.distanceMetric(), query.vectorString(), properties.getSearch().getSemanticCandidateLimit(), properties.getSearch().getSimilarityThreshold())) - .orElse(List.of()); + .orElseGet(() -> { + log.debug("Semantic search skipped because query embedding could not be generated for model {}", model.modelKey()); + return List.of(); + }); + } + + private EmbeddingModelDescriptor resolveModel(String requestedModelKey) { + if (requestedModelKey != null && !requestedModelKey.isBlank()) { + return embeddingModelRegistry.getRequired(requestedModelKey); + } + String defaultModelKey = embeddingModelRegistry.getRequiredDefaultQueryModelKey(); + return embeddingModelRegistry.getRequired(defaultModelKey); + } + + private void validateModel(EmbeddingModelDescriptor model) { + if 
(!model.active()) { + throw new IllegalStateException("Semantic search model is not active: " + model.modelKey()); + } + if (model.dimensions() <= 0) { + throw new IllegalStateException("Semantic search model has invalid dimensions: " + model.modelKey() + " -> " + model.dimensions()); + } + if (model.distanceMetric() == null) { + throw new IllegalStateException("Semantic search model has no distance metric configured: " + model.modelKey()); + } + if (!model.supportsQueryEmbeddingMode()) { + throw new IllegalStateException("Semantic search model does not support query embedding mode: " + model.modelKey()); + } } -} +} \ No newline at end of file diff --git a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java index fb9659c..8322c7b 100644 --- a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java +++ b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java @@ -1,5 +1,6 @@ package at.procon.dip.search.repository; +import at.procon.dip.domain.document.DistanceMetric; import at.procon.dip.search.api.SearchExecutionContext; import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; @@ -19,9 +20,22 @@ public class DocumentSemanticSearchRepository { public List search(SearchExecutionContext context, UUID modelId, + int modelDimensions, + DistanceMetric distanceMetric, String queryVector, int limit, double threshold) { + + if (modelDimensions <= 0) { + throw new IllegalArgumentException("Semantic search requires a positive model dimension, got: " + modelDimensions); + } + if (distanceMetric == null) { + throw new IllegalArgumentException("Semantic search requires a distance metric"); + } + + String vectorType = "public.vector(" + modelDimensions + ")"; + String similarityExpr = buildSimilarityExpression(distanceMetric, vectorType); + StringBuilder sql = new 
StringBuilder(""" SELECT d.id AS document_id, @@ -41,7 +55,9 @@ public class DocumentSemanticSearchRepository { d.created_at AS created_at, d.updated_at AS updated_at, LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, - (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) AS score + """); + sql.append(similarityExpr).append(" AS score "); + sql.append(""" FROM doc.doc_embedding de JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id JOIN doc.doc_document d ON d.id = de.document_id @@ -49,18 +65,35 @@ public class DocumentSemanticSearchRepository { WHERE de.embedding_status = 'COMPLETED' AND de.embedding_vector IS NOT NULL AND de.model_id = :modelId - AND (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) >= :threshold - """); + AND de.embedding_dimensions = :modelDimensions + AND """); + sql.append(similarityExpr).append(" >= :threshold "); MapSqlParameterSource params = new MapSqlParameterSource(); params.addValue("queryVector", queryVector); params.addValue("modelId", modelId); + params.addValue("modelDimensions", modelDimensions); params.addValue("threshold", threshold); + SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true); sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit"); params.addValue("limit", limit); - return jdbcTemplate.query(sql.toString(), params, - new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT)); + return jdbcTemplate.query( + sql.toString(), + params, + new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT) + ); + } + + private String buildSimilarityExpression(DistanceMetric distanceMetric, String vectorType) { + String lhs = "(de.embedding_vector)::" + vectorType; + String rhs = "CAST(:queryVector AS " + vectorType + ")"; + + return switch (distanceMetric) { + case COSINE -> "(1 - (" + lhs + " <=> " + rhs + "))"; + case 
INNER_PRODUCT -> "(-1 * (" + lhs + " <#> " + rhs + "))"; + case EUCLIDEAN, L2 -> "(1 / (1 + (" + lhs + " <-> " + rhs + ")))"; + }; } -} +} \ No newline at end of file diff --git a/src/main/resources/application - Kopie.yml b/src/main/resources/application - Kopie.yml deleted file mode 100644 index 5526b1d..0000000 --- a/src/main/resources/application - Kopie.yml +++ /dev/null @@ -1,234 +0,0 @@ -# TED Procurement Document Processor Configuration -# Author: Martin.Schweitzer@procon.co.at and claude.ai - -server: - port: 8888 - servlet: - context-path: /api - -spring: - application: - name: ted-procurement-processor - - datasource: - url: jdbc:postgresql://localhost:32333/RELM - username: ${DB_USERNAME:postgres} - password: ${DB_PASSWORD:pwd} - driver-class-name: org.postgresql.Driver - hikari: - maximum-pool-size: 5 - minimum-idle: 2 - connection-timeout: 30000 - idle-timeout: 300000 - max-lifetime: 900000 - leak-detection-threshold: 120000 # 2 minutes - increased to avoid false positives with batch processing - - jpa: - hibernate: - ddl-auto: none - show-sql: false - open-in-view: false - properties: - hibernate: - format_sql: true - default_schema: TED - jdbc: - batch_size: 25 # Match chunk size for optimal batch processing - order_inserts: true - order_updates: true - - flyway: - enabled: true - locations: classpath:db/migration - baseline-on-migrate: true - create-schemas: true - schemas: TED - default-schema: TED - -# Apache Camel Configuration -camel: - springboot: - main-run-controller: true - health: - enabled: true - # Weniger strenge Health-Checks für File-Consumer - consumers-enabled: false - -# Custom Application Properties -ted: - # Directory configuration for file processing - input: - # Base directory for watching incoming TED XML files - directory: ${TED_INPUT_DIR:D:/ted.europe/extracted} - # File pattern to match (recursive scanning) - pattern: "**/*.xml" - # Move processed files to this directory - processed-directory: 
${TED_PROCESSED_DIR:.processed} - # Move failed files to this directory - error-directory: ${TED_ERROR_DIR:.error} - # Polling interval in milliseconds - poll-interval: 5000 - # Maximum messages per poll (reduced to prevent memory issues) - max-messages-per-poll: 10 - - # Schema validation configuration - schema: - # Enable/disable XSD validation - enabled: true - # Path to eForms SDK schemas (from Maven dependency or custom location) - path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd - - # Vectorization configuration - vectorization: - # Enable/disable async vectorization - enabled: true - # Use external HTTP API instead of subprocess - use-http-api: true - # Embedding service URL - api-url: http://localhost:8001 - # Model name for sentence-transformers - model-name: intfloat/multilingual-e5-large - # Vector dimensions (must match model output) - dimensions: 1024 - # Batch size for vectorization - batch-size: 16 - # Thread pool size for async processing - thread-pool-size: 4 - # Maximum text length for vectorization (characters) - max-text-length: 8192 - # HTTP connection timeout (milliseconds) - connect-timeout: 10000 - # HTTP socket/read timeout (milliseconds) - socket-timeout: 60000 - # Maximum retries on connection failure - max-retries: 5 - - # Search configuration - search: - # Default page size for search results - default-page-size: 20 - # Maximum page size - max-page-size: 100 - # Similarity threshold for vector search (0.0 - 1.0) - similarity-threshold: 0.7 - - # TED Daily Package Download configuration - download: - # Enable/disable automatic package download - enabled: true - # Base URL for TED Daily Packages - base-url: https://ted.europa.eu/packages/daily/ - # Download directory for tar.gz files - download-directory: D:/ted.europe/downloads - # Extract directory for XML files - extract-directory: D:/ted.europe/extracted - # Start year for downloads - start-year: 2015 - # Max consecutive 404 errors before stopping - max-consecutive-404: 4 - 
# Polling interval (milliseconds) - 2 minutes - poll-interval: 120000 - # Download timeout (milliseconds) - 5 minutes - download-timeout: 300000 - # Max concurrent downloads - max-concurrent-downloads: 2 - # Delay between downloads (milliseconds) for rate limiting - 5 seconds - delay-between-downloads: 3000 - # Delete tar.gz after extraction - delete-after-extraction: true - # Prioritize current year first - prioritize-current-year: false - - # IMAP Mail configuration - mail: - # Enable/disable mail processing - enabled: true - # IMAP server hostname - host: host - # IMAP server port (993 for IMAPS) - port: 993 - # Mail account username (email address) - username: ${MAIL_USERNAME:} - # Mail account password - password: ${MAIL_PASSWORD:} - # Use SSL/TLS connection - ssl: true - # Mail folder to read from - folder-name: INBOX - # Delete messages after processing - delete: false - # Mark messages as seen after processing (false = peek mode, don't mark as read) - seen: false - # Only process unseen messages - unseen: true - # Polling delay in milliseconds (1 minute) - delay: 60000 - # Max messages per poll - max-messages-per-poll: 10 - # Output directory for processed attachments - attachment-output-directory: D:/ted.europe/mail-attachments - # Enable/disable MIME file input processing - mime-input-enabled: true - # Input directory for MIME files (.eml) - mime-input-directory: D:/ted.europe/mime-input - # File pattern for MIME files (regex) - mime-input-pattern: .*\\.eml - # Polling interval for MIME input directory (milliseconds) - mime-input-poll-interval: 10000 - - # Solution Brief processing configuration - solution-brief: - # Enable/disable Solution Brief processing - enabled: true - # Input directory for Solution Brief PDF files - input-directory: C:/work/SolutionBrief - # Output directory for Excel result files (relative to input or absolute) - result-directory: ./result - # Number of top similar documents to include - top-k: 20 - # Minimum similarity threshold 
(0.0-1.0) - similarity-threshold: 0.5 - # Polling interval in milliseconds (30 seconds) - poll-interval: 30000 - # File pattern for PDF files (regex) - file-pattern: .*\\.pdf - # Process files only once (idempotent) - idempotent: true - # Idempotent repository file path - idempotent-repository: ./solution-brief-processed.dat - - # Data cleanup configuration - cleanup: - # Enable automatic cleanup of old documents - enabled: false - # Retention period in years (default: 10) - retention-years: 10 - # Cron expression for cleanup schedule (default: daily at 2 AM) - cron: "0 0 2 * * *" - -# Actuator endpoints -management: - endpoints: - web: - exposure: - include: health,info,metrics,camel - endpoint: - health: - show-details: when-authorized - -# OpenAPI documentation -springdoc: - api-docs: - path: /v3/api-docs - swagger-ui: - path: /swagger-ui.html - operations-sorter: method - -# Logging configuration -logging: - level: - at.procon.ted: INFO - at.procon.ted.camel.SolutionBriefRoute: INFO - org.apache.camel: INFO - org.hibernate.SQL: WARN - org.hibernate.type.descriptor.sql: WARN diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index ad4f212..bbe4f1f 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -84,7 +84,7 @@ ted: # Vectorization configuration vectorization: # Enable/disable async vectorization - enabled: false + enabled: true # Use external HTTP API instead of subprocess use-http-api: true # Embedding service URL @@ -154,7 +154,7 @@ ted: # TED Daily Package Download configuration download: # Enable/disable automatic package download - enabled: false + enabled: true # User service-based camel route use-service-based: false # Base URL for TED Daily Packages @@ -168,7 +168,7 @@ ted: # Max consecutive 404 errors before stopping max-consecutive-404: 4 # Polling interval (milliseconds) - 2 minutes - poll-interval: 1800000 + poll-interval: 300000 # Retry interval for tail NOT_FOUND packages 
- 6 hours not-found-retry-interval: 21600000 # Grace period after year end before a previous-year tail 404 is treated as final diff --git a/src/main/resources/db/migration/V11__doc_option_a_semantic_hardening.sql b/src/main/resources/db/migration/V11__doc_option_a_semantic_hardening.sql new file mode 100644 index 0000000..632561b --- /dev/null +++ b/src/main/resources/db/migration/V11__doc_option_a_semantic_hardening.sql @@ -0,0 +1,39 @@ +ALTER TABLE DOC.doc_embedding + DROP CONSTRAINT IF EXISTS ck_doc_embedding_dimensions_positive; + +ALTER TABLE DOC.doc_embedding + ADD CONSTRAINT ck_doc_embedding_dimensions_positive + CHECK (embedding_dimensions IS NULL OR embedding_dimensions > 0); + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 + FROM pg_constraint + WHERE conname = 'uq_doc_embedding_representation_model' + AND conrelid = 'doc.doc_embedding'::regclass + ) THEN + ALTER TABLE DOC.doc_embedding + ADD CONSTRAINT uq_doc_embedding_representation_model + UNIQUE (representation_id, model_id); + END IF; +END $$; + +COMMENT ON TABLE DOC.doc_embedding IS +'Option A multi-model embedding storage. Embeddings of different lengths may coexist in one table. Semantic search must always filter by model_id and embedding_dimensions.'; + +COMMENT ON COLUMN DOC.doc_embedding.embedding_dimensions IS +'Resolved dimension of the stored embedding. Used for validation, filtering, and model-specific vector casts.'; + +COMMENT ON COLUMN DOC.doc_embedding.embedding_vector IS +'Generic pgvector column without fixed dimension. Create per-model partial expression indexes with a fixed cast, e.g. 
((embedding_vector::public.vector(1024)) vector_cosine_ops).'; + +-- Recommended partial ANN index pattern for active models: +-- CREATE INDEX idx_doc_embedding_{model_key}_hnsw +-- ON DOC.doc_embedding USING hnsw ((embedding_vector::public.vector({dimensions})) vector_cosine_ops) +-- WHERE model_id = '{model_id}'::uuid +-- AND embedding_status = 'COMPLETED'; +-- +-- If you use inner product or euclidean distance for a model, pick the matching operator class: +-- vector_ip_ops +-- vector_l2_ops \ No newline at end of file diff --git a/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql b/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql index f4ccfdf..f9c397e 100644 --- a/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql +++ b/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql @@ -1,7 +1,7 @@ -- Slice 1 + Slice 2 generic search support for DOC documents. -- Adds lexical-search support columns/indexes and pg_trgm extension. 
-CREATE EXTENSION IF NOT EXISTS pg_trgm; +CREATE EXTENSION IF NOT EXISTS pg_trgm with schema doc; ALTER TABLE DOC.doc_text_representation ADD COLUMN IF NOT EXISTS search_config VARCHAR(64); @@ -15,12 +15,12 @@ CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm ON DOC.doc_document - USING GIN (title gin_trgm_ops); + USING GIN (title DOC.gin_trgm_ops); CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm ON DOC.doc_document - USING GIN (summary gin_trgm_ops); + USING GIN (summary DOC.gin_trgm_ops); CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm ON DOC.doc_text_representation - USING GIN (text_body gin_trgm_ops); + USING GIN (text_body DOC.gin_trgm_ops); diff --git a/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java b/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java index c1c7105..0616125 100644 --- a/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java +++ b/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java @@ -48,7 +48,7 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories; TransactionAutoConfiguration.class, JdbcTemplateAutoConfiguration.class }) -@EnableConfigurationProperties({TedProcessorProperties.class, EmbeddingProperties.class}) +@EnableConfigurationProperties({TedProcessorProperties.class}) @EntityScan(basePackages = { "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity",