embedding nv3.3

2026-03-23 16:21:16 +01:00 · 2026-03-23 16:21:16 +01:00 · d206529162
parent 847cb40f8a
commit d206529162
11 changed files with 166 additions and 252 deletions
--- a/docs/embedding/NV3_3_OPTION_A_HARDENING_NOTES.md
+++ b/docs/embedding/NV3_3_OPTION_A_HARDENING_NOTES.md
@ -0,0 +1,34 @@
 # Option A semantic search hardening
 This patch hardens the multi-model semantic search implementation in three places:
 ## 1. Semantic repository
 - requires a positive model dimension
 - requires a configured distance metric
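 - uses metric-aware SQL expressions that convert each pgvector distance into a higher-is-better similarity score (compared against the threshold and sorted descending):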
  - cosine -> `1 - distance`
  - inner product -> `-1 * negative_inner_product`
  - euclidean -> `1 / (1 + distance)`
 ## 2. Semantic engine
 - resolves one explicit model per request
 - validates:
  - model active
  - dimensions > 0
  - distance metric configured
  - query embedding mode supported
 ## 3. Database
 - check constraint for positive dimensions
 - unique constraint on `(representation_id, model_id)`
 - comments documenting the per-model partial ANN index strategy
 ## Why this matters
 With Option A, multiple vector lengths live in one `DOC.doc_embedding.embedding_vector` column. That is safe only if:
 - every semantic query resolves exactly one model
 - the query vector uses that same model
 - the repository filters by `model_id`
 - the vector cast uses the correct model dimension
 - ANN indexes are created per active model
--- a/src/main/java/at/procon/dip/domain/document/DistanceMetric.java
+++ b/src/main/java/at/procon/dip/domain/document/DistanceMetric.java
@ -6,5 +6,6 @@ package at.procon.dip.domain.document;
 public enum DistanceMetric {
    /** Cosine distance; the pgvector semantic search repository scores it as {@code 1 - (a <=> b)}. */
    COSINE,
    /** Euclidean (L2) distance; the pgvector semantic search repository handles it identically to {@link #EUCLIDEAN}. */
    L2,
    /** Euclidean distance; the pgvector semantic search repository scores it as {@code 1 / (1 + (a <-> b))}. */
    EUCLIDEAN,
    /** Inner product; the pgvector semantic search repository scores it as {@code -1 * (a <#> b)}. */
    INNER_PRODUCT
 }
--- a/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java
+++ b/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java
@ -8,9 +8,9 @@ import lombok.Data;
 import org.springframework.boot.context.properties.ConfigurationProperties;
 import org.springframework.context.annotation.Configuration;
@Configuration
@ConfigurationProperties(prefix = "dip.embedding")
@Data
@Configuration
 public class EmbeddingProperties {
    private boolean enabled = false;
--- a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java
+++ b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java
@ -41,6 +41,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
        String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
        List<TextRepresentationDraft> drafts = new ArrayList<>();
        /*
        drafts.add(new TextRepresentationDraft(
                RepresentationType.FULLTEXT,
                BUILDER_KEY,
@ -53,6 +54,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
                ContentRole.NORMALIZED_TEXT,
                Boolean.FALSE
        ));
         */
        drafts.add(new TextRepresentationDraft(
                RepresentationType.SEMANTIC_TEXT,
                BUILDER_KEY,
@ -65,6 +67,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
                ContentRole.NORMALIZED_TEXT,
                Boolean.TRUE
        ));
        /*
        if (StringUtils.hasText(title)) {
            drafts.add(new TextRepresentationDraft(
                    RepresentationType.TITLE_ABSTRACT,
@ -91,6 +94,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
                ContentRole.NORMALIZED_TEXT,
                Boolean.FALSE
        ));
         */
        return drafts;
    }
--- a/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java
+++ b/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java
@ -1,6 +1,8 @@
 package at.procon.dip.search.engine.semantic;
 import at.procon.dip.embedding.config.EmbeddingProperties;
 import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
 import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
 import at.procon.dip.search.api.SearchExecutionContext;
 import at.procon.dip.search.dto.SearchEngineType;
 import at.procon.dip.search.dto.SearchHit;
@ -10,13 +12,16 @@ import at.procon.dip.search.service.SemanticQueryEmbeddingService;
 import at.procon.ted.config.TedProcessorProperties;
 import java.util.List;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.stereotype.Component;
@Component
@RequiredArgsConstructor
@Slf4j
 public class PgVectorSemanticSearchEngine implements SearchEngine {
    private final EmbeddingProperties embeddingProperties;
    private final EmbeddingModelRegistry embeddingModelRegistry;
    private final TedProcessorProperties properties;
    private final SemanticQueryEmbeddingService queryEmbeddingService;
    private final DocumentSemanticSearchRepository repository;
@ -35,15 +40,47 @@ public class PgVectorSemanticSearchEngine implements SearchEngine {
    @Override
    public List<SearchHit> execute(SearchExecutionContext context) {
        String requestedModelKey = context.getRequest().getSemanticModelKey();
        EmbeddingModelDescriptor model = resolveModel(requestedModelKey);
        validateModel(model);
        return queryEmbeddingService.buildQueryEmbedding(
                        context.getRequest().getQueryText(),
-                        context.getRequest().getSemanticModelKey())
+                        model.modelKey())
                .map(query -> repository.search(
                        context,
                        query.modelId(),
                        model.dimensions(),
                        model.distanceMetric(),
                        query.vectorString(),
                        properties.getSearch().getSemanticCandidateLimit(),
                        properties.getSearch().getSimilarityThreshold()))
-                .orElse(List.of());
+                .orElseGet(() -> {
                    log.debug("Semantic search skipped because query embedding could not be generated for model {}", model.modelKey());
                    return List.of();
                });
    }
-}
+
    /**
     * Picks the embedding model descriptor a semantic search request runs against.
     *
     * @param requestedModelKey model key taken from the request; may be {@code null} or blank
     * @return the descriptor registered under the requested key, or under the registry's
     *         default query model key when no usable key was requested
     */
    private EmbeddingModelDescriptor resolveModel(String requestedModelKey) {
        boolean hasExplicitKey = requestedModelKey != null && !requestedModelKey.isBlank();
        // Only ask the registry for its default key when the request did not name a model.
        String effectiveKey = hasExplicitKey
                ? requestedModelKey
                : embeddingModelRegistry.getRequiredDefaultQueryModelKey();
        return embeddingModelRegistry.getRequired(effectiveKey);
    }
    /**
     * Rejects models that cannot be used for a semantic search.
     *
     * <p>Checks run in a fixed order and the first failing one decides the message:
     * inactive model, non-positive dimensions, missing distance metric, and finally
     * missing support for query embedding mode.
     *
     * @param model descriptor of the model selected for the request
     * @throws IllegalStateException if any of the checks fails
     */
    private void validateModel(EmbeddingModelDescriptor model) {
        final String failure;
        if (!model.active()) {
            failure = "Semantic search model is not active: " + model.modelKey();
        } else if (model.dimensions() <= 0) {
            failure = "Semantic search model has invalid dimensions: " + model.modelKey() + " -> " + model.dimensions();
        } else if (model.distanceMetric() == null) {
            failure = "Semantic search model has no distance metric configured: " + model.modelKey();
        } else if (!model.supportsQueryEmbeddingMode()) {
            failure = "Semantic search model does not support query embedding mode: " + model.modelKey();
        } else {
            // Every check passed; nothing to report.
            return;
        }
        throw new IllegalStateException(failure);
    }
 }
--- a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java
+++ b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java
@ -1,5 +1,6 @@
 package at.procon.dip.search.repository;
 import at.procon.dip.domain.document.DistanceMetric;
 import at.procon.dip.search.api.SearchExecutionContext;
 import at.procon.dip.search.dto.SearchEngineType;
 import at.procon.dip.search.dto.SearchHit;
@ -19,9 +20,22 @@ public class DocumentSemanticSearchRepository {
    public List<SearchHit> search(SearchExecutionContext context,
                                  UUID modelId,
                                  int modelDimensions,
                                  DistanceMetric distanceMetric,
                                  String queryVector,
                                  int limit,
                                  double threshold) {
        if (modelDimensions <= 0) {
            throw new IllegalArgumentException("Semantic search requires a positive model dimension, got: " + modelDimensions);
        }
        if (distanceMetric == null) {
            throw new IllegalArgumentException("Semantic search requires a distance metric");
        }
        String vectorType = "public.vector(" + modelDimensions + ")";
        String similarityExpr = buildSimilarityExpression(distanceMetric, vectorType);
        StringBuilder sql = new StringBuilder("""
                SELECT
                    d.id AS document_id,
@ -41,7 +55,9 @@ public class DocumentSemanticSearchRepository {
                    d.created_at AS created_at,
                    d.updated_at AS updated_at,
                    LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
-                    (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) AS score
+                """);
        sql.append(similarityExpr).append(" AS score ");
        sql.append("""
                FROM doc.doc_embedding de
                JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
                JOIN doc.doc_document d ON d.id = de.document_id
@ -49,18 +65,35 @@ public class DocumentSemanticSearchRepository {
                WHERE de.embedding_status = 'COMPLETED'
                  AND de.embedding_vector IS NOT NULL
                  AND de.model_id = :modelId
-                  AND (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) >= :threshold
+                  AND de.embedding_dimensions = :modelDimensions
-                """);
+                  AND """);
        sql.append(similarityExpr).append(" >= :threshold ");
        MapSqlParameterSource params = new MapSqlParameterSource();
        params.addValue("queryVector", queryVector);
        params.addValue("modelId", modelId);
        params.addValue("modelDimensions", modelDimensions);
        params.addValue("threshold", threshold);
        SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
        sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
        params.addValue("limit", limit);
-        return jdbcTemplate.query(sql.toString(), params,
+        return jdbcTemplate.query(
-                new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT));
+                sql.toString(),
                params,
                new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT)
        );
    }
-}
+
    /**
     * Builds the SQL fragment that scores a stored embedding against the query vector.
     *
     * <p>Both operands are cast to the given fixed-dimension vector type. The pgvector
     * operator result is then turned into a score where larger means more similar:
     * {@code 1 - x} for cosine, {@code -1 * x} for inner product and {@code 1 / (1 + x)}
     * for Euclidean / L2.
     *
     * @param distanceMetric metric configured for the model
     * @param vectorType     SQL vector type including the dimension, used for both casts
     * @return a parenthesised SQL expression referencing {@code de.embedding_vector}
     *         and the {@code :queryVector} named parameter
     */
    private String buildSimilarityExpression(DistanceMetric distanceMetric, String vectorType) {
        String storedVector = "(de.embedding_vector)::" + vectorType;
        String queryVectorCast = "CAST(:queryVector AS " + vectorType + ")";
        String template = switch (distanceMetric) {
            case COSINE -> "(1 - (%s <=> %s))";
            case INNER_PRODUCT -> "(-1 * (%s <#> %s))";
            case EUCLIDEAN, L2 -> "(1 / (1 + (%s <-> %s)))";
        };
        return template.formatted(storedVector, queryVectorCast);
    }
 }
--- a/src/main/resources/application
+++ b/src/main/resources/application
@ -1,234 +0,0 @@
 # TED Procurement Document Processor Configuration
 # Author: Martin.Schweitzer@procon.co.at and claude.ai
 server:
  port: 8888
  servlet:
    context-path: /api
 spring:
  application:
    name: ted-procurement-processor
  datasource:
    url: jdbc:postgresql://localhost:32333/RELM
    username: ${DB_USERNAME:postgres}
    password: ${DB_PASSWORD:pwd}
    driver-class-name: org.postgresql.Driver
    hikari:
      maximum-pool-size: 5
      minimum-idle: 2
      connection-timeout: 30000
      idle-timeout: 300000
      max-lifetime: 900000
      leak-detection-threshold: 120000  # 2 minutes - increased to avoid false positives with batch processing
  jpa:
    hibernate:
      ddl-auto: none
    show-sql: false
    open-in-view: false
    properties:
      hibernate:
        format_sql: true
        default_schema: TED
        jdbc:
          batch_size: 25  # Match chunk size for optimal batch processing
        order_inserts: true
        order_updates: true
  flyway:
    enabled: true
    locations: classpath:db/migration
    baseline-on-migrate: true
    create-schemas: true
    schemas: TED
    default-schema: TED
 # Apache Camel Configuration
 camel:
  springboot:
    main-run-controller: true
  health:
    enabled: true
    # Weniger strenge Health-Checks für File-Consumer
    consumers-enabled: false
 # Custom Application Properties
 ted:
  # Directory configuration for file processing
  input:
    # Base directory for watching incoming TED XML files
    directory: ${TED_INPUT_DIR:D:/ted.europe/extracted}
    # File pattern to match (recursive scanning)
    pattern: "**/*.xml"
    # Move processed files to this directory
    processed-directory: ${TED_PROCESSED_DIR:.processed}
    # Move failed files to this directory  
    error-directory: ${TED_ERROR_DIR:.error}
    # Polling interval in milliseconds
    poll-interval: 5000
    # Maximum messages per poll (reduced to prevent memory issues)
    max-messages-per-poll: 10
  # Schema validation configuration
  schema:
    # Enable/disable XSD validation
    enabled: true
    # Path to eForms SDK schemas (from Maven dependency or custom location)
    path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
  # Vectorization configuration
  vectorization:
    # Enable/disable async vectorization
    enabled: true
    # Use external HTTP API instead of subprocess
    use-http-api: true
    # Embedding service URL
    api-url: http://localhost:8001
    # Model name for sentence-transformers
    model-name: intfloat/multilingual-e5-large
    # Vector dimensions (must match model output)
    dimensions: 1024
    # Batch size for vectorization
    batch-size: 16
    # Thread pool size for async processing
    thread-pool-size: 4
    # Maximum text length for vectorization (characters)
    max-text-length: 8192
    # HTTP connection timeout (milliseconds)
    connect-timeout: 10000
    # HTTP socket/read timeout (milliseconds)
    socket-timeout: 60000
    # Maximum retries on connection failure
    max-retries: 5
  # Search configuration
  search:
    # Default page size for search results
    default-page-size: 20
    # Maximum page size
    max-page-size: 100
    # Similarity threshold for vector search (0.0 - 1.0)
    similarity-threshold: 0.7
  # TED Daily Package Download configuration
  download:
    # Enable/disable automatic package download
    enabled: true
    # Base URL for TED Daily Packages
    base-url: https://ted.europa.eu/packages/daily/
    # Download directory for tar.gz files
    download-directory: D:/ted.europe/downloads
    # Extract directory for XML files
    extract-directory: D:/ted.europe/extracted
    # Start year for downloads
    start-year: 2015
    # Max consecutive 404 errors before stopping
    max-consecutive-404: 4
    # Polling interval (milliseconds) - 2 minutes
    poll-interval: 120000
    # Download timeout (milliseconds) - 5 minutes
    download-timeout: 300000
    # Max concurrent downloads
    max-concurrent-downloads: 2
    # Delay between downloads (milliseconds) for rate limiting - 5 seconds
    delay-between-downloads: 3000
    # Delete tar.gz after extraction
    delete-after-extraction: true
    # Prioritize current year first
    prioritize-current-year: false
  # IMAP Mail configuration
  mail:
    # Enable/disable mail processing
    enabled: true
    # IMAP server hostname
    host: host
    # IMAP server port (993 for IMAPS)
    port: 993
    # Mail account username (email address)
    username: ${MAIL_USERNAME:}
    # Mail account password
    password: ${MAIL_PASSWORD:}
    # Use SSL/TLS connection
    ssl: true
    # Mail folder to read from
    folder-name: INBOX
    # Delete messages after processing
    delete: false
    # Mark messages as seen after processing (false = peek mode, don't mark as read)
    seen: false
    # Only process unseen messages
    unseen: true
    # Polling delay in milliseconds (1 minute)
    delay: 60000
    # Max messages per poll
    max-messages-per-poll: 10
    # Output directory for processed attachments
    attachment-output-directory: D:/ted.europe/mail-attachments
    # Enable/disable MIME file input processing
    mime-input-enabled: true
    # Input directory for MIME files (.eml)
    mime-input-directory: D:/ted.europe/mime-input
    # File pattern for MIME files (regex)
    mime-input-pattern: .*\\.eml
    # Polling interval for MIME input directory (milliseconds)
    mime-input-poll-interval: 10000
  # Solution Brief processing configuration
  solution-brief:
    # Enable/disable Solution Brief processing
    enabled: true
    # Input directory for Solution Brief PDF files
    input-directory: C:/work/SolutionBrief
    # Output directory for Excel result files (relative to input or absolute)
    result-directory: ./result
    # Number of top similar documents to include
    top-k: 20
    # Minimum similarity threshold (0.0-1.0)
    similarity-threshold: 0.5
    # Polling interval in milliseconds (30 seconds)
    poll-interval: 30000
    # File pattern for PDF files (regex)
    file-pattern: .*\\.pdf
    # Process files only once (idempotent)
    idempotent: true
    # Idempotent repository file path
    idempotent-repository: ./solution-brief-processed.dat
  # Data cleanup configuration
  cleanup:
    # Enable automatic cleanup of old documents
    enabled: false
    # Retention period in years (default: 10)
    retention-years: 10
    # Cron expression for cleanup schedule (default: daily at 2 AM)
    cron: "0 0 2 * * *"
 # Actuator endpoints
 management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics,camel
  endpoint:
    health:
      show-details: when-authorized
 # OpenAPI documentation
 springdoc:
  api-docs:
    path: /v3/api-docs
  swagger-ui:
    path: /swagger-ui.html
    operations-sorter: method
 # Logging configuration
 logging:
  level:
    at.procon.ted: INFO
    at.procon.ted.camel.SolutionBriefRoute: INFO
    org.apache.camel: INFO
    org.hibernate.SQL: WARN
    org.hibernate.type.descriptor.sql: WARN
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@ -84,7 +84,7 @@ ted:
  # Vectorization configuration
  vectorization:
    # Enable/disable async vectorization
-    enabled: false
+    enabled: true
    # Use external HTTP API instead of subprocess
    use-http-api: true
    # Embedding service URL
@ -154,7 +154,7 @@ ted:
  # TED Daily Package Download configuration
  download:
    # Enable/disable automatic package download
-    enabled: false
+    enabled: true
    # Use service-based camel route
    use-service-based: false
    # Base URL for TED Daily Packages
@ -168,7 +168,7 @@ ted:
    # Max consecutive 404 errors before stopping
    max-consecutive-404: 4
    # Polling interval (milliseconds) - 2 minutes
-    poll-interval: 1800000
+    poll-interval: 300000
    # Retry interval for tail NOT_FOUND packages - 6 hours
    not-found-retry-interval: 21600000
    # Grace period after year end before a previous-year tail 404 is treated as final
--- a/src/main/resources/db/migration/V11__doc_option_a_semantic_hardening.sql
+++ b/src/main/resources/db/migration/V11__doc_option_a_semantic_hardening.sql
@ -0,0 +1,39 @@
 ALTER TABLE DOC.doc_embedding
    DROP CONSTRAINT IF EXISTS ck_doc_embedding_dimensions_positive;
 ALTER TABLE DOC.doc_embedding
    ADD CONSTRAINT ck_doc_embedding_dimensions_positive
    CHECK (embedding_dimensions IS NULL OR embedding_dimensions > 0);
 DO $$
 BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'uq_doc_embedding_representation_model'
          AND conrelid = 'doc.doc_embedding'::regclass
    ) THEN
        ALTER TABLE DOC.doc_embedding
            ADD CONSTRAINT uq_doc_embedding_representation_model
            UNIQUE (representation_id, model_id);
    END IF;
 END $$;
 COMMENT ON TABLE DOC.doc_embedding IS
 'Option A multi-model embedding storage. Embeddings of different lengths may coexist in one table. Semantic search must always filter by model_id and embedding_dimensions.';
 COMMENT ON COLUMN DOC.doc_embedding.embedding_dimensions IS
 'Resolved dimension of the stored embedding. Used for validation, filtering, and model-specific vector casts.';
 COMMENT ON COLUMN DOC.doc_embedding.embedding_vector IS
 'Generic pgvector column without fixed dimension. Create per-model partial expression indexes with a fixed cast, e.g. ((embedding_vector::public.vector(1024)) vector_cosine_ops).';
 -- Recommended partial ANN index pattern for active models:
 -- CREATE INDEX idx_doc_embedding_<model_key>_hnsw
 --   ON DOC.doc_embedding USING hnsw ((embedding_vector::public.vector(<DIMENSIONS>)) vector_cosine_ops)
 --   WHERE model_id = '<MODEL_UUID>'::uuid
 --     AND embedding_status = 'COMPLETED';
 --
 -- If you use inner product or euclidean distance for a model, pick the matching operator class:
 --   vector_ip_ops
 --   vector_l2_ops
--- a/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql
+++ b/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql
@ -1,7 +1,7 @@
 -- Slice 1 + Slice 2 generic search support for DOC documents.
 -- Adds lexical-search support columns/indexes and pg_trgm extension.
-CREATE EXTENSION IF NOT EXISTS pg_trgm;
+CREATE EXTENSION IF NOT EXISTS pg_trgm with schema doc;
 ALTER TABLE DOC.doc_text_representation
    ADD COLUMN IF NOT EXISTS search_config VARCHAR(64);
@ -15,12 +15,12 @@ CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector
 CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm
    ON DOC.doc_document
-    USING GIN (title gin_trgm_ops);
+    USING GIN (title DOC.gin_trgm_ops);
 CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm
    ON DOC.doc_document
-    USING GIN (summary gin_trgm_ops);
+    USING GIN (summary DOC.gin_trgm_ops);
 CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm
    ON DOC.doc_text_representation
-    USING GIN (text_body gin_trgm_ops);
+    USING GIN (text_body DOC.gin_trgm_ops);
--- a/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java
+++ b/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java
@ -48,7 +48,7 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
        TransactionAutoConfiguration.class,
        JdbcTemplateAutoConfiguration.class
 })
-@EnableConfigurationProperties({TedProcessorProperties.class, EmbeddingProperties.class})
+@EnableConfigurationProperties({TedProcessorProperties.class})
@EntityScan(basePackages = {
        "at.procon.dip.domain.document.entity",
        "at.procon.dip.domain.tenant.entity",