embedding nv3.3

master
trifonovt 4 weeks ago
parent 847cb40f8a
commit d206529162

@ -0,0 +1,34 @@
# Option A semantic search hardening
This patch hardens the multi-model semantic search implementation in three places:
## 1. Semantic repository
- requires a positive model dimension
- requires a configured distance metric
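- uses metric-aware SQL expressions:
  - cosine -> `1 - distance`
  - inner product -> `-1 * negative_inner_product` (pgvector's `<#>` operator returns the negative inner product, so negating it yields the actual inner product)
  - euclidean / L2 -> `1 / (1 + distance)`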
## 2. Semantic engine
- resolves one explicit model per request
- validates:
  - model active
  - dimensions > 0
  - distance metric configured
  - query embedding mode supported
## 3. Database
- check constraint for positive dimensions
- unique constraint on `(representation_id, model_id)`
- comments documenting the per-model partial ANN index strategy
## Why this matters
With Option A, multiple vector lengths live in one `DOC.doc_embedding.embedding_vector` column. That is safe only if:
- every semantic query resolves exactly one model
- the query vector uses that same model
- the repository filters by `model_id`
- the vector cast uses the correct model dimension
- ANN indexes are created per active model

@ -6,5 +6,6 @@ package at.procon.dip.domain.document;
public enum DistanceMetric { public enum DistanceMetric {
COSINE, COSINE,
L2, L2,
EUCLIDEAN,
INNER_PRODUCT INNER_PRODUCT
} }

@ -8,9 +8,9 @@ import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Configuration;
@Configuration
@ConfigurationProperties(prefix = "dip.embedding") @ConfigurationProperties(prefix = "dip.embedding")
@Data @Data
@Configuration
public class EmbeddingProperties { public class EmbeddingProperties {
private boolean enabled = false; private boolean enabled = false;

@ -41,6 +41,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
String semantic = buildSemanticText(title, summary, request.detectionResult().documentType()); String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
List<TextRepresentationDraft> drafts = new ArrayList<>(); List<TextRepresentationDraft> drafts = new ArrayList<>();
/*
drafts.add(new TextRepresentationDraft( drafts.add(new TextRepresentationDraft(
RepresentationType.FULLTEXT, RepresentationType.FULLTEXT,
BUILDER_KEY, BUILDER_KEY,
@ -53,6 +54,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
ContentRole.NORMALIZED_TEXT, ContentRole.NORMALIZED_TEXT,
Boolean.FALSE Boolean.FALSE
)); ));
*/
drafts.add(new TextRepresentationDraft( drafts.add(new TextRepresentationDraft(
RepresentationType.SEMANTIC_TEXT, RepresentationType.SEMANTIC_TEXT,
BUILDER_KEY, BUILDER_KEY,
@ -65,6 +67,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
ContentRole.NORMALIZED_TEXT, ContentRole.NORMALIZED_TEXT,
Boolean.TRUE Boolean.TRUE
)); ));
/*
if (StringUtils.hasText(title)) { if (StringUtils.hasText(title)) {
drafts.add(new TextRepresentationDraft( drafts.add(new TextRepresentationDraft(
RepresentationType.TITLE_ABSTRACT, RepresentationType.TITLE_ABSTRACT,
@ -91,6 +94,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
ContentRole.NORMALIZED_TEXT, ContentRole.NORMALIZED_TEXT,
Boolean.FALSE Boolean.FALSE
)); ));
*/
return drafts; return drafts;
} }

@ -1,6 +1,8 @@
package at.procon.dip.search.engine.semantic; package at.procon.dip.search.engine.semantic;
import at.procon.dip.embedding.config.EmbeddingProperties; import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import at.procon.dip.search.api.SearchExecutionContext; import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit; import at.procon.dip.search.dto.SearchHit;
@ -10,13 +12,16 @@ import at.procon.dip.search.service.SemanticQueryEmbeddingService;
import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.config.TedProcessorProperties;
import java.util.List; import java.util.List;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
@Component @Component
@RequiredArgsConstructor @RequiredArgsConstructor
@Slf4j
public class PgVectorSemanticSearchEngine implements SearchEngine { public class PgVectorSemanticSearchEngine implements SearchEngine {
private final EmbeddingProperties embeddingProperties; private final EmbeddingProperties embeddingProperties;
private final EmbeddingModelRegistry embeddingModelRegistry;
private final TedProcessorProperties properties; private final TedProcessorProperties properties;
private final SemanticQueryEmbeddingService queryEmbeddingService; private final SemanticQueryEmbeddingService queryEmbeddingService;
private final DocumentSemanticSearchRepository repository; private final DocumentSemanticSearchRepository repository;
@ -35,15 +40,47 @@ public class PgVectorSemanticSearchEngine implements SearchEngine {
@Override @Override
public List<SearchHit> execute(SearchExecutionContext context) { public List<SearchHit> execute(SearchExecutionContext context) {
String requestedModelKey = context.getRequest().getSemanticModelKey();
EmbeddingModelDescriptor model = resolveModel(requestedModelKey);
validateModel(model);
return queryEmbeddingService.buildQueryEmbedding( return queryEmbeddingService.buildQueryEmbedding(
context.getRequest().getQueryText(), context.getRequest().getQueryText(),
context.getRequest().getSemanticModelKey()) model.modelKey())
.map(query -> repository.search( .map(query -> repository.search(
context, context,
query.modelId(), query.modelId(),
model.dimensions(),
model.distanceMetric(),
query.vectorString(), query.vectorString(),
properties.getSearch().getSemanticCandidateLimit(), properties.getSearch().getSemanticCandidateLimit(),
properties.getSearch().getSimilarityThreshold())) properties.getSearch().getSimilarityThreshold()))
.orElse(List.of()); .orElseGet(() -> {
log.debug("Semantic search skipped because query embedding could not be generated for model {}", model.modelKey());
return List.of();
});
}
/**
 * Picks the embedding model descriptor to use for one semantic search request.
 *
 * <p>A non-null, non-blank {@code requestedModelKey} is looked up as-is. Otherwise the
 * registry is asked for its default query model key and that key is looked up instead.
 *
 * @param requestedModelKey model key taken from the search request; may be {@code null} or blank
 * @return the descriptor the registry returns for the effective key
 */
private EmbeddingModelDescriptor resolveModel(String requestedModelKey) {
    boolean noExplicitKey = requestedModelKey == null || requestedModelKey.isBlank();
    // The default key is only requested from the registry when the caller supplied none.
    String effectiveKey = noExplicitKey
            ? embeddingModelRegistry.getRequiredDefaultQueryModelKey()
            : requestedModelKey;
    return embeddingModelRegistry.getRequired(effectiveKey);
}
/**
 * Rejects a resolved model that cannot be used for semantic search.
 *
 * <p>The checks run in a fixed order and the first failing one throws, so the exception
 * message always names a single cause together with the model key.
 *
 * @param model descriptor of the model selected for the current request
 * @throws IllegalStateException if the model is not active, reports a dimension count
 *         of zero or less, has no distance metric, or does not support query embedding mode
 */
private void validateModel(EmbeddingModelDescriptor model) {
// Inactive models must not serve queries.
if (!model.active()) {
throw new IllegalStateException("Semantic search model is not active: " + model.modelKey());
}
// The dimension is later used to build a fixed-size vector cast, so it has to be positive.
if (model.dimensions() <= 0) {
throw new IllegalStateException("Semantic search model has invalid dimensions: " + model.modelKey() + " -> " + model.dimensions());
}
// Without a metric the repository cannot choose a similarity expression.
if (model.distanceMetric() == null) {
throw new IllegalStateException("Semantic search model has no distance metric configured: " + model.modelKey());
}
// The query text itself has to be embedded with this model.
if (!model.supportsQueryEmbeddingMode()) {
throw new IllegalStateException("Semantic search model does not support query embedding mode: " + model.modelKey());
}
} }
} }

@ -1,5 +1,6 @@
package at.procon.dip.search.repository; package at.procon.dip.search.repository;
import at.procon.dip.domain.document.DistanceMetric;
import at.procon.dip.search.api.SearchExecutionContext; import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit; import at.procon.dip.search.dto.SearchHit;
@ -19,9 +20,22 @@ public class DocumentSemanticSearchRepository {
public List<SearchHit> search(SearchExecutionContext context, public List<SearchHit> search(SearchExecutionContext context,
UUID modelId, UUID modelId,
int modelDimensions,
DistanceMetric distanceMetric,
String queryVector, String queryVector,
int limit, int limit,
double threshold) { double threshold) {
if (modelDimensions <= 0) {
throw new IllegalArgumentException("Semantic search requires a positive model dimension, got: " + modelDimensions);
}
if (distanceMetric == null) {
throw new IllegalArgumentException("Semantic search requires a distance metric");
}
String vectorType = "public.vector(" + modelDimensions + ")";
String similarityExpr = buildSimilarityExpression(distanceMetric, vectorType);
StringBuilder sql = new StringBuilder(""" StringBuilder sql = new StringBuilder("""
SELECT SELECT
d.id AS document_id, d.id AS document_id,
@ -41,7 +55,9 @@ public class DocumentSemanticSearchRepository {
d.created_at AS created_at, d.created_at AS created_at,
d.updated_at AS updated_at, d.updated_at AS updated_at,
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
(1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) AS score """);
sql.append(similarityExpr).append(" AS score ");
sql.append("""
FROM doc.doc_embedding de FROM doc.doc_embedding de
JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
JOIN doc.doc_document d ON d.id = de.document_id JOIN doc.doc_document d ON d.id = de.document_id
@ -49,18 +65,35 @@ public class DocumentSemanticSearchRepository {
WHERE de.embedding_status = 'COMPLETED' WHERE de.embedding_status = 'COMPLETED'
AND de.embedding_vector IS NOT NULL AND de.embedding_vector IS NOT NULL
AND de.model_id = :modelId AND de.model_id = :modelId
AND (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) >= :threshold AND de.embedding_dimensions = :modelDimensions
"""); AND """);
sql.append(similarityExpr).append(" >= :threshold ");
MapSqlParameterSource params = new MapSqlParameterSource(); MapSqlParameterSource params = new MapSqlParameterSource();
params.addValue("queryVector", queryVector); params.addValue("queryVector", queryVector);
params.addValue("modelId", modelId); params.addValue("modelId", modelId);
params.addValue("modelDimensions", modelDimensions);
params.addValue("threshold", threshold); params.addValue("threshold", threshold);
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true); SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit"); sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
params.addValue("limit", limit); params.addValue("limit", limit);
return jdbcTemplate.query(sql.toString(), params, return jdbcTemplate.query(
new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT)); sql.toString(),
params,
new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT)
);
}
/**
 * Builds the SQL fragment that converts a pgvector distance into a similarity score
 * where a larger value means a closer match.
 *
 * <p>Both operands are cast to the same {@code vectorType}: the stored
 * {@code de.embedding_vector} column and the bound {@code :queryVector} parameter.
 *
 * @param distanceMetric metric that selects the pgvector operator and the score formula
 * @param vectorType SQL type text used for both casts; it is concatenated into the SQL
 *        string rather than bound as a parameter (the visible caller builds it as
 *        {@code public.vector(<dimensions>)} from an int)
 * @return a parenthesized SQL expression suitable for the select list and the WHERE clause
 */
private String buildSimilarityExpression(DistanceMetric distanceMetric, String vectorType) {
String lhs = "(de.embedding_vector)::" + vectorType;
String rhs = "CAST(:queryVector AS " + vectorType + ")";
return switch (distanceMetric) {
// <=> is cosine distance; similarity = 1 - distance.
case COSINE -> "(1 - (" + lhs + " <=> " + rhs + "))";
// <#> yields the negative inner product, so it is multiplied by -1 to get the real one.
case INNER_PRODUCT -> "(-1 * (" + lhs + " <#> " + rhs + "))";
// <-> is Euclidean (L2) distance; both enum names share the same formula 1 / (1 + distance).
case EUCLIDEAN, L2 -> "(1 / (1 + (" + lhs + " <-> " + rhs + ")))";
};
} }
} }

@ -1,234 +0,0 @@
# TED Procurement Document Processor Configuration
# Author: Martin.Schweitzer@procon.co.at and claude.ai
server:
port: 8888
servlet:
context-path: /api
spring:
application:
name: ted-procurement-processor
datasource:
url: jdbc:postgresql://localhost:32333/RELM
username: ${DB_USERNAME:postgres}
password: ${DB_PASSWORD:pwd}
driver-class-name: org.postgresql.Driver
hikari:
maximum-pool-size: 5
minimum-idle: 2
connection-timeout: 30000
idle-timeout: 300000
max-lifetime: 900000
leak-detection-threshold: 120000 # 2 minutes - increased to avoid false positives with batch processing
jpa:
hibernate:
ddl-auto: none
show-sql: false
open-in-view: false
properties:
hibernate:
format_sql: true
default_schema: TED
jdbc:
batch_size: 25 # Match chunk size for optimal batch processing
order_inserts: true
order_updates: true
flyway:
enabled: true
locations: classpath:db/migration
baseline-on-migrate: true
create-schemas: true
schemas: TED
default-schema: TED
# Apache Camel Configuration
camel:
springboot:
main-run-controller: true
health:
enabled: true
# Less strict health checks for file consumers
consumers-enabled: false
# Custom Application Properties
ted:
# Directory configuration for file processing
input:
# Base directory for watching incoming TED XML files
directory: ${TED_INPUT_DIR:D:/ted.europe/extracted}
# File pattern to match (recursive scanning)
pattern: "**/*.xml"
# Move processed files to this directory
processed-directory: ${TED_PROCESSED_DIR:.processed}
# Move failed files to this directory
error-directory: ${TED_ERROR_DIR:.error}
# Polling interval in milliseconds
poll-interval: 5000
# Maximum messages per poll (reduced to prevent memory issues)
max-messages-per-poll: 10
# Schema validation configuration
schema:
# Enable/disable XSD validation
enabled: true
# Path to eForms SDK schemas (from Maven dependency or custom location)
path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
# Vectorization configuration
vectorization:
# Enable/disable async vectorization
enabled: true
# Use external HTTP API instead of subprocess
use-http-api: true
# Embedding service URL
api-url: http://localhost:8001
# Model name for sentence-transformers
model-name: intfloat/multilingual-e5-large
# Vector dimensions (must match model output)
dimensions: 1024
# Batch size for vectorization
batch-size: 16
# Thread pool size for async processing
thread-pool-size: 4
# Maximum text length for vectorization (characters)
max-text-length: 8192
# HTTP connection timeout (milliseconds)
connect-timeout: 10000
# HTTP socket/read timeout (milliseconds)
socket-timeout: 60000
# Maximum retries on connection failure
max-retries: 5
# Search configuration
search:
# Default page size for search results
default-page-size: 20
# Maximum page size
max-page-size: 100
# Similarity threshold for vector search (0.0 - 1.0)
similarity-threshold: 0.7
# TED Daily Package Download configuration
download:
# Enable/disable automatic package download
enabled: true
# Base URL for TED Daily Packages
base-url: https://ted.europa.eu/packages/daily/
# Download directory for tar.gz files
download-directory: D:/ted.europe/downloads
# Extract directory for XML files
extract-directory: D:/ted.europe/extracted
# Start year for downloads
start-year: 2015
# Max consecutive 404 errors before stopping
max-consecutive-404: 4
# Polling interval (milliseconds) - 2 minutes
poll-interval: 120000
# Download timeout (milliseconds) - 5 minutes
download-timeout: 300000
# Max concurrent downloads
max-concurrent-downloads: 2
# Delay between downloads (milliseconds) for rate limiting - 3 seconds
delay-between-downloads: 3000
# Delete tar.gz after extraction
delete-after-extraction: true
# Prioritize current year first
prioritize-current-year: false
# IMAP Mail configuration
mail:
# Enable/disable mail processing
enabled: true
# IMAP server hostname
host: host
# IMAP server port (993 for IMAPS)
port: 993
# Mail account username (email address)
username: ${MAIL_USERNAME:}
# Mail account password
password: ${MAIL_PASSWORD:}
# Use SSL/TLS connection
ssl: true
# Mail folder to read from
folder-name: INBOX
# Delete messages after processing
delete: false
# Mark messages as seen after processing (false = peek mode, don't mark as read)
seen: false
# Only process unseen messages
unseen: true
# Polling delay in milliseconds (1 minute)
delay: 60000
# Max messages per poll
max-messages-per-poll: 10
# Output directory for processed attachments
attachment-output-directory: D:/ted.europe/mail-attachments
# Enable/disable MIME file input processing
mime-input-enabled: true
# Input directory for MIME files (.eml)
mime-input-directory: D:/ted.europe/mime-input
# File pattern for MIME files (regex)
mime-input-pattern: .*\\.eml
# Polling interval for MIME input directory (milliseconds)
mime-input-poll-interval: 10000
# Solution Brief processing configuration
solution-brief:
# Enable/disable Solution Brief processing
enabled: true
# Input directory for Solution Brief PDF files
input-directory: C:/work/SolutionBrief
# Output directory for Excel result files (relative to input or absolute)
result-directory: ./result
# Number of top similar documents to include
top-k: 20
# Minimum similarity threshold (0.0-1.0)
similarity-threshold: 0.5
# Polling interval in milliseconds (30 seconds)
poll-interval: 30000
# File pattern for PDF files (regex)
file-pattern: .*\\.pdf
# Process files only once (idempotent)
idempotent: true
# Idempotent repository file path
idempotent-repository: ./solution-brief-processed.dat
# Data cleanup configuration
cleanup:
# Enable automatic cleanup of old documents
enabled: false
# Retention period in years (default: 10)
retention-years: 10
# Cron expression for cleanup schedule (default: daily at 2 AM)
cron: "0 0 2 * * *"
# Actuator endpoints
management:
endpoints:
web:
exposure:
include: health,info,metrics,camel
endpoint:
health:
show-details: when-authorized
# OpenAPI documentation
springdoc:
api-docs:
path: /v3/api-docs
swagger-ui:
path: /swagger-ui.html
operations-sorter: method
# Logging configuration
logging:
level:
at.procon.ted: INFO
at.procon.ted.camel.SolutionBriefRoute: INFO
org.apache.camel: INFO
org.hibernate.SQL: WARN
org.hibernate.type.descriptor.sql: WARN

@ -84,7 +84,7 @@ ted:
# Vectorization configuration # Vectorization configuration
vectorization: vectorization:
# Enable/disable async vectorization # Enable/disable async vectorization
enabled: false enabled: true
# Use external HTTP API instead of subprocess # Use external HTTP API instead of subprocess
use-http-api: true use-http-api: true
# Embedding service URL # Embedding service URL
@ -154,7 +154,7 @@ ted:
# TED Daily Package Download configuration # TED Daily Package Download configuration
download: download:
# Enable/disable automatic package download # Enable/disable automatic package download
enabled: false enabled: true
# Use service-based Camel route # Use service-based Camel route
use-service-based: false use-service-based: false
# Base URL for TED Daily Packages # Base URL for TED Daily Packages
@ -168,7 +168,7 @@ ted:
# Max consecutive 404 errors before stopping # Max consecutive 404 errors before stopping
max-consecutive-404: 4 max-consecutive-404: 4
# Polling interval (milliseconds) - 2 minutes # Polling interval (milliseconds) - 2 minutes
poll-interval: 1800000 poll-interval: 300000
# Retry interval for tail NOT_FOUND packages - 6 hours # Retry interval for tail NOT_FOUND packages - 6 hours
not-found-retry-interval: 21600000 not-found-retry-interval: 21600000
# Grace period after year end before a previous-year tail 404 is treated as final # Grace period after year end before a previous-year tail 404 is treated as final

@ -0,0 +1,39 @@
-- Hardening migration for DOC.doc_embedding (multi-model embedding storage).
-- Adds a positive-dimension check, a uniqueness guarantee per (representation, model),
-- and catalog comments describing how the table is meant to be queried and indexed.

-- Drop-then-add keeps this step re-runnable: an existing constraint of the same name is replaced.
ALTER TABLE DOC.doc_embedding
DROP CONSTRAINT IF EXISTS ck_doc_embedding_dimensions_positive;
-- NULL dimensions remain allowed; any non-NULL value must be greater than zero.
ALTER TABLE DOC.doc_embedding
ADD CONSTRAINT ck_doc_embedding_dimensions_positive
CHECK (embedding_dimensions IS NULL OR embedding_dimensions > 0);
-- Add the unique constraint only when no constraint with this name exists on the table yet.
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1
FROM pg_constraint
WHERE conname = 'uq_doc_embedding_representation_model'
AND conrelid = 'doc.doc_embedding'::regclass
) THEN
-- One embedding row per text representation and model.
ALTER TABLE DOC.doc_embedding
ADD CONSTRAINT uq_doc_embedding_representation_model
UNIQUE (representation_id, model_id);
END IF;
END $$;
-- Catalog comments: record the querying and indexing rules next to the schema objects.
COMMENT ON TABLE DOC.doc_embedding IS
'Option A multi-model embedding storage. Embeddings of different lengths may coexist in one table. Semantic search must always filter by model_id and embedding_dimensions.';
COMMENT ON COLUMN DOC.doc_embedding.embedding_dimensions IS
'Resolved dimension of the stored embedding. Used for validation, filtering, and model-specific vector casts.';
COMMENT ON COLUMN DOC.doc_embedding.embedding_vector IS
'Generic pgvector column without fixed dimension. Create per-model partial expression indexes with a fixed cast, e.g. ((embedding_vector::public.vector(1024)) vector_cosine_ops).';
-- The index itself is NOT created by this migration; the template below is documentation only.
-- Recommended partial ANN index pattern for active models:
-- CREATE INDEX idx_doc_embedding_<model_key>_hnsw
-- ON DOC.doc_embedding USING hnsw ((embedding_vector::public.vector(<DIMENSIONS>)) vector_cosine_ops)
-- WHERE model_id = '<MODEL_UUID>'::uuid
-- AND embedding_status = 'COMPLETED';
--
-- If you use inner product or euclidean distance for a model, pick the matching operator class:
-- vector_ip_ops
-- vector_l2_ops

@ -1,7 +1,7 @@
-- Slice 1 + Slice 2 generic search support for DOC documents. -- Slice 1 + Slice 2 generic search support for DOC documents.
-- Adds lexical-search support columns/indexes and pg_trgm extension. -- Adds lexical-search support columns/indexes and pg_trgm extension.
CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE EXTENSION IF NOT EXISTS pg_trgm with schema doc;
ALTER TABLE DOC.doc_text_representation ALTER TABLE DOC.doc_text_representation
ADD COLUMN IF NOT EXISTS search_config VARCHAR(64); ADD COLUMN IF NOT EXISTS search_config VARCHAR(64);
@ -15,12 +15,12 @@ CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector
CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm
ON DOC.doc_document ON DOC.doc_document
USING GIN (title gin_trgm_ops); USING GIN (title DOC.gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm
ON DOC.doc_document ON DOC.doc_document
USING GIN (summary gin_trgm_ops); USING GIN (summary DOC.gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm
ON DOC.doc_text_representation ON DOC.doc_text_representation
USING GIN (text_body gin_trgm_ops); USING GIN (text_body DOC.gin_trgm_ops);

@ -48,7 +48,7 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
TransactionAutoConfiguration.class, TransactionAutoConfiguration.class,
JdbcTemplateAutoConfiguration.class JdbcTemplateAutoConfiguration.class
}) })
@EnableConfigurationProperties({TedProcessorProperties.class, EmbeddingProperties.class}) @EnableConfigurationProperties({TedProcessorProperties.class})
@EntityScan(basePackages = { @EntityScan(basePackages = {
"at.procon.dip.domain.document.entity", "at.procon.dip.domain.document.entity",
"at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.tenant.entity",

Loading…
Cancel
Save