embedding nv3.3
parent
847cb40f8a
commit
d206529162
@ -0,0 +1,34 @@
|
||||
# Option A semantic search hardening
|
||||
|
||||
This patch hardens the multi-model semantic search implementation in three places:
|
||||
|
||||
## 1. Semantic repository
|
||||
- requires a positive model dimension
|
||||
- requires a configured distance metric
|
||||
- uses metric-aware SQL expressions:
|
||||
- cosine -> `1 - distance`
|
||||
- inner product -> `-1 * negative_inner_product`
|
||||
- euclidean -> `1 / (1 + distance)`
|
||||
|
||||
## 2. Semantic engine
|
||||
- resolves one explicit model per request
|
||||
- validates:
|
||||
- model active
|
||||
- dimensions > 0
|
||||
- distance metric configured
|
||||
- query embedding mode supported
|
||||
|
||||
## 3. Database
|
||||
- check constraint for positive dimensions
|
||||
- unique constraint on `(representation_id, model_id)`
|
||||
- comments documenting the per-model partial ANN index strategy
|
||||
|
||||
## Why this matters
|
||||
|
||||
With Option A, multiple vector lengths live in one `DOC.doc_embedding.embedding_vector` column. That is safe only if:
|
||||
|
||||
- every semantic query resolves exactly one model
|
||||
- the query vector uses that same model
|
||||
- the repository filters by `model_id`
|
||||
- the vector cast uses the correct model dimension
|
||||
- ANN indexes are created per active model
|
||||
@ -1,234 +0,0 @@
|
||||
# TED Procurement Document Processor Configuration
|
||||
# Author: Martin.Schweitzer@procon.co.at and claude.ai
|
||||
|
||||
server:
|
||||
port: 8888
|
||||
servlet:
|
||||
context-path: /api
|
||||
|
||||
spring:
|
||||
application:
|
||||
name: ted-procurement-processor
|
||||
|
||||
datasource:
|
||||
url: jdbc:postgresql://localhost:32333/RELM
|
||||
username: ${DB_USERNAME:postgres}
|
||||
password: ${DB_PASSWORD:pwd}
|
||||
driver-class-name: org.postgresql.Driver
|
||||
hikari:
|
||||
maximum-pool-size: 5
|
||||
minimum-idle: 2
|
||||
connection-timeout: 30000
|
||||
idle-timeout: 300000
|
||||
max-lifetime: 900000
|
||||
leak-detection-threshold: 120000 # 2 minutes - increased to avoid false positives with batch processing
|
||||
|
||||
jpa:
|
||||
hibernate:
|
||||
ddl-auto: none
|
||||
show-sql: false
|
||||
open-in-view: false
|
||||
properties:
|
||||
hibernate:
|
||||
format_sql: true
|
||||
default_schema: TED
|
||||
jdbc:
|
||||
batch_size: 25 # Match chunk size for optimal batch processing
|
||||
order_inserts: true
|
||||
order_updates: true
|
||||
|
||||
flyway:
|
||||
enabled: true
|
||||
locations: classpath:db/migration
|
||||
baseline-on-migrate: true
|
||||
create-schemas: true
|
||||
schemas: TED
|
||||
default-schema: TED
|
||||
|
||||
# Apache Camel Configuration
|
||||
camel:
|
||||
springboot:
|
||||
main-run-controller: true
|
||||
health:
|
||||
enabled: true
|
||||
# Less strict health checks for file consumers
|
||||
consumers-enabled: false
|
||||
|
||||
# Custom Application Properties
|
||||
ted:
|
||||
# Directory configuration for file processing
|
||||
input:
|
||||
# Base directory for watching incoming TED XML files
|
||||
directory: ${TED_INPUT_DIR:D:/ted.europe/extracted}
|
||||
# File pattern to match (recursive scanning)
|
||||
pattern: "**/*.xml"
|
||||
# Move processed files to this directory
|
||||
processed-directory: ${TED_PROCESSED_DIR:.processed}
|
||||
# Move failed files to this directory
|
||||
error-directory: ${TED_ERROR_DIR:.error}
|
||||
# Polling interval in milliseconds
|
||||
poll-interval: 5000
|
||||
# Maximum messages per poll (reduced to prevent memory issues)
|
||||
max-messages-per-poll: 10
|
||||
|
||||
# Schema validation configuration
|
||||
schema:
|
||||
# Enable/disable XSD validation
|
||||
enabled: true
|
||||
# Path to eForms SDK schemas (from Maven dependency or custom location)
|
||||
path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
|
||||
|
||||
# Vectorization configuration
|
||||
vectorization:
|
||||
# Enable/disable async vectorization
|
||||
enabled: true
|
||||
# Use external HTTP API instead of subprocess
|
||||
use-http-api: true
|
||||
# Embedding service URL
|
||||
api-url: http://localhost:8001
|
||||
# Model name for sentence-transformers
|
||||
model-name: intfloat/multilingual-e5-large
|
||||
# Vector dimensions (must match model output)
|
||||
dimensions: 1024
|
||||
# Batch size for vectorization
|
||||
batch-size: 16
|
||||
# Thread pool size for async processing
|
||||
thread-pool-size: 4
|
||||
# Maximum text length for vectorization (characters)
|
||||
max-text-length: 8192
|
||||
# HTTP connection timeout (milliseconds)
|
||||
connect-timeout: 10000
|
||||
# HTTP socket/read timeout (milliseconds)
|
||||
socket-timeout: 60000
|
||||
# Maximum retries on connection failure
|
||||
max-retries: 5
|
||||
|
||||
# Search configuration
|
||||
search:
|
||||
# Default page size for search results
|
||||
default-page-size: 20
|
||||
# Maximum page size
|
||||
max-page-size: 100
|
||||
# Similarity threshold for vector search (0.0 - 1.0)
|
||||
similarity-threshold: 0.7
|
||||
|
||||
# TED Daily Package Download configuration
|
||||
download:
|
||||
# Enable/disable automatic package download
|
||||
enabled: true
|
||||
# Base URL for TED Daily Packages
|
||||
base-url: https://ted.europa.eu/packages/daily/
|
||||
# Download directory for tar.gz files
|
||||
download-directory: D:/ted.europe/downloads
|
||||
# Extract directory for XML files
|
||||
extract-directory: D:/ted.europe/extracted
|
||||
# Start year for downloads
|
||||
start-year: 2015
|
||||
# Max consecutive 404 errors before stopping
|
||||
max-consecutive-404: 4
|
||||
# Polling interval (milliseconds) - 2 minutes
|
||||
poll-interval: 120000
|
||||
# Download timeout (milliseconds) - 5 minutes
|
||||
download-timeout: 300000
|
||||
# Max concurrent downloads
|
||||
max-concurrent-downloads: 2
|
||||
# Delay between downloads (milliseconds) for rate limiting - 3 seconds
|
||||
delay-between-downloads: 3000
|
||||
# Delete tar.gz after extraction
|
||||
delete-after-extraction: true
|
||||
# Prioritize current year first
|
||||
prioritize-current-year: false
|
||||
|
||||
# IMAP Mail configuration
|
||||
mail:
|
||||
# Enable/disable mail processing
|
||||
enabled: true
|
||||
# IMAP server hostname
|
||||
host: host
|
||||
# IMAP server port (993 for IMAPS)
|
||||
port: 993
|
||||
# Mail account username (email address)
|
||||
username: ${MAIL_USERNAME:}
|
||||
# Mail account password
|
||||
password: ${MAIL_PASSWORD:}
|
||||
# Use SSL/TLS connection
|
||||
ssl: true
|
||||
# Mail folder to read from
|
||||
folder-name: INBOX
|
||||
# Delete messages after processing
|
||||
delete: false
|
||||
# Mark messages as seen after processing (false = peek mode, don't mark as read)
|
||||
seen: false
|
||||
# Only process unseen messages
|
||||
unseen: true
|
||||
# Polling delay in milliseconds (1 minute)
|
||||
delay: 60000
|
||||
# Max messages per poll
|
||||
max-messages-per-poll: 10
|
||||
# Output directory for processed attachments
|
||||
attachment-output-directory: D:/ted.europe/mail-attachments
|
||||
# Enable/disable MIME file input processing
|
||||
mime-input-enabled: true
|
||||
# Input directory for MIME files (.eml)
|
||||
mime-input-directory: D:/ted.europe/mime-input
|
||||
# File pattern for MIME files (regex)
|
||||
mime-input-pattern: .*\\.eml
|
||||
# Polling interval for MIME input directory (milliseconds)
|
||||
mime-input-poll-interval: 10000
|
||||
|
||||
# Solution Brief processing configuration
|
||||
solution-brief:
|
||||
# Enable/disable Solution Brief processing
|
||||
enabled: true
|
||||
# Input directory for Solution Brief PDF files
|
||||
input-directory: C:/work/SolutionBrief
|
||||
# Output directory for Excel result files (relative to input or absolute)
|
||||
result-directory: ./result
|
||||
# Number of top similar documents to include
|
||||
top-k: 20
|
||||
# Minimum similarity threshold (0.0-1.0)
|
||||
similarity-threshold: 0.5
|
||||
# Polling interval in milliseconds (30 seconds)
|
||||
poll-interval: 30000
|
||||
# File pattern for PDF files (regex)
|
||||
file-pattern: .*\\.pdf
|
||||
# Process files only once (idempotent)
|
||||
idempotent: true
|
||||
# Idempotent repository file path
|
||||
idempotent-repository: ./solution-brief-processed.dat
|
||||
|
||||
# Data cleanup configuration
|
||||
cleanup:
|
||||
# Enable automatic cleanup of old documents
|
||||
enabled: false
|
||||
# Retention period in years (default: 10)
|
||||
retention-years: 10
|
||||
# Cron expression for cleanup schedule (default: daily at 2 AM)
|
||||
cron: "0 0 2 * * *"
|
||||
|
||||
# Actuator endpoints
|
||||
management:
|
||||
endpoints:
|
||||
web:
|
||||
exposure:
|
||||
include: health,info,metrics,camel
|
||||
endpoint:
|
||||
health:
|
||||
show-details: when-authorized
|
||||
|
||||
# OpenAPI documentation
|
||||
springdoc:
|
||||
api-docs:
|
||||
path: /v3/api-docs
|
||||
swagger-ui:
|
||||
path: /swagger-ui.html
|
||||
operations-sorter: method
|
||||
|
||||
# Logging configuration
|
||||
logging:
|
||||
level:
|
||||
at.procon.ted: INFO
|
||||
at.procon.ted.camel.SolutionBriefRoute: INFO
|
||||
org.apache.camel: INFO
|
||||
org.hibernate.SQL: WARN
|
||||
org.hibernate.type.descriptor.sql: WARN
|
||||
@ -0,0 +1,39 @@
|
||||
-- Recreate the positive-dimensions check constraint so this script can be re-run:
-- drop it if it already exists, then add it again with the current definition.
ALTER TABLE DOC.doc_embedding
|
||||
DROP CONSTRAINT IF EXISTS ck_doc_embedding_dimensions_positive;
|
||||
|
||||
ALTER TABLE DOC.doc_embedding
|
||||
ADD CONSTRAINT ck_doc_embedding_dimensions_positive
|
||||
-- NULL is accepted; any non-NULL embedding_dimensions must be greater than zero.
CHECK (embedding_dimensions IS NULL OR embedding_dimensions > 0);
|
||||
|
||||
-- Add the unique constraint on (representation_id, model_id) only when no
-- constraint with that name is registered for doc.doc_embedding yet, so that
-- re-running the script does not fail on an already existing constraint.
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT 1
|
||||
FROM pg_constraint
|
||||
-- The lookup matches on constraint name and owning table only, not on constraint type.
WHERE conname = 'uq_doc_embedding_representation_model'
|
||||
AND conrelid = 'doc.doc_embedding'::regclass
|
||||
) THEN
|
||||
ALTER TABLE DOC.doc_embedding
|
||||
ADD CONSTRAINT uq_doc_embedding_representation_model
|
||||
UNIQUE (representation_id, model_id);
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
-- Catalog comments that record the multi-model storage rules on the table itself
-- and on its embedding_dimensions and embedding_vector columns.
COMMENT ON TABLE DOC.doc_embedding IS
|
||||
'Option A multi-model embedding storage. Embeddings of different lengths may coexist in one table. Semantic search must always filter by model_id and embedding_dimensions.';
|
||||
|
||||
COMMENT ON COLUMN DOC.doc_embedding.embedding_dimensions IS
|
||||
'Resolved dimension of the stored embedding. Used for validation, filtering, and model-specific vector casts.';
|
||||
|
||||
COMMENT ON COLUMN DOC.doc_embedding.embedding_vector IS
|
||||
'Generic pgvector column without fixed dimension. Create per-model partial expression indexes with a fixed cast, e.g. ((embedding_vector::public.vector(1024)) vector_cosine_ops).';
|
||||
|
||||
-- Recommended partial ANN index pattern for active models:
|
||||
-- CREATE INDEX idx_doc_embedding_<model_key>_hnsw
|
||||
-- ON DOC.doc_embedding USING hnsw ((embedding_vector::public.vector(<DIMENSIONS>)) vector_cosine_ops)
|
||||
-- WHERE model_id = '<MODEL_UUID>'::uuid
|
||||
-- AND embedding_status = 'COMPLETED';
|
||||
--
|
||||
-- If you use inner product or euclidean distance for a model, pick the matching operator class:
|
||||
-- vector_ip_ops
|
||||
-- vector_l2_ops
|
||||
Loading…
Reference in New Issue