embedding nv3.3
This commit is contained in:
parent
847cb40f8a
commit
d206529162
|
|
@ -0,0 +1,34 @@
|
||||||
|
# Option A semantic search hardening
|
||||||
|
|
||||||
|
This patch hardens the multi-model semantic search implementation in three places:
|
||||||
|
|
||||||
|
## 1. Semantic repository
|
||||||
|
- requires a positive model dimension
|
||||||
|
- requires a configured distance metric
|
||||||
|
- uses metric-aware SQL expressions:
|
||||||
|
- cosine -> `1 - distance`
|
||||||
|
- inner product -> `-1 * negative_inner_product`
|
||||||
|
- euclidean -> `1 / (1 + distance)`
|
||||||
|
|
||||||
|
## 2. Semantic engine
|
||||||
|
- resolves one explicit model per request
|
||||||
|
- validates:
|
||||||
|
- model active
|
||||||
|
- dimensions > 0
|
||||||
|
- distance metric configured
|
||||||
|
- query embedding mode supported
|
||||||
|
|
||||||
|
## 3. Database
|
||||||
|
- check constraint for positive dimensions
|
||||||
|
- unique constraint on `(representation_id, model_id)`
|
||||||
|
- comments documenting the per-model partial ANN index strategy
|
||||||
|
|
||||||
|
## Why this matters
|
||||||
|
|
||||||
|
With Option A, multiple vector lengths live in one `DOC.doc_embedding.embedding_vector` column. That is safe only if:
|
||||||
|
|
||||||
|
- every semantic query resolves exactly one model
|
||||||
|
- the query vector uses that same model
|
||||||
|
- the repository filters by `model_id`
|
||||||
|
- the vector cast uses the correct model dimension
|
||||||
|
- ANN indexes are created per active model
|
||||||
|
|
@ -6,5 +6,6 @@ package at.procon.dip.domain.document;
|
||||||
public enum DistanceMetric {
|
public enum DistanceMetric {
|
||||||
COSINE,
|
COSINE,
|
||||||
L2,
|
L2,
|
||||||
|
EUCLIDEAN,
|
||||||
INNER_PRODUCT
|
INNER_PRODUCT
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -8,9 +8,9 @@ import lombok.Data;
|
||||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
import org.springframework.context.annotation.Configuration;
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
|
||||||
@Configuration
|
|
||||||
@ConfigurationProperties(prefix = "dip.embedding")
|
@ConfigurationProperties(prefix = "dip.embedding")
|
||||||
@Data
|
@Data
|
||||||
|
@Configuration
|
||||||
public class EmbeddingProperties {
|
public class EmbeddingProperties {
|
||||||
|
|
||||||
private boolean enabled = false;
|
private boolean enabled = false;
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
||||||
String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
|
String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
|
||||||
|
|
||||||
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
||||||
|
/*
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
RepresentationType.FULLTEXT,
|
RepresentationType.FULLTEXT,
|
||||||
BUILDER_KEY,
|
BUILDER_KEY,
|
||||||
|
|
@ -53,6 +54,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.FALSE
|
Boolean.FALSE
|
||||||
));
|
));
|
||||||
|
*/
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
RepresentationType.SEMANTIC_TEXT,
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
BUILDER_KEY,
|
BUILDER_KEY,
|
||||||
|
|
@ -65,6 +67,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.TRUE
|
Boolean.TRUE
|
||||||
));
|
));
|
||||||
|
/*
|
||||||
if (StringUtils.hasText(title)) {
|
if (StringUtils.hasText(title)) {
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
RepresentationType.TITLE_ABSTRACT,
|
RepresentationType.TITLE_ABSTRACT,
|
||||||
|
|
@ -91,6 +94,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.FALSE
|
Boolean.FALSE
|
||||||
));
|
));
|
||||||
|
*/
|
||||||
return drafts;
|
return drafts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
package at.procon.dip.search.engine.semantic;
|
package at.procon.dip.search.engine.semantic;
|
||||||
|
|
||||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||||
|
import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
|
||||||
|
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
||||||
import at.procon.dip.search.api.SearchExecutionContext;
|
import at.procon.dip.search.api.SearchExecutionContext;
|
||||||
import at.procon.dip.search.dto.SearchEngineType;
|
import at.procon.dip.search.dto.SearchEngineType;
|
||||||
import at.procon.dip.search.dto.SearchHit;
|
import at.procon.dip.search.dto.SearchHit;
|
||||||
|
|
@ -10,13 +12,16 @@ import at.procon.dip.search.service.SemanticQueryEmbeddingService;
|
||||||
import at.procon.ted.config.TedProcessorProperties;
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
public class PgVectorSemanticSearchEngine implements SearchEngine {
|
public class PgVectorSemanticSearchEngine implements SearchEngine {
|
||||||
|
|
||||||
private final EmbeddingProperties embeddingProperties;
|
private final EmbeddingProperties embeddingProperties;
|
||||||
|
private final EmbeddingModelRegistry embeddingModelRegistry;
|
||||||
private final TedProcessorProperties properties;
|
private final TedProcessorProperties properties;
|
||||||
private final SemanticQueryEmbeddingService queryEmbeddingService;
|
private final SemanticQueryEmbeddingService queryEmbeddingService;
|
||||||
private final DocumentSemanticSearchRepository repository;
|
private final DocumentSemanticSearchRepository repository;
|
||||||
|
|
@ -35,15 +40,47 @@ public class PgVectorSemanticSearchEngine implements SearchEngine {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<SearchHit> execute(SearchExecutionContext context) {
|
public List<SearchHit> execute(SearchExecutionContext context) {
|
||||||
|
String requestedModelKey = context.getRequest().getSemanticModelKey();
|
||||||
|
EmbeddingModelDescriptor model = resolveModel(requestedModelKey);
|
||||||
|
validateModel(model);
|
||||||
|
|
||||||
return queryEmbeddingService.buildQueryEmbedding(
|
return queryEmbeddingService.buildQueryEmbedding(
|
||||||
context.getRequest().getQueryText(),
|
context.getRequest().getQueryText(),
|
||||||
context.getRequest().getSemanticModelKey())
|
model.modelKey())
|
||||||
.map(query -> repository.search(
|
.map(query -> repository.search(
|
||||||
context,
|
context,
|
||||||
query.modelId(),
|
query.modelId(),
|
||||||
|
model.dimensions(),
|
||||||
|
model.distanceMetric(),
|
||||||
query.vectorString(),
|
query.vectorString(),
|
||||||
properties.getSearch().getSemanticCandidateLimit(),
|
properties.getSearch().getSemanticCandidateLimit(),
|
||||||
properties.getSearch().getSimilarityThreshold()))
|
properties.getSearch().getSimilarityThreshold()))
|
||||||
.orElse(List.of());
|
.orElseGet(() -> {
|
||||||
|
log.debug("Semantic search skipped because query embedding could not be generated for model {}", model.modelKey());
|
||||||
|
return List.of();
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
private EmbeddingModelDescriptor resolveModel(String requestedModelKey) {
|
||||||
|
if (requestedModelKey != null && !requestedModelKey.isBlank()) {
|
||||||
|
return embeddingModelRegistry.getRequired(requestedModelKey);
|
||||||
|
}
|
||||||
|
String defaultModelKey = embeddingModelRegistry.getRequiredDefaultQueryModelKey();
|
||||||
|
return embeddingModelRegistry.getRequired(defaultModelKey);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void validateModel(EmbeddingModelDescriptor model) {
|
||||||
|
if (!model.active()) {
|
||||||
|
throw new IllegalStateException("Semantic search model is not active: " + model.modelKey());
|
||||||
|
}
|
||||||
|
if (model.dimensions() <= 0) {
|
||||||
|
throw new IllegalStateException("Semantic search model has invalid dimensions: " + model.modelKey() + " -> " + model.dimensions());
|
||||||
|
}
|
||||||
|
if (model.distanceMetric() == null) {
|
||||||
|
throw new IllegalStateException("Semantic search model has no distance metric configured: " + model.modelKey());
|
||||||
|
}
|
||||||
|
if (!model.supportsQueryEmbeddingMode()) {
|
||||||
|
throw new IllegalStateException("Semantic search model does not support query embedding mode: " + model.modelKey());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
package at.procon.dip.search.repository;
|
package at.procon.dip.search.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DistanceMetric;
|
||||||
import at.procon.dip.search.api.SearchExecutionContext;
|
import at.procon.dip.search.api.SearchExecutionContext;
|
||||||
import at.procon.dip.search.dto.SearchEngineType;
|
import at.procon.dip.search.dto.SearchEngineType;
|
||||||
import at.procon.dip.search.dto.SearchHit;
|
import at.procon.dip.search.dto.SearchHit;
|
||||||
|
|
@ -19,9 +20,22 @@ public class DocumentSemanticSearchRepository {
|
||||||
|
|
||||||
public List<SearchHit> search(SearchExecutionContext context,
|
public List<SearchHit> search(SearchExecutionContext context,
|
||||||
UUID modelId,
|
UUID modelId,
|
||||||
|
int modelDimensions,
|
||||||
|
DistanceMetric distanceMetric,
|
||||||
String queryVector,
|
String queryVector,
|
||||||
int limit,
|
int limit,
|
||||||
double threshold) {
|
double threshold) {
|
||||||
|
|
||||||
|
if (modelDimensions <= 0) {
|
||||||
|
throw new IllegalArgumentException("Semantic search requires a positive model dimension, got: " + modelDimensions);
|
||||||
|
}
|
||||||
|
if (distanceMetric == null) {
|
||||||
|
throw new IllegalArgumentException("Semantic search requires a distance metric");
|
||||||
|
}
|
||||||
|
|
||||||
|
String vectorType = "public.vector(" + modelDimensions + ")";
|
||||||
|
String similarityExpr = buildSimilarityExpression(distanceMetric, vectorType);
|
||||||
|
|
||||||
StringBuilder sql = new StringBuilder("""
|
StringBuilder sql = new StringBuilder("""
|
||||||
SELECT
|
SELECT
|
||||||
d.id AS document_id,
|
d.id AS document_id,
|
||||||
|
|
@ -41,7 +55,9 @@ public class DocumentSemanticSearchRepository {
|
||||||
d.created_at AS created_at,
|
d.created_at AS created_at,
|
||||||
d.updated_at AS updated_at,
|
d.updated_at AS updated_at,
|
||||||
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
||||||
(1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) AS score
|
""");
|
||||||
|
sql.append(similarityExpr).append(" AS score ");
|
||||||
|
sql.append("""
|
||||||
FROM doc.doc_embedding de
|
FROM doc.doc_embedding de
|
||||||
JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
|
JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
|
||||||
JOIN doc.doc_document d ON d.id = de.document_id
|
JOIN doc.doc_document d ON d.id = de.document_id
|
||||||
|
|
@ -49,18 +65,35 @@ public class DocumentSemanticSearchRepository {
|
||||||
WHERE de.embedding_status = 'COMPLETED'
|
WHERE de.embedding_status = 'COMPLETED'
|
||||||
AND de.embedding_vector IS NOT NULL
|
AND de.embedding_vector IS NOT NULL
|
||||||
AND de.model_id = :modelId
|
AND de.model_id = :modelId
|
||||||
AND (1 - (de.embedding_vector <=> CAST(:queryVector AS public.vector))) >= :threshold
|
AND de.embedding_dimensions = :modelDimensions
|
||||||
""");
|
AND """);
|
||||||
|
sql.append(similarityExpr).append(" >= :threshold ");
|
||||||
|
|
||||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||||
params.addValue("queryVector", queryVector);
|
params.addValue("queryVector", queryVector);
|
||||||
params.addValue("modelId", modelId);
|
params.addValue("modelId", modelId);
|
||||||
|
params.addValue("modelDimensions", modelDimensions);
|
||||||
params.addValue("threshold", threshold);
|
params.addValue("threshold", threshold);
|
||||||
|
|
||||||
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||||
params.addValue("limit", limit);
|
params.addValue("limit", limit);
|
||||||
|
|
||||||
return jdbcTemplate.query(sql.toString(), params,
|
return jdbcTemplate.query(
|
||||||
new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT));
|
sql.toString(),
|
||||||
|
params,
|
||||||
|
new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
private String buildSimilarityExpression(DistanceMetric distanceMetric, String vectorType) {
|
||||||
|
String lhs = "(de.embedding_vector)::" + vectorType;
|
||||||
|
String rhs = "CAST(:queryVector AS " + vectorType + ")";
|
||||||
|
|
||||||
|
return switch (distanceMetric) {
|
||||||
|
case COSINE -> "(1 - (" + lhs + " <=> " + rhs + "))";
|
||||||
|
case INNER_PRODUCT -> "(-1 * (" + lhs + " <#> " + rhs + "))";
|
||||||
|
case EUCLIDEAN, L2 -> "(1 / (1 + (" + lhs + " <-> " + rhs + ")))";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,234 +0,0 @@
|
||||||
# TED Procurement Document Processor Configuration
|
|
||||||
# Author: Martin.Schweitzer@procon.co.at and claude.ai
|
|
||||||
|
|
||||||
server:
|
|
||||||
port: 8888
|
|
||||||
servlet:
|
|
||||||
context-path: /api
|
|
||||||
|
|
||||||
spring:
|
|
||||||
application:
|
|
||||||
name: ted-procurement-processor
|
|
||||||
|
|
||||||
datasource:
|
|
||||||
url: jdbc:postgresql://localhost:32333/RELM
|
|
||||||
username: ${DB_USERNAME:postgres}
|
|
||||||
password: ${DB_PASSWORD:pwd}
|
|
||||||
driver-class-name: org.postgresql.Driver
|
|
||||||
hikari:
|
|
||||||
maximum-pool-size: 5
|
|
||||||
minimum-idle: 2
|
|
||||||
connection-timeout: 30000
|
|
||||||
idle-timeout: 300000
|
|
||||||
max-lifetime: 900000
|
|
||||||
leak-detection-threshold: 120000 # 2 minutes - increased to avoid false positives with batch processing
|
|
||||||
|
|
||||||
jpa:
|
|
||||||
hibernate:
|
|
||||||
ddl-auto: none
|
|
||||||
show-sql: false
|
|
||||||
open-in-view: false
|
|
||||||
properties:
|
|
||||||
hibernate:
|
|
||||||
format_sql: true
|
|
||||||
default_schema: TED
|
|
||||||
jdbc:
|
|
||||||
batch_size: 25 # Match chunk size for optimal batch processing
|
|
||||||
order_inserts: true
|
|
||||||
order_updates: true
|
|
||||||
|
|
||||||
flyway:
|
|
||||||
enabled: true
|
|
||||||
locations: classpath:db/migration
|
|
||||||
baseline-on-migrate: true
|
|
||||||
create-schemas: true
|
|
||||||
schemas: TED
|
|
||||||
default-schema: TED
|
|
||||||
|
|
||||||
# Apache Camel Configuration
|
|
||||||
camel:
|
|
||||||
springboot:
|
|
||||||
main-run-controller: true
|
|
||||||
health:
|
|
||||||
enabled: true
|
|
||||||
# Less strict health checks for file consumers
|
|
||||||
consumers-enabled: false
|
|
||||||
|
|
||||||
# Custom Application Properties
|
|
||||||
ted:
|
|
||||||
# Directory configuration for file processing
|
|
||||||
input:
|
|
||||||
# Base directory for watching incoming TED XML files
|
|
||||||
directory: ${TED_INPUT_DIR:D:/ted.europe/extracted}
|
|
||||||
# File pattern to match (recursive scanning)
|
|
||||||
pattern: "**/*.xml"
|
|
||||||
# Move processed files to this directory
|
|
||||||
processed-directory: ${TED_PROCESSED_DIR:.processed}
|
|
||||||
# Move failed files to this directory
|
|
||||||
error-directory: ${TED_ERROR_DIR:.error}
|
|
||||||
# Polling interval in milliseconds
|
|
||||||
poll-interval: 5000
|
|
||||||
# Maximum messages per poll (reduced to prevent memory issues)
|
|
||||||
max-messages-per-poll: 10
|
|
||||||
|
|
||||||
# Schema validation configuration
|
|
||||||
schema:
|
|
||||||
# Enable/disable XSD validation
|
|
||||||
enabled: true
|
|
||||||
# Path to eForms SDK schemas (from Maven dependency or custom location)
|
|
||||||
path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
|
|
||||||
|
|
||||||
# Vectorization configuration
|
|
||||||
vectorization:
|
|
||||||
# Enable/disable async vectorization
|
|
||||||
enabled: true
|
|
||||||
# Use external HTTP API instead of subprocess
|
|
||||||
use-http-api: true
|
|
||||||
# Embedding service URL
|
|
||||||
api-url: http://localhost:8001
|
|
||||||
# Model name for sentence-transformers
|
|
||||||
model-name: intfloat/multilingual-e5-large
|
|
||||||
# Vector dimensions (must match model output)
|
|
||||||
dimensions: 1024
|
|
||||||
# Batch size for vectorization
|
|
||||||
batch-size: 16
|
|
||||||
# Thread pool size for async processing
|
|
||||||
thread-pool-size: 4
|
|
||||||
# Maximum text length for vectorization (characters)
|
|
||||||
max-text-length: 8192
|
|
||||||
# HTTP connection timeout (milliseconds)
|
|
||||||
connect-timeout: 10000
|
|
||||||
# HTTP socket/read timeout (milliseconds)
|
|
||||||
socket-timeout: 60000
|
|
||||||
# Maximum retries on connection failure
|
|
||||||
max-retries: 5
|
|
||||||
|
|
||||||
# Search configuration
|
|
||||||
search:
|
|
||||||
# Default page size for search results
|
|
||||||
default-page-size: 20
|
|
||||||
# Maximum page size
|
|
||||||
max-page-size: 100
|
|
||||||
# Similarity threshold for vector search (0.0 - 1.0)
|
|
||||||
similarity-threshold: 0.7
|
|
||||||
|
|
||||||
# TED Daily Package Download configuration
|
|
||||||
download:
|
|
||||||
# Enable/disable automatic package download
|
|
||||||
enabled: true
|
|
||||||
# Base URL for TED Daily Packages
|
|
||||||
base-url: https://ted.europa.eu/packages/daily/
|
|
||||||
# Download directory for tar.gz files
|
|
||||||
download-directory: D:/ted.europe/downloads
|
|
||||||
# Extract directory for XML files
|
|
||||||
extract-directory: D:/ted.europe/extracted
|
|
||||||
# Start year for downloads
|
|
||||||
start-year: 2015
|
|
||||||
# Max consecutive 404 errors before stopping
|
|
||||||
max-consecutive-404: 4
|
|
||||||
# Polling interval (milliseconds) - 2 minutes
|
|
||||||
poll-interval: 120000
|
|
||||||
# Download timeout (milliseconds) - 5 minutes
|
|
||||||
download-timeout: 300000
|
|
||||||
# Max concurrent downloads
|
|
||||||
max-concurrent-downloads: 2
|
|
||||||
# Delay between downloads (milliseconds) for rate limiting - 3 seconds
|
|
||||||
delay-between-downloads: 3000
|
|
||||||
# Delete tar.gz after extraction
|
|
||||||
delete-after-extraction: true
|
|
||||||
# Prioritize current year first
|
|
||||||
prioritize-current-year: false
|
|
||||||
|
|
||||||
# IMAP Mail configuration
|
|
||||||
mail:
|
|
||||||
# Enable/disable mail processing
|
|
||||||
enabled: true
|
|
||||||
# IMAP server hostname
|
|
||||||
host: host
|
|
||||||
# IMAP server port (993 for IMAPS)
|
|
||||||
port: 993
|
|
||||||
# Mail account username (email address)
|
|
||||||
username: ${MAIL_USERNAME:}
|
|
||||||
# Mail account password
|
|
||||||
password: ${MAIL_PASSWORD:}
|
|
||||||
# Use SSL/TLS connection
|
|
||||||
ssl: true
|
|
||||||
# Mail folder to read from
|
|
||||||
folder-name: INBOX
|
|
||||||
# Delete messages after processing
|
|
||||||
delete: false
|
|
||||||
# Mark messages as seen after processing (false = peek mode, don't mark as read)
|
|
||||||
seen: false
|
|
||||||
# Only process unseen messages
|
|
||||||
unseen: true
|
|
||||||
# Polling delay in milliseconds (1 minute)
|
|
||||||
delay: 60000
|
|
||||||
# Max messages per poll
|
|
||||||
max-messages-per-poll: 10
|
|
||||||
# Output directory for processed attachments
|
|
||||||
attachment-output-directory: D:/ted.europe/mail-attachments
|
|
||||||
# Enable/disable MIME file input processing
|
|
||||||
mime-input-enabled: true
|
|
||||||
# Input directory for MIME files (.eml)
|
|
||||||
mime-input-directory: D:/ted.europe/mime-input
|
|
||||||
# File pattern for MIME files (regex)
|
|
||||||
mime-input-pattern: .*\\.eml
|
|
||||||
# Polling interval for MIME input directory (milliseconds)
|
|
||||||
mime-input-poll-interval: 10000
|
|
||||||
|
|
||||||
# Solution Brief processing configuration
|
|
||||||
solution-brief:
|
|
||||||
# Enable/disable Solution Brief processing
|
|
||||||
enabled: true
|
|
||||||
# Input directory for Solution Brief PDF files
|
|
||||||
input-directory: C:/work/SolutionBrief
|
|
||||||
# Output directory for Excel result files (relative to input or absolute)
|
|
||||||
result-directory: ./result
|
|
||||||
# Number of top similar documents to include
|
|
||||||
top-k: 20
|
|
||||||
# Minimum similarity threshold (0.0-1.0)
|
|
||||||
similarity-threshold: 0.5
|
|
||||||
# Polling interval in milliseconds (30 seconds)
|
|
||||||
poll-interval: 30000
|
|
||||||
# File pattern for PDF files (regex)
|
|
||||||
file-pattern: .*\\.pdf
|
|
||||||
# Process files only once (idempotent)
|
|
||||||
idempotent: true
|
|
||||||
# Idempotent repository file path
|
|
||||||
idempotent-repository: ./solution-brief-processed.dat
|
|
||||||
|
|
||||||
# Data cleanup configuration
|
|
||||||
cleanup:
|
|
||||||
# Enable automatic cleanup of old documents
|
|
||||||
enabled: false
|
|
||||||
# Retention period in years (default: 10)
|
|
||||||
retention-years: 10
|
|
||||||
# Cron expression for cleanup schedule (default: daily at 2 AM)
|
|
||||||
cron: "0 0 2 * * *"
|
|
||||||
|
|
||||||
# Actuator endpoints
|
|
||||||
management:
|
|
||||||
endpoints:
|
|
||||||
web:
|
|
||||||
exposure:
|
|
||||||
include: health,info,metrics,camel
|
|
||||||
endpoint:
|
|
||||||
health:
|
|
||||||
show-details: when-authorized
|
|
||||||
|
|
||||||
# OpenAPI documentation
|
|
||||||
springdoc:
|
|
||||||
api-docs:
|
|
||||||
path: /v3/api-docs
|
|
||||||
swagger-ui:
|
|
||||||
path: /swagger-ui.html
|
|
||||||
operations-sorter: method
|
|
||||||
|
|
||||||
# Logging configuration
|
|
||||||
logging:
|
|
||||||
level:
|
|
||||||
at.procon.ted: INFO
|
|
||||||
at.procon.ted.camel.SolutionBriefRoute: INFO
|
|
||||||
org.apache.camel: INFO
|
|
||||||
org.hibernate.SQL: WARN
|
|
||||||
org.hibernate.type.descriptor.sql: WARN
|
|
||||||
|
|
@ -84,7 +84,7 @@ ted:
|
||||||
# Vectorization configuration
|
# Vectorization configuration
|
||||||
vectorization:
|
vectorization:
|
||||||
# Enable/disable async vectorization
|
# Enable/disable async vectorization
|
||||||
enabled: false
|
enabled: true
|
||||||
# Use external HTTP API instead of subprocess
|
# Use external HTTP API instead of subprocess
|
||||||
use-http-api: true
|
use-http-api: true
|
||||||
# Embedding service URL
|
# Embedding service URL
|
||||||
|
|
@ -154,7 +154,7 @@ ted:
|
||||||
# TED Daily Package Download configuration
|
# TED Daily Package Download configuration
|
||||||
download:
|
download:
|
||||||
# Enable/disable automatic package download
|
# Enable/disable automatic package download
|
||||||
enabled: false
|
enabled: true
|
||||||
# Use service-based Camel route
|
# Use service-based Camel route
|
||||||
use-service-based: false
|
use-service-based: false
|
||||||
# Base URL for TED Daily Packages
|
# Base URL for TED Daily Packages
|
||||||
|
|
@ -168,7 +168,7 @@ ted:
|
||||||
# Max consecutive 404 errors before stopping
|
# Max consecutive 404 errors before stopping
|
||||||
max-consecutive-404: 4
|
max-consecutive-404: 4
|
||||||
# Polling interval (milliseconds) - 2 minutes
|
# Polling interval (milliseconds) - 5 minutes
|
||||||
poll-interval: 1800000
|
poll-interval: 300000
|
||||||
# Retry interval for tail NOT_FOUND packages - 6 hours
|
# Retry interval for tail NOT_FOUND packages - 6 hours
|
||||||
not-found-retry-interval: 21600000
|
not-found-retry-interval: 21600000
|
||||||
# Grace period after year end before a previous-year tail 404 is treated as final
|
# Grace period after year end before a previous-year tail 404 is treated as final
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,39 @@
|
||||||
|
ALTER TABLE DOC.doc_embedding
|
||||||
|
DROP CONSTRAINT IF EXISTS ck_doc_embedding_dimensions_positive;
|
||||||
|
|
||||||
|
ALTER TABLE DOC.doc_embedding
|
||||||
|
ADD CONSTRAINT ck_doc_embedding_dimensions_positive
|
||||||
|
CHECK (embedding_dimensions IS NULL OR embedding_dimensions > 0);
|
||||||
|
|
||||||
|
DO $$
|
||||||
|
BEGIN
|
||||||
|
IF NOT EXISTS (
|
||||||
|
SELECT 1
|
||||||
|
FROM pg_constraint
|
||||||
|
WHERE conname = 'uq_doc_embedding_representation_model'
|
||||||
|
AND conrelid = 'doc.doc_embedding'::regclass
|
||||||
|
) THEN
|
||||||
|
ALTER TABLE DOC.doc_embedding
|
||||||
|
ADD CONSTRAINT uq_doc_embedding_representation_model
|
||||||
|
UNIQUE (representation_id, model_id);
|
||||||
|
END IF;
|
||||||
|
END $$;
|
||||||
|
|
||||||
|
COMMENT ON TABLE DOC.doc_embedding IS
|
||||||
|
'Option A multi-model embedding storage. Embeddings of different lengths may coexist in one table. Semantic search must always filter by model_id and embedding_dimensions.';
|
||||||
|
|
||||||
|
COMMENT ON COLUMN DOC.doc_embedding.embedding_dimensions IS
|
||||||
|
'Resolved dimension of the stored embedding. Used for validation, filtering, and model-specific vector casts.';
|
||||||
|
|
||||||
|
COMMENT ON COLUMN DOC.doc_embedding.embedding_vector IS
|
||||||
|
'Generic pgvector column without fixed dimension. Create per-model partial expression indexes with a fixed cast, e.g. ((embedding_vector::public.vector(1024)) vector_cosine_ops).';
|
||||||
|
|
||||||
|
-- Recommended partial ANN index pattern for active models:
|
||||||
|
-- CREATE INDEX idx_doc_embedding_<model_key>_hnsw
|
||||||
|
-- ON DOC.doc_embedding USING hnsw ((embedding_vector::public.vector(<DIMENSIONS>)) vector_cosine_ops)
|
||||||
|
-- WHERE model_id = '<MODEL_UUID>'::uuid
|
||||||
|
-- AND embedding_status = 'COMPLETED';
|
||||||
|
--
|
||||||
|
-- If you use inner product or euclidean distance for a model, pick the matching operator class:
|
||||||
|
-- vector_ip_ops
|
||||||
|
-- vector_l2_ops
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
-- Slice 1 + Slice 2 generic search support for DOC documents.
|
-- Slice 1 + Slice 2 generic search support for DOC documents.
|
||||||
-- Adds lexical-search support columns/indexes and pg_trgm extension.
|
-- Adds lexical-search support columns/indexes and pg_trgm extension.
|
||||||
|
|
||||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
CREATE EXTENSION IF NOT EXISTS pg_trgm with schema doc;
|
||||||
|
|
||||||
ALTER TABLE DOC.doc_text_representation
|
ALTER TABLE DOC.doc_text_representation
|
||||||
ADD COLUMN IF NOT EXISTS search_config VARCHAR(64);
|
ADD COLUMN IF NOT EXISTS search_config VARCHAR(64);
|
||||||
|
|
@ -15,12 +15,12 @@ CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm
|
CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm
|
||||||
ON DOC.doc_document
|
ON DOC.doc_document
|
||||||
USING GIN (title gin_trgm_ops);
|
USING GIN (title DOC.gin_trgm_ops);
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm
|
CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm
|
||||||
ON DOC.doc_document
|
ON DOC.doc_document
|
||||||
USING GIN (summary gin_trgm_ops);
|
USING GIN (summary DOC.gin_trgm_ops);
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm
|
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm
|
||||||
ON DOC.doc_text_representation
|
ON DOC.doc_text_representation
|
||||||
USING GIN (text_body gin_trgm_ops);
|
USING GIN (text_body DOC.gin_trgm_ops);
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||||
TransactionAutoConfiguration.class,
|
TransactionAutoConfiguration.class,
|
||||||
JdbcTemplateAutoConfiguration.class
|
JdbcTemplateAutoConfiguration.class
|
||||||
})
|
})
|
||||||
@EnableConfigurationProperties({TedProcessorProperties.class, EmbeddingProperties.class})
|
@EnableConfigurationProperties({TedProcessorProperties.class})
|
||||||
@EntityScan(basePackages = {
|
@EntityScan(basePackages = {
|
||||||
"at.procon.dip.domain.document.entity",
|
"at.procon.dip.domain.document.entity",
|
||||||
"at.procon.dip.domain.tenant.entity",
|
"at.procon.dip.domain.tenant.entity",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue