---

# Document Intelligence Platform Configuration
# Author: Martin.Schweitzer@procon.co.at and claude.ai
# Embedded web server: listen port and servlet context path.
server:
  port: 8889
  servlet:
    context-path: /api
# Spring Boot core settings: application name, PostgreSQL datasource,
# JPA/Hibernate options and Flyway migrations.
spring:
  application:
    name: document-intelligence-platform

  datasource:
    url: jdbc:postgresql://localhost:5432/RELM
    username: ${DB_USERNAME:postgres}
    # SECURITY(review): the fallback password is committed in plain text.
    # Supply DB_PASSWORD from the environment or a secret store and rotate
    # this credential. Quoted so the '#' and '!' can never be misparsed.
    password: '${DB_PASSWORD:P54!pcd#Wi}'
    driver-class-name: org.postgresql.Driver
    hikari:
      maximum-pool-size: 5
      minimum-idle: 2
      connection-timeout: 30000
      idle-timeout: 300000
      max-lifetime: 900000
      leak-detection-threshold: 120000  # 2 minutes - increased to avoid false positives with batch processing

  jpa:
    hibernate:
      # NOTE(review): Hibernate schema auto-update is active while Flyway
      # migrations are enabled below - confirm that both are intended.
      ddl-auto: update
    show-sql: false
    open-in-view: false
    properties:
      hibernate:
        dialect: org.hibernate.dialect.PostgreSQLDialect
        format_sql: true
        default_schema: TED
        jdbc:
          batch_size: 25  # Match chunk size for optimal batch processing
        order_inserts: true
        order_updates: true

  flyway:
    enabled: true
    locations: classpath:db/migration
    baseline-on-migrate: true
    create-schemas: true
    # Schemas managed by Flyway; TED is also the default schema.
    schemas:
      - TED
      - DOC
    default-schema: TED
# Apache Camel Configuration
camel:
  springboot:
    main-run-controller: true
  health:
    enabled: true
    # Less strict health checks for file consumers
    consumers-enabled: false
# Custom Application Properties
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file; every section is assumed to live under `ted` - confirm against
# the application's property bindings.
ted:
  # Directory configuration for file processing
  input:
    # Base directory for watching incoming TED XML files
    directory: ${TED_INPUT_DIR:/ted.europe/extracted}
    # File pattern to match (recursive scanning)
    pattern: "**/*.xml"
    # Move processed files to this directory
    processed-directory: ${TED_PROCESSED_DIR:.processed}
    # Move failed files to this directory
    error-directory: ${TED_ERROR_DIR:.error}
    # Polling interval in milliseconds
    poll-interval: 5000
    # Maximum messages per poll (reduced to prevent memory issues)
    max-messages-per-poll: 10

  # Schema validation configuration
  schema:
    # Enable/disable XSD validation
    enabled: true
    # Path to eForms SDK schemas (from Maven dependency or custom location)
    path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd

  # Vectorization configuration
  vectorization:
    # Enable/disable async vectorization
    enabled: false
    # Use external HTTP API instead of subprocess
    use-http-api: true
    # Embedding service URL
    api-url: http://172.20.240.18:8001
    # Model name for sentence-transformers
    model-name: intfloat/multilingual-e5-large
    # Vector dimensions (must match model output)
    dimensions: 1024
    # Batch size for vectorization
    batch-size: 16
    # Thread pool size for async processing
    thread-pool-size: 4
    # Maximum text length for vectorization (characters)
    max-text-length: 8192
    # HTTP connection timeout (milliseconds)
    connect-timeout: 10000
    # HTTP socket/read timeout (milliseconds)
    socket-timeout: 60000
    # Maximum retries on connection failure
    max-retries: 5
    # Phase 2: use generic DOC representation/embedding pipeline as primary vectorization path
    generic-pipeline-enabled: true
    # Keep legacy TED vector columns updated until semantic search is migrated
    dual-write-legacy-ted-vectors: true
    # Scheduler interval for generic embedding polling (milliseconds)
    generic-scheduler-period-ms: 6000
    # Builder identifier for primary TED semantic representations in DOC
    primary-representation-builder-key: ted-phase2-primary-representation
    # Provider key stored in DOC.doc_embedding_model
    embedding-provider: http-embedding-service

  # Search configuration
  search:
    # Default page size for search results
    default-page-size: 20
    # Maximum page size
    max-page-size: 100
    # Similarity threshold for vector search (0.0 - 1.0)
    similarity-threshold: 0.7

  # TED Daily Package Download configuration
  download:
    # Enable/disable automatic package download
    enabled: false
    # Base URL for TED Daily Packages
    base-url: https://ted.europa.eu/packages/daily/
    # Download directory for tar.gz files
    download-directory: /ted.europe/downloads
    # Extract directory for XML files
    extract-directory: /ted.europe/extracted
    # Start year for downloads
    start-year: 2023
    # Max consecutive 404 errors before stopping
    max-consecutive-404: 4
    # Polling interval (milliseconds) - 2 minutes
    poll-interval: 120000
    # Retry interval for tail NOT_FOUND packages (milliseconds) - 6 hours
    not-found-retry-interval: 21600000
    # Grace period after year end before a previous-year tail 404 is treated as final
    previous-year-grace-period-days: 30
    # Keep retrying current-year tail 404 packages indefinitely
    retry-current-year-not-found-indefinitely: true
    # Download timeout (milliseconds) - 5 minutes
    download-timeout: 300000
    # Max concurrent downloads
    max-concurrent-downloads: 2
    # Delay between downloads (milliseconds) for rate limiting - 3 seconds
    delay-between-downloads: 3000
    # Delete tar.gz after extraction
    delete-after-extraction: true
    # Prioritize current year first
    prioritize-current-year: false

  # IMAP Mail configuration
  mail:
    # Enable/disable mail processing
    enabled: false
    # IMAP server hostname
    host: mail.mymagenta.business
    # IMAP server port (993 for IMAPS)
    port: 993
    # Mail account username (email address)
    username: archiv@procon.co.at
    # Mail account password
    # SECURITY(review): the fallback password is committed in plain text.
    # Supply MAIL_PASSWORD from the environment or a secret store and rotate
    # this credential.
    password: '${MAIL_PASSWORD:worasigg}'
    # Use SSL/TLS connection
    ssl: true
    # Mail folder to read from
    folder-name: INBOX
    # Delete messages after processing
    delete: false
    # Mark messages as seen after processing (false = peek mode, don't mark as read)
    seen: false
    # Only process unseen messages
    unseen: true
    # Polling delay in milliseconds (1 minute)
    delay: 60000
    # Max messages per poll
    max-messages-per-poll: 10
    # Output directory for processed attachments
    attachment-output-directory: /ted.europe/mail-attachments
    # Enable/disable MIME file input processing
    mime-input-enabled: true
    # Input directory for MIME files (.eml)
    mime-input-directory: /ted.europe/mime-input
    # File pattern for MIME files (regex).
    # Single-quoted with ONE backslash: YAML does not process escapes outside
    # double quotes, so the former unquoted `.*\\.eml` carried two backslashes
    # and required a literal backslash in the file name.
    # NOTE(review): confirm the consumer does not unescape the value itself.
    mime-input-pattern: '.*\.eml'
    # Polling interval for MIME input directory (milliseconds)
    mime-input-poll-interval: 10000

  # Phase 3 TED projection configuration
  projection:
    # Enable/disable dual-write into the TED projection model on top of DOC.doc_document
    enabled: true
    # Optional startup backfill for legacy TED documents without a projection row yet
    startup-backfill-enabled: false
    # Maximum number of legacy TED documents to backfill during startup
    startup-backfill-limit: 250

  # Solution Brief processing configuration
  solution-brief:
    # Enable/disable Solution Brief processing
    enabled: false
    # Input directory for Solution Brief PDF files
    input-directory: C:/work/SolutionBrief
    # Output directory for Excel result files (relative to input or absolute)
    result-directory: ./result
    # Number of top similar documents to include
    top-k: 20
    # Minimum similarity threshold (0.0-1.0)
    similarity-threshold: 0.5
    # Polling interval in milliseconds (30 seconds)
    poll-interval: 30000
    # File pattern for PDF files (regex).
    # Single-quoted with ONE backslash for the same reason as
    # mail.mime-input-pattern: the former unquoted `.*\\.pdf` required a
    # literal backslash in the file name.
    file-pattern: '.*\.pdf'
    # Process files only once (idempotent)
    idempotent: true
    # Idempotent repository file path
    idempotent-repository: ./solution-brief-processed.dat

  # Data cleanup configuration
  cleanup:
    # Enable automatic cleanup of old documents
    enabled: false
    # Retention period in years (default: 10)
    retention-years: 10
    # Cron expression for cleanup schedule (default: daily at 2 AM)
    cron: "0 0 2 * * *"
# Actuator endpoints
management:
  endpoints:
    web:
      exposure:
        # Endpoints exposed over HTTP
        include: health,info,metrics,camel
  endpoint:
    health:
      show-details: when-authorized
# OpenAPI documentation
springdoc:
  api-docs:
    path: /v3/api-docs
  swagger-ui:
    path: /swagger-ui.html
    operations-sorter: method
# Logging configuration
logging:
  # Log level per logger name (package or class)
  level:
    at.procon.ted: INFO
    at.procon.ted.camel.SolutionBriefRoute: INFO
    org.apache.camel: INFO
    org.hibernate.SQL: WARN
    org.hibernate.type.descriptor.sql: WARN