# Document Intelligence Platform Configuration
# Author: Martin.Schweitzer@procon.co.at and claude.ai
#
# NOTE(review): the key nesting below was reconstructed from the property
# names because the source had lost its line breaks. Confirm it against the
# code that binds these properties - in particular that `mail`, `projection`,
# `solution-brief` and `cleanup` belong under `ted`.

# Embedded web server
server:
  port: 8889
  servlet:
    context-path: /api

spring:
  application:
    name: document-intelligence-platform

  # PostgreSQL connection; credentials are read from the environment
  datasource:
    url: jdbc:postgresql://localhost:5432/RELM
    username: ${DB_USERNAME:postgres}
    # SECURITY NOTE(review): the fallback after ':' is a real-looking
    # credential stored in the config file. Supply DB_PASSWORD from the
    # environment and remove the default once all deployments do so.
    # Quoted so the '#' inside the value can never be read as a comment.
    password: '${DB_PASSWORD:P54!pcd#Wi}'
    driver-class-name: org.postgresql.Driver
    # HikariCP connection pool (all durations in milliseconds)
    hikari:
      maximum-pool-size: 5
      minimum-idle: 2
      connection-timeout: 30000
      idle-timeout: 300000
      max-lifetime: 900000
      # 2 minutes - increased to avoid false positives with batch processing
      leak-detection-threshold: 120000

  jpa:
    hibernate:
      ddl-auto: update
    show-sql: false
    open-in-view: false
    properties:
      hibernate:
        dialect: org.hibernate.dialect.PostgreSQLDialect
        format_sql: true
        default_schema: TED
        jdbc:
          # Match chunk size for optimal batch processing
          batch_size: 25
        # NOTE(review): placed directly under `hibernate` (not `jdbc`),
        # matching the Hibernate setting names - confirm.
        order_inserts: true
        order_updates: true

  flyway:
    enabled: true
    locations: classpath:db/migration
    baseline-on-migrate: true
    create-schemas: true
    schemas:
      - TED
      - DOC
    default-schema: TED

# Apache Camel Configuration
camel:
  springboot:
    main-run-controller: true
  health:
    enabled: true
    # Less strict health checks for file consumers
    consumers-enabled: false

# Custom Application Properties
ted:
  # Directory configuration for file processing
  input:
    # Base directory for watching incoming TED XML files
    directory: ${TED_INPUT_DIR:/ted.europe/extracted}
    # File pattern to match (recursive scanning)
    pattern: "**/*.xml"
    # Move processed files to this directory
    processed-directory: ${TED_PROCESSED_DIR:.processed}
    # Move failed files to this directory
    error-directory: ${TED_ERROR_DIR:.error}
    # Polling interval in milliseconds
    poll-interval: 5000
    # Maximum messages per poll (reduced to prevent memory issues)
    max-messages-per-poll: 10

  # Schema validation configuration
  schema:
    # Enable/disable XSD validation
    enabled: true
    # Path to eForms SDK schemas (from Maven dependency or custom location)
    path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd

  # Vectorization configuration
  vectorization:
    # Enable/disable async vectorization
    enabled: false
    # Use external HTTP API instead of subprocess
    use-http-api: true
    # Embedding service URL
    api-url: http://172.20.240.18:8001
    # Model name for sentence-transformers
    model-name: intfloat/multilingual-e5-large
    # Vector dimensions (must match model output)
    dimensions: 1024
    # Batch size for vectorization
    batch-size: 16
    # Thread pool size for async processing
    thread-pool-size: 4
    # Maximum text length for vectorization (characters)
    max-text-length: 8192
    # HTTP connection timeout (milliseconds)
    connect-timeout: 10000
    # HTTP socket/read timeout (milliseconds)
    socket-timeout: 60000
    # Maximum retries on connection failure
    max-retries: 5
    # Phase 2: use generic DOC representation/embedding pipeline as the
    # primary vectorization path
    generic-pipeline-enabled: true
    # Keep legacy TED vector columns updated until semantic search is migrated
    dual-write-legacy-ted-vectors: true
    # Scheduler interval for generic embedding polling (milliseconds)
    generic-scheduler-period-ms: 6000
    # Builder identifier for primary TED semantic representations in DOC
    primary-representation-builder-key: ted-phase2-primary-representation
    # Provider key stored in DOC.doc_embedding_model
    embedding-provider: http-embedding-service

  # Search configuration
  search:
    # Default page size for search results
    default-page-size: 20
    # Maximum page size
    max-page-size: 100
    # Similarity threshold for vector search (0.0 - 1.0)
    similarity-threshold: 0.7

  # TED Daily Package Download configuration
  download:
    # Enable/disable automatic package download
    enabled: false
    # Base URL for TED Daily Packages
    base-url: https://ted.europa.eu/packages/daily/
    # Download directory for tar.gz files
    download-directory: /ted.europe/downloads
    # Extract directory for XML files
    extract-directory: /ted.europe/extracted
    # Start year for downloads
    start-year: 2023
    # Max consecutive 404 errors before stopping
    max-consecutive-404: 4
    # Polling interval (milliseconds) - 2 minutes
    poll-interval: 120000
    # Retry interval for tail NOT_FOUND packages - 6 hours
    not-found-retry-interval: 21600000
    # Grace period after year end before a previous-year tail 404 is
    # treated as final
    previous-year-grace-period-days: 30
    # Keep retrying current-year tail 404 packages indefinitely
    retry-current-year-not-found-indefinitely: true
    # Download timeout (milliseconds) - 5 minutes
    download-timeout: 300000
    # Max concurrent downloads
    max-concurrent-downloads: 2
    # Delay between downloads (milliseconds) for rate limiting - 3 seconds
    delay-between-downloads: 3000
    # Delete tar.gz after extraction
    delete-after-extraction: true
    # Prioritize current year first
    prioritize-current-year: false

  # IMAP Mail configuration
  mail:
    # Enable/disable mail processing
    enabled: false
    # IMAP server hostname
    host: mail.mymagenta.business
    # IMAP server port (993 for IMAPS)
    port: 993
    # Mail account username (email address)
    username: archiv@procon.co.at
    # Mail account password
    # SECURITY NOTE(review): the fallback after ':' is a real-looking
    # credential stored in the config file. Supply MAIL_PASSWORD from the
    # environment and remove the default once all deployments do so.
    password: '${MAIL_PASSWORD:worasigg}'
    # Use SSL/TLS connection
    ssl: true
    # Mail folder to read from
    folder-name: INBOX
    # Delete messages after processing
    delete: false
    # Mark messages as seen after processing
    # (false = peek mode, don't mark as read)
    seen: false
    # Only process unseen messages
    unseen: true
    # Polling delay in milliseconds (1 minute)
    delay: 60000
    # Max messages per poll
    max-messages-per-poll: 10
    # Output directory for processed attachments
    attachment-output-directory: /ted.europe/mail-attachments
    # Enable/disable MIME file input processing
    mime-input-enabled: true
    # Input directory for MIME files (.eml)
    mime-input-directory: /ted.europe/mime-input
    # File pattern for MIME files (regex).
    # Single-quoted with ONE backslash: YAML does not process escapes here,
    # so a doubled backslash would make the regex require a literal '\'.
    mime-input-pattern: '.*\.eml'
    # Polling interval for MIME input directory (milliseconds)
    mime-input-poll-interval: 10000

  # Phase 3 TED projection configuration
  projection:
    # Enable/disable dual-write into the TED projection model on top of
    # DOC.doc_document
    enabled: true
    # Optional startup backfill for legacy TED documents without a
    # projection row yet
    startup-backfill-enabled: false
    # Maximum number of legacy TED documents to backfill during startup
    startup-backfill-limit: 250

  # Solution Brief processing configuration
  solution-brief:
    # Enable/disable Solution Brief processing
    enabled: false
    # Input directory for Solution Brief PDF files
    input-directory: C:/work/SolutionBrief
    # Output directory for Excel result files (relative to input or absolute)
    result-directory: ./result
    # Number of top similar documents to include
    top-k: 20
    # Minimum similarity threshold (0.0-1.0)
    similarity-threshold: 0.5
    # Polling interval in milliseconds (30 seconds)
    poll-interval: 30000
    # File pattern for PDF files (regex).
    # Single-quoted with ONE backslash: YAML does not process escapes here,
    # so a doubled backslash would make the regex require a literal '\'.
    file-pattern: '.*\.pdf'
    # Process files only once (idempotent)
    idempotent: true
    # Idempotent repository file path
    idempotent-repository: ./solution-brief-processed.dat

  # Data cleanup configuration
  cleanup:
    # Enable automatic cleanup of old documents
    enabled: false
    # Retention period in years (default: 10)
    retention-years: 10
    # Cron expression for cleanup schedule (default: daily at 2 AM)
    cron: "0 0 2 * * *"

# Actuator endpoints
management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics,camel
  endpoint:
    health:
      show-details: when-authorized

# OpenAPI documentation
springdoc:
  api-docs:
    path: /v3/api-docs
  swagger-ui:
    path: /swagger-ui.html
    operations-sorter: method

# Logging configuration
logging:
  level:
    at.procon.ted: INFO
    at.procon.ted.camel.SolutionBriefRoute: INFO
    org.apache.camel: INFO
    org.hibernate.SQL: WARN
    org.hibernate.type.descriptor.sql: WARN