You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
347 lines
12 KiB
YAML
347 lines
12 KiB
YAML
# Document Intelligence Platform Configuration
|
|
# Author: Martin.Schweitzer@procon.co.at and claude.ai
|
|
|
|
# Embedded web server: listening port and servlet context path
server:
  port: 8889
  servlet:
    context-path: /api
|
|
|
|
# Spring settings: application name, PostgreSQL datasource, JPA/Hibernate, Flyway
spring:
  application:
    name: document-intelligence-platform

  datasource:
    url: jdbc:postgresql://localhost:5432/RELM
    username: ${DB_USERNAME:postgres}
    # No fallback value on purpose: the password must be supplied through the
    # DB_PASSWORD environment variable so that no credential is kept in this file.
    # NOTE(review): a password was previously committed here as the default;
    # treat it as leaked and rotate it.
    password: ${DB_PASSWORD}
    driver-class-name: org.postgresql.Driver
    hikari:
      maximum-pool-size: 5
      minimum-idle: 2
      connection-timeout: 30000
      idle-timeout: 300000
      max-lifetime: 900000
      leak-detection-threshold: 120000  # 2 minutes - increased to avoid false positives with batch processing

  jpa:
    hibernate:
      ddl-auto: update
    show-sql: false
    open-in-view: false
    properties:
      hibernate:
        dialect: org.hibernate.dialect.PostgreSQLDialect
        format_sql: true
        default_schema: TED
        jdbc:
          batch_size: 25  # Match chunk size for optimal batch processing
          order_inserts: true
          order_updates: true

  flyway:
    enabled: false
    locations: classpath:db/migration
    baseline-on-migrate: true
    create-schemas: true
    schemas:
      - TED
      - DOC
    default-schema: TED
|
|
|
|
# Apache Camel Configuration
|
|
camel:
  springboot:
    main-run-controller: true
  health:
    enabled: true
    # Less strict health checks for file consumers
    consumers-enabled: false
|
|
|
|
# Custom Application Properties
|
|
ted:
  # Directory configuration for file processing
  input:
    # Base directory for watching incoming TED XML files
    directory: ${TED_INPUT_DIR:/ted.europe/extracted}
    # File pattern to match (recursive scanning)
    pattern: "**/*.xml"
    # Move processed files to this directory
    processed-directory: ${TED_PROCESSED_DIR:.processed}
    # Move failed files to this directory
    error-directory: ${TED_ERROR_DIR:.error}
    # Polling interval in milliseconds
    poll-interval: 5000
    # Maximum messages per poll (reduced to prevent memory issues)
    max-messages-per-poll: 10

  # Schema validation configuration
  schema:
    # Enable/disable XSD validation
    enabled: true
    # Path to eForms SDK schemas (from Maven dependency or custom location)
    path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd

  # Vectorization configuration
  vectorization:
    # Enable/disable async vectorization
    enabled: false
    # Use external HTTP API instead of subprocess
    use-http-api: true
    # Embedding service URL
    api-url: http://172.20.240.18:8001
    # Model name for sentence-transformers
    model-name: intfloat/multilingual-e5-large
    # Vector dimensions (must match model output)
    dimensions: 1024
    # Batch size for vectorization
    batch-size: 16
    # Thread pool size for async processing
    thread-pool-size: 4
    # Maximum text length for vectorization (characters)
    max-text-length: 8192
    # HTTP connection timeout (milliseconds)
    connect-timeout: 10000
    # HTTP socket/read timeout (milliseconds)
    socket-timeout: 60000
    # Maximum retries on connection failure
    max-retries: 5
    # Phase 2: use generic DOC representation/embedding pipeline as primary vectorization path
    generic-pipeline-enabled: true
    # Keep legacy TED vector columns updated until semantic search is migrated
    dual-write-legacy-ted-vectors: true
    # Scheduler interval for generic embedding polling
    generic-scheduler-period-ms: 6000
    # Builder identifier for primary TED semantic representations in DOC
    primary-representation-builder-key: ted-phase2-primary-representation
    # Provider key stored in DOC.doc_embedding_model
    embedding-provider: http-embedding-service

  # Search configuration
  search:
    # Default page size for search results
    default-page-size: 20
    # Maximum page size
    max-page-size: 100
    # Similarity threshold for vector search (0.0 - 1.0)
    similarity-threshold: 0.7
    # Minimum trigram similarity for fuzzy lexical matches
    trigram-similarity-threshold: 0.12
    # Candidate limits per engine before fusion/collapse
    fulltext-candidate-limit: 120
    trigram-candidate-limit: 120
    semantic-candidate-limit: 120
    # Hybrid fusion weights
    fulltext-weight: 0.35
    trigram-weight: 0.20
    semantic-weight: 0.45
    # Additional score weight for recency
    recency-boost-weight: 0.05
    # Recency half-life in days
    recency-half-life-days: 30
    # Enable chunk representations for long documents
    chunking-enabled: true
    # Target chunk size in characters
    chunk-target-chars: 1800
    # Overlap between consecutive chunks
    chunk-overlap-chars: 200
    # Maximum number of chunks generated per document
    max-chunks-per-document: 12
    # Startup backfill limit for missing lexical vectors
    startup-lexical-backfill-limit: 500
    # Number of top hits per engine returned by /search/debug
    debug-top-hits-per-engine: 10

  # TED Daily Package Download configuration
  download:
    # Enable/disable automatic package download
    enabled: true
    # Use service-based camel route
    use-service-based: false
    # Base URL for TED Daily Packages
    base-url: https://ted.europa.eu/packages/daily/
    # Download directory for tar.gz files
    download-directory: /ted.europe/downloads
    # Extract directory for XML files
    extract-directory: /ted.europe/extracted
    # Start year for downloads
    start-year: 2026
    # Max consecutive 404 errors before stopping
    max-consecutive-404: 4
    # Polling interval (milliseconds) - 30 minutes
    poll-interval: 1800000
    # Retry interval for tail NOT_FOUND packages - 6 hours
    not-found-retry-interval: 21600000
    # Grace period after year end before a previous-year tail 404 is treated as final
    previous-year-grace-period-days: 30
    # Keep retrying current-year tail 404 packages indefinitely
    retry-current-year-not-found-indefinitely: true
    # Download timeout (milliseconds) - 5 minutes
    download-timeout: 300000
    # Max concurrent downloads
    max-concurrent-downloads: 2
    # Delay between downloads (milliseconds) for rate limiting - 3 seconds
    delay-between-downloads: 3000
    # Delete tar.gz after extraction
    delete-after-extraction: true
    # Prioritize current year first
    prioritize-current-year: false

  # IMAP Mail configuration
  mail:
    # Enable/disable mail processing
    enabled: false
    # IMAP server hostname
    host: mail.mymagenta.business
    # IMAP server port (993 for IMAPS)
    port: 993
    # Mail account username (email address)
    username: archiv@procon.co.at
    # Mail account password. No fallback value on purpose: it must be supplied
    # through the MAIL_PASSWORD environment variable so that no credential is
    # kept in this file.
    # NOTE(review): a password was previously committed here as the default;
    # treat it as leaked and rotate it.
    password: ${MAIL_PASSWORD}
    # Use SSL/TLS connection
    ssl: true
    # Mail folder to read from
    folder-name: INBOX
    # Delete messages after processing
    delete: false
    # Mark messages as seen after processing (false = peek mode, don't mark as read)
    seen: false
    # Only process unseen messages
    unseen: true
    # Polling delay in milliseconds (1 minute)
    delay: 60000
    # Max messages per poll
    max-messages-per-poll: 100
    # Output directory for processed attachments
    attachment-output-directory: /ted.europe/mail-attachments
    # Enable/disable MIME file input processing
    mime-input-enabled: true
    # Input directory for MIME files (.eml)
    mime-input-directory: /ted.europe/mime-input
    # File pattern for MIME files (regex).
    # Single backslash: YAML does not process escapes in plain or single-quoted
    # scalars, so '\.' is already the regex for a literal dot.
    mime-input-pattern: '.*\.eml'
    # Polling interval for MIME input directory (milliseconds)
    mime-input-poll-interval: 1000000

  # Phase 3 TED projection configuration
  projection:
    # Enable/disable dual-write into the TED projection model on top of DOC.doc_document
    enabled: true
    # Optional startup backfill for legacy TED documents without a projection row yet
    startup-backfill-enabled: false
    # Maximum number of legacy TED documents to backfill during startup
    startup-backfill-limit: 250

  # Phase 4 generic ingestion configuration
  generic-ingestion:
    # Master switch for arbitrary document ingestion into the DOC model
    enabled: true
    # Enable file-system polling for non-TED documents
    file-system-enabled: false
    # Allow REST/API upload endpoints for arbitrary documents
    rest-upload-enabled: true
    # Input directory for the generic Camel file route
    input-directory: /ted.europe/generic-input
    # Regex for files accepted by the generic file route
    file-pattern: '.*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$'
    # Move successfully processed files here
    processed-directory: .dip-processed
    # Move failed files here
    error-directory: .dip-error
    # Polling interval for the generic route
    poll-interval: 15000
    # Maximum files per poll
    max-messages-per-poll: 200
    # Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs
    default-owner-tenant-key: ""
    # Default visibility when no explicit access context is provided
    default-visibility: PUBLIC
    # Optional default language for filesystem imports
    default-language-code: ""
    # Store small binary originals in DOC.doc_content.binary_content
    store-original-binary-in-db: true
    # Maximum binary payload size persisted inline in DB
    max-binary-bytes-in-db: 5242880
    # Deduplicate by content hash and attach additional sources to the same canonical document
    deduplicate-by-content-hash: true
    # Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers
    store-original-content-for-wrapper-documents: true
    # Queue only the primary text representation for vectorization
    vectorize-primary-representation-only: true
    # Import batch marker written to DOC.doc_source.import_batch_id
    import-batch-id: phase4-generic
    # Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI
    ted-package-adapter-enabled: true
    # Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI
    mail-adapter-enabled: true
    # Optional dedicated mail owner tenant, falls back to default-owner-tenant-key
    mail-default-owner-tenant-key: ""
    # Visibility for imported mail messages and attachments
    mail-default-visibility: TENANT
    # Expand ZIP attachments recursively through the mail adapter
    expand-mail-zip-attachments: true
    # Import batch marker for TED package roots and children
    ted-package-import-batch-id: phase41-ted-package
    # When true, TED package documents are stored only through the generic ingestion gateway
    # and the legacy XML batch processing path is skipped
    gateway-only-for-ted-packages: true
    # Import batch marker for mail roots and attachments
    mail-import-batch-id: phase41-mail

  # Solution Brief processing configuration
  solution-brief:
    # Enable/disable Solution Brief processing
    enabled: false
    # Input directory for Solution Brief PDF files
    input-directory: C:/work/SolutionBrief
    # Output directory for Excel result files (relative to input or absolute)
    result-directory: ./result
    # Number of top similar documents to include
    top-k: 20
    # Minimum similarity threshold (0.0-1.0)
    similarity-threshold: 0.5
    # Polling interval in milliseconds (30 seconds)
    poll-interval: 30000
    # File pattern for PDF files (regex).
    # Single backslash: YAML does not process escapes in plain or single-quoted
    # scalars, so '\.' is already the regex for a literal dot.
    file-pattern: '.*\.pdf'
    # Process files only once (idempotent)
    idempotent: true
    # Idempotent repository file path
    idempotent-repository: ./solution-brief-processed.dat

  # Data cleanup configuration
  cleanup:
    # Enable automatic cleanup of old documents
    enabled: false
    # Retention period in years (default: 10)
    retention-years: 10
    # Cron expression for cleanup schedule (default: daily at 2 AM)
    cron: "0 0 2 * * *"
|
|
|
|
# Actuator endpoints
|
|
management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics,camel
  endpoint:
    health:
      show-details: when-authorized
|
|
|
|
# OpenAPI documentation
|
|
springdoc:
  api-docs:
    path: /v3/api-docs
  swagger-ui:
    path: /swagger-ui.html
    operations-sorter: method
|
|
operations-sorter: method
|
|
|
|
# Logging configuration
|
|
logging:
  # Log level per logger name (package or class)
  level:
    at.procon.ted: INFO
    at.procon.ted.camel.SolutionBriefRoute: INFO
    org.apache.camel: INFO
    org.hibernate.SQL: WARN
    org.hibernate.type.descriptor.sql: WARN
|