# DIP/src/main/resources/application.yml

# Document Intelligence Platform Configuration
# Author: Martin.Schweitzer@procon.co.at and claude.ai

# Embedded HTTP server settings
server:
  servlet:
    # Prefix under which the servlet context is mounted
    context-path: /api
  # Port the server listens on
  port: 8889

spring:
  # Application identity
  application:
    # Value of the spring.application.name property
    name: document-intelligence-platform

  # PostgreSQL connection and HikariCP pool settings
  datasource:
    url: jdbc:postgresql://localhost:5432/RELM
    # Falls back to "postgres" when DB_USERNAME is not set
    username: ${DB_USERNAME:postgres}
    # No fallback password is kept in version control: DB_PASSWORD must be
    # provided by the environment (an unset variable resolves to an empty
    # string). Any password that was previously committed here should be
    # treated as compromised and rotated.
    password: "${DB_PASSWORD:}"
    driver-class-name: org.postgresql.Driver
    hikari:
      maximum-pool-size: 5
      minimum-idle: 2
      # The durations below are in milliseconds
      connection-timeout: 30000  # 30 seconds
      idle-timeout: 300000  # 5 minutes
      max-lifetime: 900000  # 15 minutes
      leak-detection-threshold: 120000  # 2 minutes - increased to avoid false positives with batch processing

  # JPA / Hibernate settings
  jpa:
    open-in-view: false
    # SQL statements are not echoed
    show-sql: false
    hibernate:
      ddl-auto: update
    properties:
      hibernate:
        default_schema: TED
        dialect: org.hibernate.dialect.PostgreSQLDialect
        format_sql: true
        # Statement ordering, used together with JDBC batching below
        order_inserts: true
        order_updates: true
        jdbc:
          batch_size: 25  # Match chunk size for optimal batch processing

  # Flyway migrations (currently switched off via "enabled: false")
  flyway:
    enabled: false
    default-schema: TED
    schemas:
      - TED
      - DOC
    create-schemas: true
    baseline-on-migrate: true
    locations: classpath:db/migration

# Apache Camel settings
camel:
  health:
    # Less strict health checks for file consumers
    consumers-enabled: false
    enabled: true
  springboot:
    main-run-controller: true

# Custom Application Properties
ted:
  # Directories and polling for incoming files
  input:
    # Root directory watched for incoming TED XML files
    directory: ${TED_INPUT_DIR:/ted.europe/extracted}
    # Pattern of files to pick up (scanned recursively)
    pattern: "**/*.xml"
    # Time between polls, in milliseconds
    poll-interval: 5000
    # Upper bound on messages per poll (kept low to prevent memory issues)
    max-messages-per-poll: 10
    # Destination for successfully processed files
    processed-directory: ${TED_PROCESSED_DIR:.processed}
    # Destination for files that failed processing
    error-directory: ${TED_ERROR_DIR:.error}

  # XSD schema validation
  schema:
    # Location of the eForms SDK schema (from the Maven dependency or a custom location)
    path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
    # Switch XSD validation on or off
    enabled: true

  # Embedding / vectorization settings
  vectorization:
    # Master switch for asynchronous vectorization
    enabled: false

    # --- Embedding service ---
    # Call the external HTTP API rather than a subprocess
    use-http-api: true
    # URL of the embedding service
    api-url: http://172.20.240.18:8001
    # Provider key stored in DOC.doc_embedding_model
    embedding-provider: http-embedding-service
    # HTTP connection timeout, in milliseconds
    connect-timeout: 10000
    # HTTP socket/read timeout, in milliseconds
    socket-timeout: 60000
    # Maximum number of retries when the connection fails
    max-retries: 5

    # --- Model ---
    # sentence-transformers model name
    model-name: intfloat/multilingual-e5-large
    # Vector dimensions; must match what the model outputs
    dimensions: 1024
    # Maximum text length passed to vectorization, in characters
    max-text-length: 8192

    # --- Throughput ---
    # Number of items per vectorization batch
    batch-size: 16
    # Size of the thread pool used for async processing
    thread-pool-size: 4

    # --- Phase 2 generic pipeline ---
    # Use the generic DOC representation/embedding pipeline as the primary vectorization path
    generic-pipeline-enabled: true
    # Polling interval of the generic embedding scheduler, in milliseconds
    generic-scheduler-period-ms: 6000
    # Builder identifier for primary TED semantic representations in DOC
    primary-representation-builder-key: ted-phase2-primary-representation
    # Keep legacy TED vector columns updated until semantic search is migrated
    dual-write-legacy-ted-vectors: true

  # Search settings
  search:
    # --- Paging ---
    # Page size used when the request does not specify one
    default-page-size: 20
    # Largest page size allowed
    max-page-size: 100

    # --- Match thresholds ---
    # Similarity threshold for vector search (0.0 - 1.0)
    similarity-threshold: 0.7
    # Minimum trigram similarity for fuzzy lexical matches
    trigram-similarity-threshold: 0.12

    # --- Hybrid fusion ---
    # Per-engine candidate limits applied before fusion/collapse
    fulltext-candidate-limit: 120
    trigram-candidate-limit: 120
    semantic-candidate-limit: 120
    # Per-engine fusion weights
    fulltext-weight: 0.35
    trigram-weight: 0.20
    semantic-weight: 0.45

    # --- Recency ---
    # Additional score weight for recency
    recency-boost-weight: 0.05
    # Recency half-life, in days
    recency-half-life-days: 30

    # --- Chunking of long documents ---
    # Enable chunk representations for long documents
    chunking-enabled: true
    # Target chunk size, in characters
    chunk-target-chars: 1800
    # Overlap between consecutive chunks, in characters
    chunk-overlap-chars: 200
    # Maximum number of chunks generated per document
    max-chunks-per-document: 12

    # --- Maintenance / diagnostics ---
    # Startup backfill limit for missing lexical vectors
    startup-lexical-backfill-limit: 500
    # Number of top hits per engine returned by /search/debug
    debug-top-hits-per-engine: 10

  # TED daily package download settings
  download:
    # Switch automatic package download on or off
    enabled: true
    # Use the service-based Camel route
    use-service-based: false

    # --- Source and target locations ---
    # Base URL for TED daily packages
    base-url: https://ted.europa.eu/packages/daily/
    # Directory that receives the downloaded tar.gz files
    download-directory: /ted.europe/downloads
    # Directory that receives the extracted XML files
    extract-directory: /ted.europe/extracted
    # Delete the tar.gz once it has been extracted
    delete-after-extraction: true

    # --- Scope and ordering ---
    # First year to download
    start-year: 2026
    # Prioritize the current year first
    prioritize-current-year: false

    # --- Timing and rate limiting (all durations in milliseconds) ---
    # Polling interval - 30 minutes
    poll-interval: 1800000
    # Download timeout - 5 minutes
    download-timeout: 300000
    # Delay between downloads for rate limiting - 3 seconds
    delay-between-downloads: 3000
    # Maximum number of concurrent downloads
    max-concurrent-downloads: 2

    # --- Handling of missing (404) packages ---
    # Maximum consecutive 404 errors before stopping
    max-consecutive-404: 4
    # Retry interval for tail NOT_FOUND packages - 6 hours
    not-found-retry-interval: 21600000
    # Grace period after year end before a previous-year tail 404 is treated as final
    previous-year-grace-period-days: 30
    # Keep retrying current-year tail 404 packages indefinitely
    retry-current-year-not-found-indefinitely: true

  # IMAP mail configuration
  mail:
    # Enable/disable mail processing
    enabled: false
    # IMAP server hostname
    host: mail.mymagenta.business
    # IMAP server port (993 for IMAPS)
    port: 993
    # Mail account username (email address)
    username: archiv@procon.co.at
    # Mail account password. No fallback secret is kept in version control:
    # MAIL_PASSWORD must be provided by the environment (an unset variable
    # resolves to an empty string). Any password that was previously committed
    # here should be treated as compromised and rotated.
    password: "${MAIL_PASSWORD:}"
    # Use SSL/TLS connection
    ssl: true
    # Mail folder to read from
    folder-name: INBOX
    # Delete messages after processing
    delete: false
    # Mark messages as seen after processing (false = peek mode, don't mark as read)
    seen: false
    # Only process unseen messages
    unseen: true
    # Polling delay in milliseconds (1 minute)
    delay: 60000
    # Max messages per poll
    max-messages-per-poll: 100
    # Output directory for processed attachments
    attachment-output-directory: /ted.europe/mail-attachments
    # Enable/disable MIME file input processing
    mime-input-enabled: true
    # Input directory for MIME files (.eml)
    mime-input-directory: /ted.europe/mime-input
    # File pattern for MIME files (regex). YAML does not process backslash
    # escapes outside double quotes, so a single backslash is what escapes the
    # dot; a doubled backslash would make the regex require a literal "\".
    mime-input-pattern: '.*\.eml'
    # Polling interval for MIME input directory (milliseconds) - roughly 16.7 minutes
    mime-input-poll-interval: 1000000

  # Phase 3: TED projection model
  projection:
    # Dual-write into the TED projection model on top of DOC.doc_document
    enabled: true
    # Upper bound on legacy TED documents backfilled during startup
    startup-backfill-limit: 250
    # Optional startup backfill for legacy TED documents that have no projection row yet
    startup-backfill-enabled: false

  # Phase 4: generic document ingestion
  generic-ingestion:
    # Master switch for arbitrary document ingestion into the DOC model
    enabled: true
    # Allow REST/API upload endpoints for arbitrary documents
    rest-upload-enabled: true

    # --- File-system route ---
    # Enable file-system polling for non-TED documents
    file-system-enabled: false
    # Input directory for the generic Camel file route
    input-directory: /ted.europe/generic-input
    # Regex for files accepted by the generic file route
    file-pattern: .*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$
    # Polling interval for the generic route
    poll-interval: 15000
    # Maximum files per poll
    max-messages-per-poll: 200
    # Successfully processed files are moved here
    processed-directory: .dip-processed
    # Failed files are moved here
    error-directory: .dip-error

    # --- Defaults applied to imported documents ---
    # Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs
    default-owner-tenant-key:
    # Visibility used when no explicit access context is provided
    default-visibility: PUBLIC
    # Optional default language for filesystem imports
    default-language-code:
    # Import batch marker written to DOC.doc_source.import_batch_id
    import-batch-id: phase4-generic

    # --- Storage and deduplication ---
    # Store small binary originals in DOC.doc_content.binary_content
    store-original-binary-in-db: true
    # Maximum binary payload size persisted inline in DB
    max-binary-bytes-in-db: 5242880
    # Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers
    store-original-content-for-wrapper-documents: true
    # Deduplicate by content hash and attach additional sources to the same canonical document
    deduplicate-by-content-hash: true
    # Queue only the primary text representation for vectorization
    vectorize-primary-representation-only: true

    # --- Phase 4.1 TED package adapter ---
    # Enable the TED package adapter on top of the generic DOC ingestion SPI
    ted-package-adapter-enabled: true
    # Import batch marker for TED package roots and children
    ted-package-import-batch-id: phase41-ted-package
    # When true, TED package documents are stored only through the generic ingestion gateway
    # and the legacy XML batch processing path is skipped
    gateway-only-for-ted-packages: true

    # --- Phase 4.1 mail/document adapter ---
    # Enable the mail/document adapter on top of the generic DOC ingestion SPI
    mail-adapter-enabled: true
    # Import batch marker for mail roots and attachments
    mail-import-batch-id: phase41-mail
    # Optional dedicated mail owner tenant, falls back to default-owner-tenant-key
    mail-default-owner-tenant-key:
    # Visibility for imported mail messages and attachments
    mail-default-visibility: TENANT
    # Expand ZIP attachments recursively through the mail adapter
    expand-mail-zip-attachments: true

  # Solution Brief processing configuration
  solution-brief:
    # Enable/disable Solution Brief processing
    enabled: false
    # Input directory for Solution Brief PDF files
    input-directory: C:/work/SolutionBrief
    # Output directory for Excel result files (relative to input or absolute)
    result-directory: ./result
    # Number of top similar documents to include
    top-k: 20
    # Minimum similarity threshold (0.0-1.0)
    similarity-threshold: 0.5
    # Polling interval in milliseconds (30 seconds)
    poll-interval: 30000
    # File pattern for PDF files (regex). YAML does not process backslash
    # escapes outside double quotes, so a single backslash is what escapes the
    # dot; a doubled backslash would make the regex require a literal "\".
    file-pattern: '.*\.pdf'
    # Process files only once (idempotent)
    idempotent: true
    # Idempotent repository file path
    idempotent-repository: ./solution-brief-processed.dat

  # Cleanup of old documents
  cleanup:
    # Automatic cleanup is switched on or off here
    enabled: false
    # Schedule as a cron expression (daily at 2 AM)
    cron: "0 0 2 * * *"
    # How long documents are retained, in years
    retention-years: 10

# Spring Boot Actuator
management:
  endpoint:
    health:
      show-details: when-authorized
  endpoints:
    web:
      exposure:
        # Endpoints exposed over HTTP
        include: health,info,metrics,camel

# springdoc / OpenAPI documentation
springdoc:
  swagger-ui:
    operations-sorter: method
    path: /swagger-ui.html
  api-docs:
    path: /v3/api-docs

# Log levels per logger
logging:
  level:
    # Framework loggers
    org.apache.camel: INFO
    org.hibernate.SQL: WARN
    org.hibernate.type.descriptor.sql: WARN
    # Application loggers
    at.procon.ted: INFO
    at.procon.ted.camel.SolutionBriefRoute: INFO