# DIP/src/main/resources/application-new.yml

# Root key for the DIP settings defined in this file.
dip:
  runtime:
    # Runtime mode selected by this configuration file.
    mode: NEW

  search:
    # --- Paging ---
    # Page size applied to search results by default
    default-page-size: 20
    # Upper bound for the page size
    max-page-size: 100

    # --- Matching thresholds ---
    # Vector search similarity threshold (range 0.0 - 1.0)
    similarity-threshold: 0.7
    # Lowest trigram similarity accepted for fuzzy lexical matches
    trigram-similarity-threshold: 0.12

    # --- Per-engine candidate limits (applied before fusion/collapse) ---
    fulltext-candidate-limit: 120
    trigram-candidate-limit: 120
    semantic-candidate-limit: 120

    # --- Hybrid fusion weights (the three values below add up to 1.0) ---
    fulltext-weight: 0.35
    trigram-weight: 0.20
    semantic-weight: 0.45

    # --- Recency ---
    # Extra score weight given to recency
    recency-boost-weight: 0.05
    # Half-life of the recency boost, in days
    recency-half-life-days: 30

    # --- Chunking of long documents ---
    # Turn chunk representations for long documents on/off
    chunking-enabled: true
    # Target size of one chunk, in characters
    chunk-target-chars: 1800
    # Characters shared between consecutive chunks
    chunk-overlap-chars: 200
    # Cap on the number of chunks generated per document
    max-chunks-per-document: 12

    # --- Lexical vector backfill ---
    # Limit for the startup backfill of missing lexical vectors
    startup-lexical-backfill-limit: 500
    # Scheduled backfill: on/off switch, delay in milliseconds, batch size
    scheduled-lexical-backfill-enabled: true
    scheduled-lexical-backfill-delay-ms: 30000
    scheduled-lexical-backfill-batch-size: 200

    # --- Diagnostics ---
    # How many top hits per engine /search/debug returns
    debug-top-hits-per-engine: 10

  embedding:
    enabled: true

    # Embedding job settings (job processing is switched off here)
    jobs:
      enabled: false
      parallel-batch-count: 1
      process-in-batches: true
      batch-size: 16
      execution-batch-size: 16

    # Model keys used by default; both refer to an entry under `models`
    default-document-model: e5-default
    default-query-model: e5-default

    # Provider definitions, keyed by the name that models reference
    # through `provider-config-key`
    providers:
      mock-default:
        type: mock
        dimensions: 16

      # NOTE(review): no model in this file references `external-e5`
      external-e5:
        type: http-json
        base-url: http://172.20.241.55:8001
        connect-timeout: 5s
        read-timeout: 60s
        batch-request:
          truncate-text: false
          truncate-length: 512
          chunk-size: 16

      vector-sync-e5:
        type: http-vector-sync
        base-url: http://172.20.241.55:8001
        connect-timeout: 30s
        read-timeout: 300s
        headers:
          X-Client: dip
        batch-request:
          truncate-text: false
          truncate-length: 512
          chunk-size: 16

    # Model definitions, keyed by the model key used in the defaults above
    # and in the policies below
    models:
      mock-search:
        provider-config-key: mock-default
        provider-model-key: mock-search
        dimensions: 16
        distance-metric: COSINE
        supports-query-embedding-mode: true
        active: true

      e5-default:
        provider-config-key: vector-sync-e5
        provider-model-key: intfloat/multilingual-e5-large
        dimensions: 1024
        distance-metric: COSINE
        supports-query-embedding-mode: true
        supports-batch: true
        prefix-mode: CLIENT
        # The trailing space inside the quotes is part of each prefix
        query-prefix: "query: "
        document-prefix: "passage: "
        active: true

    # Profiles list the representation types to embed; policies pick a
    # profile through `profile-key`
    profiles:
      definitions:
        primary-only:
          embed-representation-types:
            - SEMANTIC_TEXT

        primary-and-chunks:
          embed-representation-types:
            - SEMANTIC_TEXT
            - CHUNK

        ted-semantic:
          # Alternative kept for reference (not active):
          # [SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK]
          embed-representation-types:
            - SEMANTIC_TEXT

        mail-message:
          embed-representation-types:
            - SEMANTIC_TEXT
            - ATTACHMENT_ROLLUP

        attachment-chunks:
          embed-representation-types:
            - CHUNK

        disabled:
          embed-representation-types: []

    # Policies combine a model, a query model and a profile; `rules` choose
    # a combination based on the document attributes listed under `when`
    policies:
      default-policy:
        policy-key: generic-default
        model-key: e5-default
        query-model-key: e5-default
        profile-key: primary-and-chunks
        enabled: true

      ted-policy:
        policy-key: ted-default
        model-key: e5-default
        query-model-key: e5-default
        profile-key: ted-semantic
        enabled: true

      rules:
        - name: ted-notice
          when:
            document-family: TED_NOTICE
          use:
            policy-key: ted-default
            model-key: e5-default
            query-model-key: e5-default
            profile-key: ted-semantic
            enabled: true

        - name: email-root
          when:
            document-type: EMAIL
          use:
            policy-key: mail-default
            model-key: e5-default
            query-model-key: e5-default
            profile-key: mail-message
            enabled: true

        - name: mail-attachment-pdf
          when:
            source-type: MAIL_ATTACHMENT
            mime-type: application/pdf
          use:
            policy-key: mail-attachment-pdf
            model-key: e5-default
            query-model-key: e5-default
            profile-key: attachment-chunks
            enabled: true

        # Images get the `disabled` profile with the policy switched off
        - name: skip-images
          when:
            mime-type: image/.*
          use:
            policy-key: no-embedding-images
            profile-key: disabled
            enabled: false

  # Clustering settings
  clustering:
    # Connection settings for the Python clustering endpoint
    python:
      enabled: true
      # Base URL; the two paths below are presumably appended to it (confirm in the client code)
      base-url: http://localhost:8001
      cluster-path: /cluster
      cluster-run-path: /cluster-run
      # NOTE(review): INLINE_VECTORS suggests vectors are sent inside the request body - confirm
      request-mode: INLINE_VECTORS
      connect-timeout: 30s
      # 30 minutes, far longer than the 30 s connect timeout above
      read-timeout: 30m

  # Phase 4 generic ingestion configuration
  ingestion:
    # Master switch for arbitrary document ingestion into the DOC model
    enabled: true
    # Enable file-system polling for non-TED documents
    file-system-enabled: false
    # Allow REST/API upload endpoints for arbitrary documents
    rest-upload-enabled: true
    # Input directory for the generic Camel file route
    input-directory: /ted.europe/generic-input
    # Regex for files accepted by the generic file route.
    # Single-quoted with a single backslash: YAML does not process backslash
    # escapes in plain or single-quoted scalars, so `\.` reaches the regex
    # engine as "literal dot". A doubled backslash here would instead require
    # a literal backslash in the file name and reject names such as `a.pdf`.
    file-pattern: '.*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$'
    # Move successfully processed files here
    processed-directory: .dip-processed
    # Move failed files here
    error-directory: .dip-error
    # Polling interval for the generic route
    poll-interval: 15000
    # Maximum files per poll
    max-messages-per-poll: 200
    # Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs
    default-owner-tenant-key:
    # Default visibility when no explicit access context is provided
    default-visibility: PUBLIC
    # Optional default language for filesystem imports
    default-language-code:
    # Store small binary originals in DOC.doc_content.binary_content
    store-original-binary-in-db: true
    # Maximum binary payload size persisted inline in DB (5242880 bytes = 5 MiB)
    max-binary-bytes-in-db: 5242880
    # Deduplicate by content hash and attach additional sources to the same canonical document
    deduplicate-by-content-hash: true
    # Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers
    store-original-content-for-wrapper-documents: true
    # Queue only the primary text representation for vectorization
    vectorize-primary-representation-only: true
    # Import batch marker written to DOC.doc_source.import_batch_id
    import-batch-id: phase4-generic
    # Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI
    ted-package-adapter-enabled: true
    # Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI
    mail-adapter-enabled: true
    # Optional dedicated mail owner tenant, falls back to default-owner-tenant-key
    mail-default-owner-tenant-key:
    # Visibility for imported mail messages and attachments
    mail-default-visibility: TENANT
    # Expand ZIP attachments recursively through the mail adapter
    expand-mail-zip-attachments: true
    # Import batch marker for TED package roots and children
    ted-package-import-batch-id: phase41-ted-package
    # When true, TED package documents are stored only through the generic ingestion gateway
    # and the legacy XML batch processing path is skipped
    gateway-only-for-ted-packages: true
    # Import batch marker for mail roots and attachments
    mail-import-batch-id: phase41-mail

    # NOTE(review): presumably the worker count and the in-flight cap for
    # processing TED package children - confirm against the consuming code
    ted-package-child-parallelism: 4
    ted-package-child-max-in-flight: 8

    # NEW Camel mail consumer route for provider-driven mail ingestion
    mail-route:
      # Enable/disable the NEW Camel mail consumer
      enabled: false
      # Generic mail server protocol (IMAP/IMAPS/POP3/POP3S)
      protocol: IMAPS
      # Mail server host
      host: mail.mymagenta.business
      # Mail server port; leave empty to use Camel component defaults
      port: 993
      # Mailbox username
      username: archiv@procon.co.at
      # Mailbox password, read from MAIL_PASSWORD (environment variable or property).
      # The default is intentionally empty so that no credential is stored in
      # version control; set MAIL_PASSWORD before enabling this route.
      password: ${MAIL_PASSWORD:}
      # Folder/mailbox name
      folder-name: INBOX
      # Optional stable provider account key; falls back to username
      account-key:
      # Delete messages after successful processing
      delete: false
      # Consume only unseen messages
      unseen: true
      # Keep messages unread while consuming
      peek: true
      # Poll delay in milliseconds
      delay: 15000
      # Maximum messages per poll
      max-messages-per-poll: 20
      # Fetch size handed to the mail consumer
      # NOTE(review): presumably maps to Camel's `fetchSize` (max messages
      # fetched per poll; -1 would mean "all") - confirm against the route
      fetch-size: 10
      # Close folder after each poll cycle
      close-folder: false
      # Camel mail debug mode
      debug-mode: false
      # Socket connection timeout in milliseconds
      connection-timeout: 30000

    # TED daily package download configuration
    ted-download:
      # Enable/disable automatic package download
      enabled: false
      # Base URL for TED Daily Packages
      base-url: https://ted.europa.eu/packages/daily/
      # Download directory for tar.gz files
      download-directory: /ted.europe/downloads-new
      # Start year for downloads
      start-year: 2026
      # Polling interval (milliseconds) - 1 minute
      poll-interval: 60000
      # Retry interval (milliseconds) for tail NOT_FOUND packages - 6 hours
      not-found-retry-interval: 21600000
      # Grace period after year end before a previous-year tail 404 is treated as final
      previous-year-grace-period-days: 30
      # Keep retrying current-year tail 404 packages indefinitely
      retry-current-year-not-found-indefinitely: true
      # Download timeout (milliseconds) - 5 minutes
      download-timeout: 300000
      # Max concurrent downloads
      max-running-packages: 2
      # Delay between downloads (milliseconds) for rate limiting - 5 seconds
      delay-between-downloads: 5000
      # Delete tar.gz after ingestion
      delete-after-ingestion: true

  # Time-tracking settings; the section and both sources below are switched off
  time:
    enabled: false
    # "Leitstand" source, read over JDBC (see `jdbc` below)
    leitstand:
      enabled: false
      startup-sync-enabled: false
      startup-selective-materialization-enabled: true
      # Quoted: this 24-digit value is an identifier, not a number, and must
      # keep its exact digits as a string
      selective-materialization-person-dbk: "100920031023144811001000"
      selective-materialization-person-number:
      selective-materialization-build-projection: true
      create-canonical-time-entries: true
      build-search-projection: true
      build-representations: true
      queue-embeddings: true
      startup-projection-rebuild-enabled: false
      representation-language-code: de
      incremental-enabled: true
      scope-key: leitstand-default
      import-batch-id: time-leitstand
      reconcile-lookback-days: 7
      jdbc:
        url: jdbc:jtds:sqlserver://mag2:1433;databaseName=spc
        username: sa
        # Read from DIP_TIME_LEITSTAND_JDBC_PASSWORD (environment variable or
        # property). The default is intentionally empty so that no database
        # credential is stored in version control.
        password: ${DIP_TIME_LEITSTAND_JDBC_PASSWORD:}
        driver-class-name: net.sourceforge.jtds.jdbc.Driver
        fetch-size: 500
        query-timeout-seconds: 300
    # Toggl Track source
    toggl-track:
      enabled: false
      import-batch-id: time-toggl
      reconcile-lookback-days: 7

  # Phase 3 TED projection configuration
  ted:
    projection:
      # Enable/disable dual-write into the TED projection model on top of DOC.doc_document
      enabled: true
      # Optional startup backfill for legacy TED documents without a projection row yet
      startup-backfill-enabled: false
      # Maximum number of legacy TED documents to backfill during startup
      startup-backfill-limit: 250
      # NOTE(review): presumably the candidate cap for hybrid structured search
      # and the bucket cap per facet - confirm against the consuming code
      structured-search-hybrid-candidate-limit: 5000
      structured-search-facet-bucket-limit: 12

  # Legacy migration subsystems; all three are switched off in this file
  migration:
    legacy-audit:
      # Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem
      enabled: false
      # Optional startup execution; the audit is read-only and only writes audit run/finding tables
      startup-run-enabled: true
      # Maximum number of legacy TED documents to scan during startup (0 = all)
      startup-run-limit: 0
      # Page size for legacy TED document scanning
      page-size: 100
      # Maximum number of persisted findings in a single run
      max-findings-per-run: 10000
      # Maximum number of grouped duplicate samples captured for aggregate checks
      max-duplicate-samples: 100

    legacy-ted:
      # Enable the resumable legacy TED -> DOC/projection backfill subsystem
      enabled: false
      # Run the backfill automatically on NEW-runtime startup
      startup-enabled: false
      # Number of legacy TED documents fetched and processed per batch
      batch-size: 100
      # Optional cap for a single invocation; 0 means migrate all remaining rows
      max-documents-per-run: 0
      # Resume the latest STOPPED/FAILED run from its saved cursor
      resume-latest-incomplete-run: true
      # Import batch id written to DOC.doc_source rows created by the migration
      import-batch-id: legacy-ted-backfill
      # Keep false for Wave 1; embeddings can be backfilled later as a separate step
      queue-embeddings: false
      # NOTE(review): also false here; presumably covered by the same Wave 1 rule - confirm
      migrate-embeddings: false
      # NOTE(review): presumably builds chunk representations during the backfill - confirm
      build-chunk-representations: true

    # NOTE(review): presumably the separate embedding backfill step for
    # legacy TED documents - confirm against the consuming code
    legacy-ted-embeddings:
      enabled: false
      startup-enabled: false
      batch-size: 500
      # NOTE(review): 0 presumably means "no cap", as for legacy-ted above - confirm
      max-documents-per-run: 0
      skip-when-primary-representation-missing: true
      queue-missing-embeddings: true