---
# DIP application configuration (runtime mode, search, embedding, ingestion,
# TED projection and legacy migration settings).
#
# NOTE(review): the nesting in this file was reconstructed from key names and
# section comments. Confirm the following placements against the bound
# configuration properties classes before deploying:
#   - dip.embedding.jobs.* (which keys belong to "jobs")
#   - dip.embedding.policies.rules and rules[].use.enabled
#   - dip.ingestion.ted-download and dip.time (parent section)
#   - dip.ted.projection.structured-search-* (parent section)
dip:
  runtime:
    mode: NEW

  search:
    # Default page size for search results
    default-page-size: 20
    # Maximum page size
    max-page-size: 100
    # Similarity threshold for vector search (0.0 - 1.0)
    similarity-threshold: 0.7
    # Minimum trigram similarity for fuzzy lexical matches
    trigram-similarity-threshold: 0.12
    # Candidate limits per engine before fusion/collapse
    fulltext-candidate-limit: 120
    trigram-candidate-limit: 120
    semantic-candidate-limit: 120
    # Hybrid fusion weights (the three values below currently sum to 1.0)
    fulltext-weight: 0.35
    trigram-weight: 0.20
    semantic-weight: 0.45
    # Additional score weight for recency
    recency-boost-weight: 0.05
    # Recency half-life in days
    recency-half-life-days: 30
    # Enable chunk representations for long documents
    chunking-enabled: true
    # Target chunk size in characters
    chunk-target-chars: 1800
    # Overlap between consecutive chunks
    chunk-overlap-chars: 200
    # Maximum number of chunks generated per document
    max-chunks-per-document: 12
    # Startup backfill limit for missing lexical vectors
    startup-lexical-backfill-limit: 500
    # Number of top hits per engine returned by /search/debug
    debug-top-hits-per-engine: 10

  embedding:
    enabled: true
    # NOTE(review): all five keys are assumed to belong to "jobs" - confirm.
    jobs:
      enabled: false
      parallel-batch-count: 1
      process-in-batches: true
      batch-size: 16
      execution-batch-size: 16
    # Both defaults reference a key defined under "models" below.
    default-document-model: e5-default
    default-query-model: e5-default

    providers:
      mock-default:
        type: mock
        dimensions: 16
      external-e5:
        type: http-json
        base-url: http://172.20.241.55:8001
        connect-timeout: 5s
        read-timeout: 60s
        batch-request:
          truncate-text: false
          truncate-length: 512
          chunk-size: 16
      vector-sync-e5:
        type: http-vector-sync
        base-url: http://172.20.241.55:8001
        connect-timeout: 30s
        read-timeout: 300s
        headers:
          X-Client: dip
        batch-request:
          truncate-text: false
          truncate-length: 512
          chunk-size: 16

    models:
      mock-search:
        provider-config-key: mock-default
        provider-model-key: mock-search
        dimensions: 16
        distance-metric: COSINE
        supports-query-embedding-mode: true
        active: true
      e5-default:
        provider-config-key: vector-sync-e5
        provider-model-key: intfloat/multilingual-e5-large
        dimensions: 1024
        distance-metric: COSINE
        supports-query-embedding-mode: true
        supports-batch: true
        prefix-mode: CLIENT
        # Trailing spaces are part of the prefixes, hence the quotes.
        query-prefix: "query: "
        document-prefix: "passage: "
        active: true

    profiles:
      definitions:
        primary-only:
          embed-representation-types: [SEMANTIC_TEXT]
        primary-and-chunks:
          embed-representation-types: [SEMANTIC_TEXT, CHUNK]
        ted-semantic:
          # Previously considered: [SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK]
          embed-representation-types: [SEMANTIC_TEXT]
        mail-message:
          embed-representation-types: [SEMANTIC_TEXT, ATTACHMENT_ROLLUP]
        attachment-chunks:
          embed-representation-types: [CHUNK]
        disabled:
          embed-representation-types: []

    policies:
      default-policy:
        policy-key: generic-default
        model-key: e5-default
        query-model-key: e5-default
        profile-key: primary-and-chunks
        enabled: true
      ted-policy:
        policy-key: ted-default
        model-key: e5-default
        query-model-key: e5-default
        profile-key: ted-semantic
        enabled: true
      # NOTE(review): "rules" is assumed to be a child of "policies" and
      # "enabled" a child of each rule's "use" mapping - confirm.
      rules:
        - name: ted-notice
          when:
            document-family: TED_NOTICE
          use:
            policy-key: ted-default
            model-key: e5-default
            query-model-key: e5-default
            profile-key: ted-semantic
            enabled: true
        - name: email-root
          when:
            document-type: EMAIL
          use:
            policy-key: mail-default
            model-key: e5-default
            query-model-key: e5-default
            profile-key: mail-message
            enabled: true
        - name: mail-attachment-pdf
          when:
            source-type: MAIL_ATTACHMENT
            mime-type: application/pdf
          use:
            policy-key: mail-attachment-pdf
            model-key: e5-default
            query-model-key: e5-default
            profile-key: attachment-chunks
            enabled: true
        - name: skip-images
          when:
            mime-type: 'image/.*'
          use:
            policy-key: no-embedding-images
            profile-key: disabled
            enabled: false

  # Phase 4 generic ingestion configuration
  ingestion:
    # Master switch for arbitrary document ingestion into the DOC model
    enabled: true
    # Enable file-system polling for non-TED documents
    file-system-enabled: false
    # Allow REST/API upload endpoints for arbitrary documents
    rest-upload-enabled: true
    # Input directory for the generic Camel file route
    input-directory: /ted.europe/generic-input
    # Regex for files accepted by the generic file route.
    # Single-quoted with a single backslash: YAML does not process escapes in
    # plain or single-quoted scalars, so "\\." would demand a literal
    # backslash in the file name and never match e.g. "report.pdf".
    file-pattern: '.*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$'
    # Move successfully processed files here
    processed-directory: .dip-processed
    # Move failed files here
    error-directory: .dip-error
    # Polling interval for the generic route
    poll-interval: 15000
    # Maximum files per poll
    max-messages-per-poll: 200
    # Optional default owner tenant; leave empty for PUBLIC docs like TED or
    # public knowledge docs
    default-owner-tenant-key: ""
    # Default visibility when no explicit access context is provided
    default-visibility: PUBLIC
    # Optional default language for filesystem imports
    default-language-code: ""
    # Store small binary originals in DOC.doc_content.binary_content
    store-original-binary-in-db: true
    # Maximum binary payload size persisted inline in DB (5242880 = 5 MiB)
    max-binary-bytes-in-db: 5242880
    # Deduplicate by content hash and attach additional sources to the same
    # canonical document
    deduplicate-by-content-hash: true
    # Persist ORIGINAL content rows for wrapper/container documents such as
    # TED packages or ZIP wrappers
    store-original-content-for-wrapper-documents: true
    # Queue only the primary text representation for vectorization
    vectorize-primary-representation-only: true
    # Import batch marker written to DOC.doc_source.import_batch_id
    import-batch-id: phase4-generic
    # Enable Phase 4.1 TED package adapter on top of the generic DOC
    # ingestion SPI
    ted-package-adapter-enabled: true
    # Enable Phase 4.1 mail/document adapter on top of the generic DOC
    # ingestion SPI
    mail-adapter-enabled: true
    # Optional dedicated mail owner tenant, falls back to
    # default-owner-tenant-key
    mail-default-owner-tenant-key: ""
    # Visibility for imported mail messages and attachments
    mail-default-visibility: TENANT
    # Expand ZIP attachments recursively through the mail adapter
    expand-mail-zip-attachments: true
    # Import batch marker for TED package roots and children
    ted-package-import-batch-id: phase41-ted-package
    # When true, TED package documents are stored only through the generic
    # ingestion gateway and the legacy XML batch processing path is skipped
    gateway-only-for-ted-packages: true
    # Import batch marker for mail roots and attachments
    mail-import-batch-id: phase41-mail

    # NEW Camel mail consumer route for provider-driven mail ingestion
    mail-route:
      # Enable/disable the NEW Camel mail consumer
      enabled: false
      # Generic mail server protocol (IMAP/IMAPS/POP3/POP3S)
      protocol: IMAPS
      # Mail server host
      host: mail.mymagenta.business
      # Mail server port; leave empty to use Camel component defaults
      port: 993
      # Mailbox username
      username: archiv@procon.co.at
      # Mailbox password. Must be supplied through the MAIL_PASSWORD
      # environment variable; no fallback secret is kept in this file.
      # NOTE(review): a password was previously committed here as the
      # placeholder default - rotate it.
      password: '${MAIL_PASSWORD:}'
      # Folder/mailbox name
      folder-name: INBOX
      # Optional stable provider account key; falls back to username
      account-key: ""
      # Delete messages after successful processing
      delete: false
      # Consume only unseen messages
      unseen: true
      # Keep messages unread while consuming
      peek: true
      # Poll delay in milliseconds
      delay: 15000
      # Maximum messages per poll
      max-messages-per-poll: 20
      # Fetch size handed to the mail consumer.
      # NOTE(review): presumably mapped to Camel's fetchSize, which caps the
      # number of messages consumed per poll (-1 = no limit) - confirm.
      fetch-size: 10
      # Close folder after each poll cycle
      close-folder: false
      # Camel mail debug mode
      debug-mode: false
      # Socket connection timeout in milliseconds
      connection-timeout: 30000

    # TED packages download configuration
    # NOTE(review): assumed to live under "ingestion" - confirm.
    ted-download:
      # Enable/disable automatic package download
      enabled: false
      # Base URL for TED Daily Packages
      base-url: https://ted.europa.eu/packages/daily/
      # Download directory for tar.gz files
      download-directory: /ted.europe/downloads-new
      # Start year for downloads
      start-year: 2026
      # Polling interval (milliseconds) - 60000 ms = 1 minute
      poll-interval: 60000
      # Retry interval for tail NOT_FOUND packages - 6 hours
      not-found-retry-interval: 21600000
      # Grace period after year end before a previous-year tail 404 is
      # treated as final
      previous-year-grace-period-days: 30
      # Keep retrying current-year tail 404 packages indefinitely
      retry-current-year-not-found-indefinitely: true
      # Download timeout (milliseconds) - 5 minutes
      download-timeout: 300000
      # Max concurrent downloads
      max-running-packages: 2
      # Delay between downloads (milliseconds) for rate limiting - 5 seconds
      delay-between-downloads: 5000
      # Delete tar.gz after ingestion
      delete-after-ingestion: true

  # NOTE(review): assumed to be a direct child of "dip" - confirm.
  time:
    enabled: false
    leitstand:
      enabled: false
      import-batch-id: time-leitstand
      reconcile-lookback-days: 7
    toggl-track:
      enabled: false
      import-batch-id: time-toggl
      reconcile-lookback-days: 7

  ted:
    # Phase 3 TED projection configuration
    projection:
      # Enable/disable dual-write into the TED projection model on top of
      # DOC.doc_document
      enabled: true
      # Optional startup backfill for legacy TED documents without a
      # projection row yet
      startup-backfill-enabled: false
      # Maximum number of legacy TED documents to backfill during startup
      startup-backfill-limit: 250
      # NOTE(review): the two structured-search keys are assumed to belong
      # to "projection" rather than directly to "ted" - confirm.
      structured-search-hybrid-candidate-limit: 5000
      structured-search-facet-bucket-limit: 12

  migration:
    legacy-audit:
      # Enable/disable the Wave 1 / Milestone A legacy integrity audit
      # subsystem
      enabled: false
      # Optional startup execution; the audit is read-only and only writes
      # audit run/finding tables
      startup-run-enabled: true
      # Maximum number of legacy TED documents to scan during startup
      # (0 = all)
      startup-run-limit: 0
      # Page size for legacy TED document scanning
      page-size: 100
      # Maximum number of persisted findings in a single run
      max-findings-per-run: 10000
      # Maximum number of grouped duplicate samples captured for aggregate
      # checks
      max-duplicate-samples: 100

    legacy-ted:
      # Enable the resumable legacy TED -> DOC/projection backfill subsystem
      enabled: false
      # Run the backfill automatically on NEW-runtime startup
      startup-enabled: false
      # Number of legacy TED documents fetched and processed per batch
      batch-size: 100
      # Optional cap for a single invocation; 0 means migrate all remaining
      # rows
      max-documents-per-run: 0
      # Resume the latest STOPPED/FAILED run from its saved cursor
      resume-latest-incomplete-run: true
      # Import batch id written to DOC.doc_source rows created by the
      # migration
      import-batch-id: legacy-ted-backfill
      # Keep false for Wave 1; embeddings can be backfilled later as a
      # separate step
      queue-embeddings: false
      migrate-embeddings: false
      build-chunk-representations: true

    legacy-ted-embeddings:
      enabled: false
      startup-enabled: false
      batch-size: 500
      max-documents-per-run: 0
      skip-when-primary-representation-missing: true
      queue-missing-embeddings: true