# File: DIP/src/main/resources/application-new.yml - 393 lines, 14 KiB, YAML

# DIP runtime mode and hybrid-search tuning.
# NOTE(review): this listing shows every key at column 0; the nesting
# (dip -> runtime / search -> settings) is presumed from key order - confirm.
dip:
runtime:
mode: NEW
search:
# Default page size for search results
default-page-size: 20
# Maximum page size
max-page-size: 100
# Similarity threshold for vector search (0.0 - 1.0)
similarity-threshold: 0.7
# Minimum trigram similarity for fuzzy lexical matches
trigram-similarity-threshold: 0.12
# Candidate limits per engine before fusion/collapse
fulltext-candidate-limit: 120
trigram-candidate-limit: 120
semantic-candidate-limit: 120
# Hybrid fusion weights (the three engine weights below sum to 1.00)
fulltext-weight: 0.35
trigram-weight: 0.20
semantic-weight: 0.45
# Additional score weight for recency
recency-boost-weight: 0.05
# Recency half-life in days
recency-half-life-days: 30
# Enable chunk representations for long documents
chunking-enabled: true
# Target chunk size in characters
chunk-target-chars: 1800
# Overlap between consecutive chunks (in characters, per the key suffix)
chunk-overlap-chars: 200
# Maximum number of chunks generated per document
max-chunks-per-document: 12
# Startup backfill limit for missing lexical vectors
startup-lexical-backfill-limit: 500
# Scheduled lexical backfill: on/off switch, delay in milliseconds, batch size
scheduled-lexical-backfill-enabled: true
scheduled-lexical-backfill-delay-ms: 30000
scheduled-lexical-backfill-batch-size: 200
# Number of top hits per engine returned by /search/debug
debug-top-hits-per-engine: 10
# Embedding subsystem: master switch, job settings and default model keys.
# NOTE(review): with indentation lost it is unclear which of the keys after
# `jobs:` belong to `jobs` and which to `embedding` - confirm against the file.
embedding:
enabled: true
jobs:
# The switch directly under `jobs` is off while the subsystem switch above is on
enabled: false
parallel-batch-count: 1
process-in-batches: true
batch-size: 16
execution-batch-size: 16
# Both defaults name the `e5-default` entry listed under `models`
default-document-model: e5-default
default-query-model: e5-default
# Provider connection definitions; the entry names are the values that the
# models' `provider-config-key` settings refer to.
providers:
# Mock provider with 16-dimensional vectors
mock-default:
type: mock
dimensions: 16
# HTTP/JSON provider.
# NOTE(review): no model in this listing references `external-e5` - confirm it is still needed.
external-e5:
type: http-json
base-url: http://172.20.241.55:8001
connect-timeout: 5s
read-timeout: 60s
batch-request:
truncate-text: false
truncate-length: 512
chunk-size: 16
# Provider referenced by the `e5-default` model; same base URL as `external-e5`
# but with longer timeouts and an additional X-Client header.
# NOTE(review): the service address is hard-coded to an internal IP in both
# HTTP providers - consider externalizing it per environment.
vector-sync-e5:
type: http-vector-sync
base-url: http://172.20.241.55:8001
connect-timeout: 30s
read-timeout: 300s
headers:
X-Client: dip
batch-request:
truncate-text: false
truncate-length: 512
chunk-size: 16
# Embedding model catalogue; each entry points at a provider via
# `provider-config-key`.
models:
# Mock model: 16 dimensions, the same value as the `mock-default` provider
mock-search:
provider-config-key: mock-default
provider-model-key: mock-search
dimensions: 16
distance-metric: COSINE
supports-query-embedding-mode: true
active: true
# E5 model bound to the `vector-sync-e5` provider, 1024 dimensions
e5-default:
provider-config-key: vector-sync-e5
provider-model-key: intfloat/multilingual-e5-large
dimensions: 1024
distance-metric: COSINE
supports-query-embedding-mode: true
supports-batch: true
# The prefixes are double-quoted so the colon-space and the trailing space
# stay part of the string value.
# NOTE(review): presumably prepended by the client (prefix-mode: CLIENT) - confirm.
prefix-mode: CLIENT
query-prefix: "query: "
document-prefix: "passage: "
active: true
# Embedding profiles: each definition lists the representation types to embed.
profiles:
definitions:
primary-only:
embed-representation-types: [SEMANTIC_TEXT]
primary-and-chunks:
embed-representation-types: [SEMANTIC_TEXT, CHUNK]
ted-semantic:
embed-representation-types: [SEMANTIC_TEXT]  # alternative (commented out): [SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK]
mail-message:
embed-representation-types: [SEMANTIC_TEXT, ATTACHMENT_ROLLUP]
attachment-chunks:
embed-representation-types: [CHUNK]
# Empty list: this profile names no representation types
disabled:
embed-representation-types: []
# Embedding policies and the rules that select them.
# Each rule pairs a `when` match with a `use` block naming policy, model and profile keys.
policies:
default-policy:
policy-key: generic-default
model-key: e5-default
query-model-key: e5-default
profile-key: primary-and-chunks
enabled: true
ted-policy:
policy-key: ted-default
model-key: e5-default
query-model-key: e5-default
profile-key: ted-semantic
enabled: true
rules:
# Uses the same policy, model and profile keys as `ted-policy` above
- name: ted-notice
when:
document-family: TED_NOTICE
use:
policy-key: ted-default
model-key: e5-default
query-model-key: e5-default
profile-key: ted-semantic
enabled: true
- name: email-root
when:
document-type: EMAIL
use:
policy-key: mail-default
model-key: e5-default
query-model-key: e5-default
profile-key: mail-message
enabled: true
# Matches on two conditions: source type and MIME type
- name: mail-attachment-pdf
when:
source-type: MAIL_ATTACHMENT
mime-type: application/pdf
use:
policy-key: mail-attachment-pdf
model-key: e5-default
query-model-key: e5-default
profile-key: attachment-chunks
enabled: true
# This rule names no model keys, uses the `disabled` profile and carries `enabled: false`.
# NOTE(review): `image/.*` looks like a regular expression - confirm how mime-type is matched.
- name: skip-images
when:
mime-type: image/.*
use:
policy-key: no-embedding-images
profile-key: disabled
enabled: false
# HTTP endpoint settings for the `python` clustering backend.
clustering:
python:
enabled: true
# NOTE(review): points at localhost, unlike the embedding providers' base-url - confirm per environment.
base-url: http://localhost:8001
cluster-path: /cluster
cluster-run-path: /cluster-run
request-mode: INLINE_VECTORS
connect-timeout: 30s
# 30-minute read timeout
read-timeout: 30m
# Phase 4 generic ingestion configuration
ingestion:
# Master switch for arbitrary document ingestion into the DOC model
enabled: true
# Enable file-system polling for non-TED documents
file-system-enabled: false
# Allow REST/API upload endpoints for arbitrary documents
rest-upload-enabled: true
# Input directory for the generic Camel file route
input-directory: /ted.europe/generic-input
# Regex for files accepted by the generic file route.
# Single-quoted with a single backslash: YAML applies no escape processing to
# plain or single-quoted scalars, so the former `\\.` reached the regex engine
# as "literal backslash followed by any character" instead of a literal dot.
file-pattern: '.*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$'
# Move successfully processed files here
processed-directory: .dip-processed
# Move failed files here
error-directory: .dip-error
# Polling interval for the generic route
poll-interval: 15000
# Maximum files per poll
max-messages-per-poll: 200
# Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs
default-owner-tenant-key:
# Default visibility when no explicit access context is provided
default-visibility: PUBLIC
# Optional default language for filesystem imports
default-language-code:
# Store small binary originals in DOC.doc_content.binary_content
store-original-binary-in-db: true
# Maximum binary payload size persisted inline in DB (5242880 bytes = 5 MiB)
max-binary-bytes-in-db: 5242880
# Deduplicate by content hash and attach additional sources to the same canonical document
deduplicate-by-content-hash: true
# Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers
store-original-content-for-wrapper-documents: true
# Queue only the primary text representation for vectorization
vectorize-primary-representation-only: true
# Import batch marker written to DOC.doc_source.import_batch_id
import-batch-id: phase4-generic
# Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI
ted-package-adapter-enabled: true
# Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI
mail-adapter-enabled: true
# Optional dedicated mail owner tenant, falls back to default-owner-tenant-key
mail-default-owner-tenant-key:
# Visibility for imported mail messages and attachments
mail-default-visibility: TENANT
# Expand ZIP attachments recursively through the mail adapter
expand-mail-zip-attachments: true
# Import batch marker for TED package roots and children
ted-package-import-batch-id: phase41-ted-package
# When true, TED package documents are stored only through the generic ingestion gateway
# and the legacy XML batch processing path is skipped
gateway-only-for-ted-packages: true
# Import batch marker for mail roots and attachments
mail-import-batch-id: phase41-mail
# Concurrency limits for TED package child processing
# NOTE(review): exact semantics are defined by the consumer - confirm.
ted-package-child-parallelism: 4
ted-package-child-max-in-flight: 8
# NEW Camel mail consumer route for provider-driven mail ingestion
mail-route:
# Enable/disable the NEW Camel mail consumer
enabled: false
# Generic mail server protocol (IMAP/IMAPS/POP3/POP3S)
protocol: IMAPS
# Mail server host
host: mail.mymagenta.business
# Mail server port; leave empty to use Camel component defaults
port: 993
# Mailbox username
username: archiv@procon.co.at
# Mailbox password; must be supplied through the MAIL_PASSWORD environment
# variable or property. No fallback value is kept in this file.
# NOTE(review): rotate the credential that was previously embedded here as
# the placeholder default.
password: "${MAIL_PASSWORD:}"
# Folder/mailbox name
folder-name: INBOX
# Optional stable provider account key; falls back to username
account-key:
# Delete messages after successful processing
delete: false
# Consume only unseen messages
unseen: true
# Keep messages unread while consuming
peek: true
# Poll delay in milliseconds
delay: 15000
# Maximum messages per poll
max-messages-per-poll: 20
# Upper bound on messages fetched per poll.
# NOTE(review): presumably maps to the Camel mail `fetchSize` option
# (-1 = no limit); 10 is lower than max-messages-per-poll above - confirm intent.
fetch-size: 10
# Close folder after each poll cycle
close-folder: false
# Camel mail debug mode
debug-mode: false
# Socket connection timeout in milliseconds
connection-timeout: 30000
# TED daily package download configuration
ted-download:
# Enable/disable automatic package download
enabled: false
# Base URL for TED Daily Packages
base-url: https://ted.europa.eu/packages/daily/
# Download directory for tar.gz files
download-directory: /ted.europe/downloads-new
# Start year for downloads
start-year: 2026
# Polling interval (milliseconds) - 1 minute
# NOTE(review): this comment previously said 2 minutes (120000 ms) - confirm which was intended.
poll-interval: 60000
# Retry interval for tail NOT_FOUND packages - 6 hours
not-found-retry-interval: 21600000
# Grace period after year end before a previous-year tail 404 is treated as final
previous-year-grace-period-days: 30
# Keep retrying current-year tail 404 packages indefinitely
retry-current-year-not-found-indefinitely: true
# Download timeout (milliseconds) - 5 minutes
download-timeout: 300000
# Max concurrent downloads
max-running-packages: 2
# Delay between downloads (milliseconds) for rate limiting - 5 seconds
delay-between-downloads: 5000
# Delete tar.gz after ingestion
delete-after-ingestion: true
# Time-tracking sources (Leitstand, Toggl Track).
# All three `enabled` switches in this section are false.
time:
enabled: false
leitstand:
enabled: false
startup-sync-enabled: false
startup-selective-materialization-enabled: true
# Quoted so the 24-digit identifier stays a string instead of being parsed as an integer
selective-materialization-person-dbk: "100920031023144811001000"
selective-materialization-person-number:
selective-materialization-build-projection: true
create-canonical-time-entries: true
build-search-projection: true
build-representations: true
queue-embeddings: true
startup-projection-rebuild-enabled: false
representation-language-code: de
incremental-enabled: true
scope-key: leitstand-default
import-batch-id: time-leitstand
reconcile-lookback-days: 7
# JDBC connection to the Leitstand SQL Server database (jTDS driver)
jdbc:
url: jdbc:jtds:sqlserver://mag2:1433;databaseName=spc
# NOTE(review): connects as `sa`; a least-privilege account would be safer.
username: sa
# Supplied through the LEITSTAND_JDBC_PASSWORD environment variable or
# property; the password is no longer stored in this file.
# NOTE(review): rotate the credential that was previously committed here.
password: "${LEITSTAND_JDBC_PASSWORD:}"
driver-class-name: net.sourceforge.jtds.jdbc.Driver
fetch-size: 500
query-timeout-seconds: 300
toggl-track:
enabled: false
import-batch-id: time-toggl
reconcile-lookback-days: 7
ted:  # Phase 3 TED projection configuration
projection:
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document
enabled: true
# Optional startup backfill for legacy TED documents without a projection row yet
startup-backfill-enabled: false
# Maximum number of legacy TED documents to backfill during startup
startup-backfill-limit: 250
# Structured-search limits: hybrid candidate count and facet bucket count
# NOTE(review): exact semantics are defined by the consumer - confirm.
structured-search-hybrid-candidate-limit: 5000
structured-search-facet-bucket-limit: 12
# Migration subsystems (legacy audit and legacy TED backfills)
migration:
legacy-audit:
# Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem
enabled: false
# Optional startup execution; the audit is read-only and only writes audit run/finding tables
# NOTE(review): set to true while the subsystem switch above is false - confirm this is intended.
startup-run-enabled: true
# Maximum number of legacy TED documents to scan during startup (0 = all)
startup-run-limit: 0
# Page size for legacy TED document scanning
page-size: 100
# Maximum number of persisted findings in a single run
max-findings-per-run: 10000
# Maximum number of grouped duplicate samples captured for aggregate checks
max-duplicate-samples: 100
# Legacy TED -> DOC/projection backfill settings
legacy-ted:
# Enable the resumable legacy TED -> DOC/projection backfill subsystem
enabled: false
# Run the backfill automatically on NEW-runtime startup
startup-enabled: false
# Number of legacy TED documents fetched and processed per batch
batch-size: 100
# Optional cap for a single invocation; 0 means migrate all remaining rows
max-documents-per-run: 0
# Resume the latest STOPPED/FAILED run from its saved cursor
resume-latest-incomplete-run: true
# Import batch id written to DOC.doc_source rows created by the migration
import-batch-id: legacy-ted-backfill
# Keep false for Wave 1; embeddings can be backfilled later as a separate step
queue-embeddings: false
# Embedding migration is off here, while chunk representations are still built
migrate-embeddings: false
build-chunk-representations: true
# Legacy TED embeddings backfill; both switches are off in this profile.
# NOTE(review): key semantics below are read from the key names only - confirm against the consumer.
legacy-ted-embeddings:
enabled: false
startup-enabled: false
batch-size: 500
# Same key as in `legacy-ted`, where 0 is documented as "all remaining rows"
max-documents-per-run: 0
skip-when-primary-representation-missing: true
queue-missing-embeddings: true