424 lines
15 KiB
YAML
424 lines
15 KiB
YAML
dip:
|
|
runtime:
|
|
mode: NEW
|
|
|
|
search:
|
|
# Default page size for search results
|
|
default-page-size: 20
|
|
# Maximum page size
|
|
max-page-size: 100
|
|
# Similarity threshold for vector search (0.0 - 1.0)
|
|
similarity-threshold: 0.7
|
|
# Minimum trigram similarity for fuzzy lexical matches
|
|
trigram-similarity-threshold: 0.12
|
|
# Candidate limits per engine before fusion/collapse
|
|
fulltext-candidate-limit: 120
|
|
trigram-candidate-limit: 120
|
|
semantic-candidate-limit: 120
|
|
# Hybrid fusion weights
|
|
fulltext-weight: 0.35
|
|
trigram-weight: 0.20
|
|
semantic-weight: 0.45
|
|
# Additional score weight for recency
|
|
recency-boost-weight: 0.05
|
|
# Recency half-life in days
|
|
recency-half-life-days: 30
|
|
# Enable chunk representations for long documents
|
|
chunking-enabled: true
|
|
# Target chunk size in characters
|
|
chunk-target-chars: 1800
|
|
# Overlap between consecutive chunks
|
|
chunk-overlap-chars: 200
|
|
# Maximum number of chunks generated per document
|
|
max-chunks-per-document: 12
|
|
# Startup backfill limit for missing lexical vectors
|
|
startup-lexical-backfill-limit: 500
|
|
scheduled-lexical-backfill-enabled: false
|
|
scheduled-lexical-backfill-delay-ms: 30000
|
|
scheduled-lexical-backfill-batch-size: 200
|
|
# Number of top hits per engine returned by /search/debug
|
|
debug-top-hits-per-engine: 10
|
|
|
|
embedding:
|
|
enabled: true
|
|
jobs:
|
|
enabled: false
|
|
parallel-batch-count: 2
|
|
process-in-batches: true
|
|
batch-size: 48
|
|
execution-batch-size: 48
|
|
startup:
|
|
# Enqueue missing DOC representation embeddings on NEW-runtime startup.
|
|
enqueue-missing-enabled: false
|
|
# Also process ready embedding jobs during startup. Requires dip.embedding.jobs.enabled=true.
|
|
process-ready-enabled: false
|
|
# Leave empty to enqueue missing embeddings for all document types, or set e.g. TED_NOTICE_LOT.
|
|
document-type:
|
|
# Optional representation filter, e.g. SEMANTIC_TEXT.
|
|
representation-type:
|
|
# Optional builder filter, e.g. ted-lot-clustering-text-v1.
|
|
builder-key:
|
|
primary-only: false
|
|
# Leave empty to use dip.embedding.default-document-model.
|
|
model-key:
|
|
# False skips representations that already have a COMPLETED embedding for the model.
|
|
force: false
|
|
batch-size: 1000
|
|
# 0 means enqueue all matching not-vectorized representations.
|
|
max-representations-per-run: 0
|
|
|
|
default-document-model: e5-default
|
|
default-query-model: e5-default
|
|
|
|
providers:
|
|
|
|
mock-default:
|
|
type: mock
|
|
dimensions: 16
|
|
|
|
external-e5:
|
|
type: http-json
|
|
base-url: http://172.20.241.55:8001
|
|
connect-timeout: 5s
|
|
read-timeout: 60s
|
|
batch-request:
|
|
truncate-text: false
|
|
truncate-length: 512
|
|
chunk-size: 16
|
|
|
|
vector-sync-e5:
|
|
type: http-vector-sync
|
|
base-url: http://172.20.241.55:8001
|
|
connect-timeout: 30s
|
|
read-timeout: 300s
|
|
headers:
|
|
X-Client: dip
|
|
batch-request:
|
|
truncate-text: false
|
|
truncate-length: 512
|
|
chunk-size: 16
|
|
|
|
models:
|
|
|
|
mock-search:
|
|
provider-config-key: mock-default
|
|
provider-model-key: mock-search
|
|
dimensions: 16
|
|
distance-metric: COSINE
|
|
supports-query-embedding-mode: true
|
|
active: true
|
|
|
|
e5-default:
|
|
provider-config-key: vector-sync-e5
|
|
provider-model-key: intfloat/multilingual-e5-large
|
|
dimensions: 1024
|
|
distance-metric: COSINE
|
|
supports-query-embedding-mode: true
|
|
supports-batch: true
|
|
prefix-mode: CLIENT
|
|
query-prefix: "query: "
|
|
document-prefix: "passage: "
|
|
active: true
|
|
|
|
profiles:
|
|
definitions:
|
|
primary-only:
|
|
embed-representation-types: [SEMANTIC_TEXT]
|
|
|
|
primary-and-chunks:
|
|
embed-representation-types: [SEMANTIC_TEXT, CHUNK]
|
|
|
|
ted-semantic:
|
|
embed-representation-types: [SEMANTIC_TEXT] #[SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK]
|
|
|
|
mail-message:
|
|
embed-representation-types: [SEMANTIC_TEXT, ATTACHMENT_ROLLUP]
|
|
|
|
attachment-chunks:
|
|
embed-representation-types: [CHUNK]
|
|
|
|
disabled:
|
|
embed-representation-types: []
|
|
|
|
policies:
|
|
default-policy:
|
|
policy-key: generic-default
|
|
model-key: e5-default
|
|
query-model-key: e5-default
|
|
profile-key: primary-and-chunks
|
|
enabled: true
|
|
|
|
ted-policy:
|
|
policy-key: ted-default
|
|
model-key: e5-default
|
|
query-model-key: e5-default
|
|
profile-key: ted-semantic
|
|
enabled: true
|
|
|
|
rules:
|
|
- name: ted-notice
|
|
when:
|
|
document-family: TED_NOTICE
|
|
use:
|
|
policy-key: ted-default
|
|
model-key: e5-default
|
|
query-model-key: e5-default
|
|
profile-key: ted-semantic
|
|
enabled: true
|
|
|
|
- name: email-root
|
|
when:
|
|
document-type: EMAIL
|
|
use:
|
|
policy-key: mail-default
|
|
model-key: e5-default
|
|
query-model-key: e5-default
|
|
profile-key: mail-message
|
|
enabled: true
|
|
|
|
- name: mail-attachment-pdf
|
|
when:
|
|
source-type: MAIL_ATTACHMENT
|
|
mime-type: application/pdf
|
|
use:
|
|
policy-key: mail-attachment-pdf
|
|
model-key: e5-default
|
|
query-model-key: e5-default
|
|
profile-key: attachment-chunks
|
|
enabled: true
|
|
|
|
- name: skip-images
|
|
when:
|
|
mime-type: image/.*
|
|
use:
|
|
policy-key: no-embedding-images
|
|
profile-key: disabled
|
|
enabled: false
|
|
|
|
clustering:
|
|
python:
|
|
enabled: true
|
|
base-url: http://localhost:8001
|
|
cluster-path: /cluster
|
|
cluster-run-path: /cluster-run
|
|
request-mode: INLINE_VECTORS
|
|
connect-timeout: 30s
|
|
read-timeout: 30m
|
|
|
|
# Phase 4 generic ingestion configuration
|
|
ingestion:
|
|
# Master switch for arbitrary document ingestion into the DOC model
|
|
enabled: true
|
|
# Enable file-system polling for non-TED documents
|
|
file-system-enabled: false
|
|
# Allow REST/API upload endpoints for arbitrary documents
|
|
rest-upload-enabled: true
|
|
# Input directory for the generic Camel file route
|
|
input-directory: /ted.europe/generic-input
|
|
# Regex for files accepted by the generic file route
|
|
# Single-quoted so the backslash stays literal: YAML does not process escapes here, and the regex sees \. (a literal dot).
file-pattern: '.*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$'
|
|
# Move successfully processed files here
|
|
processed-directory: .dip-processed
|
|
# Move failed files here
|
|
error-directory: .dip-error
|
|
# Polling interval for the generic route
|
|
poll-interval: 15000
|
|
# Maximum files per poll
|
|
max-messages-per-poll: 200
|
|
# Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs
|
|
default-owner-tenant-key:
|
|
# Default visibility when no explicit access context is provided
|
|
default-visibility: PUBLIC
|
|
# Optional default language for filesystem imports
|
|
default-language-code:
|
|
# Store small binary originals in DOC.doc_content.binary_content
|
|
store-original-binary-in-db: true
|
|
# Maximum binary payload size persisted inline in DB
|
|
max-binary-bytes-in-db: 5242880
|
|
# Deduplicate by content hash and attach additional sources to the same canonical document
|
|
deduplicate-by-content-hash: true
|
|
# Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers
|
|
store-original-content-for-wrapper-documents: true
|
|
# Queue only the primary text representation for vectorization
|
|
vectorize-primary-representation-only: true
|
|
# Import batch marker written to DOC.doc_source.import_batch_id
|
|
import-batch-id: phase4-generic
|
|
# Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI
|
|
ted-package-adapter-enabled: true
|
|
# Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI
|
|
mail-adapter-enabled: true
|
|
# Optional dedicated mail owner tenant, falls back to default-owner-tenant-key
|
|
mail-default-owner-tenant-key:
|
|
# Visibility for imported mail messages and attachments
|
|
mail-default-visibility: TENANT
|
|
# Expand ZIP attachments recursively through the mail adapter
|
|
expand-mail-zip-attachments: true
|
|
# Import batch marker for TED package roots and children
|
|
ted-package-import-batch-id: phase41-ted-package
|
|
# When true, TED package documents are stored only through the generic ingestion gateway
|
|
# and the legacy XML batch processing path is skipped
|
|
gateway-only-for-ted-packages: true
|
|
# Import batch marker for mail roots and attachments
|
|
mail-import-batch-id: phase41-mail
|
|
|
|
ted-package-child-parallelism: 4
|
|
ted-package-child-max-in-flight: 8
|
|
|
|
# NEW Camel mail consumer route for provider-driven mail ingestion
|
|
mail-route:
|
|
# Enable/disable the NEW Camel mail consumer
|
|
enabled: false
|
|
# Generic mail server protocol (IMAP/IMAPS/POP3/POP3S)
|
|
protocol: IMAPS
|
|
# Mail server host
|
|
host: mail.mymagenta.business
|
|
# Mail server port; leave empty to use Camel component defaults
|
|
port: 993
|
|
# Mailbox username
|
|
username: archiv@procon.co.at
|
|
# Mailbox password
|
|
password: ${MAIL_PASSWORD}
|
|
# Folder/mailbox name
|
|
folder-name: INBOX
|
|
# Optional stable provider account key; falls back to username
|
|
account-key:
|
|
# Delete messages after successful processing
|
|
delete: false
|
|
# Consume only unseen messages
|
|
unseen: true
|
|
# Keep messages unread while consuming
|
|
peek: true
|
|
# Poll delay in milliseconds
|
|
delay: 15000
|
|
# Maximum messages per poll
|
|
max-messages-per-poll: 20
|
|
# Maximum number of messages fetched per poll (NOTE(review): presumably maps to Camel mail fetchSize, where -1 fetches all - confirm)
|
|
fetch-size: 10
|
|
# Close folder after each poll cycle
|
|
close-folder: false
|
|
# Camel mail debug mode
|
|
debug-mode: false
|
|
# Socket connection timeout in milliseconds
|
|
connection-timeout: 30000
|
|
|
|
# TED package download configuration
|
|
ted-download:
|
|
# Enable/disable automatic package download
|
|
enabled: false
|
|
# Base URL for TED Daily Packages
|
|
base-url: https://ted.europa.eu/packages/daily/
|
|
# Download directory for tar.gz files
|
|
download-directory: /ted.europe/downloads-new
|
|
# Start year for downloads
|
|
start-year: 2026
|
|
# Polling interval (milliseconds) - 1 minute
|
|
poll-interval: 60000
|
|
# Retry interval for tail NOT_FOUND packages - 6 hours
|
|
not-found-retry-interval: 21600000
|
|
# Grace period after year end before a previous-year tail 404 is treated as final
|
|
previous-year-grace-period-days: 30
|
|
# Keep retrying current-year tail 404 packages indefinitely
|
|
retry-current-year-not-found-indefinitely: true
|
|
# Download timeout (milliseconds) - 5 minutes
|
|
download-timeout: 300000
|
|
# Max concurrent downloads
|
|
max-running-packages: 2
|
|
# Delay between downloads (milliseconds) for rate limiting - 5 seconds
|
|
delay-between-downloads: 5000
|
|
# Delete tar.gz after ingestion
|
|
delete-after-ingestion: true
|
|
|
|
time:
|
|
enabled: false
|
|
leitstand:
|
|
enabled: false
|
|
startup-sync-enabled: false
|
|
startup-selective-materialization-enabled: false
|
|
selective-materialization-person-dbk: #100920031023144811001000
|
|
selective-materialization-person-number:
|
|
selective-materialization-build-projection: true
|
|
create-canonical-time-entries: true
|
|
build-search-projection: true
|
|
build-representations: true
|
|
queue-embeddings: true
|
|
startup-projection-rebuild-enabled: false
|
|
representation-language-code: de
|
|
incremental-enabled: true
|
|
scope-key: leitstand-default
|
|
import-batch-id: time-leitstand
|
|
reconcile-lookback-days: 7
|
|
jdbc:
|
|
url: jdbc:jtds:sqlserver://mag2:1433;databaseName=spc
|
|
username: sa
|
|
password: ${LEITSTAND_JDBC_PASSWORD}  # injected from the environment; never commit database credentials
|
|
driver-class-name: net.sourceforge.jtds.jdbc.Driver
|
|
fetch-size: 500
|
|
query-timeout-seconds: 300
|
|
toggl-track:
|
|
enabled: false
|
|
import-batch-id: time-toggl
|
|
reconcile-lookback-days: 7
|
|
|
|
ted: # Phase 3 TED projection configuration
|
|
projection:
|
|
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document
|
|
enabled: true
|
|
# Optional startup backfill for legacy TED documents without a projection row yet
|
|
startup-backfill-enabled: false
|
|
# Maximum number of legacy TED documents to backfill during startup
|
|
startup-backfill-limit: 250
|
|
structured-search-hybrid-candidate-limit: 5000
|
|
structured-search-facet-bucket-limit: 12
|
|
lot-documents:
|
|
# Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot.
|
|
enabled: false
|
|
# Optional startup/backfill path for notices that were imported before lot documents existed.
|
|
startup-backfill-enabled: false
|
|
# Maximum number of legacy TED lot documents to backfill during startup (0 = all)
|
|
startup-backfill-limit: 0
|
|
# Queue embeddings whenever the lot semantic text representation is created or changed.
|
|
queue-embeddings-on-change: true
|
|
# Include parent notice project description even when the lot already has its own description.
|
|
include-parent-description: false
|
|
|
|
migration:
|
|
legacy-audit:
|
|
# Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem
|
|
enabled: false
|
|
# Optional startup execution; the audit is read-only and only writes audit run/finding tables
|
|
startup-run-enabled: true
|
|
# Maximum number of legacy TED documents to scan during startup (0 = all)
|
|
startup-run-limit: 0
|
|
# Page size for legacy TED document scanning
|
|
page-size: 100
|
|
# Maximum number of persisted findings in a single run
|
|
max-findings-per-run: 10000
|
|
# Maximum number of grouped duplicate samples captured for aggregate checks
|
|
max-duplicate-samples: 100
|
|
|
|
legacy-ted:
|
|
# Enable the resumable legacy TED -> DOC/projection backfill subsystem
|
|
enabled: false
|
|
# Run the backfill automatically on NEW-runtime startup
|
|
startup-enabled: false
|
|
# Number of legacy TED documents fetched and processed per batch
|
|
batch-size: 100
|
|
# Optional cap for a single invocation; 0 means migrate all remaining rows
|
|
max-documents-per-run: 0
|
|
# Resume the latest STOPPED/FAILED run from its saved cursor
|
|
resume-latest-incomplete-run: true
|
|
# Import batch id written to DOC.doc_source rows created by the migration
|
|
import-batch-id: legacy-ted-backfill
|
|
# Keep false for Wave 1; embeddings can be backfilled later as a separate step
|
|
queue-embeddings: false
|
|
migrate-embeddings: false
|
|
build-chunk-representations: true
|
|
|
|
legacy-ted-embeddings:
|
|
enabled: false
|
|
startup-enabled: false
|
|
batch-size: 500
|
|
max-documents-per-run: 0
|
|
skip-when-primary-representation-missing: true
|
|
queue-missing-embeddings: true
|