From 6ae39b4ea52a1aa5da04dc04530605639e345f7d Mon Sep 17 00:00:00 2001 From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com> Date: Thu, 9 Apr 2026 12:53:11 +0200 Subject: [PATCH] batch embedding support --- src/main/resources/application-new.yml | 32 ++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/main/resources/application-new.yml b/src/main/resources/application-new.yml index 4feb0bf..1bb47af 100644 --- a/src/main/resources/application-new.yml +++ b/src/main/resources/application-new.yml @@ -1,6 +1,7 @@ dip: runtime: mode: NEW + search: # Default page size for search results default-page-size: 20 @@ -36,9 +37,11 @@ dip: debug-top-hits-per-engine: 10 embedding: - enabled: true + enabled: false jobs: enabled: true + process-in-batches: true + execution-batch-size: 20 default-document-model: e5-default default-query-model: e5-default @@ -58,10 +61,14 @@ dip: vector-sync-e5: type: http-vector-sync base-url: http://localhost:8001 - connect-timeout: 5s - read-timeout: 60s + connect-timeout: 30s + read-timeout: 300s headers: X-Client: dip + batch-request: + truncate-text: true + truncate-length: 512 + chunk-size: 8 models: @@ -79,6 +86,7 @@ dip: dimensions: 1024 distance-metric: COSINE supports-query-embedding-mode: true + supports-batch: true active: true profiles: @@ -215,7 +223,7 @@ dip: # ted packages download configuration ted-download: # Enable/disable automatic package download - enabled: true + enabled: false # Base URL for TED Daily Packages base-url: https://ted.europa.eu/packages/daily/ # Download directory for tar.gz files @@ -238,6 +246,22 @@ dip: delay-between-downloads: 5000 # Delete tar.gz after ingestion delete-after-ingestion: true + + migration: + legacy-audit: + # Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem + enabled: true + # Optional startup execution; the audit is read-only and only writes audit run/finding tables + startup-run-enabled: true + # Maximum number of legacy TED documents to scan during startup (0 = all) + startup-run-limit: 0 + # Page size for legacy TED document scanning + page-size: 100 + # Maximum number of persisted findings in a single run + max-findings-per-run: 10000 + # Maximum number of grouped duplicate samples captured for aggregate checks + max-duplicate-samples: 100 + ted: # Phase 3 TED projection configuration projection: # Enable/disable dual-write into the TED projection model on top of DOC.doc_document