batch embedding support

2026-04-09 12:53:11 +02:00 · 2026-04-09 12:53:11 +02:00 · 6ae39b4ea5
parent 678db76415
commit 6ae39b4ea5
1 changed file with 28 additions and 4 deletions
--- a/src/main/resources/application-new.yml
+++ b/src/main/resources/application-new.yml
@@ -1,6 +1,7 @@
 dip:
  runtime:
    mode: NEW
+
  search:
      # Default page size for search results
      default-page-size: 20
@@ -36,9 +37,11 @@ dip:
      debug-top-hits-per-engine: 10

  embedding:
-      enabled: true
+      enabled: false
      jobs:
        enabled: true
+        process-in-batches: true
+        execution-batch-size: 20

      default-document-model: e5-default
      default-query-model: e5-default
@@ -58,10 +61,14 @@ dip:
        vector-sync-e5:
          type: http-vector-sync
          base-url: http://localhost:8001
-          connect-timeout: 5s
-          read-timeout: 60s
+          connect-timeout: 30s
+          read-timeout: 300s
          headers:
            X-Client: dip
+          batch-request:
+            truncate-text: true
+            truncate-length: 512
+            chunk-size: 8

      models:

@@ -79,6 +86,7 @@ dip:
          dimensions: 1024
          distance-metric: COSINE
          supports-query-embedding-mode: true
+          supports-batch: true
          active: true

      profiles:
@@ -215,7 +223,7 @@ dip:
    # ted packages download configuration
    ted-download:
      # Enable/disable automatic package download
-      enabled: true
+      enabled: false
      # Base URL for TED Daily Packages
      base-url: https://ted.europa.eu/packages/daily/
      # Download directory for tar.gz files
@@ -238,6 +246,22 @@ dip:
      delay-between-downloads: 5000
      # Delete tar.gz after ingestion
      delete-after-ingestion: true
+
+  migration:
+    legacy-audit:
+      # Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem
+      enabled: true
+      # Optional startup execution; the audit only reads the data it scans and writes only to the audit run/finding tables
+      startup-run-enabled: true
+      # Maximum number of legacy TED documents to scan during startup (0 = all)
+      startup-run-limit: 0
+      # Page size for legacy TED document scanning
+      page-size: 100
+      # Maximum number of persisted findings in a single run
+      max-findings-per-run: 10000
+      # Maximum number of grouped duplicate samples captured for aggregate checks
+      max-duplicate-samples: 100
+
  ted: # Phase 3 TED projection configuration
    projection:
      # Enable/disable dual-write into the TED projection model on top of DOC.doc_document