From 0ce5f5138208de2c8dae205c1c06ab02d52dd3f1 Mon Sep 17 00:00:00 2001
From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com>
Date: Wed, 15 Apr 2026 16:19:16 +0200
Subject: [PATCH] ted legacy embeddings migration
---
...ntIntelligencePlatformApplication.java.bak | 28 ---
.../config/LegacyTedBackfillProperties.java | 9 +-
.../LegacyTedEmbeddingBackfillProperties.java | 29 +++
.../entity/LegacyTedMigrationRun.java | 4 +
...gacyTedEmbeddingTargetQueryRepository.java | 49 +++++
.../LegacyTedBackfillMigrationService.java | 4 +-
.../service/LegacyTedBackfillWorker.java | 85 +++++++-
.../LegacyTedEmbeddingBackfillService.java | 188 ++++++++++++++++++
.../service/LegacyTedEmbeddingTarget.java | 8 +
...gacyTedEmbeddingBackfillStartupRunner.java | 33 +++
.../LegacyTedEmbeddingSnapshot.java | 15 ++
.../ProcurementDocumentRepository.java | 40 +++-
src/main/resources/application-new.yml | 19 +-
...2__doc_legacy_ted_backfill_performance.sql | 7 +
14 files changed, 475 insertions(+), 43 deletions(-)
delete mode 100644 src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak
create mode 100644 src/main/java/at/procon/dip/migration/config/LegacyTedEmbeddingBackfillProperties.java
create mode 100644 src/main/java/at/procon/dip/migration/repository/LegacyTedEmbeddingTargetQueryRepository.java
create mode 100644 src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingBackfillService.java
create mode 100644 src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingTarget.java
create mode 100644 src/main/java/at/procon/dip/migration/startup/LegacyTedEmbeddingBackfillStartupRunner.java
create mode 100644 src/main/java/at/procon/ted/repository/LegacyTedEmbeddingSnapshot.java
create mode 100644 src/main/resources/db/migration/V22__doc_legacy_ted_backfill_performance.sql
diff --git a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak
deleted file mode 100644
index db03770..0000000
--- a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak
+++ /dev/null
@@ -1,28 +0,0 @@
-package at.procon.dip;
-
-import at.procon.ted.config.TedProcessorProperties;
-import org.springframework.boot.SpringApplication;
-import org.springframework.boot.autoconfigure.SpringBootApplication;
-import org.springframework.boot.context.properties.EnableConfigurationProperties;
-import org.springframework.boot.autoconfigure.domain.EntityScan;
-import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
-import org.springframework.scheduling.annotation.EnableAsync;
-
-/**
- * Procon Document Intelligence Platform (DIP).
- *
- * <p>Phase 0 introduces a generic platform root namespace and architecture contracts
- * while keeping the existing TED-specific runtime intact. Subsequent phases can move
- * modules incrementally from {@code at.procon.ted} into the broader document platform.
- */
-@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
-@EnableAsync
-//@EnableConfigurationProperties(TedProcessorProperties.class)
-@EntityScan(basePackages = {"at.procon.ted.model.entity"})
-@EnableJpaRepositories(basePackages = {"at.procon.ted.repository"})
-public class DocumentIntelligencePlatformApplication {
-
- public static void main(String[] args) {
- SpringApplication.run(DocumentIntelligencePlatformApplication.class, args);
- }
-}
diff --git a/src/main/java/at/procon/dip/migration/config/LegacyTedBackfillProperties.java b/src/main/java/at/procon/dip/migration/config/LegacyTedBackfillProperties.java
index 9b28ced..90a6128 100644
--- a/src/main/java/at/procon/dip/migration/config/LegacyTedBackfillProperties.java
+++ b/src/main/java/at/procon/dip/migration/config/LegacyTedBackfillProperties.java
@@ -30,6 +30,13 @@ public class LegacyTedBackfillProperties {
/** Import batch id written to DOC.doc_source rows created by the migration. */
private String importBatchId = "legacy-ted-backfill";
- /** Queue embeddings for migrated TED representations after the DOC/projection backfill. */
+ /** Queue fresh embeddings for migrated TED representations after the DOC/projection backfill. */
private boolean queueEmbeddings = false;
+
+ /** Migrate existing legacy TED content vectors into DOC.doc_embedding for the primary representation. */
+ private boolean migrateEmbeddings = false;
+
+ /** Build CHUNK representations during migration. Disable for a faster structural-only backfill. */
+ private boolean buildChunkRepresentations = true;
}
+
diff --git a/src/main/java/at/procon/dip/migration/config/LegacyTedEmbeddingBackfillProperties.java b/src/main/java/at/procon/dip/migration/config/LegacyTedEmbeddingBackfillProperties.java
new file mode 100644
index 0000000..70eec0c
--- /dev/null
+++ b/src/main/java/at/procon/dip/migration/config/LegacyTedEmbeddingBackfillProperties.java
@@ -0,0 +1,29 @@
+package at.procon.dip.migration.config;
+
+import lombok.Data;
+import org.springframework.boot.context.properties.ConfigurationProperties;
+import org.springframework.context.annotation.Configuration;
+
+@Configuration
+@ConfigurationProperties(prefix = "dip.migration.legacy-ted-embeddings")
+@Data
+public class LegacyTedEmbeddingBackfillProperties {
+ // Settings for the embedding-only backfill, bound from the dip.migration.legacy-ted-embeddings.* keys.
+ /** Master switch for the legacy TED embedding-only backfill; when false the backfill logs that it is disabled and returns. */
+ private boolean enabled = false;
+
+ /** Run the embedding-only backfill automatically on application startup in NEW runtime; only honoured when {@code enabled} is also true. */
+ private boolean startupEnabled = false;
+
+ /** Number of legacy TED documents to inspect per cursor batch; values below 1 are treated as 1. */
+ private int batchSize = 500;
+
+ /** Optional cap on the number of documents inspected by a single invocation. 0 or negative means unlimited. */
+ private long maxDocumentsPerRun = 0;
+
+ /** Skip legacy TED rows that do not yet have a migrated primary SEMANTIC_TEXT representation; such rows are never migrated or queued. */
+ private boolean skipWhenPrimaryRepresentationMissing = true;
+
+ /** Queue a fresh embedding job when no legacy vector exists for a migrated document; when false such documents are only counted as skipped. */
+ private boolean queueMissingEmbeddings = false;
+}
diff --git a/src/main/java/at/procon/dip/migration/entity/LegacyTedMigrationRun.java b/src/main/java/at/procon/dip/migration/entity/LegacyTedMigrationRun.java
index 4b25771..6a1e884 100644
--- a/src/main/java/at/procon/dip/migration/entity/LegacyTedMigrationRun.java
+++ b/src/main/java/at/procon/dip/migration/entity/LegacyTedMigrationRun.java
@@ -46,6 +46,10 @@ public class LegacyTedMigrationRun {
@Column(name = "queue_embeddings", nullable = false)
private boolean queueEmbeddings;
+ @Column(name = "build_chunk_representations", nullable = false)
+ @Builder.Default
+ private boolean buildChunkRepresentations = true;
+
@Column(name = "batch_size", nullable = false)
private int batchSize;
diff --git a/src/main/java/at/procon/dip/migration/repository/LegacyTedEmbeddingTargetQueryRepository.java b/src/main/java/at/procon/dip/migration/repository/LegacyTedEmbeddingTargetQueryRepository.java
new file mode 100644
index 0000000..f510845
--- /dev/null
+++ b/src/main/java/at/procon/dip/migration/repository/LegacyTedEmbeddingTargetQueryRepository.java
@@ -0,0 +1,49 @@
+package at.procon.dip.migration.repository;
+
+import at.procon.dip.migration.service.LegacyTedEmbeddingTarget;
+import jakarta.persistence.EntityManager;
+import jakarta.persistence.Query;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Repository;
+
+@Repository
+@RequiredArgsConstructor
+public class LegacyTedEmbeddingTargetQueryRepository {
+ // Native-query lookup that maps legacy TED document ids to their migrated document and primary SEMANTIC_TEXT representation.
+ private final EntityManager entityManager;
+ /** Returns one target per matching projection/representation row; an empty list for null or empty input. */
+ @SuppressWarnings("unchecked")
+ public List findPrimarySemanticTargetsByLegacyIds(Collection legacyIds) {
+ if (legacyIds == null || legacyIds.isEmpty()) { // nothing to look up, so no query is issued
+ return List.of();
+ }
+ // Joins the TED projection to its document's text representations and keeps only the primary SEMANTIC_TEXT one.
+ Query query = entityManager.createNativeQuery("""
+ SELECT p.legacy_procurement_document_id AS legacy_procurement_document_id,
+ p.document_id AS document_id,
+ r.id AS representation_id
+ FROM ted.ted_notice_projection p
+ JOIN doc.doc_text_representation r
+ ON r.document_id = p.document_id
+ WHERE p.legacy_procurement_document_id IN (:legacyIds)
+ AND r.representation_type = 'SEMANTIC_TEXT'
+ AND COALESCE(r.is_primary, FALSE) = TRUE
+ """);
+ query.setParameter("legacyIds", legacyIds);
+ // Each row is an Object[] whose positions follow the SELECT list: legacy id, document id, representation id.
+ List rows = query.getResultList();
+ List results = new ArrayList<>(rows.size());
+ for (Object[] row : rows) {
+ results.add(new LegacyTedEmbeddingTarget(
+ (UUID) row[0], // NOTE(review): assumes the provider materialises uuid columns as java.util.UUID — confirm
+ (UUID) row[1],
+ (UUID) row[2]
+ ));
+ }
+ return results;
+ }
+}
diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java
index 5a68cdb..5d3d67e 100644
--- a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java
+++ b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java
@@ -40,8 +40,8 @@ public class LegacyTedBackfillMigrationService {
}
LegacyTedMigrationRun run = resolveRun();
- log.info("Starting legacy TED -> DOC/projection backfill run {} (batchSize={}, maxDocumentsPerRun={}, queueEmbeddings={})",
- run.getId(), run.getBatchSize(), run.getMaxDocumentsPerRun(), run.isQueueEmbeddings());
+ log.info("Starting legacy TED -> DOC/projection backfill run {} (batchSize={}, maxDocumentsPerRun={}, queueEmbeddings={}, migrateEmbeddings={})",
+ run.getId(), run.getBatchSize(), run.getMaxDocumentsPerRun(), run.isQueueEmbeddings(), properties.isMigrateEmbeddings());
long existingCheckpointCount = checkpointRepository.countByRun_Id(run.getId());
int batchNumber = existingCheckpointCount >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) existingCheckpointCount;
diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java
index 87f5e78..3b5caae 100644
--- a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java
+++ b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java
@@ -27,7 +27,9 @@ import at.procon.dip.domain.document.service.command.CreateDocumentRelationComma
import at.procon.dip.domain.ted.service.TedGenericDocumentRootService;
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
import at.procon.dip.domain.ted.service.TedPackageDocumentService;
+import at.procon.dip.migration.config.LegacyTedBackfillProperties;
import at.procon.dip.embedding.config.EmbeddingProperties;
+import at.procon.dip.embedding.service.EmbeddingPersistenceService;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.ExtractionResult;
@@ -40,14 +42,18 @@ import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.dip.search.service.DocumentLexicalIndexService;
import at.procon.ted.model.entity.ProcurementDocument;
+import at.procon.ted.repository.LegacyTedEmbeddingSnapshot;
import at.procon.ted.repository.ProcurementDocumentRepository;
import at.procon.ted.util.HashUtils;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.ArrayList;
+import java.util.LinkedHashSet;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
+import java.util.Optional;
+import java.util.Set;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
@@ -83,7 +89,9 @@ public class LegacyTedBackfillWorker {
private final DocumentRelationService documentRelationService;
private final TextRepresentationBuildService textRepresentationBuildService;
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
+ private final EmbeddingPersistenceService embeddingPersistenceService;
private final EmbeddingProperties embeddingProperties;
+ private final LegacyTedBackfillProperties backfillProperties;
@Transactional(propagation = Propagation.REQUIRES_NEW)
public BackfillOutcome backfill(UUID legacyProcurementDocumentId, String importBatchId, boolean queueEmbeddings) {
@@ -110,8 +118,12 @@ public class LegacyTedBackfillWorker {
UUID projectionId = tedNoticeProjectionService.registerOrRefreshProjection(legacyDocument, document.getId());
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
+ Set migratedRepresentationIds = backfillProperties.isMigrateEmbeddings()
+ ? migrateLegacyEmbeddings(legacyDocument, savedRepresentations)
+ : Set.of();
+
if (queueEmbeddings) {
- queueEmbeddings(document.getId(), savedRepresentations);
+ queueEmbeddings(document.getId(), savedRepresentations, migratedRepresentationIds);
}
return new BackfillOutcome(document.getId(), projectionId);
@@ -307,13 +319,82 @@ public class LegacyTedBackfillWorker {
.findFirst();
}
- private void queueEmbeddings(UUID documentId, List representations) {
+ private Set migrateLegacyEmbeddings(ProcurementDocument legacyDocument,
+ List savedRepresentations) { // Copies the legacy content vector onto one saved representation; returns the ids that received a vector (empty if none).
+ LegacyTedEmbeddingSnapshot embeddingSnapshot = procurementDocumentRepository
+ .findEmbeddingSnapshotByIdForMigration(legacyDocument.getId())
+ .orElse(null);
+ if (embeddingSnapshot == null || !StringUtils.hasText(embeddingSnapshot.getVectorText())) { // no legacy vector stored for this notice
+ return Set.of();
+ }
+ if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
+ log.warn("Skipping legacy embedding migration for TED notice {} because no default document model is configured", legacyDocument.getId());
+ return Set.of();
+ }
+ // The selector falls back to the first list element, so this throws only when savedRepresentations is null or empty.
+ DocumentTextRepresentation targetRepresentation = selectPrimaryEmbeddingTarget(savedRepresentations)
+ .orElseThrow(() -> new IllegalStateException("No suitable representation found for migrated embedding of legacy TED notice " + legacyDocument.getId()));
+
+ float[] vector = parseVectorText(embeddingSnapshot.getVectorText());
+ if (vector.length == 0) {
+ log.warn("Skipping legacy embedding migration for TED notice {} because the legacy vector is empty", legacyDocument.getId());
+ return Set.of();
+ }
+ // NOTE(review): presumably ensurePending creates or reuses the embedding row that saveCompleted then fills — confirm in EmbeddingPersistenceService.
+ var embedding = embeddingPersistenceService.ensurePending(targetRepresentation.getId(), embeddingProperties.getDefaultDocumentModel());
+ embeddingPersistenceService.saveCompleted(embedding.getId(), vector, embeddingSnapshot.getTokenCount());
+ log.debug("Migrated legacy embedding for TED notice {} to representation {} (dimension={}, tokenCount={})",
+ legacyDocument.getId(), targetRepresentation.getId(), vector.length, embeddingSnapshot.getTokenCount());
+ return Set.of(targetRepresentation.getId()); // the caller passes this set on so the representation is not queued again
+ }
+
+ private Optional selectPrimaryEmbeddingTarget(List representations) { // Picks the representation that should carry the migrated legacy vector.
+ if (representations == null || representations.isEmpty()) {
+ return Optional.empty();
+ }
+ return representations.stream()
+ .filter(rep -> rep.getRepresentationType() == RepresentationType.SEMANTIC_TEXT && rep.isPrimaryRepresentation()) // 1st choice: primary SEMANTIC_TEXT
+ .findFirst()
+ .or(() -> representations.stream().filter(DocumentTextRepresentation::isPrimaryRepresentation).findFirst()) // 2nd: any primary representation
+ .or(() -> representations.stream().filter(rep -> rep.getRepresentationType() == RepresentationType.SEMANTIC_TEXT).findFirst()) // 3rd: any SEMANTIC_TEXT
+ .or(() -> Optional.of(representations.get(0))); // last resort: the first representation in the list
+ }
+
+ private float[] parseVectorText(String vectorText) {
+ if (!StringUtils.hasText(vectorText)) {
+ return new float[0];
+ }
+ String trimmed = vectorText.trim();
+ if (trimmed.startsWith("[")) {
+ trimmed = trimmed.substring(1);
+ }
+ if (trimmed.endsWith("]")) {
+ trimmed = trimmed.substring(0, trimmed.length() - 1);
+ }
+ if (!StringUtils.hasText(trimmed)) {
+ return new float[0];
+ }
+ String[] parts = trimmed.split(",");
+ float[] result = new float[parts.length];
+ for (int i = 0; i < parts.length; i++) {
+ result[i] = Float.parseFloat(parts[i].trim());
+ }
+ return result;
+ }
+
+ private void queueEmbeddings(UUID documentId,
+ List representations,
+ Set migratedRepresentationIds) {
if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
log.debug("Skipping embedding queue for migrated document {} because no default document model is configured", documentId);
return;
}
+ Set skippedIds = migratedRepresentationIds == null ? Set.of() : new LinkedHashSet<>(migratedRepresentationIds);
for (DocumentTextRepresentation representation : representations) {
+ if (representation.getId() != null && skippedIds.contains(representation.getId())) {
+ continue;
+ }
RepresentationType type = representation.getRepresentationType();
boolean queue = switch (type) {
case SEMANTIC_TEXT -> true;
diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingBackfillService.java b/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingBackfillService.java
new file mode 100644
index 0000000..a9a0232
--- /dev/null
+++ b/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingBackfillService.java
@@ -0,0 +1,188 @@
+package at.procon.dip.migration.service;
+
+import at.procon.dip.embedding.config.EmbeddingProperties;
+import at.procon.dip.embedding.service.EmbeddingPersistenceService;
+import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
+import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
+import at.procon.dip.migration.repository.LegacyTedEmbeddingTargetQueryRepository;
+import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
+import at.procon.dip.runtime.config.RuntimeMode;
+import at.procon.ted.repository.LegacyTedEmbeddingSnapshot;
+import at.procon.ted.repository.LegacyTedMigrationCursor;
+import at.procon.ted.repository.ProcurementDocumentRepository;
+import java.time.Instant;
+import java.time.OffsetDateTime;
+import java.time.ZoneId;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.util.StringUtils;
+
+@Service
+@ConditionalOnRuntimeMode(RuntimeMode.NEW)
+@RequiredArgsConstructor
+@Slf4j
+public class LegacyTedEmbeddingBackfillService {
+
+ private final LegacyTedEmbeddingBackfillProperties properties;
+ private final ProcurementDocumentRepository procurementDocumentRepository;
+ private final LegacyTedEmbeddingTargetQueryRepository targetQueryRepository;
+ private final EmbeddingPersistenceService embeddingPersistenceService;
+ private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
+ private final EmbeddingProperties embeddingProperties;
+
+ public void runBackfill() {
+     // Copies legacy TED content vectors onto the primary SEMANTIC_TEXT representation of documents that
+     // were already migrated, walking ted.procurement_document in (created_at, id) order batch by batch.
+     if (!properties.isEnabled()) {
+         log.info("Legacy TED embedding-only backfill is disabled");
+         return;
+     }
+     if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
+         log.warn("Skipping legacy TED embedding-only backfill because no default document model is configured");
+         return;
+     }
+
+     log.info("Starting legacy TED embedding-only backfill (batchSize={}, maxDocumentsPerRun={}, queueMissingEmbeddings={})",
+             properties.getBatchSize(), properties.getMaxDocumentsPerRun(), properties.isQueueMissingEmbeddings());
+
+     // NOTE(review): the cursor lives only in these locals, so every invocation restarts at the first legacy row.
+     Instant lastCreatedAt = null;
+     UUID lastId = null;
+     long inspected = 0;
+     long migrated = 0;
+     long queued = 0;
+     long skippedMissingTarget = 0;
+     long skippedMissingVector = 0;
+
+     while (true) {
+         int limit = effectiveBatchLimit(inspected);
+         if (limit <= 0) {
+             break; // per-run cap reached
+         }
+
+         OffsetDateTime after = lastCreatedAt == null ? null : lastCreatedAt.atZone(ZoneId.systemDefault()).toOffsetDateTime();
+         List<LegacyTedMigrationCursor> cursors = loadNextBatch(after, lastId, limit);
+         if (cursors.isEmpty()) {
+             break; // no legacy rows left
+         }
+
+         List<UUID> legacyIds = new ArrayList<>(cursors.size());
+         for (LegacyTedMigrationCursor cursor : cursors) {
+             legacyIds.add(cursor.getId());
+         }
+         // Resolve targets and vectors with one bulk query each per batch.
+         Map<UUID, LegacyTedEmbeddingTarget> targetsByLegacyId = indexTargets(
+                 targetQueryRepository.findPrimarySemanticTargetsByLegacyIds(legacyIds)
+         );
+         Map<UUID, LegacyTedEmbeddingSnapshot> snapshotsByLegacyId = indexSnapshots(
+                 procurementDocumentRepository.findEmbeddingSnapshotsByIdsForMigration(legacyIds)
+         );
+
+         for (LegacyTedMigrationCursor cursor : cursors) {
+             inspected++;
+             lastCreatedAt = cursor.getCreatedAt(); // advance the cursor once, whatever the outcome below
+             lastId = cursor.getId();
+             LegacyTedEmbeddingTarget target = targetsByLegacyId.get(cursor.getId());
+             LegacyTedEmbeddingSnapshot snapshot = snapshotsByLegacyId.get(cursor.getId());
+
+             if (target == null) {
+                 skippedMissingTarget++; // counted unconditionally so the counters always add up to `inspected`
+                 if (!properties.isSkipWhenPrimaryRepresentationMissing()) {
+                     log.warn("Legacy TED document {} has no migrated primary SEMANTIC_TEXT representation; embedding not migrated", cursor.getId());
+                 }
+                 continue;
+             }
+
+             if (snapshot != null && StringUtils.hasText(snapshot.getVectorText())) {
+                 float[] vector = parseVectorText(snapshot.getVectorText());
+                 if (vector.length > 0) {
+                     var embedding = embeddingPersistenceService.ensurePending(target.representationId(), embeddingProperties.getDefaultDocumentModel());
+                     embeddingPersistenceService.saveCompleted(embedding.getId(), vector, snapshot.getTokenCount());
+                     migrated++;
+                 } else {
+                     skippedMissingVector++;
+                 }
+             } else if (properties.isQueueMissingEmbeddings()) {
+                 embeddingOrchestrator.enqueueRepresentation(
+                         target.documentId(),
+                         target.representationId(),
+                         embeddingProperties.getDefaultDocumentModel()
+                 );
+                 queued++;
+             } else {
+                 skippedMissingVector++;
+             }
+         }
+
+         log.info("Legacy TED embedding-only backfill progress: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
+                 inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
+     }
+
+     log.info("Legacy TED embedding-only backfill finished: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
+             inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
+ }
+
+ protected List loadNextBatch(OffsetDateTime lastCreatedAt, UUID lastId, int limit) {
+     // Without a complete (createdAt, id) cursor the scan starts at the first batch.
+     boolean hasCursor = lastCreatedAt != null && lastId != null;
+     return hasCursor ? procurementDocumentRepository.findNextMigrationBatch(lastCreatedAt, lastId, limit)
+             : procurementDocumentRepository.findFirstMigrationBatch(limit);
+ }
+
+ private int effectiveBatchLimit(long processedInThisInvocation) {
+ long maxPerRun = properties.getMaxDocumentsPerRun();
+ if (maxPerRun <= 0) {
+ return Math.max(1, properties.getBatchSize());
+ }
+ long remaining = maxPerRun - processedInThisInvocation;
+ if (remaining <= 0) {
+ return 0;
+ }
+ return (int) Math.max(1L, Math.min(properties.getBatchSize(), remaining));
+ }
+
+ private Map indexTargets(Collection targets) { // Indexes the targets by legacy document id, keeping encounter order.
+ Map indexed = new LinkedHashMap<>();
+ for (LegacyTedEmbeddingTarget target : targets) {
+ indexed.putIfAbsent(target.legacyProcurementDocumentId(), target); // the first target wins if several rows share a legacy id
+ }
+ return indexed;
+ }
+
+ private Map indexSnapshots(Collection snapshots) { // Indexes the embedding snapshots by legacy document id, keeping encounter order.
+ Map indexed = new LinkedHashMap<>();
+ for (LegacyTedEmbeddingSnapshot snapshot : snapshots) {
+ indexed.put(snapshot.getId(), snapshot); // unlike indexTargets, a later snapshot replaces an earlier one with the same id
+ }
+ return indexed;
+ }
+
+ private float[] parseVectorText(String vectorText) {
+     // Turns the text rendering of a legacy vector (optionally wrapped in [ ]) into a float array.
+     if (!StringUtils.hasText(vectorText)) {
+         return new float[0];
+     }
+     String csv = vectorText.trim();
+     csv = csv.startsWith("[") ? csv.substring(1) : csv;
+     csv = csv.endsWith("]") ? csv.substring(0, csv.length() - 1) : csv;
+     if (!StringUtils.hasText(csv)) {
+         // Only brackets and/or whitespace were present.
+         return new float[0];
+     }
+     String[] components = csv.split(",");
+     float[] values = new float[components.length];
+     int next = 0;
+     for (String component : components) {
+         // A non-numeric component makes Float.parseFloat throw NumberFormatException.
+         values[next++] = Float.parseFloat(component.trim());
+     }
+     return values;
+ }
+}
diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingTarget.java b/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingTarget.java
new file mode 100644
index 0000000..54783d3
--- /dev/null
+++ b/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingTarget.java
@@ -0,0 +1,8 @@
+package at.procon.dip.migration.service;
+
+import java.util.UUID;
+
+public record LegacyTedEmbeddingTarget(UUID legacyProcurementDocumentId, // id of the legacy TED procurement document the target was resolved for
+ UUID documentId, // id of the migrated document
+ UUID representationId) { // id of the text representation that receives the embedding
+}
diff --git a/src/main/java/at/procon/dip/migration/startup/LegacyTedEmbeddingBackfillStartupRunner.java b/src/main/java/at/procon/dip/migration/startup/LegacyTedEmbeddingBackfillStartupRunner.java
new file mode 100644
index 0000000..0df389d
--- /dev/null
+++ b/src/main/java/at/procon/dip/migration/startup/LegacyTedEmbeddingBackfillStartupRunner.java
@@ -0,0 +1,33 @@
+package at.procon.dip.migration.startup;
+
+import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
+import at.procon.dip.migration.service.LegacyTedEmbeddingBackfillService;
+import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
+import at.procon.dip.runtime.config.RuntimeMode;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.boot.ApplicationArguments;
+import org.springframework.boot.ApplicationRunner;
+import org.springframework.core.Ordered;
+import org.springframework.core.annotation.Order;
+import org.springframework.stereotype.Component;
+
+@Component
+@ConditionalOnRuntimeMode(RuntimeMode.NEW)
+@RequiredArgsConstructor
+@Order(Ordered.LOWEST_PRECEDENCE)
+@Slf4j
+public class LegacyTedEmbeddingBackfillStartupRunner implements ApplicationRunner {
+
+ private final LegacyTedEmbeddingBackfillProperties properties;
+ private final LegacyTedEmbeddingBackfillService backfillService;
+
+ @Override
+ public void run(ApplicationArguments args) {
+ if (!properties.isEnabled() || !properties.isStartupEnabled()) {
+ return;
+ }
+ log.info("Startup-triggered legacy TED embedding-only backfill is enabled");
+ backfillService.runBackfill();
+ }
+}
diff --git a/src/main/java/at/procon/ted/repository/LegacyTedEmbeddingSnapshot.java b/src/main/java/at/procon/ted/repository/LegacyTedEmbeddingSnapshot.java
new file mode 100644
index 0000000..85b44bb
--- /dev/null
+++ b/src/main/java/at/procon/ted/repository/LegacyTedEmbeddingSnapshot.java
@@ -0,0 +1,15 @@
+package at.procon.ted.repository;
+
+import java.util.UUID;
+
+/**
+ * Lightweight projection for migrating legacy TED notice embeddings into the new DOC embedding model.
+ */
+public interface LegacyTedEmbeddingSnapshot {
+ /** Id of the legacy {@code ted.procurement_document} row. */
+ UUID getId();
+ /** The legacy {@code content_vector} column, cast to text by the repository queries that populate this projection. */
+ String getVectorText();
+ /** Value of {@code embedding_token_count}; boxed, so presumably nullable — confirm against the schema. */
+ Integer getTokenCount();
+}
diff --git a/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java b/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java
index bf76099..5527a70 100644
--- a/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java
+++ b/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java
@@ -10,8 +10,10 @@ import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import org.springframework.stereotype.Repository;
+import java.time.Instant;
import java.time.LocalDate;
import java.time.OffsetDateTime;
+import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
@@ -216,7 +218,7 @@ public interface ProcurementDocumentRepository extends
@Query(value = """
SELECT p.id AS id, p.created_at AS createdAt
FROM ted.procurement_document p
- ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
+ ORDER BY p.created_at ASC, p.id ASC
LIMIT :limit
""", nativeQuery = true)
List findFirstMigrationBatch(@Param("limit") int limit);
@@ -228,14 +230,34 @@ public interface ProcurementDocumentRepository extends
SELECT p.id AS id, p.created_at AS createdAt
FROM ted.procurement_document p
WHERE p.created_at > :lastCreatedAt
- OR (p.created_at = :lastCreatedAt AND CAST(p.id AS text) > CAST(:lastId AS text))
- ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
+ OR (p.created_at = :lastCreatedAt AND p.id > :lastId)
+ ORDER BY p.created_at ASC, p.id ASC
LIMIT :limit
""", nativeQuery = true)
List findNextMigrationBatch(@Param("lastCreatedAt") OffsetDateTime lastCreatedAt,
@Param("lastId") UUID lastId,
@Param("limit") int limit);
+ @Query("SELECT p FROM ProcurementDocument p WHERE p.id IN :ids")
+ List findAllByIdInForMigration(@Param("ids") Collection ids); // Bulk-loads legacy documents by id. NOTE(review): no caller is visible in this change — confirm it is used.
+
+ @Query("SELECT l FROM ProcurementLot l WHERE l.document.id IN :ids")
+ List findLotsByDocumentIdInForMigration(@Param("ids") Collection ids); // Bulk-loads the lots of the given documents. NOTE(review): no caller is visible in this change — confirm it is used.
+
+ @Query("SELECT o FROM Organization o WHERE o.document.id IN :ids")
+ List findOrganizationsByDocumentIdInForMigration(@Param("ids") Collection ids); // Bulk-loads the organizations of the given documents. NOTE(review): no caller is visible in this change — confirm it is used.
+
+ @Query(value = """
+ SELECT p.id AS id,
+ CAST(p.content_vector AS text) AS vectorText,
+ p.embedding_token_count AS tokenCount
+ FROM ted.procurement_document p
+ WHERE p.id IN :ids
+ AND p.content_vector IS NOT NULL
+ """, nativeQuery = true) // rows without a content_vector are filtered out, so such ids simply have no snapshot in the result
+ List findEmbeddingSnapshotsByIdsForMigration(@Param("ids") Collection ids); // bulk variant: id, vector as text and token count for each requested document
+
+
/**
* Delete all documents created before the specified date.
* Cascading deletes will automatically remove related lots, organizations, and logs.
@@ -254,6 +276,14 @@ public interface ProcurementDocumentRepository extends
* @return Number of documents
*/
long countByCreatedAtBefore(OffsetDateTime cutoffDate);
-}
-
+ @Query(value = """
+ SELECT p.id AS id,
+ CAST(p.content_vector AS text) AS vectorText,
+ p.embedding_token_count AS tokenCount
+ FROM ted.procurement_document p
+ WHERE p.id = :id
+ AND p.content_vector IS NOT NULL
+ """, nativeQuery = true) // a document without a content_vector yields an empty Optional
+ Optional findEmbeddingSnapshotByIdForMigration(@Param("id") UUID id); // single-document variant of the embedding snapshot lookup
+}
\ No newline at end of file
diff --git a/src/main/resources/application-new.yml b/src/main/resources/application-new.yml
index bbd1b58..bb50a29 100644
--- a/src/main/resources/application-new.yml
+++ b/src/main/resources/application-new.yml
@@ -1,7 +1,6 @@
dip:
runtime:
mode: NEW
-
search:
# Default page size for search results
default-page-size: 20
@@ -37,9 +36,9 @@ dip:
debug-top-hits-per-engine: 10
embedding:
- enabled: false
+ enabled: true
jobs:
- enabled: true
+ enabled: false
process-in-batches: true
execution-batch-size: 20
@@ -54,7 +53,7 @@ dip:
external-e5:
type: http-json
- base-url: http://172.20.240.18:8001
+ base-url: http://localhost:8001
connect-timeout: 5s
read-timeout: 60s
@@ -285,4 +284,14 @@ dip:
# Import batch id written to DOC.doc_source rows created by the migration
import-batch-id: legacy-ted-backfill
# Keep false for Wave 1; embeddings can be backfilled later as a separate step
- queue-embeddings: false
\ No newline at end of file
+ queue-embeddings: false
+ migrate-embeddings: false
+ build-chunk-representations: true
+
+ legacy-ted-embeddings: # bound to LegacyTedEmbeddingBackfillProperties
+ enabled: true # NOTE(review): the Java default is false — confirm this profile should ship with the backfill switched on
+ startup-enabled: true # NOTE(review): the Java default is false; with both flags true the backfill runs on every application start
+ batch-size: 500 # legacy documents inspected per cursor batch
+ max-documents-per-run: 0 # 0 or negative means unlimited
+ skip-when-primary-representation-missing: true # rows without a migrated primary SEMANTIC_TEXT representation are skipped
+ queue-missing-embeddings: false # when true, documents without a legacy vector get a fresh embedding job queued
\ No newline at end of file
diff --git a/src/main/resources/db/migration/V22__doc_legacy_ted_backfill_performance.sql b/src/main/resources/db/migration/V22__doc_legacy_ted_backfill_performance.sql
new file mode 100644
index 0000000..2ecc2f7
--- /dev/null
+++ b/src/main/resources/db/migration/V22__doc_legacy_ted_backfill_performance.sql
@@ -0,0 +1,7 @@
+SET search_path TO TED, DOC, public;
+-- Composite index matching the migration cursor queries, which order by (created_at, id) and page with LIMIT.
+CREATE INDEX IF NOT EXISTS idx_ted_procurement_document_created_at_id
+ ON TED.procurement_document (created_at ASC, id ASC);
+-- Column backing LegacyTedMigrationRun.buildChunkRepresentations; existing rows receive the default (true).
+ALTER TABLE DOC.doc_legacy_ted_migration_run
+ ADD COLUMN IF NOT EXISTS build_chunk_representations boolean NOT NULL DEFAULT true;