From 0ce5f5138208de2c8dae205c1c06ab02d52dd3f1 Mon Sep 17 00:00:00 2001 From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com> Date: Wed, 15 Apr 2026 16:19:16 +0200 Subject: [PATCH] ted legacy embeddings migration --- ...ntIntelligencePlatformApplication.java.bak | 28 --- .../config/LegacyTedBackfillProperties.java | 9 +- .../LegacyTedEmbeddingBackfillProperties.java | 29 +++ .../entity/LegacyTedMigrationRun.java | 4 + ...gacyTedEmbeddingTargetQueryRepository.java | 49 +++++ .../LegacyTedBackfillMigrationService.java | 4 +- .../service/LegacyTedBackfillWorker.java | 85 +++++++- .../LegacyTedEmbeddingBackfillService.java | 188 ++++++++++++++++++ .../service/LegacyTedEmbeddingTarget.java | 8 + ...gacyTedEmbeddingBackfillStartupRunner.java | 33 +++ .../LegacyTedEmbeddingSnapshot.java | 15 ++ .../ProcurementDocumentRepository.java | 40 +++- src/main/resources/application-new.yml | 19 +- ...2__doc_legacy_ted_backfill_performance.sql | 7 + 14 files changed, 475 insertions(+), 43 deletions(-) delete mode 100644 src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak create mode 100644 src/main/java/at/procon/dip/migration/config/LegacyTedEmbeddingBackfillProperties.java create mode 100644 src/main/java/at/procon/dip/migration/repository/LegacyTedEmbeddingTargetQueryRepository.java create mode 100644 src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingBackfillService.java create mode 100644 src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingTarget.java create mode 100644 src/main/java/at/procon/dip/migration/startup/LegacyTedEmbeddingBackfillStartupRunner.java create mode 100644 src/main/java/at/procon/ted/repository/LegacyTedEmbeddingSnapshot.java create mode 100644 src/main/resources/db/migration/V22__doc_legacy_ted_backfill_performance.sql diff --git a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak deleted 
file mode 100644 index db03770..0000000 --- a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak +++ /dev/null @@ -1,28 +0,0 @@ -package at.procon.dip; - -import at.procon.ted.config.TedProcessorProperties; -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.boot.context.properties.EnableConfigurationProperties; -import org.springframework.boot.autoconfigure.domain.EntityScan; -import org.springframework.data.jpa.repository.config.EnableJpaRepositories; -import org.springframework.scheduling.annotation.EnableAsync; - -/** - * Procon Document Intelligence Platform (DIP). - * - *

Phase 0 introduces a generic platform root namespace and architecture contracts - * while keeping the existing TED-specific runtime intact. Subsequent phases can move - * modules incrementally from {@code at.procon.ted} into the broader document platform.

- */ -@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"}) -@EnableAsync -//@EnableConfigurationProperties(TedProcessorProperties.class) -@EntityScan(basePackages = {"at.procon.ted.model.entity"}) -@EnableJpaRepositories(basePackages = {"at.procon.ted.repository"}) -public class DocumentIntelligencePlatformApplication { - - public static void main(String[] args) { - SpringApplication.run(DocumentIntelligencePlatformApplication.class, args); - } -} diff --git a/src/main/java/at/procon/dip/migration/config/LegacyTedBackfillProperties.java b/src/main/java/at/procon/dip/migration/config/LegacyTedBackfillProperties.java index 9b28ced..90a6128 100644 --- a/src/main/java/at/procon/dip/migration/config/LegacyTedBackfillProperties.java +++ b/src/main/java/at/procon/dip/migration/config/LegacyTedBackfillProperties.java @@ -30,6 +30,13 @@ public class LegacyTedBackfillProperties { /** Import batch id written to DOC.doc_source rows created by the migration. */ private String importBatchId = "legacy-ted-backfill"; - /** Queue embeddings for migrated TED representations after the DOC/projection backfill. */ + /** Queue fresh embeddings for migrated TED representations after the DOC/projection backfill. */ private boolean queueEmbeddings = false; + + /** Migrate existing legacy TED content vectors into DOC.doc_embedding for the primary representation. */ + private boolean migrateEmbeddings = false; + + /** Build CHUNK representations during migration. Disable for a faster structural-only backfill. 
*/ + private boolean buildChunkRepresentations = true; } + diff --git a/src/main/java/at/procon/dip/migration/config/LegacyTedEmbeddingBackfillProperties.java b/src/main/java/at/procon/dip/migration/config/LegacyTedEmbeddingBackfillProperties.java new file mode 100644 index 0000000..70eec0c --- /dev/null +++ b/src/main/java/at/procon/dip/migration/config/LegacyTedEmbeddingBackfillProperties.java @@ -0,0 +1,29 @@ +package at.procon.dip.migration.config; + +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Configuration; +/** Settings bound from the dip.migration.legacy-ted-embeddings prefix for the embedding-only backfill of legacy TED vectors. The enabled and startupEnabled switches both default to false. */ +@Configuration +@ConfigurationProperties(prefix = "dip.migration.legacy-ted-embeddings") +@Data +public class LegacyTedEmbeddingBackfillProperties { + + /** Enable the legacy TED embedding-only backfill subsystem. */ + private boolean enabled = false; + + /** Run the embedding-only backfill automatically on application startup in NEW runtime. */ + private boolean startupEnabled = false; + + /** Number of legacy TED documents to inspect per cursor batch. */ + private int batchSize = 500; + + /** Optional cap for a single invocation. 0 or negative means unlimited. */ + private long maxDocumentsPerRun = 0; + + /** Skip legacy TED rows that do not yet have a migrated primary SEMANTIC_TEXT representation. */ + private boolean skipWhenPrimaryRepresentationMissing = true; + + /** Queue a fresh embedding job when no legacy vector exists for a migrated document.
*/ + private boolean queueMissingEmbeddings = false; +} diff --git a/src/main/java/at/procon/dip/migration/entity/LegacyTedMigrationRun.java b/src/main/java/at/procon/dip/migration/entity/LegacyTedMigrationRun.java index 4b25771..6a1e884 100644 --- a/src/main/java/at/procon/dip/migration/entity/LegacyTedMigrationRun.java +++ b/src/main/java/at/procon/dip/migration/entity/LegacyTedMigrationRun.java @@ -46,6 +46,10 @@ public class LegacyTedMigrationRun { @Column(name = "queue_embeddings", nullable = false) private boolean queueEmbeddings; + @Column(name = "build_chunk_representations", nullable = false) + @Builder.Default + private boolean buildChunkRepresentations = true; + @Column(name = "batch_size", nullable = false) private int batchSize; diff --git a/src/main/java/at/procon/dip/migration/repository/LegacyTedEmbeddingTargetQueryRepository.java b/src/main/java/at/procon/dip/migration/repository/LegacyTedEmbeddingTargetQueryRepository.java new file mode 100644 index 0000000..f510845 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/repository/LegacyTedEmbeddingTargetQueryRepository.java @@ -0,0 +1,49 @@ +package at.procon.dip.migration.repository; + +import at.procon.dip.migration.service.LegacyTedEmbeddingTarget; +import jakarta.persistence.EntityManager; +import jakarta.persistence.Query; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Repository; +/** Native-query repository that joins ted.ted_notice_projection with doc.doc_text_representation to find, per legacy procurement document id, the migrated document and its primary SEMANTIC_TEXT representation. */ +@Repository +@RequiredArgsConstructor +public class LegacyTedEmbeddingTargetQueryRepository { + + private final EntityManager entityManager; + /** Resolves each given legacy procurement document id to its migrated document id and the id of its primary SEMANTIC_TEXT representation. Ids without such a row are absent from the result; a null or empty collection yields an empty list without querying. */ + @SuppressWarnings("unchecked") + public List findPrimarySemanticTargetsByLegacyIds(Collection legacyIds) { + if (legacyIds == null || legacyIds.isEmpty()) { + return List.of(); + } + + Query query = entityManager.createNativeQuery(""" + SELECT p.legacy_procurement_document_id AS legacy_procurement_document_id, + p.document_id AS
document_id, + r.id AS representation_id + FROM ted.ted_notice_projection p + JOIN doc.doc_text_representation r + ON r.document_id = p.document_id + WHERE p.legacy_procurement_document_id IN (:legacyIds) + AND r.representation_type = 'SEMANTIC_TEXT' + AND COALESCE(r.is_primary, FALSE) = TRUE + """); + query.setParameter("legacyIds", legacyIds); + /* Each row holds legacy id, document id and representation id in SELECT order. */ + List rows = query.getResultList(); + List results = new ArrayList<>(rows.size()); + for (Object[] row : rows) { + results.add(new LegacyTedEmbeddingTarget( + (UUID) row[0], + (UUID) row[1], + (UUID) row[2] + )); + } + return results; + } +} diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java index 5a68cdb..5d3d67e 100644 --- a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java +++ b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java @@ -40,8 +40,8 @@ public class LegacyTedBackfillMigrationService { } LegacyTedMigrationRun run = resolveRun(); - log.info("Starting legacy TED -> DOC/projection backfill run {} (batchSize={}, maxDocumentsPerRun={}, queueEmbeddings={})", - run.getId(), run.getBatchSize(), run.getMaxDocumentsPerRun(), run.isQueueEmbeddings()); + log.info("Starting legacy TED -> DOC/projection backfill run {} (batchSize={}, maxDocumentsPerRun={}, queueEmbeddings={}, migrateEmbeddings={})", + run.getId(), run.getBatchSize(), run.getMaxDocumentsPerRun(), run.isQueueEmbeddings(), properties.isMigrateEmbeddings()); long existingCheckpointCount = checkpointRepository.countByRun_Id(run.getId()); int batchNumber = existingCheckpointCount >= Integer.MAX_VALUE ?
Integer.MAX_VALUE : (int) existingCheckpointCount; diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java index 87f5e78..3b5caae 100644 --- a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java +++ b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java @@ -27,7 +27,9 @@ import at.procon.dip.domain.document.service.command.CreateDocumentRelationComma import at.procon.dip.domain.ted.service.TedGenericDocumentRootService; import at.procon.dip.domain.ted.service.TedNoticeProjectionService; import at.procon.dip.domain.ted.service.TedPackageDocumentService; +import at.procon.dip.migration.config.LegacyTedBackfillProperties; import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.service.EmbeddingPersistenceService; import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator; import at.procon.dip.extraction.spi.ExtractedStructuredPayload; import at.procon.dip.extraction.spi.ExtractionResult; @@ -40,14 +42,18 @@ import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; import at.procon.dip.runtime.config.RuntimeMode; import at.procon.dip.search.service.DocumentLexicalIndexService; import at.procon.ted.model.entity.ProcurementDocument; +import at.procon.ted.repository.LegacyTedEmbeddingSnapshot; import at.procon.ted.repository.ProcurementDocumentRepository; import at.procon.ted.util.HashUtils; import java.nio.charset.StandardCharsets; import java.time.OffsetDateTime; import java.util.ArrayList; +import java.util.LinkedHashSet; import java.util.Comparator; import java.util.LinkedHashMap; import java.util.List; +import java.util.Optional; +import java.util.Set; import java.util.Map; import java.util.Objects; import java.util.UUID; @@ -83,7 +89,9 @@ public class LegacyTedBackfillWorker { private final DocumentRelationService documentRelationService; private final 
TextRepresentationBuildService textRepresentationBuildService; private final RepresentationEmbeddingOrchestrator embeddingOrchestrator; + private final EmbeddingPersistenceService embeddingPersistenceService; private final EmbeddingProperties embeddingProperties; + private final LegacyTedBackfillProperties backfillProperties; @Transactional(propagation = Propagation.REQUIRES_NEW) public BackfillOutcome backfill(UUID legacyProcurementDocumentId, String importBatchId, boolean queueEmbeddings) { @@ -110,8 +118,12 @@ public class LegacyTedBackfillWorker { UUID projectionId = tedNoticeProjectionService.registerOrRefreshProjection(legacyDocument, document.getId()); documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); + Set migratedRepresentationIds = backfillProperties.isMigrateEmbeddings() + ? migrateLegacyEmbeddings(legacyDocument, savedRepresentations) + : Set.of(); + if (queueEmbeddings) { - queueEmbeddings(document.getId(), savedRepresentations); + queueEmbeddings(document.getId(), savedRepresentations, migratedRepresentationIds); } return new BackfillOutcome(document.getId(), projectionId); @@ -307,13 +319,82 @@ public class LegacyTedBackfillWorker { .findFirst(); } - private void queueEmbeddings(UUID documentId, List representations) { + /** Stores the legacy content vector of the notice as a completed embedding on the selected target representation and returns that representation id as a singleton set. Returns an empty set when there is no legacy vector, embeddings are disabled or have no default document model, or the parsed vector is empty. Throws IllegalStateException when no representation can be selected. */ private Set migrateLegacyEmbeddings(ProcurementDocument legacyDocument, + List savedRepresentations) { + LegacyTedEmbeddingSnapshot embeddingSnapshot = procurementDocumentRepository + .findEmbeddingSnapshotByIdForMigration(legacyDocument.getId()) + .orElse(null); + if (embeddingSnapshot == null || !StringUtils.hasText(embeddingSnapshot.getVectorText())) { + return Set.of(); + } + if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) { + log.warn("Skipping legacy embedding migration for TED notice {} because no default document model is configured", legacyDocument.getId()); + return Set.of(); + } + + DocumentTextRepresentation targetRepresentation =
selectPrimaryEmbeddingTarget(savedRepresentations) + .orElseThrow(() -> new IllegalStateException("No suitable representation found for migrated embedding of legacy TED notice " + legacyDocument.getId())); + + float[] vector = parseVectorText(embeddingSnapshot.getVectorText()); + if (vector.length == 0) { + log.warn("Skipping legacy embedding migration for TED notice {} because the legacy vector is empty", legacyDocument.getId()); + return Set.of(); + } + + var embedding = embeddingPersistenceService.ensurePending(targetRepresentation.getId(), embeddingProperties.getDefaultDocumentModel()); + embeddingPersistenceService.saveCompleted(embedding.getId(), vector, embeddingSnapshot.getTokenCount()); + log.debug("Migrated legacy embedding for TED notice {} to representation {} (dimension={}, tokenCount={})", + legacyDocument.getId(), targetRepresentation.getId(), vector.length, embeddingSnapshot.getTokenCount()); + return Set.of(targetRepresentation.getId()); + } + /** Chooses the representation that receives the legacy vector, in this order: primary SEMANTIC_TEXT, any primary, any SEMANTIC_TEXT, first in the list. Empty only for a null or empty list. */ + private Optional selectPrimaryEmbeddingTarget(List representations) { + if (representations == null || representations.isEmpty()) { + return Optional.empty(); + } + return representations.stream() + .filter(rep -> rep.getRepresentationType() == RepresentationType.SEMANTIC_TEXT && rep.isPrimaryRepresentation()) + .findFirst() + .or(() -> representations.stream().filter(DocumentTextRepresentation::isPrimaryRepresentation).findFirst()) + .or(() -> representations.stream().filter(rep -> rep.getRepresentationType() == RepresentationType.SEMANTIC_TEXT).findFirst()) + .or(() -> Optional.of(representations.get(0))); + } + /** Parses vector text such as [0.1,0.2] into floats: trims, drops one leading [ and one trailing ], then splits on commas. Blank input gives an empty array; a non-numeric part makes Float.parseFloat throw NumberFormatException. */ + private float[] parseVectorText(String vectorText) { + if (!StringUtils.hasText(vectorText)) { + return new float[0]; + } + String trimmed = vectorText.trim(); + if (trimmed.startsWith("[")) { + trimmed = trimmed.substring(1); + } + if (trimmed.endsWith("]")) { + trimmed = trimmed.substring(0, trimmed.length() - 1); + } + if (!StringUtils.hasText(trimmed)) { + return new float[0]; + } + String[]
parts = trimmed.split(","); + float[] result = new float[parts.length]; + for (int i = 0; i < parts.length; i++) { + result[i] = Float.parseFloat(parts[i].trim()); + } + return result; + } + /** Decides per representation whether to queue an embedding; representations whose id is in migratedRepresentationIds are skipped first, and a null set is treated as empty. */ + private void queueEmbeddings(UUID documentId, + List representations, + Set migratedRepresentationIds) { if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) { log.debug("Skipping embedding queue for migrated document {} because no default document model is configured", documentId); return; } + Set skippedIds = migratedRepresentationIds == null ? Set.of() : new LinkedHashSet<>(migratedRepresentationIds); for (DocumentTextRepresentation representation : representations) { + if (representation.getId() != null && skippedIds.contains(representation.getId())) { + continue; + } RepresentationType type = representation.getRepresentationType(); boolean queue = switch (type) { case SEMANTIC_TEXT -> true; diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingBackfillService.java b/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingBackfillService.java new file mode 100644 index 0000000..a9a0232 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingBackfillService.java @@ -0,0 +1,188 @@ +package at.procon.dip.migration.service; + +import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.service.EmbeddingPersistenceService; +import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator; +import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties; +import at.procon.dip.migration.repository.LegacyTedEmbeddingTargetQueryRepository; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import at.procon.ted.repository.LegacyTedEmbeddingSnapshot; +import at.procon.ted.repository.LegacyTedMigrationCursor; +import
at.procon.ted.repository.ProcurementDocumentRepository; +import java.time.Instant; +import java.time.OffsetDateTime; +import java.time.ZoneId; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; +import org.springframework.util.StringUtils; +/** Embedding-only backfill for the NEW runtime: walks legacy TED documents batch by batch and stores each existing legacy content vector as a completed embedding on the primary SEMANTIC_TEXT representation of the migrated document, using the default document model. */ +@Service +@ConditionalOnRuntimeMode(RuntimeMode.NEW) +@RequiredArgsConstructor +@Slf4j +public class LegacyTedEmbeddingBackfillService { + + private final LegacyTedEmbeddingBackfillProperties properties; + private final ProcurementDocumentRepository procurementDocumentRepository; + private final LegacyTedEmbeddingTargetQueryRepository targetQueryRepository; + private final EmbeddingPersistenceService embeddingPersistenceService; + private final RepresentationEmbeddingOrchestrator embeddingOrchestrator; + private final EmbeddingProperties embeddingProperties; + /** Runs one pass over the legacy documents, up to maxDocumentsPerRun when that cap is positive. Returns immediately when the feature is disabled or embeddings have no default document model. The cursor position and counters are local variables, so each call starts again from the first batch. */ + public void runBackfill() { + if (!properties.isEnabled()) { + log.info("Legacy TED embedding-only backfill is disabled"); + return; + } + if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) { + log.warn("Skipping legacy TED embedding-only backfill because no default document model is configured"); + return; + } + + log.info("Starting legacy TED embedding-only backfill (batchSize={}, maxDocumentsPerRun={}, queueMissingEmbeddings={})", + properties.getBatchSize(), properties.getMaxDocumentsPerRun(), properties.isQueueMissingEmbeddings()); + + Instant lastCreatedAt = null; + UUID lastId = null; + long inspected = 0; + long migrated = 0; + long queued = 0; + long skippedMissingTarget = 0; + long skippedMissingVector = 0; + + while (true) { + int limit = effectiveBatchLimit(inspected); + if (limit <= 0) { + break; + } + + List cursors = loadNextBatch( + lastCreatedAt != null ?
+ lastCreatedAt.atZone(ZoneId.systemDefault()).toOffsetDateTime() : null, lastId, limit); + if (cursors.isEmpty()) { + break; + } + + List legacyIds = new ArrayList<>(cursors.size()); + for (LegacyTedMigrationCursor cursor : cursors) { + legacyIds.add(cursor.getId()); + } + + Map targetsByLegacyId = indexTargets( + targetQueryRepository.findPrimarySemanticTargetsByLegacyIds(legacyIds) + ); + Map snapshotsByLegacyId = indexSnapshots( + procurementDocumentRepository.findEmbeddingSnapshotsByIdsForMigration(legacyIds) + ); + + for (LegacyTedMigrationCursor cursor : cursors) { + inspected++; + LegacyTedEmbeddingTarget target = targetsByLegacyId.get(cursor.getId()); + LegacyTedEmbeddingSnapshot snapshot = snapshotsByLegacyId.get(cursor.getId()); + /* NOTE(review): a document without a target is always skipped here; skipWhenPrimaryRepresentationMissing only decides whether it is counted in skippedMissingTarget. */ + if (target == null) { + if (properties.isSkipWhenPrimaryRepresentationMissing()) { + skippedMissingTarget++; + } + lastCreatedAt = cursor.getCreatedAt(); + lastId = cursor.getId(); + continue; + } + /* A non-blank legacy vector is stored as a completed embedding; without one a fresh job is queued only when queueMissingEmbeddings is set. */ + if (snapshot != null && StringUtils.hasText(snapshot.getVectorText())) { + float[] vector = parseVectorText(snapshot.getVectorText()); + if (vector.length > 0) { + var embedding = embeddingPersistenceService.ensurePending(target.representationId(), embeddingProperties.getDefaultDocumentModel()); + embeddingPersistenceService.saveCompleted(embedding.getId(), vector, snapshot.getTokenCount()); + migrated++; + } else { + skippedMissingVector++; + } + } else if (properties.isQueueMissingEmbeddings()) { + embeddingOrchestrator.enqueueRepresentation( + target.documentId(), + target.representationId(), + embeddingProperties.getDefaultDocumentModel() + ); + queued++; + } else { + skippedMissingVector++; + } + + lastCreatedAt = cursor.getCreatedAt(); + lastId = cursor.getId(); + } + + log.info("Legacy TED embedding-only backfill progress: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}", + inspected, migrated, queued, skippedMissingTarget, skippedMissingVector); + } + + log.info("Legacy TED
embedding-only backfill finished: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}", + inspected, migrated, queued, skippedMissingTarget, skippedMissingVector); + } + /** Loads the first batch when either cursor part is null, otherwise the batch that follows (lastCreatedAt, lastId). */ + protected List loadNextBatch(OffsetDateTime lastCreatedAt, UUID lastId, int limit) { + if (lastCreatedAt == null || lastId == null) { + return procurementDocumentRepository.findFirstMigrationBatch(limit); + } + return procurementDocumentRepository.findNextMigrationBatch(lastCreatedAt, lastId, limit); + } + /** Size of the next batch: at least 1 and at most batchSize, further capped by what is left of maxDocumentsPerRun; returns 0 once that cap is reached. A cap of 0 or less means no cap. */ + private int effectiveBatchLimit(long processedInThisInvocation) { + long maxPerRun = properties.getMaxDocumentsPerRun(); + if (maxPerRun <= 0) { + return Math.max(1, properties.getBatchSize()); + } + long remaining = maxPerRun - processedInThisInvocation; + if (remaining <= 0) { + return 0; + } + return (int) Math.max(1L, Math.min(properties.getBatchSize(), remaining)); + } + /** Maps legacy document id to target; when several targets share an id the first one wins. */ + private Map indexTargets(Collection targets) { + Map indexed = new LinkedHashMap<>(); + for (LegacyTedEmbeddingTarget target : targets) { + indexed.putIfAbsent(target.legacyProcurementDocumentId(), target); + } + return indexed; + } + /** Maps legacy document id to snapshot; when several snapshots share an id the last one wins. */ + private Map indexSnapshots(Collection snapshots) { + Map indexed = new LinkedHashMap<>(); + for (LegacyTedEmbeddingSnapshot snapshot : snapshots) { + indexed.put(snapshot.getId(), snapshot); + } + return indexed; + } + /** Parses vector text such as [0.1,0.2] into floats: trims, drops one leading [ and one trailing ], then splits on commas. Blank input gives an empty array; a non-numeric part makes Float.parseFloat throw NumberFormatException. */ + private float[] parseVectorText(String vectorText) { + if (!StringUtils.hasText(vectorText)) { + return new float[0]; + } + String trimmed = vectorText.trim(); + if (trimmed.startsWith("[")) { + trimmed = trimmed.substring(1); + } + if (trimmed.endsWith("]")) { + trimmed = trimmed.substring(0, trimmed.length() - 1); + } + if (!StringUtils.hasText(trimmed)) { + return new float[0]; + } + String[] parts = trimmed.split(","); + float[] result = new float[parts.length]; + for (int i = 0; i < parts.length; i++) { + result[i] = Float.parseFloat(parts[i].trim()); + } + return result; + } +} diff --git
a/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingTarget.java b/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingTarget.java new file mode 100644 index 0000000..54783d3 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/service/LegacyTedEmbeddingTarget.java @@ -0,0 +1,8 @@ +package at.procon.dip.migration.service; + +import java.util.UUID; +/** Result row of the target lookup: legacy procurement document id, migrated document id, and id of the representation that receives the embedding. */ +public record LegacyTedEmbeddingTarget(UUID legacyProcurementDocumentId, + UUID documentId, + UUID representationId) { +} diff --git a/src/main/java/at/procon/dip/migration/startup/LegacyTedEmbeddingBackfillStartupRunner.java b/src/main/java/at/procon/dip/migration/startup/LegacyTedEmbeddingBackfillStartupRunner.java new file mode 100644 index 0000000..0df389d --- /dev/null +++ b/src/main/java/at/procon/dip/migration/startup/LegacyTedEmbeddingBackfillStartupRunner.java @@ -0,0 +1,33 @@ +package at.procon.dip.migration.startup; + +import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties; +import at.procon.dip.migration.service.LegacyTedEmbeddingBackfillService; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.core.Ordered; +import org.springframework.core.annotation.Order; +import org.springframework.stereotype.Component; +/** Startup hook for the NEW runtime, ordered with lowest precedence, that starts the legacy TED embedding-only backfill when both enabled and startupEnabled are true. */ +@Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) +@RequiredArgsConstructor +@Order(Ordered.LOWEST_PRECEDENCE) +@Slf4j +public class LegacyTedEmbeddingBackfillStartupRunner implements ApplicationRunner { + + private final LegacyTedEmbeddingBackfillProperties properties; + private final LegacyTedEmbeddingBackfillService backfillService; + /** Calls runBackfill directly when both switches are on; nothing is caught here, so a failure propagates to the caller. */ + @Override + public void run(ApplicationArguments args) { + if (!properties.isEnabled() || !properties.isStartupEnabled()) { + return; + }
+ log.info("Startup-triggered legacy TED embedding-only backfill is enabled"); + backfillService.runBackfill(); + } +} diff --git a/src/main/java/at/procon/ted/repository/LegacyTedEmbeddingSnapshot.java b/src/main/java/at/procon/ted/repository/LegacyTedEmbeddingSnapshot.java new file mode 100644 index 0000000..85b44bb --- /dev/null +++ b/src/main/java/at/procon/ted/repository/LegacyTedEmbeddingSnapshot.java @@ -0,0 +1,15 @@ +package at.procon.ted.repository; + +import java.util.UUID; + +/** + * Lightweight projection for migrating legacy TED notice embeddings into the new DOC embedding model. + */ +public interface LegacyTedEmbeddingSnapshot { + /** Id of the legacy procurement document row. */ + UUID getId(); + /** Content vector as text, produced by CAST(p.content_vector AS text) in the repository queries. */ + String getVectorText(); + /** Value of embedding_token_count for the row. */ + Integer getTokenCount(); +} diff --git a/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java b/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java index bf76099..5527a70 100644 --- a/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java +++ b/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java @@ -10,8 +10,10 @@ import org.springframework.data.jpa.repository.Query; import org.springframework.data.repository.query.Param; import org.springframework.stereotype.Repository; +import java.time.Instant; import java.time.LocalDate; import java.time.OffsetDateTime; +import java.util.Collection; import java.util.List; import java.util.Optional; import java.util.UUID; @@ -216,7 +218,7 @@ public interface ProcurementDocumentRepository extends @Query(value = """ SELECT p.id AS id, p.created_at AS createdAt FROM ted.procurement_document p - ORDER BY p.created_at ASC, CAST(p.id AS text) ASC + ORDER BY p.created_at ASC, p.id ASC LIMIT :limit """, nativeQuery = true) List findFirstMigrationBatch(@Param("limit") int limit); @@ -228,14 +230,34 @@ public interface ProcurementDocumentRepository extends SELECT p.id AS id, p.created_at AS createdAt FROM ted.procurement_document p WHERE p.created_at > :lastCreatedAt
- OR (p.created_at = :lastCreatedAt AND CAST(p.id AS text) > CAST(:lastId AS text)) - ORDER BY p.created_at ASC, CAST(p.id AS text) ASC + OR (p.created_at = :lastCreatedAt AND p.id > :lastId) + ORDER BY p.created_at ASC, p.id ASC LIMIT :limit """, nativeQuery = true) List findNextMigrationBatch(@Param("lastCreatedAt") OffsetDateTime lastCreatedAt, @Param("lastId") UUID lastId, @Param("limit") int limit); + @Query("SELECT p FROM ProcurementDocument p WHERE p.id IN :ids") + List findAllByIdInForMigration(@Param("ids") Collection ids); + + @Query("SELECT l FROM ProcurementLot l WHERE l.document.id IN :ids") + List findLotsByDocumentIdInForMigration(@Param("ids") Collection ids); + + @Query("SELECT o FROM Organization o WHERE o.document.id IN :ids") + List findOrganizationsByDocumentIdInForMigration(@Param("ids") Collection ids); + + @Query(value = """ + SELECT p.id AS id, + CAST(p.content_vector AS text) AS vectorText, + p.embedding_token_count AS tokenCount + FROM ted.procurement_document p + WHERE p.id IN :ids + AND p.content_vector IS NOT NULL + """, nativeQuery = true) + List findEmbeddingSnapshotsByIdsForMigration(@Param("ids") Collection ids); + + /** * Delete all documents created before the specified date. * Cascading deletes will automatically remove related lots, organizations, and logs. 
@@ -254,6 +276,14 @@ public interface ProcurementDocumentRepository extends * @return Number of documents */ long countByCreatedAtBefore(OffsetDateTime cutoffDate); -} - + /** Returns id, content vector cast to text, and embedding token count for one legacy document; empty when the row does not exist or has no content vector. */ @Query(value = """ + SELECT p.id AS id, + CAST(p.content_vector AS text) AS vectorText, + p.embedding_token_count AS tokenCount + FROM ted.procurement_document p + WHERE p.id = :id + AND p.content_vector IS NOT NULL + """, nativeQuery = true) + Optional findEmbeddingSnapshotByIdForMigration(@Param("id") UUID id); +} \ No newline at end of file diff --git a/src/main/resources/application-new.yml b/src/main/resources/application-new.yml index bbd1b58..bb50a29 100644 --- a/src/main/resources/application-new.yml +++ b/src/main/resources/application-new.yml @@ -1,7 +1,6 @@ dip: runtime: mode: NEW - search: # Default page size for search results default-page-size: 20 @@ -37,9 +36,9 @@ dip: debug-top-hits-per-engine: 10 embedding: - enabled: false + enabled: true jobs: - enabled: true + enabled: false process-in-batches: true execution-batch-size: 20 @@ -54,7 +53,7 @@ dip: external-e5: type: http-json - base-url: http://172.20.240.18:8001 + base-url: http://localhost:8001 connect-timeout: 5s read-timeout: 60s @@ -285,4 +284,14 @@ dip: # Import batch id written to DOC.doc_source rows created by the migration import-batch-id: legacy-ted-backfill # Keep false for Wave 1; embeddings can be backfilled later as a separate step - queue-embeddings: false \ No newline at end of file + queue-embeddings: false + migrate-embeddings: false + build-chunk-representations: true + + legacy-ted-embeddings: + enabled: true + startup-enabled: true + batch-size: 500 + max-documents-per-run: 0 + skip-when-primary-representation-missing: true + queue-missing-embeddings: false \ No newline at end of file diff --git a/src/main/resources/db/migration/V22__doc_legacy_ted_backfill_performance.sql b/src/main/resources/db/migration/V22__doc_legacy_ted_backfill_performance.sql new file mode 100644 index 0000000..2ecc2f7 --- /dev/null
+++ b/src/main/resources/db/migration/V22__doc_legacy_ted_backfill_performance.sql @@ -0,0 +1,7 @@ +SET search_path TO TED, DOC, public; +/* Composite index in the same (created_at, id) order that the migration batch queries on ted.procurement_document sort by. */ +CREATE INDEX IF NOT EXISTS idx_ted_procurement_document_created_at_id + ON TED.procurement_document (created_at ASC, id ASC); +/* Run-level flag column; it is NOT NULL with default true, so existing rows take the value true. */ +ALTER TABLE DOC.doc_legacy_ted_migration_run + ADD COLUMN IF NOT EXISTS build_chunk_representations boolean NOT NULL DEFAULT true;