ted legacy embeddings migration
parent
28c7854ead
commit
0ce5f51382
@ -1,28 +0,0 @@
|
||||
package at.procon.dip;
|
||||
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.boot.autoconfigure.domain.EntityScan;
|
||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||
import org.springframework.scheduling.annotation.EnableAsync;
|
||||
|
||||
/**
|
||||
* Procon Document Intelligence Platform (DIP).
|
||||
*
|
||||
* <p>Phase 0 introduces a generic platform root namespace and architecture contracts
|
||||
* while keeping the existing TED-specific runtime intact. Subsequent phases can move
|
||||
* modules incrementally from {@code at.procon.ted} into the broader document platform.</p>
|
||||
*/
|
||||
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
||||
@EnableAsync
|
||||
//@EnableConfigurationProperties(TedProcessorProperties.class)
|
||||
@EntityScan(basePackages = {"at.procon.ted.model.entity"})
|
||||
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository"})
|
||||
public class DocumentIntelligencePlatformApplication {
|
||||
|
||||
public static void main(String[] args) {
|
||||
SpringApplication.run(DocumentIntelligencePlatformApplication.class, args);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,29 @@
|
||||
package at.procon.dip.migration.config;
|
||||
|
||||
import lombok.Data;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
@Configuration
|
||||
@ConfigurationProperties(prefix = "dip.migration.legacy-ted-embeddings")
|
||||
@Data
|
||||
public class LegacyTedEmbeddingBackfillProperties {
|
||||
|
||||
/** Enable the legacy TED embedding-only backfill subsystem. */
|
||||
private boolean enabled = false;
|
||||
|
||||
/** Run the embedding-only backfill automatically on application startup in NEW runtime. */
|
||||
private boolean startupEnabled = false;
|
||||
|
||||
/** Number of legacy TED documents to inspect per cursor batch. */
|
||||
private int batchSize = 500;
|
||||
|
||||
/** Optional cap for a single invocation. 0 or negative means unlimited. */
|
||||
private long maxDocumentsPerRun = 0;
|
||||
|
||||
/** Skip legacy TED rows that do not yet have a migrated primary SEMANTIC_TEXT representation. */
|
||||
private boolean skipWhenPrimaryRepresentationMissing = true;
|
||||
|
||||
/** Queue a fresh embedding job when no legacy vector exists for a migrated document. */
|
||||
private boolean queueMissingEmbeddings = false;
|
||||
}
|
||||
@ -0,0 +1,49 @@
|
||||
package at.procon.dip.migration.repository;
|
||||
|
||||
import at.procon.dip.migration.service.LegacyTedEmbeddingTarget;
|
||||
import jakarta.persistence.EntityManager;
|
||||
import jakarta.persistence.Query;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
@RequiredArgsConstructor
|
||||
public class LegacyTedEmbeddingTargetQueryRepository {
|
||||
|
||||
private final EntityManager entityManager;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public List<LegacyTedEmbeddingTarget> findPrimarySemanticTargetsByLegacyIds(Collection<UUID> legacyIds) {
|
||||
if (legacyIds == null || legacyIds.isEmpty()) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
Query query = entityManager.createNativeQuery("""
|
||||
SELECT p.legacy_procurement_document_id AS legacy_procurement_document_id,
|
||||
p.document_id AS document_id,
|
||||
r.id AS representation_id
|
||||
FROM ted.ted_notice_projection p
|
||||
JOIN doc.doc_text_representation r
|
||||
ON r.document_id = p.document_id
|
||||
WHERE p.legacy_procurement_document_id IN (:legacyIds)
|
||||
AND r.representation_type = 'SEMANTIC_TEXT'
|
||||
AND COALESCE(r.is_primary, FALSE) = TRUE
|
||||
""");
|
||||
query.setParameter("legacyIds", legacyIds);
|
||||
|
||||
List<Object[]> rows = query.getResultList();
|
||||
List<LegacyTedEmbeddingTarget> results = new ArrayList<>(rows.size());
|
||||
for (Object[] row : rows) {
|
||||
results.add(new LegacyTedEmbeddingTarget(
|
||||
(UUID) row[0],
|
||||
(UUID) row[1],
|
||||
(UUID) row[2]
|
||||
));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,188 @@
|
||||
package at.procon.dip.migration.service;
|
||||
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.dip.embedding.service.EmbeddingPersistenceService;
|
||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||
import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
|
||||
import at.procon.dip.migration.repository.LegacyTedEmbeddingTargetQueryRepository;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.ted.repository.LegacyTedEmbeddingSnapshot;
|
||||
import at.procon.ted.repository.LegacyTedMigrationCursor;
|
||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||
import java.time.Instant;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
@Service
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class LegacyTedEmbeddingBackfillService {
|
||||
|
||||
private final LegacyTedEmbeddingBackfillProperties properties;
|
||||
private final ProcurementDocumentRepository procurementDocumentRepository;
|
||||
private final LegacyTedEmbeddingTargetQueryRepository targetQueryRepository;
|
||||
private final EmbeddingPersistenceService embeddingPersistenceService;
|
||||
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
||||
private final EmbeddingProperties embeddingProperties;
|
||||
|
||||
public void runBackfill() {
|
||||
if (!properties.isEnabled()) {
|
||||
log.info("Legacy TED embedding-only backfill is disabled");
|
||||
return;
|
||||
}
|
||||
if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
|
||||
log.warn("Skipping legacy TED embedding-only backfill because no default document model is configured");
|
||||
return;
|
||||
}
|
||||
|
||||
log.info("Starting legacy TED embedding-only backfill (batchSize={}, maxDocumentsPerRun={}, queueMissingEmbeddings={})",
|
||||
properties.getBatchSize(), properties.getMaxDocumentsPerRun(), properties.isQueueMissingEmbeddings());
|
||||
|
||||
Instant lastCreatedAt = null;
|
||||
UUID lastId = null;
|
||||
long inspected = 0;
|
||||
long migrated = 0;
|
||||
long queued = 0;
|
||||
long skippedMissingTarget = 0;
|
||||
long skippedMissingVector = 0;
|
||||
|
||||
while (true) {
|
||||
int limit = effectiveBatchLimit(inspected);
|
||||
if (limit <= 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
List<LegacyTedMigrationCursor> cursors = loadNextBatch(
|
||||
lastCreatedAt != null ?
|
||||
lastCreatedAt.atZone(ZoneId.systemDefault()).toOffsetDateTime() : null, lastId, limit);
|
||||
if (cursors.isEmpty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
List<UUID> legacyIds = new ArrayList<>(cursors.size());
|
||||
for (LegacyTedMigrationCursor cursor : cursors) {
|
||||
legacyIds.add(cursor.getId());
|
||||
}
|
||||
|
||||
Map<UUID, LegacyTedEmbeddingTarget> targetsByLegacyId = indexTargets(
|
||||
targetQueryRepository.findPrimarySemanticTargetsByLegacyIds(legacyIds)
|
||||
);
|
||||
Map<UUID, LegacyTedEmbeddingSnapshot> snapshotsByLegacyId = indexSnapshots(
|
||||
procurementDocumentRepository.findEmbeddingSnapshotsByIdsForMigration(legacyIds)
|
||||
);
|
||||
|
||||
for (LegacyTedMigrationCursor cursor : cursors) {
|
||||
inspected++;
|
||||
LegacyTedEmbeddingTarget target = targetsByLegacyId.get(cursor.getId());
|
||||
LegacyTedEmbeddingSnapshot snapshot = snapshotsByLegacyId.get(cursor.getId());
|
||||
|
||||
if (target == null) {
|
||||
if (properties.isSkipWhenPrimaryRepresentationMissing()) {
|
||||
skippedMissingTarget++;
|
||||
}
|
||||
lastCreatedAt = cursor.getCreatedAt();
|
||||
lastId = cursor.getId();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (snapshot != null && StringUtils.hasText(snapshot.getVectorText())) {
|
||||
float[] vector = parseVectorText(snapshot.getVectorText());
|
||||
if (vector.length > 0) {
|
||||
var embedding = embeddingPersistenceService.ensurePending(target.representationId(), embeddingProperties.getDefaultDocumentModel());
|
||||
embeddingPersistenceService.saveCompleted(embedding.getId(), vector, snapshot.getTokenCount());
|
||||
migrated++;
|
||||
} else {
|
||||
skippedMissingVector++;
|
||||
}
|
||||
} else if (properties.isQueueMissingEmbeddings()) {
|
||||
embeddingOrchestrator.enqueueRepresentation(
|
||||
target.documentId(),
|
||||
target.representationId(),
|
||||
embeddingProperties.getDefaultDocumentModel()
|
||||
);
|
||||
queued++;
|
||||
} else {
|
||||
skippedMissingVector++;
|
||||
}
|
||||
|
||||
lastCreatedAt = cursor.getCreatedAt();
|
||||
lastId = cursor.getId();
|
||||
}
|
||||
|
||||
log.info("Legacy TED embedding-only backfill progress: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
|
||||
inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
|
||||
}
|
||||
|
||||
log.info("Legacy TED embedding-only backfill finished: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
|
||||
inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
|
||||
}
|
||||
|
||||
protected List<LegacyTedMigrationCursor> loadNextBatch(OffsetDateTime lastCreatedAt, UUID lastId, int limit) {
|
||||
if (lastCreatedAt == null || lastId == null) {
|
||||
return procurementDocumentRepository.findFirstMigrationBatch(limit);
|
||||
}
|
||||
return procurementDocumentRepository.findNextMigrationBatch(lastCreatedAt, lastId, limit);
|
||||
}
|
||||
|
||||
private int effectiveBatchLimit(long processedInThisInvocation) {
|
||||
long maxPerRun = properties.getMaxDocumentsPerRun();
|
||||
if (maxPerRun <= 0) {
|
||||
return Math.max(1, properties.getBatchSize());
|
||||
}
|
||||
long remaining = maxPerRun - processedInThisInvocation;
|
||||
if (remaining <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return (int) Math.max(1L, Math.min(properties.getBatchSize(), remaining));
|
||||
}
|
||||
|
||||
private Map<UUID, LegacyTedEmbeddingTarget> indexTargets(Collection<LegacyTedEmbeddingTarget> targets) {
|
||||
Map<UUID, LegacyTedEmbeddingTarget> indexed = new LinkedHashMap<>();
|
||||
for (LegacyTedEmbeddingTarget target : targets) {
|
||||
indexed.putIfAbsent(target.legacyProcurementDocumentId(), target);
|
||||
}
|
||||
return indexed;
|
||||
}
|
||||
|
||||
private Map<UUID, LegacyTedEmbeddingSnapshot> indexSnapshots(Collection<LegacyTedEmbeddingSnapshot> snapshots) {
|
||||
Map<UUID, LegacyTedEmbeddingSnapshot> indexed = new LinkedHashMap<>();
|
||||
for (LegacyTedEmbeddingSnapshot snapshot : snapshots) {
|
||||
indexed.put(snapshot.getId(), snapshot);
|
||||
}
|
||||
return indexed;
|
||||
}
|
||||
|
||||
private float[] parseVectorText(String vectorText) {
|
||||
if (!StringUtils.hasText(vectorText)) {
|
||||
return new float[0];
|
||||
}
|
||||
String trimmed = vectorText.trim();
|
||||
if (trimmed.startsWith("[")) {
|
||||
trimmed = trimmed.substring(1);
|
||||
}
|
||||
if (trimmed.endsWith("]")) {
|
||||
trimmed = trimmed.substring(0, trimmed.length() - 1);
|
||||
}
|
||||
if (!StringUtils.hasText(trimmed)) {
|
||||
return new float[0];
|
||||
}
|
||||
String[] parts = trimmed.split(",");
|
||||
float[] result = new float[parts.length];
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
result[i] = Float.parseFloat(parts[i].trim());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,8 @@
|
||||
package at.procon.dip.migration.service;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
public record LegacyTedEmbeddingTarget(UUID legacyProcurementDocumentId,
|
||||
UUID documentId,
|
||||
UUID representationId) {
|
||||
}
|
||||
@ -0,0 +1,33 @@
|
||||
package at.procon.dip.migration.startup;
|
||||
|
||||
import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
|
||||
import at.procon.dip.migration.service.LegacyTedEmbeddingBackfillService;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.boot.ApplicationArguments;
|
||||
import org.springframework.boot.ApplicationRunner;
|
||||
import org.springframework.core.Ordered;
|
||||
import org.springframework.core.annotation.Order;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
@RequiredArgsConstructor
|
||||
@Order(Ordered.LOWEST_PRECEDENCE)
|
||||
@Slf4j
|
||||
public class LegacyTedEmbeddingBackfillStartupRunner implements ApplicationRunner {
|
||||
|
||||
private final LegacyTedEmbeddingBackfillProperties properties;
|
||||
private final LegacyTedEmbeddingBackfillService backfillService;
|
||||
|
||||
@Override
|
||||
public void run(ApplicationArguments args) {
|
||||
if (!properties.isEnabled() || !properties.isStartupEnabled()) {
|
||||
return;
|
||||
}
|
||||
log.info("Startup-triggered legacy TED embedding-only backfill is enabled");
|
||||
backfillService.runBackfill();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,15 @@
|
||||
package at.procon.ted.repository;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Lightweight projection for migrating legacy TED notice embeddings into the new DOC embedding model.
|
||||
*/
|
||||
public interface LegacyTedEmbeddingSnapshot {
|
||||
|
||||
UUID getId();
|
||||
|
||||
String getVectorText();
|
||||
|
||||
Integer getTokenCount();
|
||||
}
|
||||
@ -0,0 +1,7 @@
|
||||
SET search_path TO TED, DOC, public;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_ted_procurement_document_created_at_id
|
||||
ON TED.procurement_document (created_at ASC, id ASC);
|
||||
|
||||
ALTER TABLE DOC.doc_legacy_ted_migration_run
|
||||
ADD COLUMN IF NOT EXISTS build_chunk_representations boolean NOT NULL DEFAULT true;
|
||||
Loading…
Reference in New Issue