ted legacy embeddings migration
parent
28c7854ead
commit
0ce5f51382
@ -1,28 +0,0 @@
|
|||||||
package at.procon.dip;
|
|
||||||
|
|
||||||
import at.procon.ted.config.TedProcessorProperties;
|
|
||||||
import org.springframework.boot.SpringApplication;
|
|
||||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
|
||||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
|
||||||
import org.springframework.boot.autoconfigure.domain.EntityScan;
|
|
||||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
|
||||||
import org.springframework.scheduling.annotation.EnableAsync;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Procon Document Intelligence Platform (DIP).
|
|
||||||
*
|
|
||||||
* <p>Phase 0 introduces a generic platform root namespace and architecture contracts
|
|
||||||
* while keeping the existing TED-specific runtime intact. Subsequent phases can move
|
|
||||||
* modules incrementally from {@code at.procon.ted} into the broader document platform.</p>
|
|
||||||
*/
|
|
||||||
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
|
||||||
@EnableAsync
|
|
||||||
//@EnableConfigurationProperties(TedProcessorProperties.class)
|
|
||||||
@EntityScan(basePackages = {"at.procon.ted.model.entity"})
|
|
||||||
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository"})
|
|
||||||
public class DocumentIntelligencePlatformApplication {
|
|
||||||
|
|
||||||
public static void main(String[] args) {
|
|
||||||
SpringApplication.run(DocumentIntelligencePlatformApplication.class, args);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -0,0 +1,29 @@
|
|||||||
|
package at.procon.dip.migration.config;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
|
||||||
|
@Configuration
|
||||||
|
@ConfigurationProperties(prefix = "dip.migration.legacy-ted-embeddings")
|
||||||
|
@Data
|
||||||
|
public class LegacyTedEmbeddingBackfillProperties {
|
||||||
|
|
||||||
|
/** Enable the legacy TED embedding-only backfill subsystem. */
|
||||||
|
private boolean enabled = false;
|
||||||
|
|
||||||
|
/** Run the embedding-only backfill automatically on application startup in NEW runtime. */
|
||||||
|
private boolean startupEnabled = false;
|
||||||
|
|
||||||
|
/** Number of legacy TED documents to inspect per cursor batch. */
|
||||||
|
private int batchSize = 500;
|
||||||
|
|
||||||
|
/** Optional cap for a single invocation. 0 or negative means unlimited. */
|
||||||
|
private long maxDocumentsPerRun = 0;
|
||||||
|
|
||||||
|
/** Skip legacy TED rows that do not yet have a migrated primary SEMANTIC_TEXT representation. */
|
||||||
|
private boolean skipWhenPrimaryRepresentationMissing = true;
|
||||||
|
|
||||||
|
/** Queue a fresh embedding job when no legacy vector exists for a migrated document. */
|
||||||
|
private boolean queueMissingEmbeddings = false;
|
||||||
|
}
|
||||||
@ -0,0 +1,49 @@
|
|||||||
|
package at.procon.dip.migration.repository;
|
||||||
|
|
||||||
|
import at.procon.dip.migration.service.LegacyTedEmbeddingTarget;
|
||||||
|
import jakarta.persistence.EntityManager;
|
||||||
|
import jakarta.persistence.Query;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Repository;
|
||||||
|
|
||||||
|
@Repository
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class LegacyTedEmbeddingTargetQueryRepository {
|
||||||
|
|
||||||
|
private final EntityManager entityManager;
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public List<LegacyTedEmbeddingTarget> findPrimarySemanticTargetsByLegacyIds(Collection<UUID> legacyIds) {
|
||||||
|
if (legacyIds == null || legacyIds.isEmpty()) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
Query query = entityManager.createNativeQuery("""
|
||||||
|
SELECT p.legacy_procurement_document_id AS legacy_procurement_document_id,
|
||||||
|
p.document_id AS document_id,
|
||||||
|
r.id AS representation_id
|
||||||
|
FROM ted.ted_notice_projection p
|
||||||
|
JOIN doc.doc_text_representation r
|
||||||
|
ON r.document_id = p.document_id
|
||||||
|
WHERE p.legacy_procurement_document_id IN (:legacyIds)
|
||||||
|
AND r.representation_type = 'SEMANTIC_TEXT'
|
||||||
|
AND COALESCE(r.is_primary, FALSE) = TRUE
|
||||||
|
""");
|
||||||
|
query.setParameter("legacyIds", legacyIds);
|
||||||
|
|
||||||
|
List<Object[]> rows = query.getResultList();
|
||||||
|
List<LegacyTedEmbeddingTarget> results = new ArrayList<>(rows.size());
|
||||||
|
for (Object[] row : rows) {
|
||||||
|
results.add(new LegacyTedEmbeddingTarget(
|
||||||
|
(UUID) row[0],
|
||||||
|
(UUID) row[1],
|
||||||
|
(UUID) row[2]
|
||||||
|
));
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,188 @@
|
|||||||
|
package at.procon.dip.migration.service;
|
||||||
|
|
||||||
|
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||||
|
import at.procon.dip.embedding.service.EmbeddingPersistenceService;
|
||||||
|
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||||
|
import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
|
||||||
|
import at.procon.dip.migration.repository.LegacyTedEmbeddingTargetQueryRepository;
|
||||||
|
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||||
|
import at.procon.dip.runtime.config.RuntimeMode;
|
||||||
|
import at.procon.ted.repository.LegacyTedEmbeddingSnapshot;
|
||||||
|
import at.procon.ted.repository.LegacyTedMigrationCursor;
|
||||||
|
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.time.ZoneId;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class LegacyTedEmbeddingBackfillService {
|
||||||
|
|
||||||
|
private final LegacyTedEmbeddingBackfillProperties properties;
|
||||||
|
private final ProcurementDocumentRepository procurementDocumentRepository;
|
||||||
|
private final LegacyTedEmbeddingTargetQueryRepository targetQueryRepository;
|
||||||
|
private final EmbeddingPersistenceService embeddingPersistenceService;
|
||||||
|
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
||||||
|
private final EmbeddingProperties embeddingProperties;
|
||||||
|
|
||||||
|
public void runBackfill() {
|
||||||
|
if (!properties.isEnabled()) {
|
||||||
|
log.info("Legacy TED embedding-only backfill is disabled");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
|
||||||
|
log.warn("Skipping legacy TED embedding-only backfill because no default document model is configured");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Starting legacy TED embedding-only backfill (batchSize={}, maxDocumentsPerRun={}, queueMissingEmbeddings={})",
|
||||||
|
properties.getBatchSize(), properties.getMaxDocumentsPerRun(), properties.isQueueMissingEmbeddings());
|
||||||
|
|
||||||
|
Instant lastCreatedAt = null;
|
||||||
|
UUID lastId = null;
|
||||||
|
long inspected = 0;
|
||||||
|
long migrated = 0;
|
||||||
|
long queued = 0;
|
||||||
|
long skippedMissingTarget = 0;
|
||||||
|
long skippedMissingVector = 0;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
int limit = effectiveBatchLimit(inspected);
|
||||||
|
if (limit <= 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<LegacyTedMigrationCursor> cursors = loadNextBatch(
|
||||||
|
lastCreatedAt != null ?
|
||||||
|
lastCreatedAt.atZone(ZoneId.systemDefault()).toOffsetDateTime() : null, lastId, limit);
|
||||||
|
if (cursors.isEmpty()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<UUID> legacyIds = new ArrayList<>(cursors.size());
|
||||||
|
for (LegacyTedMigrationCursor cursor : cursors) {
|
||||||
|
legacyIds.add(cursor.getId());
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<UUID, LegacyTedEmbeddingTarget> targetsByLegacyId = indexTargets(
|
||||||
|
targetQueryRepository.findPrimarySemanticTargetsByLegacyIds(legacyIds)
|
||||||
|
);
|
||||||
|
Map<UUID, LegacyTedEmbeddingSnapshot> snapshotsByLegacyId = indexSnapshots(
|
||||||
|
procurementDocumentRepository.findEmbeddingSnapshotsByIdsForMigration(legacyIds)
|
||||||
|
);
|
||||||
|
|
||||||
|
for (LegacyTedMigrationCursor cursor : cursors) {
|
||||||
|
inspected++;
|
||||||
|
LegacyTedEmbeddingTarget target = targetsByLegacyId.get(cursor.getId());
|
||||||
|
LegacyTedEmbeddingSnapshot snapshot = snapshotsByLegacyId.get(cursor.getId());
|
||||||
|
|
||||||
|
if (target == null) {
|
||||||
|
if (properties.isSkipWhenPrimaryRepresentationMissing()) {
|
||||||
|
skippedMissingTarget++;
|
||||||
|
}
|
||||||
|
lastCreatedAt = cursor.getCreatedAt();
|
||||||
|
lastId = cursor.getId();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (snapshot != null && StringUtils.hasText(snapshot.getVectorText())) {
|
||||||
|
float[] vector = parseVectorText(snapshot.getVectorText());
|
||||||
|
if (vector.length > 0) {
|
||||||
|
var embedding = embeddingPersistenceService.ensurePending(target.representationId(), embeddingProperties.getDefaultDocumentModel());
|
||||||
|
embeddingPersistenceService.saveCompleted(embedding.getId(), vector, snapshot.getTokenCount());
|
||||||
|
migrated++;
|
||||||
|
} else {
|
||||||
|
skippedMissingVector++;
|
||||||
|
}
|
||||||
|
} else if (properties.isQueueMissingEmbeddings()) {
|
||||||
|
embeddingOrchestrator.enqueueRepresentation(
|
||||||
|
target.documentId(),
|
||||||
|
target.representationId(),
|
||||||
|
embeddingProperties.getDefaultDocumentModel()
|
||||||
|
);
|
||||||
|
queued++;
|
||||||
|
} else {
|
||||||
|
skippedMissingVector++;
|
||||||
|
}
|
||||||
|
|
||||||
|
lastCreatedAt = cursor.getCreatedAt();
|
||||||
|
lastId = cursor.getId();
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Legacy TED embedding-only backfill progress: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
|
||||||
|
inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Legacy TED embedding-only backfill finished: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
|
||||||
|
inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<LegacyTedMigrationCursor> loadNextBatch(OffsetDateTime lastCreatedAt, UUID lastId, int limit) {
|
||||||
|
if (lastCreatedAt == null || lastId == null) {
|
||||||
|
return procurementDocumentRepository.findFirstMigrationBatch(limit);
|
||||||
|
}
|
||||||
|
return procurementDocumentRepository.findNextMigrationBatch(lastCreatedAt, lastId, limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int effectiveBatchLimit(long processedInThisInvocation) {
|
||||||
|
long maxPerRun = properties.getMaxDocumentsPerRun();
|
||||||
|
if (maxPerRun <= 0) {
|
||||||
|
return Math.max(1, properties.getBatchSize());
|
||||||
|
}
|
||||||
|
long remaining = maxPerRun - processedInThisInvocation;
|
||||||
|
if (remaining <= 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return (int) Math.max(1L, Math.min(properties.getBatchSize(), remaining));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<UUID, LegacyTedEmbeddingTarget> indexTargets(Collection<LegacyTedEmbeddingTarget> targets) {
|
||||||
|
Map<UUID, LegacyTedEmbeddingTarget> indexed = new LinkedHashMap<>();
|
||||||
|
for (LegacyTedEmbeddingTarget target : targets) {
|
||||||
|
indexed.putIfAbsent(target.legacyProcurementDocumentId(), target);
|
||||||
|
}
|
||||||
|
return indexed;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<UUID, LegacyTedEmbeddingSnapshot> indexSnapshots(Collection<LegacyTedEmbeddingSnapshot> snapshots) {
|
||||||
|
Map<UUID, LegacyTedEmbeddingSnapshot> indexed = new LinkedHashMap<>();
|
||||||
|
for (LegacyTedEmbeddingSnapshot snapshot : snapshots) {
|
||||||
|
indexed.put(snapshot.getId(), snapshot);
|
||||||
|
}
|
||||||
|
return indexed;
|
||||||
|
}
|
||||||
|
|
||||||
|
private float[] parseVectorText(String vectorText) {
|
||||||
|
if (!StringUtils.hasText(vectorText)) {
|
||||||
|
return new float[0];
|
||||||
|
}
|
||||||
|
String trimmed = vectorText.trim();
|
||||||
|
if (trimmed.startsWith("[")) {
|
||||||
|
trimmed = trimmed.substring(1);
|
||||||
|
}
|
||||||
|
if (trimmed.endsWith("]")) {
|
||||||
|
trimmed = trimmed.substring(0, trimmed.length() - 1);
|
||||||
|
}
|
||||||
|
if (!StringUtils.hasText(trimmed)) {
|
||||||
|
return new float[0];
|
||||||
|
}
|
||||||
|
String[] parts = trimmed.split(",");
|
||||||
|
float[] result = new float[parts.length];
|
||||||
|
for (int i = 0; i < parts.length; i++) {
|
||||||
|
result[i] = Float.parseFloat(parts[i].trim());
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
package at.procon.dip.migration.service;
|
||||||
|
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
public record LegacyTedEmbeddingTarget(UUID legacyProcurementDocumentId,
|
||||||
|
UUID documentId,
|
||||||
|
UUID representationId) {
|
||||||
|
}
|
||||||
@ -0,0 +1,33 @@
|
|||||||
|
package at.procon.dip.migration.startup;
|
||||||
|
|
||||||
|
import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
|
||||||
|
import at.procon.dip.migration.service.LegacyTedEmbeddingBackfillService;
|
||||||
|
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||||
|
import at.procon.dip.runtime.config.RuntimeMode;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.boot.ApplicationArguments;
|
||||||
|
import org.springframework.boot.ApplicationRunner;
|
||||||
|
import org.springframework.core.Ordered;
|
||||||
|
import org.springframework.core.annotation.Order;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Order(Ordered.LOWEST_PRECEDENCE)
|
||||||
|
@Slf4j
|
||||||
|
public class LegacyTedEmbeddingBackfillStartupRunner implements ApplicationRunner {
|
||||||
|
|
||||||
|
private final LegacyTedEmbeddingBackfillProperties properties;
|
||||||
|
private final LegacyTedEmbeddingBackfillService backfillService;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run(ApplicationArguments args) {
|
||||||
|
if (!properties.isEnabled() || !properties.isStartupEnabled()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
log.info("Startup-triggered legacy TED embedding-only backfill is enabled");
|
||||||
|
backfillService.runBackfill();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
package at.procon.ted.repository;
|
||||||
|
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lightweight projection for migrating legacy TED notice embeddings into the new DOC embedding model.
|
||||||
|
*/
|
||||||
|
public interface LegacyTedEmbeddingSnapshot {
|
||||||
|
|
||||||
|
UUID getId();
|
||||||
|
|
||||||
|
String getVectorText();
|
||||||
|
|
||||||
|
Integer getTokenCount();
|
||||||
|
}
|
||||||
@ -0,0 +1,7 @@
|
|||||||
|
SET search_path TO TED, DOC, public;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_ted_procurement_document_created_at_id
|
||||||
|
ON TED.procurement_document (created_at ASC, id ASC);
|
||||||
|
|
||||||
|
ALTER TABLE DOC.doc_legacy_ted_migration_run
|
||||||
|
ADD COLUMN IF NOT EXISTS build_chunk_representations boolean NOT NULL DEFAULT true;
|
||||||
Loading…
Reference in New Issue