ted legacy embeddings migration
This commit is contained in:
parent
28c7854ead
commit
0ce5f51382
|
|
@ -1,28 +0,0 @@
|
|||
package at.procon.dip;
|
||||
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.boot.autoconfigure.domain.EntityScan;
|
||||
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
|
||||
import org.springframework.scheduling.annotation.EnableAsync;
|
||||
|
||||
/**
|
||||
* Procon Document Intelligence Platform (DIP).
|
||||
*
|
||||
* <p>Phase 0 introduces a generic platform root namespace and architecture contracts
|
||||
* while keeping the existing TED-specific runtime intact. Subsequent phases can move
|
||||
* modules incrementally from {@code at.procon.ted} into the broader document platform.</p>
|
||||
*/
|
||||
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
||||
@EnableAsync
|
||||
//@EnableConfigurationProperties(TedProcessorProperties.class)
|
||||
@EntityScan(basePackages = {"at.procon.ted.model.entity"})
|
||||
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository"})
|
||||
public class DocumentIntelligencePlatformApplication {
|
||||
|
||||
public static void main(String[] args) {
|
||||
SpringApplication.run(DocumentIntelligencePlatformApplication.class, args);
|
||||
}
|
||||
}
|
||||
|
|
@ -30,6 +30,13 @@ public class LegacyTedBackfillProperties {
|
|||
/** Import batch id written to DOC.doc_source rows created by the migration. */
|
||||
private String importBatchId = "legacy-ted-backfill";
|
||||
|
||||
/** Queue embeddings for migrated TED representations after the DOC/projection backfill. */
|
||||
/** Queue fresh embeddings for migrated TED representations after the DOC/projection backfill. */
|
||||
private boolean queueEmbeddings = false;
|
||||
|
||||
/** Migrate existing legacy TED content vectors into DOC.doc_embedding for the primary representation. */
|
||||
private boolean migrateEmbeddings = false;
|
||||
|
||||
/** Build CHUNK representations during migration. Disable for a faster structural-only backfill. */
|
||||
private boolean buildChunkRepresentations = true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,29 @@
|
|||
package at.procon.dip.migration.config;
|
||||
|
||||
import lombok.Data;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
@Configuration
|
||||
@ConfigurationProperties(prefix = "dip.migration.legacy-ted-embeddings")
|
||||
@Data
|
||||
public class LegacyTedEmbeddingBackfillProperties {
|
||||
|
||||
/** Enable the legacy TED embedding-only backfill subsystem. */
|
||||
private boolean enabled = false;
|
||||
|
||||
/** Run the embedding-only backfill automatically on application startup in NEW runtime. */
|
||||
private boolean startupEnabled = false;
|
||||
|
||||
/** Number of legacy TED documents to inspect per cursor batch. */
|
||||
private int batchSize = 500;
|
||||
|
||||
/** Optional cap for a single invocation. 0 or negative means unlimited. */
|
||||
private long maxDocumentsPerRun = 0;
|
||||
|
||||
/** Skip legacy TED rows that do not yet have a migrated primary SEMANTIC_TEXT representation. */
|
||||
private boolean skipWhenPrimaryRepresentationMissing = true;
|
||||
|
||||
/** Queue a fresh embedding job when no legacy vector exists for a migrated document. */
|
||||
private boolean queueMissingEmbeddings = false;
|
||||
}
|
||||
|
|
@ -46,6 +46,10 @@ public class LegacyTedMigrationRun {
|
|||
@Column(name = "queue_embeddings", nullable = false)
|
||||
private boolean queueEmbeddings;
|
||||
|
||||
@Column(name = "build_chunk_representations", nullable = false)
|
||||
@Builder.Default
|
||||
private boolean buildChunkRepresentations = true;
|
||||
|
||||
@Column(name = "batch_size", nullable = false)
|
||||
private int batchSize;
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,49 @@
|
|||
package at.procon.dip.migration.repository;
|
||||
|
||||
import at.procon.dip.migration.service.LegacyTedEmbeddingTarget;
|
||||
import jakarta.persistence.EntityManager;
|
||||
import jakarta.persistence.Query;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
@RequiredArgsConstructor
|
||||
public class LegacyTedEmbeddingTargetQueryRepository {
|
||||
|
||||
private final EntityManager entityManager;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public List<LegacyTedEmbeddingTarget> findPrimarySemanticTargetsByLegacyIds(Collection<UUID> legacyIds) {
|
||||
if (legacyIds == null || legacyIds.isEmpty()) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
Query query = entityManager.createNativeQuery("""
|
||||
SELECT p.legacy_procurement_document_id AS legacy_procurement_document_id,
|
||||
p.document_id AS document_id,
|
||||
r.id AS representation_id
|
||||
FROM ted.ted_notice_projection p
|
||||
JOIN doc.doc_text_representation r
|
||||
ON r.document_id = p.document_id
|
||||
WHERE p.legacy_procurement_document_id IN (:legacyIds)
|
||||
AND r.representation_type = 'SEMANTIC_TEXT'
|
||||
AND COALESCE(r.is_primary, FALSE) = TRUE
|
||||
""");
|
||||
query.setParameter("legacyIds", legacyIds);
|
||||
|
||||
List<Object[]> rows = query.getResultList();
|
||||
List<LegacyTedEmbeddingTarget> results = new ArrayList<>(rows.size());
|
||||
for (Object[] row : rows) {
|
||||
results.add(new LegacyTedEmbeddingTarget(
|
||||
(UUID) row[0],
|
||||
(UUID) row[1],
|
||||
(UUID) row[2]
|
||||
));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
}
|
||||
|
|
@ -40,8 +40,8 @@ public class LegacyTedBackfillMigrationService {
|
|||
}
|
||||
|
||||
LegacyTedMigrationRun run = resolveRun();
|
||||
log.info("Starting legacy TED -> DOC/projection backfill run {} (batchSize={}, maxDocumentsPerRun={}, queueEmbeddings={})",
|
||||
run.getId(), run.getBatchSize(), run.getMaxDocumentsPerRun(), run.isQueueEmbeddings());
|
||||
log.info("Starting legacy TED -> DOC/projection backfill run {} (batchSize={}, maxDocumentsPerRun={}, queueEmbeddings={}, migrateEmbeddings={})",
|
||||
run.getId(), run.getBatchSize(), run.getMaxDocumentsPerRun(), run.isQueueEmbeddings(), properties.isMigrateEmbeddings());
|
||||
|
||||
long existingCheckpointCount = checkpointRepository.countByRun_Id(run.getId());
|
||||
int batchNumber = existingCheckpointCount >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) existingCheckpointCount;
|
||||
|
|
|
|||
|
|
@ -27,7 +27,9 @@ import at.procon.dip.domain.document.service.command.CreateDocumentRelationComma
|
|||
import at.procon.dip.domain.ted.service.TedGenericDocumentRootService;
|
||||
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
||||
import at.procon.dip.domain.ted.service.TedPackageDocumentService;
|
||||
import at.procon.dip.migration.config.LegacyTedBackfillProperties;
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.dip.embedding.service.EmbeddingPersistenceService;
|
||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||
|
|
@ -40,14 +42,18 @@ import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
|||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.dip.search.service.DocumentLexicalIndexService;
|
||||
import at.procon.ted.model.entity.ProcurementDocument;
|
||||
import at.procon.ted.repository.LegacyTedEmbeddingSnapshot;
|
||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||
import at.procon.ted.util.HashUtils;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Comparator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.UUID;
|
||||
|
|
@ -83,7 +89,9 @@ public class LegacyTedBackfillWorker {
|
|||
private final DocumentRelationService documentRelationService;
|
||||
private final TextRepresentationBuildService textRepresentationBuildService;
|
||||
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
||||
private final EmbeddingPersistenceService embeddingPersistenceService;
|
||||
private final EmbeddingProperties embeddingProperties;
|
||||
private final LegacyTedBackfillProperties backfillProperties;
|
||||
|
||||
@Transactional(propagation = Propagation.REQUIRES_NEW)
|
||||
public BackfillOutcome backfill(UUID legacyProcurementDocumentId, String importBatchId, boolean queueEmbeddings) {
|
||||
|
|
@ -110,8 +118,12 @@ public class LegacyTedBackfillWorker {
|
|||
UUID projectionId = tedNoticeProjectionService.registerOrRefreshProjection(legacyDocument, document.getId());
|
||||
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
|
||||
|
||||
Set<UUID> migratedRepresentationIds = backfillProperties.isMigrateEmbeddings()
|
||||
? migrateLegacyEmbeddings(legacyDocument, savedRepresentations)
|
||||
: Set.of();
|
||||
|
||||
if (queueEmbeddings) {
|
||||
queueEmbeddings(document.getId(), savedRepresentations);
|
||||
queueEmbeddings(document.getId(), savedRepresentations, migratedRepresentationIds);
|
||||
}
|
||||
|
||||
return new BackfillOutcome(document.getId(), projectionId);
|
||||
|
|
@ -307,13 +319,82 @@ public class LegacyTedBackfillWorker {
|
|||
.findFirst();
|
||||
}
|
||||
|
||||
private void queueEmbeddings(UUID documentId, List<DocumentTextRepresentation> representations) {
|
||||
private Set<UUID> migrateLegacyEmbeddings(ProcurementDocument legacyDocument,
|
||||
List<DocumentTextRepresentation> savedRepresentations) {
|
||||
LegacyTedEmbeddingSnapshot embeddingSnapshot = procurementDocumentRepository
|
||||
.findEmbeddingSnapshotByIdForMigration(legacyDocument.getId())
|
||||
.orElse(null);
|
||||
if (embeddingSnapshot == null || !StringUtils.hasText(embeddingSnapshot.getVectorText())) {
|
||||
return Set.of();
|
||||
}
|
||||
if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
|
||||
log.warn("Skipping legacy embedding migration for TED notice {} because no default document model is configured", legacyDocument.getId());
|
||||
return Set.of();
|
||||
}
|
||||
|
||||
DocumentTextRepresentation targetRepresentation = selectPrimaryEmbeddingTarget(savedRepresentations)
|
||||
.orElseThrow(() -> new IllegalStateException("No suitable representation found for migrated embedding of legacy TED notice " + legacyDocument.getId()));
|
||||
|
||||
float[] vector = parseVectorText(embeddingSnapshot.getVectorText());
|
||||
if (vector.length == 0) {
|
||||
log.warn("Skipping legacy embedding migration for TED notice {} because the legacy vector is empty", legacyDocument.getId());
|
||||
return Set.of();
|
||||
}
|
||||
|
||||
var embedding = embeddingPersistenceService.ensurePending(targetRepresentation.getId(), embeddingProperties.getDefaultDocumentModel());
|
||||
embeddingPersistenceService.saveCompleted(embedding.getId(), vector, embeddingSnapshot.getTokenCount());
|
||||
log.debug("Migrated legacy embedding for TED notice {} to representation {} (dimension={}, tokenCount={})",
|
||||
legacyDocument.getId(), targetRepresentation.getId(), vector.length, embeddingSnapshot.getTokenCount());
|
||||
return Set.of(targetRepresentation.getId());
|
||||
}
|
||||
|
||||
private Optional<DocumentTextRepresentation> selectPrimaryEmbeddingTarget(List<DocumentTextRepresentation> representations) {
|
||||
if (representations == null || representations.isEmpty()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
return representations.stream()
|
||||
.filter(rep -> rep.getRepresentationType() == RepresentationType.SEMANTIC_TEXT && rep.isPrimaryRepresentation())
|
||||
.findFirst()
|
||||
.or(() -> representations.stream().filter(DocumentTextRepresentation::isPrimaryRepresentation).findFirst())
|
||||
.or(() -> representations.stream().filter(rep -> rep.getRepresentationType() == RepresentationType.SEMANTIC_TEXT).findFirst())
|
||||
.or(() -> Optional.of(representations.get(0)));
|
||||
}
|
||||
|
||||
private float[] parseVectorText(String vectorText) {
|
||||
if (!StringUtils.hasText(vectorText)) {
|
||||
return new float[0];
|
||||
}
|
||||
String trimmed = vectorText.trim();
|
||||
if (trimmed.startsWith("[")) {
|
||||
trimmed = trimmed.substring(1);
|
||||
}
|
||||
if (trimmed.endsWith("]")) {
|
||||
trimmed = trimmed.substring(0, trimmed.length() - 1);
|
||||
}
|
||||
if (!StringUtils.hasText(trimmed)) {
|
||||
return new float[0];
|
||||
}
|
||||
String[] parts = trimmed.split(",");
|
||||
float[] result = new float[parts.length];
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
result[i] = Float.parseFloat(parts[i].trim());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private void queueEmbeddings(UUID documentId,
|
||||
List<DocumentTextRepresentation> representations,
|
||||
Set<UUID> migratedRepresentationIds) {
|
||||
if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
|
||||
log.debug("Skipping embedding queue for migrated document {} because no default document model is configured", documentId);
|
||||
return;
|
||||
}
|
||||
|
||||
Set<UUID> skippedIds = migratedRepresentationIds == null ? Set.of() : new LinkedHashSet<>(migratedRepresentationIds);
|
||||
for (DocumentTextRepresentation representation : representations) {
|
||||
if (representation.getId() != null && skippedIds.contains(representation.getId())) {
|
||||
continue;
|
||||
}
|
||||
RepresentationType type = representation.getRepresentationType();
|
||||
boolean queue = switch (type) {
|
||||
case SEMANTIC_TEXT -> true;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,188 @@
|
|||
package at.procon.dip.migration.service;
|
||||
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.dip.embedding.service.EmbeddingPersistenceService;
|
||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||
import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
|
||||
import at.procon.dip.migration.repository.LegacyTedEmbeddingTargetQueryRepository;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.ted.repository.LegacyTedEmbeddingSnapshot;
|
||||
import at.procon.ted.repository.LegacyTedMigrationCursor;
|
||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||
import java.time.Instant;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
@Service
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class LegacyTedEmbeddingBackfillService {
|
||||
|
||||
private final LegacyTedEmbeddingBackfillProperties properties;
|
||||
private final ProcurementDocumentRepository procurementDocumentRepository;
|
||||
private final LegacyTedEmbeddingTargetQueryRepository targetQueryRepository;
|
||||
private final EmbeddingPersistenceService embeddingPersistenceService;
|
||||
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
||||
private final EmbeddingProperties embeddingProperties;
|
||||
|
||||
public void runBackfill() {
|
||||
if (!properties.isEnabled()) {
|
||||
log.info("Legacy TED embedding-only backfill is disabled");
|
||||
return;
|
||||
}
|
||||
if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
|
||||
log.warn("Skipping legacy TED embedding-only backfill because no default document model is configured");
|
||||
return;
|
||||
}
|
||||
|
||||
log.info("Starting legacy TED embedding-only backfill (batchSize={}, maxDocumentsPerRun={}, queueMissingEmbeddings={})",
|
||||
properties.getBatchSize(), properties.getMaxDocumentsPerRun(), properties.isQueueMissingEmbeddings());
|
||||
|
||||
Instant lastCreatedAt = null;
|
||||
UUID lastId = null;
|
||||
long inspected = 0;
|
||||
long migrated = 0;
|
||||
long queued = 0;
|
||||
long skippedMissingTarget = 0;
|
||||
long skippedMissingVector = 0;
|
||||
|
||||
while (true) {
|
||||
int limit = effectiveBatchLimit(inspected);
|
||||
if (limit <= 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
List<LegacyTedMigrationCursor> cursors = loadNextBatch(
|
||||
lastCreatedAt != null ?
|
||||
lastCreatedAt.atZone(ZoneId.systemDefault()).toOffsetDateTime() : null, lastId, limit);
|
||||
if (cursors.isEmpty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
List<UUID> legacyIds = new ArrayList<>(cursors.size());
|
||||
for (LegacyTedMigrationCursor cursor : cursors) {
|
||||
legacyIds.add(cursor.getId());
|
||||
}
|
||||
|
||||
Map<UUID, LegacyTedEmbeddingTarget> targetsByLegacyId = indexTargets(
|
||||
targetQueryRepository.findPrimarySemanticTargetsByLegacyIds(legacyIds)
|
||||
);
|
||||
Map<UUID, LegacyTedEmbeddingSnapshot> snapshotsByLegacyId = indexSnapshots(
|
||||
procurementDocumentRepository.findEmbeddingSnapshotsByIdsForMigration(legacyIds)
|
||||
);
|
||||
|
||||
for (LegacyTedMigrationCursor cursor : cursors) {
|
||||
inspected++;
|
||||
LegacyTedEmbeddingTarget target = targetsByLegacyId.get(cursor.getId());
|
||||
LegacyTedEmbeddingSnapshot snapshot = snapshotsByLegacyId.get(cursor.getId());
|
||||
|
||||
if (target == null) {
|
||||
if (properties.isSkipWhenPrimaryRepresentationMissing()) {
|
||||
skippedMissingTarget++;
|
||||
}
|
||||
lastCreatedAt = cursor.getCreatedAt();
|
||||
lastId = cursor.getId();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (snapshot != null && StringUtils.hasText(snapshot.getVectorText())) {
|
||||
float[] vector = parseVectorText(snapshot.getVectorText());
|
||||
if (vector.length > 0) {
|
||||
var embedding = embeddingPersistenceService.ensurePending(target.representationId(), embeddingProperties.getDefaultDocumentModel());
|
||||
embeddingPersistenceService.saveCompleted(embedding.getId(), vector, snapshot.getTokenCount());
|
||||
migrated++;
|
||||
} else {
|
||||
skippedMissingVector++;
|
||||
}
|
||||
} else if (properties.isQueueMissingEmbeddings()) {
|
||||
embeddingOrchestrator.enqueueRepresentation(
|
||||
target.documentId(),
|
||||
target.representationId(),
|
||||
embeddingProperties.getDefaultDocumentModel()
|
||||
);
|
||||
queued++;
|
||||
} else {
|
||||
skippedMissingVector++;
|
||||
}
|
||||
|
||||
lastCreatedAt = cursor.getCreatedAt();
|
||||
lastId = cursor.getId();
|
||||
}
|
||||
|
||||
log.info("Legacy TED embedding-only backfill progress: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
|
||||
inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
|
||||
}
|
||||
|
||||
log.info("Legacy TED embedding-only backfill finished: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
|
||||
inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
|
||||
}
|
||||
|
||||
protected List<LegacyTedMigrationCursor> loadNextBatch(OffsetDateTime lastCreatedAt, UUID lastId, int limit) {
|
||||
if (lastCreatedAt == null || lastId == null) {
|
||||
return procurementDocumentRepository.findFirstMigrationBatch(limit);
|
||||
}
|
||||
return procurementDocumentRepository.findNextMigrationBatch(lastCreatedAt, lastId, limit);
|
||||
}
|
||||
|
||||
private int effectiveBatchLimit(long processedInThisInvocation) {
|
||||
long maxPerRun = properties.getMaxDocumentsPerRun();
|
||||
if (maxPerRun <= 0) {
|
||||
return Math.max(1, properties.getBatchSize());
|
||||
}
|
||||
long remaining = maxPerRun - processedInThisInvocation;
|
||||
if (remaining <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return (int) Math.max(1L, Math.min(properties.getBatchSize(), remaining));
|
||||
}
|
||||
|
||||
private Map<UUID, LegacyTedEmbeddingTarget> indexTargets(Collection<LegacyTedEmbeddingTarget> targets) {
|
||||
Map<UUID, LegacyTedEmbeddingTarget> indexed = new LinkedHashMap<>();
|
||||
for (LegacyTedEmbeddingTarget target : targets) {
|
||||
indexed.putIfAbsent(target.legacyProcurementDocumentId(), target);
|
||||
}
|
||||
return indexed;
|
||||
}
|
||||
|
||||
private Map<UUID, LegacyTedEmbeddingSnapshot> indexSnapshots(Collection<LegacyTedEmbeddingSnapshot> snapshots) {
|
||||
Map<UUID, LegacyTedEmbeddingSnapshot> indexed = new LinkedHashMap<>();
|
||||
for (LegacyTedEmbeddingSnapshot snapshot : snapshots) {
|
||||
indexed.put(snapshot.getId(), snapshot);
|
||||
}
|
||||
return indexed;
|
||||
}
|
||||
|
||||
private float[] parseVectorText(String vectorText) {
|
||||
if (!StringUtils.hasText(vectorText)) {
|
||||
return new float[0];
|
||||
}
|
||||
String trimmed = vectorText.trim();
|
||||
if (trimmed.startsWith("[")) {
|
||||
trimmed = trimmed.substring(1);
|
||||
}
|
||||
if (trimmed.endsWith("]")) {
|
||||
trimmed = trimmed.substring(0, trimmed.length() - 1);
|
||||
}
|
||||
if (!StringUtils.hasText(trimmed)) {
|
||||
return new float[0];
|
||||
}
|
||||
String[] parts = trimmed.split(",");
|
||||
float[] result = new float[parts.length];
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
result[i] = Float.parseFloat(parts[i].trim());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
package at.procon.dip.migration.service;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
/**
 * Links a legacy TED procurement document to the migrated document and the text
 * representation that a migrated embedding is attached to.
 *
 * @param legacyProcurementDocumentId id of the legacy TED procurement document
 * @param documentId                  id of the migrated document
 * @param representationId            id of the target text representation
 */
public record LegacyTedEmbeddingTarget(
        UUID legacyProcurementDocumentId,
        UUID documentId,
        UUID representationId
) {
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
package at.procon.dip.migration.startup;
|
||||
|
||||
import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
|
||||
import at.procon.dip.migration.service.LegacyTedEmbeddingBackfillService;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.boot.ApplicationArguments;
|
||||
import org.springframework.boot.ApplicationRunner;
|
||||
import org.springframework.core.Ordered;
|
||||
import org.springframework.core.annotation.Order;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
@RequiredArgsConstructor
|
||||
@Order(Ordered.LOWEST_PRECEDENCE)
|
||||
@Slf4j
|
||||
public class LegacyTedEmbeddingBackfillStartupRunner implements ApplicationRunner {
|
||||
|
||||
private final LegacyTedEmbeddingBackfillProperties properties;
|
||||
private final LegacyTedEmbeddingBackfillService backfillService;
|
||||
|
||||
@Override
|
||||
public void run(ApplicationArguments args) {
|
||||
if (!properties.isEnabled() || !properties.isStartupEnabled()) {
|
||||
return;
|
||||
}
|
||||
log.info("Startup-triggered legacy TED embedding-only backfill is enabled");
|
||||
backfillService.runBackfill();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
package at.procon.ted.repository;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
/**
 * Lightweight projection for migrating legacy TED notice embeddings into the new DOC embedding model.
 *
 * <p>Populated by the native snapshot queries of {@code ProcurementDocumentRepository}, which
 * select the column aliases {@code id}, {@code vectorText} and {@code tokenCount}.</p>
 */
public interface LegacyTedEmbeddingSnapshot {

    /** @return id of the legacy procurement document */
    UUID getId();

    /** @return the legacy content vector cast to text */
    String getVectorText();

    /** @return the stored embedding token count; a wrapper type, so presumably nullable - confirm */
    Integer getTokenCount();
}
|
||||
|
|
@ -10,8 +10,10 @@ import org.springframework.data.jpa.repository.Query;
|
|||
import org.springframework.data.repository.query.Param;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.time.LocalDate;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
|
|
@ -216,7 +218,7 @@ public interface ProcurementDocumentRepository extends
|
|||
@Query(value = """
|
||||
SELECT p.id AS id, p.created_at AS createdAt
|
||||
FROM ted.procurement_document p
|
||||
ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
|
||||
ORDER BY p.created_at ASC, p.id ASC
|
||||
LIMIT :limit
|
||||
""", nativeQuery = true)
|
||||
List<LegacyTedMigrationCursor> findFirstMigrationBatch(@Param("limit") int limit);
|
||||
|
|
@ -228,14 +230,34 @@ public interface ProcurementDocumentRepository extends
|
|||
SELECT p.id AS id, p.created_at AS createdAt
|
||||
FROM ted.procurement_document p
|
||||
WHERE p.created_at > :lastCreatedAt
|
||||
OR (p.created_at = :lastCreatedAt AND CAST(p.id AS text) > CAST(:lastId AS text))
|
||||
ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
|
||||
OR (p.created_at = :lastCreatedAt AND p.id > :lastId)
|
||||
ORDER BY p.created_at ASC, p.id ASC
|
||||
LIMIT :limit
|
||||
""", nativeQuery = true)
|
||||
List<LegacyTedMigrationCursor> findNextMigrationBatch(@Param("lastCreatedAt") OffsetDateTime lastCreatedAt,
|
||||
@Param("lastId") UUID lastId,
|
||||
@Param("limit") int limit);
|
||||
|
||||
@Query("SELECT p FROM ProcurementDocument p WHERE p.id IN :ids")
|
||||
List<ProcurementDocument> findAllByIdInForMigration(@Param("ids") Collection<UUID> ids);
|
||||
|
||||
@Query("SELECT l FROM ProcurementLot l WHERE l.document.id IN :ids")
|
||||
List<ProcurementLot> findLotsByDocumentIdInForMigration(@Param("ids") Collection<UUID> ids);
|
||||
|
||||
@Query("SELECT o FROM Organization o WHERE o.document.id IN :ids")
|
||||
List<Organization> findOrganizationsByDocumentIdInForMigration(@Param("ids") Collection<UUID> ids);
|
||||
|
||||
@Query(value = """
|
||||
SELECT p.id AS id,
|
||||
CAST(p.content_vector AS text) AS vectorText,
|
||||
p.embedding_token_count AS tokenCount
|
||||
FROM ted.procurement_document p
|
||||
WHERE p.id IN :ids
|
||||
AND p.content_vector IS NOT NULL
|
||||
""", nativeQuery = true)
|
||||
List<LegacyTedEmbeddingSnapshot> findEmbeddingSnapshotsByIdsForMigration(@Param("ids") Collection<UUID> ids);
|
||||
|
||||
|
||||
/**
|
||||
* Delete all documents created before the specified date.
|
||||
* Cascading deletes will automatically remove related lots, organizations, and logs.
|
||||
|
|
@ -254,6 +276,14 @@ public interface ProcurementDocumentRepository extends
|
|||
* @return Number of documents
|
||||
*/
|
||||
long countByCreatedAtBefore(OffsetDateTime cutoffDate);
|
||||
|
||||
@Query(value = """
|
||||
SELECT p.id AS id,
|
||||
CAST(p.content_vector AS text) AS vectorText,
|
||||
p.embedding_token_count AS tokenCount
|
||||
FROM ted.procurement_document p
|
||||
WHERE p.id = :id
|
||||
AND p.content_vector IS NOT NULL
|
||||
""", nativeQuery = true)
|
||||
Optional<LegacyTedEmbeddingSnapshot> findEmbeddingSnapshotByIdForMigration(@Param("id") UUID id);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
dip:
|
||||
runtime:
|
||||
mode: NEW
|
||||
|
||||
search:
|
||||
# Default page size for search results
|
||||
default-page-size: 20
|
||||
|
|
@ -37,9 +36,9 @@ dip:
|
|||
debug-top-hits-per-engine: 10
|
||||
|
||||
embedding:
|
||||
enabled: false
|
||||
jobs:
|
||||
enabled: true
|
||||
jobs:
|
||||
enabled: false
|
||||
process-in-batches: true
|
||||
execution-batch-size: 20
|
||||
|
||||
|
|
@ -54,7 +53,7 @@ dip:
|
|||
|
||||
external-e5:
|
||||
type: http-json
|
||||
base-url: http://172.20.240.18:8001
|
||||
base-url: http://localhost:8001
|
||||
connect-timeout: 5s
|
||||
read-timeout: 60s
|
||||
|
||||
|
|
@ -286,3 +285,13 @@ dip:
|
|||
import-batch-id: legacy-ted-backfill
|
||||
# Keep false for Wave 1; embeddings can be backfilled later as a separate step
|
||||
queue-embeddings: false
|
||||
migrate-embeddings: false
|
||||
build-chunk-representations: true
|
||||
|
||||
legacy-ted-embeddings:
|
||||
enabled: true
|
||||
startup-enabled: true
|
||||
batch-size: 500
|
||||
max-documents-per-run: 0
|
||||
skip-when-primary-representation-missing: true
|
||||
queue-missing-embeddings: false
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
-- Schema changes for the legacy TED embedding migration.
SET search_path TO TED, DOC, public;

-- Composite index on (created_at, id), the column pair the migration cursor queries order by.
CREATE INDEX IF NOT EXISTS idx_ted_procurement_document_created_at_id
    ON TED.procurement_document (created_at ASC, id ASC);

-- Stores per migration run whether CHUNK representations are built; existing rows default to true.
ALTER TABLE DOC.doc_legacy_ted_migration_run
    ADD COLUMN IF NOT EXISTS build_chunk_representations BOOLEAN NOT NULL DEFAULT TRUE;
|
||||
Loading…
Reference in New Issue