ted legacy embeddings migration

master
trifonovt 4 days ago
parent 28c7854ead
commit 0ce5f51382

@ -1,28 +0,0 @@
package at.procon.dip;
import at.procon.ted.config.TedProcessorProperties;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.boot.autoconfigure.domain.EntityScan;
import org.springframework.data.jpa.repository.config.EnableJpaRepositories;
import org.springframework.scheduling.annotation.EnableAsync;
/**
 * Procon Document Intelligence Platform (DIP).
 *
 * <p>Phase 0 introduces a generic platform root namespace and architecture contracts
 * while keeping the existing TED-specific runtime intact. Subsequent phases can move
 * modules incrementally from {@code at.procon.ted} into the broader document platform.</p>
 *
 * <p>Both the new {@code at.procon.dip} and legacy {@code at.procon.ted} packages are
 * component-scanned; entities and repositories still live in the legacy TED packages.</p>
 */
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
@EnableAsync
@EntityScan(basePackages = {"at.procon.ted.model.entity"})
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository"})
public class DocumentIntelligencePlatformApplication {

    /**
     * Application entry point; boots the Spring context.
     *
     * @param args command-line arguments forwarded to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(DocumentIntelligencePlatformApplication.class, args);
    }
}

@ -30,6 +30,13 @@ public class LegacyTedBackfillProperties {
/** Import batch id written to DOC.doc_source rows created by the migration. */
private String importBatchId = "legacy-ted-backfill";
/** Queue embeddings for migrated TED representations after the DOC/projection backfill. */
/** Queue fresh embeddings for migrated TED representations after the DOC/projection backfill. */
private boolean queueEmbeddings = false;
/** Migrate existing legacy TED content vectors into DOC.doc_embedding for the primary representation. */
private boolean migrateEmbeddings = false;
/** Build CHUNK representations during migration. Disable for a faster structural-only backfill. */
private boolean buildChunkRepresentations = true;
}

@ -0,0 +1,29 @@
package at.procon.dip.migration.config;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
/**
 * Configuration for the legacy TED embedding-only backfill
 * (bound from the {@code dip.migration.legacy-ted-embeddings} prefix).
 *
 * <p>All toggles default to off/safe values so the subsystem is inert unless
 * explicitly enabled in configuration.</p>
 */
@Configuration
@ConfigurationProperties(prefix = "dip.migration.legacy-ted-embeddings")
@Data
public class LegacyTedEmbeddingBackfillProperties {
/** Enable the legacy TED embedding-only backfill subsystem. */
private boolean enabled = false;
/** Run the embedding-only backfill automatically on application startup in NEW runtime. */
private boolean startupEnabled = false;
/** Number of legacy TED documents to inspect per cursor batch. */
private int batchSize = 500;
/** Optional cap for a single invocation. 0 or negative means unlimited. */
private long maxDocumentsPerRun = 0;
/** Skip legacy TED rows that do not yet have a migrated primary SEMANTIC_TEXT representation. */
private boolean skipWhenPrimaryRepresentationMissing = true;
/** Queue a fresh embedding job when no legacy vector exists for a migrated document. */
private boolean queueMissingEmbeddings = false;
}

@ -46,6 +46,10 @@ public class LegacyTedMigrationRun {
@Column(name = "queue_embeddings", nullable = false)
private boolean queueEmbeddings;
@Column(name = "build_chunk_representations", nullable = false)
@Builder.Default
private boolean buildChunkRepresentations = true;
@Column(name = "batch_size", nullable = false)
private int batchSize;

@ -0,0 +1,49 @@
package at.procon.dip.migration.repository;
import at.procon.dip.migration.service.LegacyTedEmbeddingTarget;
import jakarta.persistence.EntityManager;
import jakarta.persistence.Query;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.UUID;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Repository;
/**
 * Read-only lookup that resolves, for a set of legacy TED procurement document ids, the migrated
 * DOC document plus the id of its primary {@code SEMANTIC_TEXT} representation — the row a
 * migrated embedding should be attached to.
 */
@Repository
@RequiredArgsConstructor
public class LegacyTedEmbeddingTargetQueryRepository {

    private final EntityManager entityManager;

    /**
     * Finds embedding targets for the given legacy ids.
     *
     * @param legacyIds legacy {@code ted.procurement_document} ids; {@code null} or empty yields an empty list
     * @return one {@link LegacyTedEmbeddingTarget} per matching (projection, primary SEMANTIC_TEXT representation) row
     */
    @SuppressWarnings("unchecked")
    public List<LegacyTedEmbeddingTarget> findPrimarySemanticTargetsByLegacyIds(Collection<UUID> legacyIds) {
        if (legacyIds == null || legacyIds.isEmpty()) {
            return List.of();
        }
        Query query = entityManager.createNativeQuery("""
            SELECT p.legacy_procurement_document_id AS legacy_procurement_document_id,
            p.document_id AS document_id,
            r.id AS representation_id
            FROM ted.ted_notice_projection p
            JOIN doc.doc_text_representation r
            ON r.document_id = p.document_id
            WHERE p.legacy_procurement_document_id IN (:legacyIds)
            AND r.representation_type = 'SEMANTIC_TEXT'
            AND COALESCE(r.is_primary, FALSE) = TRUE
            """);
        query.setParameter("legacyIds", legacyIds);
        List<Object[]> rows = query.getResultList();
        // Native queries yield untyped Object[] rows; map each to the typed record in order.
        List<LegacyTedEmbeddingTarget> targets = new ArrayList<>(rows.size());
        rows.forEach(columns -> targets.add(new LegacyTedEmbeddingTarget(
            (UUID) columns[0],
            (UUID) columns[1],
            (UUID) columns[2])));
        return targets;
    }
}

@ -40,8 +40,8 @@ public class LegacyTedBackfillMigrationService {
}
LegacyTedMigrationRun run = resolveRun();
log.info("Starting legacy TED -> DOC/projection backfill run {} (batchSize={}, maxDocumentsPerRun={}, queueEmbeddings={})",
run.getId(), run.getBatchSize(), run.getMaxDocumentsPerRun(), run.isQueueEmbeddings());
log.info("Starting legacy TED -> DOC/projection backfill run {} (batchSize={}, maxDocumentsPerRun={}, queueEmbeddings={}, migrateEmbeddings={})",
run.getId(), run.getBatchSize(), run.getMaxDocumentsPerRun(), run.isQueueEmbeddings(), properties.isMigrateEmbeddings());
long existingCheckpointCount = checkpointRepository.countByRun_Id(run.getId());
int batchNumber = existingCheckpointCount >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) existingCheckpointCount;

@ -27,7 +27,9 @@ import at.procon.dip.domain.document.service.command.CreateDocumentRelationComma
import at.procon.dip.domain.ted.service.TedGenericDocumentRootService;
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
import at.procon.dip.domain.ted.service.TedPackageDocumentService;
import at.procon.dip.migration.config.LegacyTedBackfillProperties;
import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.embedding.service.EmbeddingPersistenceService;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
import at.procon.dip.extraction.spi.ExtractionResult;
@ -40,14 +42,18 @@ import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.dip.search.service.DocumentLexicalIndexService;
import at.procon.ted.model.entity.ProcurementDocument;
import at.procon.ted.repository.LegacyTedEmbeddingSnapshot;
import at.procon.ted.repository.ProcurementDocumentRepository;
import at.procon.ted.util.HashUtils;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
@ -83,7 +89,9 @@ public class LegacyTedBackfillWorker {
private final DocumentRelationService documentRelationService;
private final TextRepresentationBuildService textRepresentationBuildService;
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
private final EmbeddingPersistenceService embeddingPersistenceService;
private final EmbeddingProperties embeddingProperties;
private final LegacyTedBackfillProperties backfillProperties;
@Transactional(propagation = Propagation.REQUIRES_NEW)
public BackfillOutcome backfill(UUID legacyProcurementDocumentId, String importBatchId, boolean queueEmbeddings) {
@ -110,8 +118,12 @@ public class LegacyTedBackfillWorker {
UUID projectionId = tedNoticeProjectionService.registerOrRefreshProjection(legacyDocument, document.getId());
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
Set<UUID> migratedRepresentationIds = backfillProperties.isMigrateEmbeddings()
? migrateLegacyEmbeddings(legacyDocument, savedRepresentations)
: Set.of();
if (queueEmbeddings) {
queueEmbeddings(document.getId(), savedRepresentations);
queueEmbeddings(document.getId(), savedRepresentations, migratedRepresentationIds);
}
return new BackfillOutcome(document.getId(), projectionId);
@ -307,13 +319,82 @@ public class LegacyTedBackfillWorker {
.findFirst();
}
private void queueEmbeddings(UUID documentId, List<DocumentTextRepresentation> representations) {
/**
 * Copies the legacy TED content vector (if any) into the new DOC embedding model for the
 * best-matching saved representation.
 *
 * @param legacyDocument       the legacy TED notice being migrated
 * @param savedRepresentations representations persisted for the migrated DOC document
 * @return the ids of representations that received a migrated embedding (empty when nothing
 *         was migrated), used by the caller to avoid double-queueing fresh embeddings
 */
private Set<UUID> migrateLegacyEmbeddings(ProcurementDocument legacyDocument,
        List<DocumentTextRepresentation> savedRepresentations) {
    // A usable snapshot must exist AND carry non-blank vector text; otherwise nothing to do.
    LegacyTedEmbeddingSnapshot snapshot = procurementDocumentRepository
        .findEmbeddingSnapshotByIdForMigration(legacyDocument.getId())
        .filter(s -> StringUtils.hasText(s.getVectorText()))
        .orElse(null);
    if (snapshot == null) {
        return Set.of();
    }
    // A default document model is required so the persisted embedding is attributed to a model.
    if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
        log.warn("Skipping legacy embedding migration for TED notice {} because no default document model is configured", legacyDocument.getId());
        return Set.of();
    }
    DocumentTextRepresentation target = selectPrimaryEmbeddingTarget(savedRepresentations)
        .orElseThrow(() -> new IllegalStateException("No suitable representation found for migrated embedding of legacy TED notice " + legacyDocument.getId()));
    float[] legacyVector = parseVectorText(snapshot.getVectorText());
    if (legacyVector.length == 0) {
        log.warn("Skipping legacy embedding migration for TED notice {} because the legacy vector is empty", legacyDocument.getId());
        return Set.of();
    }
    // Create (or reuse) a pending embedding row, then complete it with the legacy vector.
    var pendingEmbedding = embeddingPersistenceService.ensurePending(target.getId(), embeddingProperties.getDefaultDocumentModel());
    embeddingPersistenceService.saveCompleted(pendingEmbedding.getId(), legacyVector, snapshot.getTokenCount());
    log.debug("Migrated legacy embedding for TED notice {} to representation {} (dimension={}, tokenCount={})",
        legacyDocument.getId(), target.getId(), legacyVector.length, snapshot.getTokenCount());
    return Set.of(target.getId());
}
/**
 * Picks the representation that should receive the migrated embedding, by precedence:
 * first primary SEMANTIC_TEXT, then first primary of any type, then first SEMANTIC_TEXT,
 * and finally the first representation in the list.
 *
 * @param representations candidate representations, in persistence order
 * @return the chosen representation, or empty when the list is {@code null}/empty
 */
private Optional<DocumentTextRepresentation> selectPrimaryEmbeddingTarget(List<DocumentTextRepresentation> representations) {
    if (representations == null || representations.isEmpty()) {
        return Optional.empty();
    }
    // Single pass remembering the first candidate in each precedence tier.
    DocumentTextRepresentation firstPrimarySemantic = null;
    DocumentTextRepresentation firstPrimary = null;
    DocumentTextRepresentation firstSemantic = null;
    for (DocumentTextRepresentation candidate : representations) {
        boolean semantic = candidate.getRepresentationType() == RepresentationType.SEMANTIC_TEXT;
        boolean primary = candidate.isPrimaryRepresentation();
        if (semantic && primary && firstPrimarySemantic == null) {
            firstPrimarySemantic = candidate;
        }
        if (primary && firstPrimary == null) {
            firstPrimary = candidate;
        }
        if (semantic && firstSemantic == null) {
            firstSemantic = candidate;
        }
    }
    if (firstPrimarySemantic != null) {
        return Optional.of(firstPrimarySemantic);
    }
    if (firstPrimary != null) {
        return Optional.of(firstPrimary);
    }
    if (firstSemantic != null) {
        return Optional.of(firstSemantic);
    }
    return Optional.of(representations.get(0));
}
/**
 * Parses a bracketed, comma-separated vector text (e.g. {@code "[0.1,0.2]"}) into a float array.
 *
 * @param vectorText legacy vector rendered as text; may be {@code null} or blank
 * @return the parsed values, or an empty array when the input is blank or holds no numbers;
 *         a malformed numeric token propagates {@link NumberFormatException}
 */
private float[] parseVectorText(String vectorText) {
    if (!StringUtils.hasText(vectorText)) {
        return new float[0];
    }
    String body = vectorText.trim();
    // Strip a leading '[' and/or trailing ']' delimiter when present.
    int from = body.startsWith("[") ? 1 : 0;
    int to = body.endsWith("]") ? body.length() - 1 : body.length();
    body = body.substring(from, to);
    if (!StringUtils.hasText(body)) {
        return new float[0];
    }
    String[] tokens = body.split(",");
    float[] values = new float[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
        values[i] = Float.parseFloat(tokens[i].trim());
    }
    return values;
}
private void queueEmbeddings(UUID documentId,
List<DocumentTextRepresentation> representations,
Set<UUID> migratedRepresentationIds) {
if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
log.debug("Skipping embedding queue for migrated document {} because no default document model is configured", documentId);
return;
}
Set<UUID> skippedIds = migratedRepresentationIds == null ? Set.of() : new LinkedHashSet<>(migratedRepresentationIds);
for (DocumentTextRepresentation representation : representations) {
if (representation.getId() != null && skippedIds.contains(representation.getId())) {
continue;
}
RepresentationType type = representation.getRepresentationType();
boolean queue = switch (type) {
case SEMANTIC_TEXT -> true;

@ -0,0 +1,188 @@
package at.procon.dip.migration.service;
import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.embedding.service.EmbeddingPersistenceService;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
import at.procon.dip.migration.repository.LegacyTedEmbeddingTargetQueryRepository;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.repository.LegacyTedEmbeddingSnapshot;
import at.procon.ted.repository.LegacyTedMigrationCursor;
import at.procon.ted.repository.ProcurementDocumentRepository;
import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
/**
 * Embedding-only backfill for legacy TED notices.
 *
 * <p>Walks the whole legacy {@code ted.procurement_document} table with keyset pagination over
 * {@code (created_at, id)}, resolves the migrated primary {@code SEMANTIC_TEXT} representation for
 * each row, and either persists the legacy content vector as a completed embedding or — when no
 * legacy vector exists and the option is enabled — queues a fresh embedding job instead.</p>
 *
 * <p>Only active in the NEW runtime mode.</p>
 */
@Service
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@RequiredArgsConstructor
@Slf4j
public class LegacyTedEmbeddingBackfillService {

    private final LegacyTedEmbeddingBackfillProperties properties;
    private final ProcurementDocumentRepository procurementDocumentRepository;
    private final LegacyTedEmbeddingTargetQueryRepository targetQueryRepository;
    private final EmbeddingPersistenceService embeddingPersistenceService;
    private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
    private final EmbeddingProperties embeddingProperties;

    /**
     * Runs the embedding-only backfill end to end.
     *
     * <p>No-ops (with a log line) when the subsystem is disabled or when embeddings are disabled /
     * no default document model is configured. Progress counters are logged after every batch and
     * once more when the run finishes.</p>
     */
    public void runBackfill() {
        if (!properties.isEnabled()) {
            log.info("Legacy TED embedding-only backfill is disabled");
            return;
        }
        if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
            log.warn("Skipping legacy TED embedding-only backfill because no default document model is configured");
            return;
        }
        log.info("Starting legacy TED embedding-only backfill (batchSize={}, maxDocumentsPerRun={}, queueMissingEmbeddings={})",
            properties.getBatchSize(), properties.getMaxDocumentsPerRun(), properties.isQueueMissingEmbeddings());
        // Keyset cursor over (created_at, id); advanced after every inspected row.
        Instant lastCreatedAt = null;
        UUID lastId = null;
        long inspected = 0;
        long migrated = 0;
        long queued = 0;
        long skippedMissingTarget = 0;
        long skippedMissingVector = 0;
        while (true) {
            int limit = effectiveBatchLimit(inspected);
            if (limit <= 0) {
                // Per-run cap (maxDocumentsPerRun) exhausted.
                break;
            }
            // NOTE(review): the Instant -> OffsetDateTime conversion uses the JVM default zone;
            // confirm this matches the zone semantics of ted.procurement_document.created_at.
            List<LegacyTedMigrationCursor> cursors = loadNextBatch(
                lastCreatedAt != null ?
                    lastCreatedAt.atZone(ZoneId.systemDefault()).toOffsetDateTime() : null, lastId, limit);
            if (cursors.isEmpty()) {
                // Table exhausted.
                break;
            }
            List<UUID> legacyIds = new ArrayList<>(cursors.size());
            for (LegacyTedMigrationCursor cursor : cursors) {
                legacyIds.add(cursor.getId());
            }
            // Bulk-resolve migration targets and legacy vector snapshots for the whole batch
            // to avoid per-document queries.
            Map<UUID, LegacyTedEmbeddingTarget> targetsByLegacyId = indexTargets(
                targetQueryRepository.findPrimarySemanticTargetsByLegacyIds(legacyIds)
            );
            Map<UUID, LegacyTedEmbeddingSnapshot> snapshotsByLegacyId = indexSnapshots(
                procurementDocumentRepository.findEmbeddingSnapshotsByIdsForMigration(legacyIds)
            );
            for (LegacyTedMigrationCursor cursor : cursors) {
                inspected++;
                LegacyTedEmbeddingTarget target = targetsByLegacyId.get(cursor.getId());
                LegacyTedEmbeddingSnapshot snapshot = snapshotsByLegacyId.get(cursor.getId());
                if (target == null) {
                    // No migrated primary SEMANTIC_TEXT representation exists, so there is
                    // nothing to attach an embedding to and the row is skipped either way.
                    // NOTE(review): skipWhenPrimaryRepresentationMissing only controls whether
                    // the skip is COUNTED, not whether it happens — confirm intended semantics.
                    if (properties.isSkipWhenPrimaryRepresentationMissing()) {
                        skippedMissingTarget++;
                    }
                    lastCreatedAt = cursor.getCreatedAt();
                    lastId = cursor.getId();
                    continue;
                }
                if (snapshot != null && StringUtils.hasText(snapshot.getVectorText())) {
                    // Legacy vector available: persist it directly as a completed embedding.
                    float[] vector = parseVectorText(snapshot.getVectorText());
                    if (vector.length > 0) {
                        var embedding = embeddingPersistenceService.ensurePending(target.representationId(), embeddingProperties.getDefaultDocumentModel());
                        embeddingPersistenceService.saveCompleted(embedding.getId(), vector, snapshot.getTokenCount());
                        migrated++;
                    } else {
                        skippedMissingVector++;
                    }
                } else if (properties.isQueueMissingEmbeddings()) {
                    // No legacy vector: optionally queue a fresh embedding job instead.
                    embeddingOrchestrator.enqueueRepresentation(
                        target.documentId(),
                        target.representationId(),
                        embeddingProperties.getDefaultDocumentModel()
                    );
                    queued++;
                } else {
                    skippedMissingVector++;
                }
                lastCreatedAt = cursor.getCreatedAt();
                lastId = cursor.getId();
            }
            log.info("Legacy TED embedding-only backfill progress: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
                inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
        }
        log.info("Legacy TED embedding-only backfill finished: inspected={}, migrated={}, queued={}, skippedMissingTarget={}, skippedMissingVector={}",
            inspected, migrated, queued, skippedMissingTarget, skippedMissingVector);
    }

    /**
     * Loads the next cursor batch; a {@code null} cursor (first call) starts from the oldest rows.
     *
     * <p>Protected visibility — presumably to allow overriding in tests; confirm.</p>
     *
     * @param lastCreatedAt created_at of the last processed row, or {@code null} on the first call
     * @param lastId        id of the last processed row, or {@code null} on the first call
     * @param limit         maximum rows to fetch
     * @return the next batch of migration cursors, possibly empty
     */
    protected List<LegacyTedMigrationCursor> loadNextBatch(OffsetDateTime lastCreatedAt, UUID lastId, int limit) {
        if (lastCreatedAt == null || lastId == null) {
            return procurementDocumentRepository.findFirstMigrationBatch(limit);
        }
        return procurementDocumentRepository.findNextMigrationBatch(lastCreatedAt, lastId, limit);
    }

    /**
     * Computes the fetch size for the next batch, honoring maxDocumentsPerRun
     * ({@code <= 0} means unlimited). Returns 0 when the per-run cap is exhausted.
     */
    private int effectiveBatchLimit(long processedInThisInvocation) {
        long maxPerRun = properties.getMaxDocumentsPerRun();
        if (maxPerRun <= 0) {
            return Math.max(1, properties.getBatchSize());
        }
        long remaining = maxPerRun - processedInThisInvocation;
        if (remaining <= 0) {
            return 0;
        }
        return (int) Math.max(1L, Math.min(properties.getBatchSize(), remaining));
    }

    /** Indexes targets by legacy document id; putIfAbsent keeps the first target per id. */
    private Map<UUID, LegacyTedEmbeddingTarget> indexTargets(Collection<LegacyTedEmbeddingTarget> targets) {
        Map<UUID, LegacyTedEmbeddingTarget> indexed = new LinkedHashMap<>();
        for (LegacyTedEmbeddingTarget target : targets) {
            indexed.putIfAbsent(target.legacyProcurementDocumentId(), target);
        }
        return indexed;
    }

    /** Indexes embedding snapshots by legacy document id (last one wins on duplicates). */
    private Map<UUID, LegacyTedEmbeddingSnapshot> indexSnapshots(Collection<LegacyTedEmbeddingSnapshot> snapshots) {
        Map<UUID, LegacyTedEmbeddingSnapshot> indexed = new LinkedHashMap<>();
        for (LegacyTedEmbeddingSnapshot snapshot : snapshots) {
            indexed.put(snapshot.getId(), snapshot);
        }
        return indexed;
    }

    /**
     * Parses a bracketed, comma-separated vector text (e.g. {@code "[0.1,0.2]"}) into a float
     * array. Returns an empty array for blank input; a malformed numeric token propagates
     * {@link NumberFormatException}.
     */
    private float[] parseVectorText(String vectorText) {
        if (!StringUtils.hasText(vectorText)) {
            return new float[0];
        }
        String trimmed = vectorText.trim();
        if (trimmed.startsWith("[")) {
            trimmed = trimmed.substring(1);
        }
        if (trimmed.endsWith("]")) {
            trimmed = trimmed.substring(0, trimmed.length() - 1);
        }
        if (!StringUtils.hasText(trimmed)) {
            return new float[0];
        }
        String[] parts = trimmed.split(",");
        float[] result = new float[parts.length];
        for (int i = 0; i < parts.length; i++) {
            result[i] = Float.parseFloat(parts[i].trim());
        }
        return result;
    }
}

@ -0,0 +1,8 @@
package at.procon.dip.migration.service;
import java.util.UUID;
/**
 * Immutable link between a legacy TED procurement document and its migrated DOC-side
 * counterparts: the new document id and the text representation that should receive the
 * migrated embedding.
 *
 * @param legacyProcurementDocumentId id of the legacy {@code ted.procurement_document} row
 * @param documentId                  id of the migrated DOC document
 * @param representationId            id of the target text representation for the embedding
 */
public record LegacyTedEmbeddingTarget(
        UUID legacyProcurementDocumentId,
        UUID documentId,
        UUID representationId) {
}

@ -0,0 +1,33 @@
package at.procon.dip.migration.startup;
import at.procon.dip.migration.config.LegacyTedEmbeddingBackfillProperties;
import at.procon.dip.migration.service.LegacyTedEmbeddingBackfillService;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.core.Ordered;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
/**
 * Triggers the legacy TED embedding-only backfill once on application startup.
 *
 * <p>Runs at the lowest precedence so other startup runners execute first, and only in the
 * NEW runtime mode. The backfill fires only when both the subsystem and its startup trigger
 * are enabled in {@link LegacyTedEmbeddingBackfillProperties}.</p>
 */
@Component
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@RequiredArgsConstructor
@Order(Ordered.LOWEST_PRECEDENCE)
@Slf4j
public class LegacyTedEmbeddingBackfillStartupRunner implements ApplicationRunner {

    private final LegacyTedEmbeddingBackfillProperties properties;
    private final LegacyTedEmbeddingBackfillService backfillService;

    /**
     * Starts the backfill when enabled; otherwise does nothing.
     *
     * @param args application arguments (unused)
     */
    @Override
    public void run(ApplicationArguments args) {
        boolean triggered = properties.isEnabled() && properties.isStartupEnabled();
        if (triggered) {
            log.info("Startup-triggered legacy TED embedding-only backfill is enabled");
            backfillService.runBackfill();
        }
    }
}

@ -0,0 +1,15 @@
package at.procon.ted.repository;
import java.util.UUID;
/**
 * Lightweight projection for migrating legacy TED notice embeddings into the new DOC embedding model.
 *
 * <p>Populated from a native query over {@code ted.procurement_document} that casts the stored
 * content vector to text.</p>
 */
public interface LegacyTedEmbeddingSnapshot {
/** Id of the legacy {@code ted.procurement_document} row. */
UUID getId();
/** Legacy content vector rendered as text — presumably "[f1,f2,...]"; confirm against the CAST in the query. */
String getVectorText();
/** Token count recorded for the legacy embedding; may be {@code null} when unknown. */
Integer getTokenCount();
}

@ -10,8 +10,10 @@ import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import org.springframework.stereotype.Repository;
import java.time.Instant;
import java.time.LocalDate;
import java.time.OffsetDateTime;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
@ -216,7 +218,7 @@ public interface ProcurementDocumentRepository extends
@Query(value = """
SELECT p.id AS id, p.created_at AS createdAt
FROM ted.procurement_document p
ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
ORDER BY p.created_at ASC, p.id ASC
LIMIT :limit
""", nativeQuery = true)
List<LegacyTedMigrationCursor> findFirstMigrationBatch(@Param("limit") int limit);
@ -228,14 +230,34 @@ public interface ProcurementDocumentRepository extends
SELECT p.id AS id, p.created_at AS createdAt
FROM ted.procurement_document p
WHERE p.created_at > :lastCreatedAt
OR (p.created_at = :lastCreatedAt AND CAST(p.id AS text) > CAST(:lastId AS text))
ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
OR (p.created_at = :lastCreatedAt AND p.id > :lastId)
ORDER BY p.created_at ASC, p.id ASC
LIMIT :limit
""", nativeQuery = true)
List<LegacyTedMigrationCursor> findNextMigrationBatch(@Param("lastCreatedAt") OffsetDateTime lastCreatedAt,
@Param("lastId") UUID lastId,
@Param("limit") int limit);
@Query("SELECT p FROM ProcurementDocument p WHERE p.id IN :ids")
List<ProcurementDocument> findAllByIdInForMigration(@Param("ids") Collection<UUID> ids);
@Query("SELECT l FROM ProcurementLot l WHERE l.document.id IN :ids")
List<ProcurementLot> findLotsByDocumentIdInForMigration(@Param("ids") Collection<UUID> ids);
@Query("SELECT o FROM Organization o WHERE o.document.id IN :ids")
List<Organization> findOrganizationsByDocumentIdInForMigration(@Param("ids") Collection<UUID> ids);
@Query(value = """
SELECT p.id AS id,
CAST(p.content_vector AS text) AS vectorText,
p.embedding_token_count AS tokenCount
FROM ted.procurement_document p
WHERE p.id IN :ids
AND p.content_vector IS NOT NULL
""", nativeQuery = true)
List<LegacyTedEmbeddingSnapshot> findEmbeddingSnapshotsByIdsForMigration(@Param("ids") Collection<UUID> ids);
/**
* Delete all documents created before the specified date.
* Cascading deletes will automatically remove related lots, organizations, and logs.
@ -254,6 +276,14 @@ public interface ProcurementDocumentRepository extends
* @return Number of documents
*/
long countByCreatedAtBefore(OffsetDateTime cutoffDate);
}
@Query(value = """
SELECT p.id AS id,
CAST(p.content_vector AS text) AS vectorText,
p.embedding_token_count AS tokenCount
FROM ted.procurement_document p
WHERE p.id = :id
AND p.content_vector IS NOT NULL
""", nativeQuery = true)
Optional<LegacyTedEmbeddingSnapshot> findEmbeddingSnapshotByIdForMigration(@Param("id") UUID id);
}

@ -1,7 +1,6 @@
dip:
runtime:
mode: NEW
search:
# Default page size for search results
default-page-size: 20
@ -37,9 +36,9 @@ dip:
debug-top-hits-per-engine: 10
embedding:
enabled: false
jobs:
enabled: true
jobs:
enabled: false
process-in-batches: true
execution-batch-size: 20
@ -54,7 +53,7 @@ dip:
external-e5:
type: http-json
base-url: http://172.20.240.18:8001
base-url: http://localhost:8001
connect-timeout: 5s
read-timeout: 60s
@ -286,3 +285,13 @@ dip:
import-batch-id: legacy-ted-backfill
# Keep false for Wave 1; embeddings can be backfilled later as a separate step
queue-embeddings: false
migrate-embeddings: false
build-chunk-representations: true
legacy-ted-embeddings:
enabled: true
startup-enabled: true
batch-size: 500
max-documents-per-run: 0
skip-when-primary-representation-missing: true
queue-missing-embeddings: false

@ -0,0 +1,7 @@
-- Schema support for the legacy TED backfill/migration.
SET search_path TO TED, DOC, public;
-- Composite index backing keyset pagination over (created_at, id) used by the
-- migration cursor queries (findFirstMigrationBatch / findNextMigrationBatch).
CREATE INDEX IF NOT EXISTS idx_ted_procurement_document_created_at_id
ON TED.procurement_document (created_at ASC, id ASC);
-- Persist the per-run "build chunk representations" toggle on migration run records;
-- defaults to true to match existing behavior.
ALTER TABLE DOC.doc_legacy_ted_migration_run
ADD COLUMN IF NOT EXISTS build_chunk_representations boolean NOT NULL DEFAULT true;
Loading…
Cancel
Save