Compare commits
No commits in common. "28c7854ead27f077b8c0e28b29ff8e8a6a531335" and "152d9739af0969090ddb7db63ad6a93054147053" have entirely different histories.
28c7854ead
...
152d9739af
|
|
@ -16,8 +16,8 @@ import org.springframework.scheduling.annotation.EnableAsync;
|
||||||
*/
|
*/
|
||||||
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
|
||||||
@EnableAsync
|
@EnableAsync
|
||||||
@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity", "at.procon.dip.migration.entity"})
|
@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity"})
|
||||||
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository", "at.procon.dip.migration.repository"})
|
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository"})
|
||||||
public class DocumentIntelligencePlatformApplication {
|
public class DocumentIntelligencePlatformApplication {
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
|
|
|
||||||
|
|
@ -13,8 +13,6 @@ import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
public interface DocumentRepository extends JpaRepository<Document, UUID> {
|
public interface DocumentRepository extends JpaRepository<Document, UUID> {
|
||||||
|
|
||||||
Optional<Document> findByBusinessKey(String businessKey);
|
|
||||||
|
|
||||||
Optional<Document> findByDedupHash(String dedupHash);
|
Optional<Document> findByDedupHash(String dedupHash);
|
||||||
|
|
||||||
List<Document> findAllByDedupHash(String dedupHash);
|
List<Document> findAllByDedupHash(String dedupHash);
|
||||||
|
|
|
||||||
|
|
@ -38,7 +38,6 @@ import org.hibernate.type.SqlTypes;
|
||||||
@Table(schema = SchemaNames.TED, name = "ted_notice_projection", indexes = {
|
@Table(schema = SchemaNames.TED, name = "ted_notice_projection", indexes = {
|
||||||
@Index(name = "idx_ted_proj_document", columnList = "document_id"),
|
@Index(name = "idx_ted_proj_document", columnList = "document_id"),
|
||||||
@Index(name = "idx_ted_proj_legacy_doc", columnList = "legacy_procurement_document_id"),
|
@Index(name = "idx_ted_proj_legacy_doc", columnList = "legacy_procurement_document_id"),
|
||||||
@Index(name = "idx_ted_proj_package_identifier", columnList = "package_identifier"),
|
|
||||||
@Index(name = "idx_ted_proj_publication_id", columnList = "publication_id"),
|
@Index(name = "idx_ted_proj_publication_id", columnList = "publication_id"),
|
||||||
@Index(name = "idx_ted_proj_notice_type", columnList = "notice_type"),
|
@Index(name = "idx_ted_proj_notice_type", columnList = "notice_type"),
|
||||||
@Index(name = "idx_ted_proj_buyer_country", columnList = "buyer_country_code"),
|
@Index(name = "idx_ted_proj_buyer_country", columnList = "buyer_country_code"),
|
||||||
|
|
@ -62,16 +61,13 @@ public class TedNoticeProjection {
|
||||||
@Column(name = "legacy_procurement_document_id", unique = true)
|
@Column(name = "legacy_procurement_document_id", unique = true)
|
||||||
private UUID legacyProcurementDocumentId;
|
private UUID legacyProcurementDocumentId;
|
||||||
|
|
||||||
@Column(name = "package_identifier", length = 32)
|
|
||||||
private String packageIdentifier;
|
|
||||||
|
|
||||||
@Column(name = "notice_id", length = 100)
|
@Column(name = "notice_id", length = 100)
|
||||||
private String noticeId;
|
private String noticeId;
|
||||||
|
|
||||||
@Column(name = "publication_id", length = 50)
|
@Column(name = "publication_id", length = 50)
|
||||||
private String publicationId;
|
private String publicationId;
|
||||||
|
|
||||||
@Column(name = "notice_url", columnDefinition = "TEXT")
|
@Column(name = "notice_url", length = 255)
|
||||||
private String noticeUrl;
|
private String noticeUrl;
|
||||||
|
|
||||||
@Column(name = "ojs_id", length = 20)
|
@Column(name = "ojs_id", length = 20)
|
||||||
|
|
@ -189,24 +185,10 @@ public class TedNoticeProjection {
|
||||||
protected void onCreate() {
|
protected void onCreate() {
|
||||||
createdAt = OffsetDateTime.now();
|
createdAt = OffsetDateTime.now();
|
||||||
updatedAt = OffsetDateTime.now();
|
updatedAt = OffsetDateTime.now();
|
||||||
generateNoticeUrl();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@PreUpdate
|
@PreUpdate
|
||||||
protected void onUpdate() {
|
protected void onUpdate() {
|
||||||
updatedAt = OffsetDateTime.now();
|
updatedAt = OffsetDateTime.now();
|
||||||
generateNoticeUrl();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Generates TED notice URL from publication_id.
|
|
||||||
* Format: https://ted.europa.eu/en/notice/-/detail/{publication_id without leading zeros}
|
|
||||||
*/
|
|
||||||
private void generateNoticeUrl() {
|
|
||||||
if (publicationId != null && !publicationId.isEmpty()) {
|
|
||||||
// Remove leading zeros from publication_id
|
|
||||||
String cleanId = publicationId.replaceFirst("^0+", "");
|
|
||||||
this.noticeUrl = "https://ted.europa.eu/en/notice/-/detail/" + cleanId;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,5 @@ public interface TedNoticeProjectionRepository extends JpaRepository<TedNoticePr
|
||||||
|
|
||||||
Optional<TedNoticeProjection> findByLegacyProcurementDocumentId(UUID legacyProcurementDocumentId);
|
Optional<TedNoticeProjection> findByLegacyProcurementDocumentId(UUID legacyProcurementDocumentId);
|
||||||
|
|
||||||
Optional<TedNoticeProjection> findByPackageIdentifierAndPublicationId(String packageIdentifier, String publicationId);
|
|
||||||
|
|
||||||
boolean existsByLegacyProcurementDocumentId(UUID legacyProcurementDocumentId);
|
boolean existsByLegacyProcurementDocumentId(UUID legacyProcurementDocumentId);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -76,7 +76,7 @@ public class TedGenericDocumentRootService {
|
||||||
|
|
||||||
private String buildBusinessKey(ProcurementDocument tedDocument) {
|
private String buildBusinessKey(ProcurementDocument tedDocument) {
|
||||||
if (StringUtils.hasText(tedDocument.getPublicationId())) {
|
if (StringUtils.hasText(tedDocument.getPublicationId())) {
|
||||||
return "TED_NOTICE:" + tedDocument.getPublicationId();
|
return "TED:publication:" + tedDocument.getPublicationId();
|
||||||
}
|
}
|
||||||
if (StringUtils.hasText(tedDocument.getNoticeId())) {
|
if (StringUtils.hasText(tedDocument.getNoticeId())) {
|
||||||
return "TED:notice:" + tedDocument.getNoticeId();
|
return "TED:notice:" + tedDocument.getNoticeId();
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,6 @@ import org.springframework.transaction.annotation.Transactional;
|
||||||
public class TedNoticeProjectionService {
|
public class TedNoticeProjectionService {
|
||||||
|
|
||||||
private final TedProjectionProperties properties;
|
private final TedProjectionProperties properties;
|
||||||
private final TedPackageIdentifierResolver packageIdentifierResolver;
|
|
||||||
private final TedGenericDocumentRootService tedGenericDocumentRootService;
|
private final TedGenericDocumentRootService tedGenericDocumentRootService;
|
||||||
private final DocumentRepository documentRepository;
|
private final DocumentRepository documentRepository;
|
||||||
private final TedNoticeProjectionRepository projectionRepository;
|
private final TedNoticeProjectionRepository projectionRepository;
|
||||||
|
|
@ -79,10 +78,6 @@ public class TedNoticeProjectionService {
|
||||||
|
|
||||||
private void mapProjection(TedNoticeProjection projection, Document genericDocument, ProcurementDocument legacyDocument) {
|
private void mapProjection(TedNoticeProjection projection, Document genericDocument, ProcurementDocument legacyDocument) {
|
||||||
projection.setDocument(genericDocument);
|
projection.setDocument(genericDocument);
|
||||||
projection.setLegacyProcurementDocumentId(legacyDocument.getId());
|
|
||||||
projection.setPackageIdentifier(packageIdentifierResolver
|
|
||||||
.resolveFromSourceMetadata(legacyDocument.getSourcePath(), legacyDocument.getSourceFilename())
|
|
||||||
.orElse(null));
|
|
||||||
projection.setNoticeId(legacyDocument.getNoticeId());
|
projection.setNoticeId(legacyDocument.getNoticeId());
|
||||||
projection.setPublicationId(legacyDocument.getPublicationId());
|
projection.setPublicationId(legacyDocument.getPublicationId());
|
||||||
projection.setNoticeUrl(legacyDocument.getNoticeUrl());
|
projection.setNoticeUrl(legacyDocument.getNoticeUrl());
|
||||||
|
|
|
||||||
|
|
@ -1,113 +0,0 @@
|
||||||
package at.procon.dip.domain.ted.service;
|
|
||||||
|
|
||||||
import at.procon.dip.domain.access.DocumentVisibility;
|
|
||||||
import at.procon.dip.domain.document.DocumentFamily;
|
|
||||||
import at.procon.dip.domain.document.DocumentStatus;
|
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
|
||||||
import at.procon.dip.domain.document.entity.Document;
|
|
||||||
import at.procon.dip.domain.document.repository.DocumentRepository;
|
|
||||||
import at.procon.dip.domain.document.service.DocumentService;
|
|
||||||
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
|
|
||||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
|
||||||
import at.procon.dip.runtime.config.RuntimeMode;
|
|
||||||
import at.procon.ted.model.entity.ProcurementDocument;
|
|
||||||
import at.procon.ted.model.entity.TedDailyPackage;
|
|
||||||
import at.procon.ted.repository.TedDailyPackageRepository;
|
|
||||||
import at.procon.ted.util.HashUtils;
|
|
||||||
import java.time.OffsetDateTime;
|
|
||||||
import java.util.Optional;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
import org.springframework.transaction.annotation.Transactional;
|
|
||||||
|
|
||||||
@Service
|
|
||||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class TedPackageDocumentService {
|
|
||||||
|
|
||||||
private static final String PACKAGE_MIME_TYPE = "application/gzip";
|
|
||||||
|
|
||||||
private final TedPackageIdentifierResolver packageIdentifierResolver;
|
|
||||||
private final TedDailyPackageRepository tedDailyPackageRepository;
|
|
||||||
private final DocumentRepository documentRepository;
|
|
||||||
private final DocumentService documentService;
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
public Optional<Document> ensurePackageDocumentForLegacyNotice(ProcurementDocument legacyDocument) {
|
|
||||||
return packageIdentifierResolver.resolveFromSourceMetadata(legacyDocument.getSourcePath(), legacyDocument.getSourceFilename())
|
|
||||||
.map(this::ensurePackageDocument);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
public Document ensurePackageDocument(String packageIdentifier) {
|
|
||||||
String businessKey = buildBusinessKey(packageIdentifier);
|
|
||||||
Document document = documentRepository.findByBusinessKey(businessKey)
|
|
||||||
.orElseGet(() -> createPackageDocument(packageIdentifier));
|
|
||||||
|
|
||||||
Optional<TedDailyPackage> packageEntity = tedDailyPackageRepository.findByPackageIdentifier(packageIdentifier);
|
|
||||||
document.setVisibility(DocumentVisibility.PUBLIC);
|
|
||||||
document.setDocumentType(DocumentType.TED_PACKAGE);
|
|
||||||
document.setDocumentFamily(DocumentFamily.PROCUREMENT);
|
|
||||||
document.setStatus(resolveStatus(packageEntity));
|
|
||||||
document.setTitle(buildTitle(packageIdentifier));
|
|
||||||
document.setSummary(buildSummary(packageIdentifier, packageEntity.orElse(null)));
|
|
||||||
document.setMimeType(PACKAGE_MIME_TYPE);
|
|
||||||
document.setBusinessKey(businessKey);
|
|
||||||
document.setDedupHash(HashUtils.computeSha256(businessKey));
|
|
||||||
return documentService.save(document);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Document createPackageDocument(String packageIdentifier) {
|
|
||||||
String businessKey = buildBusinessKey(packageIdentifier);
|
|
||||||
return documentService.create(new CreateDocumentCommand(
|
|
||||||
null,
|
|
||||||
DocumentVisibility.PUBLIC,
|
|
||||||
DocumentType.TED_PACKAGE,
|
|
||||||
DocumentFamily.PROCUREMENT,
|
|
||||||
DocumentStatus.RECEIVED,
|
|
||||||
buildTitle(packageIdentifier),
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
PACKAGE_MIME_TYPE,
|
|
||||||
businessKey,
|
|
||||||
HashUtils.computeSha256(businessKey)
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
private DocumentStatus resolveStatus(Optional<TedDailyPackage> packageEntity) {
|
|
||||||
if (packageEntity.isEmpty()) {
|
|
||||||
return DocumentStatus.RECEIVED;
|
|
||||||
}
|
|
||||||
return switch (packageEntity.get().getDownloadStatus()) {
|
|
||||||
case COMPLETED -> DocumentStatus.CLASSIFIED;
|
|
||||||
case FAILED, NOT_FOUND -> DocumentStatus.FAILED;
|
|
||||||
default -> DocumentStatus.RECEIVED;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private String buildBusinessKey(String packageIdentifier) {
|
|
||||||
return "TED_PACKAGE:" + packageIdentifier;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String buildTitle(String packageIdentifier) {
|
|
||||||
return packageIdentifier + ".tar.gz";
|
|
||||||
}
|
|
||||||
|
|
||||||
private String buildSummary(String packageIdentifier, TedDailyPackage packageEntity) {
|
|
||||||
if (packageEntity == null) {
|
|
||||||
return "TED daily package " + packageIdentifier;
|
|
||||||
}
|
|
||||||
return "TED daily package %s (status=%s, xmlFileCount=%s, processedCount=%s, failedCount=%s, downloadedAt=%s)".formatted(
|
|
||||||
packageIdentifier,
|
|
||||||
packageEntity.getDownloadStatus(),
|
|
||||||
packageEntity.getXmlFileCount(),
|
|
||||||
packageEntity.getProcessedCount(),
|
|
||||||
packageEntity.getFailedCount(),
|
|
||||||
formatTimestamp(packageEntity.getDownloadedAt())
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
private String formatTimestamp(OffsetDateTime value) {
|
|
||||||
return value == null ? null : value.toString();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,31 +0,0 @@
|
||||||
package at.procon.dip.domain.ted.service;
|
|
||||||
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import org.springframework.stereotype.Component;
|
|
||||||
import org.springframework.util.StringUtils;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Resolves a TED daily package identifier (YYYYSSSSS) from legacy source metadata.
|
|
||||||
*/
|
|
||||||
@Component
|
|
||||||
public class TedPackageIdentifierResolver {
|
|
||||||
|
|
||||||
private static final Pattern PACKAGE_IDENTIFIER_PATTERN = Pattern.compile("(?<!\\d)(20\\d{7})(?!\\d)");
|
|
||||||
|
|
||||||
public Optional<String> resolveFromSourceMetadata(String sourcePath, String sourceFilename) {
|
|
||||||
return resolve(sourcePath).or(() -> resolve(sourceFilename));
|
|
||||||
}
|
|
||||||
|
|
||||||
public Optional<String> resolve(String value) {
|
|
||||||
if (!StringUtils.hasText(value)) {
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
Matcher matcher = PACKAGE_IDENTIFIER_PATTERN.matcher(value);
|
|
||||||
if (matcher.find()) {
|
|
||||||
return Optional.of(matcher.group(1));
|
|
||||||
}
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,35 +0,0 @@
|
||||||
package at.procon.dip.migration.config;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
|
||||||
import org.springframework.context.annotation.Configuration;
|
|
||||||
|
|
||||||
@Configuration
|
|
||||||
@ConfigurationProperties(prefix = "dip.migration.legacy-ted")
|
|
||||||
@Data
|
|
||||||
public class LegacyTedBackfillProperties {
|
|
||||||
|
|
||||||
/** Enable the TED legacy -> DOC/projection backfill subsystem. */
|
|
||||||
private boolean enabled = false;
|
|
||||||
|
|
||||||
/** Run the backfill automatically on application startup in NEW runtime. */
|
|
||||||
private boolean startupEnabled = false;
|
|
||||||
|
|
||||||
/** Number of legacy TED documents to process per fetch batch. */
|
|
||||||
private int batchSize = 100;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Optional cap for a single run. 0 or negative means unlimited until the full legacy set is backfilled.
|
|
||||||
* Useful to migrate incrementally in controlled slices.
|
|
||||||
*/
|
|
||||||
private long maxDocumentsPerRun = 0;
|
|
||||||
|
|
||||||
/** Resume the latest STOPPED/FAILED run from its persisted cursor. */
|
|
||||||
private boolean resumeLatestIncompleteRun = true;
|
|
||||||
|
|
||||||
/** Import batch id written to DOC.doc_source rows created by the migration. */
|
|
||||||
private String importBatchId = "legacy-ted-backfill";
|
|
||||||
|
|
||||||
/** Queue embeddings for migrated TED representations after the DOC/projection backfill. */
|
|
||||||
private boolean queueEmbeddings = false;
|
|
||||||
}
|
|
||||||
|
|
@ -1,75 +0,0 @@
|
||||||
package at.procon.dip.migration.entity;
|
|
||||||
|
|
||||||
import at.procon.dip.architecture.SchemaNames;
|
|
||||||
import jakarta.persistence.Column;
|
|
||||||
import jakarta.persistence.Entity;
|
|
||||||
import jakarta.persistence.FetchType;
|
|
||||||
import jakarta.persistence.GeneratedValue;
|
|
||||||
import jakarta.persistence.GenerationType;
|
|
||||||
import jakarta.persistence.Id;
|
|
||||||
import jakarta.persistence.Index;
|
|
||||||
import jakarta.persistence.JoinColumn;
|
|
||||||
import jakarta.persistence.ManyToOne;
|
|
||||||
import jakarta.persistence.PrePersist;
|
|
||||||
import jakarta.persistence.Table;
|
|
||||||
import java.time.OffsetDateTime;
|
|
||||||
import java.util.UUID;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
import lombok.Setter;
|
|
||||||
|
|
||||||
@Entity
|
|
||||||
@Table(schema = SchemaNames.DOC, name = "doc_legacy_ted_migration_checkpoint", indexes = {
|
|
||||||
@Index(name = "idx_doc_legacy_ted_mig_ckpt_run", columnList = "run_id"),
|
|
||||||
@Index(name = "idx_doc_legacy_ted_mig_ckpt_batch", columnList = "batch_number")
|
|
||||||
})
|
|
||||||
@Getter
|
|
||||||
@Setter
|
|
||||||
@NoArgsConstructor
|
|
||||||
@AllArgsConstructor
|
|
||||||
@Builder
|
|
||||||
public class LegacyTedMigrationCheckpoint {
|
|
||||||
|
|
||||||
@Id
|
|
||||||
@GeneratedValue(strategy = GenerationType.UUID)
|
|
||||||
private UUID id;
|
|
||||||
|
|
||||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
|
||||||
@JoinColumn(name = "run_id", nullable = false)
|
|
||||||
private LegacyTedMigrationRun run;
|
|
||||||
|
|
||||||
@Column(name = "batch_number", nullable = false)
|
|
||||||
private int batchNumber;
|
|
||||||
|
|
||||||
@Column(name = "batch_processed_count", nullable = false)
|
|
||||||
private int batchProcessedCount;
|
|
||||||
|
|
||||||
@Column(name = "cumulative_processed_count", nullable = false)
|
|
||||||
private long cumulativeProcessedCount;
|
|
||||||
|
|
||||||
@Column(name = "last_legacy_created_at")
|
|
||||||
private OffsetDateTime lastLegacyCreatedAt;
|
|
||||||
|
|
||||||
@Column(name = "last_legacy_document_id")
|
|
||||||
private UUID lastLegacyDocumentId;
|
|
||||||
|
|
||||||
@Column(name = "last_doc_document_id")
|
|
||||||
private UUID lastDocDocumentId;
|
|
||||||
|
|
||||||
@Column(name = "last_projection_id")
|
|
||||||
private UUID lastProjectionId;
|
|
||||||
|
|
||||||
@Column(name = "note", columnDefinition = "TEXT")
|
|
||||||
private String note;
|
|
||||||
|
|
||||||
@Builder.Default
|
|
||||||
@Column(name = "created_at", nullable = false, updatable = false)
|
|
||||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
|
||||||
|
|
||||||
@PrePersist
|
|
||||||
protected void onCreate() {
|
|
||||||
createdAt = OffsetDateTime.now();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,103 +0,0 @@
|
||||||
package at.procon.dip.migration.entity;
|
|
||||||
|
|
||||||
import at.procon.dip.architecture.SchemaNames;
|
|
||||||
import jakarta.persistence.Column;
|
|
||||||
import jakarta.persistence.Entity;
|
|
||||||
import jakarta.persistence.EnumType;
|
|
||||||
import jakarta.persistence.Enumerated;
|
|
||||||
import jakarta.persistence.GeneratedValue;
|
|
||||||
import jakarta.persistence.GenerationType;
|
|
||||||
import jakarta.persistence.Id;
|
|
||||||
import jakarta.persistence.Index;
|
|
||||||
import jakarta.persistence.PrePersist;
|
|
||||||
import jakarta.persistence.PreUpdate;
|
|
||||||
import jakarta.persistence.Table;
|
|
||||||
import java.time.OffsetDateTime;
|
|
||||||
import java.util.UUID;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
import lombok.Setter;
|
|
||||||
|
|
||||||
@Entity
|
|
||||||
@Table(schema = SchemaNames.DOC, name = "doc_legacy_ted_migration_run", indexes = {
|
|
||||||
@Index(name = "idx_doc_legacy_ted_mig_run_status", columnList = "status"),
|
|
||||||
@Index(name = "idx_doc_legacy_ted_mig_run_started", columnList = "started_at DESC")
|
|
||||||
})
|
|
||||||
@Getter
|
|
||||||
@Setter
|
|
||||||
@NoArgsConstructor
|
|
||||||
@AllArgsConstructor
|
|
||||||
@Builder
|
|
||||||
public class LegacyTedMigrationRun {
|
|
||||||
|
|
||||||
@Id
|
|
||||||
@GeneratedValue(strategy = GenerationType.UUID)
|
|
||||||
private UUID id;
|
|
||||||
|
|
||||||
@Enumerated(EnumType.STRING)
|
|
||||||
@Column(name = "status", nullable = false, length = 32)
|
|
||||||
private LegacyTedMigrationRunStatus status;
|
|
||||||
|
|
||||||
@Column(name = "import_batch_id", length = 255)
|
|
||||||
private String importBatchId;
|
|
||||||
|
|
||||||
@Column(name = "queue_embeddings", nullable = false)
|
|
||||||
private boolean queueEmbeddings;
|
|
||||||
|
|
||||||
@Column(name = "batch_size", nullable = false)
|
|
||||||
private int batchSize;
|
|
||||||
|
|
||||||
@Column(name = "max_documents_per_run")
|
|
||||||
private Long maxDocumentsPerRun;
|
|
||||||
|
|
||||||
@Column(name = "processed_count", nullable = false)
|
|
||||||
@Builder.Default
|
|
||||||
private long processedCount = 0;
|
|
||||||
|
|
||||||
@Column(name = "success_count", nullable = false)
|
|
||||||
@Builder.Default
|
|
||||||
private long successCount = 0;
|
|
||||||
|
|
||||||
@Column(name = "failed_count", nullable = false)
|
|
||||||
@Builder.Default
|
|
||||||
private long failedCount = 0;
|
|
||||||
|
|
||||||
@Column(name = "last_legacy_created_at")
|
|
||||||
private OffsetDateTime lastLegacyCreatedAt;
|
|
||||||
|
|
||||||
@Column(name = "last_legacy_document_id")
|
|
||||||
private UUID lastLegacyDocumentId;
|
|
||||||
|
|
||||||
@Column(name = "last_doc_document_id")
|
|
||||||
private UUID lastDocDocumentId;
|
|
||||||
|
|
||||||
@Column(name = "last_projection_id")
|
|
||||||
private UUID lastProjectionId;
|
|
||||||
|
|
||||||
@Column(name = "last_error", columnDefinition = "TEXT")
|
|
||||||
private String lastError;
|
|
||||||
|
|
||||||
@Builder.Default
|
|
||||||
@Column(name = "started_at", nullable = false, updatable = false)
|
|
||||||
private OffsetDateTime startedAt = OffsetDateTime.now();
|
|
||||||
|
|
||||||
@Builder.Default
|
|
||||||
@Column(name = "updated_at", nullable = false)
|
|
||||||
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
|
||||||
|
|
||||||
@Column(name = "completed_at")
|
|
||||||
private OffsetDateTime completedAt;
|
|
||||||
|
|
||||||
@PrePersist
|
|
||||||
protected void onCreate() {
|
|
||||||
startedAt = startedAt == null ? OffsetDateTime.now() : startedAt;
|
|
||||||
updatedAt = OffsetDateTime.now();
|
|
||||||
}
|
|
||||||
|
|
||||||
@PreUpdate
|
|
||||||
protected void onUpdate() {
|
|
||||||
updatedAt = OffsetDateTime.now();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
package at.procon.dip.migration.entity;
|
|
||||||
|
|
||||||
/** Lifecycle states of a legacy TED migration run. */
public enum LegacyTedMigrationRunStatus {

    /** The run is being processed. */
    RUNNING,

    /** The run was halted before the legacy set was exhausted. */
    STOPPED,

    /** The run finished. */
    COMPLETED,

    /** The run ended with an error. */
    FAILED
}
|
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
package at.procon.dip.migration.repository;
|
|
||||||
|
|
||||||
import at.procon.dip.migration.entity.LegacyTedMigrationCheckpoint;
|
|
||||||
import java.util.UUID;
|
|
||||||
import org.springframework.data.jpa.repository.JpaRepository;
|
|
||||||
|
|
||||||
public interface LegacyTedMigrationCheckpointRepository extends JpaRepository<LegacyTedMigrationCheckpoint, UUID> {
|
|
||||||
|
|
||||||
long countByRun_Id(UUID runId);
|
|
||||||
}
|
|
||||||
|
|
@ -1,23 +0,0 @@
|
||||||
package at.procon.dip.migration.repository;
|
|
||||||
|
|
||||||
import at.procon.dip.migration.entity.LegacyTedMigrationRun;
|
|
||||||
import at.procon.dip.migration.entity.LegacyTedMigrationRunStatus;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.UUID;
|
|
||||||
import org.springframework.data.domain.Pageable;
|
|
||||||
import org.springframework.data.jpa.repository.JpaRepository;
|
|
||||||
import org.springframework.data.jpa.repository.Query;
|
|
||||||
import org.springframework.data.repository.query.Param;
|
|
||||||
|
|
||||||
public interface LegacyTedMigrationRunRepository extends JpaRepository<LegacyTedMigrationRun, UUID> {
|
|
||||||
|
|
||||||
@Query("""
|
|
||||||
select r
|
|
||||||
from LegacyTedMigrationRun r
|
|
||||||
where r.status in :statuses
|
|
||||||
order by r.startedAt desc
|
|
||||||
""")
|
|
||||||
List<LegacyTedMigrationRun> findLatestByStatuses(@Param("statuses") Collection<LegacyTedMigrationRunStatus> statuses,
|
|
||||||
Pageable pageable);
|
|
||||||
}
|
|
||||||
|
|
@ -1,197 +0,0 @@
|
||||||
package at.procon.dip.migration.service;
|
|
||||||
|
|
||||||
import at.procon.dip.migration.config.LegacyTedBackfillProperties;
|
|
||||||
import at.procon.dip.migration.entity.LegacyTedMigrationCheckpoint;
|
|
||||||
import at.procon.dip.migration.entity.LegacyTedMigrationRun;
|
|
||||||
import at.procon.dip.migration.entity.LegacyTedMigrationRunStatus;
|
|
||||||
import at.procon.dip.migration.repository.LegacyTedMigrationCheckpointRepository;
|
|
||||||
import at.procon.dip.migration.repository.LegacyTedMigrationRunRepository;
|
|
||||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
|
||||||
import at.procon.dip.runtime.config.RuntimeMode;
|
|
||||||
import at.procon.ted.repository.LegacyTedMigrationCursor;
|
|
||||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
|
||||||
import java.time.OffsetDateTime;
|
|
||||||
import java.time.ZoneOffset;
|
|
||||||
import java.util.EnumSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.UUID;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.springframework.data.domain.PageRequest;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
import org.springframework.transaction.annotation.Transactional;
|
|
||||||
|
|
||||||
@Service
|
|
||||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@Slf4j
|
|
||||||
public class LegacyTedBackfillMigrationService {
|
|
||||||
|
|
||||||
private final LegacyTedBackfillProperties properties;
|
|
||||||
private final ProcurementDocumentRepository procurementDocumentRepository;
|
|
||||||
private final LegacyTedMigrationRunRepository runRepository;
|
|
||||||
private final LegacyTedMigrationCheckpointRepository checkpointRepository;
|
|
||||||
private final LegacyTedBackfillWorker worker;
|
|
||||||
|
|
||||||
public UUID runBackfill() {
|
|
||||||
if (!properties.isEnabled()) {
|
|
||||||
log.info("Legacy TED backfill is disabled");
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
LegacyTedMigrationRun run = resolveRun();
|
|
||||||
log.info("Starting legacy TED -> DOC/projection backfill run {} (batchSize={}, maxDocumentsPerRun={}, queueEmbeddings={})",
|
|
||||||
run.getId(), run.getBatchSize(), run.getMaxDocumentsPerRun(), run.isQueueEmbeddings());
|
|
||||||
|
|
||||||
long existingCheckpointCount = checkpointRepository.countByRun_Id(run.getId());
|
|
||||||
int batchNumber = existingCheckpointCount >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) existingCheckpointCount;
|
|
||||||
long processedInThisInvocation = 0;
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
int limit = effectiveBatchLimit(run, processedInThisInvocation);
|
|
||||||
if (limit <= 0) {
|
|
||||||
markStopped(run, "Stopped because maxDocumentsPerRun was reached");
|
|
||||||
createCheckpoint(run, ++batchNumber, 0, "Invocation limit reached");
|
|
||||||
return run.getId();
|
|
||||||
}
|
|
||||||
|
|
||||||
List<LegacyTedMigrationCursor> batch = loadNextBatch(run, limit);
|
|
||||||
|
|
||||||
if (batch.isEmpty()) {
|
|
||||||
markCompleted(run);
|
|
||||||
createCheckpoint(run, ++batchNumber, 0, "Backfill completed - no more legacy TED documents after current cursor");
|
|
||||||
return run.getId();
|
|
||||||
}
|
|
||||||
|
|
||||||
int processedInBatch = 0;
|
|
||||||
for (LegacyTedMigrationCursor cursor : batch) {
|
|
||||||
try {
|
|
||||||
LegacyTedBackfillWorker.BackfillOutcome outcome = worker.backfill(
|
|
||||||
cursor.getId(),
|
|
||||||
run.getImportBatchId(),
|
|
||||||
run.isQueueEmbeddings()
|
|
||||||
);
|
|
||||||
advanceRun(run, cursor, outcome);
|
|
||||||
processedInBatch++;
|
|
||||||
processedInThisInvocation++;
|
|
||||||
} catch (RuntimeException ex) {
|
|
||||||
markFailed(run, cursor, ex);
|
|
||||||
createCheckpoint(run, ++batchNumber, processedInBatch,
|
|
||||||
"Failed at legacy document %s: %s".formatted(cursor.getId(), ex.getMessage()));
|
|
||||||
throw ex;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
createCheckpoint(run, ++batchNumber, processedInBatch,
|
|
||||||
"Processed %d legacy TED documents in batch".formatted(processedInBatch));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected List<LegacyTedMigrationCursor> loadNextBatch(LegacyTedMigrationRun run, int limit) {
|
|
||||||
if (run.getLastLegacyCreatedAt() == null || run.getLastLegacyDocumentId() == null) {
|
|
||||||
return procurementDocumentRepository.findFirstMigrationBatch(limit);
|
|
||||||
}
|
|
||||||
return procurementDocumentRepository.findNextMigrationBatch(
|
|
||||||
run.getLastLegacyCreatedAt(),
|
|
||||||
run.getLastLegacyDocumentId(),
|
|
||||||
limit
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
protected LegacyTedMigrationRun resolveRun() {
|
|
||||||
if (properties.isResumeLatestIncompleteRun()) {
|
|
||||||
List<LegacyTedMigrationRun> resumable = runRepository.findLatestByStatuses(
|
|
||||||
EnumSet.of(LegacyTedMigrationRunStatus.RUNNING, LegacyTedMigrationRunStatus.STOPPED, LegacyTedMigrationRunStatus.FAILED),
|
|
||||||
PageRequest.of(0, 1)
|
|
||||||
);
|
|
||||||
if (!resumable.isEmpty()) {
|
|
||||||
LegacyTedMigrationRun run = resumable.get(0);
|
|
||||||
run.setStatus(LegacyTedMigrationRunStatus.RUNNING);
|
|
||||||
run.setLastError(null);
|
|
||||||
run.setCompletedAt(null);
|
|
||||||
return runRepository.save(run);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return runRepository.save(LegacyTedMigrationRun.builder()
|
|
||||||
.status(LegacyTedMigrationRunStatus.RUNNING)
|
|
||||||
.importBatchId(properties.getImportBatchId())
|
|
||||||
.queueEmbeddings(properties.isQueueEmbeddings())
|
|
||||||
.batchSize(Math.max(1, properties.getBatchSize()))
|
|
||||||
.maxDocumentsPerRun(properties.getMaxDocumentsPerRun() > 0 ? properties.getMaxDocumentsPerRun() : null)
|
|
||||||
.build());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
protected void advanceRun(LegacyTedMigrationRun run,
|
|
||||||
LegacyTedMigrationCursor cursor,
|
|
||||||
LegacyTedBackfillWorker.BackfillOutcome outcome) {
|
|
||||||
run.setStatus(LegacyTedMigrationRunStatus.RUNNING);
|
|
||||||
run.setProcessedCount(run.getProcessedCount() + 1);
|
|
||||||
run.setSuccessCount(run.getSuccessCount() + 1);
|
|
||||||
run.setLastLegacyCreatedAt(cursor.getCreatedAt() != null
|
|
||||||
? cursor.getCreatedAt().atOffset(ZoneOffset.UTC)
|
|
||||||
: null);
|
|
||||||
run.setLastLegacyDocumentId(cursor.getId());
|
|
||||||
run.setLastDocDocumentId(outcome.documentId());
|
|
||||||
run.setLastProjectionId(outcome.projectionId());
|
|
||||||
run.setLastError(null);
|
|
||||||
runRepository.save(run);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
protected void createCheckpoint(LegacyTedMigrationRun run, int batchNumber, int processedInBatch, String note) {
|
|
||||||
checkpointRepository.save(LegacyTedMigrationCheckpoint.builder()
|
|
||||||
.run(run)
|
|
||||||
.batchNumber(batchNumber)
|
|
||||||
.batchProcessedCount(processedInBatch)
|
|
||||||
.cumulativeProcessedCount(run.getProcessedCount())
|
|
||||||
.lastLegacyCreatedAt(run.getLastLegacyCreatedAt())
|
|
||||||
.lastLegacyDocumentId(run.getLastLegacyDocumentId())
|
|
||||||
.lastDocDocumentId(run.getLastDocDocumentId())
|
|
||||||
.lastProjectionId(run.getLastProjectionId())
|
|
||||||
.note(note)
|
|
||||||
.build());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
protected void markCompleted(LegacyTedMigrationRun run) {
|
|
||||||
run.setStatus(LegacyTedMigrationRunStatus.COMPLETED);
|
|
||||||
run.setCompletedAt(OffsetDateTime.now());
|
|
||||||
run.setLastError(null);
|
|
||||||
runRepository.save(run);
|
|
||||||
log.info("Legacy TED backfill run {} completed successfully - processed {} documents",
|
|
||||||
run.getId(), run.getProcessedCount());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
protected void markStopped(LegacyTedMigrationRun run, String message) {
|
|
||||||
run.setStatus(LegacyTedMigrationRunStatus.STOPPED);
|
|
||||||
run.setLastError(message);
|
|
||||||
runRepository.save(run);
|
|
||||||
log.info("Legacy TED backfill run {} stopped after {} processed documents: {}",
|
|
||||||
run.getId(), run.getProcessedCount(), message);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
protected void markFailed(LegacyTedMigrationRun run, LegacyTedMigrationCursor cursor, RuntimeException ex) {
|
|
||||||
run.setStatus(LegacyTedMigrationRunStatus.FAILED);
|
|
||||||
run.setFailedCount(run.getFailedCount() + 1);
|
|
||||||
run.setLastError(ex.getMessage());
|
|
||||||
runRepository.save(run);
|
|
||||||
log.error("Legacy TED backfill run {} failed at legacy document {}: {}",
|
|
||||||
run.getId(), cursor.getId(), ex.getMessage(), ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int effectiveBatchLimit(LegacyTedMigrationRun run, long processedInThisInvocation) {
|
|
||||||
long maxPerRun = run.getMaxDocumentsPerRun() != null ? run.getMaxDocumentsPerRun() : 0L;
|
|
||||||
if (maxPerRun <= 0) {
|
|
||||||
return Math.max(1, run.getBatchSize());
|
|
||||||
}
|
|
||||||
long remaining = maxPerRun - processedInThisInvocation;
|
|
||||||
if (remaining <= 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
return (int) Math.max(1L, Math.min(run.getBatchSize(), remaining));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,390 +0,0 @@
|
||||||
package at.procon.dip.migration.service;
|
|
||||||
|
|
||||||
import at.procon.dip.classification.spi.DetectionResult;
|
|
||||||
import at.procon.dip.domain.document.ContentRole;
|
|
||||||
import at.procon.dip.domain.document.DocumentFamily;
|
|
||||||
import at.procon.dip.domain.document.DocumentStatus;
|
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
|
||||||
import at.procon.dip.domain.document.RelationType;
|
|
||||||
import at.procon.dip.domain.document.RepresentationType;
|
|
||||||
import at.procon.dip.domain.document.SourceType;
|
|
||||||
import at.procon.dip.domain.document.StorageType;
|
|
||||||
import at.procon.dip.domain.document.entity.Document;
|
|
||||||
import at.procon.dip.domain.document.entity.DocumentContent;
|
|
||||||
import at.procon.dip.domain.document.entity.DocumentSource;
|
|
||||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
|
||||||
import at.procon.dip.domain.document.repository.DocumentContentRepository;
|
|
||||||
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
|
||||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
|
||||||
import at.procon.dip.domain.document.service.DocumentContentService;
|
|
||||||
import at.procon.dip.domain.document.service.DocumentRelationService;
|
|
||||||
import at.procon.dip.domain.document.service.DocumentRepresentationService;
|
|
||||||
import at.procon.dip.domain.document.service.DocumentService;
|
|
||||||
import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
|
|
||||||
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
|
|
||||||
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
|
||||||
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
|
|
||||||
import at.procon.dip.domain.ted.service.TedGenericDocumentRootService;
|
|
||||||
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
|
||||||
import at.procon.dip.domain.ted.service.TedPackageDocumentService;
|
|
||||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
|
||||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
|
||||||
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
|
||||||
import at.procon.dip.extraction.spi.ExtractionResult;
|
|
||||||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
|
||||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
|
||||||
import at.procon.dip.normalization.service.TextRepresentationBuildService;
|
|
||||||
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
|
||||||
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
|
||||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
|
||||||
import at.procon.dip.runtime.config.RuntimeMode;
|
|
||||||
import at.procon.dip.search.service.DocumentLexicalIndexService;
|
|
||||||
import at.procon.ted.model.entity.ProcurementDocument;
|
|
||||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
|
||||||
import at.procon.ted.util.HashUtils;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.time.OffsetDateTime;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.LinkedHashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.UUID;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
import org.springframework.transaction.annotation.Propagation;
|
|
||||||
import org.springframework.transaction.annotation.Transactional;
|
|
||||||
import org.springframework.util.StringUtils;
|
|
||||||
|
|
||||||
@Service
|
|
||||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@Slf4j
|
|
||||||
public class LegacyTedBackfillWorker {
|
|
||||||
|
|
||||||
private static final String XML_MIME_TYPE = "application/xml";
|
|
||||||
private static final String TEXT_MIME_TYPE = "text/plain";
|
|
||||||
private static final String CHARSET_UTF8 = StandardCharsets.UTF_8.name();
|
|
||||||
|
|
||||||
private final ProcurementDocumentRepository procurementDocumentRepository;
|
|
||||||
private final TedGenericDocumentRootService tedGenericDocumentRootService;
|
|
||||||
private final TedPackageDocumentService tedPackageDocumentService;
|
|
||||||
private final TedNoticeProjectionService tedNoticeProjectionService;
|
|
||||||
private final DocumentService documentService;
|
|
||||||
private final DocumentSourceRepository sourceRepository;
|
|
||||||
private final at.procon.dip.domain.document.service.DocumentSourceService sourceService;
|
|
||||||
private final DocumentContentRepository contentRepository;
|
|
||||||
private final DocumentContentService documentContentService;
|
|
||||||
private final DocumentTextRepresentationRepository representationRepository;
|
|
||||||
private final DocumentRepresentationService documentRepresentationService;
|
|
||||||
private final DocumentLexicalIndexService lexicalIndexService;
|
|
||||||
private final DocumentRelationService documentRelationService;
|
|
||||||
private final TextRepresentationBuildService textRepresentationBuildService;
|
|
||||||
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
|
||||||
private final EmbeddingProperties embeddingProperties;
|
|
||||||
|
|
||||||
@Transactional(propagation = Propagation.REQUIRES_NEW)
|
|
||||||
public BackfillOutcome backfill(UUID legacyProcurementDocumentId, String importBatchId, boolean queueEmbeddings) {
|
|
||||||
ProcurementDocument legacyDocument = procurementDocumentRepository.findById(legacyProcurementDocumentId)
|
|
||||||
.orElseThrow(() -> new IllegalArgumentException("Unknown legacy TED procurement document id: " + legacyProcurementDocumentId));
|
|
||||||
|
|
||||||
Document document = tedGenericDocumentRootService.ensureGenericTedDocument(legacyDocument);
|
|
||||||
ensureSource(document, legacyDocument, importBatchId);
|
|
||||||
DocumentContent originalContent = ensureOriginalXmlContent(document, legacyDocument);
|
|
||||||
DocumentContent normalizedTextContent = ensureNormalizedTextContent(document, legacyDocument);
|
|
||||||
|
|
||||||
List<TextRepresentationDraft> drafts = buildDrafts(legacyDocument);
|
|
||||||
List<DocumentTextRepresentation> savedRepresentations = ensureRepresentations(document, originalContent, normalizedTextContent, drafts);
|
|
||||||
|
|
||||||
tedPackageDocumentService.ensurePackageDocumentForLegacyNotice(legacyDocument)
|
|
||||||
.ifPresent(packageDocument -> documentRelationService.ensureRelation(new CreateDocumentRelationCommand(
|
|
||||||
packageDocument.getId(),
|
|
||||||
document.getId(),
|
|
||||||
RelationType.CONTAINS,
|
|
||||||
null,
|
|
||||||
legacyDocument.getSourcePath()
|
|
||||||
)));
|
|
||||||
|
|
||||||
UUID projectionId = tedNoticeProjectionService.registerOrRefreshProjection(legacyDocument, document.getId());
|
|
||||||
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
|
|
||||||
|
|
||||||
if (queueEmbeddings) {
|
|
||||||
queueEmbeddings(document.getId(), savedRepresentations);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new BackfillOutcome(document.getId(), projectionId);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void ensureSource(Document document, ProcurementDocument legacyDocument, String importBatchId) {
|
|
||||||
String externalSourceId = legacyDocument.getId().toString();
|
|
||||||
DocumentSource existing = sourceRepository.findByDocument_Id(document.getId()).stream()
|
|
||||||
.filter(candidate -> candidate.getSourceType() == SourceType.MIGRATION || Objects.equals(candidate.getExternalSourceId(), externalSourceId))
|
|
||||||
.findFirst()
|
|
||||||
.orElse(null);
|
|
||||||
if (existing == null) {
|
|
||||||
sourceService.addSource(new AddDocumentSourceCommand(
|
|
||||||
document.getId(),
|
|
||||||
SourceType.MIGRATION,
|
|
||||||
externalSourceId,
|
|
||||||
legacyDocument.getSourcePath(),
|
|
||||||
legacyDocument.getSourceFilename(),
|
|
||||||
null,
|
|
||||||
importBatchId,
|
|
||||||
legacyDocument.getCreatedAt() != null ? legacyDocument.getCreatedAt() : OffsetDateTime.now()
|
|
||||||
));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
existing.setSourceType(SourceType.MIGRATION);
|
|
||||||
existing.setExternalSourceId(externalSourceId);
|
|
||||||
existing.setSourceUri(legacyDocument.getSourcePath());
|
|
||||||
existing.setSourceFilename(legacyDocument.getSourceFilename());
|
|
||||||
existing.setImportBatchId(importBatchId);
|
|
||||||
existing.setReceivedAt(legacyDocument.getCreatedAt() != null ? legacyDocument.getCreatedAt() : existing.getReceivedAt());
|
|
||||||
sourceRepository.save(existing);
|
|
||||||
}
|
|
||||||
|
|
||||||
private DocumentContent ensureOriginalXmlContent(Document document, ProcurementDocument legacyDocument) {
|
|
||||||
String xml = legacyDocument.getXmlDocument();
|
|
||||||
long sizeBytes = legacyDocument.getFileSizeBytes() != null
|
|
||||||
? legacyDocument.getFileSizeBytes()
|
|
||||||
: (xml == null ? 0L : (long) xml.getBytes(StandardCharsets.UTF_8).length);
|
|
||||||
return ensureContent(document, ContentRole.ORIGINAL, XML_MIME_TYPE, xml, legacyDocument.getDocumentHash(), sizeBytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
private DocumentContent ensureNormalizedTextContent(Document document, ProcurementDocument legacyDocument) {
|
|
||||||
String normalizedText = resolveNormalizedText(legacyDocument);
|
|
||||||
String contentHash = HashUtils.computeSha256(legacyDocument.getDocumentHash() + ":NORMALIZED_TEXT:" + normalizedText);
|
|
||||||
return ensureContent(document, ContentRole.NORMALIZED_TEXT, TEXT_MIME_TYPE, normalizedText, contentHash, (long) normalizedText.length());
|
|
||||||
}
|
|
||||||
|
|
||||||
private DocumentContent ensureContent(Document document,
|
|
||||||
ContentRole role,
|
|
||||||
String mimeType,
|
|
||||||
String text,
|
|
||||||
String contentHash,
|
|
||||||
Long sizeBytes) {
|
|
||||||
List<DocumentContent> existing = contentRepository.findByDocument_IdAndContentRole(document.getId(), role);
|
|
||||||
if (!existing.isEmpty()) {
|
|
||||||
DocumentContent content = existing.get(0);
|
|
||||||
content.setStorageType(StorageType.DB_TEXT);
|
|
||||||
content.setMimeType(mimeType);
|
|
||||||
content.setCharsetName(CHARSET_UTF8);
|
|
||||||
content.setTextContent(text);
|
|
||||||
content.setBinaryContent(null);
|
|
||||||
content.setBinaryRef(null);
|
|
||||||
content.setContentHash(contentHash);
|
|
||||||
content.setSizeBytes(sizeBytes);
|
|
||||||
return contentRepository.save(content);
|
|
||||||
}
|
|
||||||
|
|
||||||
return documentContentService.addContent(new AddDocumentContentCommand(
|
|
||||||
document.getId(),
|
|
||||||
role,
|
|
||||||
StorageType.DB_TEXT,
|
|
||||||
mimeType,
|
|
||||||
CHARSET_UTF8,
|
|
||||||
text,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
contentHash,
|
|
||||||
sizeBytes
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<TextRepresentationDraft> buildDrafts(ProcurementDocument legacyDocument) {
|
|
||||||
String normalizedText = resolveNormalizedText(legacyDocument);
|
|
||||||
|
|
||||||
Map<ContentRole, String> derivedText = new LinkedHashMap<>();
|
|
||||||
derivedText.put(ContentRole.NORMALIZED_TEXT, normalizedText);
|
|
||||||
|
|
||||||
SourceDescriptor sourceDescriptor = new SourceDescriptor(
|
|
||||||
null,
|
|
||||||
SourceType.MIGRATION,
|
|
||||||
legacyDocument.getId().toString(),
|
|
||||||
legacyDocument.getSourcePath(),
|
|
||||||
legacyDocument.getSourceFilename(),
|
|
||||||
XML_MIME_TYPE,
|
|
||||||
null,
|
|
||||||
legacyDocument.getXmlDocument(),
|
|
||||||
legacyDocument.getCreatedAt(),
|
|
||||||
OriginalContentStoragePolicy.DEFAULT,
|
|
||||||
Map.of("legacyProcurementDocumentId", legacyDocument.getId().toString())
|
|
||||||
);
|
|
||||||
|
|
||||||
DetectionResult detectionResult = new DetectionResult(
|
|
||||||
DocumentType.TED_NOTICE,
|
|
||||||
DocumentFamily.PROCUREMENT,
|
|
||||||
XML_MIME_TYPE,
|
|
||||||
legacyDocument.getLanguageCode(),
|
|
||||||
Map.of()
|
|
||||||
);
|
|
||||||
|
|
||||||
ExtractionResult extractionResult = new ExtractionResult(
|
|
||||||
derivedText,
|
|
||||||
List.of(new ExtractedStructuredPayload("ted-notice", buildStructuredAttributes(legacyDocument))),
|
|
||||||
List.of()
|
|
||||||
);
|
|
||||||
|
|
||||||
return textRepresentationBuildService.build(new RepresentationBuildRequest(
|
|
||||||
sourceDescriptor,
|
|
||||||
detectionResult,
|
|
||||||
extractionResult
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<DocumentTextRepresentation> ensureRepresentations(Document document,
|
|
||||||
DocumentContent originalContent,
|
|
||||||
DocumentContent normalizedTextContent,
|
|
||||||
List<TextRepresentationDraft> drafts) {
|
|
||||||
if (drafts == null || drafts.isEmpty()) {
|
|
||||||
return List.of();
|
|
||||||
}
|
|
||||||
|
|
||||||
List<DocumentTextRepresentation> existing = new ArrayList<>(representationRepository.findByDocument_Id(document.getId()));
|
|
||||||
existing.sort(Comparator.comparing(DocumentTextRepresentation::getCreatedAt, Comparator.nullsLast(Comparator.naturalOrder())));
|
|
||||||
|
|
||||||
List<DocumentTextRepresentation> saved = new ArrayList<>();
|
|
||||||
for (TextRepresentationDraft draft : drafts) {
|
|
||||||
if (!StringUtils.hasText(draft.textBody())) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
DocumentTextRepresentation representation = findMatchingRepresentation(existing, draft)
|
|
||||||
.orElseGet(DocumentTextRepresentation::new);
|
|
||||||
|
|
||||||
DocumentContent linkedContent = draft.sourceContentRole() == ContentRole.NORMALIZED_TEXT
|
|
||||||
? normalizedTextContent
|
|
||||||
: originalContent;
|
|
||||||
|
|
||||||
boolean isNew = representation.getId() == null;
|
|
||||||
representation.setDocument(document);
|
|
||||||
representation.setContent(linkedContent);
|
|
||||||
representation.setRepresentationType(draft.representationType());
|
|
||||||
representation.setBuilderKey(draft.builderKey());
|
|
||||||
representation.setLanguageCode(draft.languageCode());
|
|
||||||
representation.setTokenCount(draft.textBody().length());
|
|
||||||
representation.setCharCount(draft.textBody().length());
|
|
||||||
representation.setChunkIndex(draft.chunkIndex());
|
|
||||||
representation.setChunkStartOffset(draft.chunkStartOffset());
|
|
||||||
representation.setChunkEndOffset(draft.chunkEndOffset());
|
|
||||||
representation.setPrimaryRepresentation(draft.primary());
|
|
||||||
representation.setTextBody(draft.textBody());
|
|
||||||
|
|
||||||
DocumentTextRepresentation savedRepresentation;
|
|
||||||
if (isNew) {
|
|
||||||
savedRepresentation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
|
|
||||||
document.getId(),
|
|
||||||
linkedContent == null ? null : linkedContent.getId(),
|
|
||||||
draft.representationType(),
|
|
||||||
draft.builderKey(),
|
|
||||||
draft.languageCode(),
|
|
||||||
draft.textBody().length(),
|
|
||||||
draft.chunkIndex(),
|
|
||||||
draft.chunkStartOffset(),
|
|
||||||
draft.chunkEndOffset(),
|
|
||||||
draft.primary(),
|
|
||||||
draft.textBody()
|
|
||||||
));
|
|
||||||
existing.add(savedRepresentation);
|
|
||||||
} else {
|
|
||||||
savedRepresentation = representationRepository.save(representation);
|
|
||||||
lexicalIndexService.indexRepresentation(savedRepresentation.getId());
|
|
||||||
}
|
|
||||||
saved.add(savedRepresentation);
|
|
||||||
}
|
|
||||||
return saved;
|
|
||||||
}
|
|
||||||
|
|
||||||
private java.util.Optional<DocumentTextRepresentation> findMatchingRepresentation(List<DocumentTextRepresentation> existing,
|
|
||||||
TextRepresentationDraft draft) {
|
|
||||||
return existing.stream()
|
|
||||||
.filter(candidate -> candidate.getRepresentationType() == draft.representationType())
|
|
||||||
.filter(candidate -> Objects.equals(candidate.getChunkIndex(), draft.chunkIndex()))
|
|
||||||
.filter(candidate -> Objects.equals(candidate.isPrimaryRepresentation(), draft.primary()))
|
|
||||||
.findFirst();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void queueEmbeddings(UUID documentId, List<DocumentTextRepresentation> representations) {
|
|
||||||
if (!embeddingProperties.isEnabled() || !StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) {
|
|
||||||
log.debug("Skipping embedding queue for migrated document {} because no default document model is configured", documentId);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (DocumentTextRepresentation representation : representations) {
|
|
||||||
RepresentationType type = representation.getRepresentationType();
|
|
||||||
boolean queue = switch (type) {
|
|
||||||
case SEMANTIC_TEXT -> true;
|
|
||||||
case CHUNK -> true;
|
|
||||||
default -> false;
|
|
||||||
};
|
|
||||||
if (!queue) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
embeddingOrchestrator.enqueueRepresentation(documentId, representation.getId(), embeddingProperties.getDefaultDocumentModel());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private String resolveNormalizedText(ProcurementDocument legacyDocument) {
|
|
||||||
if (StringUtils.hasText(legacyDocument.getTextContent())) {
|
|
||||||
return legacyDocument.getTextContent().trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
appendLine(sb, legacyDocument.getProjectTitle());
|
|
||||||
appendBlank(sb, legacyDocument.getProjectDescription());
|
|
||||||
appendLine(sb, legacyDocument.getBuyerName());
|
|
||||||
appendLine(sb, joinArray(legacyDocument.getCpvCodes()));
|
|
||||||
appendLine(sb, joinArray(legacyDocument.getNutsCodes()));
|
|
||||||
String fallback = sb.toString().trim();
|
|
||||||
return StringUtils.hasText(fallback) ? fallback : legacyDocument.getDocumentHash();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Map<String, Object> buildStructuredAttributes(ProcurementDocument legacyDocument) {
|
|
||||||
Map<String, Object> attrs = new LinkedHashMap<>();
|
|
||||||
putIfText(attrs, "title", legacyDocument.getProjectTitle());
|
|
||||||
putIfText(attrs, "description", legacyDocument.getProjectDescription());
|
|
||||||
putIfText(attrs, "buyerName", legacyDocument.getBuyerName());
|
|
||||||
putIfText(attrs, "cpvCodes", joinArray(legacyDocument.getCpvCodes()));
|
|
||||||
putIfText(attrs, "nutsCodes", joinArray(legacyDocument.getNutsCodes()));
|
|
||||||
putIfText(attrs, "publicationId", legacyDocument.getPublicationId());
|
|
||||||
return attrs;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void putIfText(Map<String, Object> target, String key, String value) {
|
|
||||||
if (StringUtils.hasText(value)) {
|
|
||||||
target.put(key, value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private String joinArray(String[] values) {
|
|
||||||
if (values == null || values.length == 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return String.join(", ", values);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void appendLine(StringBuilder sb, String value) {
|
|
||||||
if (StringUtils.hasText(value)) {
|
|
||||||
if (sb.length() > 0) {
|
|
||||||
sb.append('\n');
|
|
||||||
}
|
|
||||||
sb.append(value.trim());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void appendBlank(StringBuilder sb, String value) {
|
|
||||||
if (StringUtils.hasText(value)) {
|
|
||||||
if (sb.length() > 0) {
|
|
||||||
sb.append("\n\n");
|
|
||||||
}
|
|
||||||
sb.append(value.trim());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public record BackfillOutcome(UUID documentId, UUID projectionId) {
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,31 +0,0 @@
|
||||||
package at.procon.dip.migration.startup;
|
|
||||||
|
|
||||||
import at.procon.dip.migration.config.LegacyTedBackfillProperties;
|
|
||||||
import at.procon.dip.migration.service.LegacyTedBackfillMigrationService;
|
|
||||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
|
||||||
import at.procon.dip.runtime.config.RuntimeMode;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.springframework.boot.ApplicationArguments;
|
|
||||||
import org.springframework.boot.ApplicationRunner;
|
|
||||||
import org.springframework.stereotype.Component;
|
|
||||||
|
|
||||||
@Component
|
|
||||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@Slf4j
|
|
||||||
public class LegacyTedBackfillStartupRunner implements ApplicationRunner {
|
|
||||||
|
|
||||||
private final LegacyTedBackfillProperties properties;
|
|
||||||
private final LegacyTedBackfillMigrationService migrationService;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void run(ApplicationArguments args) {
|
|
||||||
if (!properties.isEnabled() || !properties.isStartupEnabled()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
log.info("Startup-triggered legacy TED backfill is enabled");
|
|
||||||
migrationService.runBackfill();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -66,7 +66,6 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.TRUE
|
Boolean.TRUE
|
||||||
));
|
));
|
||||||
/*
|
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
RepresentationType.FULLTEXT,
|
RepresentationType.FULLTEXT,
|
||||||
BUILDER_KEY,
|
BUILDER_KEY,
|
||||||
|
|
@ -105,7 +104,6 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.FALSE
|
Boolean.FALSE
|
||||||
));
|
));
|
||||||
*/
|
|
||||||
return drafts;
|
return drafts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -52,7 +52,7 @@ public class Organization {
|
||||||
/**
|
/**
|
||||||
* Company/tax registration ID.
|
* Company/tax registration ID.
|
||||||
*/
|
*/
|
||||||
@Column(name = "company_id", columnDefinition = "TEXT")
|
@Column(name = "company_id", length = 1000)
|
||||||
private String companyId;
|
private String companyId;
|
||||||
|
|
||||||
@Column(name = "country_code", length = 10)
|
@Column(name = "country_code", length = 10)
|
||||||
|
|
|
||||||
|
|
@ -105,7 +105,7 @@ public class ProcurementDocument {
|
||||||
@Column(name = "buyer_city", columnDefinition = "TEXT")
|
@Column(name = "buyer_city", columnDefinition = "TEXT")
|
||||||
private String buyerCity;
|
private String buyerCity;
|
||||||
|
|
||||||
@Column(name = "buyer_postal_code", columnDefinition = "TEXT")
|
@Column(name = "buyer_postal_code", length = 100)
|
||||||
private String buyerPostalCode;
|
private String buyerPostalCode;
|
||||||
|
|
||||||
@Column(name = "buyer_nuts_code", length = 10)
|
@Column(name = "buyer_nuts_code", length = 10)
|
||||||
|
|
|
||||||
|
|
@ -1,9 +0,0 @@
|
||||||
package at.procon.ted.repository;
|
|
||||||
|
|
||||||
import java.time.Instant;
|
|
||||||
import java.util.UUID;
|
|
||||||
|
|
||||||
/**
 * Lightweight view of a legacy procurement document carrying only the two values used as the
 * migration cursor position.
 */
public interface LegacyTedMigrationCursor {

    /** @return id of the legacy procurement document */
    UUID getId();

    /** @return creation time of the legacy procurement document */
    Instant getCreatedAt();
}
|
|
||||||
|
|
@ -209,33 +209,6 @@ public interface ProcurementDocumentRepository extends
|
||||||
nativeQuery = true)
|
nativeQuery = true)
|
||||||
List<ProcurementDocument> findByTextContentContaining(@Param("query") String query, Pageable pageable);
|
List<ProcurementDocument> findByTextContentContaining(@Param("query") String query, Pageable pageable);
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* First lightweight cursor query for resumable legacy -> DOC/projection backfill.
|
|
||||||
*/
|
|
||||||
@Query(value = """
|
|
||||||
SELECT p.id AS id, p.created_at AS createdAt
|
|
||||||
FROM ted.procurement_document p
|
|
||||||
ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
|
|
||||||
LIMIT :limit
|
|
||||||
""", nativeQuery = true)
|
|
||||||
List<LegacyTedMigrationCursor> findFirstMigrationBatch(@Param("limit") int limit);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Next lightweight cursor query for resumable legacy -> DOC/projection backfill.
|
|
||||||
*/
|
|
||||||
@Query(value = """
|
|
||||||
SELECT p.id AS id, p.created_at AS createdAt
|
|
||||||
FROM ted.procurement_document p
|
|
||||||
WHERE p.created_at > :lastCreatedAt
|
|
||||||
OR (p.created_at = :lastCreatedAt AND CAST(p.id AS text) > CAST(:lastId AS text))
|
|
||||||
ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
|
|
||||||
LIMIT :limit
|
|
||||||
""", nativeQuery = true)
|
|
||||||
List<LegacyTedMigrationCursor> findNextMigrationBatch(@Param("lastCreatedAt") OffsetDateTime lastCreatedAt,
|
|
||||||
@Param("lastId") UUID lastId,
|
|
||||||
@Param("limit") int limit);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Delete all documents created before the specified date.
|
* Delete all documents created before the specified date.
|
||||||
* Cascading deletes will automatically remove related lots, organizations, and logs.
|
* Cascading deletes will automatically remove related lots, organizations, and logs.
|
||||||
|
|
|
||||||
|
|
@ -228,7 +228,6 @@ public class XmlParserService {
|
||||||
|
|
||||||
// Name
|
// Name
|
||||||
org.setName(getTextContent(xpath, orgNode, ".//cac:PartyName/cbc:Name"));
|
org.setName(getTextContent(xpath, orgNode, ".//cac:PartyName/cbc:Name"));
|
||||||
if(org.getName() == null) org.setName("");
|
|
||||||
|
|
||||||
// Company ID
|
// Company ID
|
||||||
org.setCompanyId(getTextContent(xpath, orgNode, ".//cac:PartyLegalEntity/cbc:CompanyID"));
|
org.setCompanyId(getTextContent(xpath, orgNode, ".//cac:PartyLegalEntity/cbc:CompanyID"));
|
||||||
|
|
@ -265,361 +264,7 @@ public class XmlParserService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final Map<String, XPathExpression> cache = new HashMap<>();
|
|
||||||
|
|
||||||
private XPathExpression getCompiled(XPath xpath, String expression) throws XPathExpressionException {
|
|
||||||
XPathExpression compiled = cache.get(expression);
|
|
||||||
if (compiled == null) {
|
|
||||||
compiled = xpath.compile(expression);
|
|
||||||
cache.put(expression, compiled);
|
|
||||||
}
|
|
||||||
return compiled;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getTextContent(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
|
||||||
XPathExpression expr = getCompiled(xpath, expression);
|
|
||||||
Node node = (Node) expr.evaluate(item, XPathConstants.NODE);
|
|
||||||
return node != null ? node.getTextContent().trim() : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Node getNode(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
|
||||||
return (Node) getCompiled(xpath, expression).evaluate(item, XPathConstants.NODE);
|
|
||||||
}
|
|
||||||
|
|
||||||
private NodeList getNodes(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
|
||||||
return (NodeList) getCompiled(xpath, expression).evaluate(item, XPathConstants.NODESET);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Element getDirectChild(Element parent, String namespaceUri, String localName) {
|
|
||||||
Node child = parent.getFirstChild();
|
|
||||||
while (child != null) {
|
|
||||||
if (child.getNodeType() == Node.ELEMENT_NODE) {
|
|
||||||
Element el = (Element) child;
|
|
||||||
if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) {
|
|
||||||
return el;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
child = child.getNextSibling();
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<Element> getDirectChildren(Element parent, String namespaceUri, String localName) {
|
|
||||||
List<Element> result = new ArrayList<>();
|
|
||||||
Node child = parent.getFirstChild();
|
|
||||||
while (child != null) {
|
|
||||||
if (child.getNodeType() == Node.ELEMENT_NODE) {
|
|
||||||
Element el = (Element) child;
|
|
||||||
if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) {
|
|
||||||
result.add(el);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
child = child.getNextSibling();
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getDirectChildText(Element parent, String namespaceUri, String localName) {
|
|
||||||
Element child = getDirectChild(parent, namespaceUri, localName);
|
|
||||||
if (child == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return trimToNull(child.getTextContent());
|
|
||||||
}
|
|
||||||
|
|
||||||
private String trimToNull(String value) {
|
|
||||||
if (value == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
String trimmed = value.trim();
|
|
||||||
return trimmed.isEmpty() ? null : trimmed;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void parseLotsDOM(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
||||||
NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot");
|
|
||||||
document.setTotalLots(lotNodes.getLength());
|
|
||||||
|
|
||||||
for (int i = 0; i < lotNodes.getLength(); i++) {
|
|
||||||
Node lotNode = lotNodes.item(i);
|
|
||||||
if (lotNode.getNodeType() != Node.ELEMENT_NODE) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Element lotEl = (Element) lotNode;
|
|
||||||
ProcurementLot lot = ProcurementLot.builder().build();
|
|
||||||
|
|
||||||
// Direct child values on the lot
|
|
||||||
lot.setLotId(getDirectChildText(lotEl, NS_CBC, "ID"));
|
|
||||||
|
|
||||||
Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject");
|
|
||||||
if (procurementProjectEl != null) {
|
|
||||||
lot.setInternalId(getDirectChildText(procurementProjectEl, NS_CBC, "ID"));
|
|
||||||
lot.setTitle(getDirectChildText(procurementProjectEl, NS_CBC, "Name"));
|
|
||||||
lot.setDescription(getDirectChildText(procurementProjectEl, NS_CBC, "Description"));
|
|
||||||
|
|
||||||
// CPV codes
|
|
||||||
List<String> lotCpvCodes = new ArrayList<>();
|
|
||||||
for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) {
|
|
||||||
String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode");
|
|
||||||
if (cpv != null && !cpv.isEmpty()) {
|
|
||||||
lotCpvCodes.add(cpv);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
|
|
||||||
|
|
||||||
// NUTS codes
|
|
||||||
List<String> lotNutsCodes = new ArrayList<>();
|
|
||||||
for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) {
|
|
||||||
Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address");
|
|
||||||
if (addressEl != null) {
|
|
||||||
String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode");
|
|
||||||
if (nuts != null && !nuts.isEmpty()) {
|
|
||||||
lotNutsCodes.add(nuts);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
|
|
||||||
|
|
||||||
// Duration
|
|
||||||
Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod");
|
|
||||||
if (plannedPeriodEl != null) {
|
|
||||||
Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure");
|
|
||||||
if (durationEl != null) {
|
|
||||||
String durationValue = trimToNull(durationEl.getTextContent());
|
|
||||||
if (durationValue != null) {
|
|
||||||
try {
|
|
||||||
lot.setDurationValue(Double.parseDouble(durationValue));
|
|
||||||
} catch (NumberFormatException e) {
|
|
||||||
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
|
|
||||||
if (unitCode != null) {
|
|
||||||
lot.setDurationUnit(unitCode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Submission deadline
|
|
||||||
Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess");
|
|
||||||
if (tenderingProcessEl != null) {
|
|
||||||
Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod");
|
|
||||||
if (deadlinePeriodEl != null) {
|
|
||||||
String endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate");
|
|
||||||
if (endDate != null) {
|
|
||||||
String endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime");
|
|
||||||
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
|
|
||||||
|
|
||||||
if (document.getSubmissionDeadline() == null) {
|
|
||||||
document.setSubmissionDeadline(lot.getSubmissionDeadline());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// EU funded
|
|
||||||
Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms");
|
|
||||||
if (tenderingTermsEl != null) {
|
|
||||||
String fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode");
|
|
||||||
lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds"));
|
|
||||||
}
|
|
||||||
|
|
||||||
document.addLot(lot);
|
|
||||||
}
|
|
||||||
|
|
||||||
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
|
|
||||||
}
|
|
||||||
|
|
||||||
private void parseLots(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
private void parseLots(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
||||||
NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot");
|
|
||||||
document.setTotalLots(lotNodes.getLength());
|
|
||||||
|
|
||||||
for (int i = 0; i < lotNodes.getLength(); i++) {
|
|
||||||
Node lotNode = lotNodes.item(i);
|
|
||||||
if (lotNode.getNodeType() != Node.ELEMENT_NODE) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Element lotEl = (Element) lotNode;
|
|
||||||
ProcurementLot lot = ProcurementLot.builder().build();
|
|
||||||
|
|
||||||
// Fast direct children
|
|
||||||
Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject");
|
|
||||||
Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess");
|
|
||||||
Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms");
|
|
||||||
|
|
||||||
// --- Lot ID ---
|
|
||||||
String lotId = getDirectChildText(lotEl, NS_CBC, "ID");
|
|
||||||
if (lotId == null) {
|
|
||||||
lotId = getTextContent(xpath, lotNode, "cbc:ID");
|
|
||||||
}
|
|
||||||
lot.setLotId(lotId);
|
|
||||||
|
|
||||||
// --- Internal ID ---
|
|
||||||
String internalId = null;
|
|
||||||
if (procurementProjectEl != null) {
|
|
||||||
internalId = getDirectChildText(procurementProjectEl, NS_CBC, "ID");
|
|
||||||
}
|
|
||||||
if (internalId == null) {
|
|
||||||
internalId = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:ID");
|
|
||||||
}
|
|
||||||
lot.setInternalId(internalId);
|
|
||||||
|
|
||||||
// --- Title ---
|
|
||||||
String title = null;
|
|
||||||
if (procurementProjectEl != null) {
|
|
||||||
title = getDirectChildText(procurementProjectEl, NS_CBC, "Name");
|
|
||||||
}
|
|
||||||
if (title == null) {
|
|
||||||
title = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Name");
|
|
||||||
}
|
|
||||||
lot.setTitle(title);
|
|
||||||
|
|
||||||
// --- Description ---
|
|
||||||
String description = null;
|
|
||||||
if (procurementProjectEl != null) {
|
|
||||||
description = getDirectChildText(procurementProjectEl, NS_CBC, "Description");
|
|
||||||
}
|
|
||||||
if (description == null) {
|
|
||||||
description = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Description");
|
|
||||||
}
|
|
||||||
lot.setDescription(description);
|
|
||||||
|
|
||||||
// --- CPV codes ---
|
|
||||||
List<String> lotCpvCodes = new ArrayList<>();
|
|
||||||
if (procurementProjectEl != null) {
|
|
||||||
for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) {
|
|
||||||
String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode");
|
|
||||||
if (cpv != null && !cpv.isEmpty()) {
|
|
||||||
lotCpvCodes.add(cpv);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (lotCpvCodes.isEmpty()) {
|
|
||||||
NodeList cpvNodes = getNodes(xpath, lotNode,
|
|
||||||
".//cac:MainCommodityClassification/cbc:ItemClassificationCode");
|
|
||||||
for (int j = 0; j < cpvNodes.getLength(); j++) {
|
|
||||||
String cpv = trimToNull(cpvNodes.item(j).getTextContent());
|
|
||||||
if (cpv != null) {
|
|
||||||
lotCpvCodes.add(cpv);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
|
|
||||||
|
|
||||||
// --- NUTS codes ---
|
|
||||||
List<String> lotNutsCodes = new ArrayList<>();
|
|
||||||
if (procurementProjectEl != null) {
|
|
||||||
for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) {
|
|
||||||
Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address");
|
|
||||||
if (addressEl != null) {
|
|
||||||
String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode");
|
|
||||||
if (nuts != null && !nuts.isEmpty()) {
|
|
||||||
lotNutsCodes.add(nuts);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (lotNutsCodes.isEmpty()) {
|
|
||||||
NodeList nutsNodes = getNodes(xpath, lotNode,
|
|
||||||
".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode");
|
|
||||||
for (int j = 0; j < nutsNodes.getLength(); j++) {
|
|
||||||
String nuts = trimToNull(nutsNodes.item(j).getTextContent());
|
|
||||||
if (nuts != null) {
|
|
||||||
lotNutsCodes.add(nuts);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
|
|
||||||
|
|
||||||
// --- Duration ---
|
|
||||||
boolean durationSet = false;
|
|
||||||
if (procurementProjectEl != null) {
|
|
||||||
Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod");
|
|
||||||
if (plannedPeriodEl != null) {
|
|
||||||
Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure");
|
|
||||||
if (durationEl != null) {
|
|
||||||
String durationValue = trimToNull(durationEl.getTextContent());
|
|
||||||
if (durationValue != null) {
|
|
||||||
try {
|
|
||||||
lot.setDurationValue(Double.parseDouble(durationValue));
|
|
||||||
} catch (NumberFormatException e) {
|
|
||||||
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
|
|
||||||
if (unitCode != null) {
|
|
||||||
lot.setDurationUnit(unitCode);
|
|
||||||
}
|
|
||||||
durationSet = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!durationSet) {
|
|
||||||
Node durationNode = getNode(xpath, lotNode,
|
|
||||||
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure");
|
|
||||||
if (durationNode != null) {
|
|
||||||
String durationValue = trimToNull(durationNode.getTextContent());
|
|
||||||
if (durationValue != null) {
|
|
||||||
try {
|
|
||||||
lot.setDurationValue(Double.parseDouble(durationValue));
|
|
||||||
} catch (NumberFormatException e) {
|
|
||||||
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (durationNode instanceof Element durationEl) {
|
|
||||||
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
|
|
||||||
if (unitCode != null) {
|
|
||||||
lot.setDurationUnit(unitCode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Submission deadline ---
|
|
||||||
String endDate = null;
|
|
||||||
String endTime = null;
|
|
||||||
if (tenderingProcessEl != null) {
|
|
||||||
Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod");
|
|
||||||
if (deadlinePeriodEl != null) {
|
|
||||||
endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate");
|
|
||||||
endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (endDate == null) {
|
|
||||||
endDate = getTextContent(xpath, lotNode,
|
|
||||||
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndDate");
|
|
||||||
endTime = getTextContent(xpath, lotNode,
|
|
||||||
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndTime");
|
|
||||||
}
|
|
||||||
if (endDate != null) {
|
|
||||||
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
|
|
||||||
if (document.getSubmissionDeadline() == null) {
|
|
||||||
document.setSubmissionDeadline(lot.getSubmissionDeadline());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- EU funded ---
|
|
||||||
String fundingProgramCode = null;
|
|
||||||
if (tenderingTermsEl != null) {
|
|
||||||
fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode");
|
|
||||||
}
|
|
||||||
if (fundingProgramCode == null) {
|
|
||||||
fundingProgramCode = getTextContent(xpath, lotNode,
|
|
||||||
"cac:TenderingTerms/cbc:FundingProgramCode");
|
|
||||||
}
|
|
||||||
lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds"));
|
|
||||||
|
|
||||||
document.addLot(lot);
|
|
||||||
}
|
|
||||||
|
|
||||||
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
|
|
||||||
}
|
|
||||||
|
|
||||||
private void parseLotsOld(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
||||||
NodeList lotNodes = (NodeList) xpath.evaluate(
|
NodeList lotNodes = (NodeList) xpath.evaluate(
|
||||||
"//cac:ProcurementProjectLot", doc, XPathConstants.NODESET);
|
"//cac:ProcurementProjectLot", doc, XPathConstants.NODESET);
|
||||||
|
|
||||||
|
|
@ -784,9 +429,14 @@ public class XmlParserService {
|
||||||
|
|
||||||
// Helper methods
|
// Helper methods
|
||||||
|
|
||||||
|
private String getTextContent(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
||||||
|
Node node = (Node) xpath.evaluate(expression, item, XPathConstants.NODE);
|
||||||
|
return node != null ? node.getTextContent().trim() : null;
|
||||||
|
}
|
||||||
|
|
||||||
private List<String> getTextContents(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
private List<String> getTextContents(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
||||||
List<String> results = new ArrayList<>();
|
List<String> results = new ArrayList<>();
|
||||||
NodeList nodes = getNodes(xpath, item, expression);
|
NodeList nodes = (NodeList) xpath.evaluate(expression, item, XPathConstants.NODESET);
|
||||||
for (int i = 0; i < nodes.getLength(); i++) {
|
for (int i = 0; i < nodes.getLength(); i++) {
|
||||||
String text = nodes.item(i).getTextContent().trim();
|
String text = nodes.item(i).getTextContent().trim();
|
||||||
if (!text.isEmpty()) {
|
if (!text.isEmpty()) {
|
||||||
|
|
@ -797,10 +447,9 @@ public class XmlParserService {
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getAttributeValue(XPath xpath, Object item, String expression, String attrName) throws XPathExpressionException {
|
private String getAttributeValue(XPath xpath, Object item, String expression, String attrName) throws XPathExpressionException {
|
||||||
Node node = getNode(xpath, item, expression);
|
Node node = (Node) xpath.evaluate(expression, item, XPathConstants.NODE);
|
||||||
if (node instanceof Element element) {
|
if (node instanceof Element) {
|
||||||
String value = element.getAttribute(attrName);
|
return ((Element) node).getAttribute(attrName);
|
||||||
return trimToNull(value);
|
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ ted:
|
||||||
# Use external HTTP API instead of subprocess
|
# Use external HTTP API instead of subprocess
|
||||||
use-http-api: true
|
use-http-api: true
|
||||||
# Embedding service URL
|
# Embedding service URL
|
||||||
api-url: http://172.20.20.6:8001
|
api-url: http://172.20.240.18:8001
|
||||||
# Model name for sentence-transformers
|
# Model name for sentence-transformers
|
||||||
model-name: intfloat/multilingual-e5-large
|
model-name: intfloat/multilingual-e5-large
|
||||||
# Vector dimensions (must match model output)
|
# Vector dimensions (must match model output)
|
||||||
|
|
@ -55,7 +55,7 @@ ted:
|
||||||
# Packages download configuration
|
# Packages download configuration
|
||||||
download:
|
download:
|
||||||
# Enable/disable automatic package download
|
# Enable/disable automatic package download
|
||||||
enabled: false
|
enabled: true
|
||||||
# Use service-based Camel route
|
# Use service-based Camel route
|
||||||
use-service-based: false
|
use-service-based: false
|
||||||
# Base URL for TED Daily Packages
|
# Base URL for TED Daily Packages
|
||||||
|
|
@ -91,9 +91,9 @@ ted:
|
||||||
# Enable one-off repair / re-import of incomplete TED packages on startup
|
# Enable one-off repair / re-import of incomplete TED packages on startup
|
||||||
enabled: false
|
enabled: false
|
||||||
# Only list candidate packages without modifying data
|
# Only list candidate packages without modifying data
|
||||||
dry-run: true
|
dry-run: false
|
||||||
# Safety cap for one startup run
|
# Safety cap for one startup run
|
||||||
max-packages: 200
|
max-packages: 100
|
||||||
# Optional explicit package identifiers to repair
|
# Optional explicit package identifiers to repair
|
||||||
package-identifiers: []
|
package-identifiers: []
|
||||||
# Optional inclusive package range
|
# Optional inclusive package range
|
||||||
|
|
|
||||||
|
|
@ -231,7 +231,7 @@ dip:
|
||||||
# Start year for downloads
|
# Start year for downloads
|
||||||
start-year: 2026
|
start-year: 2026
|
||||||
# Polling interval (milliseconds) - 1 minute
|
# Polling interval (milliseconds) - 1 hour
|
||||||
poll-interval: 60000
|
poll-interval: 3600000
|
||||||
# Retry interval for tail NOT_FOUND packages - 6 hours
|
# Retry interval for tail NOT_FOUND packages - 6 hours
|
||||||
not-found-retry-interval: 21600000
|
not-found-retry-interval: 21600000
|
||||||
# Grace period after year end before a previous-year tail 404 is treated as final
|
# Grace period after year end before a previous-year tail 404 is treated as final
|
||||||
|
|
@ -247,19 +247,10 @@ dip:
|
||||||
# Delete tar.gz after ingestion
|
# Delete tar.gz after ingestion
|
||||||
delete-after-ingestion: true
|
delete-after-ingestion: true
|
||||||
|
|
||||||
ted: # Phase 3 TED projection configuration
|
|
||||||
projection:
|
|
||||||
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document
|
|
||||||
enabled: true
|
|
||||||
# Optional startup backfill for legacy TED documents without a projection row yet
|
|
||||||
startup-backfill-enabled: false
|
|
||||||
# Maximum number of legacy TED documents to backfill during startup
|
|
||||||
startup-backfill-limit: 250
|
|
||||||
|
|
||||||
migration:
|
migration:
|
||||||
legacy-audit:
|
legacy-audit:
|
||||||
# Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem
|
# Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem
|
||||||
enabled: false
|
enabled: true
|
||||||
# Optional startup execution; the audit is read-only and only writes audit run/finding tables
|
# Optional startup execution; the audit is read-only and only writes audit run/finding tables
|
||||||
startup-run-enabled: true
|
startup-run-enabled: true
|
||||||
# Maximum number of legacy TED documents to scan during startup (0 = all)
|
# Maximum number of legacy TED documents to scan during startup (0 = all)
|
||||||
|
|
@ -271,18 +262,12 @@ dip:
|
||||||
# Maximum number of grouped duplicate samples captured for aggregate checks
|
# Maximum number of grouped duplicate samples captured for aggregate checks
|
||||||
max-duplicate-samples: 100
|
max-duplicate-samples: 100
|
||||||
|
|
||||||
legacy-ted:
|
ted: # Phase 3 TED projection configuration
|
||||||
# Enable the resumable legacy TED -> DOC/projection backfill subsystem
|
projection:
|
||||||
enabled: false
|
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document
|
||||||
# Run the backfill automatically on NEW-runtime startup
|
enabled: true
|
||||||
startup-enabled: false
|
# Optional startup backfill for legacy TED documents without a projection row yet
|
||||||
# Number of legacy TED documents fetched and processed per batch
|
startup-backfill-enabled: false
|
||||||
batch-size: 100
|
# Maximum number of legacy TED documents to backfill during startup
|
||||||
# Optional cap for a single invocation; 0 means migrate all remaining rows
|
startup-backfill-limit: 250
|
||||||
max-documents-per-run: 0
|
|
||||||
# Resume the latest STOPPED/FAILED run from its saved cursor
|
|
||||||
resume-latest-incomplete-run: true
|
|
||||||
# Import batch id written to DOC.doc_source rows created by the migration
|
|
||||||
import-batch-id: legacy-ted-backfill
|
|
||||||
# Keep false for Wave 1; embeddings can be backfilled later as a separate step
|
|
||||||
queue-embeddings: false
|
|
||||||
|
|
|
||||||
|
|
@ -14,9 +14,9 @@ spring:
|
||||||
name: document-intelligence-platform
|
name: document-intelligence-platform
|
||||||
|
|
||||||
datasource:
|
datasource:
|
||||||
url: jdbc:postgresql://localhost:5432/RELM
|
url: jdbc:postgresql://94.130.218.54:32333/RELM
|
||||||
username: ${DB_USERNAME:postgres}
|
username: ${DB_USERNAME:postgres}
|
||||||
password: ${DB_PASSWORD:P54!pcd#Wi}
|
password: ${DB_PASSWORD:PDmXRx0Rbk9OFOn9qO5Gm/mPCfqW8zwbZ+/YIU1lySc=}
|
||||||
driver-class-name: org.postgresql.Driver
|
driver-class-name: org.postgresql.Driver
|
||||||
hikari:
|
hikari:
|
||||||
maximum-pool-size: 5
|
maximum-pool-size: 5
|
||||||
|
|
@ -28,7 +28,7 @@ spring:
|
||||||
|
|
||||||
jpa:
|
jpa:
|
||||||
hibernate:
|
hibernate:
|
||||||
ddl-auto: update
|
ddl-auto: validate
|
||||||
show-sql: false
|
show-sql: false
|
||||||
open-in-view: false
|
open-in-view: false
|
||||||
properties:
|
properties:
|
||||||
|
|
|
||||||
|
|
@ -1,21 +0,0 @@
|
||||||
-- Store the TED daily package identifier directly on the Phase 3 TED notice projection.
|
|
||||||
-- This makes migration, audit, and repair flows package-aware without having to derive the
|
|
||||||
-- package membership from source paths at query time.
|
|
||||||
|
|
||||||
SET search_path TO TED, DOC, public;
|
|
||||||
|
|
||||||
ALTER TABLE IF EXISTS TED.ted_notice_projection
|
|
||||||
ADD COLUMN IF NOT EXISTS package_identifier VARCHAR(20);
|
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_ted_notice_projection_package_identifier
|
|
||||||
ON TED.ted_notice_projection(package_identifier);
|
|
||||||
|
|
||||||
-- Backfill from the linked legacy TED document's source path when the path contains the
|
|
||||||
-- extracted package directory, e.g. .../extracted/202600003/123.xml
|
|
||||||
UPDATE TED.ted_notice_projection projection
|
|
||||||
SET package_identifier = substring(legacy.source_path from '(?:^|[\\/])((?:19|20)[0-9]{7})(?:[\\/]|$)')
|
|
||||||
FROM TED.procurement_document legacy
|
|
||||||
WHERE projection.legacy_procurement_document_id = legacy.id
|
|
||||||
AND projection.package_identifier IS NULL
|
|
||||||
AND legacy.source_path IS NOT NULL
|
|
||||||
AND substring(legacy.source_path from '(?:^|[\\/])((?:19|20)[0-9]{7})(?:[\\/]|$)') IS NOT NULL;
|
|
||||||
|
|
@ -3,10 +3,10 @@
|
||||||
-- Description: PostgreSQL schema for storing EU eForms procurement notices with vector search support
|
-- Description: PostgreSQL schema for storing EU eForms procurement notices with vector search support
|
||||||
|
|
||||||
-- Create TED schema if it doesn't exist
|
-- Create TED schema if it doesn't exist
|
||||||
CREATE SCHEMA IF NOT EXISTS ted;
|
CREATE SCHEMA IF NOT EXISTS TED;
|
||||||
|
|
||||||
-- Set search path to use TED schema
|
-- Set search path to use TED schema
|
||||||
SET search_path TO ted;
|
SET search_path TO TED;
|
||||||
|
|
||||||
-- Enable required PostgreSQL extensions (if the current role has permission)
|
-- Enable required PostgreSQL extensions (if the current role has permission)
|
||||||
-- If the extensions cannot be created here, the DBA must create them manually
|
-- If the extensions cannot be created here, the DBA must create them manually
|
||||||
|
|
|
||||||
|
|
@ -1,75 +0,0 @@
|
||||||
SET search_path TO TED, DOC, public;
|
|
||||||
|
|
||||||
ALTER TABLE IF EXISTS TED.ted_notice_projection
|
|
||||||
ADD COLUMN IF NOT EXISTS package_identifier VARCHAR(32);
|
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_ted_notice_projection_package_identifier
|
|
||||||
ON TED.ted_notice_projection(package_identifier);
|
|
||||||
|
|
||||||
ALTER TABLE IF EXISTS TED.organization
|
|
||||||
ALTER COLUMN city TYPE TEXT;
|
|
||||||
|
|
||||||
ALTER TABLE IF EXISTS TED.organization
|
|
||||||
ALTER COLUMN company_id TYPE TEXT;
|
|
||||||
|
|
||||||
ALTER TABLE IF EXISTS TED.procurement_document
|
|
||||||
ALTER COLUMN buyer_city TYPE TEXT;
|
|
||||||
|
|
||||||
ALTER TABLE IF EXISTS TED.ted_notice_organization
|
|
||||||
ALTER COLUMN city TYPE TEXT;
|
|
||||||
|
|
||||||
ALTER TABLE IF EXISTS TED.ted_notice_projection
|
|
||||||
ALTER COLUMN buyer_city TYPE TEXT;
|
|
||||||
|
|
||||||
UPDATE TED.ted_notice_projection p
|
|
||||||
SET package_identifier = substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{2}[0-9]{5})')
|
|
||||||
FROM TED.procurement_document d
|
|
||||||
WHERE p.legacy_procurement_document_id = d.id
|
|
||||||
AND p.package_identifier IS NULL
|
|
||||||
AND substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{2}[0-9]{5})') IS NOT NULL;
|
|
||||||
|
|
||||||
SET search_path TO DOC, public;
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS DOC.doc_legacy_ted_migration_run (
|
|
||||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
||||||
status VARCHAR(32) NOT NULL,
|
|
||||||
import_batch_id VARCHAR(255),
|
|
||||||
queue_embeddings BOOLEAN NOT NULL DEFAULT FALSE,
|
|
||||||
batch_size INTEGER NOT NULL DEFAULT 100,
|
|
||||||
max_documents_per_run BIGINT,
|
|
||||||
processed_count BIGINT NOT NULL DEFAULT 0,
|
|
||||||
success_count BIGINT NOT NULL DEFAULT 0,
|
|
||||||
failed_count BIGINT NOT NULL DEFAULT 0,
|
|
||||||
last_legacy_created_at TIMESTAMP WITH TIME ZONE,
|
|
||||||
last_legacy_document_id UUID,
|
|
||||||
last_doc_document_id UUID,
|
|
||||||
last_projection_id UUID,
|
|
||||||
last_error TEXT,
|
|
||||||
started_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
completed_at TIMESTAMP WITH TIME ZONE
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_doc_legacy_ted_mig_run_status
|
|
||||||
ON DOC.doc_legacy_ted_migration_run(status);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_doc_legacy_ted_mig_run_started
|
|
||||||
ON DOC.doc_legacy_ted_migration_run(started_at DESC);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS DOC.doc_legacy_ted_migration_checkpoint (
|
|
||||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
||||||
run_id UUID NOT NULL REFERENCES DOC.doc_legacy_ted_migration_run(id) ON DELETE CASCADE,
|
|
||||||
batch_number INTEGER NOT NULL,
|
|
||||||
batch_processed_count INTEGER NOT NULL DEFAULT 0,
|
|
||||||
cumulative_processed_count BIGINT NOT NULL DEFAULT 0,
|
|
||||||
last_legacy_created_at TIMESTAMP WITH TIME ZONE,
|
|
||||||
last_legacy_document_id UUID,
|
|
||||||
last_doc_document_id UUID,
|
|
||||||
last_projection_id UUID,
|
|
||||||
note TEXT,
|
|
||||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_doc_legacy_ted_mig_ckpt_run
|
|
||||||
ON DOC.doc_legacy_ted_migration_checkpoint(run_id);
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_doc_legacy_ted_mig_ckpt_batch
|
|
||||||
ON DOC.doc_legacy_ted_migration_checkpoint(batch_number);
|
|
||||||
|
|
@ -1,95 +0,0 @@
|
||||||
-- Resolve unqualified names against TED first, then DOC, then public.
SET search_path = TED, DOC, public;
|
|
||||||
|
|
||||||
-- Backfill one DOC.doc_document row of type TED_PACKAGE for every distinct
-- package identifier found on legacy procurement documents that have a
-- notice projection.
--
-- The identifier is the first match of '(20[0-9]{7})' in source_path, or in
-- source_filename when source_path is NULL. Rows without a match are ignored.
-- Packages whose business key already exists in DOC.doc_document are skipped.
WITH package_documents AS (
    SELECT DISTINCT
        src.package_identifier,
        'TED:package:' || src.package_identifier AS business_key,
        encode(digest('TED:package:' || src.package_identifier, 'sha256'), 'hex') AS dedup_hash
    FROM (
        SELECT substring(coalesce(legacy.source_path, legacy.source_filename) from '(20[0-9]{7})') AS package_identifier
        FROM TED.procurement_document legacy
        JOIN TED.ted_notice_projection proj
          ON proj.legacy_procurement_document_id = legacy.id
    ) src
    WHERE src.package_identifier IS NOT NULL
)
INSERT INTO DOC.doc_document (
    id,
    visibility,
    document_type,
    document_family,
    status,
    title,
    summary,
    mime_type,
    business_key,
    dedup_hash
)
SELECT
    gen_random_uuid(),
    'PUBLIC',
    'TED_PACKAGE',
    'PROCUREMENT',
    -- Map the package download status onto a document status; a missing
    -- package row (NULL status) falls through to 'RECEIVED'.
    CASE pkg.download_status
        WHEN 'COMPLETED' THEN 'CLASSIFIED'
        WHEN 'FAILED'    THEN 'FAILED'
        WHEN 'NOT_FOUND' THEN 'FAILED'
        ELSE 'RECEIVED'
    END,
    'TED Daily Package ' || pd.package_identifier,
    -- The summary carries the package counters only when a matching
    -- TED.ted_daily_package row was found by the LEFT JOIN.
    'TED daily package ' || pd.package_identifier ||
        CASE
            WHEN pkg.package_identifier IS NOT NULL THEN
                ' (status=' || coalesce(pkg.download_status::text, 'UNKNOWN') ||
                ', xmlFileCount=' || coalesce(pkg.xml_file_count::text, 'null') ||
                ', processedCount=' || coalesce(pkg.processed_count::text, 'null') ||
                ', failedCount=' || coalesce(pkg.failed_count::text, 'null') || ')'
            ELSE ''
        END,
    'application/gzip',
    pd.business_key,
    pd.dedup_hash
FROM package_documents pd
LEFT JOIN TED.ted_daily_package pkg
       ON pkg.package_identifier = pd.package_identifier
WHERE NOT EXISTS (
    SELECT 1
    FROM DOC.doc_document existing
    WHERE existing.business_key = pd.business_key
);
|
|
||||||
|
|
||||||
-- Copy the package identifier derived from the legacy procurement document
-- onto its notice projection. The identifier is the first match of
-- '(20[0-9]{7})' in source_path, or in source_filename when source_path is NULL.
-- Only rows whose stored value differs (NULL-safe comparison) are rewritten.
UPDATE TED.ted_notice_projection p
SET package_identifier = legacy.package_identifier
FROM (
    SELECT
        d.id,
        substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{7})') AS package_identifier
    FROM TED.procurement_document d
) legacy
WHERE p.legacy_procurement_document_id = legacy.id
  AND legacy.package_identifier IS NOT NULL
  AND p.package_identifier IS DISTINCT FROM legacy.package_identifier;
|
|
||||||
|
|
||||||
-- Link every migrated notice document to its TED package document with a
-- 'CONTAINS' relation (package = parent, notice = child).
--
-- Both sides are matched on the package identifier: for notices it is taken
-- from the legacy document's source_path (or source_filename when the path is
-- NULL), for packages from the 'TED:package:…' business key.
-- Parent/child pairs that already have a 'CONTAINS' relation are skipped.
WITH notice_children AS (
    SELECT
        proj.document_id AS child_document_id,
        substring(coalesce(legacy.source_path, legacy.source_filename) from '(20[0-9]{7})') AS package_identifier
    FROM TED.procurement_document legacy
    JOIN TED.ted_notice_projection proj
      ON proj.legacy_procurement_document_id = legacy.id
    WHERE substring(coalesce(legacy.source_path, legacy.source_filename) from '(20[0-9]{7})') IS NOT NULL
),
package_parents AS (
    SELECT
        parent.id AS parent_document_id,
        substring(parent.business_key from '(20[0-9]{7})') AS package_identifier
    FROM DOC.doc_document parent
    WHERE parent.business_key LIKE 'TED:package:%'
      AND parent.document_type = 'TED_PACKAGE'
)
INSERT INTO DOC.doc_relation (
    id,
    parent_document_id,
    child_document_id,
    relation_type,
    sort_order,
    relation_metadata
)
SELECT
    gen_random_uuid(),
    pp.parent_document_id,
    nc.child_document_id,
    'CONTAINS',
    NULL,
    'packageIdentifier=' || nc.package_identifier
FROM notice_children nc
JOIN package_parents pp
  ON pp.package_identifier = nc.package_identifier
WHERE NOT EXISTS (
    SELECT 1
    FROM DOC.doc_relation rel
    WHERE rel.relation_type = 'CONTAINS'
      AND rel.parent_document_id = pp.parent_document_id
      AND rel.child_document_id = nc.child_document_id
);
|
|
||||||
Loading…
Reference in New Issue