ted legacy data migration, repair fixes
This commit is contained in:
parent
61d163f8fe
commit
28c7854ead
|
|
@ -13,6 +13,8 @@ import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
public interface DocumentRepository extends JpaRepository<Document, UUID> {
|
public interface DocumentRepository extends JpaRepository<Document, UUID> {
|
||||||
|
|
||||||
|
Optional<Document> findByBusinessKey(String businessKey);
|
||||||
|
|
||||||
Optional<Document> findByDedupHash(String dedupHash);
|
Optional<Document> findByDedupHash(String dedupHash);
|
||||||
|
|
||||||
List<Document> findAllByDedupHash(String dedupHash);
|
List<Document> findAllByDedupHash(String dedupHash);
|
||||||
|
|
|
||||||
|
|
@ -189,10 +189,24 @@ public class TedNoticeProjection {
|
||||||
protected void onCreate() {
|
protected void onCreate() {
|
||||||
createdAt = OffsetDateTime.now();
|
createdAt = OffsetDateTime.now();
|
||||||
updatedAt = OffsetDateTime.now();
|
updatedAt = OffsetDateTime.now();
|
||||||
|
generateNoticeUrl();
|
||||||
}
|
}
|
||||||
|
|
||||||
@PreUpdate
|
@PreUpdate
|
||||||
protected void onUpdate() {
|
protected void onUpdate() {
|
||||||
updatedAt = OffsetDateTime.now();
|
updatedAt = OffsetDateTime.now();
|
||||||
|
generateNoticeUrl();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generates TED notice URL from publication_id.
|
||||||
|
* Format: https://ted.europa.eu/en/notice/-/detail/{publication_id without leading zeros}
|
||||||
|
*/
|
||||||
|
private void generateNoticeUrl() {
|
||||||
|
if (publicationId != null && !publicationId.isEmpty()) {
|
||||||
|
// Remove leading zeros from publication_id
|
||||||
|
String cleanId = publicationId.replaceFirst("^0+", "");
|
||||||
|
this.noticeUrl = "https://ted.europa.eu/en/notice/-/detail/" + cleanId;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -76,7 +76,7 @@ public class TedGenericDocumentRootService {
|
||||||
|
|
||||||
private String buildBusinessKey(ProcurementDocument tedDocument) {
|
private String buildBusinessKey(ProcurementDocument tedDocument) {
|
||||||
if (StringUtils.hasText(tedDocument.getPublicationId())) {
|
if (StringUtils.hasText(tedDocument.getPublicationId())) {
|
||||||
return "TED:publication:" + tedDocument.getPublicationId();
|
return "TED_NOTICE:" + tedDocument.getPublicationId();
|
||||||
}
|
}
|
||||||
if (StringUtils.hasText(tedDocument.getNoticeId())) {
|
if (StringUtils.hasText(tedDocument.getNoticeId())) {
|
||||||
return "TED:notice:" + tedDocument.getNoticeId();
|
return "TED:notice:" + tedDocument.getNoticeId();
|
||||||
|
|
|
||||||
|
|
@ -17,8 +17,6 @@ import at.procon.ted.model.entity.ProcurementLot;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
@ -33,9 +31,8 @@ import org.springframework.transaction.annotation.Transactional;
|
||||||
@Slf4j
|
@Slf4j
|
||||||
public class TedNoticeProjectionService {
|
public class TedNoticeProjectionService {
|
||||||
|
|
||||||
private static final Pattern PACKAGE_IDENTIFIER_PATTERN = Pattern.compile("(?<!\\d)(20\\d{2}\\d{5})(?!\\d)");
|
|
||||||
|
|
||||||
private final TedProjectionProperties properties;
|
private final TedProjectionProperties properties;
|
||||||
|
private final TedPackageIdentifierResolver packageIdentifierResolver;
|
||||||
private final TedGenericDocumentRootService tedGenericDocumentRootService;
|
private final TedGenericDocumentRootService tedGenericDocumentRootService;
|
||||||
private final DocumentRepository documentRepository;
|
private final DocumentRepository documentRepository;
|
||||||
private final TedNoticeProjectionRepository projectionRepository;
|
private final TedNoticeProjectionRepository projectionRepository;
|
||||||
|
|
@ -83,7 +80,9 @@ public class TedNoticeProjectionService {
|
||||||
private void mapProjection(TedNoticeProjection projection, Document genericDocument, ProcurementDocument legacyDocument) {
|
private void mapProjection(TedNoticeProjection projection, Document genericDocument, ProcurementDocument legacyDocument) {
|
||||||
projection.setDocument(genericDocument);
|
projection.setDocument(genericDocument);
|
||||||
projection.setLegacyProcurementDocumentId(legacyDocument.getId());
|
projection.setLegacyProcurementDocumentId(legacyDocument.getId());
|
||||||
projection.setPackageIdentifier(extractPackageIdentifier(legacyDocument));
|
projection.setPackageIdentifier(packageIdentifierResolver
|
||||||
|
.resolveFromSourceMetadata(legacyDocument.getSourcePath(), legacyDocument.getSourceFilename())
|
||||||
|
.orElse(null));
|
||||||
projection.setNoticeId(legacyDocument.getNoticeId());
|
projection.setNoticeId(legacyDocument.getNoticeId());
|
||||||
projection.setPublicationId(legacyDocument.getPublicationId());
|
projection.setPublicationId(legacyDocument.getPublicationId());
|
||||||
projection.setNoticeUrl(legacyDocument.getNoticeUrl());
|
projection.setNoticeUrl(legacyDocument.getNoticeUrl());
|
||||||
|
|
@ -174,27 +173,6 @@ public class TedNoticeProjectionService {
|
||||||
organizationRepository.saveAll(projectedOrganizations);
|
organizationRepository.saveAll(projectedOrganizations);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String extractPackageIdentifier(ProcurementDocument legacyDocument) {
|
|
||||||
String value = firstNonBlank(legacyDocument.getSourcePath(), legacyDocument.getSourceFilename());
|
|
||||||
if (value == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
Matcher matcher = PACKAGE_IDENTIFIER_PATTERN.matcher(value);
|
|
||||||
return matcher.find() ? matcher.group(1) : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String firstNonBlank(String... values) {
|
|
||||||
if (values == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
for (String value : values) {
|
|
||||||
if (value != null && !value.isBlank()) {
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String[] copyArray(String[] source) {
|
private String[] copyArray(String[] source) {
|
||||||
return source == null ? null : source.clone();
|
return source == null ? null : source.clone();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,113 @@
|
||||||
|
package at.procon.dip.domain.ted.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentService;
|
||||||
|
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
|
||||||
|
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||||
|
import at.procon.dip.runtime.config.RuntimeMode;
|
||||||
|
import at.procon.ted.model.entity.ProcurementDocument;
|
||||||
|
import at.procon.ted.model.entity.TedDailyPackage;
|
||||||
|
import at.procon.ted.repository.TedDailyPackageRepository;
|
||||||
|
import at.procon.ted.util.HashUtils;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.Optional;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class TedPackageDocumentService {
|
||||||
|
|
||||||
|
private static final String PACKAGE_MIME_TYPE = "application/gzip";
|
||||||
|
|
||||||
|
private final TedPackageIdentifierResolver packageIdentifierResolver;
|
||||||
|
private final TedDailyPackageRepository tedDailyPackageRepository;
|
||||||
|
private final DocumentRepository documentRepository;
|
||||||
|
private final DocumentService documentService;
|
||||||
|
|
||||||
|
@Transactional
|
||||||
|
public Optional<Document> ensurePackageDocumentForLegacyNotice(ProcurementDocument legacyDocument) {
|
||||||
|
return packageIdentifierResolver.resolveFromSourceMetadata(legacyDocument.getSourcePath(), legacyDocument.getSourceFilename())
|
||||||
|
.map(this::ensurePackageDocument);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional
|
||||||
|
public Document ensurePackageDocument(String packageIdentifier) {
|
||||||
|
String businessKey = buildBusinessKey(packageIdentifier);
|
||||||
|
Document document = documentRepository.findByBusinessKey(businessKey)
|
||||||
|
.orElseGet(() -> createPackageDocument(packageIdentifier));
|
||||||
|
|
||||||
|
Optional<TedDailyPackage> packageEntity = tedDailyPackageRepository.findByPackageIdentifier(packageIdentifier);
|
||||||
|
document.setVisibility(DocumentVisibility.PUBLIC);
|
||||||
|
document.setDocumentType(DocumentType.TED_PACKAGE);
|
||||||
|
document.setDocumentFamily(DocumentFamily.PROCUREMENT);
|
||||||
|
document.setStatus(resolveStatus(packageEntity));
|
||||||
|
document.setTitle(buildTitle(packageIdentifier));
|
||||||
|
document.setSummary(buildSummary(packageIdentifier, packageEntity.orElse(null)));
|
||||||
|
document.setMimeType(PACKAGE_MIME_TYPE);
|
||||||
|
document.setBusinessKey(businessKey);
|
||||||
|
document.setDedupHash(HashUtils.computeSha256(businessKey));
|
||||||
|
return documentService.save(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Document createPackageDocument(String packageIdentifier) {
|
||||||
|
String businessKey = buildBusinessKey(packageIdentifier);
|
||||||
|
return documentService.create(new CreateDocumentCommand(
|
||||||
|
null,
|
||||||
|
DocumentVisibility.PUBLIC,
|
||||||
|
DocumentType.TED_PACKAGE,
|
||||||
|
DocumentFamily.PROCUREMENT,
|
||||||
|
DocumentStatus.RECEIVED,
|
||||||
|
buildTitle(packageIdentifier),
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
PACKAGE_MIME_TYPE,
|
||||||
|
businessKey,
|
||||||
|
HashUtils.computeSha256(businessKey)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentStatus resolveStatus(Optional<TedDailyPackage> packageEntity) {
|
||||||
|
if (packageEntity.isEmpty()) {
|
||||||
|
return DocumentStatus.RECEIVED;
|
||||||
|
}
|
||||||
|
return switch (packageEntity.get().getDownloadStatus()) {
|
||||||
|
case COMPLETED -> DocumentStatus.CLASSIFIED;
|
||||||
|
case FAILED, NOT_FOUND -> DocumentStatus.FAILED;
|
||||||
|
default -> DocumentStatus.RECEIVED;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildBusinessKey(String packageIdentifier) {
|
||||||
|
return "TED_PACKAGE:" + packageIdentifier;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildTitle(String packageIdentifier) {
|
||||||
|
return packageIdentifier + ".tar.gz";
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildSummary(String packageIdentifier, TedDailyPackage packageEntity) {
|
||||||
|
if (packageEntity == null) {
|
||||||
|
return "TED daily package " + packageIdentifier;
|
||||||
|
}
|
||||||
|
return "TED daily package %s (status=%s, xmlFileCount=%s, processedCount=%s, failedCount=%s, downloadedAt=%s)".formatted(
|
||||||
|
packageIdentifier,
|
||||||
|
packageEntity.getDownloadStatus(),
|
||||||
|
packageEntity.getXmlFileCount(),
|
||||||
|
packageEntity.getProcessedCount(),
|
||||||
|
packageEntity.getFailedCount(),
|
||||||
|
formatTimestamp(packageEntity.getDownloadedAt())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String formatTimestamp(OffsetDateTime value) {
|
||||||
|
return value == null ? null : value.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,31 @@
|
||||||
|
package at.procon.dip.domain.ted.service;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolves a TED daily package identifier (YYYYSSSSS) from legacy source metadata.
|
||||||
|
*/
|
||||||
|
@Component
|
||||||
|
public class TedPackageIdentifierResolver {
|
||||||
|
|
||||||
|
private static final Pattern PACKAGE_IDENTIFIER_PATTERN = Pattern.compile("(?<!\\d)(20\\d{7})(?!\\d)");
|
||||||
|
|
||||||
|
public Optional<String> resolveFromSourceMetadata(String sourcePath, String sourceFilename) {
|
||||||
|
return resolve(sourcePath).or(() -> resolve(sourceFilename));
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<String> resolve(String value) {
|
||||||
|
if (!StringUtils.hasText(value)) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
Matcher matcher = PACKAGE_IDENTIFIER_PATTERN.matcher(value);
|
||||||
|
if (matcher.find()) {
|
||||||
|
return Optional.of(matcher.group(1));
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -11,6 +11,7 @@ import at.procon.dip.runtime.config.RuntimeMode;
|
||||||
import at.procon.ted.repository.LegacyTedMigrationCursor;
|
import at.procon.ted.repository.LegacyTedMigrationCursor;
|
||||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||||
import java.time.OffsetDateTime;
|
import java.time.OffsetDateTime;
|
||||||
|
import java.time.ZoneOffset;
|
||||||
import java.util.EnumSet;
|
import java.util.EnumSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
@ -54,11 +55,7 @@ public class LegacyTedBackfillMigrationService {
|
||||||
return run.getId();
|
return run.getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
List<LegacyTedMigrationCursor> batch = procurementDocumentRepository.findNextMigrationBatch(
|
List<LegacyTedMigrationCursor> batch = loadNextBatch(run, limit);
|
||||||
run.getLastLegacyCreatedAt(),
|
|
||||||
run.getLastLegacyDocumentId(),
|
|
||||||
limit
|
|
||||||
);
|
|
||||||
|
|
||||||
if (batch.isEmpty()) {
|
if (batch.isEmpty()) {
|
||||||
markCompleted(run);
|
markCompleted(run);
|
||||||
|
|
@ -89,6 +86,17 @@ public class LegacyTedBackfillMigrationService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected List<LegacyTedMigrationCursor> loadNextBatch(LegacyTedMigrationRun run, int limit) {
|
||||||
|
if (run.getLastLegacyCreatedAt() == null || run.getLastLegacyDocumentId() == null) {
|
||||||
|
return procurementDocumentRepository.findFirstMigrationBatch(limit);
|
||||||
|
}
|
||||||
|
return procurementDocumentRepository.findNextMigrationBatch(
|
||||||
|
run.getLastLegacyCreatedAt(),
|
||||||
|
run.getLastLegacyDocumentId(),
|
||||||
|
limit
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
@Transactional
|
@Transactional
|
||||||
protected LegacyTedMigrationRun resolveRun() {
|
protected LegacyTedMigrationRun resolveRun() {
|
||||||
if (properties.isResumeLatestIncompleteRun()) {
|
if (properties.isResumeLatestIncompleteRun()) {
|
||||||
|
|
@ -121,7 +129,9 @@ public class LegacyTedBackfillMigrationService {
|
||||||
run.setStatus(LegacyTedMigrationRunStatus.RUNNING);
|
run.setStatus(LegacyTedMigrationRunStatus.RUNNING);
|
||||||
run.setProcessedCount(run.getProcessedCount() + 1);
|
run.setProcessedCount(run.getProcessedCount() + 1);
|
||||||
run.setSuccessCount(run.getSuccessCount() + 1);
|
run.setSuccessCount(run.getSuccessCount() + 1);
|
||||||
run.setLastLegacyCreatedAt(cursor.getCreatedAt());
|
run.setLastLegacyCreatedAt(cursor.getCreatedAt() != null
|
||||||
|
? cursor.getCreatedAt().atOffset(ZoneOffset.UTC)
|
||||||
|
: null);
|
||||||
run.setLastLegacyDocumentId(cursor.getId());
|
run.setLastLegacyDocumentId(cursor.getId());
|
||||||
run.setLastDocDocumentId(outcome.documentId());
|
run.setLastDocDocumentId(outcome.documentId());
|
||||||
run.setLastProjectionId(outcome.projectionId());
|
run.setLastProjectionId(outcome.projectionId());
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ import at.procon.dip.domain.document.ContentRole;
|
||||||
import at.procon.dip.domain.document.DocumentFamily;
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
import at.procon.dip.domain.document.DocumentStatus;
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RelationType;
|
||||||
import at.procon.dip.domain.document.RepresentationType;
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
import at.procon.dip.domain.document.SourceType;
|
import at.procon.dip.domain.document.SourceType;
|
||||||
import at.procon.dip.domain.document.StorageType;
|
import at.procon.dip.domain.document.StorageType;
|
||||||
|
|
@ -16,13 +17,16 @@ import at.procon.dip.domain.document.repository.DocumentContentRepository;
|
||||||
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
||||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||||
import at.procon.dip.domain.document.service.DocumentContentService;
|
import at.procon.dip.domain.document.service.DocumentContentService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentRelationService;
|
||||||
import at.procon.dip.domain.document.service.DocumentRepresentationService;
|
import at.procon.dip.domain.document.service.DocumentRepresentationService;
|
||||||
import at.procon.dip.domain.document.service.DocumentService;
|
import at.procon.dip.domain.document.service.DocumentService;
|
||||||
import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
|
import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
|
||||||
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
|
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
|
||||||
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||||
|
import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand;
|
||||||
import at.procon.dip.domain.ted.service.TedGenericDocumentRootService;
|
import at.procon.dip.domain.ted.service.TedGenericDocumentRootService;
|
||||||
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
||||||
|
import at.procon.dip.domain.ted.service.TedPackageDocumentService;
|
||||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||||
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||||
|
|
@ -32,6 +36,8 @@ import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
import at.procon.dip.normalization.service.TextRepresentationBuildService;
|
import at.procon.dip.normalization.service.TextRepresentationBuildService;
|
||||||
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||||
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
||||||
|
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||||
|
import at.procon.dip.runtime.config.RuntimeMode;
|
||||||
import at.procon.dip.search.service.DocumentLexicalIndexService;
|
import at.procon.dip.search.service.DocumentLexicalIndexService;
|
||||||
import at.procon.ted.model.entity.ProcurementDocument;
|
import at.procon.ted.model.entity.ProcurementDocument;
|
||||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||||
|
|
@ -53,6 +59,7 @@ import org.springframework.transaction.annotation.Transactional;
|
||||||
import org.springframework.util.StringUtils;
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
|
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@Slf4j
|
@Slf4j
|
||||||
public class LegacyTedBackfillWorker {
|
public class LegacyTedBackfillWorker {
|
||||||
|
|
@ -63,6 +70,7 @@ public class LegacyTedBackfillWorker {
|
||||||
|
|
||||||
private final ProcurementDocumentRepository procurementDocumentRepository;
|
private final ProcurementDocumentRepository procurementDocumentRepository;
|
||||||
private final TedGenericDocumentRootService tedGenericDocumentRootService;
|
private final TedGenericDocumentRootService tedGenericDocumentRootService;
|
||||||
|
private final TedPackageDocumentService tedPackageDocumentService;
|
||||||
private final TedNoticeProjectionService tedNoticeProjectionService;
|
private final TedNoticeProjectionService tedNoticeProjectionService;
|
||||||
private final DocumentService documentService;
|
private final DocumentService documentService;
|
||||||
private final DocumentSourceRepository sourceRepository;
|
private final DocumentSourceRepository sourceRepository;
|
||||||
|
|
@ -72,6 +80,7 @@ public class LegacyTedBackfillWorker {
|
||||||
private final DocumentTextRepresentationRepository representationRepository;
|
private final DocumentTextRepresentationRepository representationRepository;
|
||||||
private final DocumentRepresentationService documentRepresentationService;
|
private final DocumentRepresentationService documentRepresentationService;
|
||||||
private final DocumentLexicalIndexService lexicalIndexService;
|
private final DocumentLexicalIndexService lexicalIndexService;
|
||||||
|
private final DocumentRelationService documentRelationService;
|
||||||
private final TextRepresentationBuildService textRepresentationBuildService;
|
private final TextRepresentationBuildService textRepresentationBuildService;
|
||||||
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
||||||
private final EmbeddingProperties embeddingProperties;
|
private final EmbeddingProperties embeddingProperties;
|
||||||
|
|
@ -89,6 +98,15 @@ public class LegacyTedBackfillWorker {
|
||||||
List<TextRepresentationDraft> drafts = buildDrafts(legacyDocument);
|
List<TextRepresentationDraft> drafts = buildDrafts(legacyDocument);
|
||||||
List<DocumentTextRepresentation> savedRepresentations = ensureRepresentations(document, originalContent, normalizedTextContent, drafts);
|
List<DocumentTextRepresentation> savedRepresentations = ensureRepresentations(document, originalContent, normalizedTextContent, drafts);
|
||||||
|
|
||||||
|
tedPackageDocumentService.ensurePackageDocumentForLegacyNotice(legacyDocument)
|
||||||
|
.ifPresent(packageDocument -> documentRelationService.ensureRelation(new CreateDocumentRelationCommand(
|
||||||
|
packageDocument.getId(),
|
||||||
|
document.getId(),
|
||||||
|
RelationType.CONTAINS,
|
||||||
|
null,
|
||||||
|
legacyDocument.getSourcePath()
|
||||||
|
)));
|
||||||
|
|
||||||
UUID projectionId = tedNoticeProjectionService.registerOrRefreshProjection(legacyDocument, document.getId());
|
UUID projectionId = tedNoticeProjectionService.registerOrRefreshProjection(legacyDocument, document.getId());
|
||||||
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
|
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,7 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.TRUE
|
Boolean.TRUE
|
||||||
));
|
));
|
||||||
|
/*
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
RepresentationType.FULLTEXT,
|
RepresentationType.FULLTEXT,
|
||||||
BUILDER_KEY,
|
BUILDER_KEY,
|
||||||
|
|
@ -104,6 +105,7 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.FALSE
|
Boolean.FALSE
|
||||||
));
|
));
|
||||||
|
*/
|
||||||
return drafts;
|
return drafts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -52,7 +52,7 @@ public class Organization {
|
||||||
/**
|
/**
|
||||||
* Company/tax registration ID.
|
* Company/tax registration ID.
|
||||||
*/
|
*/
|
||||||
@Column(name = "company_id", length = 1000)
|
@Column(name = "company_id", columnDefinition = "TEXT")
|
||||||
private String companyId;
|
private String companyId;
|
||||||
|
|
||||||
@Column(name = "country_code", length = 10)
|
@Column(name = "country_code", length = 10)
|
||||||
|
|
|
||||||
|
|
@ -105,7 +105,7 @@ public class ProcurementDocument {
|
||||||
@Column(name = "buyer_city", columnDefinition = "TEXT")
|
@Column(name = "buyer_city", columnDefinition = "TEXT")
|
||||||
private String buyerCity;
|
private String buyerCity;
|
||||||
|
|
||||||
@Column(name = "buyer_postal_code", length = 100)
|
@Column(name = "buyer_postal_code", columnDefinition = "TEXT")
|
||||||
private String buyerPostalCode;
|
private String buyerPostalCode;
|
||||||
|
|
||||||
@Column(name = "buyer_nuts_code", length = 10)
|
@Column(name = "buyer_nuts_code", length = 10)
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
||||||
package at.procon.ted.repository;
|
package at.procon.ted.repository;
|
||||||
|
|
||||||
import java.time.OffsetDateTime;
|
import java.time.Instant;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
public interface LegacyTedMigrationCursor {
|
public interface LegacyTedMigrationCursor {
|
||||||
UUID getId();
|
UUID getId();
|
||||||
OffsetDateTime getCreatedAt();
|
Instant getCreatedAt();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -211,14 +211,24 @@ public interface ProcurementDocumentRepository extends
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lightweight cursor query for resumable legacy -> DOC/projection backfill.
|
* First lightweight cursor query for resumable legacy -> DOC/projection backfill.
|
||||||
*/
|
*/
|
||||||
@Query(value = """
|
@Query(value = """
|
||||||
SELECT p.id AS id, p.created_at AS createdAt
|
SELECT p.id AS id, p.created_at AS createdAt
|
||||||
FROM ted.procurement_document p
|
FROM ted.procurement_document p
|
||||||
WHERE (:lastCreatedAt IS NULL
|
ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
|
||||||
OR p.created_at > :lastCreatedAt
|
LIMIT :limit
|
||||||
OR (p.created_at = :lastCreatedAt AND CAST(p.id AS text) > CAST(:lastId AS text)))
|
""", nativeQuery = true)
|
||||||
|
List<LegacyTedMigrationCursor> findFirstMigrationBatch(@Param("limit") int limit);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Next lightweight cursor query for resumable legacy -> DOC/projection backfill.
|
||||||
|
*/
|
||||||
|
@Query(value = """
|
||||||
|
SELECT p.id AS id, p.created_at AS createdAt
|
||||||
|
FROM ted.procurement_document p
|
||||||
|
WHERE p.created_at > :lastCreatedAt
|
||||||
|
OR (p.created_at = :lastCreatedAt AND CAST(p.id AS text) > CAST(:lastId AS text))
|
||||||
ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
|
ORDER BY p.created_at ASC, CAST(p.id AS text) ASC
|
||||||
LIMIT :limit
|
LIMIT :limit
|
||||||
""", nativeQuery = true)
|
""", nativeQuery = true)
|
||||||
|
|
|
||||||
|
|
@ -228,7 +228,8 @@ public class XmlParserService {
|
||||||
|
|
||||||
// Name
|
// Name
|
||||||
org.setName(getTextContent(xpath, orgNode, ".//cac:PartyName/cbc:Name"));
|
org.setName(getTextContent(xpath, orgNode, ".//cac:PartyName/cbc:Name"));
|
||||||
|
if(org.getName() == null) org.setName("");
|
||||||
|
|
||||||
// Company ID
|
// Company ID
|
||||||
org.setCompanyId(getTextContent(xpath, orgNode, ".//cac:PartyLegalEntity/cbc:CompanyID"));
|
org.setCompanyId(getTextContent(xpath, orgNode, ".//cac:PartyLegalEntity/cbc:CompanyID"));
|
||||||
|
|
||||||
|
|
@ -264,7 +265,361 @@ public class XmlParserService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compiled XPath expressions keyed by their expression string; filled lazily by getCompiled.
// NOTE(review): this is a plain HashMap and the cached XPathExpression objects are shared by
// every caller of this instance — confirm the parser is never used from several threads at
// once, and that all XPath instances passed to getCompiled use the same namespace context.
private final Map<String, XPathExpression> cache = new HashMap<>();
|
||||||
|
|
||||||
|
private XPathExpression getCompiled(XPath xpath, String expression) throws XPathExpressionException {
|
||||||
|
XPathExpression compiled = cache.get(expression);
|
||||||
|
if (compiled == null) {
|
||||||
|
compiled = xpath.compile(expression);
|
||||||
|
cache.put(expression, compiled);
|
||||||
|
}
|
||||||
|
return compiled;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getTextContent(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
||||||
|
XPathExpression expr = getCompiled(xpath, expression);
|
||||||
|
Node node = (Node) expr.evaluate(item, XPathConstants.NODE);
|
||||||
|
return node != null ? node.getTextContent().trim() : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Node getNode(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
||||||
|
return (Node) getCompiled(xpath, expression).evaluate(item, XPathConstants.NODE);
|
||||||
|
}
|
||||||
|
|
||||||
|
private NodeList getNodes(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
||||||
|
return (NodeList) getCompiled(xpath, expression).evaluate(item, XPathConstants.NODESET);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Element getDirectChild(Element parent, String namespaceUri, String localName) {
|
||||||
|
Node child = parent.getFirstChild();
|
||||||
|
while (child != null) {
|
||||||
|
if (child.getNodeType() == Node.ELEMENT_NODE) {
|
||||||
|
Element el = (Element) child;
|
||||||
|
if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) {
|
||||||
|
return el;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
child = child.getNextSibling();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Element> getDirectChildren(Element parent, String namespaceUri, String localName) {
|
||||||
|
List<Element> result = new ArrayList<>();
|
||||||
|
Node child = parent.getFirstChild();
|
||||||
|
while (child != null) {
|
||||||
|
if (child.getNodeType() == Node.ELEMENT_NODE) {
|
||||||
|
Element el = (Element) child;
|
||||||
|
if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) {
|
||||||
|
result.add(el);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
child = child.getNextSibling();
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getDirectChildText(Element parent, String namespaceUri, String localName) {
|
||||||
|
Element child = getDirectChild(parent, namespaceUri, localName);
|
||||||
|
if (child == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return trimToNull(child.getTextContent());
|
||||||
|
}
|
||||||
|
|
||||||
|
private String trimToNull(String value) {
|
||||||
|
if (value == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
String trimmed = value.trim();
|
||||||
|
return trimmed.isEmpty() ? null : trimmed;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void parseLotsDOM(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
||||||
|
NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot");
|
||||||
|
document.setTotalLots(lotNodes.getLength());
|
||||||
|
|
||||||
|
for (int i = 0; i < lotNodes.getLength(); i++) {
|
||||||
|
Node lotNode = lotNodes.item(i);
|
||||||
|
if (lotNode.getNodeType() != Node.ELEMENT_NODE) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Element lotEl = (Element) lotNode;
|
||||||
|
ProcurementLot lot = ProcurementLot.builder().build();
|
||||||
|
|
||||||
|
// Direct child values on the lot
|
||||||
|
lot.setLotId(getDirectChildText(lotEl, NS_CBC, "ID"));
|
||||||
|
|
||||||
|
Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject");
|
||||||
|
if (procurementProjectEl != null) {
|
||||||
|
lot.setInternalId(getDirectChildText(procurementProjectEl, NS_CBC, "ID"));
|
||||||
|
lot.setTitle(getDirectChildText(procurementProjectEl, NS_CBC, "Name"));
|
||||||
|
lot.setDescription(getDirectChildText(procurementProjectEl, NS_CBC, "Description"));
|
||||||
|
|
||||||
|
// CPV codes
|
||||||
|
List<String> lotCpvCodes = new ArrayList<>();
|
||||||
|
for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) {
|
||||||
|
String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode");
|
||||||
|
if (cpv != null && !cpv.isEmpty()) {
|
||||||
|
lotCpvCodes.add(cpv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
|
||||||
|
|
||||||
|
// NUTS codes
|
||||||
|
List<String> lotNutsCodes = new ArrayList<>();
|
||||||
|
for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) {
|
||||||
|
Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address");
|
||||||
|
if (addressEl != null) {
|
||||||
|
String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode");
|
||||||
|
if (nuts != null && !nuts.isEmpty()) {
|
||||||
|
lotNutsCodes.add(nuts);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
|
||||||
|
|
||||||
|
// Duration
|
||||||
|
Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod");
|
||||||
|
if (plannedPeriodEl != null) {
|
||||||
|
Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure");
|
||||||
|
if (durationEl != null) {
|
||||||
|
String durationValue = trimToNull(durationEl.getTextContent());
|
||||||
|
if (durationValue != null) {
|
||||||
|
try {
|
||||||
|
lot.setDurationValue(Double.parseDouble(durationValue));
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
|
||||||
|
if (unitCode != null) {
|
||||||
|
lot.setDurationUnit(unitCode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Submission deadline
|
||||||
|
Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess");
|
||||||
|
if (tenderingProcessEl != null) {
|
||||||
|
Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod");
|
||||||
|
if (deadlinePeriodEl != null) {
|
||||||
|
String endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate");
|
||||||
|
if (endDate != null) {
|
||||||
|
String endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime");
|
||||||
|
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
|
||||||
|
|
||||||
|
if (document.getSubmissionDeadline() == null) {
|
||||||
|
document.setSubmissionDeadline(lot.getSubmissionDeadline());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// EU funded
|
||||||
|
Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms");
|
||||||
|
if (tenderingTermsEl != null) {
|
||||||
|
String fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode");
|
||||||
|
lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds"));
|
||||||
|
}
|
||||||
|
|
||||||
|
document.addLot(lot);
|
||||||
|
}
|
||||||
|
|
||||||
|
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
|
||||||
|
}
|
||||||
|
|
||||||
private void parseLots(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
private void parseLots(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
||||||
|
NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot");
|
||||||
|
document.setTotalLots(lotNodes.getLength());
|
||||||
|
|
||||||
|
for (int i = 0; i < lotNodes.getLength(); i++) {
|
||||||
|
Node lotNode = lotNodes.item(i);
|
||||||
|
if (lotNode.getNodeType() != Node.ELEMENT_NODE) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Element lotEl = (Element) lotNode;
|
||||||
|
ProcurementLot lot = ProcurementLot.builder().build();
|
||||||
|
|
||||||
|
// Fast direct children
|
||||||
|
Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject");
|
||||||
|
Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess");
|
||||||
|
Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms");
|
||||||
|
|
||||||
|
// --- Lot ID ---
|
||||||
|
String lotId = getDirectChildText(lotEl, NS_CBC, "ID");
|
||||||
|
if (lotId == null) {
|
||||||
|
lotId = getTextContent(xpath, lotNode, "cbc:ID");
|
||||||
|
}
|
||||||
|
lot.setLotId(lotId);
|
||||||
|
|
||||||
|
// --- Internal ID ---
|
||||||
|
String internalId = null;
|
||||||
|
if (procurementProjectEl != null) {
|
||||||
|
internalId = getDirectChildText(procurementProjectEl, NS_CBC, "ID");
|
||||||
|
}
|
||||||
|
if (internalId == null) {
|
||||||
|
internalId = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:ID");
|
||||||
|
}
|
||||||
|
lot.setInternalId(internalId);
|
||||||
|
|
||||||
|
// --- Title ---
|
||||||
|
String title = null;
|
||||||
|
if (procurementProjectEl != null) {
|
||||||
|
title = getDirectChildText(procurementProjectEl, NS_CBC, "Name");
|
||||||
|
}
|
||||||
|
if (title == null) {
|
||||||
|
title = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Name");
|
||||||
|
}
|
||||||
|
lot.setTitle(title);
|
||||||
|
|
||||||
|
// --- Description ---
|
||||||
|
String description = null;
|
||||||
|
if (procurementProjectEl != null) {
|
||||||
|
description = getDirectChildText(procurementProjectEl, NS_CBC, "Description");
|
||||||
|
}
|
||||||
|
if (description == null) {
|
||||||
|
description = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Description");
|
||||||
|
}
|
||||||
|
lot.setDescription(description);
|
||||||
|
|
||||||
|
// --- CPV codes ---
|
||||||
|
List<String> lotCpvCodes = new ArrayList<>();
|
||||||
|
if (procurementProjectEl != null) {
|
||||||
|
for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) {
|
||||||
|
String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode");
|
||||||
|
if (cpv != null && !cpv.isEmpty()) {
|
||||||
|
lotCpvCodes.add(cpv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (lotCpvCodes.isEmpty()) {
|
||||||
|
NodeList cpvNodes = getNodes(xpath, lotNode,
|
||||||
|
".//cac:MainCommodityClassification/cbc:ItemClassificationCode");
|
||||||
|
for (int j = 0; j < cpvNodes.getLength(); j++) {
|
||||||
|
String cpv = trimToNull(cpvNodes.item(j).getTextContent());
|
||||||
|
if (cpv != null) {
|
||||||
|
lotCpvCodes.add(cpv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
|
||||||
|
|
||||||
|
// --- NUTS codes ---
|
||||||
|
List<String> lotNutsCodes = new ArrayList<>();
|
||||||
|
if (procurementProjectEl != null) {
|
||||||
|
for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) {
|
||||||
|
Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address");
|
||||||
|
if (addressEl != null) {
|
||||||
|
String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode");
|
||||||
|
if (nuts != null && !nuts.isEmpty()) {
|
||||||
|
lotNutsCodes.add(nuts);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (lotNutsCodes.isEmpty()) {
|
||||||
|
NodeList nutsNodes = getNodes(xpath, lotNode,
|
||||||
|
".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode");
|
||||||
|
for (int j = 0; j < nutsNodes.getLength(); j++) {
|
||||||
|
String nuts = trimToNull(nutsNodes.item(j).getTextContent());
|
||||||
|
if (nuts != null) {
|
||||||
|
lotNutsCodes.add(nuts);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
|
||||||
|
|
||||||
|
// --- Duration ---
|
||||||
|
boolean durationSet = false;
|
||||||
|
if (procurementProjectEl != null) {
|
||||||
|
Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod");
|
||||||
|
if (plannedPeriodEl != null) {
|
||||||
|
Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure");
|
||||||
|
if (durationEl != null) {
|
||||||
|
String durationValue = trimToNull(durationEl.getTextContent());
|
||||||
|
if (durationValue != null) {
|
||||||
|
try {
|
||||||
|
lot.setDurationValue(Double.parseDouble(durationValue));
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
|
||||||
|
if (unitCode != null) {
|
||||||
|
lot.setDurationUnit(unitCode);
|
||||||
|
}
|
||||||
|
durationSet = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!durationSet) {
|
||||||
|
Node durationNode = getNode(xpath, lotNode,
|
||||||
|
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure");
|
||||||
|
if (durationNode != null) {
|
||||||
|
String durationValue = trimToNull(durationNode.getTextContent());
|
||||||
|
if (durationValue != null) {
|
||||||
|
try {
|
||||||
|
lot.setDurationValue(Double.parseDouble(durationValue));
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (durationNode instanceof Element durationEl) {
|
||||||
|
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
|
||||||
|
if (unitCode != null) {
|
||||||
|
lot.setDurationUnit(unitCode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Submission deadline ---
|
||||||
|
String endDate = null;
|
||||||
|
String endTime = null;
|
||||||
|
if (tenderingProcessEl != null) {
|
||||||
|
Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod");
|
||||||
|
if (deadlinePeriodEl != null) {
|
||||||
|
endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate");
|
||||||
|
endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (endDate == null) {
|
||||||
|
endDate = getTextContent(xpath, lotNode,
|
||||||
|
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndDate");
|
||||||
|
endTime = getTextContent(xpath, lotNode,
|
||||||
|
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndTime");
|
||||||
|
}
|
||||||
|
if (endDate != null) {
|
||||||
|
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
|
||||||
|
if (document.getSubmissionDeadline() == null) {
|
||||||
|
document.setSubmissionDeadline(lot.getSubmissionDeadline());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- EU funded ---
|
||||||
|
String fundingProgramCode = null;
|
||||||
|
if (tenderingTermsEl != null) {
|
||||||
|
fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode");
|
||||||
|
}
|
||||||
|
if (fundingProgramCode == null) {
|
||||||
|
fundingProgramCode = getTextContent(xpath, lotNode,
|
||||||
|
"cac:TenderingTerms/cbc:FundingProgramCode");
|
||||||
|
}
|
||||||
|
lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds"));
|
||||||
|
|
||||||
|
document.addLot(lot);
|
||||||
|
}
|
||||||
|
|
||||||
|
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void parseLotsOld(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
||||||
NodeList lotNodes = (NodeList) xpath.evaluate(
|
NodeList lotNodes = (NodeList) xpath.evaluate(
|
||||||
"//cac:ProcurementProjectLot", doc, XPathConstants.NODESET);
|
"//cac:ProcurementProjectLot", doc, XPathConstants.NODESET);
|
||||||
|
|
||||||
|
|
@ -288,7 +643,7 @@ public class XmlParserService {
|
||||||
// CPV codes for this lot
|
// CPV codes for this lot
|
||||||
List<String> lotCpvCodes = new ArrayList<>();
|
List<String> lotCpvCodes = new ArrayList<>();
|
||||||
NodeList cpvNodes = (NodeList) xpath.evaluate(
|
NodeList cpvNodes = (NodeList) xpath.evaluate(
|
||||||
".//cac:MainCommodityClassification/cbc:ItemClassificationCode",
|
".//cac:MainCommodityClassification/cbc:ItemClassificationCode",
|
||||||
lotNode, XPathConstants.NODESET);
|
lotNode, XPathConstants.NODESET);
|
||||||
for (int j = 0; j < cpvNodes.getLength(); j++) {
|
for (int j = 0; j < cpvNodes.getLength(); j++) {
|
||||||
lotCpvCodes.add(cpvNodes.item(j).getTextContent());
|
lotCpvCodes.add(cpvNodes.item(j).getTextContent());
|
||||||
|
|
@ -298,13 +653,13 @@ public class XmlParserService {
|
||||||
// NUTS codes for this lot
|
// NUTS codes for this lot
|
||||||
List<String> lotNutsCodes = new ArrayList<>();
|
List<String> lotNutsCodes = new ArrayList<>();
|
||||||
NodeList nutsNodes = (NodeList) xpath.evaluate(
|
NodeList nutsNodes = (NodeList) xpath.evaluate(
|
||||||
".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode",
|
".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode",
|
||||||
lotNode, XPathConstants.NODESET);
|
lotNode, XPathConstants.NODESET);
|
||||||
for (int j = 0; j < nutsNodes.getLength(); j++) {
|
for (int j = 0; j < nutsNodes.getLength(); j++) {
|
||||||
lotNutsCodes.add(nutsNodes.item(j).getTextContent());
|
lotNutsCodes.add(nutsNodes.item(j).getTextContent());
|
||||||
}
|
}
|
||||||
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
|
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
|
||||||
|
|
||||||
// Duration
|
// Duration
|
||||||
String durationValue = getTextContent(xpath, lotNode,
|
String durationValue = getTextContent(xpath, lotNode,
|
||||||
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure");
|
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure");
|
||||||
|
|
@ -428,15 +783,10 @@ public class XmlParserService {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper methods
|
// Helper methods
|
||||||
|
|
||||||
private String getTextContent(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
|
||||||
Node node = (Node) xpath.evaluate(expression, item, XPathConstants.NODE);
|
|
||||||
return node != null ? node.getTextContent().trim() : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> getTextContents(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
private List<String> getTextContents(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
||||||
List<String> results = new ArrayList<>();
|
List<String> results = new ArrayList<>();
|
||||||
NodeList nodes = (NodeList) xpath.evaluate(expression, item, XPathConstants.NODESET);
|
NodeList nodes = getNodes(xpath, item, expression);
|
||||||
for (int i = 0; i < nodes.getLength(); i++) {
|
for (int i = 0; i < nodes.getLength(); i++) {
|
||||||
String text = nodes.item(i).getTextContent().trim();
|
String text = nodes.item(i).getTextContent().trim();
|
||||||
if (!text.isEmpty()) {
|
if (!text.isEmpty()) {
|
||||||
|
|
@ -447,9 +797,10 @@ public class XmlParserService {
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getAttributeValue(XPath xpath, Object item, String expression, String attrName) throws XPathExpressionException {
|
private String getAttributeValue(XPath xpath, Object item, String expression, String attrName) throws XPathExpressionException {
|
||||||
Node node = (Node) xpath.evaluate(expression, item, XPathConstants.NODE);
|
Node node = getNode(xpath, item, expression);
|
||||||
if (node instanceof Element) {
|
if (node instanceof Element element) {
|
||||||
return ((Element) node).getAttribute(attrName);
|
String value = element.getAttribute(attrName);
|
||||||
|
return trimToNull(value);
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ ted:
|
||||||
# Use external HTTP API instead of subprocess
|
# Use external HTTP API instead of subprocess
|
||||||
use-http-api: true
|
use-http-api: true
|
||||||
# Embedding service URL
|
# Embedding service URL
|
||||||
api-url: http://172.20.240.18:8001
|
api-url: http://172.20.20.6:8001
|
||||||
# Model name for sentence-transformers
|
# Model name for sentence-transformers
|
||||||
model-name: intfloat/multilingual-e5-large
|
model-name: intfloat/multilingual-e5-large
|
||||||
# Vector dimensions (must match model output)
|
# Vector dimensions (must match model output)
|
||||||
|
|
|
||||||
|
|
@ -223,7 +223,7 @@ dip:
|
||||||
# ted packages download configuration
|
# ted packages download configuration
|
||||||
ted-download:
|
ted-download:
|
||||||
# Enable/disable automatic package download
|
# Enable/disable automatic package download
|
||||||
enabled: true
|
enabled: false
|
||||||
# Base URL for TED Daily Packages
|
# Base URL for TED Daily Packages
|
||||||
base-url: https://ted.europa.eu/packages/daily/
|
base-url: https://ted.europa.eu/packages/daily/
|
||||||
# Download directory for tar.gz files
|
# Download directory for tar.gz files
|
||||||
|
|
@ -231,7 +231,7 @@ dip:
|
||||||
# Start year for downloads
|
# Start year for downloads
|
||||||
start-year: 2026
|
start-year: 2026
|
||||||
# Polling interval (milliseconds) - 2 minutes
|
# Polling interval (milliseconds) - 1 minute
|
||||||
poll-interval: 120000
|
poll-interval: 60000
|
||||||
# Retry interval for tail NOT_FOUND packages - 6 hours
|
# Retry interval for tail NOT_FOUND packages - 6 hours
|
||||||
not-found-retry-interval: 21600000
|
not-found-retry-interval: 21600000
|
||||||
# Grace period after year end before a previous-year tail 404 is treated as final
|
# Grace period after year end before a previous-year tail 404 is treated as final
|
||||||
|
|
@ -246,6 +246,7 @@ dip:
|
||||||
delay-between-downloads: 5000
|
delay-between-downloads: 5000
|
||||||
# Delete tar.gz after ingestion
|
# Delete tar.gz after ingestion
|
||||||
delete-after-ingestion: true
|
delete-after-ingestion: true
|
||||||
|
|
||||||
ted: # Phase 3 TED projection configuration
|
ted: # Phase 3 TED projection configuration
|
||||||
projection:
|
projection:
|
||||||
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document
|
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document
|
||||||
|
|
@ -254,6 +255,7 @@ dip:
|
||||||
startup-backfill-enabled: false
|
startup-backfill-enabled: false
|
||||||
# Maximum number of legacy TED documents to backfill during startup
|
# Maximum number of legacy TED documents to backfill during startup
|
||||||
startup-backfill-limit: 250
|
startup-backfill-limit: 250
|
||||||
|
|
||||||
migration:
|
migration:
|
||||||
legacy-audit:
|
legacy-audit:
|
||||||
# Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem
|
# Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
# Author: Martin.Schweitzer@procon.co.at and claude.ai
|
# Author: Martin.Schweitzer@procon.co.at and claude.ai
|
||||||
|
|
||||||
server:
|
server:
|
||||||
port: 8885
|
port: 8889
|
||||||
servlet:
|
servlet:
|
||||||
context-path: /api
|
context-path: /api
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,10 +3,10 @@
|
||||||
-- Description: PostgreSQL schema for storing EU eForms procurement notices with vector search support
|
-- Description: PostgreSQL schema for storing EU eForms procurement notices with vector search support
|
||||||
|
|
||||||
-- Create TED schema if it doesn't exist
|
-- Create TED schema if it doesn't exist
|
||||||
CREATE SCHEMA IF NOT EXISTS TED;
|
CREATE SCHEMA IF NOT EXISTS ted;
|
||||||
|
|
||||||
-- Set search path to use TED schema
|
-- Set search path to use TED schema
|
||||||
SET search_path TO TED;
|
SET search_path TO ted;
|
||||||
|
|
||||||
-- Enable required PostgreSQL extensions (wenn Berechtigung vorhanden)
|
-- Enable required PostgreSQL extensions (wenn Berechtigung vorhanden)
|
||||||
-- Falls Extensions nicht erstellt werden können, müssen diese vom DBA manuell erstellt werden
|
-- Falls Extensions nicht erstellt werden können, müssen diese vom DBA manuell erstellt werden
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,9 @@ CREATE INDEX IF NOT EXISTS idx_ted_notice_projection_package_identifier
|
||||||
ALTER TABLE IF EXISTS TED.organization
|
ALTER TABLE IF EXISTS TED.organization
|
||||||
ALTER COLUMN city TYPE TEXT;
|
ALTER COLUMN city TYPE TEXT;
|
||||||
|
|
||||||
|
ALTER TABLE IF EXISTS TED.organization
|
||||||
|
ALTER COLUMN company_id TYPE TEXT;
|
||||||
|
|
||||||
ALTER TABLE IF EXISTS TED.procurement_document
|
ALTER TABLE IF EXISTS TED.procurement_document
|
||||||
ALTER COLUMN buyer_city TYPE TEXT;
|
ALTER COLUMN buyer_city TYPE TEXT;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,95 @@
|
||||||
|
-- Backfill of TED daily-package documents and their relations for legacy notices.
-- The package identifier is the first match of "20" followed by seven digits in the legacy
-- document's source_path (or source_filename when source_path is NULL).
-- All three statements are guarded so that re-running the script adds or changes nothing twice.
SET search_path TO TED, DOC, public;

-- Step 1: create one DOC.doc_document row of type TED_PACKAGE per distinct package identifier
-- found on legacy procurement documents that already have a notice projection.
-- Rows whose business_key already exists are skipped.
WITH package_ids AS (
    SELECT DISTINCT
        substring(coalesce(legacy.source_path, legacy.source_filename) from '(20[0-9]{7})') AS package_identifier
    FROM TED.procurement_document legacy
    JOIN TED.ted_notice_projection proj
        ON proj.legacy_procurement_document_id = legacy.id
    WHERE substring(coalesce(legacy.source_path, legacy.source_filename) from '(20[0-9]{7})') IS NOT NULL
), package_documents AS (
    SELECT
        ids.package_identifier,
        'TED:package:' || ids.package_identifier AS business_key,
        encode(digest('TED:package:' || ids.package_identifier, 'sha256'), 'hex') AS dedup_hash
    FROM package_ids ids
)
INSERT INTO DOC.doc_document (
    id, visibility, document_type, document_family, status, title, summary, mime_type, business_key, dedup_hash
)
SELECT
    gen_random_uuid(),
    'PUBLIC',
    'TED_PACKAGE',
    'PROCUREMENT',
    -- Document status is derived from the download status of the matching daily package, if any.
    CASE
        WHEN pkg.download_status = 'COMPLETED' THEN 'CLASSIFIED'
        WHEN pkg.download_status IN ('FAILED', 'NOT_FOUND') THEN 'FAILED'
        ELSE 'RECEIVED'
    END,
    'TED Daily Package ' || pd.package_identifier,
    -- Summary carries the package counters when a ted_daily_package row exists.
    CASE
        WHEN pkg.package_identifier IS NULL THEN 'TED daily package ' || pd.package_identifier
        ELSE 'TED daily package ' || pd.package_identifier ||
             ' (status=' || coalesce(pkg.download_status::text, 'UNKNOWN') ||
             ', xmlFileCount=' || coalesce(pkg.xml_file_count::text, 'null') ||
             ', processedCount=' || coalesce(pkg.processed_count::text, 'null') ||
             ', failedCount=' || coalesce(pkg.failed_count::text, 'null') || ')'
    END,
    'application/gzip',
    pd.business_key,
    pd.dedup_hash
FROM package_documents pd
LEFT JOIN TED.ted_daily_package pkg
    ON pkg.package_identifier = pd.package_identifier
WHERE NOT EXISTS (
    SELECT 1
    FROM DOC.doc_document existing
    WHERE existing.business_key = pd.business_key
);

-- Step 2: store the extracted package identifier on each notice projection,
-- touching only rows whose current value differs.
UPDATE TED.ted_notice_projection p
SET package_identifier = src.package_identifier
FROM (
    SELECT
        d.id AS legacy_procurement_document_id,
        substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{7})') AS package_identifier
    FROM TED.procurement_document d
) src
WHERE p.legacy_procurement_document_id = src.legacy_procurement_document_id
  AND src.package_identifier IS NOT NULL
  AND p.package_identifier IS DISTINCT FROM src.package_identifier;

-- Step 3: link every projected notice document to its package document with a CONTAINS
-- relation, skipping pairs that are already linked.
WITH notice_packages AS (
    SELECT
        proj.document_id AS child_document_id,
        substring(coalesce(legacy.source_path, legacy.source_filename) from '(20[0-9]{7})') AS package_identifier
    FROM TED.procurement_document legacy
    JOIN TED.ted_notice_projection proj
        ON proj.legacy_procurement_document_id = legacy.id
    WHERE substring(coalesce(legacy.source_path, legacy.source_filename) from '(20[0-9]{7})') IS NOT NULL
), package_parents AS (
    SELECT
        doc.id AS parent_document_id,
        substring(doc.business_key from '(20[0-9]{7})') AS package_identifier
    FROM DOC.doc_document doc
    WHERE doc.document_type = 'TED_PACKAGE'
      AND doc.business_key LIKE 'TED:package:%'
)
INSERT INTO DOC.doc_relation (
    id, parent_document_id, child_document_id, relation_type, sort_order, relation_metadata
)
SELECT
    gen_random_uuid(),
    parent.parent_document_id,
    notice.child_document_id,
    'CONTAINS',
    NULL,
    'packageIdentifier=' || notice.package_identifier
FROM notice_packages notice
JOIN package_parents parent
    ON parent.package_identifier = notice.package_identifier
WHERE NOT EXISTS (
    SELECT 1
    FROM DOC.doc_relation rel
    WHERE rel.parent_document_id = parent.parent_document_id
      AND rel.child_document_id = notice.child_document_id
      AND rel.relation_type = 'CONTAINS'
);
|
||||||
Loading…
Reference in New Issue