From 28c7854ead27f077b8c0e28b29ff8e8a6a531335 Mon Sep 17 00:00:00 2001 From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com> Date: Tue, 14 Apr 2026 17:42:13 +0200 Subject: [PATCH] ted legacy data migration, repair fixes --- .../repository/DocumentRepository.java | 2 + .../ted/entity/TedNoticeProjection.java | 14 + .../TedGenericDocumentRootService.java | 2 +- .../service/TedNoticeProjectionService.java | 30 +- .../service/TedPackageDocumentService.java | 113 ++++++ .../service/TedPackageIdentifierResolver.java | 31 ++ .../LegacyTedBackfillMigrationService.java | 22 +- .../service/LegacyTedBackfillWorker.java | 18 + ...edStructuredTextRepresentationBuilder.java | 2 + .../procon/ted/model/entity/Organization.java | 2 +- .../ted/model/entity/ProcurementDocument.java | 2 +- .../repository/LegacyTedMigrationCursor.java | 4 +- .../ProcurementDocumentRepository.java | 18 +- .../procon/ted/service/XmlParserService.java | 377 +++++++++++++++++- src/main/resources/application-legacy.yml | 2 +- src/main/resources/application-new.yml | 6 +- src/main/resources/application.yml | 2 +- .../db/migration/V1__initial_schema.sql | 4 +- ...projection_package_and_legacy_backfill.sql | 3 + ...oc_ted_package_documents_and_relations.sql | 95 +++++ 20 files changed, 689 insertions(+), 60 deletions(-) create mode 100644 src/main/java/at/procon/dip/domain/ted/service/TedPackageDocumentService.java create mode 100644 src/main/java/at/procon/dip/domain/ted/service/TedPackageIdentifierResolver.java create mode 100644 src/main/resources/db/migration/V21__doc_ted_package_documents_and_relations.sql diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java index 3e5192c..1db9ad1 100644 --- a/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java @@ -13,6 +13,8 @@ 
import org.springframework.data.jpa.repository.JpaRepository; public interface DocumentRepository extends JpaRepository { + Optional findByBusinessKey(String businessKey); + Optional findByDedupHash(String dedupHash); List findAllByDedupHash(String dedupHash); diff --git a/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java index e7d83e9..6a6c683 100644 --- a/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java +++ b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java @@ -189,10 +189,24 @@ public class TedNoticeProjection { protected void onCreate() { createdAt = OffsetDateTime.now(); updatedAt = OffsetDateTime.now(); + generateNoticeUrl(); } @PreUpdate protected void onUpdate() { updatedAt = OffsetDateTime.now(); + generateNoticeUrl(); + } + + /** + * Generates TED notice URL from publication_id. + * Format: https://ted.europa.eu/en/notice/-/detail/{publication_id without leading zeros} + */ + private void generateNoticeUrl() { + if (publicationId != null && !publicationId.isEmpty()) { + // Remove leading zeros from publication_id + String cleanId = publicationId.replaceFirst("^0+", ""); + this.noticeUrl = "https://ted.europa.eu/en/notice/-/detail/" + cleanId; + } } } diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedGenericDocumentRootService.java b/src/main/java/at/procon/dip/domain/ted/service/TedGenericDocumentRootService.java index d377e4f..34f1113 100644 --- a/src/main/java/at/procon/dip/domain/ted/service/TedGenericDocumentRootService.java +++ b/src/main/java/at/procon/dip/domain/ted/service/TedGenericDocumentRootService.java @@ -76,7 +76,7 @@ public class TedGenericDocumentRootService { private String buildBusinessKey(ProcurementDocument tedDocument) { if (StringUtils.hasText(tedDocument.getPublicationId())) { - return "TED:publication:" + tedDocument.getPublicationId(); + return "TED_NOTICE:" + 
tedDocument.getPublicationId(); } if (StringUtils.hasText(tedDocument.getNoticeId())) { return "TED:notice:" + tedDocument.getNoticeId(); diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java index 534eabb..90eaac0 100644 --- a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java +++ b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java @@ -17,8 +17,6 @@ import at.procon.ted.model.entity.ProcurementLot; import java.util.ArrayList; import java.util.List; import java.util.UUID; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; @@ -33,9 +31,8 @@ import org.springframework.transaction.annotation.Transactional; @Slf4j public class TedNoticeProjectionService { - private static final Pattern PACKAGE_IDENTIFIER_PATTERN = Pattern.compile("(? 
ensurePackageDocumentForLegacyNotice(ProcurementDocument legacyDocument) { + return packageIdentifierResolver.resolveFromSourceMetadata(legacyDocument.getSourcePath(), legacyDocument.getSourceFilename()) + .map(this::ensurePackageDocument); + } + + @Transactional + public Document ensurePackageDocument(String packageIdentifier) { + String businessKey = buildBusinessKey(packageIdentifier); + Document document = documentRepository.findByBusinessKey(businessKey) + .orElseGet(() -> createPackageDocument(packageIdentifier)); + + Optional packageEntity = tedDailyPackageRepository.findByPackageIdentifier(packageIdentifier); + document.setVisibility(DocumentVisibility.PUBLIC); + document.setDocumentType(DocumentType.TED_PACKAGE); + document.setDocumentFamily(DocumentFamily.PROCUREMENT); + document.setStatus(resolveStatus(packageEntity)); + document.setTitle(buildTitle(packageIdentifier)); + document.setSummary(buildSummary(packageIdentifier, packageEntity.orElse(null))); + document.setMimeType(PACKAGE_MIME_TYPE); + document.setBusinessKey(businessKey); + document.setDedupHash(HashUtils.computeSha256(businessKey)); + return documentService.save(document); + } + + private Document createPackageDocument(String packageIdentifier) { + String businessKey = buildBusinessKey(packageIdentifier); + return documentService.create(new CreateDocumentCommand( + null, + DocumentVisibility.PUBLIC, + DocumentType.TED_PACKAGE, + DocumentFamily.PROCUREMENT, + DocumentStatus.RECEIVED, + buildTitle(packageIdentifier), + null, + null, + PACKAGE_MIME_TYPE, + businessKey, + HashUtils.computeSha256(businessKey) + )); + } + + private DocumentStatus resolveStatus(Optional packageEntity) { + if (packageEntity.isEmpty()) { + return DocumentStatus.RECEIVED; + } + return switch (packageEntity.get().getDownloadStatus()) { + case COMPLETED -> DocumentStatus.CLASSIFIED; + case FAILED, NOT_FOUND -> DocumentStatus.FAILED; + default -> DocumentStatus.RECEIVED; + }; + } + + private String 
buildBusinessKey(String packageIdentifier) { + return "TED_PACKAGE:" + packageIdentifier; + } + + private String buildTitle(String packageIdentifier) { + return packageIdentifier + ".tar.gz"; + } + + private String buildSummary(String packageIdentifier, TedDailyPackage packageEntity) { + if (packageEntity == null) { + return "TED daily package " + packageIdentifier; + } + return "TED daily package %s (status=%s, xmlFileCount=%s, processedCount=%s, failedCount=%s, downloadedAt=%s)".formatted( + packageIdentifier, + packageEntity.getDownloadStatus(), + packageEntity.getXmlFileCount(), + packageEntity.getProcessedCount(), + packageEntity.getFailedCount(), + formatTimestamp(packageEntity.getDownloadedAt()) + ); + } + + private String formatTimestamp(OffsetDateTime value) { + return value == null ? null : value.toString(); + } +} diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedPackageIdentifierResolver.java b/src/main/java/at/procon/dip/domain/ted/service/TedPackageIdentifierResolver.java new file mode 100644 index 0000000..5d14795 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/ted/service/TedPackageIdentifierResolver.java @@ -0,0 +1,31 @@ +package at.procon.dip.domain.ted.service; + +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; + +/** + * Resolves a TED daily package identifier (YYYYSSSSS) from legacy source metadata. + */ +@Component +public class TedPackageIdentifierResolver { + + private static final Pattern PACKAGE_IDENTIFIER_PATTERN = Pattern.compile("(? 
resolveFromSourceMetadata(String sourcePath, String sourceFilename) { + return resolve(sourcePath).or(() -> resolve(sourceFilename)); + } + + public Optional resolve(String value) { + if (!StringUtils.hasText(value)) { + return Optional.empty(); + } + Matcher matcher = PACKAGE_IDENTIFIER_PATTERN.matcher(value); + if (matcher.find()) { + return Optional.of(matcher.group(1)); + } + return Optional.empty(); + } +} diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java index ea2c985..5a68cdb 100644 --- a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java +++ b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillMigrationService.java @@ -11,6 +11,7 @@ import at.procon.dip.runtime.config.RuntimeMode; import at.procon.ted.repository.LegacyTedMigrationCursor; import at.procon.ted.repository.ProcurementDocumentRepository; import java.time.OffsetDateTime; +import java.time.ZoneOffset; import java.util.EnumSet; import java.util.List; import java.util.UUID; @@ -54,11 +55,7 @@ public class LegacyTedBackfillMigrationService { return run.getId(); } - List batch = procurementDocumentRepository.findNextMigrationBatch( - run.getLastLegacyCreatedAt(), - run.getLastLegacyDocumentId(), - limit - ); + List batch = loadNextBatch(run, limit); if (batch.isEmpty()) { markCompleted(run); @@ -89,6 +86,17 @@ public class LegacyTedBackfillMigrationService { } } + protected List loadNextBatch(LegacyTedMigrationRun run, int limit) { + if (run.getLastLegacyCreatedAt() == null || run.getLastLegacyDocumentId() == null) { + return procurementDocumentRepository.findFirstMigrationBatch(limit); + } + return procurementDocumentRepository.findNextMigrationBatch( + run.getLastLegacyCreatedAt(), + run.getLastLegacyDocumentId(), + limit + ); + } + @Transactional protected LegacyTedMigrationRun resolveRun() { if 
(properties.isResumeLatestIncompleteRun()) { @@ -121,7 +129,9 @@ public class LegacyTedBackfillMigrationService { run.setStatus(LegacyTedMigrationRunStatus.RUNNING); run.setProcessedCount(run.getProcessedCount() + 1); run.setSuccessCount(run.getSuccessCount() + 1); - run.setLastLegacyCreatedAt(cursor.getCreatedAt()); + run.setLastLegacyCreatedAt(cursor.getCreatedAt() != null + ? cursor.getCreatedAt().atOffset(ZoneOffset.UTC) + : null); run.setLastLegacyDocumentId(cursor.getId()); run.setLastDocDocumentId(outcome.documentId()); run.setLastProjectionId(outcome.projectionId()); diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java index bf99ea9..87f5e78 100644 --- a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java +++ b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java @@ -5,6 +5,7 @@ import at.procon.dip.domain.document.ContentRole; import at.procon.dip.domain.document.DocumentFamily; import at.procon.dip.domain.document.DocumentStatus; import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RelationType; import at.procon.dip.domain.document.RepresentationType; import at.procon.dip.domain.document.SourceType; import at.procon.dip.domain.document.StorageType; @@ -16,13 +17,16 @@ import at.procon.dip.domain.document.repository.DocumentContentRepository; import at.procon.dip.domain.document.repository.DocumentSourceRepository; import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository; import at.procon.dip.domain.document.service.DocumentContentService; +import at.procon.dip.domain.document.service.DocumentRelationService; import at.procon.dip.domain.document.service.DocumentRepresentationService; import at.procon.dip.domain.document.service.DocumentService; import at.procon.dip.domain.document.service.command.AddDocumentContentCommand; import 
at.procon.dip.domain.document.service.command.AddDocumentSourceCommand; import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand; +import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand; import at.procon.dip.domain.ted.service.TedGenericDocumentRootService; import at.procon.dip.domain.ted.service.TedNoticeProjectionService; +import at.procon.dip.domain.ted.service.TedPackageDocumentService; import at.procon.dip.embedding.config.EmbeddingProperties; import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator; import at.procon.dip.extraction.spi.ExtractedStructuredPayload; @@ -32,6 +36,8 @@ import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.normalization.service.TextRepresentationBuildService; import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.TextRepresentationDraft; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import at.procon.dip.search.service.DocumentLexicalIndexService; import at.procon.ted.model.entity.ProcurementDocument; import at.procon.ted.repository.ProcurementDocumentRepository; @@ -53,6 +59,7 @@ import org.springframework.transaction.annotation.Transactional; import org.springframework.util.StringUtils; @Service +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor @Slf4j public class LegacyTedBackfillWorker { @@ -63,6 +70,7 @@ public class LegacyTedBackfillWorker { private final ProcurementDocumentRepository procurementDocumentRepository; private final TedGenericDocumentRootService tedGenericDocumentRootService; + private final TedPackageDocumentService tedPackageDocumentService; private final TedNoticeProjectionService tedNoticeProjectionService; private final DocumentService documentService; private final DocumentSourceRepository sourceRepository; @@ -72,6 +80,7 @@ public class LegacyTedBackfillWorker { private 
final DocumentTextRepresentationRepository representationRepository; private final DocumentRepresentationService documentRepresentationService; private final DocumentLexicalIndexService lexicalIndexService; + private final DocumentRelationService documentRelationService; private final TextRepresentationBuildService textRepresentationBuildService; private final RepresentationEmbeddingOrchestrator embeddingOrchestrator; private final EmbeddingProperties embeddingProperties; @@ -89,6 +98,15 @@ public class LegacyTedBackfillWorker { List drafts = buildDrafts(legacyDocument); List savedRepresentations = ensureRepresentations(document, originalContent, normalizedTextContent, drafts); + tedPackageDocumentService.ensurePackageDocumentForLegacyNotice(legacyDocument) + .ifPresent(packageDocument -> documentRelationService.ensureRelation(new CreateDocumentRelationCommand( + packageDocument.getId(), + document.getId(), + RelationType.CONTAINS, + null, + legacyDocument.getSourcePath() + ))); + UUID projectionId = tedNoticeProjectionService.registerOrRefreshProjection(legacyDocument, document.getId()); documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); diff --git a/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java index 4d9289a..639f810 100644 --- a/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java +++ b/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java @@ -66,6 +66,7 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio ContentRole.NORMALIZED_TEXT, Boolean.TRUE )); + /* drafts.add(new TextRepresentationDraft( RepresentationType.FULLTEXT, BUILDER_KEY, @@ -104,6 +105,7 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio ContentRole.NORMALIZED_TEXT, Boolean.FALSE )); + */ return 
drafts; } diff --git a/src/main/java/at/procon/ted/model/entity/Organization.java b/src/main/java/at/procon/ted/model/entity/Organization.java index 1972916..dd175ba 100644 --- a/src/main/java/at/procon/ted/model/entity/Organization.java +++ b/src/main/java/at/procon/ted/model/entity/Organization.java @@ -52,7 +52,7 @@ public class Organization { /** * Company/tax registration ID. */ - @Column(name = "company_id", length = 1000) + @Column(name = "company_id", columnDefinition = "TEXT") private String companyId; @Column(name = "country_code", length = 10) diff --git a/src/main/java/at/procon/ted/model/entity/ProcurementDocument.java b/src/main/java/at/procon/ted/model/entity/ProcurementDocument.java index 9e56436..f26539d 100644 --- a/src/main/java/at/procon/ted/model/entity/ProcurementDocument.java +++ b/src/main/java/at/procon/ted/model/entity/ProcurementDocument.java @@ -105,7 +105,7 @@ public class ProcurementDocument { @Column(name = "buyer_city", columnDefinition = "TEXT") private String buyerCity; - @Column(name = "buyer_postal_code", length = 100) + @Column(name = "buyer_postal_code", columnDefinition = "TEXT") private String buyerPostalCode; @Column(name = "buyer_nuts_code", length = 10) diff --git a/src/main/java/at/procon/ted/repository/LegacyTedMigrationCursor.java b/src/main/java/at/procon/ted/repository/LegacyTedMigrationCursor.java index a89b42d..58b712d 100644 --- a/src/main/java/at/procon/ted/repository/LegacyTedMigrationCursor.java +++ b/src/main/java/at/procon/ted/repository/LegacyTedMigrationCursor.java @@ -1,9 +1,9 @@ package at.procon.ted.repository; -import java.time.OffsetDateTime; +import java.time.Instant; import java.util.UUID; public interface LegacyTedMigrationCursor { UUID getId(); - OffsetDateTime getCreatedAt(); + Instant getCreatedAt(); } diff --git a/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java b/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java index e6268b0..bf76099 100644 --- 
a/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java +++ b/src/main/java/at/procon/ted/repository/ProcurementDocumentRepository.java @@ -211,14 +211,24 @@ public interface ProcurementDocumentRepository extends /** - * Lightweight cursor query for resumable legacy -> DOC/projection backfill. + * First lightweight cursor query for resumable legacy -> DOC/projection backfill. */ @Query(value = """ SELECT p.id AS id, p.created_at AS createdAt FROM ted.procurement_document p - WHERE (:lastCreatedAt IS NULL - OR p.created_at > :lastCreatedAt - OR (p.created_at = :lastCreatedAt AND CAST(p.id AS text) > CAST(:lastId AS text))) + ORDER BY p.created_at ASC, CAST(p.id AS text) ASC + LIMIT :limit + """, nativeQuery = true) + List findFirstMigrationBatch(@Param("limit") int limit); + + /** + * Next lightweight cursor query for resumable legacy -> DOC/projection backfill. + */ + @Query(value = """ + SELECT p.id AS id, p.created_at AS createdAt + FROM ted.procurement_document p + WHERE p.created_at > :lastCreatedAt + OR (p.created_at = :lastCreatedAt AND CAST(p.id AS text) > CAST(:lastId AS text)) ORDER BY p.created_at ASC, CAST(p.id AS text) ASC LIMIT :limit """, nativeQuery = true) diff --git a/src/main/java/at/procon/ted/service/XmlParserService.java b/src/main/java/at/procon/ted/service/XmlParserService.java index 34bc28b..8e1fead 100644 --- a/src/main/java/at/procon/ted/service/XmlParserService.java +++ b/src/main/java/at/procon/ted/service/XmlParserService.java @@ -228,7 +228,8 @@ public class XmlParserService { // Name org.setName(getTextContent(xpath, orgNode, ".//cac:PartyName/cbc:Name")); - + if(org.getName() == null) org.setName(""); + // Company ID org.setCompanyId(getTextContent(xpath, orgNode, ".//cac:PartyLegalEntity/cbc:CompanyID")); @@ -264,7 +265,361 @@ public class XmlParserService { } } + private final Map cache = new HashMap<>(); + + private XPathExpression getCompiled(XPath xpath, String expression) throws XPathExpressionException { + 
XPathExpression compiled = cache.get(expression); + if (compiled == null) { + compiled = xpath.compile(expression); + cache.put(expression, compiled); + } + return compiled; + } + + private String getTextContent(XPath xpath, Object item, String expression) throws XPathExpressionException { + XPathExpression expr = getCompiled(xpath, expression); + Node node = (Node) expr.evaluate(item, XPathConstants.NODE); + return node != null ? node.getTextContent().trim() : null; + } + + private Node getNode(XPath xpath, Object item, String expression) throws XPathExpressionException { + return (Node) getCompiled(xpath, expression).evaluate(item, XPathConstants.NODE); + } + + private NodeList getNodes(XPath xpath, Object item, String expression) throws XPathExpressionException { + return (NodeList) getCompiled(xpath, expression).evaluate(item, XPathConstants.NODESET); + } + + private Element getDirectChild(Element parent, String namespaceUri, String localName) { + Node child = parent.getFirstChild(); + while (child != null) { + if (child.getNodeType() == Node.ELEMENT_NODE) { + Element el = (Element) child; + if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) { + return el; + } + } + child = child.getNextSibling(); + } + return null; + } + + private List getDirectChildren(Element parent, String namespaceUri, String localName) { + List result = new ArrayList<>(); + Node child = parent.getFirstChild(); + while (child != null) { + if (child.getNodeType() == Node.ELEMENT_NODE) { + Element el = (Element) child; + if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) { + result.add(el); + } + } + child = child.getNextSibling(); + } + return result; + } + + private String getDirectChildText(Element parent, String namespaceUri, String localName) { + Element child = getDirectChild(parent, namespaceUri, localName); + if (child == null) { + return null; + } + return trimToNull(child.getTextContent()); + } + + private String 
trimToNull(String value) { + if (value == null) { + return null; + } + String trimmed = value.trim(); + return trimmed.isEmpty() ? null : trimmed; + } + + private void parseLotsDOM(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException { + NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot"); + document.setTotalLots(lotNodes.getLength()); + + for (int i = 0; i < lotNodes.getLength(); i++) { + Node lotNode = lotNodes.item(i); + if (lotNode.getNodeType() != Node.ELEMENT_NODE) { + continue; + } + + Element lotEl = (Element) lotNode; + ProcurementLot lot = ProcurementLot.builder().build(); + + // Direct child values on the lot + lot.setLotId(getDirectChildText(lotEl, NS_CBC, "ID")); + + Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject"); + if (procurementProjectEl != null) { + lot.setInternalId(getDirectChildText(procurementProjectEl, NS_CBC, "ID")); + lot.setTitle(getDirectChildText(procurementProjectEl, NS_CBC, "Name")); + lot.setDescription(getDirectChildText(procurementProjectEl, NS_CBC, "Description")); + + // CPV codes + List lotCpvCodes = new ArrayList<>(); + for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) { + String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode"); + if (cpv != null && !cpv.isEmpty()) { + lotCpvCodes.add(cpv); + } + } + lot.setCpvCodes(lotCpvCodes.toArray(new String[0])); + + // NUTS codes + List lotNutsCodes = new ArrayList<>(); + for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) { + Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address"); + if (addressEl != null) { + String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode"); + if (nuts != null && !nuts.isEmpty()) { + lotNutsCodes.add(nuts); + } + } + } + lot.setNutsCodes(lotNutsCodes.toArray(new String[0])); + + // Duration + Element 
plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod"); + if (plannedPeriodEl != null) { + Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure"); + if (durationEl != null) { + String durationValue = trimToNull(durationEl.getTextContent()); + if (durationValue != null) { + try { + lot.setDurationValue(Double.parseDouble(durationValue)); + } catch (NumberFormatException e) { + log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId()); + } + } + + String unitCode = trimToNull(durationEl.getAttribute("unitCode")); + if (unitCode != null) { + lot.setDurationUnit(unitCode); + } + } + } + } + + // Submission deadline + Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess"); + if (tenderingProcessEl != null) { + Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod"); + if (deadlinePeriodEl != null) { + String endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate"); + if (endDate != null) { + String endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime"); + lot.setSubmissionDeadline(parseDateTime(endDate, endTime)); + + if (document.getSubmissionDeadline() == null) { + document.setSubmissionDeadline(lot.getSubmissionDeadline()); + } + } + } + } + + // EU funded + Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms"); + if (tenderingTermsEl != null) { + String fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode"); + lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds")); + } + + document.addLot(lot); + } + + document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded()))); + } + private void parseLots(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException { + NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot"); + 
document.setTotalLots(lotNodes.getLength()); + + for (int i = 0; i < lotNodes.getLength(); i++) { + Node lotNode = lotNodes.item(i); + if (lotNode.getNodeType() != Node.ELEMENT_NODE) { + continue; + } + + Element lotEl = (Element) lotNode; + ProcurementLot lot = ProcurementLot.builder().build(); + + // Fast direct children + Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject"); + Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess"); + Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms"); + + // --- Lot ID --- + String lotId = getDirectChildText(lotEl, NS_CBC, "ID"); + if (lotId == null) { + lotId = getTextContent(xpath, lotNode, "cbc:ID"); + } + lot.setLotId(lotId); + + // --- Internal ID --- + String internalId = null; + if (procurementProjectEl != null) { + internalId = getDirectChildText(procurementProjectEl, NS_CBC, "ID"); + } + if (internalId == null) { + internalId = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:ID"); + } + lot.setInternalId(internalId); + + // --- Title --- + String title = null; + if (procurementProjectEl != null) { + title = getDirectChildText(procurementProjectEl, NS_CBC, "Name"); + } + if (title == null) { + title = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Name"); + } + lot.setTitle(title); + + // --- Description --- + String description = null; + if (procurementProjectEl != null) { + description = getDirectChildText(procurementProjectEl, NS_CBC, "Description"); + } + if (description == null) { + description = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Description"); + } + lot.setDescription(description); + + // --- CPV codes --- + List lotCpvCodes = new ArrayList<>(); + if (procurementProjectEl != null) { + for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) { + String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode"); + if (cpv 
!= null && !cpv.isEmpty()) { + lotCpvCodes.add(cpv); + } + } + } + if (lotCpvCodes.isEmpty()) { + NodeList cpvNodes = getNodes(xpath, lotNode, + ".//cac:MainCommodityClassification/cbc:ItemClassificationCode"); + for (int j = 0; j < cpvNodes.getLength(); j++) { + String cpv = trimToNull(cpvNodes.item(j).getTextContent()); + if (cpv != null) { + lotCpvCodes.add(cpv); + } + } + } + lot.setCpvCodes(lotCpvCodes.toArray(new String[0])); + + // --- NUTS codes --- + List lotNutsCodes = new ArrayList<>(); + if (procurementProjectEl != null) { + for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) { + Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address"); + if (addressEl != null) { + String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode"); + if (nuts != null && !nuts.isEmpty()) { + lotNutsCodes.add(nuts); + } + } + } + } + if (lotNutsCodes.isEmpty()) { + NodeList nutsNodes = getNodes(xpath, lotNode, + ".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode"); + for (int j = 0; j < nutsNodes.getLength(); j++) { + String nuts = trimToNull(nutsNodes.item(j).getTextContent()); + if (nuts != null) { + lotNutsCodes.add(nuts); + } + } + } + lot.setNutsCodes(lotNutsCodes.toArray(new String[0])); + + // --- Duration --- + boolean durationSet = false; + if (procurementProjectEl != null) { + Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod"); + if (plannedPeriodEl != null) { + Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure"); + if (durationEl != null) { + String durationValue = trimToNull(durationEl.getTextContent()); + if (durationValue != null) { + try { + lot.setDurationValue(Double.parseDouble(durationValue)); + } catch (NumberFormatException e) { + log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId()); + } + } + + String unitCode = trimToNull(durationEl.getAttribute("unitCode")); + 
if (unitCode != null) { + lot.setDurationUnit(unitCode); + } + durationSet = true; + } + } + } + if (!durationSet) { + Node durationNode = getNode(xpath, lotNode, + "cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure"); + if (durationNode != null) { + String durationValue = trimToNull(durationNode.getTextContent()); + if (durationValue != null) { + try { + lot.setDurationValue(Double.parseDouble(durationValue)); + } catch (NumberFormatException e) { + log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId()); + } + } + if (durationNode instanceof Element durationEl) { + String unitCode = trimToNull(durationEl.getAttribute("unitCode")); + if (unitCode != null) { + lot.setDurationUnit(unitCode); + } + } + } + } + + // --- Submission deadline --- + String endDate = null; + String endTime = null; + if (tenderingProcessEl != null) { + Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod"); + if (deadlinePeriodEl != null) { + endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate"); + endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime"); + } + } + if (endDate == null) { + endDate = getTextContent(xpath, lotNode, + "cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndDate"); + endTime = getTextContent(xpath, lotNode, + "cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndTime"); + } + if (endDate != null) { + lot.setSubmissionDeadline(parseDateTime(endDate, endTime)); + if (document.getSubmissionDeadline() == null) { + document.setSubmissionDeadline(lot.getSubmissionDeadline()); + } + } + + // --- EU funded --- + String fundingProgramCode = null; + if (tenderingTermsEl != null) { + fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode"); + } + if (fundingProgramCode == null) { + fundingProgramCode = getTextContent(xpath, lotNode, + "cac:TenderingTerms/cbc:FundingProgramCode"); + } + 
lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds")); + + document.addLot(lot); + } + + document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded()))); + } + + private void parseLotsOld(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException { NodeList lotNodes = (NodeList) xpath.evaluate( "//cac:ProcurementProjectLot", doc, XPathConstants.NODESET); @@ -288,7 +643,7 @@ public class XmlParserService { // CPV codes for this lot List lotCpvCodes = new ArrayList<>(); NodeList cpvNodes = (NodeList) xpath.evaluate( - ".//cac:MainCommodityClassification/cbc:ItemClassificationCode", + ".//cac:MainCommodityClassification/cbc:ItemClassificationCode", lotNode, XPathConstants.NODESET); for (int j = 0; j < cpvNodes.getLength(); j++) { lotCpvCodes.add(cpvNodes.item(j).getTextContent()); @@ -298,13 +653,13 @@ public class XmlParserService { // NUTS codes for this lot List lotNutsCodes = new ArrayList<>(); NodeList nutsNodes = (NodeList) xpath.evaluate( - ".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode", + ".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode", lotNode, XPathConstants.NODESET); for (int j = 0; j < nutsNodes.getLength(); j++) { lotNutsCodes.add(nutsNodes.item(j).getTextContent()); } lot.setNutsCodes(lotNutsCodes.toArray(new String[0])); - + // Duration String durationValue = getTextContent(xpath, lotNode, "cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure"); @@ -428,15 +783,10 @@ public class XmlParserService { } // Helper methods - - private String getTextContent(XPath xpath, Object item, String expression) throws XPathExpressionException { - Node node = (Node) xpath.evaluate(expression, item, XPathConstants.NODE); - return node != null ? 
node.getTextContent().trim() : null; - } private List getTextContents(XPath xpath, Object item, String expression) throws XPathExpressionException { List results = new ArrayList<>(); - NodeList nodes = (NodeList) xpath.evaluate(expression, item, XPathConstants.NODESET); + NodeList nodes = getNodes(xpath, item, expression); for (int i = 0; i < nodes.getLength(); i++) { String text = nodes.item(i).getTextContent().trim(); if (!text.isEmpty()) { @@ -447,9 +797,10 @@ public class XmlParserService { } private String getAttributeValue(XPath xpath, Object item, String expression, String attrName) throws XPathExpressionException { - Node node = (Node) xpath.evaluate(expression, item, XPathConstants.NODE); - if (node instanceof Element) { - return ((Element) node).getAttribute(attrName); + Node node = getNode(xpath, item, expression); + if (node instanceof Element element) { + String value = element.getAttribute(attrName); + return trimToNull(value); } return null; } diff --git a/src/main/resources/application-legacy.yml b/src/main/resources/application-legacy.yml index 5b71835..eef1beb 100644 --- a/src/main/resources/application-legacy.yml +++ b/src/main/resources/application-legacy.yml @@ -34,7 +34,7 @@ ted: # Use external HTTP API instead of subprocess use-http-api: true # Embedding service URL - api-url: http://172.20.240.18:8001 + api-url: http://172.20.20.6:8001 # Model name for sentence-transformers model-name: intfloat/multilingual-e5-large # Vector dimensions (must match model output) diff --git a/src/main/resources/application-new.yml b/src/main/resources/application-new.yml index 4a6dcf2..bbd1b58 100644 --- a/src/main/resources/application-new.yml +++ b/src/main/resources/application-new.yml @@ -223,7 +223,7 @@ dip: # ted packages download configuration ted-download: # Enable/disable automatic package download - enabled: true + enabled: false # Base URL for TED Daily Packages base-url: https://ted.europa.eu/packages/daily/ # Download directory for tar.gz files @@ 
-231,7 +231,7 @@ dip: # Start year for downloads start-year: 2026 - # Polling interval (milliseconds) - 2 minutes - poll-interval: 120000 + # Polling interval (milliseconds) - 1 minute + poll-interval: 60000 # Retry interval for tail NOT_FOUND packages - 6 hours not-found-retry-interval: 21600000 # Grace period after year end before a previous-year tail 404 is treated as final @@ -246,6 +246,7 @@ dip: delay-between-downloads: 5000 # Delete tar.gz after ingestion delete-after-ingestion: true + ted: # Phase 3 TED projection configuration projection: # Enable/disable dual-write into the TED projection model on top of DOC.doc_document @@ -254,6 +255,7 @@ dip: startup-backfill-enabled: false # Maximum number of legacy TED documents to backfill during startup startup-backfill-limit: 250 + migration: legacy-audit: # Enable/disable the Wave 1 / Milestone A legacy integrity audit subsystem diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 3d903d2..1581780 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -2,7 +2,7 @@ # Author: Martin.Schweitzer@procon.co.at and claude.ai server: - port: 8885 + port: 8889 servlet: context-path: /api diff --git a/src/main/resources/db/migration/V1__initial_schema.sql b/src/main/resources/db/migration/V1__initial_schema.sql index 04cca68..a39b790 100644 --- a/src/main/resources/db/migration/V1__initial_schema.sql +++ b/src/main/resources/db/migration/V1__initial_schema.sql @@ -3,10 +3,10 @@ -- Description: PostgreSQL schema for storing EU eForms procurement notices with vector search support -- Create TED schema if it doesn't exist -CREATE SCHEMA IF NOT EXISTS TED; +CREATE SCHEMA IF NOT EXISTS ted; -- Set search path to use TED schema -SET search_path TO TED; +SET search_path TO ted; -- Enable required PostgreSQL extensions (wenn Berechtigung vorhanden) -- Falls Extensions nicht erstellt werden können, müssen diese vom DBA manuell erstellt werden diff --git 
a/src/main/resources/db/migration/V20__ted_projection_package_and_legacy_backfill.sql b/src/main/resources/db/migration/V20__ted_projection_package_and_legacy_backfill.sql index 8893fee..437278a 100644 --- a/src/main/resources/db/migration/V20__ted_projection_package_and_legacy_backfill.sql +++ b/src/main/resources/db/migration/V20__ted_projection_package_and_legacy_backfill.sql @@ -9,6 +9,9 @@ CREATE INDEX IF NOT EXISTS idx_ted_notice_projection_package_identifier ALTER TABLE IF EXISTS TED.organization ALTER COLUMN city TYPE TEXT; +ALTER TABLE IF EXISTS TED.organization + ALTER COLUMN company_id TYPE TEXT; + ALTER TABLE IF EXISTS TED.procurement_document ALTER COLUMN buyer_city TYPE TEXT; diff --git a/src/main/resources/db/migration/V21__doc_ted_package_documents_and_relations.sql b/src/main/resources/db/migration/V21__doc_ted_package_documents_and_relations.sql new file mode 100644 index 0000000..87c935d --- /dev/null +++ b/src/main/resources/db/migration/V21__doc_ted_package_documents_and_relations.sql @@ -0,0 +1,95 @@ +SET search_path TO TED, DOC, public; + +WITH legacy_package_map AS ( + SELECT + d.id AS legacy_procurement_document_id, + p.document_id AS child_document_id, + substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{7})') AS package_identifier + FROM TED.procurement_document d + JOIN TED.ted_notice_projection p + ON p.legacy_procurement_document_id = d.id + WHERE substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{7})') IS NOT NULL +), package_documents AS ( + SELECT DISTINCT + l.package_identifier, + 'TED:package:' || l.package_identifier AS business_key, + encode(digest('TED:package:' || l.package_identifier, 'sha256'), 'hex') AS dedup_hash + FROM legacy_package_map l +) +INSERT INTO DOC.doc_document ( + id, visibility, document_type, document_family, status, title, summary, mime_type, business_key, dedup_hash +) +SELECT + gen_random_uuid(), + 'PUBLIC', + 'TED_PACKAGE', + 'PROCUREMENT', + CASE + WHEN 
pkg.download_status = 'COMPLETED' THEN 'CLASSIFIED' + WHEN pkg.download_status IN ('FAILED', 'NOT_FOUND') THEN 'FAILED' + ELSE 'RECEIVED' + END, + 'TED Daily Package ' || pd.package_identifier, + CASE + WHEN pkg.package_identifier IS NULL THEN 'TED daily package ' || pd.package_identifier + ELSE 'TED daily package ' || pd.package_identifier || + ' (status=' || coalesce(pkg.download_status::text, 'UNKNOWN') || + ', xmlFileCount=' || coalesce(pkg.xml_file_count::text, 'null') || + ', processedCount=' || coalesce(pkg.processed_count::text, 'null') || + ', failedCount=' || coalesce(pkg.failed_count::text, 'null') || ')' + END, + 'application/gzip', + pd.business_key, + pd.dedup_hash +FROM package_documents pd +LEFT JOIN TED.ted_daily_package pkg + ON pkg.package_identifier = pd.package_identifier +WHERE NOT EXISTS ( + SELECT 1 + FROM DOC.doc_document existing + WHERE existing.business_key = pd.business_key +); + +UPDATE TED.ted_notice_projection p +SET package_identifier = substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{7})') +FROM TED.procurement_document d +WHERE p.legacy_procurement_document_id = d.id + AND substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{7})') IS NOT NULL + AND p.package_identifier IS DISTINCT FROM substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{7})'); + +WITH legacy_package_map AS ( + SELECT + p.document_id AS child_document_id, + substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{7})') AS package_identifier + FROM TED.procurement_document d + JOIN TED.ted_notice_projection p + ON p.legacy_procurement_document_id = d.id + WHERE substring(coalesce(d.source_path, d.source_filename) from '(20[0-9]{7})') IS NOT NULL +), package_documents AS ( + SELECT + doc.id AS parent_document_id, + substring(doc.business_key from '(20[0-9]{7})') AS package_identifier + FROM DOC.doc_document doc + WHERE doc.document_type = 'TED_PACKAGE' + AND doc.business_key LIKE 'TED:package:%' +) +INSERT 
INTO DOC.doc_relation ( + id, parent_document_id, child_document_id, relation_type, sort_order, relation_metadata +) +SELECT + gen_random_uuid(), + pkg.parent_document_id, + l.child_document_id, + 'CONTAINS', + NULL, + 'packageIdentifier=' || l.package_identifier +FROM legacy_package_map l +JOIN package_documents pkg + ON pkg.package_identifier = l.package_identifier +WHERE NOT EXISTS ( + SELECT 1 + FROM DOC.doc_relation rel + WHERE rel.parent_document_id = pkg.parent_document_id + AND rel.child_document_id = l.child_document_id + AND rel.relation_type = 'CONTAINS' +);