From adc4f2da4344635a475fa88736edb45b6e6ddbd5 Mon Sep 17 00:00:00 2001 From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com> Date: Tue, 17 Mar 2026 15:11:25 +0100 Subject: [PATCH] Refactor phases 3 --- .../PHASE3_TED_PROJECTION_MODEL.md | 46 +++++ ...cumentIntelligencePlatformApplication.java | 4 +- src/main/java/at/procon/dip/README_PHASE3.md | 30 +++ .../dip/domain/ted/entity/TedNoticeLot.java | 95 +++++++++ .../ted/entity/TedNoticeOrganization.java | 90 ++++++++ .../ted/entity/TedNoticeProjection.java | 194 ++++++++++++++++++ .../repository/TedNoticeLotRepository.java | 13 ++ .../TedNoticeOrganizationRepository.java | 13 ++ .../TedNoticeProjectionRepository.java | 15 ++ .../service/TedNoticeProjectionService.java | 181 ++++++++++++++++ .../startup/TedProjectionStartupRunner.java | 51 +++++ .../ted/config/TedProcessorProperties.java | 25 +++ .../BatchDocumentProcessingService.java | 10 +- .../service/DocumentProcessingService.java | 17 +- .../TedPhase2GenericDocumentService.java | 48 ++++- src/main/resources/application.yml | 9 + .../V6__ted_phase3_projection_model.sql | 105 ++++++++++ 17 files changed, 936 insertions(+), 10 deletions(-) create mode 100644 docs/architecture/PHASE3_TED_PROJECTION_MODEL.md create mode 100644 src/main/java/at/procon/dip/README_PHASE3.md create mode 100644 src/main/java/at/procon/dip/domain/ted/entity/TedNoticeLot.java create mode 100644 src/main/java/at/procon/dip/domain/ted/entity/TedNoticeOrganization.java create mode 100644 src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java create mode 100644 src/main/java/at/procon/dip/domain/ted/repository/TedNoticeLotRepository.java create mode 100644 src/main/java/at/procon/dip/domain/ted/repository/TedNoticeOrganizationRepository.java create mode 100644 src/main/java/at/procon/dip/domain/ted/repository/TedNoticeProjectionRepository.java create mode 100644 src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java create mode 100644 
src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java create mode 100644 src/main/resources/db/migration/V6__ted_phase3_projection_model.sql diff --git a/docs/architecture/PHASE3_TED_PROJECTION_MODEL.md b/docs/architecture/PHASE3_TED_PROJECTION_MODEL.md new file mode 100644 index 0000000..c4032f8 --- /dev/null +++ b/docs/architecture/PHASE3_TED_PROJECTION_MODEL.md @@ -0,0 +1,46 @@ +# Phase 3 - TED projection model + +## Goal + +Move TED from being the implicit root data model to being a typed projection on top of the generic +canonical document model. + +## New persistence model + +### Generic root +- `DOC.doc_document` +- `DOC.doc_content` +- `DOC.doc_text_representation` +- `DOC.doc_embedding` + +### TED-specific projection +- `TED.ted_notice_projection` +- `TED.ted_notice_lot` +- `TED.ted_notice_organization` + +## Relationship model + +- one generic `DOC.doc_document` +- zero or one `TED.ted_notice_projection` +- zero to many `TED.ted_notice_lot` +- zero to many `TED.ted_notice_organization` + +The projection also keeps an optional back-reference to the legacy `TED.procurement_document` row to +support incremental migration and validation. + +## Runtime behavior + +When a new TED XML document is imported: +1. it is parsed into the existing legacy `ProcurementDocument` +2. the generic DOC root is ensured/refreshed +3. the primary text representation is ensured +4. if the generic vectorization pipeline is enabled, a pending embedding is ensured +5. the TED structured projection tables are refreshed from the parsed legacy document + +## Why this phase matters + +This is the first phase where TED is explicitly modeled as a document type projection instead of the +platform's canonical root entity. 
That makes the next steps possible: +- generic semantic search across multiple document types +- future non-TED projections +- migration of TED structured search to the new projection tables diff --git a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java index ba2c495..a172f94 100644 --- a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java +++ b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java @@ -18,8 +18,8 @@ import org.springframework.scheduling.annotation.EnableAsync; @SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"}) @EnableAsync //@EnableConfigurationProperties(TedProcessorProperties.class) -@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity"}) -@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository"}) +@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity"}) +@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository"}) public class DocumentIntelligencePlatformApplication { public static void main(String[] args) { diff --git a/src/main/java/at/procon/dip/README_PHASE3.md b/src/main/java/at/procon/dip/README_PHASE3.md new file mode 100644 index 0000000..200f81f --- /dev/null +++ b/src/main/java/at/procon/dip/README_PHASE3.md @@ -0,0 +1,30 @@ +# Phase 3 - TED as a structured projection on the generic document core + +Phase 3 makes TED a proper type-specific projection layered on top of the generic `DOC.doc_document` +root introduced in Phase 1 and the generic vectorization 
model introduced in Phase 2. + +## What is implemented +- `TED.ted_notice_projection` +- `TED.ted_notice_lot` +- `TED.ted_notice_organization` +- `TedNoticeProjectionService` +- optional startup backfill of missing TED projections +- processing flow updated so freshly imported TED notices dual-write to: + - `DOC` generic document/content/representation model + - `TED` structured projection tables + +## Core intent +TED is no longer the root model of the platform. Instead: +- `DOC.doc_document` is the canonical document root +- `TED.ted_notice_projection` holds TED-specific structured metadata +- `TED.ted_notice_lot` and `TED.ted_notice_organization` hold normalized child structures + +## Compatibility +This phase is additive: +- legacy `TED.procurement_document` remains in place +- existing search and API behavior continue to work +- new imports are now representable in both the legacy and new projection model + +## Important limitation +Structured search endpoints still read from the legacy TED model. Moving TED structured reads to the +new projection tables is the next migration step. 
diff --git a/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeLot.java b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeLot.java new file mode 100644 index 0000000..77ab844 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeLot.java @@ -0,0 +1,95 @@ +package at.procon.dip.domain.ted.entity; + +import at.procon.dip.architecture.SchemaNames; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.Table; +import jakarta.persistence.UniqueConstraint; +import java.math.BigDecimal; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import org.hibernate.annotations.JdbcTypeCode; +import org.hibernate.type.SqlTypes; + +@Entity +@Table(schema = SchemaNames.TED, name = "ted_notice_lot", indexes = { + @Index(name = "idx_ted_notice_lot_projection", columnList = "notice_projection_id") +}, uniqueConstraints = { + @UniqueConstraint(name = "uq_ted_notice_lot_projection_lot", columnNames = {"notice_projection_id", "lot_id"}) +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class TedNoticeLot { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY) + @JoinColumn(name = "notice_projection_id", nullable = false) + private TedNoticeProjection noticeProjection; + + @Column(name = "lot_id", nullable = false, length = 50) + private String lotId; + + @Column(name = "internal_id", columnDefinition = "TEXT") + private String internalId; + + @Column(name = "title", 
columnDefinition = "TEXT") + private String title; + + @Column(name = "description", columnDefinition = "TEXT") + private String description; + + @Column(name = "cpv_codes", columnDefinition = "VARCHAR(100)[]") + @JdbcTypeCode(SqlTypes.ARRAY) + private String[] cpvCodes; + + @Column(name = "nuts_codes", columnDefinition = "VARCHAR(20)[]") + @JdbcTypeCode(SqlTypes.ARRAY) + private String[] nutsCodes; + + @Column(name = "estimated_value", precision = 20, scale = 2) + private BigDecimal estimatedValue; + + @Column(name = "estimated_value_currency", length = 3) + private String estimatedValueCurrency; + + @Column(name = "duration_value") + private Double durationValue; + + @Column(name = "duration_unit", length = 20) + private String durationUnit; + + @Column(name = "submission_deadline") + private OffsetDateTime submissionDeadline; + + @Column(name = "eu_funded") + @Builder.Default + private Boolean euFunded = false; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeOrganization.java b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeOrganization.java new file mode 100644 index 0000000..9f63f9c --- /dev/null +++ b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeOrganization.java @@ -0,0 +1,90 @@ +package at.procon.dip.domain.ted.entity; + +import at.procon.dip.architecture.SchemaNames; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.Table; +import 
jakarta.persistence.UniqueConstraint; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +@Entity +@Table(schema = SchemaNames.TED, name = "ted_notice_organization", indexes = { + @Index(name = "idx_ted_notice_org_projection", columnList = "notice_projection_id"), + @Index(name = "idx_ted_notice_org_country", columnList = "country_code") +}, uniqueConstraints = { + @UniqueConstraint(name = "uq_ted_notice_org_projection_ref", columnNames = {"notice_projection_id", "org_reference"}) +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class TedNoticeOrganization { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY) + @JoinColumn(name = "notice_projection_id", nullable = false) + private TedNoticeProjection noticeProjection; + + @Column(name = "org_reference", length = 50) + private String orgReference; + + @Column(name = "role", length = 50) + private String role; + + @Column(name = "name", columnDefinition = "TEXT") + private String name; + + @Column(name = "company_id", length = 1000) + private String companyId; + + @Column(name = "country_code", length = 10) + private String countryCode; + + @Column(name = "city", length = 255) + private String city; + + @Column(name = "postal_code", length = 255) + private String postalCode; + + @Column(name = "street_name", columnDefinition = "TEXT") + private String streetName; + + @Column(name = "nuts_code", length = 10) + private String nutsCode; + + @Column(name = "website_uri", columnDefinition = "TEXT") + private String websiteUri; + + @Column(name = "email", length = 255) + private String email; + + @Column(name = "phone", length = 50) + private String phone; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = 
OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java new file mode 100644 index 0000000..b06dea1 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java @@ -0,0 +1,194 @@ +package at.procon.dip.domain.ted.entity; + +import at.procon.dip.architecture.SchemaNames; +import at.procon.dip.domain.document.entity.Document; +import at.procon.ted.model.entity.ContractNature; +import at.procon.ted.model.entity.NoticeType; +import at.procon.ted.model.entity.ProcedureType; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.OneToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.PreUpdate; +import jakarta.persistence.Table; +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import org.hibernate.annotations.JdbcTypeCode; +import org.hibernate.type.SqlTypes; + +/** + * Phase 3 TED-specific projection that sits on top of the generic DOC document root. 
+ */ +@Entity +@Table(schema = SchemaNames.TED, name = "ted_notice_projection", indexes = { + @Index(name = "idx_ted_proj_document", columnList = "document_id"), + @Index(name = "idx_ted_proj_legacy_doc", columnList = "legacy_procurement_document_id"), + @Index(name = "idx_ted_proj_publication_id", columnList = "publication_id"), + @Index(name = "idx_ted_proj_notice_type", columnList = "notice_type"), + @Index(name = "idx_ted_proj_buyer_country", columnList = "buyer_country_code"), + @Index(name = "idx_ted_proj_publication_date", columnList = "publication_date") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class TedNoticeProjection { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @OneToOne(fetch = FetchType.LAZY) + @JoinColumn(name = "document_id", nullable = false, unique = true) + private Document document; + + @Column(name = "legacy_procurement_document_id", unique = true) + private UUID legacyProcurementDocumentId; + + @Column(name = "notice_id", length = 100) + private String noticeId; + + @Column(name = "publication_id", length = 50) + private String publicationId; + + @Column(name = "notice_url", length = 255) + private String noticeUrl; + + @Column(name = "ojs_id", length = 20) + private String ojsId; + + @Column(name = "contract_folder_id", length = 100) + private String contractFolderId; + + @Enumerated(EnumType.STRING) + @Column(name = "notice_type", nullable = false, length = 50) + @Builder.Default + private NoticeType noticeType = NoticeType.OTHER; + + @Column(name = "notice_subtype_code", length = 10) + private String noticeSubtypeCode; + + @Column(name = "sdk_version", length = 20) + private String sdkVersion; + + @Column(name = "ubl_version", length = 10) + private String ublVersion; + + @Column(name = "language_code", length = 10) + private String languageCode; + + @Column(name = "issue_datetime") + private OffsetDateTime issueDateTime; + + @Column(name = "publication_date") + 
private LocalDate publicationDate;
+
+    @Column(name = "submission_deadline")
+    private OffsetDateTime submissionDeadline;
+
+    @Column(name = "buyer_name", columnDefinition = "TEXT")
+    private String buyerName;
+
+    @Column(name = "buyer_country_code", length = 10)
+    private String buyerCountryCode;
+
+    @Column(name = "buyer_city", length = 255)
+    private String buyerCity;
+
+    @Column(name = "buyer_postal_code", length = 100)
+    private String buyerPostalCode;
+
+    @Column(name = "buyer_nuts_code", length = 10)
+    private String buyerNutsCode;
+
+    @Column(name = "buyer_activity_type", length = 50)
+    private String buyerActivityType;
+
+    @Column(name = "buyer_legal_type", length = 50)
+    private String buyerLegalType;
+
+    @Column(name = "project_title", columnDefinition = "TEXT")
+    private String projectTitle;
+
+    @Column(name = "project_description", columnDefinition = "TEXT")
+    private String projectDescription;
+
+    @Column(name = "internal_reference", length = 500)
+    private String internalReference;
+
+    @Enumerated(EnumType.STRING)
+    @Column(name = "contract_nature", nullable = false, length = 50)
+    @Builder.Default
+    private ContractNature contractNature = ContractNature.UNKNOWN;
+
+    @Enumerated(EnumType.STRING)
+    @Column(name = "procedure_type", length = 50)
+    @Builder.Default
+    private ProcedureType procedureType = ProcedureType.OTHER;
+
+    @Column(name = "cpv_codes", columnDefinition = "VARCHAR(100)[]")
+    @JdbcTypeCode(SqlTypes.ARRAY)
+    private String[] cpvCodes;
+
+    @Column(name = "nuts_codes", columnDefinition = "VARCHAR(20)[]")
+    @JdbcTypeCode(SqlTypes.ARRAY)
+    private String[] nutsCodes;
+
+    @Column(name = "estimated_value", precision = 20, scale = 2)
+    private BigDecimal estimatedValue;
+
+    @Column(name = "estimated_value_currency", length = 3)
+    private String estimatedValueCurrency;
+
+    @Column(name = "total_lots")
+    @Builder.Default
+    private Integer totalLots = 0;
+
+    @Column(name = "max_lots_awarded")
+    private Integer maxLotsAwarded;
+
+    @Column(name = "max_lots_submitted")
+    private Integer maxLotsSubmitted;
+
+    @Column(name = "regulatory_domain", length = 50)
+    private String regulatoryDomain;
+
+    @Column(name = "eu_funded")
+    @Builder.Default
+    private Boolean euFunded = false;
+
+    @Builder.Default
+    @Column(name = "created_at", nullable = false, updatable = false)
+    private OffsetDateTime createdAt = OffsetDateTime.now();
+
+    @Builder.Default
+    @Column(name = "updated_at", nullable = false)
+    private OffsetDateTime updatedAt = OffsetDateTime.now();
+
+    @PrePersist
+    protected void onCreate() {
+        createdAt = OffsetDateTime.now();
+        updatedAt = OffsetDateTime.now();
+    }
+
+    @PreUpdate
+    protected void onUpdate() {
+        updatedAt = OffsetDateTime.now();
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/ted/repository/TedNoticeLotRepository.java b/src/main/java/at/procon/dip/domain/ted/repository/TedNoticeLotRepository.java
new file mode 100644
index 0000000..94c8b0b
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/ted/repository/TedNoticeLotRepository.java
@@ -0,0 +1,18 @@
+package at.procon.dip.domain.ted.repository;
+
+import at.procon.dip.domain.ted.entity.TedNoticeLot;
+import java.util.List;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+/**
+ * Spring Data repository for {@link TedNoticeLot} rows of the Phase 3 TED projection.
+ */
+public interface TedNoticeLotRepository extends JpaRepository<TedNoticeLot, UUID> {
+
+    /** Returns all lots attached to the given notice projection. */
+    List<TedNoticeLot> findByNoticeProjection_Id(UUID noticeProjectionId);
+
+    /** Deletes all lots attached to the given notice projection. */
+    void deleteByNoticeProjection_Id(UUID noticeProjectionId);
+}
diff --git a/src/main/java/at/procon/dip/domain/ted/repository/TedNoticeOrganizationRepository.java b/src/main/java/at/procon/dip/domain/ted/repository/TedNoticeOrganizationRepository.java
new file mode 100644
index 0000000..4e42e31
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/ted/repository/TedNoticeOrganizationRepository.java
@@ -0,0 +1,13 @@
+package at.procon.dip.domain.ted.repository;
+
+import at.procon.dip.domain.ted.entity.TedNoticeOrganization;
+import java.util.List;
+import java.util.UUID;
+import
org.springframework.data.jpa.repository.JpaRepository;
+
+public interface TedNoticeOrganizationRepository extends JpaRepository<TedNoticeOrganization, UUID> {
+    /** Returns all organizations attached to the given notice projection. */
+    List<TedNoticeOrganization> findByNoticeProjection_Id(UUID noticeProjectionId);
+    /** Deletes all organizations attached to the given notice projection. */
+    void deleteByNoticeProjection_Id(UUID noticeProjectionId);
+}
diff --git a/src/main/java/at/procon/dip/domain/ted/repository/TedNoticeProjectionRepository.java b/src/main/java/at/procon/dip/domain/ted/repository/TedNoticeProjectionRepository.java
new file mode 100644
index 0000000..dcc16cc
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/ted/repository/TedNoticeProjectionRepository.java
@@ -0,0 +1,21 @@
+package at.procon.dip.domain.ted.repository;
+
+import at.procon.dip.domain.ted.entity.TedNoticeProjection;
+import java.util.Optional;
+import java.util.UUID;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+/**
+ * Spring Data repository for {@link TedNoticeProjection} rows.
+ */
+public interface TedNoticeProjectionRepository extends JpaRepository<TedNoticeProjection, UUID> {
+
+    /** Finds the projection attached to the given generic DOC document. */
+    Optional<TedNoticeProjection> findByDocument_Id(UUID documentId);
+
+    /** Finds the projection that references the given legacy procurement document. */
+    Optional<TedNoticeProjection> findByLegacyProcurementDocumentId(UUID legacyProcurementDocumentId);
+
+    /** Checks whether a projection already references the given legacy procurement document. */
+    boolean existsByLegacyProcurementDocumentId(UUID legacyProcurementDocumentId);
+}
diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
new file mode 100644
index 0000000..b6e3cb7
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
@@ -0,0 +1,217 @@
+package at.procon.dip.domain.ted.service;
+
+import at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.repository.DocumentRepository;
+import at.procon.dip.domain.ted.entity.TedNoticeLot;
+import at.procon.dip.domain.ted.entity.TedNoticeOrganization;
+import at.procon.dip.domain.ted.entity.TedNoticeProjection;
+import at.procon.dip.domain.ted.repository.TedNoticeLotRepository;
+import at.procon.dip.domain.ted.repository.TedNoticeOrganizationRepository;
+import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
+import at.procon.ted.config.TedProcessorProperties;
+import at.procon.ted.model.entity.Organization;
+import at.procon.ted.model.entity.ProcurementDocument;
+import at.procon.ted.model.entity.ProcurementLot;
+import at.procon.ted.service.TedPhase2GenericDocumentService;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+/**
+ * Phase 3 service that materializes TED-specific structured projections on top of the generic DOC document root.
+ */
+@Service
+@RequiredArgsConstructor
+@Slf4j
+public class TedNoticeProjectionService {
+
+    private final TedProcessorProperties properties;
+    private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
+    private final DocumentRepository documentRepository;
+    private final TedNoticeProjectionRepository projectionRepository;
+    private final TedNoticeLotRepository lotRepository;
+    private final TedNoticeOrganizationRepository organizationRepository;
+
+    /**
+     * Syncs the generic DOC document for the legacy TED document, then creates or refreshes its
+     * TED projection.
+     *
+     * @param legacyDocument parsed legacy TED document
+     * @return id of the projection, or {@code null} when the projection feature is disabled
+     */
+    @Transactional
+    public UUID registerOrRefreshProjection(ProcurementDocument legacyDocument) {
+        if (!properties.getProjection().isEnabled()) {
+            return null;
+        }
+
+        TedPhase2GenericDocumentService.TedGenericDocumentSyncResult syncResult =
+                tedPhase2GenericDocumentService.syncTedDocument(legacyDocument);
+        return registerOrRefreshProjection(legacyDocument, syncResult.documentId());
+    }
+
+    /**
+     * Creates or refreshes the TED projection together with its lots and organizations.
+     *
+     * @param legacyDocument parsed legacy TED document
+     * @param genericDocumentId id of the generic DOC document; when {@code null} it is resolved
+     *        through {@code ensureGenericTedDocument}
+     * @return id of the projection, or {@code null} when the projection feature is disabled
+     * @throws IllegalArgumentException if no DOC document exists for the resolved id
+     */
+    @Transactional
+    public UUID registerOrRefreshProjection(ProcurementDocument legacyDocument, UUID genericDocumentId) {
+        if (!properties.getProjection().isEnabled()) {
+            return null;
+        }
+
+        UUID resolvedDocumentId = genericDocumentId;
+        if (resolvedDocumentId == null) {
+            resolvedDocumentId = tedPhase2GenericDocumentService.ensureGenericTedDocument(legacyDocument);
+        }
+
+        UUID finalResolvedDocumentId = resolvedDocumentId;
+        Document genericDocument = documentRepository.findById(resolvedDocumentId)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown DOC document id: " + finalResolvedDocumentId));
+
+        TedNoticeProjection projection = projectionRepository.findByLegacyProcurementDocumentId(legacyDocument.getId())
+                .or(() -> projectionRepository.findByDocument_Id(genericDocument.getId()))
+                .orElseGet(TedNoticeProjection::new);
+
+        mapProjection(projection, genericDocument, legacyDocument);
+        projection = projectionRepository.save(projection);
+        replaceLots(projection, legacyDocument.getLots());
+        replaceOrganizations(projection, legacyDocument.getOrganizations());
+
+        log.debug("Phase 3 TED projection ensured for legacy {} -> projection {} / doc {}",
+                legacyDocument.getId(), projection.getId(), genericDocument.getId());
+        return projection.getId();
+    }
+
+    /**
+     * Looks up the projection that references the given legacy TED document.
+     *
+     * @param legacyDocumentId id of the legacy {@code ProcurementDocument}
+     * @return the projection, or an empty {@code Optional} if none exists
+     */
+    @Transactional(readOnly = true)
+    public Optional<TedNoticeProjection> findByLegacyProcurementDocumentId(UUID legacyDocumentId) {
+        return projectionRepository.findByLegacyProcurementDocumentId(legacyDocumentId);
+    }
+
+    /** Links the projection to both documents and copies the notice metadata from the legacy document. */
+    private void mapProjection(TedNoticeProjection projection, Document genericDocument, ProcurementDocument legacyDocument) {
+        projection.setDocument(genericDocument);
+        projection.setLegacyProcurementDocumentId(legacyDocument.getId());
+        projection.setNoticeId(legacyDocument.getNoticeId());
+        projection.setPublicationId(legacyDocument.getPublicationId());
+        projection.setNoticeUrl(legacyDocument.getNoticeUrl());
+        projection.setOjsId(legacyDocument.getOjsId());
+        projection.setContractFolderId(legacyDocument.getContractFolderId());
+        projection.setNoticeType(legacyDocument.getNoticeType());
+        projection.setNoticeSubtypeCode(legacyDocument.getNoticeSubtypeCode());
+        projection.setSdkVersion(legacyDocument.getSdkVersion());
+        projection.setUblVersion(legacyDocument.getUblVersion());
+        projection.setLanguageCode(legacyDocument.getLanguageCode());
+        projection.setIssueDateTime(legacyDocument.getIssueDateTime());
+        projection.setPublicationDate(legacyDocument.getPublicationDate());
+        projection.setSubmissionDeadline(legacyDocument.getSubmissionDeadline());
+        projection.setBuyerName(legacyDocument.getBuyerName());
+        projection.setBuyerCountryCode(legacyDocument.getBuyerCountryCode());
+        projection.setBuyerCity(legacyDocument.getBuyerCity());
+        projection.setBuyerPostalCode(legacyDocument.getBuyerPostalCode());
+        projection.setBuyerNutsCode(legacyDocument.getBuyerNutsCode());
+        projection.setBuyerActivityType(legacyDocument.getBuyerActivityType());
+        projection.setBuyerLegalType(legacyDocument.getBuyerLegalType());
+        projection.setProjectTitle(legacyDocument.getProjectTitle());
+        projection.setProjectDescription(legacyDocument.getProjectDescription());
+        projection.setInternalReference(legacyDocument.getInternalReference());
+        projection.setContractNature(legacyDocument.getContractNature());
+        projection.setProcedureType(legacyDocument.getProcedureType());
+        projection.setCpvCodes(copyArray(legacyDocument.getCpvCodes()));
+        projection.setNutsCodes(copyArray(legacyDocument.getNutsCodes()));
+        projection.setEstimatedValue(legacyDocument.getEstimatedValue());
+        projection.setEstimatedValueCurrency(legacyDocument.getEstimatedValueCurrency());
+        projection.setTotalLots(legacyDocument.getTotalLots());
+        projection.setMaxLotsAwarded(legacyDocument.getMaxLotsAwarded());
+        projection.setMaxLotsSubmitted(legacyDocument.getMaxLotsSubmitted());
+        projection.setRegulatoryDomain(legacyDocument.getRegulatoryDomain());
+        projection.setEuFunded(legacyDocument.getEuFunded());
+    }
+
+    /**
+     * Replaces all lots of the projection with copies of the given legacy lots (none when null or empty).
+     */
+    private void replaceLots(TedNoticeProjection projection, List<ProcurementLot> legacyLots) {
+        lotRepository.deleteByNoticeProjection_Id(projection.getId());
+        // Hibernate executes queued INSERTs before DELETEs at flush time. Flush the deletes now so that
+        // re-inserted lots cannot collide with the (notice_projection_id, lot_id) unique constraint.
+        lotRepository.flush();
+        if (legacyLots == null || legacyLots.isEmpty()) {
+            return;
+        }
+
+        List<TedNoticeLot> projectedLots = new ArrayList<>();
+        for (ProcurementLot lot : legacyLots) {
+            projectedLots.add(TedNoticeLot.builder()
+                    .noticeProjection(projection)
+                    .lotId(lot.getLotId())
+                    .internalId(lot.getInternalId())
+                    .title(lot.getTitle())
+                    .description(lot.getDescription())
+                    .cpvCodes(copyArray(lot.getCpvCodes()))
+                    .nutsCodes(copyArray(lot.getNutsCodes()))
+                    .estimatedValue(lot.getEstimatedValue())
+                    .estimatedValueCurrency(lot.getEstimatedValueCurrency())
+                    .durationValue(lot.getDurationValue())
+                    .durationUnit(lot.getDurationUnit())
+                    .submissionDeadline(lot.getSubmissionDeadline())
+                    .euFunded(lot.getEuFunded())
+                    .build());
+        }
+        lotRepository.saveAll(projectedLots);
+    }
+
+    /**
+     * Replaces all organizations of the projection with copies of the given legacy ones (none when null or empty).
+     */
+    private void replaceOrganizations(TedNoticeProjection projection, List<Organization> legacyOrganizations) {
+        organizationRepository.deleteByNoticeProjection_Id(projection.getId());
+        // Same ordering concern as for lots: flush the deletes before inserting rows that may reuse an
+        // existing (notice_projection_id, org_reference) pair.
+        organizationRepository.flush();
+        if (legacyOrganizations == null || legacyOrganizations.isEmpty()) {
+            return;
+        }
+
+        List<TedNoticeOrganization> projectedOrganizations = new ArrayList<>();
+        for (Organization organization : legacyOrganizations) {
+            projectedOrganizations.add(TedNoticeOrganization.builder()
+                    .noticeProjection(projection)
+                    .orgReference(organization.getOrgReference())
+                    .role(organization.getRole())
+                    .name(organization.getName())
+                    .companyId(organization.getCompanyId())
+                    .countryCode(organization.getCountryCode())
+                    .city(organization.getCity())
+                    .postalCode(organization.getPostalCode())
+                    .streetName(organization.getStreetName())
+                    .nutsCode(organization.getNutsCode())
+                    .websiteUri(organization.getWebsiteUri())
+                    .email(organization.getEmail())
+                    .phone(organization.getPhone())
+                    .build());
+        }
+        organizationRepository.saveAll(projectedOrganizations);
+    }
+
+    /** Returns a copy of the array so the projection does not share it with the legacy entity; null stays null. */
+    private String[] copyArray(String[] source) {
+        return source == null ?
null : source.clone();
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java b/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java
new file mode 100644
index 0000000..57b6d5c
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java
@@ -0,0 +1,71 @@
+package at.procon.dip.domain.ted.startup;
+
+import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
+import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
+import at.procon.ted.config.TedProcessorProperties;
+import at.procon.ted.repository.ProcurementDocumentRepository;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.boot.ApplicationArguments;
+import org.springframework.boot.ApplicationRunner;
+import org.springframework.data.domain.PageRequest;
+import org.springframework.data.domain.Sort;
+import org.springframework.stereotype.Component;
+
+/**
+ * Optional startup backfill for Phase 3 TED projections.
+ *
+ * <p>Does nothing unless both the projection feature and its startup backfill are enabled in
+ * {@link TedProcessorProperties}.
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class TedProjectionStartupRunner implements ApplicationRunner {
+
+    private final TedProcessorProperties properties;
+    private final ProcurementDocumentRepository procurementDocumentRepository;
+    private final TedNoticeProjectionRepository projectionRepository;
+    private final TedNoticeProjectionService projectionService;
+
+    /**
+     * Creates missing TED projections for legacy documents, oldest first.
+     *
+     * <p>The configured backfill limit caps the number of projections created per startup, not the
+     * number of legacy documents inspected. Looking only at the first page would stall the backfill
+     * for good as soon as those documents have projections.
+     *
+     * @param args application arguments (unused)
+     */
+    @Override
+    public void run(ApplicationArguments args) {
+        if (!properties.getProjection().isEnabled() || !properties.getProjection().isStartupBackfillEnabled()) {
+            return;
+        }
+
+        int limit = properties.getProjection().getStartupBackfillLimit();
+        log.info("Phase 3 startup backfill enabled - ensuring TED projections for up to {} documents", limit);
+
+        // createdAt alone is not unique; the id tie-breaker keeps page boundaries stable between queries.
+        var sort = Sort.by(Sort.Direction.ASC, "createdAt", "id");
+        int synced = 0;
+        int pageNumber = 0;
+        boolean morePages = true;
+        while (morePages && synced < limit) {
+            var page = procurementDocumentRepository.findAll(PageRequest.of(pageNumber++, limit, sort));
+            for (var legacyDocument : page.getContent()) {
+                if (synced >= limit) {
+                    break;
+                }
+                if (projectionRepository.existsByLegacyProcurementDocumentId(legacyDocument.getId())) {
+                    continue;
+                }
+                projectionService.registerOrRefreshProjection(legacyDocument);
+                synced++;
+            }
+            morePages = page.hasNext();
+        }
+
+        log.info("Phase 3 startup backfill completed - synced {} TED projections", synced);
+    }
+}
diff --git a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java
index aa434c0..e804f5e 100644
--- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java
+++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java
@@ -27,6 +27,7 @@ public class TedProcessorProperties {
     private DownloadProperties download = new DownloadProperties();
     private MailProperties mail = new MailProperties();
     private SolutionBriefProperties solutionBrief = new SolutionBriefProperties();
+    private ProjectionProperties projection = new ProjectionProperties();
 
     /**
      * Input directory configuration for Apache Camel file consumer.
/**
 * Phase 3 TED projection configuration.
 *
 * <p>Exposed through {@code TedProcessorProperties.getProjection()} and read by the document
 * processing services, the Phase 2 generic document bridge and the startup backfill runner.
 */
@Data
public static class ProjectionProperties {

    /**
     * Enable/disable Phase 3 TED structured projection dual-write.
     * When enabled, the processing services refresh the TED projection for each stored document,
     * and the generic DOC sync stays active even if the generic vectorization pipeline is disabled.
     */
    private boolean enabled = true;

    /**
     * Optional startup backfill of missing projections from legacy TED documents.
     * Only has an effect when {@link #enabled} is also {@code true}.
     */
    private boolean startupBackfillEnabled = false;

    /**
     * Maximum number of legacy TED documents to backfill during startup.
     * NOTE(review): {@code @Positive} is presumably only enforced if bean validation cascades into
     * this nested object (e.g. {@code @Valid} on the owning field) - confirm.
     */
    @Positive
    private int startupBackfillLimit = 250;
}
@@ -132,7 +136,11 @@ public class BatchDocumentProcessingService { doc.getSourceFilename(), 0); if (doc.getDocumentHash() != null) { - tedPhase2GenericDocumentService.registerOrRefreshTedDocument(doc); + if (properties.getProjection().isEnabled()) { + tedNoticeProjectionService.registerOrRefreshProjection(doc); + } else if (properties.getVectorization().isGenericPipelineEnabled()) { + tedPhase2GenericDocumentService.registerOrRefreshTedDocument(doc); + } } } diff --git a/src/main/java/at/procon/ted/service/DocumentProcessingService.java b/src/main/java/at/procon/ted/service/DocumentProcessingService.java index dd04db1..64b82ac 100644 --- a/src/main/java/at/procon/ted/service/DocumentProcessingService.java +++ b/src/main/java/at/procon/ted/service/DocumentProcessingService.java @@ -1,5 +1,6 @@ package at.procon.ted.service; +import at.procon.dip.domain.ted.service.TedNoticeProjectionService; import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.event.DocumentSavedEvent; import at.procon.ted.model.entity.*; @@ -37,6 +38,7 @@ public class DocumentProcessingService { private final TedProcessorProperties properties; private final ApplicationEventPublisher eventPublisher; private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService; + private final TedNoticeProjectionService tedNoticeProjectionService; /** * Process an XML document from the file system. 
@@ -88,7 +90,16 @@ public class DocumentProcessingService { "Document parsed and stored successfully", null, filename, (int) (System.currentTimeMillis() - startTime)); - if (properties.getVectorization().isGenericPipelineEnabled()) { + if (properties.getProjection().isEnabled()) { + tedNoticeProjectionService.registerOrRefreshProjection(document); + log.debug("Document saved successfully, Phase 3 TED projection ensured: {}", document.getId()); + + if (!properties.getVectorization().isGenericPipelineEnabled()) { + // Keep legacy vectorization behavior when the generic embedding pipeline is disabled. + eventPublisher.publishEvent(new DocumentSavedEvent(document.getId(), document.getPublicationId())); + log.debug("Document saved successfully, legacy vectorization event published: {}", document.getId()); + } + } else if (properties.getVectorization().isGenericPipelineEnabled()) { tedPhase2GenericDocumentService.registerOrRefreshTedDocument(document); log.debug("Document saved successfully, Phase 2 generic vectorization record ensured: {}", document.getId()); } else { @@ -147,7 +158,9 @@ public class DocumentProcessingService { documentRepository.save(updated); - if (properties.getVectorization().isGenericPipelineEnabled()) { + if (properties.getProjection().isEnabled()) { + tedNoticeProjectionService.registerOrRefreshProjection(updated); + } else if (properties.getVectorization().isGenericPipelineEnabled()) { tedPhase2GenericDocumentService.registerOrRefreshTedDocument(updated); } diff --git a/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java b/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java index 9563578..efbdaf5 100644 --- a/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java +++ b/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java @@ -34,6 +34,8 @@ import org.springframework.transaction.annotation.Transactional; /** * Phase 2 bridge that dual-writes TED documents into the 
generic DOC persistence backbone. + * In Phase 3 the same bridge also becomes the generic root for TED projections even when the + * generic vectorization route is disabled. */ @Service @RequiredArgsConstructor @@ -49,10 +51,28 @@ public class TedPhase2GenericDocumentService { private final DocumentService documentService; private final DocumentEmbeddingService embeddingService; + /** + * Phase 2 compatibility API used by manual trigger flows. Returns the embedding id when the + * generic vectorization pipeline is enabled; otherwise returns {@code null} after ensuring the + * generic document root exists. + */ @Transactional public UUID registerOrRefreshTedDocument(ProcurementDocument tedDocument) { - if (!properties.getVectorization().isGenericPipelineEnabled()) { - return null; + return syncTedDocument(tedDocument).embeddingId(); + } + + /** + * Ensures the generic DOC document exists and is refreshed from the current legacy TED document state. + */ + @Transactional + public UUID ensureGenericTedDocument(ProcurementDocument tedDocument) { + return syncTedDocument(tedDocument).documentId(); + } + + @Transactional + public TedGenericDocumentSyncResult syncTedDocument(ProcurementDocument tedDocument) { + if (!isGenericTedSyncEnabled()) { + return new TedGenericDocumentSyncResult(null, null, null); } Document document = documentRepository.findByDedupHash(tedDocument.getDocumentHash()) @@ -73,10 +93,21 @@ public class TedPhase2GenericDocumentService { ensureTedSource(document, tedDocument); DocumentContent originalContent = ensureOriginalContent(document, tedDocument); DocumentTextRepresentation representation = ensurePrimaryRepresentation(document, originalContent, tedDocument); - DocumentEmbedding embedding = ensurePendingEmbedding(document, representation); - log.debug("Phase 2 DOC bridge ensured generic TED document {} -> embedding {}", document.getId(), embedding.getId()); - return embedding.getId(); + UUID embeddingId = null; + if 
(properties.getVectorization().isGenericPipelineEnabled()) { + DocumentEmbedding embedding = ensurePendingEmbedding(document, representation); + embeddingId = embedding.getId(); + log.debug("Phase 2 DOC bridge ensured generic TED document {} -> embedding {}", document.getId(), embeddingId); + } else { + log.debug("Phase 2 DOC bridge ensured generic TED document {} without embedding queue", document.getId()); + } + + return new TedGenericDocumentSyncResult(document.getId(), embeddingId, representation.getId()); + } + + private boolean isGenericTedSyncEnabled() { + return properties.getVectorization().isGenericPipelineEnabled() || properties.getProjection().isEnabled(); } private Document createGenericDocument(ProcurementDocument tedDocument) { @@ -194,4 +225,11 @@ public class TedPhase2GenericDocumentService { } return "TED:hash:" + tedDocument.getDocumentHash(); } + + public record TedGenericDocumentSyncResult( + UUID documentId, + UUID embeddingId, + UUID primaryRepresentationId + ) { + } } diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 15dcad8..059fe1f 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -195,6 +195,15 @@ ted: # Polling interval for MIME input directory (milliseconds) mime-input-poll-interval: 10000 + # Phase 3 TED projection configuration + projection: + # Enable/disable dual-write into the TED projection model on top of DOC.doc_document + enabled: true + # Optional startup backfill for legacy TED documents without a projection row yet + startup-backfill-enabled: false + # Maximum number of legacy TED documents to backfill during startup + startup-backfill-limit: 250 + # Solution Brief processing configuration solution-brief: # Enable/disable Solution Brief processing diff --git a/src/main/resources/db/migration/V6__ted_phase3_projection_model.sql b/src/main/resources/db/migration/V6__ted_phase3_projection_model.sql new file mode 100644 index 0000000..e4d54be --- 
-- Phase 3: TED becomes a structured projection on top of the generic DOC document root.
-- Additive migration; legacy TED tables remain in place for compatibility.

SET search_path TO TED, DOC, public;

-- One projection row per generic document. Both document_id and legacy_procurement_document_id
-- carry UNIQUE constraints, which already create the backing indexes for lookups on these columns.
CREATE TABLE IF NOT EXISTS TED.ted_notice_projection (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    document_id UUID NOT NULL UNIQUE REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
    legacy_procurement_document_id UUID UNIQUE REFERENCES TED.procurement_document(id) ON DELETE SET NULL,
    notice_id VARCHAR(100),
    publication_id VARCHAR(50),
    notice_url VARCHAR(255),
    ojs_id VARCHAR(20),
    contract_folder_id VARCHAR(100),
    notice_type VARCHAR(50) NOT NULL DEFAULT 'OTHER',
    notice_subtype_code VARCHAR(10),
    sdk_version VARCHAR(20),
    ubl_version VARCHAR(10),
    language_code VARCHAR(10),
    issue_datetime TIMESTAMP WITH TIME ZONE,
    publication_date DATE,
    submission_deadline TIMESTAMP WITH TIME ZONE,
    buyer_name TEXT,
    buyer_country_code VARCHAR(10),
    buyer_city VARCHAR(255),
    buyer_postal_code VARCHAR(100),
    buyer_nuts_code VARCHAR(10),
    buyer_activity_type VARCHAR(50),
    buyer_legal_type VARCHAR(50),
    project_title TEXT,
    project_description TEXT,
    internal_reference VARCHAR(500),
    contract_nature VARCHAR(50) NOT NULL DEFAULT 'UNKNOWN',
    procedure_type VARCHAR(50) DEFAULT 'OTHER',
    cpv_codes VARCHAR(100)[],
    nuts_codes VARCHAR(20)[],
    estimated_value NUMERIC(20,2),
    estimated_value_currency VARCHAR(3),
    total_lots INTEGER NOT NULL DEFAULT 0,
    max_lots_awarded INTEGER,
    max_lots_submitted INTEGER,
    regulatory_domain VARCHAR(50),
    eu_funded BOOLEAN NOT NULL DEFAULT FALSE,
    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_ted_notice_projection_publication_id
    ON TED.ted_notice_projection(publication_id);
CREATE INDEX IF NOT EXISTS idx_ted_notice_projection_notice_type
    ON TED.ted_notice_projection(notice_type);
CREATE INDEX IF NOT EXISTS idx_ted_notice_projection_buyer_country
    ON TED.ted_notice_projection(buyer_country_code);
CREATE INDEX IF NOT EXISTS idx_ted_notice_projection_publication_date
    ON TED.ted_notice_projection(publication_date DESC);
-- No explicit indexes on document_id / legacy_procurement_document_id: the UNIQUE constraints
-- above already provide them, and a second identical index would only add write overhead.

-- Lots belonging to a notice projection; lot_id is unique within one projection.
CREATE TABLE IF NOT EXISTS TED.ted_notice_lot (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    notice_projection_id UUID NOT NULL REFERENCES TED.ted_notice_projection(id) ON DELETE CASCADE,
    lot_id VARCHAR(50) NOT NULL,
    internal_id TEXT,
    title TEXT,
    description TEXT,
    cpv_codes VARCHAR(100)[],
    nuts_codes VARCHAR(20)[],
    estimated_value NUMERIC(20,2),
    estimated_value_currency VARCHAR(3),
    duration_value DOUBLE PRECISION,
    duration_unit VARCHAR(20),
    submission_deadline TIMESTAMP WITH TIME ZONE,
    eu_funded BOOLEAN NOT NULL DEFAULT FALSE,
    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    CONSTRAINT uq_ted_notice_lot_projection_lot UNIQUE (notice_projection_id, lot_id)
);

-- NOTE(review): lookups by notice_projection_id are already served by the leading column of
-- uq_ted_notice_lot_projection_lot; this index is kept for compatibility but may be droppable.
CREATE INDEX IF NOT EXISTS idx_ted_notice_lot_projection
    ON TED.ted_notice_lot(notice_projection_id);

-- Organizations referenced by a notice projection.
-- NOTE(review): org_reference is nullable and PostgreSQL treats NULLs as distinct in UNIQUE
-- constraints by default, so rows without an org_reference are not deduplicated by
-- uq_ted_notice_org_projection_ref - confirm this is intended.
CREATE TABLE IF NOT EXISTS TED.ted_notice_organization (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    notice_projection_id UUID NOT NULL REFERENCES TED.ted_notice_projection(id) ON DELETE CASCADE,
    org_reference VARCHAR(50),
    role VARCHAR(50),
    name TEXT,
    company_id VARCHAR(1000),
    country_code VARCHAR(10),
    city VARCHAR(255),
    postal_code VARCHAR(255),
    street_name TEXT,
    nuts_code VARCHAR(10),
    website_uri TEXT,
    email VARCHAR(255),
    phone VARCHAR(50),
    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    CONSTRAINT uq_ted_notice_org_projection_ref UNIQUE (notice_projection_id, org_reference)
);

-- NOTE(review): also covered by the leading column of uq_ted_notice_org_projection_ref.
CREATE INDEX IF NOT EXISTS idx_ted_notice_org_projection
    ON TED.ted_notice_organization(notice_projection_id);
CREATE INDEX IF NOT EXISTS idx_ted_notice_org_country
    ON TED.ted_notice_organization(country_code);