From 902456001e096f6dd9e6b047011c67b12b85a0fa Mon Sep 17 00:00:00 2001 From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com> Date: Tue, 28 Apr 2026 15:12:38 +0200 Subject: [PATCH] introduced TED Notice Lot documents --- ...beddingClusterSelectionRepositoryImpl.java | 1 - .../dip/domain/document/DocumentType.java | 1 + .../ted/config/TedProjectionProperties.java | 23 ++ .../TedLotDocumentMaterializationService.java | 262 ++++++++++++++++++ .../service/TedNoticeProjectionService.java | 4 +- .../startup/TedProjectionStartupRunner.java | 35 ++- .../ingestion/util/DocumentImportSupport.java | 2 +- .../db/migration/V32__ted_lot_documents.sql | 48 ++++ 8 files changed, 369 insertions(+), 7 deletions(-) create mode 100644 src/main/java/at/procon/dip/domain/ted/service/TedLotDocumentMaterializationService.java create mode 100644 src/main/resources/db/migration/V32__ted_lot_documents.sql diff --git a/src/main/java/at/procon/dip/clustering/repository/DocumentEmbeddingClusterSelectionRepositoryImpl.java b/src/main/java/at/procon/dip/clustering/repository/DocumentEmbeddingClusterSelectionRepositoryImpl.java index 1119d53..2ffe8a6 100644 --- a/src/main/java/at/procon/dip/clustering/repository/DocumentEmbeddingClusterSelectionRepositoryImpl.java +++ b/src/main/java/at/procon/dip/clustering/repository/DocumentEmbeddingClusterSelectionRepositoryImpl.java @@ -46,7 +46,6 @@ public class DocumentEmbeddingClusterSelectionRepositoryImpl implements Document where e.embedding_status = 'COMPLETED' and e.embedding_vector is not null and e.prefix_profile_id is not null - and d. 
"""); MapSqlParameterSource params = new MapSqlParameterSource(); applyFilters(spec, sql, params); diff --git a/src/main/java/at/procon/dip/domain/document/DocumentType.java b/src/main/java/at/procon/dip/domain/document/DocumentType.java index 4f8c1dc..af88563 100644 --- a/src/main/java/at/procon/dip/domain/document/DocumentType.java +++ b/src/main/java/at/procon/dip/domain/document/DocumentType.java @@ -6,6 +6,7 @@ package at.procon.dip.domain.document; public enum DocumentType { TED_PACKAGE, TED_NOTICE, + TED_NOTICE_LOT, TIME_ENTRY, EMAIL, MIME_MESSAGE, diff --git a/src/main/java/at/procon/dip/domain/ted/config/TedProjectionProperties.java b/src/main/java/at/procon/dip/domain/ted/config/TedProjectionProperties.java index 9ae124d..8d60c99 100644 --- a/src/main/java/at/procon/dip/domain/ted/config/TedProjectionProperties.java +++ b/src/main/java/at/procon/dip/domain/ted/config/TedProjectionProperties.java @@ -17,4 +17,27 @@ public class TedProjectionProperties { private int structuredSearchHybridCandidateLimit = 5000; @Positive private int structuredSearchFacetBucketLimit = 12; + private LotDocuments lotDocuments = new LotDocuments(); + + @Data + public static class LotDocuments { + /** + * Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot. + */ + private boolean enabled = false; + /** + * Optional startup/backfill path for notices that were imported before lot documents existed. + */ + private boolean startupBackfillEnabled = false; + @Positive + private int startupBackfillLimit = 1000; + /** + * Queue embeddings whenever the lot semantic text representation is created or changed. + */ + private boolean queueEmbeddings = false; + /** + * Include parent notice project description even when the lot already has its own description. 
+         */
+        private boolean includeProjectDescription = false;
+    }
 }
diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedLotDocumentMaterializationService.java b/src/main/java/at/procon/dip/domain/ted/service/TedLotDocumentMaterializationService.java
new file mode 100644
index 0000000..92addb0
--- /dev/null
+++ b/src/main/java/at/procon/dip/domain/ted/service/TedLotDocumentMaterializationService.java
@@ -0,0 +1,337 @@
+package at.procon.dip.domain.ted.service;
+
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentStatus;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RelationType;
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.entity.DocumentRelation;
+import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
+import at.procon.dip.domain.document.repository.DocumentRelationRepository;
+import at.procon.dip.domain.document.repository.DocumentRepository;
+import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
+import at.procon.dip.domain.document.service.DocumentRepresentationService;
+import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
+import at.procon.dip.domain.ted.config.TedProjectionProperties;
+import at.procon.dip.domain.ted.entity.TedNoticeLot;
+import at.procon.dip.domain.ted.entity.TedNoticeProjection;
+import at.procon.dip.domain.ted.repository.TedNoticeLotRepository;
+import at.procon.dip.embedding.config.EmbeddingProperties;
+import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
+import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
+import at.procon.dip.search.service.DocumentLexicalIndexService;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;
+import java.util.UUID;
+import java.util.stream.Collectors;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+import org.springframework.util.StringUtils;
+
+/**
+ * Materializes one canonical {@code TED_NOTICE_LOT} document per TED lot.
+ *
+ * <p>Each lot document is linked to its parent notice document by a {@code CONTAINS} relation and
+ * carries a {@code SEMANTIC_TEXT} representation built from lot and notice projection fields.
+ * Both public entry points do nothing while {@code properties.getLotDocuments().isEnabled()} is false.
+ */
+@Service
+@RequiredArgsConstructor
+@Slf4j
+public class TedLotDocumentMaterializationService {
+
+    /** Builder key written to every representation produced by this service. */
+    public static final String BUILDER_KEY = "ted-lot-structured-text";
+
+    private final TedProjectionProperties properties;
+    private final TedNoticeLotRepository lotRepository;
+    private final DocumentRepository documentRepository;
+    private final DocumentRelationRepository relationRepository;
+    private final DocumentTextRepresentationRepository representationRepository;
+    private final DocumentRepresentationService documentRepresentationService;
+    private final DocumentLexicalIndexService lexicalIndexService;
+    private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
+    private final EmbeddingProperties embeddingProperties;
+    private final EmbeddingModelRegistry modelRegistry;
+
+    /**
+     * Materializes lot documents for every lot of one notice projection.
+     *
+     * @param projectionId id of the notice projection whose lots are loaded
+     * @return number of lots for which a lot document was written; lots skipped for lack of content
+     *         are not counted, and {@code 0} is returned while the feature is disabled
+     */
+    @Transactional
+    public int materializeProjectionLots(UUID projectionId) {
+        if (!properties.getLotDocuments().isEnabled()) {
+            return 0;
+        }
+        List<TedNoticeLot> lots = lotRepository.findByNoticeProjection_Id(projectionId);
+        int count = 0;
+        for (TedNoticeLot lot : lots) {
+            if (materializeLot(lot.getNoticeProjection(), lot) != null) {
+                count++;
+            }
+        }
+        return count;
+    }
+
+    /**
+     * Materializes lot documents for the given lots of one projection.
+     *
+     * @param projection projection the lots belong to; when {@code null}, each lot's own projection is used
+     * @param lots lots to materialize; {@code null} or empty makes this a no-op
+     */
+    @Transactional
+    public void materializeLots(TedNoticeProjection projection, List<TedNoticeLot> lots) {
+        if (!properties.getLotDocuments().isEnabled() || lots == null || lots.isEmpty()) {
+            return;
+        }
+        for (TedNoticeLot lot : lots) {
+            materializeLot(projection != null ? projection : lot.getNoticeProjection(), lot);
+        }
+    }
+
+    /**
+     * Creates or refreshes the lot document, its relation to the parent and its semantic representation.
+     *
+     * @return the semantic representation, or {@code null} when the lot has no content to represent
+     */
+    private DocumentTextRepresentation materializeLot(TedNoticeProjection projection, TedNoticeLot lot) {
+        Document parent = projection.getDocument();
+        String semanticText = buildSemanticText(projection, lot);
+        if (!StringUtils.hasText(semanticText)) {
+            log.debug("Skipping TED lot document for lot {} / projection {} because semantic text is blank",
+                    lot.getLotId(), projection.getId());
+            return null;
+        }
+
+        String businessKey = buildLotBusinessKey(projection, lot);
+        Document lotDocument = documentRepository.findByBusinessKey(businessKey)
+                .orElseGet(() -> newLotDocument(parent, businessKey));
+        lotDocument.setDocumentType(DocumentType.TED_NOTICE_LOT);
+        lotDocument.setDocumentFamily(DocumentFamily.PROCUREMENT);
+        lotDocument.setStatus(DocumentStatus.REPRESENTED);
+        lotDocument.setTitle(firstNonBlank(lot.getTitle(), projection.getProjectTitle(), businessKey));
+        lotDocument.setSummary(firstNonBlank(lot.getDescription(), projection.getProjectDescription()));
+        lotDocument.setLanguageCode(firstNonBlank(projection.getLanguageCode(), parent.getLanguageCode()));
+        lotDocument.setMimeType("application/x-ted-notice-lot");
+        lotDocument = documentRepository.save(lotDocument);
+
+        ensureRelation(parent, lotDocument, lot);
+        return upsertSemanticRepresentation(lotDocument, semanticText, lotDocument.getLanguageCode());
+    }
+
+    /** Creates an unsaved lot document that inherits owner tenant and visibility from the parent. */
+    private Document newLotDocument(Document parent, String businessKey) {
+        Document document = new Document();
+        document.setOwnerTenant(parent.getOwnerTenant());
+        document.setVisibility(parent.getVisibility());
+        document.setBusinessKey(businessKey);
+        return document;
+    }
+
+    /** Saves a {@code CONTAINS} relation from parent to child unless one already exists. */
+    private void ensureRelation(Document parent, Document child, TedNoticeLot lot) {
+        if (!relationRepository.existsByParentDocument_IdAndChildDocument_IdAndRelationType(parent.getId(), child.getId(), RelationType.CONTAINS)) {
+            relationRepository.save(DocumentRelation.builder()
+                    .parentDocument(parent)
+                    .childDocument(child)
+                    .relationType(RelationType.CONTAINS)
+                    .sortOrder(resolveSortOrder(lot))
+                    .relationMetadata("{\"source\":\"ted-lot-materialization\",\"lotId\":\"" + escapeJson(firstNonBlank(lot.getLotId(), lot.getInternalId(), lot.getId().toString())) + "\"}")
+                    .build());
+        }
+    }
+
+    /**
+     * Creates or updates the semantic text representation of a lot document. When the text, language
+     * or builder key changed and embedding queueing is active, the representation is enqueued.
+     */
+    private DocumentTextRepresentation upsertSemanticRepresentation(Document document, String semanticText, String languageCode) {
+        Optional<DocumentTextRepresentation> existing = representationRepository
+                .findByDocument_IdAndRepresentationType(document.getId(), RepresentationType.SEMANTIC_TEXT)
+                .stream()
+                .filter(r -> BUILDER_KEY.equals(r.getBuilderKey()) || r.isPrimaryRepresentation())
+                .findFirst();
+
+        boolean changed = existing.isEmpty()
+                || !semanticText.equals(existing.get().getTextBody())
+                || !equalsNullable(languageCode, existing.get().getLanguageCode())
+                || !BUILDER_KEY.equals(existing.get().getBuilderKey());
+
+        DocumentTextRepresentation semantic = existing
+                .map(found -> changed ? updateRepresentation(found, semanticText, languageCode) : found)
+                .orElseGet(() -> documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
+                        document.getId(),
+                        null,
+                        RepresentationType.SEMANTIC_TEXT,
+                        BUILDER_KEY,
+                        languageCode,
+                        null,
+                        null,
+                        null,
+                        null,
+                        true,
+                        semanticText,
+                        false
+                )));
+
+        if (changed && shouldQueueEmbeddings()) {
+            String modelKey = modelRegistry.getRequiredDefaultDocumentModelKey();
+            embeddingOrchestrator.enqueueRepresentation(document.getId(), semantic.getId(), modelKey);
+        }
+        return semantic;
+    }
+
+    /** Overwrites an existing representation with the new text, saves it and hands it to the lexical index service. */
+    private DocumentTextRepresentation updateRepresentation(DocumentTextRepresentation existing, String semanticText, String languageCode) {
+        existing.setBuilderKey(BUILDER_KEY);
+        existing.setLanguageCode(languageCode);
+        existing.setPrimaryRepresentation(true);
+        existing.setTextBody(semanticText);
+        existing.setCharCount(semanticText.length());
+        DocumentTextRepresentation saved = representationRepository.saveAndFlush(existing);
+        lexicalIndexService.indexRepresentation(saved.getId());
+        return saved;
+    }
+
+    /** Embeddings are queued only when enabled for lot documents, enabled globally and a default model is set. */
+    private boolean shouldQueueEmbeddings() {
+        return properties.getLotDocuments().isQueueEmbeddings()
+                && embeddingProperties.isEnabled()
+                && StringUtils.hasText(embeddingProperties.getDefaultDocumentModel());
+    }
+
+    /**
+     * Builds the labelled semantic text for a lot from lot fields with notice-level fallbacks.
+     *
+     * @return the text, or an empty string when neither lot nor projection contributes any field
+     */
+    private String buildSemanticText(TedNoticeProjection projection, TedNoticeLot lot) {
+        StringBuilder sb = new StringBuilder(1024);
+        append(sb, "Document type", "TED procurement lot");
+        // The constant header alone is not content; remember its length to detect an empty lot below.
+        int headerLength = sb.length();
+        append(sb, "Lot title", lot.getTitle());
+        append(sb, "Lot description", lot.getDescription());
+        append(sb, "Project title", projection.getProjectTitle());
+
+        if (!StringUtils.hasText(lot.getDescription())) {
+            append(sb, "Project description", projection.getProjectDescription());
+        } else if (properties.getLotDocuments().isIncludeProjectDescription()) {
+            append(sb, "Project context", projection.getProjectDescription());
+        }
+
+        append(sb, "Contract nature", projection.getContractNature() == null ? null : projection.getContractNature().name());
+        append(sb, "Buyer activity", projection.getBuyerActivityType());
+        append(sb, "Buyer country", projection.getBuyerCountryCode());
+        append(sb, "CPV codes", joined(firstNonEmpty(lot.getCpvCodes(), projection.getCpvCodes())));
+        append(sb, "NUTS codes", joined(firstNonEmpty(lot.getNutsCodes(), projection.getNutsCodes())));
+        if (sb.length() == headerLength) {
+            return "";
+        }
+        return sb.toString().trim();
+    }
+
+    /** Appends a labelled paragraph (label, colon, newline, value) after a blank line; blank values are skipped. */
+    private void append(StringBuilder sb, String label, String value) {
+        if (!StringUtils.hasText(value)) {
+            return;
+        }
+        if (sb.length() > 0) {
+            sb.append('\n').append('\n');
+        }
+        sb.append(label).append(':').append('\n').append(value.trim());
+    }
+
+    /** Builds the business key {@code TED_NOTICE_LOT:<notice>:<lot>} from the first non-blank identifiers. */
+    private String buildLotBusinessKey(TedNoticeProjection projection, TedNoticeLot lot) {
+        String noticeKey = firstNonBlank(projection.getPublicationId(), projection.getNoticeId(), projection.getDocument().getId().toString());
+        String lotKey = firstNonBlank(lot.getLotId(), lot.getInternalId(), lot.getId().toString());
+        return "TED_NOTICE_LOT:" + sanitizeKey(noticeKey) + ":" + sanitizeKey(lotKey);
+    }
+
+    /** Derives a sort order from the digits of the lot id or internal id; {@code 0} when none can be parsed. */
+    private int resolveSortOrder(TedNoticeLot lot) {
+        String value = firstNonBlank(lot.getLotId(), lot.getInternalId());
+        if (value != null) {
+            String digits = value.replaceAll("\\D+", "");
+            if (StringUtils.hasText(digits)) {
+                try {
+                    return Integer.parseInt(digits);
+                } catch (NumberFormatException ignored) {
+                    // use default below
+                }
+            }
+        }
+        return 0;
+    }
+
+    private String[] firstNonEmpty(String[] first, String[] fallback) {
+        return first != null && first.length > 0 ? first : fallback;
+    }
+
+    /** Joins the distinct, trimmed, non-blank values with {@code ", "}; {@code null} for a null or empty array. */
+    private String joined(String[] values) {
+        if (values == null || values.length == 0) {
+            return null;
+        }
+        return Arrays.stream(values)
+                .filter(StringUtils::hasText)
+                .map(String::trim)
+                .distinct()
+                .collect(Collectors.joining(", "));
+    }
+
+    private String sanitizeKey(String value) {
+        return value == null ? "unknown" : value.trim().replaceAll("\\s+", "_");
+    }
+
+    /**
+     * Escapes a value for embedding inside a JSON string literal.
+     * Besides quote and backslash, control characters below U+0020 are escaped, as JSON requires.
+     */
+    private String escapeJson(String value) {
+        if (value == null) {
+            return "";
+        }
+        StringBuilder escaped = new StringBuilder(value.length() + 8);
+        for (int i = 0; i < value.length(); i++) {
+            char c = value.charAt(i);
+            switch (c) {
+                case '"' -> escaped.append("\\\"");
+                case '\\' -> escaped.append("\\\\");
+                case '\n' -> escaped.append("\\n");
+                case '\r' -> escaped.append("\\r");
+                case '\t' -> escaped.append("\\t");
+                default -> {
+                    if (c < 0x20) {
+                        escaped.append(String.format("\\u%04x", (int) c));
+                    } else {
+                        escaped.append(c);
+                    }
+                }
+            }
+        }
+        return escaped.toString();
+    }
+
+    private boolean equalsNullable(String left, String right) {
+        return left == null ? right == null : left.equals(right);
+    }
+
+    /** Returns the first value that has text, trimmed, or {@code null} when there is none. */
+    private String firstNonBlank(String... values) {
+        for (String value : values) {
+            if (StringUtils.hasText(value)) {
+                return value.trim();
+            }
+        }
+        return null;
+    }
+}
diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
index 6583cb1..d422eed 100644
--- a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
+++ b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
@@ -39,6 +39,7 @@ public class TedNoticeProjectionService {
     private final TedNoticeProjectionRepository projectionRepository;
     private final TedNoticeLotRepository lotRepository;
     private final TedNoticeOrganizationRepository organizationRepository;
+    private final TedLotDocumentMaterializationService lotDocumentMaterializationService;
 
     @Transactional
     public UUID registerOrRefreshProjection(ProcurementDocument legacyDocument) {
@@ -166,7 +167,8 @@ public class TedNoticeProjectionService {
                     .euFunded(lot.getEuFunded())
                     .build());
         }
-        lotRepository.saveAll(projectedLots);
+        List<TedNoticeLot> savedLots = lotRepository.saveAll(projectedLots);
+
lotDocumentMaterializationService.materializeLots(projection, savedLots);
     }
 
     private void replaceOrganizations(TedNoticeProjection projection, List legacyOrganizations) {
diff --git a/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java b/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java
index 3689518..7d9ae3e 100644
--- a/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java
+++ b/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java
@@ -1,8 +1,9 @@
 package at.procon.dip.domain.ted.startup;
 
-import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
-import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
 import at.procon.dip.domain.ted.config.TedProjectionProperties;
+import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
+import at.procon.dip.domain.ted.service.TedLotDocumentMaterializationService;
+import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
 import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
 import at.procon.dip.runtime.config.RuntimeMode;
 import at.procon.ted.repository.ProcurementDocumentRepository;
@@ -15,7 +16,7 @@ import org.springframework.data.domain.Sort;
 import org.springframework.stereotype.Component;
 
 /**
- * Optional startup backfill for Phase 3 TED projections.
+ * Optional startup backfill for Phase 3 TED projections and optional TED lot documents.
  */
 @Component
 @ConditionalOnRuntimeMode(RuntimeMode.NEW)
@@ -27,13 +28,24 @@ public class TedProjectionStartupRunner implements ApplicationRunner {
     private final ProcurementDocumentRepository procurementDocumentRepository;
     private final TedNoticeProjectionRepository projectionRepository;
     private final TedNoticeProjectionService projectionService;
+    private final TedLotDocumentMaterializationService lotDocumentMaterializationService;
 
     @Override
     public void run(ApplicationArguments args) {
-        if (!properties.isEnabled() || !properties.isStartupBackfillEnabled()) {
+        if (!properties.isEnabled()) {
             return;
         }
 
+        if (properties.isStartupBackfillEnabled()) {
+            backfillNoticeProjections();
+        }
+
+        if (properties.getLotDocuments().isEnabled() && properties.getLotDocuments().isStartupBackfillEnabled()) {
+            backfillLotDocuments();
+        }
+    }
+
+    private void backfillNoticeProjections() {
         int limit = properties.getStartupBackfillLimit();
         log.info("Phase 3 startup backfill enabled - ensuring TED projections for up to {} documents", limit);
 
@@ -51,4 +63,37 @@ public class TedProjectionStartupRunner implements ApplicationRunner {
 
         log.info("Phase 3 startup backfill completed - synced {} TED projections", synced);
     }
+
+    /**
+     * Materializes lot documents for the first page of projections ordered by {@code createdAt}
+     * ascending, with the page size taken from the lot-document startup backfill limit.
+     *
+     * <p>A failure for one projection is logged and skipped so that it does not propagate out of
+     * this optional backfill. NOTE(review): every run reads page 0 only, so projections beyond the
+     * limit are never reached - confirm that this is intended.
+     */
+    private void backfillLotDocuments() {
+        int limit = properties.getLotDocuments().getStartupBackfillLimit();
+        log.info("TED lot document startup backfill enabled - materializing lots for up to {} projections", limit);
+
+        var page = projectionRepository.findAll(
+                PageRequest.of(0, limit, Sort.by(Sort.Direction.ASC, "createdAt")));
+
+        int lotDocuments = 0;
+        int failedProjections = 0;
+        for (var projection : page.getContent()) {
+            try {
+                lotDocuments += lotDocumentMaterializationService.materializeProjectionLots(projection.getId());
+            } catch (RuntimeException e) {
+                // Keep going: one broken projection must not stop the remaining backfill.
+                failedProjections++;
+                log.warn("TED lot document startup backfill failed for projection {}", projection.getId(), e);
+            }
+        }
+
+        log.info("TED lot document startup backfill completed - materialized/updated {} lot documents", lotDocuments);
+        if (failedProjections > 0) {
+            log.warn("TED lot document startup backfill skipped {} projections because of errors", failedProjections);
+        }
+    }
 }
diff --git a/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java
b/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java
index 0adffd3..0794783 100644
--- a/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java
+++ b/src/main/java/at/procon/dip/ingestion/util/DocumentImportSupport.java
@@ -174,7 +174,7 @@ public final class DocumentImportSupport {
 
     public static DocumentFamily familyFor(DocumentType documentType) {
         return switch (documentType) {
-            case TED_PACKAGE, TED_NOTICE -> DocumentFamily.PROCUREMENT;
+            case TED_PACKAGE, TED_NOTICE, TED_NOTICE_LOT -> DocumentFamily.PROCUREMENT;
             case TIME_ENTRY -> DocumentFamily.TIME;
             case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL;
             case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN ->
diff --git a/src/main/resources/db/migration/V32__ted_lot_documents.sql b/src/main/resources/db/migration/V32__ted_lot_documents.sql
new file mode 100644
index 0000000..7d200c6
--- /dev/null
+++ b/src/main/resources/db/migration/V32__ted_lot_documents.sql
@@ -0,0 +1,48 @@
+-- Adds the canonical TED_NOTICE_LOT document type used for per-lot semantic representations.
+-- The lot document stores derived semantic text in DOC.doc_text_representation; no DOC.doc_content row is required.
+
+DO $$
+BEGIN -- Step 1: extend the enum type doc.doc_document_type, but only when that type exists.
+    IF EXISTS (
+        SELECT 1
+        FROM pg_type t
+        JOIN pg_namespace n ON n.oid = t.typnamespace
+        WHERE n.nspname = 'doc'
+          AND t.typname = 'doc_document_type'
+    ) THEN
+        ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TED_NOTICE_LOT'; -- NOTE(review): an enum value added inside a transaction cannot be used before commit; see the partial index below
+    END IF;
+END
+$$;
+
+DO $$
+BEGIN -- Step 2: recreate the check constraint including TED_NOTICE_LOT, only when a constraint of exactly this name exists.
+    IF EXISTS (
+        SELECT 1
+        FROM pg_constraint c
+        JOIN pg_class r ON r.oid = c.conrelid
+        JOIN pg_namespace n ON n.oid = r.relnamespace
+        WHERE n.nspname = 'doc'
+          AND r.relname = 'doc_document'
+          AND c.conname = 'doc_document_document_type_check'
+    ) THEN
+        ALTER TABLE DOC.doc_document DROP CONSTRAINT doc_document_document_type_check;
+        ALTER TABLE DOC.doc_document
+            ADD CONSTRAINT doc_document_document_type_check
+            CHECK (
+                document_type IN ( -- NOTE(review): must list every document type in use; verify against the DocumentType enum
+                    'TED_PACKAGE', 'TED_NOTICE', 'TED_NOTICE_LOT', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
+                    'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'TIME_ENTRY', 'UNKNOWN'
+                )
+            );
+    END IF;
+END
+$$;
+
+CREATE INDEX IF NOT EXISTS idx_doc_document_ted_lot_business_key -- partial index over business_key for TED_NOTICE_LOT rows only
+    ON DOC.doc_document(business_key)
+    WHERE document_type = 'TED_NOTICE_LOT'; -- NOTE(review): if document_type uses the enum type extended above, confirm this literal is usable within the same migration transaction
+
+CREATE INDEX IF NOT EXISTS idx_doc_text_repr_ted_lot_builder -- partial index over rows whose builder_key is 'ted-lot-structured-text'
+    ON DOC.doc_text_representation(builder_key)
+    WHERE builder_key = 'ted-lot-structured-text';