introduced TED Notice Lot documents
This commit is contained in:
parent
39f7c0659e
commit
902456001e
|
|
@ -46,7 +46,6 @@ public class DocumentEmbeddingClusterSelectionRepositoryImpl implements Document
|
|||
where e.embedding_status = 'COMPLETED'
|
||||
and e.embedding_vector is not null
|
||||
and e.prefix_profile_id is not null
|
||||
and d.
|
||||
""");
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
applyFilters(spec, sql, params);
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ package at.procon.dip.domain.document;
|
|||
public enum DocumentType {
|
||||
TED_PACKAGE,
|
||||
TED_NOTICE,
|
||||
TED_NOTICE_LOT,
|
||||
TIME_ENTRY,
|
||||
EMAIL,
|
||||
MIME_MESSAGE,
|
||||
|
|
|
|||
|
|
@ -17,4 +17,27 @@ public class TedProjectionProperties {
|
|||
private int structuredSearchHybridCandidateLimit = 5000;
|
||||
@Positive
|
||||
private int structuredSearchFacetBucketLimit = 12;
|
||||
private LotDocuments lotDocuments = new LotDocuments();
|
||||
|
||||
@Data
|
||||
public static class LotDocuments {
|
||||
/**
|
||||
* Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot.
|
||||
*/
|
||||
private boolean enabled = false;
|
||||
/**
|
||||
* Optional startup/backfill path for notices that were imported before lot documents existed.
|
||||
*/
|
||||
private boolean startupBackfillEnabled = false;
|
||||
@Positive
|
||||
private int startupBackfillLimit = 1000;
|
||||
/**
|
||||
* Queue embeddings whenever the lot semantic text representation is created or changed.
|
||||
*/
|
||||
private boolean queueEmbeddings = false;
|
||||
/**
|
||||
* Include parent notice project description even when the lot already has its own description.
|
||||
*/
|
||||
private boolean includeProjectDescription = false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,262 @@
|
|||
package at.procon.dip.domain.ted.service;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentStatus;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RelationType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.domain.document.entity.DocumentRelation;
|
||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||
import at.procon.dip.domain.document.repository.DocumentRelationRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||
import at.procon.dip.domain.document.service.DocumentRepresentationService;
|
||||
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||
import at.procon.dip.domain.ted.config.TedProjectionProperties;
|
||||
import at.procon.dip.domain.ted.entity.TedNoticeLot;
|
||||
import at.procon.dip.domain.ted.entity.TedNoticeProjection;
|
||||
import at.procon.dip.domain.ted.repository.TedNoticeLotRepository;
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||
import at.procon.dip.search.service.DocumentLexicalIndexService;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Collectors;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class TedLotDocumentMaterializationService {
|
||||
|
||||
public static final String BUILDER_KEY = "ted-lot-structured-text";
|
||||
|
||||
private final TedProjectionProperties properties;
|
||||
private final TedNoticeLotRepository lotRepository;
|
||||
private final DocumentRepository documentRepository;
|
||||
private final DocumentRelationRepository relationRepository;
|
||||
private final DocumentTextRepresentationRepository representationRepository;
|
||||
private final DocumentRepresentationService documentRepresentationService;
|
||||
private final DocumentLexicalIndexService lexicalIndexService;
|
||||
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
||||
private final EmbeddingProperties embeddingProperties;
|
||||
private final EmbeddingModelRegistry modelRegistry;
|
||||
|
||||
@Transactional
|
||||
public int materializeProjectionLots(UUID projectionId) {
|
||||
if (!properties.getLotDocuments().isEnabled()) {
|
||||
return 0;
|
||||
}
|
||||
List<TedNoticeLot> lots = lotRepository.findByNoticeProjection_Id(projectionId);
|
||||
int count = 0;
|
||||
for (TedNoticeLot lot : lots) {
|
||||
materializeLot(lot);
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public void materializeLots(TedNoticeProjection projection, List<TedNoticeLot> lots) {
|
||||
if (!properties.getLotDocuments().isEnabled() || lots == null || lots.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
for (TedNoticeLot lot : lots) {
|
||||
materializeLot(lot);
|
||||
}
|
||||
}
|
||||
|
||||
private DocumentTextRepresentation materializeLot(TedNoticeLot lot) {
|
||||
TedNoticeProjection projection = lot.getNoticeProjection();
|
||||
Document parent = projection.getDocument();
|
||||
String semanticText = buildSemanticText(projection, lot);
|
||||
if (!StringUtils.hasText(semanticText)) {
|
||||
log.debug("Skipping TED lot document for lot {} / projection {} because semantic text is blank", lot.getLotId(), projection.getId());
|
||||
return null;
|
||||
}
|
||||
|
||||
String businessKey = buildLotBusinessKey(projection, lot);
|
||||
Document lotDocument = documentRepository.findByBusinessKey(businessKey)
|
||||
.orElseGet(() -> newLotDocument(parent, businessKey));
|
||||
lotDocument.setDocumentType(DocumentType.TED_NOTICE_LOT);
|
||||
lotDocument.setDocumentFamily(DocumentFamily.PROCUREMENT);
|
||||
lotDocument.setStatus(DocumentStatus.REPRESENTED);
|
||||
lotDocument.setTitle(firstNonBlank(lot.getTitle(), projection.getProjectTitle(), businessKey));
|
||||
lotDocument.setSummary(firstNonBlank(lot.getDescription(), projection.getProjectDescription()));
|
||||
lotDocument.setLanguageCode(firstNonBlank(projection.getLanguageCode(), parent.getLanguageCode()));
|
||||
lotDocument.setMimeType("application/x-ted-notice-lot");
|
||||
lotDocument = documentRepository.save(lotDocument);
|
||||
|
||||
ensureRelation(parent, lotDocument, lot);
|
||||
return upsertSemanticRepresentation(lotDocument, semanticText, lotDocument.getLanguageCode());
|
||||
}
|
||||
|
||||
private Document newLotDocument(Document parent, String businessKey) {
|
||||
Document document = new Document();
|
||||
document.setOwnerTenant(parent.getOwnerTenant());
|
||||
document.setVisibility(parent.getVisibility());
|
||||
document.setBusinessKey(businessKey);
|
||||
return document;
|
||||
}
|
||||
|
||||
private void ensureRelation(Document parent, Document child, TedNoticeLot lot) {
|
||||
if (!relationRepository.existsByParentDocument_IdAndChildDocument_IdAndRelationType(parent.getId(), child.getId(), RelationType.CONTAINS)) {
|
||||
relationRepository.save(DocumentRelation.builder()
|
||||
.parentDocument(parent)
|
||||
.childDocument(child)
|
||||
.relationType(RelationType.CONTAINS)
|
||||
.sortOrder(resolveSortOrder(lot))
|
||||
.relationMetadata("{\"source\":\"ted-lot-materialization\",\"lotId\":\"" + escapeJson(firstNonBlank(lot.getLotId(), lot.getInternalId(), lot.getId().toString())) + "\"}")
|
||||
.build());
|
||||
}
|
||||
}
|
||||
|
||||
private DocumentTextRepresentation upsertSemanticRepresentation(Document document, String semanticText, String languageCode) {
|
||||
Optional<DocumentTextRepresentation> existing = representationRepository
|
||||
.findByDocument_IdAndRepresentationType(document.getId(), RepresentationType.SEMANTIC_TEXT)
|
||||
.stream()
|
||||
.filter(r -> BUILDER_KEY.equals(r.getBuilderKey()) || r.isPrimaryRepresentation())
|
||||
.findFirst();
|
||||
|
||||
boolean changed = existing.isEmpty()
|
||||
|| !semanticText.equals(existing.get().getTextBody())
|
||||
|| !equalsNullable(languageCode, existing.get().getLanguageCode())
|
||||
|| !BUILDER_KEY.equals(existing.get().getBuilderKey());
|
||||
|
||||
DocumentTextRepresentation semantic = existing
|
||||
.map(found -> changed ? updateRepresentation(found, semanticText, languageCode) : found)
|
||||
.orElseGet(() -> documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
|
||||
document.getId(),
|
||||
null,
|
||||
RepresentationType.SEMANTIC_TEXT,
|
||||
BUILDER_KEY,
|
||||
languageCode,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
true,
|
||||
semanticText,
|
||||
false
|
||||
)));
|
||||
|
||||
if (changed && shouldQueueEmbeddings()) {
|
||||
String modelKey = modelRegistry.getRequiredDefaultDocumentModelKey();
|
||||
embeddingOrchestrator.enqueueRepresentation(document.getId(), semantic.getId(), modelKey);
|
||||
}
|
||||
return semantic;
|
||||
}
|
||||
|
||||
private DocumentTextRepresentation updateRepresentation(DocumentTextRepresentation existing, String semanticText, String languageCode) {
|
||||
existing.setBuilderKey(BUILDER_KEY);
|
||||
existing.setLanguageCode(languageCode);
|
||||
existing.setPrimaryRepresentation(true);
|
||||
existing.setTextBody(semanticText);
|
||||
existing.setCharCount(semanticText.length());
|
||||
DocumentTextRepresentation saved = representationRepository.saveAndFlush(existing);
|
||||
lexicalIndexService.indexRepresentation(saved.getId());
|
||||
return saved;
|
||||
}
|
||||
|
||||
private boolean shouldQueueEmbeddings() {
|
||||
return properties.getLotDocuments().isQueueEmbeddings()
|
||||
&& embeddingProperties.isEnabled()
|
||||
&& StringUtils.hasText(embeddingProperties.getDefaultDocumentModel());
|
||||
}
|
||||
|
||||
private String buildSemanticText(TedNoticeProjection projection, TedNoticeLot lot) {
|
||||
StringBuilder sb = new StringBuilder(1024);
|
||||
append(sb, "Document type", "TED procurement lot");
|
||||
append(sb, "Lot title", lot.getTitle());
|
||||
append(sb, "Lot description", lot.getDescription());
|
||||
append(sb, "Project title", projection.getProjectTitle());
|
||||
|
||||
if (!StringUtils.hasText(lot.getDescription())) {
|
||||
append(sb, "Project description", projection.getProjectDescription());
|
||||
} else if (properties.getLotDocuments().isIncludeProjectDescription()) {
|
||||
append(sb, "Project context", projection.getProjectDescription());
|
||||
}
|
||||
|
||||
append(sb, "Contract nature", projection.getContractNature() == null ? null : projection.getContractNature().name());
|
||||
append(sb, "Buyer activity", projection.getBuyerActivityType());
|
||||
append(sb, "Buyer country", projection.getBuyerCountryCode());
|
||||
append(sb, "CPV codes", joined(firstNonEmpty(lot.getCpvCodes(), projection.getCpvCodes())));
|
||||
append(sb, "NUTS codes", joined(firstNonEmpty(lot.getNutsCodes(), projection.getNutsCodes())));
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
private void append(StringBuilder sb, String label, String value) {
|
||||
if (!StringUtils.hasText(value)) {
|
||||
return;
|
||||
}
|
||||
if (sb.length() > 0) {
|
||||
sb.append('\n').append('\n');
|
||||
}
|
||||
sb.append(label).append(':').append('\n').append(value.trim());
|
||||
}
|
||||
|
||||
private String buildLotBusinessKey(TedNoticeProjection projection, TedNoticeLot lot) {
|
||||
String noticeKey = firstNonBlank(projection.getPublicationId(), projection.getNoticeId(), projection.getDocument().getId().toString());
|
||||
String lotKey = firstNonBlank(lot.getLotId(), lot.getInternalId(), lot.getId().toString());
|
||||
return "TED_NOTICE_LOT:" + sanitizeKey(noticeKey) + ":" + sanitizeKey(lotKey);
|
||||
}
|
||||
|
||||
private int resolveSortOrder(TedNoticeLot lot) {
|
||||
String value = firstNonBlank(lot.getLotId(), lot.getInternalId());
|
||||
if (value != null) {
|
||||
String digits = value.replaceAll("\\D+", "");
|
||||
if (StringUtils.hasText(digits)) {
|
||||
try {
|
||||
return Integer.parseInt(digits);
|
||||
} catch (NumberFormatException ignored) {
|
||||
// use default below
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private String[] firstNonEmpty(String[] first, String[] fallback) {
|
||||
return first != null && first.length > 0 ? first : fallback;
|
||||
}
|
||||
|
||||
private String joined(String[] values) {
|
||||
if (values == null || values.length == 0) {
|
||||
return null;
|
||||
}
|
||||
return Arrays.stream(values)
|
||||
.filter(StringUtils::hasText)
|
||||
.map(String::trim)
|
||||
.distinct()
|
||||
.collect(Collectors.joining(", "));
|
||||
}
|
||||
|
||||
private String sanitizeKey(String value) {
|
||||
return value == null ? "unknown" : value.trim().replaceAll("\\s+", "_");
|
||||
}
|
||||
|
||||
private String escapeJson(String value) {
|
||||
return value == null ? "" : value.replace("\\", "\\\\").replace("\"", "\\\"");
|
||||
}
|
||||
|
||||
private boolean equalsNullable(String left, String right) {
|
||||
return left == null ? right == null : left.equals(right);
|
||||
}
|
||||
|
||||
private String firstNonBlank(String... values) {
|
||||
for (String value : values) {
|
||||
if (StringUtils.hasText(value)) {
|
||||
return value.trim();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -39,6 +39,7 @@ public class TedNoticeProjectionService {
|
|||
private final TedNoticeProjectionRepository projectionRepository;
|
||||
private final TedNoticeLotRepository lotRepository;
|
||||
private final TedNoticeOrganizationRepository organizationRepository;
|
||||
private final TedLotDocumentMaterializationService lotDocumentMaterializationService;
|
||||
|
||||
@Transactional
|
||||
public UUID registerOrRefreshProjection(ProcurementDocument legacyDocument) {
|
||||
|
|
@ -166,7 +167,8 @@ public class TedNoticeProjectionService {
|
|||
.euFunded(lot.getEuFunded())
|
||||
.build());
|
||||
}
|
||||
lotRepository.saveAll(projectedLots);
|
||||
List<TedNoticeLot> savedLots = lotRepository.saveAll(projectedLots);
|
||||
lotDocumentMaterializationService.materializeLots(projection, savedLots);
|
||||
}
|
||||
|
||||
private void replaceOrganizations(TedNoticeProjection projection, List<Organization> legacyOrganizations) {
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
package at.procon.dip.domain.ted.startup;
|
||||
|
||||
import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
|
||||
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
||||
import at.procon.dip.domain.ted.config.TedProjectionProperties;
|
||||
import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
|
||||
import at.procon.dip.domain.ted.service.TedLotDocumentMaterializationService;
|
||||
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||
|
|
@ -15,7 +16,7 @@ import org.springframework.data.domain.Sort;
|
|||
import org.springframework.stereotype.Component;
|
||||
|
||||
/**
|
||||
* Optional startup backfill for Phase 3 TED projections.
|
||||
* Optional startup backfill for Phase 3 TED projections and optional TED lot documents.
|
||||
*/
|
||||
@Component
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
|
|
@ -27,13 +28,24 @@ public class TedProjectionStartupRunner implements ApplicationRunner {
|
|||
private final ProcurementDocumentRepository procurementDocumentRepository;
|
||||
private final TedNoticeProjectionRepository projectionRepository;
|
||||
private final TedNoticeProjectionService projectionService;
|
||||
private final TedLotDocumentMaterializationService lotDocumentMaterializationService;
|
||||
|
||||
@Override
|
||||
public void run(ApplicationArguments args) {
|
||||
if (!properties.isEnabled() || !properties.isStartupBackfillEnabled()) {
|
||||
if (!properties.isEnabled()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (properties.isStartupBackfillEnabled()) {
|
||||
backfillNoticeProjections();
|
||||
}
|
||||
|
||||
if (properties.getLotDocuments().isEnabled() && properties.getLotDocuments().isStartupBackfillEnabled()) {
|
||||
backfillLotDocuments();
|
||||
}
|
||||
}
|
||||
|
||||
private void backfillNoticeProjections() {
|
||||
int limit = properties.getStartupBackfillLimit();
|
||||
log.info("Phase 3 startup backfill enabled - ensuring TED projections for up to {} documents", limit);
|
||||
|
||||
|
|
@ -51,4 +63,19 @@ public class TedProjectionStartupRunner implements ApplicationRunner {
|
|||
|
||||
log.info("Phase 3 startup backfill completed - synced {} TED projections", synced);
|
||||
}
|
||||
|
||||
private void backfillLotDocuments() {
|
||||
int limit = properties.getLotDocuments().getStartupBackfillLimit();
|
||||
log.info("TED lot document startup backfill enabled - materializing lots for up to {} projections", limit);
|
||||
|
||||
var page = projectionRepository.findAll(
|
||||
PageRequest.of(0, limit, Sort.by(Sort.Direction.ASC, "createdAt")));
|
||||
|
||||
int lotDocuments = 0;
|
||||
for (var projection : page.getContent()) {
|
||||
lotDocuments += lotDocumentMaterializationService.materializeProjectionLots(projection.getId());
|
||||
}
|
||||
|
||||
log.info("TED lot document startup backfill completed - materialized/updated {} lot documents", lotDocuments);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -174,7 +174,7 @@ public final class DocumentImportSupport {
|
|||
|
||||
public static DocumentFamily familyFor(DocumentType documentType) {
|
||||
return switch (documentType) {
|
||||
case TED_PACKAGE, TED_NOTICE -> DocumentFamily.PROCUREMENT;
|
||||
case TED_PACKAGE, TED_NOTICE, TED_NOTICE_LOT -> DocumentFamily.PROCUREMENT;
|
||||
case TIME_ENTRY -> DocumentFamily.TIME;
|
||||
case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL;
|
||||
case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN ->
|
||||
|
|
|
|||
|
|
@ -0,0 +1,48 @@
|
|||
-- Adds the canonical TED_NOTICE_LOT document type used for per-lot semantic representations.
|
||||
-- The lot document stores derived semantic text in DOC.doc_text_representation; no DOC.doc_content row is required.
|
||||
|
||||
-- Enum-backed installations only: register the new value when the enum type
-- doc.doc_document_type exists; otherwise this block does nothing.
-- NOTE(review): PostgreSQL does not allow a newly added enum value to be used before the
-- adding transaction commits - confirm that the partial index on document_type later in
-- this script does not compare against this enum inside the same migration transaction.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM pg_type t
        WHERE t.typname = 'doc_document_type'
          AND t.typnamespace = (SELECT n.oid FROM pg_namespace n WHERE n.nspname = 'doc')
    ) THEN
        ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TED_NOTICE_LOT';
    END IF;
END
$$;
|
||||
|
||||
-- Check-constraint-backed installations only: when doc.doc_document carries the constraint
-- doc_document_document_type_check, recreate it with TED_NOTICE_LOT in the allowed list.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM pg_constraint c
        WHERE c.conname = 'doc_document_document_type_check'
          AND c.conrelid IN (
              SELECT r.oid
              FROM pg_class r
              JOIN pg_namespace n ON n.oid = r.relnamespace
              WHERE n.nspname = 'doc'
                AND r.relname = 'doc_document'
          )
    ) THEN
        ALTER TABLE DOC.doc_document DROP CONSTRAINT doc_document_document_type_check;
        ALTER TABLE DOC.doc_document
            ADD CONSTRAINT doc_document_document_type_check
            CHECK (
                document_type IN (
                    'TED_PACKAGE', 'TED_NOTICE', 'TED_NOTICE_LOT', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
                    'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'TIME_ENTRY', 'UNKNOWN'
                )
            );
    END IF;
END
$$;
|
||||
|
||||
-- Partial index on the business key, restricted to TED_NOTICE_LOT documents.
CREATE INDEX IF NOT EXISTS idx_doc_document_ted_lot_business_key
    ON DOC.doc_document USING btree (business_key)
    WHERE document_type = 'TED_NOTICE_LOT';

-- Partial index restricted to representations built by 'ted-lot-structured-text'.
-- NOTE(review): the indexed column equals the predicate constant for every indexed row, so
-- the key adds no selectivity - consider indexing a column that is actually filtered on.
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_ted_lot_builder
    ON DOC.doc_text_representation USING btree (builder_key)
    WHERE builder_key = 'ted-lot-structured-text';
|
||||
Loading…
Reference in New Issue