Compare commits
No commits in common. "4bc503ed29230643368460f775c58854e097e030" and "39f7c0659e5fc93a54e1be5fed0b4555526804be" have entirely different histories.
4bc503ed29
...
39f7c0659e
|
|
@ -7,10 +7,10 @@ Included changes:
|
||||||
- Score normalization and result fusion
|
- Score normalization and result fusion
|
||||||
- Generic /search endpoint
|
- Generic /search endpoint
|
||||||
- Lexical index maintenance service and startup backfill runner
|
- Lexical index maintenance service and startup backfill runner
|
||||||
- DOC lexical search migration (V14)
|
- DOC lexical search migration (V9)
|
||||||
- Modified DOC representation write path to refresh search vectors
|
- Modified DOC representation write path to refresh search vectors
|
||||||
|
|
||||||
Important note:
|
Important note:
|
||||||
- Full-text search requires V14__doc_search_slice1_support.sql to be applied.
|
- Full-text search requires V9__doc_search_slice1_support.sql to be applied.
|
||||||
- The lexical index service is guarded and will no-op if the search columns are not yet present.
|
- The lexical index service is guarded and will no-op if the search columns are not yet present.
|
||||||
- Because Flyway is currently disabled in application.yml, apply the migration manually or enable Flyway before using the new search endpoint.
|
- Because Flyway is currently disabled in application.yml, apply the migration manually or enable Flyway before using the new search endpoint.
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ public class DocumentEmbeddingClusterSelectionRepositoryImpl implements Document
|
||||||
where e.embedding_status = 'COMPLETED'
|
where e.embedding_status = 'COMPLETED'
|
||||||
and e.embedding_vector is not null
|
and e.embedding_vector is not null
|
||||||
and e.prefix_profile_id is not null
|
and e.prefix_profile_id is not null
|
||||||
|
and d.
|
||||||
""");
|
""");
|
||||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||||
applyFilters(spec, sql, params);
|
applyFilters(spec, sql, params);
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,6 @@ package at.procon.dip.domain.document;
|
||||||
public enum DocumentType {
|
public enum DocumentType {
|
||||||
TED_PACKAGE,
|
TED_PACKAGE,
|
||||||
TED_NOTICE,
|
TED_NOTICE,
|
||||||
TED_NOTICE_LOT,
|
|
||||||
TIME_ENTRY,
|
TIME_ENTRY,
|
||||||
EMAIL,
|
EMAIL,
|
||||||
MIME_MESSAGE,
|
MIME_MESSAGE,
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,11 @@
|
||||||
package at.procon.dip.domain.document.repository;
|
package at.procon.dip.domain.document.repository;
|
||||||
|
|
||||||
import at.procon.dip.domain.document.RepresentationType;
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
|
||||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
|
||||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import org.springframework.data.domain.Pageable;
|
|
||||||
import org.springframework.data.jpa.repository.JpaRepository;
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
import org.springframework.data.jpa.repository.Query;
|
|
||||||
import org.springframework.data.repository.query.Param;
|
|
||||||
|
|
||||||
public interface DocumentTextRepresentationRepository extends JpaRepository<DocumentTextRepresentation, UUID> {
|
public interface DocumentTextRepresentationRepository extends JpaRepository<DocumentTextRepresentation, UUID> {
|
||||||
|
|
||||||
|
|
@ -25,33 +20,4 @@ public interface DocumentTextRepresentationRepository extends JpaRepository<Docu
|
||||||
long countByRepresentationType(RepresentationType representationType);
|
long countByRepresentationType(RepresentationType representationType);
|
||||||
|
|
||||||
Optional<DocumentTextRepresentation> findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
|
Optional<DocumentTextRepresentation> findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
|
||||||
|
|
||||||
@Query("""
|
|
||||||
SELECT r
|
|
||||||
FROM DocumentTextRepresentation r
|
|
||||||
JOIN r.document d
|
|
||||||
WHERE (:documentType IS NULL OR d.documentType = :documentType)
|
|
||||||
AND (:representationType IS NULL OR r.representationType = :representationType)
|
|
||||||
AND (:builderKey IS NULL OR r.builderKey = :builderKey)
|
|
||||||
AND (:primaryOnly = false OR r.primaryRepresentation = true)
|
|
||||||
AND r.textBody IS NOT NULL
|
|
||||||
AND r.textBody <> ''
|
|
||||||
AND (:includeCompleted = true OR NOT EXISTS (
|
|
||||||
SELECT e.id
|
|
||||||
FROM DocumentEmbedding e
|
|
||||||
WHERE e.representation.id = r.id
|
|
||||||
AND e.model.modelKey = :modelKey
|
|
||||||
AND e.embeddingStatus = :completedStatus
|
|
||||||
))
|
|
||||||
ORDER BY r.createdAt ASC, r.id ASC
|
|
||||||
""")
|
|
||||||
List<DocumentTextRepresentation> findEmbeddingCandidatesByDocumentType(
|
|
||||||
@Param("documentType") DocumentType documentType,
|
|
||||||
@Param("representationType") RepresentationType representationType,
|
|
||||||
@Param("builderKey") String builderKey,
|
|
||||||
@Param("primaryOnly") boolean primaryOnly,
|
|
||||||
@Param("modelKey") String modelKey,
|
|
||||||
@Param("completedStatus") EmbeddingStatus completedStatus,
|
|
||||||
@Param("includeCompleted") boolean includeCompleted,
|
|
||||||
Pageable pageable);
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,52 +1,20 @@
|
||||||
package at.procon.dip.domain.ted.config;
|
package at.procon.dip.domain.ted.config;
|
||||||
|
|
||||||
import jakarta.validation.constraints.Min;
|
|
||||||
import jakarta.validation.constraints.Positive;
|
import jakarta.validation.constraints.Positive;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
import org.springframework.context.annotation.Configuration;
|
import org.springframework.context.annotation.Configuration;
|
||||||
import org.springframework.validation.annotation.Validated;
|
|
||||||
|
|
||||||
@Configuration
|
@Configuration
|
||||||
@ConfigurationProperties(prefix = "dip.ted.projection")
|
@ConfigurationProperties(prefix = "dip.ted.projection")
|
||||||
@Validated
|
|
||||||
@Data
|
@Data
|
||||||
public class TedProjectionProperties {
|
public class TedProjectionProperties {
|
||||||
private boolean enabled = true;
|
private boolean enabled = true;
|
||||||
private boolean startupBackfillEnabled = false;
|
private boolean startupBackfillEnabled = false;
|
||||||
/**
|
@Positive
|
||||||
* Maximum number of legacy TED documents to backfill on startup. 0 means no limit.
|
|
||||||
*/
|
|
||||||
@Min(0)
|
|
||||||
private int startupBackfillLimit = 250;
|
private int startupBackfillLimit = 250;
|
||||||
@Positive
|
@Positive
|
||||||
private int structuredSearchHybridCandidateLimit = 5000;
|
private int structuredSearchHybridCandidateLimit = 5000;
|
||||||
@Positive
|
@Positive
|
||||||
private int structuredSearchFacetBucketLimit = 12;
|
private int structuredSearchFacetBucketLimit = 12;
|
||||||
private LotDocuments lotDocuments = new LotDocuments();
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public static class LotDocuments {
|
|
||||||
/**
|
|
||||||
* Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot.
|
|
||||||
*/
|
|
||||||
private boolean enabled = false;
|
|
||||||
/**
|
|
||||||
* Optional startup/backfill path for notices that were imported before lot documents existed.
|
|
||||||
*/
|
|
||||||
private boolean startupBackfillEnabled = false;
|
|
||||||
/**
|
|
||||||
* Maximum number of projections to backfill on startup. 0 means no limit.
|
|
||||||
*/
|
|
||||||
@Min(0)
|
|
||||||
private int startupBackfillLimit = 1000;
|
|
||||||
/**
|
|
||||||
* Queue embeddings whenever the lot semantic text representation is created or changed.
|
|
||||||
*/
|
|
||||||
private boolean queueEmbeddings = false;
|
|
||||||
/**
|
|
||||||
* Include parent notice project description even when the lot already has its own description.
|
|
||||||
*/
|
|
||||||
private boolean includeProjectDescription = false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,277 +0,0 @@
|
||||||
package at.procon.dip.domain.ted.service;
|
|
||||||
|
|
||||||
import at.procon.dip.domain.document.DocumentFamily;
|
|
||||||
import at.procon.dip.domain.document.DocumentStatus;
|
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
|
||||||
import at.procon.dip.domain.document.RelationType;
|
|
||||||
import at.procon.dip.domain.document.RepresentationType;
|
|
||||||
import at.procon.dip.domain.document.entity.Document;
|
|
||||||
import at.procon.dip.domain.document.entity.DocumentRelation;
|
|
||||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
|
||||||
import at.procon.dip.domain.document.repository.DocumentRelationRepository;
|
|
||||||
import at.procon.dip.domain.document.repository.DocumentRepository;
|
|
||||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
|
||||||
import at.procon.dip.domain.document.service.DocumentRepresentationService;
|
|
||||||
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
|
||||||
import at.procon.dip.domain.ted.config.TedProjectionProperties;
|
|
||||||
import at.procon.dip.domain.ted.entity.TedNoticeLot;
|
|
||||||
import at.procon.dip.domain.ted.entity.TedNoticeProjection;
|
|
||||||
import at.procon.dip.domain.ted.repository.TedNoticeLotRepository;
|
|
||||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
|
||||||
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
|
||||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
|
||||||
import at.procon.dip.search.service.DocumentLexicalIndexService;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.UUID;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
import org.springframework.transaction.annotation.Transactional;
|
|
||||||
import org.springframework.util.StringUtils;
|
|
||||||
|
|
||||||
@Service
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@Slf4j
|
|
||||||
public class TedLotDocumentMaterializationService {
|
|
||||||
|
|
||||||
public static final String BUILDER_KEY = "ted-lot-clustering-text-v1";
|
|
||||||
|
|
||||||
private final TedProjectionProperties properties;
|
|
||||||
private final TedNoticeLotRepository lotRepository;
|
|
||||||
private final DocumentRepository documentRepository;
|
|
||||||
private final DocumentRelationRepository relationRepository;
|
|
||||||
private final DocumentTextRepresentationRepository representationRepository;
|
|
||||||
private final DocumentRepresentationService documentRepresentationService;
|
|
||||||
private final DocumentLexicalIndexService lexicalIndexService;
|
|
||||||
private final RepresentationEmbeddingOrchestrator embeddingOrchestrator;
|
|
||||||
private final EmbeddingProperties embeddingProperties;
|
|
||||||
private final EmbeddingModelRegistry modelRegistry;
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
public int materializeProjectionLots(UUID projectionId) {
|
|
||||||
if (!properties.getLotDocuments().isEnabled()) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
List<TedNoticeLot> lots = lotRepository.findByNoticeProjection_Id(projectionId);
|
|
||||||
int count = 0;
|
|
||||||
for (TedNoticeLot lot : lots) {
|
|
||||||
materializeLot(lot);
|
|
||||||
count++;
|
|
||||||
}
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
public void materializeLots(TedNoticeProjection projection, List<TedNoticeLot> lots) {
|
|
||||||
if (!properties.getLotDocuments().isEnabled() || lots == null || lots.isEmpty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (TedNoticeLot lot : lots) {
|
|
||||||
materializeLot(lot);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private DocumentTextRepresentation materializeLot(TedNoticeLot lot) {
|
|
||||||
TedNoticeProjection projection = lot.getNoticeProjection();
|
|
||||||
Document parent = projection.getDocument();
|
|
||||||
String semanticText = buildSemanticText(projection, lot);
|
|
||||||
if (!StringUtils.hasText(semanticText)) {
|
|
||||||
log.debug("Skipping TED lot document for lot {} / projection {} because semantic text is blank", lot.getLotId(), projection.getId());
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
String businessKey = buildLotBusinessKey(projection, lot);
|
|
||||||
Document lotDocument = documentRepository.findByBusinessKey(businessKey)
|
|
||||||
.orElseGet(() -> newLotDocument(parent, businessKey));
|
|
||||||
lotDocument.setDocumentType(DocumentType.TED_NOTICE_LOT);
|
|
||||||
lotDocument.setDocumentFamily(DocumentFamily.PROCUREMENT);
|
|
||||||
lotDocument.setStatus(DocumentStatus.REPRESENTED);
|
|
||||||
lotDocument.setTitle(firstNonBlank(lot.getTitle(), projection.getProjectTitle(), businessKey));
|
|
||||||
lotDocument.setSummary(firstNonBlank(lot.getDescription(), projection.getProjectDescription()));
|
|
||||||
lotDocument.setLanguageCode(firstNonBlank(projection.getLanguageCode(), parent.getLanguageCode()));
|
|
||||||
lotDocument.setMimeType("application/x-ted-notice-lot");
|
|
||||||
lotDocument = documentRepository.save(lotDocument);
|
|
||||||
|
|
||||||
ensureRelation(parent, lotDocument, lot);
|
|
||||||
return upsertSemanticRepresentation(lotDocument, semanticText, lotDocument.getLanguageCode());
|
|
||||||
}
|
|
||||||
|
|
||||||
private Document newLotDocument(Document parent, String businessKey) {
|
|
||||||
Document document = new Document();
|
|
||||||
document.setOwnerTenant(parent.getOwnerTenant());
|
|
||||||
document.setVisibility(parent.getVisibility());
|
|
||||||
document.setBusinessKey(businessKey);
|
|
||||||
return document;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void ensureRelation(Document parent, Document child, TedNoticeLot lot) {
|
|
||||||
if (!relationRepository.existsByParentDocument_IdAndChildDocument_IdAndRelationType(parent.getId(), child.getId(), RelationType.CONTAINS)) {
|
|
||||||
relationRepository.save(DocumentRelation.builder()
|
|
||||||
.parentDocument(parent)
|
|
||||||
.childDocument(child)
|
|
||||||
.relationType(RelationType.CONTAINS)
|
|
||||||
.sortOrder(resolveSortOrder(lot))
|
|
||||||
.relationMetadata("{\"source\":\"ted-lot-materialization\",\"lotId\":\"" + escapeJson(firstNonBlank(lot.getLotId(), lot.getInternalId(), lot.getId().toString())) + "\"}")
|
|
||||||
.build());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private DocumentTextRepresentation upsertSemanticRepresentation(Document document, String semanticText, String languageCode) {
|
|
||||||
Optional<DocumentTextRepresentation> existing = representationRepository
|
|
||||||
.findByDocument_IdAndRepresentationType(document.getId(), RepresentationType.SEMANTIC_TEXT)
|
|
||||||
.stream()
|
|
||||||
.filter(r -> BUILDER_KEY.equals(r.getBuilderKey()) || r.isPrimaryRepresentation())
|
|
||||||
.findFirst();
|
|
||||||
|
|
||||||
boolean changed = existing.isEmpty()
|
|
||||||
|| !semanticText.equals(existing.get().getTextBody())
|
|
||||||
|| !equalsNullable(languageCode, existing.get().getLanguageCode())
|
|
||||||
|| !BUILDER_KEY.equals(existing.get().getBuilderKey());
|
|
||||||
|
|
||||||
DocumentTextRepresentation semantic = existing
|
|
||||||
.map(found -> changed ? updateRepresentation(found, semanticText, languageCode) : found)
|
|
||||||
.orElseGet(() -> documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
|
|
||||||
document.getId(),
|
|
||||||
null,
|
|
||||||
RepresentationType.SEMANTIC_TEXT,
|
|
||||||
BUILDER_KEY,
|
|
||||||
languageCode,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
true,
|
|
||||||
semanticText,
|
|
||||||
false
|
|
||||||
)));
|
|
||||||
|
|
||||||
if (changed && shouldQueueEmbeddings()) {
|
|
||||||
String modelKey = modelRegistry.getRequiredDefaultDocumentModelKey();
|
|
||||||
embeddingOrchestrator.enqueueRepresentation(document.getId(), semantic.getId(), modelKey);
|
|
||||||
}
|
|
||||||
return semantic;
|
|
||||||
}
|
|
||||||
|
|
||||||
private DocumentTextRepresentation updateRepresentation(DocumentTextRepresentation existing, String semanticText, String languageCode) {
|
|
||||||
existing.setBuilderKey(BUILDER_KEY);
|
|
||||||
existing.setLanguageCode(languageCode);
|
|
||||||
existing.setPrimaryRepresentation(true);
|
|
||||||
existing.setTextBody(semanticText);
|
|
||||||
existing.setCharCount(semanticText.length());
|
|
||||||
DocumentTextRepresentation saved = representationRepository.saveAndFlush(existing);
|
|
||||||
lexicalIndexService.indexRepresentation(saved.getId());
|
|
||||||
return saved;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean shouldQueueEmbeddings() {
|
|
||||||
return properties.getLotDocuments().isQueueEmbeddings()
|
|
||||||
&& embeddingProperties.isEnabled()
|
|
||||||
&& StringUtils.hasText(embeddingProperties.getDefaultDocumentModel());
|
|
||||||
}
|
|
||||||
|
|
||||||
private String buildSemanticText(TedNoticeProjection projection, TedNoticeLot lot) {
|
|
||||||
StringBuilder sb = new StringBuilder(1024);
|
|
||||||
append(sb, "Entity type", "Procurement lot");
|
|
||||||
append(sb, "Procurement scope", joinedLines(lot.getTitle(), lot.getDescription()));
|
|
||||||
append(sb, "Procurement category", joined(firstNonEmpty(lot.getCpvCodes(), projection.getCpvCodes())));
|
|
||||||
append(sb, "Relevant domain", projection.getBuyerActivityType());
|
|
||||||
append(sb, "Contract type", projection.getContractNature() == null ? null : projection.getContractNature().name());
|
|
||||||
append(sb, "Geographic context", joined(firstNonEmpty(lot.getNutsCodes(), projection.getNutsCodes()), projection.getBuyerCountryCode()));
|
|
||||||
|
|
||||||
if (!StringUtils.hasText(lot.getDescription())) {
|
|
||||||
append(sb, "Parent notice context", joinedLines(projection.getProjectTitle(), projection.getProjectDescription()));
|
|
||||||
} else if (properties.getLotDocuments().isIncludeProjectDescription()) {
|
|
||||||
append(sb, "Parent notice context", joinedLines(projection.getProjectTitle(), projection.getProjectDescription()));
|
|
||||||
} else {
|
|
||||||
append(sb, "Parent notice context", projection.getProjectTitle());
|
|
||||||
}
|
|
||||||
return sb.toString().trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void append(StringBuilder sb, String label, String value) {
|
|
||||||
if (!StringUtils.hasText(value)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (sb.length() > 0) {
|
|
||||||
sb.append('\n').append('\n');
|
|
||||||
}
|
|
||||||
sb.append(label).append(':').append('\n').append(value.trim());
|
|
||||||
}
|
|
||||||
|
|
||||||
private String buildLotBusinessKey(TedNoticeProjection projection, TedNoticeLot lot) {
|
|
||||||
String noticeKey = firstNonBlank(projection.getPublicationId(), projection.getNoticeId(), projection.getDocument().getId().toString());
|
|
||||||
String lotKey = firstNonBlank(lot.getLotId(), lot.getInternalId(), lot.getId().toString());
|
|
||||||
return "TED_NOTICE_LOT:" + sanitizeKey(noticeKey) + ":" + sanitizeKey(lotKey);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int resolveSortOrder(TedNoticeLot lot) {
|
|
||||||
String value = firstNonBlank(lot.getLotId(), lot.getInternalId());
|
|
||||||
if (value != null) {
|
|
||||||
String digits = value.replaceAll("\\D+", "");
|
|
||||||
if (StringUtils.hasText(digits)) {
|
|
||||||
try {
|
|
||||||
return Integer.parseInt(digits);
|
|
||||||
} catch (NumberFormatException ignored) {
|
|
||||||
// use default below
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String[] firstNonEmpty(String[] first, String[] fallback) {
|
|
||||||
return first != null && first.length > 0 ? first : fallback;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String joined(String[] values) {
|
|
||||||
if (values == null || values.length == 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return Arrays.stream(values)
|
|
||||||
.filter(StringUtils::hasText)
|
|
||||||
.map(String::trim)
|
|
||||||
.distinct()
|
|
||||||
.collect(Collectors.joining(", "));
|
|
||||||
}
|
|
||||||
|
|
||||||
private String joined(String[] values, String fallback) {
|
|
||||||
String joined = joined(values);
|
|
||||||
if (StringUtils.hasText(joined)) {
|
|
||||||
return joined;
|
|
||||||
}
|
|
||||||
return fallback;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String joinedLines(String... values) {
|
|
||||||
String joined = Arrays.stream(values)
|
|
||||||
.filter(StringUtils::hasText)
|
|
||||||
.map(String::trim)
|
|
||||||
.distinct()
|
|
||||||
.collect(Collectors.joining("\n"));
|
|
||||||
return StringUtils.hasText(joined) ? joined : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String sanitizeKey(String value) {
|
|
||||||
return value == null ? "unknown" : value.trim().replaceAll("\\s+", "_");
|
|
||||||
}
|
|
||||||
|
|
||||||
private String escapeJson(String value) {
|
|
||||||
return value == null ? "" : value.replace("\\", "\\\\").replace("\"", "\\\"");
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean equalsNullable(String left, String right) {
|
|
||||||
return left == null ? right == null : left.equals(right);
|
|
||||||
}
|
|
||||||
|
|
||||||
private String firstNonBlank(String... values) {
|
|
||||||
for (String value : values) {
|
|
||||||
if (StringUtils.hasText(value)) {
|
|
||||||
return value.trim();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -39,7 +39,6 @@ public class TedNoticeProjectionService {
|
||||||
private final TedNoticeProjectionRepository projectionRepository;
|
private final TedNoticeProjectionRepository projectionRepository;
|
||||||
private final TedNoticeLotRepository lotRepository;
|
private final TedNoticeLotRepository lotRepository;
|
||||||
private final TedNoticeOrganizationRepository organizationRepository;
|
private final TedNoticeOrganizationRepository organizationRepository;
|
||||||
private final TedLotDocumentMaterializationService lotDocumentMaterializationService;
|
|
||||||
|
|
||||||
@Transactional
|
@Transactional
|
||||||
public UUID registerOrRefreshProjection(ProcurementDocument legacyDocument) {
|
public UUID registerOrRefreshProjection(ProcurementDocument legacyDocument) {
|
||||||
|
|
@ -167,8 +166,7 @@ public class TedNoticeProjectionService {
|
||||||
.euFunded(lot.getEuFunded())
|
.euFunded(lot.getEuFunded())
|
||||||
.build());
|
.build());
|
||||||
}
|
}
|
||||||
List<TedNoticeLot> savedLots = lotRepository.saveAll(projectedLots);
|
lotRepository.saveAll(projectedLots);
|
||||||
lotDocumentMaterializationService.materializeLots(projection, savedLots);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void replaceOrganizations(TedNoticeProjection projection, List<Organization> legacyOrganizations) {
|
private void replaceOrganizations(TedNoticeProjection projection, List<Organization> legacyOrganizations) {
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,8 @@
|
||||||
package at.procon.dip.domain.ted.startup;
|
package at.procon.dip.domain.ted.startup;
|
||||||
|
|
||||||
import at.procon.dip.domain.ted.config.TedProjectionProperties;
|
|
||||||
import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
|
import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
|
||||||
import at.procon.dip.domain.ted.service.TedLotDocumentMaterializationService;
|
|
||||||
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
||||||
|
import at.procon.dip.domain.ted.config.TedProjectionProperties;
|
||||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||||
import at.procon.dip.runtime.config.RuntimeMode;
|
import at.procon.dip.runtime.config.RuntimeMode;
|
||||||
import at.procon.ted.repository.ProcurementDocumentRepository;
|
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||||
|
|
@ -16,7 +15,7 @@ import org.springframework.data.domain.Sort;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Optional startup backfill for Phase 3 TED projections and optional TED lot documents.
|
* Optional startup backfill for Phase 3 TED projections.
|
||||||
*/
|
*/
|
||||||
@Component
|
@Component
|
||||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||||
|
|
@ -24,99 +23,32 @@ import org.springframework.stereotype.Component;
|
||||||
@Slf4j
|
@Slf4j
|
||||||
public class TedProjectionStartupRunner implements ApplicationRunner {
|
public class TedProjectionStartupRunner implements ApplicationRunner {
|
||||||
|
|
||||||
private static final int STARTUP_BACKFILL_BATCH_SIZE = 1000;
|
|
||||||
|
|
||||||
private final TedProjectionProperties properties;
|
private final TedProjectionProperties properties;
|
||||||
private final ProcurementDocumentRepository procurementDocumentRepository;
|
private final ProcurementDocumentRepository procurementDocumentRepository;
|
||||||
private final TedNoticeProjectionRepository projectionRepository;
|
private final TedNoticeProjectionRepository projectionRepository;
|
||||||
private final TedNoticeProjectionService projectionService;
|
private final TedNoticeProjectionService projectionService;
|
||||||
private final TedLotDocumentMaterializationService lotDocumentMaterializationService;
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run(ApplicationArguments args) {
|
public void run(ApplicationArguments args) {
|
||||||
if (!properties.isEnabled()) {
|
if (!properties.isEnabled() || !properties.isStartupBackfillEnabled()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (properties.isStartupBackfillEnabled()) {
|
|
||||||
backfillNoticeProjections();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (properties.getLotDocuments().isEnabled() && properties.getLotDocuments().isStartupBackfillEnabled()) {
|
|
||||||
backfillLotDocuments();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void backfillNoticeProjections() {
|
|
||||||
int limit = properties.getStartupBackfillLimit();
|
int limit = properties.getStartupBackfillLimit();
|
||||||
log.info("Phase 3 startup backfill enabled - ensuring TED projections for {} documents", describeLimit(limit));
|
log.info("Phase 3 startup backfill enabled - ensuring TED projections for up to {} documents", limit);
|
||||||
|
|
||||||
|
var page = procurementDocumentRepository.findAll(
|
||||||
|
PageRequest.of(0, limit, Sort.by(Sort.Direction.ASC, "createdAt")));
|
||||||
|
|
||||||
int synced = 0;
|
int synced = 0;
|
||||||
int processed = 0;
|
for (var legacyDocument : page.getContent()) {
|
||||||
int pageNumber = 0;
|
if (projectionRepository.existsByLegacyProcurementDocumentId(legacyDocument.getId())) {
|
||||||
Sort sort = Sort.by(Sort.Direction.ASC, "createdAt");
|
continue;
|
||||||
|
|
||||||
while (limit <= 0 || processed < limit) {
|
|
||||||
int pageSize = pageSizeFor(limit, processed);
|
|
||||||
var page = procurementDocumentRepository.findAll(PageRequest.of(pageNumber++, pageSize, sort));
|
|
||||||
if (page.isEmpty()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (var legacyDocument : page.getContent()) {
|
|
||||||
processed++;
|
|
||||||
if (projectionRepository.existsByLegacyProcurementDocumentId(legacyDocument.getId())) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
projectionService.registerOrRefreshProjection(legacyDocument);
|
|
||||||
synced++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!page.hasNext()) {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
projectionService.registerOrRefreshProjection(legacyDocument);
|
||||||
|
synced++;
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Phase 3 startup backfill completed - synced {} TED projections", synced);
|
log.info("Phase 3 startup backfill completed - synced {} TED projections", synced);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void backfillLotDocuments() {
|
|
||||||
int limit = properties.getLotDocuments().getStartupBackfillLimit();
|
|
||||||
log.info("TED lot document startup backfill enabled - materializing lots for {} projections", describeLimit(limit));
|
|
||||||
|
|
||||||
int lotDocuments = 0;
|
|
||||||
int processed = 0;
|
|
||||||
int pageNumber = 0;
|
|
||||||
Sort sort = Sort.by(Sort.Direction.ASC, "createdAt");
|
|
||||||
|
|
||||||
while (limit <= 0 || processed < limit) {
|
|
||||||
int pageSize = pageSizeFor(limit, processed);
|
|
||||||
var page = projectionRepository.findAll(PageRequest.of(pageNumber++, pageSize, sort));
|
|
||||||
if (page.isEmpty()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (var projection : page.getContent()) {
|
|
||||||
processed++;
|
|
||||||
lotDocuments += lotDocumentMaterializationService.materializeProjectionLots(projection.getId());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!page.hasNext()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
log.info("TED lot document startup backfill completed - materialized/updated {} lot documents", lotDocuments);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int pageSizeFor(int limit, int processed) {
|
|
||||||
if (limit <= 0) {
|
|
||||||
return STARTUP_BACKFILL_BATCH_SIZE;
|
|
||||||
}
|
|
||||||
return Math.min(STARTUP_BACKFILL_BATCH_SIZE, limit - processed);
|
|
||||||
}
|
|
||||||
|
|
||||||
private String describeLimit(int limit) {
|
|
||||||
return limit > 0 ? "up to " + limit : "all available";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,6 @@
|
||||||
package at.procon.dip.embedding.config;
|
package at.procon.dip.embedding.config;
|
||||||
|
|
||||||
import at.procon.dip.domain.document.DistanceMetric;
|
import at.procon.dip.domain.document.DistanceMetric;
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
|
||||||
import at.procon.dip.domain.document.RepresentationType;
|
|
||||||
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
|
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
|
|
@ -23,7 +21,6 @@ public class EmbeddingProperties {
|
||||||
private Map<String, ModelProperties> models = new LinkedHashMap<>();
|
private Map<String, ModelProperties> models = new LinkedHashMap<>();
|
||||||
private IndexingProperties indexing = new IndexingProperties();
|
private IndexingProperties indexing = new IndexingProperties();
|
||||||
private JobsProperties jobs = new JobsProperties();
|
private JobsProperties jobs = new JobsProperties();
|
||||||
private StartupProperties startup = new StartupProperties();
|
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
public static class ProviderProperties {
|
public static class ProviderProperties {
|
||||||
|
|
@ -82,18 +79,4 @@ public class EmbeddingProperties {
|
||||||
private Duration maxRetryDelay = Duration.ofHours(6);
|
private Duration maxRetryDelay = Duration.ofHours(6);
|
||||||
private long schedulerDelayMs = 5000;
|
private long schedulerDelayMs = 5000;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Data
|
|
||||||
public static class StartupProperties {
|
|
||||||
private boolean enqueueMissingEnabled = false;
|
|
||||||
private boolean processReadyEnabled = false;
|
|
||||||
private DocumentType documentType;
|
|
||||||
private RepresentationType representationType;
|
|
||||||
private String builderKey;
|
|
||||||
private boolean primaryOnly = false;
|
|
||||||
private String modelKey;
|
|
||||||
private boolean force = false;
|
|
||||||
private int batchSize = 1000;
|
|
||||||
private long maxRepresentationsPerRun = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,17 +0,0 @@
|
||||||
package at.procon.dip.embedding.service;
|
|
||||||
|
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
|
||||||
import at.procon.dip.domain.document.RepresentationType;
|
|
||||||
|
|
||||||
public record ScopedEmbeddingEnqueueResult(
|
|
||||||
DocumentType documentType,
|
|
||||||
RepresentationType representationType,
|
|
||||||
String builderKey,
|
|
||||||
boolean primaryOnly,
|
|
||||||
String modelKey,
|
|
||||||
boolean force,
|
|
||||||
int requestedLimit,
|
|
||||||
int matchedRepresentations,
|
|
||||||
int jobsQueuedOrAlreadyActive
|
|
||||||
) {
|
|
||||||
}
|
|
||||||
|
|
@ -1,91 +0,0 @@
|
||||||
package at.procon.dip.embedding.service;
|
|
||||||
|
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
|
||||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
|
||||||
import at.procon.dip.domain.document.RepresentationType;
|
|
||||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
|
||||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
|
||||||
import at.procon.dip.embedding.job.service.EmbeddingJobService;
|
|
||||||
import at.procon.dip.embedding.model.EmbeddingJobType;
|
|
||||||
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
|
||||||
import java.util.List;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import org.springframework.data.domain.PageRequest;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
import org.springframework.transaction.annotation.Transactional;
|
|
||||||
import org.springframework.util.StringUtils;
|
|
||||||
|
|
||||||
@Service
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class ScopedEmbeddingEnqueueService {
|
|
||||||
|
|
||||||
private static final int MAX_LIMIT = 10_000;
|
|
||||||
|
|
||||||
private final DocumentTextRepresentationRepository representationRepository;
|
|
||||||
private final EmbeddingJobService jobService;
|
|
||||||
private final EmbeddingModelRegistry modelRegistry;
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
public ScopedEmbeddingEnqueueResult enqueueByDocumentType(DocumentType documentType,
|
|
||||||
RepresentationType representationType,
|
|
||||||
String builderKey,
|
|
||||||
boolean primaryOnly,
|
|
||||||
String modelKey,
|
|
||||||
boolean force,
|
|
||||||
int limit) {
|
|
||||||
return enqueueByDocumentType(documentType, representationType, builderKey, primaryOnly, modelKey, force, 0, limit);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Transactional
|
|
||||||
public ScopedEmbeddingEnqueueResult enqueueByDocumentType(DocumentType documentType,
|
|
||||||
RepresentationType representationType,
|
|
||||||
String builderKey,
|
|
||||||
boolean primaryOnly,
|
|
||||||
String modelKey,
|
|
||||||
boolean force,
|
|
||||||
int pageNumber,
|
|
||||||
int limit) {
|
|
||||||
String effectiveModelKey = StringUtils.hasText(modelKey)
|
|
||||||
? modelKey
|
|
||||||
: modelRegistry.getRequiredDefaultDocumentModelKey();
|
|
||||||
modelRegistry.getRequired(effectiveModelKey);
|
|
||||||
|
|
||||||
int effectiveLimit = Math.max(1, Math.min(limit, MAX_LIMIT));
|
|
||||||
int effectivePageNumber = Math.max(0, pageNumber);
|
|
||||||
String normalizedBuilderKey = StringUtils.hasText(builderKey) ? builderKey.trim() : null;
|
|
||||||
|
|
||||||
List<DocumentTextRepresentation> representations = representationRepository.findEmbeddingCandidatesByDocumentType(
|
|
||||||
documentType,
|
|
||||||
representationType,
|
|
||||||
normalizedBuilderKey,
|
|
||||||
primaryOnly,
|
|
||||||
effectiveModelKey,
|
|
||||||
EmbeddingStatus.COMPLETED,
|
|
||||||
force,
|
|
||||||
PageRequest.of(effectivePageNumber, effectiveLimit)
|
|
||||||
);
|
|
||||||
|
|
||||||
int jobs = 0;
|
|
||||||
for (DocumentTextRepresentation representation : representations) {
|
|
||||||
jobService.enqueueForRepresentation(
|
|
||||||
representation.getDocument().getId(),
|
|
||||||
representation.getId(),
|
|
||||||
effectiveModelKey,
|
|
||||||
EmbeddingJobType.DOCUMENT_EMBED
|
|
||||||
);
|
|
||||||
jobs++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return new ScopedEmbeddingEnqueueResult(
|
|
||||||
documentType,
|
|
||||||
representationType,
|
|
||||||
normalizedBuilderKey,
|
|
||||||
primaryOnly,
|
|
||||||
effectiveModelKey,
|
|
||||||
force,
|
|
||||||
effectiveLimit,
|
|
||||||
representations.size(),
|
|
||||||
jobs
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,92 +0,0 @@
|
||||||
package at.procon.dip.embedding.startup;
|
|
||||||
|
|
||||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
|
||||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
|
||||||
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult;
|
|
||||||
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService;
|
|
||||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
|
||||||
import at.procon.dip.runtime.config.RuntimeMode;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.springframework.boot.ApplicationArguments;
|
|
||||||
import org.springframework.boot.ApplicationRunner;
|
|
||||||
import org.springframework.core.Ordered;
|
|
||||||
import org.springframework.core.annotation.Order;
|
|
||||||
import org.springframework.stereotype.Component;
|
|
||||||
|
|
||||||
@Component
|
|
||||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@Order(Ordered.LOWEST_PRECEDENCE)
|
|
||||||
@Slf4j
|
|
||||||
public class EmbeddingStartupRunner implements ApplicationRunner {
|
|
||||||
|
|
||||||
private final EmbeddingProperties properties;
|
|
||||||
private final ScopedEmbeddingEnqueueService enqueueService;
|
|
||||||
private final RepresentationEmbeddingOrchestrator orchestrator;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void run(ApplicationArguments args) {
|
|
||||||
EmbeddingProperties.StartupProperties startup = properties.getStartup();
|
|
||||||
if (!properties.isEnabled() || startup == null) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (startup.isEnqueueMissingEnabled()) {
|
|
||||||
enqueueMissing(startup);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (startup.isProcessReadyEnabled()) {
|
|
||||||
processReadyJobs();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void enqueueMissing(EmbeddingProperties.StartupProperties startup) {
|
|
||||||
int batchSize = Math.max(1, startup.getBatchSize());
|
|
||||||
long max = startup.getMaxRepresentationsPerRun();
|
|
||||||
long remaining = max > 0 ? max : Long.MAX_VALUE;
|
|
||||||
int pageNumber = 0;
|
|
||||||
long matched = 0;
|
|
||||||
long queued = 0;
|
|
||||||
|
|
||||||
log.info("Startup embedding enqueue enabled (documentType={}, representationType={}, builderKey={}, primaryOnly={}, force={}, modelKey={}, maxRepresentationsPerRun={})",
|
|
||||||
startup.getDocumentType(), startup.getRepresentationType(), startup.getBuilderKey(), startup.isPrimaryOnly(),
|
|
||||||
startup.isForce(), startup.getModelKey(), max > 0 ? max : "unbounded");
|
|
||||||
|
|
||||||
while (remaining > 0) {
|
|
||||||
int requested = (int) Math.min(batchSize, remaining);
|
|
||||||
ScopedEmbeddingEnqueueResult result = enqueueService.enqueueByDocumentType(
|
|
||||||
startup.getDocumentType(),
|
|
||||||
startup.getRepresentationType(),
|
|
||||||
startup.getBuilderKey(),
|
|
||||||
startup.isPrimaryOnly(),
|
|
||||||
startup.getModelKey(),
|
|
||||||
startup.isForce(),
|
|
||||||
pageNumber,
|
|
||||||
requested
|
|
||||||
);
|
|
||||||
|
|
||||||
matched += result.matchedRepresentations();
|
|
||||||
queued += result.jobsQueuedOrAlreadyActive();
|
|
||||||
remaining -= result.matchedRepresentations();
|
|
||||||
|
|
||||||
if (result.matchedRepresentations() < requested) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
pageNumber++;
|
|
||||||
}
|
|
||||||
|
|
||||||
log.info("Startup embedding enqueue completed - matched {} representation(s), queued/already-active {} job(s)",
|
|
||||||
matched, queued);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void processReadyJobs() {
|
|
||||||
if (!properties.getJobs().isEnabled()) {
|
|
||||||
log.warn("Startup embedding processing was requested but dip.embedding.jobs.enabled=false; queued jobs will not be processed by the NEW embedding job subsystem");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int processed = orchestrator.processNextReadyBatch();
|
|
||||||
log.info("Startup embedding processing completed - processed {} ready job(s)", processed);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,66 +0,0 @@
|
||||||
package at.procon.dip.embedding.web;
|
|
||||||
|
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
|
||||||
import at.procon.dip.domain.document.RepresentationType;
|
|
||||||
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult;
|
|
||||||
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService;
|
|
||||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
|
||||||
import at.procon.dip.runtime.config.RuntimeMode;
|
|
||||||
import io.swagger.v3.oas.annotations.Operation;
|
|
||||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import org.springframework.http.HttpStatus;
|
|
||||||
import org.springframework.web.bind.annotation.PostMapping;
|
|
||||||
import org.springframework.web.bind.annotation.RequestMapping;
|
|
||||||
import org.springframework.web.bind.annotation.RequestParam;
|
|
||||||
import org.springframework.web.bind.annotation.RestController;
|
|
||||||
import org.springframework.web.server.ResponseStatusException;
|
|
||||||
|
|
||||||
@RestController
|
|
||||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
|
||||||
@RequestMapping("/v1/dip/admin/embeddings")
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@Tag(name = "Embedding Admin", description = "Administrative operations for NEW-runtime representation embeddings")
|
|
||||||
public class EmbeddingAdminController {
|
|
||||||
|
|
||||||
private final ScopedEmbeddingEnqueueService enqueueService;
|
|
||||||
|
|
||||||
@PostMapping("/enqueue-by-document-type")
|
|
||||||
@Operation(
|
|
||||||
summary = "Queue embeddings by document type",
|
|
||||||
description = "Queues embedding jobs only for DOC text representations belonging to the requested document type."
|
|
||||||
)
|
|
||||||
public ScopedEmbeddingEnqueueResult enqueueByDocumentType(
|
|
||||||
@RequestParam String documentType,
|
|
||||||
@RequestParam(required = false) String representationType,
|
|
||||||
@RequestParam(required = false) String builderKey,
|
|
||||||
@RequestParam(required = false, defaultValue = "false") boolean primaryOnly,
|
|
||||||
@RequestParam(required = false) String modelKey,
|
|
||||||
@RequestParam(required = false, defaultValue = "false") boolean force,
|
|
||||||
@RequestParam(required = false, defaultValue = "1000") int limit) {
|
|
||||||
return enqueueService.enqueueByDocumentType(
|
|
||||||
parseEnum(DocumentType.class, documentType, "documentType"),
|
|
||||||
parseEnum(RepresentationType.class, representationType, "representationType"),
|
|
||||||
builderKey,
|
|
||||||
primaryOnly,
|
|
||||||
modelKey,
|
|
||||||
force,
|
|
||||||
limit
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
private <E extends Enum<E>> E parseEnum(Class<E> enumType, String value, String parameterName) {
|
|
||||||
if (value == null || value.isBlank()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
String normalized = value.trim().replace('-', '_').toUpperCase(java.util.Locale.ROOT);
|
|
||||||
try {
|
|
||||||
return Enum.valueOf(enumType, normalized);
|
|
||||||
} catch (IllegalArgumentException ex) {
|
|
||||||
throw new ResponseStatusException(
|
|
||||||
HttpStatus.BAD_REQUEST,
|
|
||||||
"Invalid " + parameterName + ": " + value
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -174,7 +174,7 @@ public final class DocumentImportSupport {
|
||||||
|
|
||||||
public static DocumentFamily familyFor(DocumentType documentType) {
|
public static DocumentFamily familyFor(DocumentType documentType) {
|
||||||
return switch (documentType) {
|
return switch (documentType) {
|
||||||
case TED_PACKAGE, TED_NOTICE, TED_NOTICE_LOT -> DocumentFamily.PROCUREMENT;
|
case TED_PACKAGE, TED_NOTICE -> DocumentFamily.PROCUREMENT;
|
||||||
case TIME_ENTRY -> DocumentFamily.TIME;
|
case TIME_ENTRY -> DocumentFamily.TIME;
|
||||||
case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL;
|
case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL;
|
||||||
case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN ->
|
case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN ->
|
||||||
|
|
|
||||||
|
|
@ -119,7 +119,7 @@ ted:
|
||||||
# Mail account username (email address)
|
# Mail account username (email address)
|
||||||
username: archiv@procon.co.at
|
username: archiv@procon.co.at
|
||||||
# Mail account password
|
# Mail account password
|
||||||
password: ${MAIL_PASSWORD:}
|
# Supplied via the MAIL_PASSWORD environment variable; never commit a fallback secret.
password: ${MAIL_PASSWORD:}
|
||||||
# Use SSL/TLS connection
|
# Use SSL/TLS connection
|
||||||
ssl: true
|
ssl: true
|
||||||
# Mail folder to read from
|
# Mail folder to read from
|
||||||
|
|
|
||||||
|
|
@ -43,29 +43,10 @@ dip:
|
||||||
enabled: true
|
enabled: true
|
||||||
jobs:
|
jobs:
|
||||||
enabled: false
|
enabled: false
|
||||||
parallel-batch-count: 2
|
parallel-batch-count: 1
|
||||||
process-in-batches: true
|
process-in-batches: true
|
||||||
batch-size: 48
|
batch-size: 16
|
||||||
execution-batch-size: 48
|
execution-batch-size: 16
|
||||||
startup:
|
|
||||||
# Enqueue missing DOC representation embeddings on NEW-runtime startup.
|
|
||||||
enqueue-missing-enabled: true
|
|
||||||
# Also process ready embedding jobs during startup. Requires dip.embedding.jobs.enabled=true.
|
|
||||||
process-ready-enabled: true
|
|
||||||
# Leave empty to enqueue missing embeddings for all document types, or set e.g. TED_NOTICE_LOT.
|
|
||||||
document-type: TED_NOTICE_LOT
|
|
||||||
# Optional representation filter, e.g. SEMANTIC_TEXT.
|
|
||||||
representation-type:
|
|
||||||
# Optional builder filter, e.g. ted-lot-clustering-text-v1.
|
|
||||||
builder-key:
|
|
||||||
primary-only: false
|
|
||||||
# Leave empty to use dip.embedding.default-document-model.
|
|
||||||
model-key:
|
|
||||||
# False skips representations that already have a COMPLETED embedding for the model.
|
|
||||||
force: false
|
|
||||||
batch-size: 1000
|
|
||||||
# 0 means enqueue all matching not-vectorized representations.
|
|
||||||
max-representations-per-run: 0
|
|
||||||
|
|
||||||
default-document-model: e5-default
|
default-document-model: e5-default
|
||||||
default-query-model: e5-default
|
default-query-model: e5-default
|
||||||
|
|
@ -277,7 +258,7 @@ dip:
|
||||||
# Mailbox username
|
# Mailbox username
|
||||||
username: archiv@procon.co.at
|
username: archiv@procon.co.at
|
||||||
# Mailbox password
|
# Mailbox password
|
||||||
password: ${MAIL_PASSWORD}
|
# Supplied via the MAIL_PASSWORD environment variable; never commit a fallback secret.
password: ${MAIL_PASSWORD}
|
||||||
# Folder/mailbox name
|
# Folder/mailbox name
|
||||||
folder-name: INBOX
|
folder-name: INBOX
|
||||||
# Optional stable provider account key; falls back to username
|
# Optional stable provider account key; falls back to username
|
||||||
|
|
@ -369,17 +350,6 @@ dip:
|
||||||
startup-backfill-limit: 250
|
startup-backfill-limit: 250
|
||||||
structured-search-hybrid-candidate-limit: 5000
|
structured-search-hybrid-candidate-limit: 5000
|
||||||
structured-search-facet-bucket-limit: 12
|
structured-search-facet-bucket-limit: 12
|
||||||
lot-documents:
|
|
||||||
# Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot.
|
|
||||||
enabled: true
|
|
||||||
# Optional startup/backfill path for notices that were imported before lot documents existed.
|
|
||||||
startup-backfill-enabled: true
|
|
||||||
# Maximum number of legacy TED lot documents to backfill during startup (0 = all)
|
|
||||||
startup-backfill-limit: 0
|
|
||||||
# Queue embeddings whenever the lot semantic text representation is created or changed.
|
|
||||||
queue-embeddings-on-change: true
|
|
||||||
# Include parent notice project description even when the lot already has its own description.
|
|
||||||
include-parent-description: false
|
|
||||||
|
|
||||||
migration:
|
migration:
|
||||||
legacy-audit:
|
legacy-audit:
|
||||||
|
|
@ -420,4 +390,4 @@ dip:
|
||||||
batch-size: 500
|
batch-size: 500
|
||||||
max-documents-per-run: 0
|
max-documents-per-run: 0
|
||||||
skip-when-primary-representation-missing: true
|
skip-when-primary-representation-missing: true
|
||||||
queue-missing-embeddings: true
|
queue-missing-embeddings: true
|
||||||
|
|
@ -14,11 +14,12 @@ spring:
|
||||||
name: document-intelligence-platform
|
name: document-intelligence-platform
|
||||||
|
|
||||||
datasource:
|
datasource:
|
||||||
url: jdbc:postgresql://localhost:5432/RELM
|
#url: jdbc:postgresql://localhost:5432/RELM
|
||||||
#url: jdbc:postgresql://94.130.218.54:32333/RELM
|
#username: ${DB_USERNAME:postgres}
|
||||||
username: ${DB_USERNAME}
|
#password: ${DB_PASSWORD}
|
||||||
password: ${DB_PASSWORD}
|
url: jdbc:postgresql://94.130.218.54:32333/RELM
|
||||||
|
username: ${DB_USERNAME:postgres}
|
||||||
|
# Supplied via the DB_PASSWORD environment variable; never commit a fallback secret.
password: ${DB_PASSWORD}
|
||||||
driver-class-name: org.postgresql.Driver
|
driver-class-name: org.postgresql.Driver
|
||||||
hikari:
|
hikari:
|
||||||
maximum-pool-size: 5
|
maximum-pool-size: 5
|
||||||
|
|
@ -26,7 +27,7 @@ spring:
|
||||||
connection-timeout: 30000
|
connection-timeout: 30000
|
||||||
idle-timeout: 300000
|
idle-timeout: 300000
|
||||||
max-lifetime: 900000
|
max-lifetime: 900000
|
||||||
leak-detection-threshold: 300000 # 5 minutes - increased to avoid false positives with batch processing
|
leak-detection-threshold: 120000 # 2 minutes - increased to avoid false positives with batch processing
|
||||||
|
|
||||||
jpa:
|
jpa:
|
||||||
hibernate:
|
hibernate:
|
||||||
|
|
|
||||||
|
|
@ -1,48 +0,0 @@
|
||||||
-- Introduces the canonical TED_NOTICE_LOT document type used for per-lot semantic
-- representations. A lot document keeps its derived semantic text in
-- DOC.doc_text_representation; it does not need a DOC.doc_content row.

-- Variant A: document types are modelled as the enum doc.doc_document_type.
-- Append the new label only when that enum exists.
-- NOTE(review): PostgreSQL does not allow a label added with ALTER TYPE ... ADD VALUE to be
-- used before the surrounding transaction has committed. If doc_document.document_type is of
-- this enum type and the whole script runs in one transaction, the partial index further down
-- may be rejected - confirm against the target schema.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM pg_type t
        JOIN pg_namespace n ON n.oid = t.typnamespace
        WHERE n.nspname = 'doc'
          AND t.typname = 'doc_document_type'
    ) THEN
        ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TED_NOTICE_LOT';
    END IF;
END
$$;

-- Variant B: document types are restricted by the check constraint
-- doc_document_document_type_check. Recreate it with TED_NOTICE_LOT included, but only when
-- the constraint is present.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM pg_constraint c
        JOIN pg_class r ON r.oid = c.conrelid
        JOIN pg_namespace n ON n.oid = r.relnamespace
        WHERE n.nspname = 'doc'
          AND r.relname = 'doc_document'
          AND c.conname = 'doc_document_document_type_check'
    ) THEN
        ALTER TABLE DOC.doc_document DROP CONSTRAINT doc_document_document_type_check;
        ALTER TABLE DOC.doc_document
            ADD CONSTRAINT doc_document_document_type_check
            CHECK (
                document_type IN (
                    'TED_PACKAGE', 'TED_NOTICE', 'TED_NOTICE_LOT', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
                    'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'TIME_ENTRY', 'UNKNOWN'
                )
            );
    END IF;
END
$$;

-- Partial index: business-key lookups restricted to lot documents.
CREATE INDEX IF NOT EXISTS idx_doc_document_ted_lot_business_key
    ON DOC.doc_document(business_key)
    WHERE document_type = 'TED_NOTICE_LOT';

-- Partial index: text representations produced by the 'ted-lot-structured-text' builder.
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_ted_lot_builder
    ON DOC.doc_text_representation(builder_key)
    WHERE builder_key = 'ted-lot-structured-text';
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
||||||
-- Partial index for text representations produced by the clustering-oriented TED lot
-- builder ('ted-lot-clustering-text-v1').
-- The earlier index for 'ted-lot-structured-text' is deliberately kept, so databases that
-- already materialized or queried the previous representation shape keep working.
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_ted_lot_clustering_builder
    ON DOC.doc_text_representation(builder_key)
    WHERE builder_key = 'ted-lot-clustering-text-v1';
|
|
||||||
Loading…
Reference in New Issue