diff --git a/PATCH_NOTES.md b/PATCH_NOTES.md index e5538a2..8206fc6 100644 --- a/PATCH_NOTES.md +++ b/PATCH_NOTES.md @@ -7,10 +7,10 @@ Included changes: - Score normalization and result fusion - Generic /search endpoint - Lexical index maintenance service and startup backfill runner -- DOC lexical search migration (V9) +- DOC lexical search migration (V14) - Modified DOC representation write path to refresh search vectors Important note: -- Full-text search requires V9__doc_search_slice1_support.sql to be applied. +- Full-text search requires V14__doc_search_slice1_support.sql to be applied. - The lexical index service is guarded and will no-op if the search columns are not yet present. - Because Flyway is currently disabled in application.yml, apply the migration manually or enable Flyway before using the new search endpoint. diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java index c831cfe..04e7252 100644 --- a/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java @@ -1,11 +1,16 @@ package at.procon.dip.domain.document.repository; import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.EmbeddingStatus; import at.procon.dip.domain.document.entity.DocumentTextRepresentation; import java.util.List; import java.util.Optional; import java.util.UUID; +import org.springframework.data.domain.Pageable; import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.data.jpa.repository.Query; +import org.springframework.data.repository.query.Param; public interface DocumentTextRepresentationRepository extends JpaRepository { @@ -20,4 +25,33 @@ public interface DocumentTextRepresentationRepository extends JpaRepository findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId); + + @Query(""" + SELECT r + FROM DocumentTextRepresentation r + JOIN r.document d + WHERE (:documentType IS NULL OR d.documentType = :documentType) + AND (:representationType IS NULL OR r.representationType = :representationType) + AND (:builderKey IS NULL OR r.builderKey = :builderKey) + AND (:primaryOnly = false OR r.primaryRepresentation = true) + AND r.textBody IS NOT NULL + AND r.textBody <> '' + AND (:includeCompleted = true OR NOT EXISTS ( + SELECT e.id + FROM DocumentEmbedding e + WHERE e.representation.id = r.id + AND e.model.modelKey = :modelKey + AND e.embeddingStatus = :completedStatus + )) + ORDER BY r.createdAt ASC, r.id ASC + """) + List findEmbeddingCandidatesByDocumentType( + @Param("documentType") DocumentType documentType, + @Param("representationType") RepresentationType representationType, + @Param("builderKey") String builderKey, + @Param("primaryOnly") boolean primaryOnly, + @Param("modelKey") String modelKey, + @Param("completedStatus") EmbeddingStatus completedStatus, + @Param("includeCompleted") boolean includeCompleted, + Pageable pageable); } diff --git a/src/main/java/at/procon/dip/domain/ted/config/TedProjectionProperties.java b/src/main/java/at/procon/dip/domain/ted/config/TedProjectionProperties.java index 8d60c99..bc22754 100644 --- a/src/main/java/at/procon/dip/domain/ted/config/TedProjectionProperties.java +++ b/src/main/java/at/procon/dip/domain/ted/config/TedProjectionProperties.java @@ -1,17 +1,23 @@ package at.procon.dip.domain.ted.config; +import jakarta.validation.constraints.Min; import jakarta.validation.constraints.Positive; import lombok.Data; import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.context.annotation.Configuration; +import org.springframework.validation.annotation.Validated; @Configuration @ConfigurationProperties(prefix = "dip.ted.projection") +@Validated @Data public class TedProjectionProperties { private boolean enabled = true; private boolean startupBackfillEnabled = false; - @Positive + /** + * Maximum number of legacy TED documents to backfill on startup. 0 means no limit. + */ + @Min(0) private int startupBackfillLimit = 250; @Positive private int structuredSearchHybridCandidateLimit = 5000; @@ -29,7 +35,10 @@ public class TedProjectionProperties { * Optional startup/backfill path for notices that were imported before lot documents existed. */ private boolean startupBackfillEnabled = false; - @Positive + /** + * Maximum number of projections to backfill on startup. 0 means no limit. + */ + @Min(0) private int startupBackfillLimit = 1000; /** * Queue embeddings whenever the lot semantic text representation is created or changed. diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedLotDocumentMaterializationService.java b/src/main/java/at/procon/dip/domain/ted/service/TedLotDocumentMaterializationService.java index 92addb0..4db2202 100644 --- a/src/main/java/at/procon/dip/domain/ted/service/TedLotDocumentMaterializationService.java +++ b/src/main/java/at/procon/dip/domain/ted/service/TedLotDocumentMaterializationService.java @@ -37,7 +37,7 @@ import org.springframework.util.StringUtils; @Slf4j public class TedLotDocumentMaterializationService { - public static final String BUILDER_KEY = "ted-lot-structured-text"; + public static final String BUILDER_KEY = "ted-lot-clustering-text-v1"; private final TedProjectionProperties properties; private final TedNoticeLotRepository lotRepository; @@ -174,22 +174,20 @@ public class TedLotDocumentMaterializationService { private String buildSemanticText(TedNoticeProjection projection, TedNoticeLot lot) { StringBuilder sb = new StringBuilder(1024); - append(sb, "Document type", "TED procurement lot"); - append(sb, "Lot title", lot.getTitle()); - append(sb, "Lot description", lot.getDescription()); - append(sb, "Project title", projection.getProjectTitle()); + append(sb, "Entity type", "Procurement lot"); + append(sb, "Procurement scope", joinedLines(lot.getTitle(), lot.getDescription())); + append(sb, "Procurement category", joined(firstNonEmpty(lot.getCpvCodes(), projection.getCpvCodes()))); + append(sb, "Relevant domain", projection.getBuyerActivityType()); + append(sb, "Contract type", projection.getContractNature() == null ? null : projection.getContractNature().name()); + append(sb, "Geographic context", joined(firstNonEmpty(lot.getNutsCodes(), projection.getNutsCodes()), projection.getBuyerCountryCode())); if (!StringUtils.hasText(lot.getDescription())) { - append(sb, "Project description", projection.getProjectDescription()); + append(sb, "Parent notice context", joinedLines(projection.getProjectTitle(), projection.getProjectDescription())); } else if (properties.getLotDocuments().isIncludeProjectDescription()) { - append(sb, "Project context", projection.getProjectDescription()); + append(sb, "Parent notice context", joinedLines(projection.getProjectTitle(), projection.getProjectDescription())); + } else { + append(sb, "Parent notice context", projection.getProjectTitle()); } - - append(sb, "Contract nature", projection.getContractNature() == null ? null : projection.getContractNature().name()); - append(sb, "Buyer activity", projection.getBuyerActivityType()); - append(sb, "Buyer country", projection.getBuyerCountryCode()); - append(sb, "CPV codes", joined(firstNonEmpty(lot.getCpvCodes(), projection.getCpvCodes()))); - append(sb, "NUTS codes", joined(firstNonEmpty(lot.getNutsCodes(), projection.getNutsCodes()))); return sb.toString().trim(); } @@ -239,6 +237,23 @@ public class TedLotDocumentMaterializationService { .collect(Collectors.joining(", ")); } + private String joined(String[] values, String fallback) { + String joined = joined(values); + if (StringUtils.hasText(joined)) { + return joined; + } + return fallback; + } + + private String joinedLines(String... values) { + String joined = Arrays.stream(values) + .filter(StringUtils::hasText) + .map(String::trim) + .distinct() + .collect(Collectors.joining("\n")); + return StringUtils.hasText(joined) ? joined : null; + } + private String sanitizeKey(String value) { return value == null ? "unknown" : value.trim().replaceAll("\\s+", "_"); } diff --git a/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java b/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java index 7d9ae3e..5e8696e 100644 --- a/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java +++ b/src/main/java/at/procon/dip/domain/ted/startup/TedProjectionStartupRunner.java @@ -24,6 +24,8 @@ import org.springframework.stereotype.Component; @Slf4j public class TedProjectionStartupRunner implements ApplicationRunner { + private static final int STARTUP_BACKFILL_BATCH_SIZE = 1000; + private final TedProjectionProperties properties; private final ProcurementDocumentRepository procurementDocumentRepository; private final TedNoticeProjectionRepository projectionRepository; @@ -47,18 +49,32 @@ public class TedProjectionStartupRunner implements ApplicationRunner { private void backfillNoticeProjections() { int limit = properties.getStartupBackfillLimit(); - log.info("Phase 3 startup backfill enabled - ensuring TED projections for up to {} documents", limit); - - var page = procurementDocumentRepository.findAll( - PageRequest.of(0, limit, Sort.by(Sort.Direction.ASC, "createdAt"))); + log.info("Phase 3 startup backfill enabled - ensuring TED projections for {} documents", describeLimit(limit)); int synced = 0; - for (var legacyDocument : page.getContent()) { - if (projectionRepository.existsByLegacyProcurementDocumentId(legacyDocument.getId())) { - continue; + int processed = 0; + int pageNumber = 0; + Sort sort = Sort.by(Sort.Direction.ASC, "createdAt"); + + while (limit <= 0 || processed < limit) { + int pageSize = pageSizeFor(limit, processed); + var page = procurementDocumentRepository.findAll(PageRequest.of(pageNumber++, pageSize, sort)); + if (page.isEmpty()) { + break; + } + + for (var legacyDocument : page.getContent()) { + processed++; + if (projectionRepository.existsByLegacyProcurementDocumentId(legacyDocument.getId())) { + continue; + } + projectionService.registerOrRefreshProjection(legacyDocument); + synced++; + } + + if (!page.hasNext()) { + break; } - projectionService.registerOrRefreshProjection(legacyDocument); - synced++; } log.info("Phase 3 startup backfill completed - synced {} TED projections", synced); @@ -66,16 +82,41 @@ public class TedProjectionStartupRunner implements ApplicationRunner { private void backfillLotDocuments() { int limit = properties.getLotDocuments().getStartupBackfillLimit(); - log.info("TED lot document startup backfill enabled - materializing lots for up to {} projections", limit); - - var page = projectionRepository.findAll( - PageRequest.of(0, limit, Sort.by(Sort.Direction.ASC, "createdAt"))); + log.info("TED lot document startup backfill enabled - materializing lots for {} projections", describeLimit(limit)); int lotDocuments = 0; - for (var projection : page.getContent()) { - lotDocuments += lotDocumentMaterializationService.materializeProjectionLots(projection.getId()); + int processed = 0; + int pageNumber = 0; + Sort sort = Sort.by(Sort.Direction.ASC, "createdAt"); + + while (limit <= 0 || processed < limit) { + int pageSize = pageSizeFor(limit, processed); + var page = projectionRepository.findAll(PageRequest.of(pageNumber++, pageSize, sort)); + if (page.isEmpty()) { + break; + } + + for (var projection : page.getContent()) { + processed++; + lotDocuments += lotDocumentMaterializationService.materializeProjectionLots(projection.getId()); + } + + if (!page.hasNext()) { + break; + } } log.info("TED lot document startup backfill completed - materialized/updated {} lot documents", lotDocuments); } + + private int pageSizeFor(int limit, int processed) { + if (limit <= 0) { + return STARTUP_BACKFILL_BATCH_SIZE; + } + return Math.min(STARTUP_BACKFILL_BATCH_SIZE, limit - processed); + } + + private String describeLimit(int limit) { + return limit > 0 ? "up to " + limit : "all available"; + } } diff --git a/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java b/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java index 6abada7..fd75633 100644 --- a/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java +++ b/src/main/java/at/procon/dip/embedding/config/EmbeddingProperties.java @@ -1,6 +1,8 @@ package at.procon.dip.embedding.config; import at.procon.dip.domain.document.DistanceMetric; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; import at.procon.dip.embedding.model.EmbeddingPrefixMode; import java.time.Duration; import java.util.LinkedHashMap; @@ -21,6 +23,7 @@ public class EmbeddingProperties { private Map models = new LinkedHashMap<>(); private IndexingProperties indexing = new IndexingProperties(); private JobsProperties jobs = new JobsProperties(); + private StartupProperties startup = new StartupProperties(); @Data public static class ProviderProperties { @@ -79,4 +82,18 @@ public class EmbeddingProperties { private Duration maxRetryDelay = Duration.ofHours(6); private long schedulerDelayMs = 5000; } + + @Data + public static class StartupProperties { + private boolean enqueueMissingEnabled = false; + private boolean processReadyEnabled = false; + private DocumentType documentType; + private RepresentationType representationType; + private String builderKey; + private boolean primaryOnly = false; + private String modelKey; + private boolean force = false; + private int batchSize = 1000; + private long maxRepresentationsPerRun = 0; + } } diff --git a/src/main/java/at/procon/dip/embedding/service/ScopedEmbeddingEnqueueResult.java b/src/main/java/at/procon/dip/embedding/service/ScopedEmbeddingEnqueueResult.java new file mode 100644 index 0000000..cc74db0 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/service/ScopedEmbeddingEnqueueResult.java @@ -0,0 +1,17 @@ +package at.procon.dip.embedding.service; + +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; + +public record ScopedEmbeddingEnqueueResult( + DocumentType documentType, + RepresentationType representationType, + String builderKey, + boolean primaryOnly, + String modelKey, + boolean force, + int requestedLimit, + int matchedRepresentations, + int jobsQueuedOrAlreadyActive +) { +} diff --git a/src/main/java/at/procon/dip/embedding/service/ScopedEmbeddingEnqueueService.java b/src/main/java/at/procon/dip/embedding/service/ScopedEmbeddingEnqueueService.java new file mode 100644 index 0000000..c8dd8d8 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/service/ScopedEmbeddingEnqueueService.java @@ -0,0 +1,91 @@ +package at.procon.dip.embedding.service; + +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.EmbeddingStatus; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.domain.document.entity.DocumentTextRepresentation; +import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository; +import at.procon.dip.embedding.job.service.EmbeddingJobService; +import at.procon.dip.embedding.model.EmbeddingJobType; +import at.procon.dip.embedding.registry.EmbeddingModelRegistry; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.data.domain.PageRequest; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; +import org.springframework.util.StringUtils; + +@Service +@RequiredArgsConstructor +public class ScopedEmbeddingEnqueueService { + + private static final int MAX_LIMIT = 10_000; + + private final DocumentTextRepresentationRepository representationRepository; + private final EmbeddingJobService jobService; + private final EmbeddingModelRegistry modelRegistry; + + @Transactional + public ScopedEmbeddingEnqueueResult enqueueByDocumentType(DocumentType documentType, + RepresentationType representationType, + String builderKey, + boolean primaryOnly, + String modelKey, + boolean force, + int limit) { + return enqueueByDocumentType(documentType, representationType, builderKey, primaryOnly, modelKey, force, 0, limit); + } + + @Transactional + public ScopedEmbeddingEnqueueResult enqueueByDocumentType(DocumentType documentType, + RepresentationType representationType, + String builderKey, + boolean primaryOnly, + String modelKey, + boolean force, + int pageNumber, + int limit) { + String effectiveModelKey = StringUtils.hasText(modelKey) + ? modelKey + : modelRegistry.getRequiredDefaultDocumentModelKey(); + modelRegistry.getRequired(effectiveModelKey); + + int effectiveLimit = Math.max(1, Math.min(limit, MAX_LIMIT)); + int effectivePageNumber = Math.max(0, pageNumber); + String normalizedBuilderKey = StringUtils.hasText(builderKey) ? builderKey.trim() : null; + + List representations = representationRepository.findEmbeddingCandidatesByDocumentType( + documentType, + representationType, + normalizedBuilderKey, + primaryOnly, + effectiveModelKey, + EmbeddingStatus.COMPLETED, + force, + PageRequest.of(effectivePageNumber, effectiveLimit) + ); + + int jobs = 0; + for (DocumentTextRepresentation representation : representations) { + jobService.enqueueForRepresentation( + representation.getDocument().getId(), + representation.getId(), + effectiveModelKey, + EmbeddingJobType.DOCUMENT_EMBED + ); + jobs++; + } + + return new ScopedEmbeddingEnqueueResult( + documentType, + representationType, + normalizedBuilderKey, + primaryOnly, + effectiveModelKey, + force, + effectiveLimit, + representations.size(), + jobs + ); + } +} diff --git a/src/main/java/at/procon/dip/embedding/startup/EmbeddingStartupRunner.java b/src/main/java/at/procon/dip/embedding/startup/EmbeddingStartupRunner.java new file mode 100644 index 0000000..5f6f3a9 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/startup/EmbeddingStartupRunner.java @@ -0,0 +1,92 @@ +package at.procon.dip.embedding.startup; + +import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator; +import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult; +import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.core.Ordered; +import org.springframework.core.annotation.Order; +import org.springframework.stereotype.Component; + +@Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) +@RequiredArgsConstructor +@Order(Ordered.LOWEST_PRECEDENCE) +@Slf4j +public class EmbeddingStartupRunner implements ApplicationRunner { + + private final EmbeddingProperties properties; + private final ScopedEmbeddingEnqueueService enqueueService; + private final RepresentationEmbeddingOrchestrator orchestrator; + + @Override + public void run(ApplicationArguments args) { + EmbeddingProperties.StartupProperties startup = properties.getStartup(); + if (!properties.isEnabled() || startup == null) { + return; + } + + if (startup.isEnqueueMissingEnabled()) { + enqueueMissing(startup); + } + + if (startup.isProcessReadyEnabled()) { + processReadyJobs(); + } + } + + private void enqueueMissing(EmbeddingProperties.StartupProperties startup) { + int batchSize = Math.max(1, startup.getBatchSize()); + long max = startup.getMaxRepresentationsPerRun(); + long remaining = max > 0 ? max : Long.MAX_VALUE; + int pageNumber = 0; + long matched = 0; + long queued = 0; + + log.info("Startup embedding enqueue enabled (documentType={}, representationType={}, builderKey={}, primaryOnly={}, force={}, modelKey={}, maxRepresentationsPerRun={})", + startup.getDocumentType(), startup.getRepresentationType(), startup.getBuilderKey(), startup.isPrimaryOnly(), + startup.isForce(), startup.getModelKey(), max > 0 ? max : "unbounded"); + + while (remaining > 0) { + int requested = (int) Math.min(batchSize, remaining); + ScopedEmbeddingEnqueueResult result = enqueueService.enqueueByDocumentType( + startup.getDocumentType(), + startup.getRepresentationType(), + startup.getBuilderKey(), + startup.isPrimaryOnly(), + startup.getModelKey(), + startup.isForce(), + pageNumber, + requested + ); + + matched += result.matchedRepresentations(); + queued += result.jobsQueuedOrAlreadyActive(); + remaining -= result.matchedRepresentations(); + + if (result.matchedRepresentations() < requested) { + break; + } + pageNumber++; + } + + log.info("Startup embedding enqueue completed - matched {} representation(s), queued/already-active {} job(s)", + matched, queued); + } + + private void processReadyJobs() { + if (!properties.getJobs().isEnabled()) { + log.warn("Startup embedding processing was requested but dip.embedding.jobs.enabled=false; queued jobs will not be processed by the NEW embedding job subsystem"); + return; + } + + int processed = orchestrator.processNextReadyBatch(); + log.info("Startup embedding processing completed - processed {} ready job(s)", processed); + } +} diff --git a/src/main/java/at/procon/dip/embedding/web/EmbeddingAdminController.java b/src/main/java/at/procon/dip/embedding/web/EmbeddingAdminController.java new file mode 100644 index 0000000..23af2eb --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/web/EmbeddingAdminController.java @@ -0,0 +1,66 @@ +package at.procon.dip.embedding.web; + +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult; +import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import lombok.RequiredArgsConstructor; +import org.springframework.http.HttpStatus; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.server.ResponseStatusException; + +@RestController +@ConditionalOnRuntimeMode(RuntimeMode.NEW) +@RequestMapping("/v1/dip/admin/embeddings") +@RequiredArgsConstructor +@Tag(name = "Embedding Admin", description = "Administrative operations for NEW-runtime representation embeddings") +public class EmbeddingAdminController { + + private final ScopedEmbeddingEnqueueService enqueueService; + + @PostMapping("/enqueue-by-document-type") + @Operation( + summary = "Queue embeddings by document type", + description = "Queues embedding jobs only for DOC text representations belonging to the requested document type." + ) + public ScopedEmbeddingEnqueueResult enqueueByDocumentType( + @RequestParam String documentType, + @RequestParam(required = false) String representationType, + @RequestParam(required = false) String builderKey, + @RequestParam(required = false, defaultValue = "false") boolean primaryOnly, + @RequestParam(required = false) String modelKey, + @RequestParam(required = false, defaultValue = "false") boolean force, + @RequestParam(required = false, defaultValue = "1000") int limit) { + return enqueueService.enqueueByDocumentType( + parseEnum(DocumentType.class, documentType, "documentType"), + parseEnum(RepresentationType.class, representationType, "representationType"), + builderKey, + primaryOnly, + modelKey, + force, + limit + ); + } + + private > E parseEnum(Class enumType, String value, String parameterName) { + if (value == null || value.isBlank()) { + return null; + } + String normalized = value.trim().replace('-', '_').toUpperCase(java.util.Locale.ROOT); + try { + return Enum.valueOf(enumType, normalized); + } catch (IllegalArgumentException ex) { + throw new ResponseStatusException( + HttpStatus.BAD_REQUEST, + "Invalid " + parameterName + ": " + value + ); + } + } +} diff --git a/src/main/resources/application-legacy.yml b/src/main/resources/application-legacy.yml index d1c29d1..9ee596d 100644 --- a/src/main/resources/application-legacy.yml +++ b/src/main/resources/application-legacy.yml @@ -119,7 +119,7 @@ ted: # Mail account username (email address) username: archiv@procon.co.at # Mail account password - password: ${MAIL_PASSWORD:worasigg} + password: ${MAIL_PASSWORD:} # Use SSL/TLS connection ssl: true # Mail folder to read from diff --git a/src/main/resources/application-new.yml b/src/main/resources/application-new.yml index 625abaf..66521c2 100644 --- a/src/main/resources/application-new.yml +++ b/src/main/resources/application-new.yml @@ -43,10 +43,29 @@ dip: enabled: true jobs: enabled: false - parallel-batch-count: 1 + parallel-batch-count: 2 process-in-batches: true - batch-size: 16 - execution-batch-size: 16 + batch-size: 48 + execution-batch-size: 48 + startup: + # Enqueue missing DOC representation embeddings on NEW-runtime startup. + enqueue-missing-enabled: true + # Also process ready embedding jobs during startup. Requires dip.embedding.jobs.enabled=true. + process-ready-enabled: true + # Leave empty to enqueue missing embeddings for all document types, or set e.g. TED_NOTICE_LOT. + document-type: TED_NOTICE_LOT + # Optional representation filter, e.g. SEMANTIC_TEXT. + representation-type: + # Optional builder filter, e.g. ted-lot-clustering-text-v1. + builder-key: + primary-only: false + # Leave empty to use dip.embedding.default-document-model. + model-key: + # False skips representations that already have a COMPLETED embedding for the model. + force: false + batch-size: 1000 + # 0 means enqueue all matching not-vectorized representations. + max-representations-per-run: 0 default-document-model: e5-default default-query-model: e5-default @@ -258,7 +277,7 @@ dip: # Mailbox username username: archiv@procon.co.at # Mailbox password - password: ${MAIL_PASSWORD:worasigg} + password: ${MAIL_PASSWORD} # Folder/mailbox name folder-name: INBOX # Optional stable provider account key; falls back to username @@ -350,6 +369,17 @@ dip: startup-backfill-limit: 250 structured-search-hybrid-candidate-limit: 5000 structured-search-facet-bucket-limit: 12 + lot-documents: + # Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot. + enabled: true + # Optional startup/backfill path for notices that were imported before lot documents existed. + startup-backfill-enabled: true + # Maximum number of legacy TED lot documents to backfill during startup (0 = all) + startup-backfill-limit: 0 + # Queue embeddings whenever the lot semantic text representation is created or changed. + queue-embeddings-on-change: true + # Include parent notice project description even when the lot already has its own description. + include-parent-description: false migration: legacy-audit: @@ -390,4 +420,4 @@ dip: batch-size: 500 max-documents-per-run: 0 skip-when-primary-representation-missing: true - queue-missing-embeddings: true \ No newline at end of file + queue-missing-embeddings: true diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 4a00f13..0112318 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -14,12 +14,11 @@ spring: name: document-intelligence-platform datasource: - #url: jdbc:postgresql://localhost:5432/RELM - #username: ${DB_USERNAME:postgres} - #password: ${DB_PASSWORD:P54!pcd#Wi} - url: jdbc:postgresql://94.130.218.54:32333/RELM - username: ${DB_USERNAME:postgres} - password: ${DB_PASSWORD:PDmXRx0Rbk9OFOn9qO5Gm/mPCfqW8zwbZ+/YIU1lySc=} + url: jdbc:postgresql://localhost:5432/RELM + #url: jdbc:postgresql://94.130.218.54:32333/RELM + username: ${DB_USERNAME} + password: ${DB_PASSWORD} + driver-class-name: org.postgresql.Driver hikari: maximum-pool-size: 5 @@ -27,7 +26,7 @@ spring: connection-timeout: 30000 idle-timeout: 300000 max-lifetime: 900000 - leak-detection-threshold: 120000 # 2 minutes - increased to avoid false positives with batch processing + leak-detection-threshold: 300000 # 5 minutes - increased to avoid false positives with batch processing jpa: hibernate: diff --git a/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql b/src/main/resources/db/migration/V10__search_slice2_generic_search_support.sql similarity index 100% rename from src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql rename to src/main/resources/db/migration/V10__search_slice2_generic_search_support.sql diff --git a/src/main/resources/db/migration/V12__ted_projection_add_package_identifier.sql b/src/main/resources/db/migration/V13__ted_projection_add_package_identifier.sql similarity index 100% rename from src/main/resources/db/migration/V12__ted_projection_add_package_identifier.sql rename to src/main/resources/db/migration/V13__ted_projection_add_package_identifier.sql diff --git a/src/main/resources/db/migration/V9__doc_search_slice1_support.sql b/src/main/resources/db/migration/V14__doc_search_slice1_support.sql similarity index 100% rename from src/main/resources/db/migration/V9__doc_search_slice1_support.sql rename to src/main/resources/db/migration/V14__doc_search_slice1_support.sql diff --git a/src/main/resources/db/migration/V24__time_t1_foundation.sql b/src/main/resources/db/migration/V33__time_t1_foundation.sql similarity index 100% rename from src/main/resources/db/migration/V24__time_t1_foundation.sql rename to src/main/resources/db/migration/V33__time_t1_foundation.sql diff --git a/src/main/resources/db/migration/V25__time_t2_leitstand_import.sql b/src/main/resources/db/migration/V34__time_t2_leitstand_import.sql similarity index 100% rename from src/main/resources/db/migration/V25__time_t2_leitstand_import.sql rename to src/main/resources/db/migration/V34__time_t2_leitstand_import.sql diff --git a/src/main/resources/db/migration/V26__time_t3_projection_representations.sql b/src/main/resources/db/migration/V35__time_t3_projection_representations.sql similarity index 100% rename from src/main/resources/db/migration/V26__time_t3_projection_representations.sql rename to src/main/resources/db/migration/V35__time_t3_projection_representations.sql diff --git a/src/main/resources/db/migration/V36__ted_lot_clustering_text_builder_index.sql b/src/main/resources/db/migration/V36__ted_lot_clustering_text_builder_index.sql new file mode 100644 index 0000000..d255ad0 --- /dev/null +++ b/src/main/resources/db/migration/V36__ted_lot_clustering_text_builder_index.sql @@ -0,0 +1,7 @@ +-- Adds an index for the TED lot clustering-oriented semantic representation builder. +-- The original ted-lot-structured-text index is left in place for databases that +-- already materialized or queried the earlier representation shape. + +CREATE INDEX IF NOT EXISTS idx_doc_text_repr_ted_lot_clustering_builder + ON DOC.doc_text_representation(builder_key) + WHERE builder_key = 'ted-lot-clustering-text-v1';