diff --git a/docs/CONFIG_SPLIT_APPLICATION_NEW.md b/docs/CONFIG_SPLIT_APPLICATION_NEW.md new file mode 100644 index 0000000..8bb292c --- /dev/null +++ b/docs/CONFIG_SPLIT_APPLICATION_NEW.md @@ -0,0 +1,31 @@ +# Config split: moved new-runtime properties to application-new.yml + +This patch keeps shared and legacy defaults in `application.yml` and moves new-runtime properties into `application-new.yml`. + +Activate the new runtime with: + +``` +--spring.profiles.active=new +``` + +`application-new.yml` also sets: + +```yaml +dip.runtime.mode: NEW +``` + +So profile selection and runtime mode stay aligned. + +Moved blocks: +- `dip.embedding.*` +- `ted.search.*` (new generic search tuning, now under `dip.search.*`) +- `ted.projection.*` +- `ted.generic-ingestion.*` +- new/transitional `ted.vectorization.*` keys: + - `generic-pipeline-enabled` + - `dual-write-legacy-ted-vectors` + - `generic-scheduler-period-ms` + - `primary-representation-builder-key` + - `embedding-provider` + +Shared / legacy defaults remain in `application.yml`. diff --git a/docs/architecture/RUNTIME_SPLIT_PATCH_C.md b/docs/architecture/RUNTIME_SPLIT_PATCH_C.md new file mode 100644 index 0000000..f53f4e4 --- /dev/null +++ b/docs/architecture/RUNTIME_SPLIT_PATCH_C.md @@ -0,0 +1,36 @@ +# Runtime split Patch C + +Patch C moves the **new generic search runtime** off `TedProcessorProperties.search` +and into a dedicated `dip.search.*` config tree. + +## New config class +- `at.procon.dip.search.config.DipSearchProperties` + +## New config root +```yaml +dip: + search: + ... 
+``` + +## Classes moved off `TedProcessorProperties` +- `PostgresFullTextSearchEngine` +- `PostgresTrigramSearchEngine` +- `PgVectorSemanticSearchEngine` +- `DefaultSearchOrchestrator` +- `DefaultSearchResultFusionService` +- `SearchLexicalIndexStartupRunner` +- `ChunkedLongTextRepresentationBuilder` + +## What this patch intentionally does not do +- it does not yet remove `TedProcessorProperties` from all NEW-mode classes +- it does not yet move `generic-ingestion` config off `ted.*` +- it does not yet finish the legacy/new config split for import/mail/TED package processing + +Those should be handled in the next config-splitting patch. + +## Practical result +After this patch, **new search/semantic/chunking tuning** should be configured only via: +- `dip.search.*` + +while `ted.search.*` remains legacy-oriented. \ No newline at end of file diff --git a/docs/architecture/RUNTIME_SPLIT_PATCH_D.md b/docs/architecture/RUNTIME_SPLIT_PATCH_D.md new file mode 100644 index 0000000..93457ee --- /dev/null +++ b/docs/architecture/RUNTIME_SPLIT_PATCH_D.md @@ -0,0 +1,40 @@ +# Runtime Split Patch D + +This patch completes the next configuration split step for the NEW runtime. 
+ +## New property classes + +- `at.procon.dip.ingestion.config.DipIngestionProperties` + - prefix: `dip.ingestion` +- `at.procon.dip.domain.ted.config.TedProjectionProperties` + - prefix: `dip.ted.projection` + +## Classes moved off `TedProcessorProperties` + +### NEW-mode ingestion +- `GenericDocumentImportService` +- `GenericFileSystemIngestionRoute` +- `GenericDocumentImportController` +- `MailDocumentIngestionAdapter` +- `TedPackageDocumentIngestionAdapter` +- `TedPackageChildImportProcessor` + +### NEW-mode projection +- `TedNoticeProjectionService` +- `TedProjectionStartupRunner` + +## Additional cleanup in `GenericDocumentImportService` + +It now resolves the default document embedding model through the new embedding subsystem: + +- `EmbeddingProperties` +- `EmbeddingModelRegistry` +- `EmbeddingModelCatalogService` + +and no longer reads vectorization model/provider/dimensions from `TedProcessorProperties`. + +## What still remains for later split steps + +- legacy routes/services still using `TedProcessorProperties` +- legacy/new runtime bean gating for all remaining shared classes +- moving old TED-only config fully under `legacy.ted.*` diff --git a/docs/architecture/RUNTIME_SPLIT_PATCH_E.md b/docs/architecture/RUNTIME_SPLIT_PATCH_E.md new file mode 100644 index 0000000..86edd16 --- /dev/null +++ b/docs/architecture/RUNTIME_SPLIT_PATCH_E.md @@ -0,0 +1,26 @@ +# Runtime split Patch E + +This patch continues the runtime/config split by targeting the remaining NEW-mode classes +that still injected `TedProcessorProperties`. 
+ +## New config classes +- `DipIngestionProperties` (`dip.ingestion.*`) +- `TedProjectionProperties` (`dip.ted.projection.*`) + +## NEW-mode classes moved off `TedProcessorProperties` +- `GenericDocumentImportService` +- `GenericFileSystemIngestionRoute` +- `GenericDocumentImportController` +- `MailDocumentIngestionAdapter` +- `TedPackageDocumentIngestionAdapter` +- `TedPackageChildImportProcessor` +- `TedNoticeProjectionService` +- `TedProjectionStartupRunner` + +## Additional behavior change +`GenericDocumentImportService` now hands embedding work off to the new embedding subsystem by: +- resolving the default document model from `EmbeddingModelRegistry` +- ensuring the model is registered via `EmbeddingModelCatalogService` +- enqueueing jobs through `RepresentationEmbeddingOrchestrator` + +This removes the new import path's runtime dependence on legacy `TedProcessorProperties.vectorization`. \ No newline at end of file diff --git a/docs/architecture/RUNTIME_SPLIT_PATCH_F.md b/docs/architecture/RUNTIME_SPLIT_PATCH_F.md new file mode 100644 index 0000000..2d1db5d --- /dev/null +++ b/docs/architecture/RUNTIME_SPLIT_PATCH_F.md @@ -0,0 +1,39 @@ +# Runtime split Patch F + +This patch finishes the first major bean-gating pass for the **legacy runtime**. 
+ +## What it does +Marks the remaining old runtime classes as: + +- `@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)` + +### Legacy routes / runtime +- `MailRoute` +- `SolutionBriefRoute` +- `TedDocumentRoute` +- `TedPackageDownloadCamelRoute` +- `TedPackageDownloadRoute` +- `VectorizationRoute` + +### Legacy config/runtime infrastructure +- `AsyncConfig` +- `TedProcessorProperties` + +### Legacy controller / listeners / services +- `AdminController` +- `VectorizationEventListener` +- `AttachmentProcessingService` +- `BatchDocumentProcessingService` +- `DocumentProcessingService` +- `SearchService` +- `SimilaritySearchService` +- `TedPackageDownloadService` +- `TedPhase2GenericDocumentService` +- `VectorizationProcessorService` +- `VectorizationService` +- `VectorizationStartupRunner` + +## Added profile file +- `application-legacy.yml` + +This patch is intended to apply **after Patch A–E**. It does not yet remove the old `ted.*` property tree; it makes the old bean graph activate only in `LEGACY` mode. \ No newline at end of file diff --git a/docs/architecture/RUNTIME_SPLIT_PATCH_G.md b/docs/architecture/RUNTIME_SPLIT_PATCH_G.md new file mode 100644 index 0000000..7b88cdc --- /dev/null +++ b/docs/architecture/RUNTIME_SPLIT_PATCH_G.md @@ -0,0 +1,24 @@ +# Runtime split Patch G + +Patch G moves the remaining NEW-mode search/chunking classes off `TedProcessorProperties.search` +and onto `DipSearchProperties` (`dip.search.*`). + +## New config class +- `at.procon.dip.search.config.DipSearchProperties` + +## Classes switched to `DipSearchProperties` +- `PostgresFullTextSearchEngine` +- `PostgresTrigramSearchEngine` +- `PgVectorSemanticSearchEngine` +- `DefaultSearchResultFusionService` +- `DefaultSearchOrchestrator` +- `SearchLexicalIndexStartupRunner` +- `ChunkedLongTextRepresentationBuilder` + +## Additional cleanup +These classes are also marked `NEW`-only in this patch. 
+ +## Effect +After Patch G, the generic NEW-mode search/chunking path no longer depends on +`TedProcessorProperties.search`. That leaves `TedProcessorProperties` much closer to +legacy-only ownership. \ No newline at end of file diff --git a/docs/architecture/RUNTIME_SPLIT_PATCH_H.md b/docs/architecture/RUNTIME_SPLIT_PATCH_H.md new file mode 100644 index 0000000..d502ba6 --- /dev/null +++ b/docs/architecture/RUNTIME_SPLIT_PATCH_H.md @@ -0,0 +1,17 @@ +# Runtime split Patch H + +Patch H is a final cleanup / verification step after the previous split patches. + +## What it does +- makes `TedProcessorProperties` explicitly `LEGACY`-only +- removes the stale `TedProcessorProperties` import/comment from `DocumentIntelligencePlatformApplication` +- adds a regression test that fails if NEW runtime classes reintroduce a dependency on `TedProcessorProperties` +- adds a simple `application-legacy.yml` profile file + +## Why this matters +After the NEW search/ingestion/projection classes are moved to: +- `DipSearchProperties` +- `DipIngestionProperties` +- `TedProjectionProperties` + +`TedProcessorProperties` should be owned strictly by the legacy runtime graph. \ No newline at end of file diff --git a/docs/architecture/RUNTIME_SPLIT_PATCH_I.md b/docs/architecture/RUNTIME_SPLIT_PATCH_I.md new file mode 100644 index 0000000..c6b0c81 --- /dev/null +++ b/docs/architecture/RUNTIME_SPLIT_PATCH_I.md @@ -0,0 +1,21 @@ +# Runtime split Patch I + +Patch I extracts the remaining legacy vectorization cluster off `TedProcessorProperties` +and onto a dedicated legacy-only config class. 
+ +## New config class +- `at.procon.ted.config.LegacyVectorizationProperties` + - prefix: `legacy.ted.vectorization.*` + +## Classes switched off `TedProcessorProperties` +- `GenericVectorizationRoute` +- `DocumentEmbeddingProcessingService` +- `ConfiguredEmbeddingModelStartupRunner` +- `GenericVectorizationStartupRunner` + +## Additional cleanup +These classes are also marked `LEGACY`-only via `@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)`. + +## Effect +The `at.procon.dip.vectorization.*` package now clearly belongs to the old runtime graph and no longer pulls +its settings from the shared monolithic `TedProcessorProperties`. \ No newline at end of file diff --git a/docs/architecture/RUNTIME_SPLIT_PATCH_J.md b/docs/architecture/RUNTIME_SPLIT_PATCH_J.md new file mode 100644 index 0000000..49ae825 --- /dev/null +++ b/docs/architecture/RUNTIME_SPLIT_PATCH_J.md @@ -0,0 +1,45 @@ +# Runtime split Patch J + +Patch J is a broader cleanup patch for the **actual current codebase**. + +It adds the missing runtime/config split scaffolding and rewires the remaining NEW-mode classes +that still injected `TedProcessorProperties`. 
+ +## Added +- `dip.runtime` infrastructure + - `RuntimeMode` + - `RuntimeModeProperties` + - `@ConditionalOnRuntimeMode` + - `RuntimeModeCondition` +- `DipSearchProperties` +- `DipIngestionProperties` +- `TedProjectionProperties` + +## Rewired off `TedProcessorProperties` +### NEW search/chunking +- `PostgresFullTextSearchEngine` +- `PostgresTrigramSearchEngine` +- `PgVectorSemanticSearchEngine` +- `DefaultSearchOrchestrator` +- `SearchLexicalIndexStartupRunner` +- `DefaultSearchResultFusionService` +- `ChunkedLongTextRepresentationBuilder` + +### NEW ingestion/projection +- `GenericDocumentImportService` +- `GenericFileSystemIngestionRoute` +- `GenericDocumentImportController` +- `MailDocumentIngestionAdapter` +- `TedPackageDocumentIngestionAdapter` +- `TedPackageChildImportProcessor` +- `TedNoticeProjectionService` +- `TedProjectionStartupRunner` + +## Additional behavior +- `GenericDocumentImportService` now hands embedding work off to the new embedding subsystem + via `RepresentationEmbeddingOrchestrator` and resolves the default model through + `EmbeddingModelRegistry` / `EmbeddingModelCatalogService`. + +## Notes +This patch intentionally targets the `TedProcessorProperties` usages that still remain in NEW-mode classes of the current codebase. +It assumes the new embedding subsystem already exists. 
\ No newline at end of file diff --git a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java index b7f2d7e..2d7011d 100644 --- a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java +++ b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java @@ -1,6 +1,5 @@ package at.procon.dip; -import at.procon.ted.config.TedProcessorProperties; import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.context.properties.EnableConfigurationProperties; @@ -17,7 +16,6 @@ import org.springframework.scheduling.annotation.EnableAsync; */ @SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"}) @EnableAsync -//@EnableConfigurationProperties(TedProcessorProperties.class) @EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity"}) @EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository"}) public class DocumentIntelligencePlatformApplication { diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java index e5e1b99..85e0b63 100644 --- a/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java @@ -38,7 +38,7 @@ public interface DocumentEmbeddingRepository extends JpaRepository(); + map.put("text", request.texts().getFirst()); + map.put("isQuery", false); + 
HttpRequest.Builder builder = HttpRequest.newBuilder() .uri(URI.create(trimTrailingSlash(providerConfig.baseUrl()) + "/embed")) .timeout(providerConfig.readTimeout() == null ? Duration.ofSeconds(60) : providerConfig.readTimeout()) .header("Content-Type", "application/json") - .POST(HttpRequest.BodyPublishers.ofString(objectMapper.writeValueAsString(payload), StandardCharsets.UTF_8)); + .header("documentId", UUID.randomUUID().toString()) + .POST(HttpRequest.BodyPublishers.ofString(objectMapper.writeValueAsString(map), StandardCharsets.UTF_8)); if (providerConfig.apiKey() != null && !providerConfig.apiKey().isBlank()) { builder.header("Authorization", "Bearer " + providerConfig.apiKey()); @@ -84,21 +91,17 @@ public class ExternalHttpEmbeddingProvider implements EmbeddingProvider { throw new IllegalStateException("Embedding provider returned status %d: %s".formatted(response.statusCode(), response.body())); } - ProviderResponse parsed = objectMapper.readValue(response.body(), ProviderResponse.class); + EmbedResponse parsed = objectMapper.readValue(response.body(), EmbedResponse.class); List vectors = new ArrayList<>(); - if (parsed.embeddings != null) { - for (List embedding : parsed.embeddings) { - vectors.add(toArray(embedding)); - } - } else if (parsed.embedding != null) { - vectors.add(toArray(parsed.embedding)); + if (parsed.embedding != null) { + vectors.add(toArray(toList(parsed.embedding))); } return new EmbeddingProviderResult( model, vectors, - parsed.warnings == null ? List.of() : parsed.warnings, - parsed.requestId, + null, //parsed.warnings == null ? 
List.of() : parsed.warnings, + null, //parsed.requestId, parsed.tokenCount ); } catch (InterruptedException e) { @@ -109,6 +112,17 @@ public class ExternalHttpEmbeddingProvider implements EmbeddingProvider { } } + public static List toList(float[] arr) { + if (arr == null) { + return null; + } + List list = new ArrayList<>(arr.length); + for (float v : arr) { + list.add(v); + } + return list; + } + private float[] toArray(List embedding) { float[] result = new float[embedding.size()]; for (int i = 0; i < embedding.size(); i++) { @@ -148,4 +162,74 @@ public class ExternalHttpEmbeddingProvider implements EmbeddingProvider { @JsonProperty("token_count") public Integer tokenCount; } + + /** + * Request model for embedding service. + * Matches Python FastAPI EmbedRequest model with snake_case field names. + */ + public static class EmbedRequest { + @JsonProperty("text") + public String text; + + @JsonProperty("is_query") + public boolean isQuery; + + public EmbedRequest() {} + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + @JsonProperty("is_query") + public boolean isIsQuery() { + return isQuery; + } + + @JsonProperty("is_query") + public void setIsQuery(boolean isQuery) { + this.isQuery = isQuery; + } + } + + /** + * Response model for embedding service. 
+ */ + public static class EmbedResponse { + public float[] embedding; + public int dimensions; + @JsonProperty("token_count") + public int tokenCount; + + public EmbedResponse() {} + + public float[] getEmbedding() { + return embedding; + } + + public void setEmbedding(float[] embedding) { + this.embedding = embedding; + } + + public int getDimensions() { + return dimensions; + } + + public void setDimensions(int dimensions) { + this.dimensions = dimensions; + } + + @JsonProperty("token_count") + public int getTokenCount() { + return tokenCount; + } + + @JsonProperty("token_count") + public void setTokenCount(int tokenCount) { + this.tokenCount = tokenCount; + } + } } diff --git a/src/main/java/at/procon/dip/embedding/service/EmbeddingPersistenceService.java b/src/main/java/at/procon/dip/embedding/service/EmbeddingPersistenceService.java index 502b477..de63639 100644 --- a/src/main/java/at/procon/dip/embedding/service/EmbeddingPersistenceService.java +++ b/src/main/java/at/procon/dip/embedding/service/EmbeddingPersistenceService.java @@ -47,7 +47,7 @@ public class EmbeddingPersistenceService { float[] vector = result.vectors().getFirst(); embeddingRepository.updateEmbeddingVector( embeddingId, - EmbeddingVectorCodec.toPgVector(vector), + vector, //EmbeddingVectorCodec.toPgVector(vector), result.tokenCount(), vector.length ); diff --git a/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java index edf9ebd..452a527 100644 --- a/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java +++ b/src/main/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapter.java @@ -17,7 +17,9 @@ import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.util.DocumentImportSupport; -import 
at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.ingestion.config.DipIngestionProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import at.procon.ted.service.attachment.AttachmentExtractor; import at.procon.ted.service.attachment.ZipExtractionService; import java.time.OffsetDateTime; @@ -30,11 +32,12 @@ import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Component; @Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor @Slf4j public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter { - private final TedProcessorProperties properties; + private final DipIngestionProperties properties; private final GenericDocumentImportService importService; private final MailMessageExtractionService mailExtractionService; private final DocumentRelationService relationService; @@ -43,8 +46,8 @@ public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter { @Override public boolean supports(SourceDescriptor sourceDescriptor) { return sourceDescriptor.sourceType() == SourceType.MAIL - && properties.getGenericIngestion().isEnabled() - && properties.getGenericIngestion().isMailAdapterEnabled(); + && properties.isEnabled() + && properties.isMailAdapterEnabled(); } @Override @@ -62,7 +65,7 @@ public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter { if (!parsed.recipients().isEmpty()) rootAttributes.put("to", String.join(", ", parsed.recipients())); rootAttributes.putIfAbsent("title", parsed.subject() != null ? 
parsed.subject() : sourceDescriptor.fileName()); rootAttributes.put("attachmentCount", Integer.toString(parsed.attachments().size())); - rootAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId()); + rootAttributes.put("importBatchId", properties.getMailImportBatchId()); ImportedDocumentResult rootResult = importService.importDocument(new SourceDescriptor( accessContext, @@ -93,13 +96,13 @@ public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter { private void importAttachment(java.util.UUID parentDocumentId, DocumentAccessContext accessContext, SourceDescriptor parentSource, MailAttachment attachment, List documents, List warnings, int sortOrder, int depth) { - boolean expandableWrapper = properties.getGenericIngestion().isExpandMailZipAttachments() + boolean expandableWrapper = properties.isExpandMailZipAttachments() && zipExtractionService.canHandle(attachment.fileName(), attachment.contentType()); Map attachmentAttributes = new LinkedHashMap<>(); attachmentAttributes.put("title", attachment.fileName()); attachmentAttributes.put("mailSourceIdentifier", parentSource.sourceIdentifier()); - attachmentAttributes.put("importBatchId", properties.getGenericIngestion().getMailImportBatchId()); + attachmentAttributes.put("importBatchId", properties.getMailImportBatchId()); if (expandableWrapper) { attachmentAttributes.put("wrapperDocument", Boolean.TRUE.toString()); } @@ -144,11 +147,11 @@ public class MailDocumentIngestionAdapter implements DocumentIngestionAdapter { } private DocumentAccessContext defaultMailAccessContext() { - String tenantKey = properties.getGenericIngestion().getMailDefaultOwnerTenantKey(); + String tenantKey = properties.getMailDefaultOwnerTenantKey(); if (tenantKey == null || tenantKey.isBlank()) { - tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey(); + tenantKey = properties.getDefaultOwnerTenantKey(); } - DocumentVisibility visibility = 
properties.getGenericIngestion().getMailDefaultVisibility(); + DocumentVisibility visibility = properties.getMailDefaultVisibility(); TenantRef tenant = (tenantKey == null || tenantKey.isBlank()) ? null : new TenantRef(null, tenantKey, tenantKey); if (tenant == null && visibility == DocumentVisibility.TENANT) { visibility = DocumentVisibility.RESTRICTED; diff --git a/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java index bcc380a..45429fa 100644 --- a/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java +++ b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java @@ -9,7 +9,9 @@ import at.procon.dip.ingestion.spi.DocumentIngestionAdapter; import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.ingestion.config.DipIngestionProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.nio.file.Files; import java.nio.file.Path; import java.time.OffsetDateTime; @@ -26,11 +28,12 @@ import org.springframework.transaction.annotation.Transactional; import org.springframework.util.StringUtils; @Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor @Slf4j public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdapter { - private final TedProcessorProperties properties; + private final DipIngestionProperties properties; private final GenericDocumentImportService importService; private final TedPackageExpansionService expansionService; private final TedPackageChildImportProcessor childImportProcessor; @@ -38,8 +41,8 @@ public class TedPackageDocumentIngestionAdapter implements 
DocumentIngestionAdap @Override public boolean supports(SourceDescriptor sourceDescriptor) { return sourceDescriptor.sourceType() == at.procon.dip.domain.document.SourceType.TED_PACKAGE - && properties.getGenericIngestion().isEnabled() - && properties.getGenericIngestion().isTedPackageAdapterEnabled(); + && properties.isEnabled() + && properties.isTedPackageAdapterEnabled(); } @Override @@ -51,7 +54,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap rootAttributes.putIfAbsent("packageId", sourceDescriptor.sourceIdentifier()); rootAttributes.putIfAbsent("title", sourceDescriptor.fileName() != null ? sourceDescriptor.fileName() : sourceDescriptor.sourceIdentifier()); rootAttributes.put("wrapperDocument", Boolean.TRUE.toString()); - rootAttributes.put("importBatchId", properties.getGenericIngestion().getTedPackageImportBatchId()); + rootAttributes.put("importBatchId", properties.getTedPackageImportBatchId()); ImportedDocumentResult packageDocument = importService.importDocument(new SourceDescriptor( sourceDescriptor.accessContext() == null ? 
DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(), diff --git a/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java b/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java index a0a16b4..089c45e 100644 --- a/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java +++ b/src/main/java/at/procon/dip/ingestion/camel/GenericFileSystemIngestionRoute.java @@ -7,7 +7,9 @@ import at.procon.dip.domain.tenant.TenantRef; import at.procon.dip.ingestion.service.DocumentIngestionGateway; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.ingestion.config.DipIngestionProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.nio.file.Files; import java.nio.file.Path; import java.time.OffsetDateTime; @@ -21,21 +23,22 @@ import org.springframework.stereotype.Component; import org.springframework.util.StringUtils; @Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor @Slf4j public class GenericFileSystemIngestionRoute extends RouteBuilder { - private final TedProcessorProperties properties; + private final DipIngestionProperties properties; private final DocumentIngestionGateway ingestionGateway; @Override public void configure() { - if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isFileSystemEnabled()) { + if (!properties.isEnabled() || !properties.isFileSystemEnabled()) { log.info("Phase 4 generic filesystem ingestion route disabled"); return; } - var config = properties.getGenericIngestion(); + var config = properties; log.info("Configuring Phase 4 generic filesystem ingestion from {}", config.getInputDirectory()); 
fromF("file:%s?recursive=true&include=%s&delay=%d&maxMessagesPerPoll=%d&move=%s&moveFailed=%s", @@ -58,7 +61,7 @@ public class GenericFileSystemIngestionRoute extends RouteBuilder { } byte[] payload = Files.readAllBytes(path); Map attributes = new LinkedHashMap<>(); - String languageCode = properties.getGenericIngestion().getDefaultLanguageCode(); + String languageCode = properties.getDefaultLanguageCode(); if (StringUtils.hasText(languageCode)) { attributes.put("languageCode", languageCode); } @@ -80,8 +83,8 @@ public class GenericFileSystemIngestionRoute extends RouteBuilder { } private DocumentAccessContext buildDefaultAccessContext() { - String ownerTenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey(); - DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility(); + String ownerTenantKey = properties.getDefaultOwnerTenantKey(); + DocumentVisibility visibility = properties.getDefaultVisibility(); if (!StringUtils.hasText(ownerTenantKey)) { return new DocumentAccessContext(null, visibility); } diff --git a/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java b/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java new file mode 100644 index 0000000..4986c4e --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java @@ -0,0 +1,59 @@ +package at.procon.dip.ingestion.config; + +import at.procon.dip.domain.access.DocumentVisibility; +import jakarta.validation.constraints.NotBlank; +import jakarta.validation.constraints.Positive; +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Configuration; + +@Configuration +@ConfigurationProperties(prefix = "dip.ingestion") +@Data +public class DipIngestionProperties { + + private boolean enabled = false; + private boolean fileSystemEnabled = false; + private boolean restUploadEnabled = true; + private String 
inputDirectory = "/ted.europe/generic-input"; + private String filePattern = ".*\\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$"; + private String processedDirectory = ".dip-processed"; + private String errorDirectory = ".dip-error"; + + @Positive + private long pollInterval = 15000; + + @Positive + private int maxMessagesPerPoll = 10; + + private String defaultOwnerTenantKey; + private DocumentVisibility defaultVisibility = DocumentVisibility.PUBLIC; + private String defaultLanguageCode; + + private boolean storeOriginalBinaryInDb = true; + + @Positive + private int maxBinaryBytesInDb = 5242880; + + private boolean deduplicateByContentHash = true; + private boolean storeOriginalContentForWrapperDocuments = true; + private boolean vectorizePrimaryRepresentationOnly = true; + + @NotBlank + private String importBatchId = "phase4-generic"; + + private boolean tedPackageAdapterEnabled = true; + private boolean mailAdapterEnabled = false; + + private String mailDefaultOwnerTenantKey; + private DocumentVisibility mailDefaultVisibility = DocumentVisibility.TENANT; + private boolean expandMailZipAttachments = true; + + @NotBlank + private String tedPackageImportBatchId = "phase41-ted-package"; + + private boolean gatewayOnlyForTedPackages = false; + + @NotBlank + private String mailImportBatchId = "phase41-mail"; +} \ No newline at end of file diff --git a/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java b/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java index 86b91c3..9653c14 100644 --- a/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java +++ b/src/main/java/at/procon/dip/ingestion/controller/GenericDocumentImportController.java @@ -11,7 +11,9 @@ import at.procon.dip.ingestion.service.DocumentIngestionGateway; import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import 
at.procon.dip.ingestion.spi.SourceDescriptor; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.ingestion.config.DipIngestionProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.time.OffsetDateTime; import java.util.LinkedHashMap; import java.util.Map; @@ -28,10 +30,11 @@ import org.springframework.web.multipart.MultipartFile; @RestController @RequestMapping("/v1/dip/import") +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor public class GenericDocumentImportController { - private final TedProcessorProperties properties; + private final DipIngestionProperties properties; private final DocumentIngestionGateway ingestionGateway; @PostMapping(path = "/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) @@ -99,7 +102,7 @@ public class GenericDocumentImportController { } private void ensureRestUploadEnabled() { - if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isRestUploadEnabled()) { + if (!properties.isEnabled() || !properties.isRestUploadEnabled()) { throw new IllegalStateException("Generic REST import is disabled"); } } @@ -107,7 +110,7 @@ public class GenericDocumentImportController { private DocumentAccessContext buildAccessContext(String ownerTenantKey, DocumentVisibility visibility) { DocumentVisibility effectiveVisibility = visibility != null ? 
visibility - : properties.getGenericIngestion().getDefaultVisibility(); + : properties.getDefaultVisibility(); if (!StringUtils.hasText(ownerTenantKey)) { return new DocumentAccessContext(null, effectiveVisibility); } diff --git a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java index ad4cb38..3fbaa2f 100644 --- a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java +++ b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java @@ -10,13 +10,10 @@ import at.procon.dip.domain.document.DocumentStatus; import at.procon.dip.domain.document.StorageType; import at.procon.dip.domain.document.entity.Document; import at.procon.dip.domain.document.entity.DocumentContent; -import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; import at.procon.dip.domain.document.entity.DocumentSource; -import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; import at.procon.dip.domain.document.repository.DocumentRepository; import at.procon.dip.domain.document.repository.DocumentSourceRepository; import at.procon.dip.domain.document.service.DocumentContentService; -import at.procon.dip.domain.document.service.DocumentEmbeddingService; import at.procon.dip.domain.document.service.DocumentRepresentationService; import at.procon.dip.domain.document.service.DocumentService; import at.procon.dip.domain.document.service.DocumentSourceService; @@ -24,7 +21,6 @@ import at.procon.dip.domain.document.service.command.AddDocumentContentCommand; import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand; import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand; import at.procon.dip.domain.document.service.command.CreateDocumentCommand; -import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand; import 
at.procon.dip.extraction.service.DocumentExtractionService; import at.procon.dip.extraction.spi.ExtractionRequest; import at.procon.dip.extraction.spi.ExtractionResult; @@ -32,18 +28,19 @@ import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.util.DocumentImportSupport; +import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.registry.EmbeddingModelRegistry; +import at.procon.dip.embedding.service.EmbeddingModelCatalogService; +import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator; +import at.procon.dip.ingestion.config.DipIngestionProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import at.procon.dip.normalization.service.TextRepresentationBuildService; import at.procon.dip.processing.service.StructuredDocumentProcessingService; import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.TextRepresentationDraft; import at.procon.dip.processing.spi.DocumentProcessingPolicy; import at.procon.dip.processing.spi.StructuredProcessingRequest; -import at.procon.dip.embedding.config.EmbeddingProperties; -import at.procon.dip.embedding.service.EmbeddingModelCatalogService; -import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator; -import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; -import at.procon.dip.runtime.config.RuntimeMode; -import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.util.HashUtils; import java.nio.charset.StandardCharsets; import java.time.OffsetDateTime; @@ -63,27 +60,26 @@ import org.springframework.util.StringUtils; * Phase 4 generic import pipeline that persists arbitrary document types into the DOC model. 
*/ @Service +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor @Slf4j -@ConditionalOnRuntimeMode(RuntimeMode.NEW) public class GenericDocumentImportService { - private final TedProcessorProperties properties; + private final DipIngestionProperties properties; private final DocumentRepository documentRepository; private final DocumentSourceRepository documentSourceRepository; - private final DocumentEmbeddingRepository documentEmbeddingRepository; private final DocumentService documentService; private final DocumentSourceService documentSourceService; private final DocumentContentService documentContentService; private final DocumentRepresentationService documentRepresentationService; - private final DocumentEmbeddingService documentEmbeddingService; - private final EmbeddingProperties embeddingProperties; - private final EmbeddingModelCatalogService embeddingModelCatalogService; - private final RepresentationEmbeddingOrchestrator representationEmbeddingOrchestrator; private final DocumentClassificationService classificationService; private final DocumentExtractionService extractionService; private final TextRepresentationBuildService representationBuildService; private final StructuredDocumentProcessingService structuredProcessingService; + private final EmbeddingProperties embeddingProperties; + private final EmbeddingModelRegistry embeddingModelRegistry; + private final EmbeddingModelCatalogService embeddingModelCatalogService; + private final RepresentationEmbeddingOrchestrator representationEmbeddingOrchestrator; @Transactional public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) { @@ -95,7 +91,7 @@ public class GenericDocumentImportService { ? 
defaultAccessContext() : sourceDescriptor.accessContext(); - if (properties.getGenericIngestion().isDeduplicateByContentHash()) { + if (properties.isDeduplicateByContentHash()) { Optional existing = resolveDeduplicatedDocument(dedupHash, accessContext); if (existing.isPresent()) { Document document = existing.get(); @@ -257,8 +253,8 @@ public class GenericDocumentImportService { } private DocumentAccessContext defaultAccessContext() { - String tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey(); - DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility(); + String tenantKey = properties.getDefaultOwnerTenantKey(); + DocumentVisibility visibility = properties.getDefaultVisibility(); if (!StringUtils.hasText(tenantKey)) { return new DocumentAccessContext(null, visibility); } @@ -303,7 +299,7 @@ public class GenericDocumentImportService { String importBatchId = sourceDescriptor.attributes() != null && StringUtils.hasText(sourceDescriptor.attributes().get("importBatchId")) ? 
sourceDescriptor.attributes().get("importBatchId") - : properties.getGenericIngestion().getImportBatchId(); + : properties.getImportBatchId(); documentSourceService.addSource(new AddDocumentSourceCommand( document.getId(), @@ -324,7 +320,7 @@ public class GenericDocumentImportService { if (sourceDescriptor.originalContentStoragePolicy() == OriginalContentStoragePolicy.SKIP) { return false; } - if (properties.getGenericIngestion().isStoreOriginalContentForWrapperDocuments()) { + if (properties.isStoreOriginalContentForWrapperDocuments()) { return true; } return !isWrapperDocument(sourceDescriptor); @@ -364,9 +360,9 @@ public class GenericDocumentImportService { } private boolean shouldStoreBinaryInDb(byte[] binaryContent) { - return properties.getGenericIngestion().isStoreOriginalBinaryInDb() + return properties.isStoreOriginalBinaryInDb() && binaryContent != null - && binaryContent.length <= properties.getGenericIngestion().getMaxBinaryBytesInDb(); + && binaryContent.length <= properties.getMaxBinaryBytesInDb(); } private Map persistDerivedContent(Document document, @@ -404,13 +400,12 @@ public class GenericDocumentImportService { return; } - String embeddingModelKey = resolveNewRuntimeEmbeddingModelKey(); - if (embeddingModelKey != null) { + String embeddingModelKey = null; + if (embeddingProperties.isEnabled()) { + embeddingModelKey = embeddingModelRegistry.getRequiredDefaultDocumentModelKey(); embeddingModelCatalogService.ensureRegistered(embeddingModelKey); } - java.util.List queuedRepresentationIds = new java.util.ArrayList<>(); - for (TextRepresentationDraft draft : drafts) { if (!StringUtils.hasText(draft.textBody())) { continue; @@ -435,30 +430,12 @@ public class GenericDocumentImportService { )); if (embeddingModelKey != null && shouldQueueEmbedding(draft)) { - queuedRepresentationIds.add(representation.getId()); + representationEmbeddingOrchestrator.enqueueRepresentation(document.getId(), representation.getId(), embeddingModelKey); } } - - if 
(embeddingModelKey != null) { - for (UUID representationId : queuedRepresentationIds) { - representationEmbeddingOrchestrator.enqueueRepresentation(document.getId(), representationId, embeddingModelKey); - } - } - documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); } - private String resolveNewRuntimeEmbeddingModelKey() { - if (!embeddingProperties.isEnabled() || !embeddingProperties.getJobs().isEnabled()) { - return null; - } - if (!StringUtils.hasText(embeddingProperties.getDefaultDocumentModel())) { - log.warn("NEW runtime embedding is enabled, but dip.embedding.default-document-model is not configured; skipping embedding job creation"); - return null; - } - return embeddingProperties.getDefaultDocumentModel(); - } - private DocumentContent resolveLinkedContent(TextRepresentationDraft draft, DocumentContent originalContent, Map derivedContent) { @@ -472,7 +449,7 @@ public class GenericDocumentImportService { if (draft.queueForEmbedding() != null) { return draft.queueForEmbedding(); } - return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true; + return properties.isVectorizePrimaryRepresentationOnly() ? 
draft.primary() : true; } private ExtractionResult mergeExtractionResults(ExtractionResult base, ExtractionResult override) { diff --git a/src/main/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessor.java b/src/main/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessor.java index 1b0fa0d..6f21a74 100644 --- a/src/main/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessor.java +++ b/src/main/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessor.java @@ -10,7 +10,9 @@ import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.service.TedPackageExpansionService.TedPackageEntry; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.ingestion.config.DipIngestionProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.time.OffsetDateTime; import java.util.LinkedHashMap; import java.util.Map; @@ -21,12 +23,13 @@ import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; @Service +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor public class TedPackageChildImportProcessor { private final GenericDocumentImportService importService; private final DocumentRelationService relationService; - private final TedProcessorProperties properties; + private final DipIngestionProperties properties; @Transactional(propagation = Propagation.REQUIRES_NEW) public ChildImportResult processChild(UUID packageDocumentId, @@ -46,7 +49,7 @@ public class TedPackageChildImportProcessor { childAttributes.put("packageId", packageSourceIdentifier); childAttributes.put("archivePath", entry.archivePath()); childAttributes.put("title", entry.fileName()); - childAttributes.put("importBatchId", 
properties.getGenericIngestion().getTedPackageImportBatchId()); + childAttributes.put("importBatchId", properties.getTedPackageImportBatchId()); ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor( accessContext == null ? DocumentAccessContext.publicDocument() : accessContext, diff --git a/src/main/java/at/procon/dip/normalization/impl/ChunkedLongTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/ChunkedLongTextRepresentationBuilder.java index 42f78f5..cd4e95d 100644 --- a/src/main/java/at/procon/dip/normalization/impl/ChunkedLongTextRepresentationBuilder.java +++ b/src/main/java/at/procon/dip/normalization/impl/ChunkedLongTextRepresentationBuilder.java @@ -6,7 +6,9 @@ import at.procon.dip.domain.document.RepresentationType; import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.TextRepresentationBuilder; import at.procon.dip.normalization.spi.TextRepresentationDraft; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.search.config.DipSearchProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.util.ArrayList; import java.util.List; import lombok.RequiredArgsConstructor; @@ -21,7 +23,7 @@ public class ChunkedLongTextRepresentationBuilder implements TextRepresentationB public static final String BUILDER_KEY = "long-text-chunker"; - private final TedProcessorProperties properties; + private final DipSearchProperties properties; @Override public boolean supports(DocumentType documentType) { @@ -30,7 +32,7 @@ public class ChunkedLongTextRepresentationBuilder implements TextRepresentationB @Override public List build(RepresentationBuildRequest request) { - if (!properties.getSearch().isChunkingEnabled()) { + if (!properties.isChunkingEnabled()) { return List.of(); } @@ -42,8 +44,8 @@ public class ChunkedLongTextRepresentationBuilder implements 
TextRepresentationB return List.of(); } - int target = Math.max(400, properties.getSearch().getChunkTargetChars()); - int overlap = Math.max(0, Math.min(target / 3, properties.getSearch().getChunkOverlapChars())); + int target = Math.max(400, properties.getChunkTargetChars()); + int overlap = Math.max(0, Math.min(target / 3, properties.getChunkOverlapChars())); if (baseText.length() <= target + overlap) { return List.of(); } @@ -51,7 +53,7 @@ public class ChunkedLongTextRepresentationBuilder implements TextRepresentationB List drafts = new ArrayList<>(); int start = 0; int chunkIndex = 0; - while (start < baseText.length() && chunkIndex < properties.getSearch().getMaxChunksPerDocument()) { + while (start < baseText.length() && chunkIndex < properties.getMaxChunksPerDocument()) { int end = Math.min(baseText.length(), start + target); if (end < baseText.length()) { int boundary = findBoundary(baseText, end, Math.min(baseText.length(), end + 160)); @@ -94,4 +96,4 @@ public class ChunkedLongTextRepresentationBuilder implements TextRepresentationB } return preferred; } -} +} \ No newline at end of file diff --git a/src/main/java/at/procon/dip/runtime/condition/RuntimeModeCondition.java b/src/main/java/at/procon/dip/runtime/condition/RuntimeModeCondition.java index 50c818d..4b53bd0 100644 --- a/src/main/java/at/procon/dip/runtime/condition/RuntimeModeCondition.java +++ b/src/main/java/at/procon/dip/runtime/condition/RuntimeModeCondition.java @@ -1,12 +1,18 @@ package at.procon.dip.runtime.condition; import at.procon.dip.runtime.config.RuntimeMode; -import java.util.Map; +import org.springframework.boot.context.properties.bind.Binder; +import org.springframework.context.EnvironmentAware; import org.springframework.context.annotation.Condition; import org.springframework.context.annotation.ConditionContext; +import org.springframework.core.env.Environment; import org.springframework.core.type.AnnotatedTypeMetadata; -public class RuntimeModeCondition implements Condition { 
+import java.util.Map; + +public class RuntimeModeCondition implements Condition, EnvironmentAware { + + private Environment environment; @Override public boolean matches(ConditionContext context, AnnotatedTypeMetadata metadata) { @@ -25,4 +31,9 @@ public class RuntimeModeCondition implements Condition { } return actual == expected; } + + @Override + public void setEnvironment(Environment environment) { + this.environment = environment; + } } diff --git a/src/main/java/at/procon/dip/search/config/DipSearchProperties.java b/src/main/java/at/procon/dip/search/config/DipSearchProperties.java index 934084d..e4e5e0b 100644 --- a/src/main/java/at/procon/dip/search/config/DipSearchProperties.java +++ b/src/main/java/at/procon/dip/search/config/DipSearchProperties.java @@ -1,49 +1,81 @@ package at.procon.dip.search.config; +import jakarta.validation.constraints.Min; +import jakarta.validation.constraints.Positive; import lombok.Data; import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.context.annotation.Configuration; +import org.springframework.validation.annotation.Validated; +/** + * New-runtime generic search configuration. + * + *

<p>This property tree is intentionally separated from the legacy + * {@code ted.search.*} settings. NEW-mode search/semantic/lexical code should + * depend on {@code dip.search.*} only.</p>

+ */ @Configuration @ConfigurationProperties(prefix = "dip.search") @Data +@Validated public class DipSearchProperties { - private Lexical lexical = new Lexical(); - private Semantic semantic = new Semantic(); - private Fusion fusion = new Fusion(); - private Chunking chunking = new Chunking(); - - @Data - public static class Lexical { - private double trigramSimilarityThreshold = 0.12; - private int fulltextCandidateLimit = 120; - private int trigramCandidateLimit = 120; - } - - @Data - public static class Semantic { - private double similarityThreshold = 0.7; - private int semanticCandidateLimit = 120; - private String defaultModelKey; - } - - @Data - public static class Fusion { - private double fulltextWeight = 0.35; - private double trigramWeight = 0.20; - private double semanticWeight = 0.45; - private double recencyBoostWeight = 0.05; - private int recencyHalfLifeDays = 30; - private int debugTopHitsPerEngine = 10; - } - - @Data - public static class Chunking { - private boolean enabled = true; - private int targetChars = 1800; - private int overlapChars = 200; - private int maxChunksPerDocument = 12; - private int startupLexicalBackfillLimit = 500; - } -} + /** Default page size for search results. */ + @Positive + private int defaultPageSize = 20; + + /** Maximum allowed page size. */ + @Positive + private int maxPageSize = 100; + + /** Semantic similarity threshold (normalized score). */ + private double similarityThreshold = 0.7d; + + /** Minimum trigram similarity for fuzzy lexical matches. */ + private double trigramSimilarityThreshold = 0.12d; + + /** Candidate limits per search engine before fusion/collapse. */ + @Positive + private int fulltextCandidateLimit = 120; + + @Positive + private int trigramCandidateLimit = 120; + + @Positive + private int semanticCandidateLimit = 120; + + /** Hybrid fusion weights. 
*/ + private double fulltextWeight = 0.35d; + private double trigramWeight = 0.20d; + private double semanticWeight = 0.45d; + + /** Enable chunk representations for long documents. */ + private boolean chunkingEnabled = true; + + /** Target chunk size in characters for CHUNK representations. */ + @Positive + private int chunkTargetChars = 1800; + + /** Overlap between consecutive chunks in characters. */ + @Min(0) + private int chunkOverlapChars = 200; + + /** Maximum CHUNK representations generated per document. */ + @Positive + private int maxChunksPerDocument = 12; + + /** Additional score weight for recency. */ + private double recencyBoostWeight = 0.05d; + + /** Half-life in days used for recency decay. */ + @Positive + private int recencyHalfLifeDays = 30; + + /** Startup backfill limit for missing DOC lexical vectors. */ + @Positive + private int startupLexicalBackfillLimit = 500; + + /** Number of hits per engine returned by the debug endpoint. */ + @Positive + private int debugTopHitsPerEngine = 10; +} \ No newline at end of file diff --git a/src/main/java/at/procon/dip/search/engine/fulltext/PostgresFullTextSearchEngine.java b/src/main/java/at/procon/dip/search/engine/fulltext/PostgresFullTextSearchEngine.java index 23452af..91c3b43 100644 --- a/src/main/java/at/procon/dip/search/engine/fulltext/PostgresFullTextSearchEngine.java +++ b/src/main/java/at/procon/dip/search/engine/fulltext/PostgresFullTextSearchEngine.java @@ -5,17 +5,20 @@ import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; import at.procon.dip.search.engine.SearchEngine; import at.procon.dip.search.repository.DocumentFullTextSearchRepository; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.search.config.DipSearchProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.util.List; import lombok.RequiredArgsConstructor; import 
org.springframework.stereotype.Component; @Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor public class PostgresFullTextSearchEngine implements SearchEngine { private final DocumentFullTextSearchRepository repository; - private final TedProcessorProperties properties; + private final DipSearchProperties properties; @Override public SearchEngineType type() { @@ -29,6 +32,6 @@ public class PostgresFullTextSearchEngine implements SearchEngine { @Override public List execute(SearchExecutionContext context) { - return repository.search(context, properties.getSearch().getFulltextCandidateLimit()); + return repository.search(context, properties.getFulltextCandidateLimit()); } } diff --git a/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java b/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java index 6bbca48..b6481a4 100644 --- a/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java +++ b/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java @@ -9,23 +9,23 @@ import at.procon.dip.search.dto.SearchHit; import at.procon.dip.search.engine.SearchEngine; import at.procon.dip.search.repository.DocumentSemanticSearchRepository; import at.procon.dip.search.service.SemanticQueryEmbeddingService; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.search.config.DipSearchProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.util.List; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Component; -import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; -import at.procon.dip.runtime.config.RuntimeMode; @Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor @Slf4j -@ConditionalOnRuntimeMode(RuntimeMode.NEW) public class 
PgVectorSemanticSearchEngine implements SearchEngine { private final EmbeddingProperties embeddingProperties; private final EmbeddingModelRegistry embeddingModelRegistry; - private final TedProcessorProperties properties; + private final DipSearchProperties properties; private final SemanticQueryEmbeddingService queryEmbeddingService; private final DocumentSemanticSearchRepository repository; @@ -56,8 +56,8 @@ public class PgVectorSemanticSearchEngine implements SearchEngine { model.dimensions(), model.distanceMetric(), query.vectorString(), - properties.getSearch().getSemanticCandidateLimit(), - properties.getSearch().getSimilarityThreshold())) + properties.getSemanticCandidateLimit(), + properties.getSimilarityThreshold())) .orElseGet(() -> { log.debug("Semantic search skipped because query embedding could not be generated for model {}", model.modelKey()); return List.of(); diff --git a/src/main/java/at/procon/dip/search/engine/trigram/PostgresTrigramSearchEngine.java b/src/main/java/at/procon/dip/search/engine/trigram/PostgresTrigramSearchEngine.java index dce85c9..9afe5dd 100644 --- a/src/main/java/at/procon/dip/search/engine/trigram/PostgresTrigramSearchEngine.java +++ b/src/main/java/at/procon/dip/search/engine/trigram/PostgresTrigramSearchEngine.java @@ -5,17 +5,20 @@ import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; import at.procon.dip.search.engine.SearchEngine; import at.procon.dip.search.repository.DocumentTrigramSearchRepository; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.search.config.DipSearchProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.util.List; import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Component; @Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor public class PostgresTrigramSearchEngine implements SearchEngine { 
private final DocumentTrigramSearchRepository repository; - private final TedProcessorProperties properties; + private final DipSearchProperties properties; @Override public SearchEngineType type() { @@ -31,7 +34,7 @@ public class PostgresTrigramSearchEngine implements SearchEngine { public List execute(SearchExecutionContext context) { return repository.search( context, - properties.getSearch().getTrigramCandidateLimit(), - properties.getSearch().getTrigramSimilarityThreshold()); + properties.getTrigramCandidateLimit(), + properties.getTrigramSimilarityThreshold()); } } diff --git a/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java b/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java index e61c010..ec4c12c 100644 --- a/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java +++ b/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java @@ -7,7 +7,9 @@ import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; import at.procon.dip.search.dto.SearchResponse; import at.procon.dip.search.dto.SearchSortMode; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.search.config.DipSearchProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.util.ArrayList; import java.util.Comparator; import java.util.EnumMap; @@ -20,11 +22,12 @@ import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Component; @Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor public class DefaultSearchResultFusionService implements SearchResultFusionService { private final SearchScoreNormalizer normalizer; - private final TedProcessorProperties properties; + private final DipSearchProperties properties; @Override public SearchResponse fuse(SearchExecutionContext context, @@ -97,7 +100,7 @@ public class 
DefaultSearchResultFusionService implements SearchResultFusionServi if (hit == null) { return 0.0d; } - TedProcessorProperties.SearchProperties search = properties.getSearch(); + DipSearchProperties search = properties; return switch (engineType) { case POSTGRES_FULLTEXT -> hit.getNormalizedScore() * search.getFulltextWeight(); case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * search.getTrigramWeight(); @@ -110,9 +113,9 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi normalized.forEach((engine, hits) -> { for (SearchHit hit : hits) { double finalScore = switch (engine) { - case POSTGRES_FULLTEXT -> hit.getNormalizedScore() * properties.getSearch().getFulltextWeight(); - case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * properties.getSearch().getTrigramWeight(); - case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * properties.getSearch().getSemanticWeight(); + case POSTGRES_FULLTEXT -> hit.getNormalizedScore() * properties.getFulltextWeight(); + case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * properties.getTrigramWeight(); + case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * properties.getSemanticWeight(); }; merged.add(hit.toBuilder() .finalScore(finalScore + recencyBoost(hit)) @@ -138,13 +141,13 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi } private double recencyBoost(SearchHit hit) { - if (properties.getSearch().getRecencyBoostWeight() <= 0.0d || hit.getCreatedAt() == null) { + if (properties.getRecencyBoostWeight() <= 0.0d || hit.getCreatedAt() == null) { return 0.0d; } - double halfLifeDays = Math.max(1.0d, properties.getSearch().getRecencyHalfLifeDays()); + double halfLifeDays = Math.max(1.0d, properties.getRecencyHalfLifeDays()); double ageDays = Math.max(0.0d, java.time.Duration.between(hit.getCreatedAt(), java.time.OffsetDateTime.now()).toSeconds() / 86400.0d); double normalized = Math.exp(-Math.log(2.0d) * (ageDays / halfLifeDays)); - return normalized * 
properties.getSearch().getRecencyBoostWeight(); + return normalized * properties.getRecencyBoostWeight(); } private int representationPriority(SearchHit hit) { diff --git a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java index 8322c7b..21aeef5 100644 --- a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java +++ b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java @@ -33,7 +33,7 @@ public class DocumentSemanticSearchRepository { throw new IllegalArgumentException("Semantic search requires a distance metric"); } - String vectorType = "public.vector(" + modelDimensions + ")"; + String vectorType = "vector(" + modelDimensions + ")"; String similarityExpr = buildSimilarityExpression(distanceMetric, vectorType); StringBuilder sql = new StringBuilder(""" diff --git a/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java b/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java index 9799168..993ee7b 100644 --- a/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java +++ b/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java @@ -12,7 +12,9 @@ import at.procon.dip.search.engine.SearchEngine; import at.procon.dip.search.plan.SearchPlanner; import at.procon.dip.search.rank.SearchResultFusionService; import at.procon.dip.search.spi.SearchDocumentScope; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.search.config.DipSearchProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -21,10 +23,11 @@ import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Service; @Service +@ConditionalOnRuntimeMode(RuntimeMode.NEW) 
@RequiredArgsConstructor public class DefaultSearchOrchestrator implements SearchOrchestrator { - private final TedProcessorProperties properties; + private final DipSearchProperties properties; private final SearchPlanner planner; private final List engines; private final SearchResultFusionService fusionService; @@ -45,7 +48,7 @@ public class DefaultSearchOrchestrator implements SearchOrchestrator { metricsService.recordSearch(execution.engineResults(), fused.getHits().size(), true); List debugResults = new ArrayList<>(); - int topLimit = properties.getSearch().getDebugTopHitsPerEngine(); + int topLimit = properties.getDebugTopHitsPerEngine(); execution.engineResults().forEach((engine, hits) -> debugResults.add(SearchEngineDebugResult.builder() .engineType(engine) .hitCount(hits.size()) @@ -68,9 +71,9 @@ public class DefaultSearchOrchestrator implements SearchOrchestrator { private SearchExecution executeInternal(SearchRequest request, SearchDocumentScope scope) { int page = request.getPage() == null || request.getPage() < 0 ? 0 : request.getPage(); int requestedSize = request.getSize() == null || request.getSize() <= 0 - ? properties.getSearch().getDefaultPageSize() + ? 
properties.getDefaultPageSize() : request.getSize(); - int size = Math.min(requestedSize, properties.getSearch().getMaxPageSize()); + int size = Math.min(requestedSize, properties.getMaxPageSize()); SearchExecutionContext context = SearchExecutionContext.builder() .request(request) diff --git a/src/main/java/at/procon/dip/search/service/SearchLexicalIndexStartupRunner.java b/src/main/java/at/procon/dip/search/service/SearchLexicalIndexStartupRunner.java index c834c24..3c6fcb9 100644 --- a/src/main/java/at/procon/dip/search/service/SearchLexicalIndexStartupRunner.java +++ b/src/main/java/at/procon/dip/search/service/SearchLexicalIndexStartupRunner.java @@ -1,6 +1,8 @@ package at.procon.dip.search.service; -import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.search.config.DipSearchProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.boot.ApplicationArguments; @@ -8,16 +10,17 @@ import org.springframework.boot.ApplicationRunner; import org.springframework.stereotype.Component; @Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor @Slf4j public class SearchLexicalIndexStartupRunner implements ApplicationRunner { - private final TedProcessorProperties properties; + private final DipSearchProperties properties; private final DocumentLexicalIndexService lexicalIndexService; @Override public void run(ApplicationArguments args) { - int updated = lexicalIndexService.backfillMissingVectors(properties.getSearch().getStartupLexicalBackfillLimit()); + int updated = lexicalIndexService.backfillMissingVectors(properties.getStartupLexicalBackfillLimit()); if (updated > 0) { log.info("Search lexical index startup backfill updated {} representations", updated); } diff --git a/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java 
b/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java index 4606409..3512992 100644 --- a/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java +++ b/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java @@ -2,8 +2,13 @@ package at.procon.dip.vectorization.camel; import at.procon.dip.domain.document.EmbeddingStatus; import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import at.procon.dip.vectorization.service.DocumentEmbeddingProcessingService; +import at.procon.ted.config.LegacyVectorizationProperties; import com.fasterxml.jackson.annotation.JsonProperty; +import java.util.List; +import java.util.UUID; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.Exchange; @@ -13,27 +18,22 @@ import org.apache.camel.model.dataformat.JsonLibrary; import org.springframework.data.domain.PageRequest; import org.springframework.stereotype.Component; -import at.procon.ted.config.TedProcessorProperties; -import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; -import at.procon.dip.runtime.config.RuntimeMode; -import java.util.List; -import java.util.UUID; - /** - * Phase 2 generic vectorization route. - * Uses DOC.doc_text_representation as the source text and DOC.doc_embedding as the write target. + * Legacy generic vectorization route. + * Uses DOC.doc_text_representation as the source text and DOC.doc_embedding as the write target + * but belongs to the old runtime graph and is therefore activated only in LEGACY mode. 
*/ @Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j -@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) public class GenericVectorizationRoute extends RouteBuilder { private static final String ROUTE_ID_TRIGGER = "generic-vectorization-trigger"; private static final String ROUTE_ID_PROCESSOR = "generic-vectorization-processor"; private static final String ROUTE_ID_SCHEDULER = "generic-vectorization-scheduler"; - private final TedProcessorProperties properties; + private final LegacyVectorizationProperties properties; private final DocumentEmbeddingRepository embeddingRepository; private final DocumentEmbeddingProcessingService processingService; @@ -52,163 +52,95 @@ public class GenericVectorizationRoute extends RouteBuilder { @Override public void configure() { - if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) { + if (!properties.isEnabled() || !properties.isGenericPipelineEnabled()) { log.info("Phase 2 generic vectorization route disabled"); return; } - log.info("Configuring generic vectorization routes (phase2=true, apiUrl={}, scheduler={}ms)", - properties.getVectorization().getApiUrl(), - properties.getVectorization().getGenericSchedulerPeriodMs()); + log.info("Configuring generic vectorization routes (legacy mode, apiUrl={}, scheduler={}ms)", + properties.getApiUrl(), + properties.getGenericSchedulerPeriodMs()); onException(Exception.class) .handled(true) .process(exchange -> { UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class); Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class); - String error = exception != null ? exception.getMessage() : "Unknown vectorization error"; + log.error("Generic vectorization failed for embedding {}: {}", embeddingId, + exception != null ? 
exception.getMessage() : "unknown error", exception); if (embeddingId != null) { - try { - processingService.markAsFailed(embeddingId, error); - } catch (Exception nested) { - log.warn("Failed to mark embedding {} as failed: {}", embeddingId, nested.getMessage()); - } + processingService.markAsFailed(embeddingId, + exception != null ? exception.getMessage() : "Unknown vectorization error"); } }) - .to("log:generic-vectorization-error?level=WARN"); + .log(LoggingLevel.WARN, "Generic vectorization exception handled for ${header.embeddingId}"); from("direct:vectorize-embedding") .routeId(ROUTE_ID_TRIGGER) - .doTry() - .to("seda:vectorize-embedding-async?waitForTaskToComplete=Never&size=1000&blockWhenFull=true&timeout=5000") - .doCatch(Exception.class) - .log(LoggingLevel.WARN, "Failed to queue embedding ${header.embeddingId}: ${exception.message}") - .end(); + .setHeader("embeddingId", header("embeddingId")) + .to("seda:vectorize-embedding-async"); - from("seda:vectorize-embedding-async?size=1000") + from("seda:vectorize-embedding-async?concurrentConsumers=1&blockWhenFull=true&size=1000") .routeId(ROUTE_ID_PROCESSOR) .threads().executorService(executorService()) .process(exchange -> { UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class); + if (embeddingId == null) { + exchange.setProperty(Exchange.ROUTE_STOP, Boolean.TRUE); + return; + } + DocumentEmbeddingProcessingService.EmbeddingPayload payload = processingService.prepareEmbeddingForVectorization(embeddingId); if (payload == null) { - exchange.setProperty("skipVectorization", true); + exchange.setProperty(Exchange.ROUTE_STOP, Boolean.TRUE); return; } - - EmbedRequest request = new EmbedRequest(); - request.text = payload.textContent(); - request.isQuery = false; - - exchange.getIn().setHeader("embeddingId", payload.embeddingId()); - exchange.getIn().setHeader("documentId", payload.documentId()); - exchange.getIn().setHeader(Exchange.HTTP_METHOD, "POST"); - 
exchange.getIn().setHeader(Exchange.CONTENT_TYPE, "application/json"); + exchange.getIn().setBody(payload); + }) + .filter(exchangeProperty(Exchange.ROUTE_STOP).isNull()) + .process(exchange -> { + DocumentEmbeddingProcessingService.EmbeddingPayload payload = + exchange.getIn().getBody(DocumentEmbeddingProcessingService.EmbeddingPayload.class); + VectorizationRequest request = new VectorizationRequest(payload.textContent(), false); exchange.getIn().setBody(request); }) - .choice() - .when(exchangeProperty("skipVectorization").isEqualTo(true)) - .log(LoggingLevel.DEBUG, "Skipping generic vectorization for ${header.embeddingId}") - .otherwise() - .marshal().json(JsonLibrary.Jackson) - .setProperty("retryCount", constant(0)) - .setProperty("maxRetries", constant(properties.getVectorization().getMaxRetries())) - .setProperty("vectorizationSuccess", constant(false)) - .loopDoWhile(simple("${exchangeProperty.vectorizationSuccess} == false && ${exchangeProperty.retryCount} < ${exchangeProperty.maxRetries}")) - .process(exchange -> { - Integer retryCount = exchange.getProperty("retryCount", Integer.class); - exchange.setProperty("retryCount", retryCount + 1); - if (retryCount > 0) { - long backoffMs = (long) Math.pow(2, retryCount) * 1000L; - Thread.sleep(backoffMs); - } - }) - .doTry() - .toD(properties.getVectorization().getApiUrl() + "/embed?bridgeEndpoint=true&throwExceptionOnFailure=false&connectTimeout=" + - properties.getVectorization().getConnectTimeout() + "&socketTimeout=" + - properties.getVectorization().getSocketTimeout()) - .process(exchange -> { - Integer statusCode = exchange.getIn().getHeader(Exchange.HTTP_RESPONSE_CODE, Integer.class); - if (statusCode == null || statusCode != 200) { - String body = exchange.getIn().getBody(String.class); - throw new RuntimeException("Embedding service returned HTTP " + statusCode + ": " + body); - } - }) - .unmarshal().json(JsonLibrary.Jackson, EmbedResponse.class) - .process(exchange -> { - UUID embeddingId = 
exchange.getIn().getHeader("embeddingId", UUID.class); - EmbedResponse response = exchange.getIn().getBody(EmbedResponse.class); - if (response == null || response.embedding == null) { - throw new RuntimeException("Embedding service returned null embedding response"); - } - processingService.saveEmbedding(embeddingId, response.embedding, response.tokenCount); - exchange.setProperty("vectorizationSuccess", true); - }) - .doCatch(Exception.class) - .process(exchange -> { - UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class); - Integer retryCount = exchange.getProperty("retryCount", Integer.class); - Integer maxRetries = exchange.getProperty("maxRetries", Integer.class); - Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class); - String errorMsg = exception != null ? exception.getMessage() : "Unknown error"; - if (errorMsg != null && errorMsg.contains("Connection pool shut down")) { - log.warn("Generic vectorization aborted for {} because the application is shutting down", embeddingId); - exchange.setProperty("vectorizationSuccess", true); - return; - } - if (retryCount >= maxRetries) { - processingService.markAsFailed(embeddingId, errorMsg); - } else { - log.warn("Generic vectorization attempt #{} failed for {}: {}", retryCount, embeddingId, errorMsg); - } - }) - .end() - .end() - .end(); + .marshal().json(JsonLibrary.Jackson) + .removeHeaders("CamelHttp*") + .setHeader(Exchange.HTTP_METHOD, constant("POST")) + .setHeader(Exchange.CONTENT_TYPE, constant("application/json")) + .toD(properties.getApiUrl() + "/embed?bridgeEndpoint=true&throwExceptionOnFailure=true") + .unmarshal().json(JsonLibrary.Jackson, VectorizationResponse.class) + .process(exchange -> { + UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class); + VectorizationResponse response = exchange.getIn().getBody(VectorizationResponse.class); + if (response == null || response.embedding() == null) { + throw new 
IllegalStateException("Embedding service returned empty response"); + } + processingService.saveEmbedding(embeddingId, response.embedding(), response.tokenCount()); + }); - from("timer:generic-vectorization-scheduler?period=" + properties.getVectorization().getGenericSchedulerPeriodMs() + "&delay=500") + from("timer:generic-vectorization-poller?period=" + properties.getGenericSchedulerPeriodMs()) .routeId(ROUTE_ID_SCHEDULER) .process(exchange -> { - int batchSize = properties.getVectorization().getBatchSize(); - List pending = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.PENDING, PageRequest.of(0, batchSize)); - List failed = List.of(); - if (pending.isEmpty()) { - failed = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.FAILED, PageRequest.of(0, batchSize)); - } - List toProcess = !pending.isEmpty() ? pending : failed; - if (toProcess.isEmpty()) { - exchange.setProperty("noPendingEmbeddings", true); - } else { - exchange.getIn().setBody(toProcess); - } + List ids = embeddingRepository.findIdsByEmbeddingStatus( + EmbeddingStatus.PENDING, PageRequest.of(0, properties.getBatchSize())); + exchange.getIn().setBody(ids); }) - .choice() - .when(exchangeProperty("noPendingEmbeddings").isEqualTo(true)) - .log(LoggingLevel.DEBUG, "Generic vectorization scheduler: nothing pending") - .otherwise() - .split(body()) - .process(exchange -> { - UUID embeddingId = exchange.getIn().getBody(UUID.class); - exchange.getIn().setHeader("embeddingId", embeddingId); - }) - .to("direct:vectorize-embedding") - .end() - .end(); + .split(body()) + .setHeader("embeddingId", body()) + .to("direct:vectorize-embedding"); } - public static class EmbedRequest { - @JsonProperty("text") - public String text; - - @JsonProperty("is_query") - public boolean isQuery; + public record VectorizationRequest( + @JsonProperty("text") String text, + @JsonProperty("isQuery") boolean isQuery + ) { } - public static class EmbedResponse { - public float[] embedding; - public int 
dimensions; - @JsonProperty("token_count") - public int tokenCount; + public record VectorizationResponse( + @JsonProperty("embedding") float[] embedding, + @JsonProperty("token_count") Integer tokenCount + ) { } -} +} \ No newline at end of file diff --git a/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java b/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java index 1deb93a..f2f646d 100644 --- a/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java +++ b/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java @@ -1,16 +1,20 @@ package at.procon.dip.vectorization.service; + import at.procon.dip.domain.document.DocumentStatus; import at.procon.dip.domain.document.EmbeddingStatus; import at.procon.dip.domain.document.entity.DocumentEmbedding; import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; import at.procon.dip.domain.document.service.DocumentService; -import at.procon.ted.config.TedProcessorProperties; + +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import at.procon.ted.config.LegacyVectorizationProperties; + import at.procon.ted.model.entity.VectorizationStatus; import at.procon.ted.repository.ProcurementDocumentRepository; import at.procon.ted.service.VectorizationService; -import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; -import at.procon.dip.runtime.config.RuntimeMode; + import java.time.OffsetDateTime; import java.util.UUID; import lombok.RequiredArgsConstructor; @@ -20,21 +24,21 @@ import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; /** - * Phase 2 generic vectorization processor that works on DOC text representations and DOC embeddings. 
+ * Legacy generic vectorization processor that works on DOC text representations and DOC embeddings. *

* The service keeps the existing TED semantic search operational by optionally dual-writing completed * embeddings back into the legacy TED procurement_document vector columns, resolved by document hash. */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j -@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) public class DocumentEmbeddingProcessingService { private final DocumentEmbeddingRepository embeddingRepository; private final DocumentService documentService; private final VectorizationService vectorizationService; - private final TedProcessorProperties properties; + private final LegacyVectorizationProperties properties; private final ProcurementDocumentRepository procurementDocumentRepository; @Transactional(propagation = Propagation.REQUIRES_NEW) @@ -61,7 +65,7 @@ public class DocumentEmbeddingProcessingService { return null; } - int maxLength = properties.getVectorization().getMaxTextLength(); + int maxLength = properties.getMaxTextLength(); if (textBody.length() > maxLength) { log.debug("Truncating representation {} for embedding {} from {} to {} chars", embedding.getRepresentation().getId(), embeddingId, textBody.length(), maxLength); @@ -91,10 +95,10 @@ public class DocumentEmbeddingProcessingService { } String vectorString = vectorizationService.floatArrayToVectorString(embedding); - embeddingRepository.updateEmbeddingVector(embeddingId, vectorString, tokenCount, embedding.length); + embeddingRepository.updateEmbeddingVector(embeddingId, embedding, tokenCount, embedding.length); documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.INDEXED); - if (properties.getVectorization().isDualWriteLegacyTedVectors()) { + if (properties.isDualWriteLegacyTedVectors()) { dualWriteLegacyTedVector(loaded, vectorString, tokenCount); } } @@ -107,8 +111,7 @@ public class DocumentEmbeddingProcessingService { embeddingRepository.updateEmbeddingStatus(embeddingId, EmbeddingStatus.FAILED, errorMessage, null); 
documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.FAILED); - if (properties.getVectorization().isDualWriteLegacyTedVectors()) { - loaded.getDocument().getDedupHash(); + if (properties.isDualWriteLegacyTedVectors()) { procurementDocumentRepository.findByDocumentHash(loaded.getDocument().getDedupHash()) .ifPresent(doc -> procurementDocumentRepository.updateVectorizationStatus( doc.getId(), VectorizationStatus.FAILED, errorMessage, null)); @@ -142,4 +145,4 @@ public class DocumentEmbeddingProcessingService { UUID representationId ) { } -} +} \ No newline at end of file diff --git a/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java b/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java index 7187954..44df2bd 100644 --- a/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java +++ b/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java @@ -2,9 +2,9 @@ package at.procon.dip.vectorization.startup; import at.procon.dip.domain.document.service.DocumentEmbeddingService; import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand; -import at.procon.ted.config.TedProcessorProperties; import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; import at.procon.dip.runtime.config.RuntimeMode; +import at.procon.ted.config.LegacyVectorizationProperties; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.boot.ApplicationArguments; @@ -12,33 +12,33 @@ import org.springframework.boot.ApplicationRunner; import org.springframework.stereotype.Component; /** - * Ensures the configured embedding model exists in DOC.doc_embedding_model. + * Ensures the configured embedding model exists in DOC.doc_embedding_model for the legacy runtime path. 
*/ @Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j -@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) public class ConfiguredEmbeddingModelStartupRunner implements ApplicationRunner { - private final TedProcessorProperties properties; + private final LegacyVectorizationProperties properties; private final DocumentEmbeddingService embeddingService; @Override public void run(ApplicationArguments args) { - if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) { + if (!properties.isEnabled() || !properties.isGenericPipelineEnabled()) { return; } embeddingService.registerModel(new RegisterEmbeddingModelCommand( - properties.getVectorization().getModelName(), - properties.getVectorization().getEmbeddingProvider(), - properties.getVectorization().getModelName(), - properties.getVectorization().getDimensions(), + properties.getModelName(), + properties.getEmbeddingProvider(), + properties.getModelName(), + properties.getDimensions(), null, false, true )); - log.info("Phase 2 embedding model ensured: {}", properties.getVectorization().getModelName()); + log.info("Legacy embedding model ensured: {}", properties.getModelName()); } -} +} \ No newline at end of file diff --git a/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java b/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java index 31dd216..4a2a967 100644 --- a/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java +++ b/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java @@ -2,9 +2,9 @@ package at.procon.dip.vectorization.startup; import at.procon.dip.domain.document.EmbeddingStatus; import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; -import at.procon.ted.config.TedProcessorProperties; import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; import 
at.procon.dip.runtime.config.RuntimeMode; +import at.procon.ted.config.LegacyVectorizationProperties; import java.util.List; import java.util.UUID; import lombok.RequiredArgsConstructor; @@ -16,30 +16,30 @@ import org.springframework.data.domain.PageRequest; import org.springframework.stereotype.Component; /** - * Queues pending and failed DOC embeddings immediately on startup. + * Queues pending and failed DOC embeddings immediately on startup for the legacy runtime graph. */ @Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j -@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) public class GenericVectorizationStartupRunner implements ApplicationRunner { private static final int BATCH_SIZE = 1000; - private final TedProcessorProperties properties; + private final LegacyVectorizationProperties properties; private final DocumentEmbeddingRepository embeddingRepository; private final ProducerTemplate producerTemplate; @Override public void run(ApplicationArguments args) { - if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) { + if (!properties.isEnabled() || !properties.isGenericPipelineEnabled()) { return; } int queued = 0; queued += queueByStatus(EmbeddingStatus.PENDING, "PENDING"); queued += queueByStatus(EmbeddingStatus.FAILED, "FAILED"); - log.info("Generic vectorization startup runner queued {} embedding jobs", queued); + log.info("Legacy generic vectorization startup runner queued {} embedding jobs", queued); } private int queueByStatus(EmbeddingStatus status, String label) { @@ -60,4 +60,4 @@ public class GenericVectorizationStartupRunner implements ApplicationRunner { } while (ids.size() == BATCH_SIZE); return queued; } -} +} \ No newline at end of file diff --git a/src/main/java/at/procon/ted/camel/MailRoute.java b/src/main/java/at/procon/ted/camel/MailRoute.java index 49e9e6b..aa0d0e2 100644 --- a/src/main/java/at/procon/ted/camel/MailRoute.java +++ 
b/src/main/java/at/procon/ted/camel/MailRoute.java @@ -13,6 +13,8 @@ import jakarta.mail.Multipart; import jakarta.mail.Part; import jakarta.mail.Session; import jakarta.mail.internet.MimeMessage; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.Exchange; @@ -45,6 +47,7 @@ import java.util.*; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class MailRoute extends RouteBuilder { diff --git a/src/main/java/at/procon/ted/camel/SolutionBriefRoute.java b/src/main/java/at/procon/ted/camel/SolutionBriefRoute.java index e1019c1..75f974b 100644 --- a/src/main/java/at/procon/ted/camel/SolutionBriefRoute.java +++ b/src/main/java/at/procon/ted/camel/SolutionBriefRoute.java @@ -4,6 +4,8 @@ import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.service.ExcelExportService; import at.procon.ted.service.SimilaritySearchService; import at.procon.ted.service.SimilaritySearchService.SimilaritySearchResponse; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.Exchange; @@ -27,6 +29,7 @@ import java.nio.file.Paths; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class SolutionBriefRoute extends RouteBuilder { diff --git a/src/main/java/at/procon/ted/camel/TedDocumentRoute.java b/src/main/java/at/procon/ted/camel/TedDocumentRoute.java index 7920b3c..ce7c622 100644 --- a/src/main/java/at/procon/ted/camel/TedDocumentRoute.java +++ b/src/main/java/at/procon/ted/camel/TedDocumentRoute.java @@ -2,6 +2,8 @@ package at.procon.ted.camel; import 
at.procon.ted.config.TedProcessorProperties; import at.procon.ted.service.DocumentProcessingService; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.Exchange; @@ -24,6 +26,7 @@ import java.nio.file.Path; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class TedDocumentRoute extends RouteBuilder { diff --git a/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java b/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java index 5e891d1..7bb67a0 100644 --- a/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java +++ b/src/main/java/at/procon/ted/camel/TedPackageDownloadCamelRoute.java @@ -9,6 +9,8 @@ import at.procon.ted.model.entity.TedDailyPackage; import at.procon.ted.repository.TedDailyPackageRepository; import at.procon.ted.service.BatchDocumentProcessingService; import at.procon.ted.service.TedPackageDownloadService; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.Exchange; @@ -47,6 +49,7 @@ import java.util.Optional; @ConditionalOnProperty(name = "ted.download.enabled", havingValue = "true") @RequiredArgsConstructor @Slf4j +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) public class TedPackageDownloadCamelRoute extends RouteBuilder { private static final String ROUTE_ID_SCHEDULER = "ted-package-scheduler"; diff --git a/src/main/java/at/procon/ted/camel/TedPackageDownloadRoute.java b/src/main/java/at/procon/ted/camel/TedPackageDownloadRoute.java index fda70ad..3a88deb 100644 --- a/src/main/java/at/procon/ted/camel/TedPackageDownloadRoute.java +++ 
b/src/main/java/at/procon/ted/camel/TedPackageDownloadRoute.java @@ -2,6 +2,8 @@ package at.procon.ted.camel; import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.service.TedPackageDownloadService; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.Exchange; @@ -30,6 +32,7 @@ import java.util.List; @ConditionalOnProperty(name = "ted.download.use-service-based", havingValue = "true") @RequiredArgsConstructor @Slf4j +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) public class TedPackageDownloadRoute extends RouteBuilder { private static final String ROUTE_ID_SCHEDULER = "ted-package-download-scheduler"; diff --git a/src/main/java/at/procon/ted/camel/VectorizationRoute.java b/src/main/java/at/procon/ted/camel/VectorizationRoute.java index 84865ee..4cd7962 100644 --- a/src/main/java/at/procon/ted/camel/VectorizationRoute.java +++ b/src/main/java/at/procon/ted/camel/VectorizationRoute.java @@ -7,6 +7,8 @@ import at.procon.ted.repository.ProcurementDocumentRepository; import at.procon.ted.service.VectorizationProcessorService; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.Exchange; @@ -31,6 +33,7 @@ import java.util.UUID; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class VectorizationRoute extends RouteBuilder { @@ -68,10 +71,6 @@ public class VectorizationRoute extends RouteBuilder { log.info("Vectorization is disabled, skipping route configuration"); return; } - if 
(properties.getVectorization().isGenericPipelineEnabled()) { - log.info("Legacy vectorization route disabled because Phase 2 generic pipeline is enabled"); - return; - } log.info("Configuring vectorization routes (enabled=true, apiUrl={}, connectTimeout={}ms, socketTimeout={}ms, maxRetries={}, scheduler every 6s)", properties.getVectorization().getApiUrl(), diff --git a/src/main/java/at/procon/ted/config/AsyncConfig.java b/src/main/java/at/procon/ted/config/AsyncConfig.java index 33ad8ec..a3a9106 100644 --- a/src/main/java/at/procon/ted/config/AsyncConfig.java +++ b/src/main/java/at/procon/ted/config/AsyncConfig.java @@ -1,5 +1,7 @@ package at.procon.ted.config; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.aop.interceptor.AsyncUncaughtExceptionHandler; @@ -19,6 +21,7 @@ import java.util.concurrent.Executor; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Configuration +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @EnableAsync @RequiredArgsConstructor @Slf4j diff --git a/src/main/java/at/procon/ted/config/LegacyVectorizationProperties.java b/src/main/java/at/procon/ted/config/LegacyVectorizationProperties.java new file mode 100644 index 0000000..8cf86df --- /dev/null +++ b/src/main/java/at/procon/ted/config/LegacyVectorizationProperties.java @@ -0,0 +1,115 @@ +package at.procon.ted.config; + +import jakarta.validation.constraints.Min; +import jakarta.validation.constraints.NotBlank; +import jakarta.validation.constraints.Positive; +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Configuration; +import org.springframework.validation.annotation.Validated; + +/** + * Legacy vectorization configuration used only by the old runtime path. + *

+ * This extracts the former ted.vectorization.* subtree away from TedProcessorProperties + * so that legacy vectorization beans no longer depend on the shared monolithic config. + */ +@Configuration +@ConfigurationProperties(prefix = "legacy.ted.vectorization") +@Data +@Validated +public class LegacyVectorizationProperties { + + /** + * Enable/disable legacy async vectorization. + */ + private boolean enabled = true; + + /** + * Use external HTTP API instead of Python subprocess. + */ + private boolean useHttpApi = false; + + /** + * Embedding service HTTP API URL. + */ + private String apiUrl = "http://localhost:8001"; + + /** + * Sentence transformer model name. + */ + private String modelName = "intfloat/multilingual-e5-large"; + + /** + * Vector dimensions (must match model output). + */ + @Positive + private int dimensions = 1024; + + /** + * Batch size for vectorization processing. + */ + @Min(1) + private int batchSize = 16; + + /** + * Thread pool size for async vectorization. + */ + @Min(1) + private int threadPoolSize = 4; + + /** + * Maximum text length for vectorization (characters). + */ + @Positive + private int maxTextLength = 8192; + + /** + * HTTP connection timeout in milliseconds. + */ + @Positive + private int connectTimeout = 10000; + + /** + * HTTP socket/read timeout in milliseconds. + */ + @Positive + private int socketTimeout = 60000; + + /** + * Maximum retries on connection failure. + */ + @Min(0) + private int maxRetries = 5; + + /** + * Enable the former Phase 2 generic pipeline in the legacy runtime. + * In the split runtime design this should normally stay false in NEW mode + * because legacy beans are not instantiated there. + */ + private boolean genericPipelineEnabled = true; + + /** + * Keep writing completed TED embeddings back to the legacy ted.procurement_document + * vector columns so the existing semantic search stays operational during migration. 
+ */ + private boolean dualWriteLegacyTedVectors = true; + + /** + * Scheduler interval for generic embedding polling (milliseconds). + */ + @Positive + private long genericSchedulerPeriodMs = 6000; + + /** + * Builder key for the primary TED semantic representation created during transitional dual-write. + */ + @NotBlank + private String primaryRepresentationBuilderKey = "ted-phase2-primary-representation"; + + /** + * Provider key used when registering the configured embedding model in DOC.doc_embedding_model. + */ + @NotBlank + private String embeddingProvider = "http-embedding-service"; +} \ No newline at end of file diff --git a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java index 9799f5f..b624937 100644 --- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java +++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java @@ -1,8 +1,11 @@ package at.procon.ted.config; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.Data; import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Primary; import org.springframework.validation.annotation.Validated; import jakarta.validation.constraints.Min; @@ -15,9 +18,11 @@ import jakarta.validation.constraints.Positive; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Configuration +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @ConfigurationProperties(prefix = "ted") @Data @Validated +@Primary public class TedProcessorProperties { private InputProperties input = new InputProperties(); @@ -154,37 +159,9 @@ public class TedProcessorProperties { */ @Min(0) private int maxRetries = 5; - - /** - * Enable the Phase 2 generic vectorization pipeline based on DOC text representations - * and DOC embeddings instead of the 
legacy TED document vector columns as the primary - * write target. - */ - private boolean genericPipelineEnabled = true; - - /** - * Keep writing completed TED embeddings back to the legacy ted.procurement_document - * vector columns so the existing semantic search stays operational during migration. - */ - private boolean dualWriteLegacyTedVectors = true; - - /** - * Scheduler interval for generic embedding polling (milliseconds). - */ - @Positive - private long genericSchedulerPeriodMs = 6000; - - /** - * Builder key for the primary TED semantic representation created during Phase 2 dual-write. - */ - @NotBlank - private String primaryRepresentationBuilderKey = "ted-phase2-primary-representation"; - - /** - * Provider key used when registering the configured embedding model in DOC.doc_embedding_model. - */ - @NotBlank - private String embeddingProvider = "http-embedding-service"; + @Positive + private long genericSchedulerPeriodMs = 30000; + private String primaryRepresentationBuilderKey = "default-generic"; } /** diff --git a/src/main/java/at/procon/ted/controller/AdminController.java b/src/main/java/at/procon/ted/controller/AdminController.java index 6434142..37da13e 100644 --- a/src/main/java/at/procon/ted/controller/AdminController.java +++ b/src/main/java/at/procon/ted/controller/AdminController.java @@ -10,6 +10,8 @@ import at.procon.ted.service.DocumentProcessingService; import at.procon.ted.service.VectorizationService; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.ProducerTemplate; @@ -35,6 +37,7 @@ import java.util.UUID; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @RestController +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequestMapping("/v1/admin") @RequiredArgsConstructor 
@Slf4j @@ -75,17 +78,11 @@ public class AdminController { Map status = new HashMap<>(); Map statusCounts = new HashMap<>(); - if (properties.getVectorization().isGenericPipelineEnabled()) { - List counts = documentEmbeddingRepository.countByEmbeddingStatus(); - for (Object[] row : counts) { - statusCounts.put(((EmbeddingStatus) row[0]).name(), (Long) row[1]); - } - } else { - List counts = documentRepository.countByVectorizationStatus(); - for (Object[] row : counts) { - statusCounts.put(((VectorizationStatus) row[0]).name(), (Long) row[1]); - } + List counts = documentRepository.countByVectorizationStatus(); + for (Object[] row : counts) { + statusCounts.put(((VectorizationStatus) row[0]).name(), (Long) row[1]); } + status.put("counts", statusCounts); status.put("serviceAvailable", vectorizationService.isAvailable()); @@ -115,14 +112,7 @@ public class AdminController { return ResponseEntity.badRequest().body(result); } - if (properties.getVectorization().isGenericPipelineEnabled()) { - var document = documentRepository.findById(documentId).orElseThrow(); - UUID embeddingId = tedPhase2GenericDocumentService.registerOrRefreshTedDocument(document); - producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", null, "embeddingId", embeddingId); - result.put("embeddingId", embeddingId); - } else { - producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", documentId); - } + producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", documentId); result.put("success", true); result.put("message", "Vectorization triggered for document " + documentId); @@ -147,23 +137,13 @@ public class AdminController { } int count = 0; - if (properties.getVectorization().isGenericPipelineEnabled()) { - var pending = documentEmbeddingRepository.findIdsByEmbeddingStatus( - EmbeddingStatus.PENDING, - PageRequest.of(0, Math.min(batchSize, 500))); - for (UUID embeddingId : pending) { - producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", 
null, "embeddingId", embeddingId); - count++; - } - } else { - var pending = documentRepository.findByVectorizationStatus( - VectorizationStatus.PENDING, - PageRequest.of(0, Math.min(batchSize, 500))); - - for (var doc : pending) { - producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", doc.getId()); - count++; - } + var pending = documentRepository.findByVectorizationStatus( + VectorizationStatus.PENDING, + PageRequest.of(0, Math.min(batchSize, 500))); + + for (var doc : pending) { + producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", doc.getId()); + count++; } result.put("success", true); diff --git a/src/main/java/at/procon/ted/event/VectorizationEventListener.java b/src/main/java/at/procon/ted/event/VectorizationEventListener.java index 0c2efd7..478f8bf 100644 --- a/src/main/java/at/procon/ted/event/VectorizationEventListener.java +++ b/src/main/java/at/procon/ted/event/VectorizationEventListener.java @@ -1,6 +1,8 @@ package at.procon.ted.event; import at.procon.ted.config.TedProcessorProperties; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.ProducerTemplate; @@ -15,6 +17,7 @@ import org.springframework.transaction.event.TransactionalEventListener; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class VectorizationEventListener { @@ -28,7 +31,7 @@ public class VectorizationEventListener { */ @TransactionalEventListener(phase = TransactionPhase.AFTER_COMMIT) public void onDocumentSaved(DocumentSavedEvent event) { - if (!properties.getVectorization().isEnabled() || properties.getVectorization().isGenericPipelineEnabled()) { + if (!properties.getVectorization().isEnabled()) { return; } diff --git 
a/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java b/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java index 0958d66..88c34f7 100644 --- a/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java +++ b/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java @@ -6,6 +6,8 @@ import at.procon.ted.model.entity.ProcurementDocument; import at.procon.ted.model.entity.ProcessingLog; import at.procon.ted.repository.ProcurementDocumentRepository; import at.procon.ted.util.HashUtils; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; @@ -33,6 +35,7 @@ import java.util.UUID; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class BatchDocumentProcessingService { @@ -138,8 +141,6 @@ public class BatchDocumentProcessingService { if (doc.getDocumentHash() != null) { if (properties.getProjection().isEnabled()) { tedNoticeProjectionService.registerOrRefreshProjection(doc); - } else if (properties.getVectorization().isGenericPipelineEnabled()) { - tedPhase2GenericDocumentService.registerOrRefreshTedDocument(doc); } } } diff --git a/src/main/java/at/procon/ted/service/DocumentProcessingService.java b/src/main/java/at/procon/ted/service/DocumentProcessingService.java index 64b82ac..ea7dba5 100644 --- a/src/main/java/at/procon/ted/service/DocumentProcessingService.java +++ b/src/main/java/at/procon/ted/service/DocumentProcessingService.java @@ -6,6 +6,8 @@ import at.procon.ted.event.DocumentSavedEvent; import at.procon.ted.model.entity.*; import at.procon.ted.repository.ProcurementDocumentRepository; import at.procon.ted.util.HashUtils; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import 
at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.context.ApplicationEventPublisher; @@ -28,6 +30,7 @@ import java.util.Optional; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class DocumentProcessingService { @@ -94,14 +97,10 @@ public class DocumentProcessingService { tedNoticeProjectionService.registerOrRefreshProjection(document); log.debug("Document saved successfully, Phase 3 TED projection ensured: {}", document.getId()); - if (!properties.getVectorization().isGenericPipelineEnabled()) { - // Keep legacy vectorization behavior when the generic embedding pipeline is disabled. - eventPublisher.publishEvent(new DocumentSavedEvent(document.getId(), document.getPublicationId())); - log.debug("Document saved successfully, legacy vectorization event published: {}", document.getId()); - } - } else if (properties.getVectorization().isGenericPipelineEnabled()) { - tedPhase2GenericDocumentService.registerOrRefreshTedDocument(document); - log.debug("Document saved successfully, Phase 2 generic vectorization record ensured: {}", document.getId()); + // The generic-pipeline toggle is gone: always publish the event that drives legacy vectorization. 
+ eventPublisher.publishEvent(new DocumentSavedEvent(document.getId(), document.getPublicationId())); + log.debug("Document saved successfully, legacy vectorization event published: {}", document.getId()); + } else { // Publish event to trigger vectorization AFTER transaction commit // This ensures document is visible in DB and avoids transaction isolation issues @@ -160,8 +159,6 @@ public class DocumentProcessingService { if (properties.getProjection().isEnabled()) { tedNoticeProjectionService.registerOrRefreshProjection(updated); - } else if (properties.getVectorization().isGenericPipelineEnabled()) { - tedPhase2GenericDocumentService.registerOrRefreshTedDocument(updated); } // Note: Re-vectorization will be triggered automatically by the active scheduler diff --git a/src/main/java/at/procon/ted/service/SearchService.java b/src/main/java/at/procon/ted/service/SearchService.java index da6cbe3..134cadb 100644 --- a/src/main/java/at/procon/ted/service/SearchService.java +++ b/src/main/java/at/procon/ted/service/SearchService.java @@ -7,6 +7,8 @@ import at.procon.ted.repository.ProcurementDocumentRepository; import jakarta.persistence.EntityManager; import jakarta.persistence.PersistenceContext; import jakarta.persistence.criteria.*; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.data.domain.Page; @@ -33,6 +35,7 @@ import java.util.stream.Collectors; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j @Transactional(readOnly = true) diff --git a/src/main/java/at/procon/ted/service/SimilaritySearchService.java b/src/main/java/at/procon/ted/service/SimilaritySearchService.java index 407f61b..f1298e2 100644 --- a/src/main/java/at/procon/ted/service/SimilaritySearchService.java +++ 
b/src/main/java/at/procon/ted/service/SimilaritySearchService.java @@ -5,6 +5,8 @@ import at.procon.ted.model.entity.ProcurementDocument; import at.procon.ted.repository.ProcurementDocumentRepository; import at.procon.ted.service.attachment.PdfExtractionService; import at.procon.ted.service.attachment.AttachmentExtractor.ExtractionResult; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; @@ -23,6 +25,7 @@ import java.util.UUID; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j @Transactional(readOnly = true) diff --git a/src/main/java/at/procon/ted/service/TedPackageDownloadService.java b/src/main/java/at/procon/ted/service/TedPackageDownloadService.java index 50fbf9b..fafb169 100644 --- a/src/main/java/at/procon/ted/service/TedPackageDownloadService.java +++ b/src/main/java/at/procon/ted/service/TedPackageDownloadService.java @@ -3,6 +3,8 @@ package at.procon.ted.service; import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.model.entity.TedDailyPackage; import at.procon.ted.repository.TedDailyPackageRepository; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; @@ -43,6 +45,7 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class TedPackageDownloadService { diff --git a/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java 
b/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java index efbdaf5..d0d300e 100644 --- a/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java +++ b/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java @@ -27,6 +27,8 @@ import at.procon.ted.model.entity.ProcurementDocument; import java.time.OffsetDateTime; import java.util.List; import java.util.UUID; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; @@ -38,6 +40,7 @@ import org.springframework.transaction.annotation.Transactional; * generic vectorization route is disabled. */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class TedPhase2GenericDocumentService { @@ -95,19 +98,13 @@ public class TedPhase2GenericDocumentService { DocumentTextRepresentation representation = ensurePrimaryRepresentation(document, originalContent, tedDocument); UUID embeddingId = null; - if (properties.getVectorization().isGenericPipelineEnabled()) { - DocumentEmbedding embedding = ensurePendingEmbedding(document, representation); - embeddingId = embedding.getId(); - log.debug("Phase 2 DOC bridge ensured generic TED document {} -> embedding {}", document.getId(), embeddingId); - } else { - log.debug("Phase 2 DOC bridge ensured generic TED document {} without embedding queue", document.getId()); - } + log.debug("Phase 2 DOC bridge ensured generic TED document {} without embedding queue", document.getId()); return new TedGenericDocumentSyncResult(document.getId(), embeddingId, representation.getId()); } private boolean isGenericTedSyncEnabled() { - return properties.getVectorization().isGenericPipelineEnabled() || properties.getProjection().isEnabled(); + return properties.getProjection().isEnabled(); } private Document 
createGenericDocument(ProcurementDocument tedDocument) { @@ -195,7 +192,7 @@ public class TedPhase2GenericDocumentService { private DocumentEmbedding ensurePendingEmbedding(Document document, DocumentTextRepresentation representation) { DocumentEmbeddingModel model = embeddingService.registerModel(new RegisterEmbeddingModelCommand( properties.getVectorization().getModelName(), - properties.getVectorization().getEmbeddingProvider(), + null, properties.getVectorization().getModelName(), properties.getVectorization().getDimensions(), null, diff --git a/src/main/java/at/procon/ted/service/VectorizationProcessorService.java b/src/main/java/at/procon/ted/service/VectorizationProcessorService.java index 82b61f3..3c554fa 100644 --- a/src/main/java/at/procon/ted/service/VectorizationProcessorService.java +++ b/src/main/java/at/procon/ted/service/VectorizationProcessorService.java @@ -4,6 +4,8 @@ import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.model.entity.ProcurementDocument; import at.procon.ted.model.entity.VectorizationStatus; import at.procon.ted.repository.ProcurementDocumentRepository; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; @@ -20,6 +22,7 @@ import java.util.UUID; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class VectorizationProcessorService { diff --git a/src/main/java/at/procon/ted/service/VectorizationService.java b/src/main/java/at/procon/ted/service/VectorizationService.java index e5285b3..7afbacc 100644 --- a/src/main/java/at/procon/ted/service/VectorizationService.java +++ b/src/main/java/at/procon/ted/service/VectorizationService.java @@ -1,6 +1,8 @@ package at.procon.ted.service; import at.procon.ted.config.TedProcessorProperties; 
+import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; @@ -28,6 +30,7 @@ import java.util.stream.Collectors; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class VectorizationService { diff --git a/src/main/java/at/procon/ted/service/attachment/AttachmentProcessingService.java b/src/main/java/at/procon/ted/service/attachment/AttachmentProcessingService.java index 0504227..6b9c9e0 100644 --- a/src/main/java/at/procon/ted/service/attachment/AttachmentProcessingService.java +++ b/src/main/java/at/procon/ted/service/attachment/AttachmentProcessingService.java @@ -5,6 +5,8 @@ import at.procon.ted.model.entity.ProcessedAttachment; import at.procon.ted.model.entity.ProcessedAttachment.ProcessingStatus; import at.procon.ted.repository.ProcessedAttachmentRepository; import at.procon.ted.util.HashUtils; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; @@ -25,6 +27,7 @@ import java.util.Optional; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class AttachmentProcessingService { diff --git a/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java b/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java index b75c2be..028fc22 100644 --- a/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java +++ b/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java @@ -3,6 +3,8 @@ package at.procon.ted.startup; import at.procon.ted.config.TedProcessorProperties; import 
at.procon.ted.model.entity.VectorizationStatus; import at.procon.ted.repository.ProcurementDocumentRepository; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.ProducerTemplate; @@ -28,6 +30,7 @@ import java.util.UUID; * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @RequiredArgsConstructor @Slf4j public class VectorizationStartupRunner implements ApplicationRunner { @@ -44,10 +47,6 @@ public class VectorizationStartupRunner implements ApplicationRunner { log.info("Vectorization is disabled, skipping startup processing"); return; } - if (properties.getVectorization().isGenericPipelineEnabled()) { - log.info("Legacy vectorization startup runner disabled because Phase 2 generic pipeline is enabled"); - return; - } log.info("Checking for pending and failed vectorizations on startup..."); diff --git a/src/main/resources/application-legacy.yml b/src/main/resources/application-legacy.yml index ff37b35..23a314f 100644 --- a/src/main/resources/application-legacy.yml +++ b/src/main/resources/application-legacy.yml @@ -1,3 +1,30 @@ +spring: + config: + activate: + on-profile: legacy + dip: runtime: mode: LEGACY + +# Legacy runtime uses the existing ted.* property tree. +# Move old route/download/mail/vectorization/search settings here over time. 
+legacy: + ted: + vectorization: + enabled: true + use-http-api: false + api-url: http://localhost:8001 + model-name: intfloat/multilingual-e5-large + dimensions: 1024 + batch-size: 16 + thread-pool-size: 4 + max-text-length: 8192 + connect-timeout: 10000 + socket-timeout: 60000 + max-retries: 5 + generic-pipeline-enabled: true + dual-write-legacy-ted-vectors: true + generic-scheduler-period-ms: 6000 + primary-representation-builder-key: ted-phase2-primary-representation + embedding-provider: http-embedding-service diff --git a/src/main/resources/application-new.yml b/src/main/resources/application-new.yml index 533aafd..ab6ee21 100644 --- a/src/main/resources/application-new.yml +++ b/src/main/resources/application-new.yml @@ -1,9 +1,143 @@ +# New runtime overrides +# Activate with: --spring.profiles.active=new + +# Optional explicit marker; file is profile-specific already +spring: + config: + activate: + on-profile: new + dip: runtime: mode: NEW + search: + # Default page size for search results + default-page-size: 20 + # Maximum page size + max-page-size: 100 + # Similarity threshold for vector search (0.0 - 1.0) + similarity-threshold: 0.7 + # Minimum trigram similarity for fuzzy lexical matches + trigram-similarity-threshold: 0.12 + # Candidate limits per engine before fusion/collapse + fulltext-candidate-limit: 120 + trigram-candidate-limit: 120 + semantic-candidate-limit: 120 + # Hybrid fusion weights + fulltext-weight: 0.35 + trigram-weight: 0.20 + semantic-weight: 0.45 + # Additional score weight for recency + recency-boost-weight: 0.05 + # Recency half-life in days + recency-half-life-days: 30 + # Enable chunk representations for long documents + chunking-enabled: true + # Target chunk size in characters + chunk-target-chars: 1800 + # Overlap between consecutive chunks + chunk-overlap-chars: 200 + # Maximum number of chunks generated per document + max-chunks-per-document: 12 + # Startup backfill limit for missing lexical vectors + 
startup-lexical-backfill-limit: 500 + # Number of top hits per engine returned by /search/debug + debug-top-hits-per-engine: 10 + embedding: + enabled: true + default-document-model: e5-default + default-query-model: e5-default + providers: + mock-default: + type: mock + dimensions: 16 + external-e5: + type: http-json + base-url: http://172.20.240.18:8001 + connect-timeout: 5s + read-timeout: 60s + models: + mock-search: + provider-config-key: mock-default + provider-model-key: mock-search + dimensions: 16 + distance-metric: COSINE + supports-query-embedding-mode: true + active: true + e5-default: + provider-config-key: external-e5 + provider-model-key: intfloat/multilingual-e5-large + dimensions: 1024 + distance-metric: COSINE + supports-query-embedding-mode: true + active: true + jobs: + enabled: true + + # Phase 4 generic ingestion configuration + ingestion: + # Master switch for arbitrary document ingestion into the DOC model enabled: true - jobs: + # Enable file-system polling for non-TED documents + file-system-enabled: false + # Allow REST/API upload endpoints for arbitrary documents + rest-upload-enabled: true + # Input directory for the generic Camel file route + input-directory: /ted.europe/generic-input + # Regex for files accepted by the generic file route + file-pattern: .*\\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$ + # Move successfully processed files here + processed-directory: .dip-processed + # Move failed files here + error-directory: .dip-error + # Polling interval for the generic route + poll-interval: 15000 + # Maximum files per poll + max-messages-per-poll: 200 + # Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs + default-owner-tenant-key: + # Default visibility when no explicit access context is provided + default-visibility: PUBLIC + # Optional default language for filesystem imports + default-language-code: + # Store small binary originals in DOC.doc_content.binary_content + 
store-original-binary-in-db: true + # Maximum binary payload size persisted inline in DB + max-binary-bytes-in-db: 5242880 + # Deduplicate by content hash and attach additional sources to the same canonical document + deduplicate-by-content-hash: true + # Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers + store-original-content-for-wrapper-documents: true + # Queue only the primary text representation for vectorization + vectorize-primary-representation-only: true + # Import batch marker written to DOC.doc_source.import_batch_id + import-batch-id: phase4-generic + # Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI + ted-package-adapter-enabled: true + # Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI + mail-adapter-enabled: true + # Optional dedicated mail owner tenant, falls back to default-owner-tenant-key + mail-default-owner-tenant-key: + # Visibility for imported mail messages and attachments + mail-default-visibility: TENANT + # Expand ZIP attachments recursively through the mail adapter + expand-mail-zip-attachments: true + # Import batch marker for TED package roots and children + ted-package-import-batch-id: phase41-ted-package + # When true, TED package documents are stored only through the generic ingestion gateway + # and the legacy XML batch processing path is skipped + gateway-only-for-ted-packages: true + # Import batch marker for mail roots and attachments + mail-import-batch-id: phase41-mail + + ted: # Phase 3 TED projection configuration + projection: + # Enable/disable dual-write into the TED projection model on top of DOC.doc_document enabled: true - scheduler-delay-ms: 5000 + # Optional startup backfill for legacy TED documents without a projection row yet + startup-backfill-enabled: false + # Maximum number of legacy TED documents to backfill during startup + startup-backfill-limit: 250 + diff --git a/src/main/resources/application.yml 
b/src/main/resources/application.yml index bbe4f1f..283c87d 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -57,7 +57,14 @@ camel: # Weniger strenge Health-Checks für File-Consumer consumers-enabled: false -# Custom Application Properties +# Default runtime mode: legacy / initial implementation +# Activate profile 'new' to load application-new.yml and switch to the new runtime. +dip: + runtime: + mode: LEGACY + +# Legacy / shared application properties +# New-runtime-only properties are moved to application-new.yml. ted: # Directory configuration for file processing input: @@ -84,7 +91,7 @@ ted: # Vectorization configuration vectorization: # Enable/disable async vectorization - enabled: true + enabled: false # Use external HTTP API instead of subprocess use-http-api: true # Embedding service URL @@ -105,53 +112,7 @@ ted: socket-timeout: 60000 # Maximum retries on connection failure max-retries: 5 - # Phase 2: use generic DOC representation/embedding pipeline as primary vectorization path - generic-pipeline-enabled: true - # Keep legacy TED vector columns updated until semantic search is migrated - dual-write-legacy-ted-vectors: true - # Scheduler interval for generic embedding polling - generic-scheduler-period-ms: 6000 - # Builder identifier for primary TED semantic representations in DOC - primary-representation-builder-key: ted-phase2-primary-representation - # Provider key stored in DOC.doc_embedding_model - embedding-provider: http-embedding-service - - # Search configuration - search: - # Default page size for search results - default-page-size: 20 - # Maximum page size - max-page-size: 100 - # Similarity threshold for vector search (0.0 - 1.0) - similarity-threshold: 0.7 - # Minimum trigram similarity for fuzzy lexical matches - trigram-similarity-threshold: 0.12 - # Candidate limits per engine before fusion/collapse - fulltext-candidate-limit: 120 - trigram-candidate-limit: 120 - semantic-candidate-limit: 120 - # 
Hybrid fusion weights - fulltext-weight: 0.35 - trigram-weight: 0.20 - semantic-weight: 0.45 - # Additional score weight for recency - recency-boost-weight: 0.05 - # Recency half-life in days - recency-half-life-days: 30 - # Enable chunk representations for long documents - chunking-enabled: true - # Target chunk size in characters - chunk-target-chars: 1800 - # Overlap between consecutive chunks - chunk-overlap-chars: 200 - # Maximum number of chunks generated per document - max-chunks-per-document: 12 - # Startup backfill limit for missing lexical vectors - startup-lexical-backfill-limit: 500 - # Number of top hits per engine returned by /search/debug - debug-top-hits-per-engine: 10 - - # TED Daily Package Download configuration + # Packages download configuration download: # Enable/disable automatic package download enabled: true @@ -185,7 +146,6 @@ ted: delete-after-extraction: true # Prioritize current year first prioritize-current-year: false - # IMAP Mail configuration mail: # Enable/disable mail processing @@ -222,73 +182,7 @@ ted: mime-input-pattern: .*\\.eml # Polling interval for MIME input directory (milliseconds) mime-input-poll-interval: 1000000 - - # Phase 3 TED projection configuration - projection: - # Enable/disable dual-write into the TED projection model on top of DOC.doc_document - enabled: true - # Optional startup backfill for legacy TED documents without a projection row yet - startup-backfill-enabled: false - # Maximum number of legacy TED documents to backfill during startup - startup-backfill-limit: 250 - - # Phase 4 generic ingestion configuration - generic-ingestion: - # Master switch for arbitrary document ingestion into the DOC model - enabled: true - # Enable file-system polling for non-TED documents - file-system-enabled: false - # Allow REST/API upload endpoints for arbitrary documents - rest-upload-enabled: true - # Input directory for the generic Camel file route - input-directory: /ted.europe/generic-input - # Regex for files 
accepted by the generic file route - file-pattern: .*\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$ - # Move successfully processed files here - processed-directory: .dip-processed - # Move failed files here - error-directory: .dip-error - # Polling interval for the generic route - poll-interval: 15000 - # Maximum files per poll - max-messages-per-poll: 200 - # Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs - default-owner-tenant-key: - # Default visibility when no explicit access context is provided - default-visibility: PUBLIC - # Optional default language for filesystem imports - default-language-code: - # Store small binary originals in DOC.doc_content.binary_content - store-original-binary-in-db: true - # Maximum binary payload size persisted inline in DB - max-binary-bytes-in-db: 5242880 - # Deduplicate by content hash and attach additional sources to the same canonical document - deduplicate-by-content-hash: true - # Persist ORIGINAL content rows for wrapper/container documents such as TED packages or ZIP wrappers - store-original-content-for-wrapper-documents: true - # Queue only the primary text representation for vectorization - vectorize-primary-representation-only: true - # Import batch marker written to DOC.doc_source.import_batch_id - import-batch-id: phase4-generic - # Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI - ted-package-adapter-enabled: true - # Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI - mail-adapter-enabled: true - # Optional dedicated mail owner tenant, falls back to default-owner-tenant-key - mail-default-owner-tenant-key: - # Visibility for imported mail messages and attachments - mail-default-visibility: TENANT - # Expand ZIP attachments recursively through the mail adapter - expand-mail-zip-attachments: true - # Import batch marker for TED package roots and children - ted-package-import-batch-id: 
phase41-ted-package - # When true, TED package documents are stored only through the generic ingestion gateway - # and the legacy XML batch processing path is skipped - gateway-only-for-ted-packages: true - # Import batch marker for mail roots and attachments - mail-import-batch-id: phase41-mail - - # Solution Brief processing configuration + # solution brief processing solution-brief: # Enable/disable Solution Brief processing enabled: false @@ -344,35 +238,3 @@ logging: org.apache.camel: INFO org.hibernate.SQL: WARN org.hibernate.type.descriptor.sql: WARN - - -# Parallel generic embedding subsystem (disabled by default while built alongside the legacy path) -dip: - embedding: - enabled: true - default-document-model: mock-search - default-query-model: mock-search - providers: - mock-default: - type: mock - dimensions: 16 - external-e5: - type: http-json - base-url: http://localhost:8001 - connect-timeout: 5s - read-timeout: 60s - models: - mock-search: - provider-config-key: mock-default - provider-model-key: mock-search - dimensions: 16 - distance-metric: COSINE - supports-query-embedding-mode: true - active: true - e5-default: - provider-config-key: external-e5 - provider-model-key: intfloat/multilingual-e5-large - dimensions: 1024 - distance-metric: COSINE - supports-query-embedding-mode: true - active: true diff --git a/src/test/java/at/procon/dip/architecture/NewRuntimeMustNotDependOnTedProcessorPropertiesTest.java b/src/test/java/at/procon/dip/architecture/NewRuntimeMustNotDependOnTedProcessorPropertiesTest.java new file mode 100644 index 0000000..98e6577 --- /dev/null +++ b/src/test/java/at/procon/dip/architecture/NewRuntimeMustNotDependOnTedProcessorPropertiesTest.java @@ -0,0 +1,69 @@ +package at.procon.dip.architecture; + +import at.procon.dip.domain.ted.config.TedProjectionProperties; +import at.procon.dip.ingestion.config.DipIngestionProperties; +import at.procon.dip.search.config.DipSearchProperties; +import 
at.procon.ted.config.TedProcessorProperties; +import java.lang.reflect.Constructor; +import java.lang.reflect.Field; +import java.util.List; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Regression guard for the runtime/config split. + * NEW runtime classes must not depend on TedProcessorProperties anymore. + */ +class NewRuntimeMustNotDependOnTedProcessorPropertiesTest { + + @Test + void new_runtime_classes_should_not_depend_on_ted_processor_properties() { + List<Class<?>> newRuntimeClasses = List.of( + at.procon.dip.ingestion.service.GenericDocumentImportService.class, + at.procon.dip.ingestion.camel.GenericFileSystemIngestionRoute.class, + at.procon.dip.ingestion.controller.GenericDocumentImportController.class, + at.procon.dip.ingestion.adapter.MailDocumentIngestionAdapter.class, + at.procon.dip.ingestion.adapter.TedPackageDocumentIngestionAdapter.class, + at.procon.dip.ingestion.service.TedPackageChildImportProcessor.class, + at.procon.dip.domain.ted.service.TedNoticeProjectionService.class, + at.procon.dip.domain.ted.startup.TedProjectionStartupRunner.class, + at.procon.dip.search.engine.fulltext.PostgresFullTextSearchEngine.class, + at.procon.dip.search.engine.trigram.PostgresTrigramSearchEngine.class, + at.procon.dip.search.engine.semantic.PgVectorSemanticSearchEngine.class, + at.procon.dip.search.rank.DefaultSearchResultFusionService.class, + at.procon.dip.search.service.DefaultSearchOrchestrator.class, + at.procon.dip.search.service.SearchLexicalIndexStartupRunner.class, + at.procon.dip.normalization.impl.ChunkedLongTextRepresentationBuilder.class + ); + + for (Class<?> type : newRuntimeClasses) { + assertThat(hasDependency(type, TedProcessorProperties.class)) + .as(type.getName() + " must not depend on TedProcessorProperties") + .isFalse(); + } + } + + @Test + void new_runtime_config_classes_exist_as_replacements() { + assertThat(DipSearchProperties.class).isNotNull(); +
assertThat(DipIngestionProperties.class).isNotNull(); + assertThat(TedProjectionProperties.class).isNotNull(); + } + + private boolean hasDependency(Class<?> owner, Class<?> dependency) { + for (Field field : owner.getDeclaredFields()) { + if (field.getType().equals(dependency)) { + return true; + } + } + for (Constructor<?> constructor : owner.getDeclaredConstructors()) { + for (Class<?> param : constructor.getParameterTypes()) { + if (param.equals(dependency)) { + return true; + } + } + } + return false; + } +} \ No newline at end of file diff --git a/src/test/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapterBundleTest.java b/src/test/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapterBundleTest.java index 6d15c6f..9cbde82 100644 --- a/src/test/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapterBundleTest.java +++ b/src/test/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapterBundleTest.java @@ -9,12 +9,12 @@ import at.procon.dip.domain.document.RelationType; import at.procon.dip.domain.document.SourceType; import at.procon.dip.domain.document.entity.Document; import at.procon.dip.domain.document.service.DocumentRelationService; +import at.procon.dip.ingestion.config.DipIngestionProperties; import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.service.GenericDocumentImportService; import at.procon.dip.ingestion.service.MailMessageExtractionService; import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.SourceDescriptor; -import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.service.attachment.ZipExtractionService; import at.procon.dip.testsupport.MailBundleTestSupport; import java.nio.file.Files; @@ -58,11 +58,11 @@ class MailDocumentIngestionAdapterBundleTest { @BeforeEach void setUp() { - TedProcessorProperties properties = new TedProcessorProperties(); - properties.getGenericIngestion().setEnabled(true); -
properties.getGenericIngestion().setMailAdapterEnabled(true); - properties.getGenericIngestion().setExpandMailZipAttachments(false); - properties.getGenericIngestion().setMailImportBatchId("test-mail-bundle"); + var properties = new DipIngestionProperties(); + properties.setEnabled(true); + properties.setMailAdapterEnabled(true); + properties.setExpandMailZipAttachments(false); + properties.setMailImportBatchId("test-mail-bundle"); lenient().when(zipExtractionService.canHandle(any(), any())).thenReturn(false); adapter = new MailDocumentIngestionAdapter(properties, importService, new MailMessageExtractionService(), relationService, zipExtractionService); } diff --git a/src/test/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapterFileSystemTest.java b/src/test/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapterFileSystemTest.java index be71af3..a04a940 100644 --- a/src/test/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapterFileSystemTest.java +++ b/src/test/java/at/procon/dip/ingestion/adapter/MailDocumentIngestionAdapterFileSystemTest.java @@ -11,12 +11,12 @@ import at.procon.dip.domain.document.SourceType; import at.procon.dip.domain.document.entity.Document; import at.procon.dip.domain.document.service.DocumentRelationService; import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand; +import at.procon.dip.ingestion.config.DipIngestionProperties; import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.service.GenericDocumentImportService; import at.procon.dip.ingestion.service.MailMessageExtractionService; import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.SourceDescriptor; -import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.service.attachment.ZipExtractionService; import java.nio.file.Files; import java.nio.file.Path; @@ -56,12 +56,12 @@ class MailDocumentIngestionAdapterFileSystemTest { @BeforeEach void 
setUp() { - TedProcessorProperties properties = new TedProcessorProperties(); - properties.getGenericIngestion().setEnabled(true); - properties.getGenericIngestion().setMailAdapterEnabled(true); - properties.getGenericIngestion().setMailImportBatchId("test-mail-batch"); - properties.getGenericIngestion().setDefaultOwnerTenantKey("tenant-a"); - properties.getGenericIngestion().setMailDefaultVisibility(DocumentVisibility.TENANT); + var properties = new DipIngestionProperties(); + properties.setEnabled(true); + properties.setMailAdapterEnabled(true); + properties.setMailImportBatchId("test-mail-batch"); + properties.setDefaultOwnerTenantKey("tenant-a"); + properties.setMailDefaultVisibility(DocumentVisibility.TENANT); MailMessageExtractionService extractionService = new MailMessageExtractionService(); adapter = new MailDocumentIngestionAdapter( diff --git a/src/test/java/at/procon/dip/ingestion/integration/MailBundleProcessingIntegrationTest.java b/src/test/java/at/procon/dip/ingestion/integration/MailBundleProcessingIntegrationTest.java index 7c30b86..3e673dc 100644 --- a/src/test/java/at/procon/dip/ingestion/integration/MailBundleProcessingIntegrationTest.java +++ b/src/test/java/at/procon/dip/ingestion/integration/MailBundleProcessingIntegrationTest.java @@ -26,6 +26,7 @@ import at.procon.dip.domain.tenant.repository.DocumentTenantRepository; import at.procon.dip.extraction.impl.*; import at.procon.dip.extraction.service.DocumentExtractionService; import at.procon.dip.ingestion.adapter.MailDocumentIngestionAdapter; +import at.procon.dip.ingestion.config.DipIngestionProperties; import at.procon.dip.ingestion.service.DocumentIngestionGateway; import at.procon.dip.ingestion.service.GenericDocumentImportService; import at.procon.dip.ingestion.service.MailMessageExtractionService; @@ -36,7 +37,6 @@ import at.procon.dip.normalization.impl.DefaultGenericTextRepresentationBuilder; import at.procon.dip.normalization.service.TextRepresentationBuildService; import 
at.procon.dip.processing.service.StructuredDocumentProcessingService; import at.procon.dip.search.service.DocumentLexicalIndexService; -import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.service.attachment.PdfExtractionService; import at.procon.ted.service.attachment.ZipExtractionService; import java.io.IOException; @@ -334,7 +334,7 @@ class MailBundleProcessingIntegrationTest { TransactionAutoConfiguration.class, JdbcTemplateAutoConfiguration.class }) - @EnableConfigurationProperties(TedProcessorProperties.class) + @EnableConfigurationProperties(DipIngestionProperties.class) @EntityScan(basePackages = { "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity" diff --git a/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchEndpointIntegrationTest.java b/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchEndpointIntegrationTest.java index 8db322d..62454d7 100644 --- a/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchEndpointIntegrationTest.java +++ b/src/test/java/at/procon/dip/search/integration/GenericSemanticSearchEndpointIntegrationTest.java @@ -78,6 +78,7 @@ class GenericSemanticSearchEndpointIntegrationTest extends AbstractSemanticSearc @Test void debugEndpoint_should_show_semantic_engine_in_plan() throws Exception { + var modelKey = "mock-search"; //"e5-default"; dataFactory.createAndEmbedPrimaryRepresentation( "Heat network planning", "Municipal energy planning", @@ -86,14 +87,14 @@ class GenericSemanticSearchEndpointIntegrationTest extends AbstractSemanticSearc DocumentFamily.GENERIC, "en", RepresentationType.SEMANTIC_TEXT, - "mock-search" + modelKey ); SearchRequest request = SearchRequest.builder() .queryText("district heating optimization") .modes(Set.of(SearchMode.HYBRID)) .representationSelectionMode(SearchRepresentationSelectionMode.PRIMARY_ONLY) - .semanticModelKey("mock-search") + .semanticModelKey(modelKey) .build(); mockMvc.perform(post("/search/debug") 
diff --git a/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java b/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java index 0616125..540e944 100644 --- a/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java +++ b/src/test/java/at/procon/dip/testsupport/SearchSemanticTestApplication.java @@ -7,6 +7,7 @@ import at.procon.dip.domain.document.service.DocumentContentService; import at.procon.dip.domain.document.service.DocumentRepresentationService; import at.procon.dip.domain.document.service.DocumentService; import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.ingestion.config.DipIngestionProperties; import at.procon.dip.search.engine.fulltext.PostgresFullTextSearchEngine; import at.procon.dip.search.engine.trigram.PostgresTrigramSearchEngine; import at.procon.dip.search.plan.DefaultSearchPlanner; @@ -20,7 +21,6 @@ import at.procon.dip.search.service.DefaultSearchOrchestrator; import at.procon.dip.search.service.DocumentLexicalIndexService; import at.procon.dip.search.service.SearchMetricsService; import at.procon.dip.search.web.GenericSearchController; -import at.procon.ted.config.TedProcessorProperties; import org.springframework.boot.SpringBootConfiguration; import org.springframework.boot.autoconfigure.EnableAutoConfiguration; import org.springframework.boot.autoconfigure.ImportAutoConfiguration; @@ -48,7 +48,7 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories; TransactionAutoConfiguration.class, JdbcTemplateAutoConfiguration.class }) -@EnableConfigurationProperties({TedProcessorProperties.class}) +@EnableConfigurationProperties({DipIngestionProperties.class}) @EntityScan(basePackages = { "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", diff --git a/src/test/java/at/procon/dip/testsupport/SearchTestApplication.java b/src/test/java/at/procon/dip/testsupport/SearchTestApplication.java index 69ad9c4..2fd5eaf 100644 
--- a/src/test/java/at/procon/dip/testsupport/SearchTestApplication.java +++ b/src/test/java/at/procon/dip/testsupport/SearchTestApplication.java @@ -5,6 +5,7 @@ import at.procon.dip.domain.document.service.DocumentContentService; import at.procon.dip.domain.document.service.DocumentRepresentationService; import at.procon.dip.domain.document.service.DocumentService; +import at.procon.dip.ingestion.config.DipIngestionProperties; import at.procon.dip.search.engine.fulltext.PostgresFullTextSearchEngine; import at.procon.dip.search.engine.trigram.PostgresTrigramSearchEngine; import at.procon.dip.search.plan.DefaultSearchPlanner; @@ -16,8 +17,6 @@ import at.procon.dip.search.service.DefaultSearchOrchestrator; import at.procon.dip.search.service.DocumentLexicalIndexService; import at.procon.dip.search.service.SearchMetricsService; import at.procon.dip.search.web.GenericSearchController; -import at.procon.ted.config.TedProcessorProperties; -import com.fasterxml.jackson.databind.ObjectMapper; import org.springframework.boot.SpringBootConfiguration; import org.springframework.boot.autoconfigure.EnableAutoConfiguration; import org.springframework.boot.autoconfigure.ImportAutoConfiguration; @@ -49,7 +48,7 @@ import org.springframework.data.jpa.repository.config.EnableJpaRepositories; TransactionAutoConfiguration.class, JdbcTemplateAutoConfiguration.class }) -@EnableConfigurationProperties(TedProcessorProperties.class) +@EnableConfigurationProperties(DipIngestionProperties.class) @EntityScan(basePackages = { "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity"