diff --git a/docs/architecture/TED_PACKAGE_NEW_IMPORT.md b/docs/architecture/TED_PACKAGE_NEW_IMPORT.md new file mode 100644 index 0000000..de0a1d8 --- /dev/null +++ b/docs/architecture/TED_PACKAGE_NEW_IMPORT.md @@ -0,0 +1,20 @@ +# NEW TED package import route + +This patch adds a NEW-runtime TED package download path that: + +- reuses the proven package sequencing rules +- stores package tracking in `TedDailyPackage` +- downloads the package tar.gz +- ingests it only through `DocumentIngestionGateway` +- never calls the legacy XML batch processing / vectorization flow + +## Added classes + +- `TedPackageSequenceService` +- `DefaultTedPackageSequenceService` +- `TedPackageDownloadProperties` +- `TedPackageDownloadRoute` + +## Config + +Use the `dip.ingestion.ted-download.*` block in `application-new.yml`. diff --git a/src/main/java/at/procon/dip/domain/ted/service/DefaultTedPackageSequenceService.java b/src/main/java/at/procon/dip/domain/ted/service/DefaultTedPackageSequenceService.java new file mode 100644 index 0000000..19ed138 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/ted/service/DefaultTedPackageSequenceService.java @@ -0,0 +1,189 @@ +package at.procon.dip.domain.ted.service; + +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import at.procon.dip.ingestion.config.TedPackageDownloadProperties; +import at.procon.ted.model.entity.TedDailyPackage; +import at.procon.ted.repository.TedDailyPackageRepository; +import java.time.Duration; +import java.time.LocalDate; +import java.time.OffsetDateTime; +import java.time.Year; +import java.time.ZoneOffset; +import java.util.Optional; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; + +/** + * NEW-runtime implementation of TED package sequencing. + *
+ * This reuses the same decision rules as the legacy TED package downloader: + *
+ * This service encapsulates the proven sequencing rules from the legacy download implementation
+ * so they can also be used by the NEW runtime without depending on the old route/service graph.
+ */
+public interface TedPackageSequenceService {
+
+    /**
+     * Selects the package that should be downloaded next.
+     *
+     * @return the next package under the active sequencing strategy, or {@code null} when
+     *         there is currently nothing to download
+     */
+    PackageInfo getNextPackageToDownload();
+
+    /**
+     * Year / serial-number pair that identifies one TED package.
+     *
+     * @param year         the package year
+     * @param serialNumber the package serial number within that year
+     */
+    record PackageInfo(int year, int serialNumber) {
+
+        /**
+         * Builds the TED package identifier.
+         *
+         * @return the year zero-padded to at least four digits, immediately followed by the
+         *         serial number zero-padded to at least five digits
+         */
+        public String identifier() {
+            return String.format("%04d%05d", year, serialNumber);
+        }
+    }
+}
diff --git a/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java
index 91025ec..9592740 100644
--- a/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java
+++ b/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java
@@ -6,11 +6,15 @@ import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
+
+import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
+import at.procon.dip.runtime.config.RuntimeMode;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
@Component
@RequiredArgsConstructor
+@ConditionalOnRuntimeMode(RuntimeMode.NEW)
public class FileSystemDocumentIngestionAdapter implements DocumentIngestionAdapter {
private final GenericDocumentImportService importService;
diff --git a/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java
index 39088ed..93a20a6 100644
--- a/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java
+++ b/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java
@@ -7,11 +7,15 @@ import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List;
+
+import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
+import at.procon.dip.runtime.config.RuntimeMode;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
@Component
@RequiredArgsConstructor
+@ConditionalOnRuntimeMode(RuntimeMode.NEW)
public class InlineContentDocumentIngestionAdapter implements DocumentIngestionAdapter {
private final GenericDocumentImportService importService;
diff --git a/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java
index 45429fa..77dc7b3 100644
--- a/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java
+++ b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java
@@ -1,6 +1,8 @@
package at.procon.dip.ingestion.adapter;
import at.procon.dip.domain.access.DocumentAccessContext;
+import at.procon.dip.domain.document.CanonicalDocumentMetadata;
+import at.procon.dip.domain.document.SourceType;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.TedPackageChildImportProcessor;
@@ -22,6 +24,7 @@ import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
@@ -40,7 +43,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap
@Override
public boolean supports(SourceDescriptor sourceDescriptor) {
- return sourceDescriptor.sourceType() == at.procon.dip.domain.document.SourceType.TED_PACKAGE
+ return sourceDescriptor.sourceType() == SourceType.TED_PACKAGE
&& properties.isEnabled()
&& properties.isTedPackageAdapterEnabled();
}
@@ -58,7 +61,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap
ImportedDocumentResult packageDocument = importService.importDocument(new SourceDescriptor(
sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(),
- at.procon.dip.domain.document.SourceType.TED_PACKAGE,
+ SourceType.TED_PACKAGE,
sourceDescriptor.sourceIdentifier(),
packageRootSource.sourceUri(),
sourceDescriptor.fileName(),
@@ -71,7 +74,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap
));
List
+ * Reuses the proven package sequencing rules through {@link TedPackageSequenceService},
+ * but hands off processing only to the NEW ingestion gateway. No legacy XML batch persistence,
+ * no legacy vectorization route, no old semantic path.
+ */
+@Component
+@ConditionalOnRuntimeMode(RuntimeMode.NEW)
+@ConditionalOnProperty(name = "dip.ingestion.ted-download.enabled", havingValue = "true")
+@RequiredArgsConstructor
+@Slf4j
+public class TedPackageDownloadRoute extends RouteBuilder {
+
+ private static final String ROUTE_ID_SCHEDULER = "ted-package-new-scheduler";
+ private static final String ROUTE_ID_DOWNLOADER = "ted-package-new-downloader";
+ private static final String ROUTE_ID_ERROR = "ted-package-new-error-handler";
+
+ private final TedPackageDownloadProperties properties;
+ private final TedDailyPackageRepository packageRepository;
+ private final TedPackageSequenceService sequenceService;
+ private final DocumentIngestionGateway documentIngestionGateway;
+
+ /**
+  * Defines the three Camel routes of the NEW-runtime TED package download:
+  * the dead-letter error handler, the timer-driven scheduler and the downloader.
+  */
+ @Override
+ public void configure() {
+     // Shared error policy: up to 3 redeliveries, 10 s apart, then hand the exchange
+     // to the dead-letter route below.
+     errorHandler(deadLetterChannel("direct:ted-package-new-error")
+             .maximumRedeliveries(3)
+             .redeliveryDelay(10_000)
+             .retryAttemptedLogLevel(LoggingLevel.WARN)
+             .logStackTrace(true));
+
+     // Dead-letter route: failed exchanges are passed to handleError.
+     from("direct:ted-package-new-error")
+             .routeId(ROUTE_ID_ERROR)
+             .process(this::handleError);
+
+     // Scheduler: fires immediately (delay=0) and then every poll-interval milliseconds.
+     // Skips the tick when the running-package limit is reached; otherwise asks for the
+     // next package and triggers a download only if a packageId header was produced.
+     from("timer:ted-package-new-scheduler?period={{dip.ingestion.ted-download.poll-interval:3600000}}&delay=0")
+             .routeId(ROUTE_ID_SCHEDULER)
+             .process(this::checkRunningPackages)
+             .choice()
+                 .when(header("tooManyRunning").isEqualTo(true))
+                     .log(LoggingLevel.INFO, "Skipping NEW TED package download - already ${header.runningCount} packages in progress")
+                 .otherwise()
+                     .process(this::determineNextPackage)
+                     .choice()
+                         .when(header("packageId").isNotNull())
+                             .to("direct:download-ted-package-new")
+                         .otherwise()
+                             .log(LoggingLevel.INFO, "No NEW TED package to download right now")
+                     .end()
+             .end();
+
+     // Downloader: HTTP GET of the URL in the downloadUrl header (presumably set by an
+     // earlier processor - not visible here), then branch on the HTTP status code.
+     from("direct:download-ted-package-new")
+             .routeId(ROUTE_ID_DOWNLOADER)
+             .log(LoggingLevel.INFO, "NEW TED package download started: ${header.packageId}")
+             // Must be evaluated per exchange: constant(System.currentTimeMillis()) would be
+             // computed once while the route is built, stamping every download with the
+             // same start time.
+             .process(exchange -> exchange.getIn().setHeader("downloadStartTime", System.currentTimeMillis()))
+             .process(this::createPackageRecord)
+             .delay(simple("{{dip.ingestion.ted-download.delay-between-downloads:5000}}"))
+             .setHeader(Exchange.HTTP_METHOD, constant("GET"))
+             .setHeader("CamelHttpConnectionClose", constant(true))
+             // throwExceptionOnFailure=false so non-2xx responses reach the choice below
+             // instead of going to the error handler.
+             .toD("${header.downloadUrl}?bridgeEndpoint=true&throwExceptionOnFailure=false&socketTimeout={{dip.ingestion.ted-download.download-timeout:300000}}")
+             .choice()
+                 .when(header(Exchange.HTTP_RESPONSE_CODE).isEqualTo(200))
+                     .process(this::calculateHash)
+                     .process(this::checkDuplicateByHash)
+                     .choice()
+                         .when(header("isDuplicate").isEqualTo(true))
+                             .process(this::markDuplicate)
+                         .otherwise()
+                             .process(this::saveDownloadedPackage)
+                             .process(this::ingestThroughGateway)
+                             .process(this::markCompleted)
+                     .endChoice()
+                 .when(header(Exchange.HTTP_RESPONSE_CODE).isEqualTo(404))
+                     .process(this::markNotFound)
+                 .otherwise()
+                     .process(this::markFailed)
+             .end();
+ }
+
+ /**
+  * Counts the packages currently in DOWNLOADING or PROCESSING state and publishes the
+  * result on the exchange: header {@code runningCount} holds the total, header
+  * {@code tooManyRunning} is true once the total reaches the configured maximum.
+  *
+  * @param exchange the scheduler exchange whose IN message receives the two headers
+  */
+ private void checkRunningPackages(Exchange exchange) {
+     long downloading = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING).size();
+     long processing = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.PROCESSING).size();
+     long inProgress = downloading + processing;
+     boolean limitReached = inProgress >= properties.getMaxRunningPackages();
+
+     var message = exchange.getIn();
+     message.setHeader("runningCount", inProgress);
+     message.setHeader("tooManyRunning", limitReached);
+
+     // Nothing in flight: stay quiet.
+     if (inProgress <= 0) {
+         return;
+     }
+     log.info("Currently {} TED packages in progress in NEW runtime ({} downloading, {} processing)",
+             inProgress, downloading, processing);
+ }
+
+ private void determineNextPackage(Exchange exchange) {
+ List
+ * This is intentionally separate from the legacy {@code ted.download.*} tree.
+ */
+@Configuration
+@ConfigurationProperties(prefix = "dip.ingestion.ted-download")
+// Without @Validated, Spring Boot binds the properties but never evaluates the constraint
+// annotations below, so an invalid value (blank URL, zero interval, ...) would go unnoticed.
+// Fully qualified because this file's import block is not being changed here.
+@org.springframework.validation.annotation.Validated
+@Data
+public class TedPackageDownloadProperties {
+
+    /** Enables or disables the automatic package download. */
+    private boolean enabled = false;
+
+    /** Base URL of the TED daily packages. */
+    @NotBlank
+    private String baseUrl = "https://ted.europa.eu/packages/daily/";
+
+    /** Directory the downloaded tar.gz files are written to. */
+    @NotBlank
+    private String downloadDirectory = "/ted.europe/downloads-new";
+
+    /** Start year for downloads. */
+    @Positive
+    private int startYear = 2015;
+
+    /** Scheduler polling interval in milliseconds (default: 1 hour). */
+    @Positive
+    private long pollInterval = 3_600_000L;
+
+    /** Retry interval in milliseconds for tail NOT_FOUND packages (default: 6 hours). */
+    @Positive
+    private long notFoundRetryInterval = 21_600_000L;
+
+    /** Days after year end before a previous-year tail 404 is treated as final. */
+    @Min(0)
+    private int previousYearGracePeriodDays = 30;
+
+    /** Whether current-year tail 404 packages keep being retried indefinitely. */
+    private boolean retryCurrentYearNotFoundIndefinitely = true;
+
+    /** Download timeout in milliseconds (default: 5 minutes). */
+    @Positive
+    private long downloadTimeout = 300_000L;
+
+    /** Maximum number of packages that may be downloading or processing at the same time. */
+    @Positive
+    private int maxRunningPackages = 2;
+
+    /** Delay in milliseconds applied before each download, for rate limiting (default: 5 seconds). */
+    @Positive
+    private long delayBetweenDownloads = 5_000L;
+
+    /** Whether the tar.gz is deleted after ingestion. */
+    private boolean deleteAfterIngestion = true;
+}
diff --git a/src/main/java/at/procon/ted/config/LegacyTedProperties.java b/src/main/java/at/procon/ted/config/LegacyTedProperties.java
deleted file mode 100644
index b06e2ce..0000000
--- a/src/main/java/at/procon/ted/config/LegacyTedProperties.java
+++ /dev/null
@@ -1,16 +0,0 @@
-package at.procon.ted.config;
-
-import org.springframework.boot.context.properties.ConfigurationProperties;
-import org.springframework.context.annotation.Configuration;
-
-/**
- * Patch A scaffold for the legacy runtime configuration tree.
- *
- * The legacy runtime still uses {@link TedProcessorProperties} today. This class is
- * introduced so the old configuration can be moved gradually from `ted.*` to
- * `legacy.ted.*` without blocking the runtime split.
- */
-@Configuration
-@ConfigurationProperties(prefix = "legacy.ted")
-public class LegacyTedProperties extends TedProcessorProperties {
-}
diff --git a/src/main/java/at/procon/ted/controller/DocumentController.java b/src/main/java/at/procon/ted/controller/DocumentController.java
index a7f19c2..7f0647a 100644
--- a/src/main/java/at/procon/ted/controller/DocumentController.java
+++ b/src/main/java/at/procon/ted/controller/DocumentController.java
@@ -1,5 +1,7 @@
package at.procon.ted.controller;
+import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
+import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.model.dto.DocumentDtos.*;
import at.procon.ted.model.entity.ContractNature;
import at.procon.ted.model.entity.NoticeType;
@@ -38,6 +40,7 @@ import java.util.UUID;
@RequestMapping("/v1/documents")
@RequiredArgsConstructor
@Slf4j
+@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@Tag(name = "Documents", description = "TED Procurement Document Search API")
public class DocumentController {
diff --git a/src/main/java/at/procon/ted/controller/SimilaritySearchController.java b/src/main/java/at/procon/ted/controller/SimilaritySearchController.java
index 5ce2e16..862e2cb 100644
--- a/src/main/java/at/procon/ted/controller/SimilaritySearchController.java
+++ b/src/main/java/at/procon/ted/controller/SimilaritySearchController.java
@@ -1,5 +1,7 @@
package at.procon.ted.controller;
+import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
+import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.service.SimilaritySearchService;
import at.procon.ted.service.SimilaritySearchService.SimilaritySearchResponse;
import io.swagger.v3.oas.annotations.Operation;
@@ -28,6 +30,7 @@ import java.io.IOException;
@RequestMapping("/similarity")
@RequiredArgsConstructor
@Slf4j
+@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@Tag(name = "Similarity Search", description = "Vector-based semantic similarity search on TED procurement documents")
public class SimilaritySearchController {
diff --git a/src/main/resources/application-legacy.yml b/src/main/resources/application-legacy.yml
index 23a314f..1b13171 100644
--- a/src/main/resources/application-legacy.yml
+++ b/src/main/resources/application-legacy.yml
@@ -1,11 +1,152 @@
-spring:
- config:
- activate:
- on-profile: legacy
-dip:
- runtime:
- mode: LEGACY
+# Legacy / shared application properties
+# New-runtime-only properties are moved to application-new.yml.
+ted:
+ # Directory configuration for file processing
+ input:
+ # Base directory for watching incoming TED XML files
+ directory: ${TED_INPUT_DIR:/ted.europe/extracted}
+ # File pattern to match (recursive scanning)
+ pattern: "**/*.xml"
+ # Move processed files to this directory
+ processed-directory: ${TED_PROCESSED_DIR:.processed}
+ # Move failed files to this directory
+ error-directory: ${TED_ERROR_DIR:.error}
+ # Polling interval in milliseconds
+ poll-interval: 5000
+ # Maximum messages per poll (reduced to prevent memory issues)
+ max-messages-per-poll: 10
+
+ # Schema validation configuration
+ schema:
+ # Enable/disable XSD validation
+ enabled: true
+ # Path to eForms SDK schemas (from Maven dependency or custom location)
+ path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
+
+ # Vectorization configuration
+ vectorization:
+ # Enable/disable async vectorization
+ enabled: false
+ # Use external HTTP API instead of subprocess
+ use-http-api: true
+ # Embedding service URL
+ api-url: http://172.20.240.18:8001
+ # Model name for sentence-transformers
+ model-name: intfloat/multilingual-e5-large
+ # Vector dimensions (must match model output)
+ dimensions: 1024
+ # Batch size for vectorization
+ batch-size: 16
+ # Thread pool size for async processing
+ thread-pool-size: 4
+ # Maximum text length for vectorization (characters)
+ max-text-length: 8192
+ # HTTP connection timeout (milliseconds)
+ connect-timeout: 10000
+ # HTTP socket/read timeout (milliseconds)
+ socket-timeout: 60000
+ # Maximum retries on connection failure
+ max-retries: 5
+ # Packages download configuration
+ download:
+ # Enable/disable automatic package download
+ enabled: false
+ # Use service-based camel route
+ use-service-based: false
+ # Base URL for TED Daily Packages
+ base-url: https://ted.europa.eu/packages/daily/
+ # Download directory for tar.gz files
+ download-directory: /ted.europe/downloads
+ # Extract directory for XML files
+ extract-directory: /ted.europe/extracted
+ # Start year for downloads
+ start-year: 2026
+ # Max consecutive 404 errors before stopping
+ max-consecutive-404: 4
+ # Polling interval (milliseconds) - 5 minutes
+ poll-interval: 300000
+ # Retry interval for tail NOT_FOUND packages - 6 hours
+ not-found-retry-interval: 21600000
+ # Grace period after year end before a previous-year tail 404 is treated as final
+ previous-year-grace-period-days: 30
+ # Keep retrying current-year tail 404 packages indefinitely
+ retry-current-year-not-found-indefinitely: true
+ # Download timeout (milliseconds) - 5 minutes
+ download-timeout: 300000
+ # Max concurrent downloads
+ max-concurrent-downloads: 2
+ # Delay between downloads (milliseconds) for rate limiting - 3 seconds
+ delay-between-downloads: 3000
+ # Delete tar.gz after extraction
+ delete-after-extraction: true
+ # Prioritize current year first
+ prioritize-current-year: false
+ # IMAP Mail configuration
+ mail:
+ # Enable/disable mail processing
+ enabled: false
+ # IMAP server hostname
+ host: mail.mymagenta.business
+ # IMAP server port (993 for IMAPS)
+ port: 993
+ # Mail account username (email address)
+ username: archiv@procon.co.at
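+ # Mail account password (NOTE: the fallback default below looks like a real credential committed to the repo - supply MAIL_PASSWORD via the environment and rotate it)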
+ password: ${MAIL_PASSWORD:worasigg}
+ # Use SSL/TLS connection
+ ssl: true
+ # Mail folder to read from
+ folder-name: INBOX
+ # Delete messages after processing
+ delete: false
+ # Mark messages as seen after processing (false = peek mode, don't mark as read)
+ seen: false
+ # Only process unseen messages
+ unseen: true
+ # Polling delay in milliseconds (1 minute)
+ delay: 60000
+ # Max messages per poll
+ max-messages-per-poll: 100
+ # Output directory for processed attachments
+ attachment-output-directory: /ted.europe/mail-attachments
+ # Enable/disable MIME file input processing
+ mime-input-enabled: true
+ # Input directory for MIME files (.eml)
+ mime-input-directory: /ted.europe/mime-input
+ # File pattern for MIME files (regex)
+ mime-input-pattern: .*\\.eml
+ # Polling interval for MIME input directory (milliseconds)
+ mime-input-poll-interval: 1000000
+ # solution brief processing
+ solution-brief:
+ # Enable/disable Solution Brief processing
+ enabled: false
+ # Input directory for Solution Brief PDF files
+ input-directory: C:/work/SolutionBrief
+ # Output directory for Excel result files (relative to input or absolute)
+ result-directory: ./result
+ # Number of top similar documents to include
+ top-k: 20
+ # Minimum similarity threshold (0.0-1.0)
+ similarity-threshold: 0.5
+ # Polling interval in milliseconds (30 seconds)
+ poll-interval: 30000
+ # File pattern for PDF files (regex)
+ file-pattern: .*\\.pdf
+ # Process files only once (idempotent)
+ idempotent: true
+ # Idempotent repository file path
+ idempotent-repository: ./solution-brief-processed.dat
+
+ # Data cleanup configuration
+ cleanup:
+ # Enable automatic cleanup of old documents
+ enabled: false
+ # Retention period in years (default: 10)
+ retention-years: 10
+ # Cron expression for cleanup schedule (default: daily at 2 AM)
+ cron: "0 0 2 * * *"
# Legacy runtime uses the existing ted.* property tree.
# Move old route/download/mail/vectorization/search settings here over time.
diff --git a/src/main/resources/application-new.yml b/src/main/resources/application-new.yml
index ab6ee21..f7439ab 100644
--- a/src/main/resources/application-new.yml
+++ b/src/main/resources/application-new.yml
@@ -1,16 +1,6 @@
-# New runtime overrides
-# Activate with: --spring.profiles.active=new
-
-# Optional explicit marker; file is profile-specific already
-spring:
- config:
- activate:
- on-profile: new
-
dip:
runtime:
mode: NEW
-
search:
# Default page size for search results
default-page-size: 20
@@ -44,7 +34,6 @@ dip:
startup-lexical-backfill-limit: 500
# Number of top hits per engine returned by /search/debug
debug-top-hits-per-engine: 10
-
embedding:
enabled: true
default-document-model: e5-default
@@ -75,7 +64,6 @@ dip:
active: true
jobs:
enabled: true
-
# Phase 4 generic ingestion configuration
ingestion:
# Master switch for arbitrary document ingestion into the DOC model
@@ -132,6 +120,32 @@ dip:
# Import batch marker for mail roots and attachments
mail-import-batch-id: phase41-mail
+ # ted packages download configuration
+ ted-download:
+ # Enable/disable automatic package download
+ enabled: true
+ # Base URL for TED Daily Packages
+ base-url: https://ted.europa.eu/packages/daily/
+ # Download directory for tar.gz files
+ download-directory: /ted.europe/downloads-new
+ # Start year for downloads
+ start-year: 2026
+ # Polling interval (milliseconds) - 1 hour
+ poll-interval: 3600000
+ # Retry interval for tail NOT_FOUND packages - 6 hours
+ not-found-retry-interval: 21600000
+ # Grace period after year end before a previous-year tail 404 is treated as final
+ previous-year-grace-period-days: 30
+ # Keep retrying current-year tail 404 packages indefinitely
+ retry-current-year-not-found-indefinitely: true
+ # Download timeout (milliseconds) - 5 minutes
+ download-timeout: 300000
+ # Max packages allowed in DOWNLOADING or PROCESSING state at once; the scheduler skips a poll when this limit is reached
+ max-running-packages: 2
+ # Delay between downloads (milliseconds) for rate limiting - 5 seconds
+ delay-between-downloads: 5000
+ # Delete tar.gz after ingestion
+ delete-after-ingestion: true
ted: # Phase 3 TED projection configuration
projection:
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index 283c87d..1581780 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -7,6 +7,9 @@ server:
context-path: /api
spring:
+ profiles:
+ active: new
+
application:
name: document-intelligence-platform
@@ -56,161 +59,6 @@ camel:
enabled: true
# Weniger strenge Health-Checks für File-Consumer
consumers-enabled: false
-
-# Default runtime mode: legacy / initial implementation
-# Activate profile 'new' to load application-new.yml and switch to the new runtime.
-dip:
- runtime:
- mode: LEGACY
-
-# Legacy / shared application properties
-# New-runtime-only properties are moved to application-new.yml.
-ted:
- # Directory configuration for file processing
- input:
- # Base directory for watching incoming TED XML files
- directory: ${TED_INPUT_DIR:/ted.europe/extracted}
- # File pattern to match (recursive scanning)
- pattern: "**/*.xml"
- # Move processed files to this directory
- processed-directory: ${TED_PROCESSED_DIR:.processed}
- # Move failed files to this directory
- error-directory: ${TED_ERROR_DIR:.error}
- # Polling interval in milliseconds
- poll-interval: 5000
- # Maximum messages per poll (reduced to prevent memory issues)
- max-messages-per-poll: 10
-
- # Schema validation configuration
- schema:
- # Enable/disable XSD validation
- enabled: true
- # Path to eForms SDK schemas (from Maven dependency or custom location)
- path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
-
- # Vectorization configuration
- vectorization:
- # Enable/disable async vectorization
- enabled: false
- # Use external HTTP API instead of subprocess
- use-http-api: true
- # Embedding service URL
- api-url: http://172.20.240.18:8001
- # Model name for sentence-transformers
- model-name: intfloat/multilingual-e5-large
- # Vector dimensions (must match model output)
- dimensions: 1024
- # Batch size for vectorization
- batch-size: 16
- # Thread pool size for async processing
- thread-pool-size: 4
- # Maximum text length for vectorization (characters)
- max-text-length: 8192
- # HTTP connection timeout (milliseconds)
- connect-timeout: 10000
- # HTTP socket/read timeout (milliseconds)
- socket-timeout: 60000
- # Maximum retries on connection failure
- max-retries: 5
- # Packages download configuration
- download:
- # Enable/disable automatic package download
- enabled: true
- # User service-based camel route
- use-service-based: false
- # Base URL for TED Daily Packages
- base-url: https://ted.europa.eu/packages/daily/
- # Download directory for tar.gz files
- download-directory: /ted.europe/downloads
- # Extract directory for XML files
- extract-directory: /ted.europe/extracted
- # Start year for downloads
- start-year: 2026
- # Max consecutive 404 errors before stopping
- max-consecutive-404: 4
- # Polling interval (milliseconds) - 2 minutes
- poll-interval: 300000
- # Retry interval for tail NOT_FOUND packages - 6 hours
- not-found-retry-interval: 21600000
- # Grace period after year end before a previous-year tail 404 is treated as final
- previous-year-grace-period-days: 30
- # Keep retrying current-year tail 404 packages indefinitely
- retry-current-year-not-found-indefinitely: true
- # Download timeout (milliseconds) - 5 minutes
- download-timeout: 300000
- # Max concurrent downloads
- max-concurrent-downloads: 2
- # Delay between downloads (milliseconds) for rate limiting - 5 seconds
- delay-between-downloads: 3000
- # Delete tar.gz after extraction
- delete-after-extraction: true
- # Prioritize current year first
- prioritize-current-year: false
- # IMAP Mail configuration
- mail:
- # Enable/disable mail processing
- enabled: false
- # IMAP server hostname
- host: mail.mymagenta.business
- # IMAP server port (993 for IMAPS)
- port: 993
- # Mail account username (email address)
- username: archiv@procon.co.at
- # Mail account password
- password: ${MAIL_PASSWORD:worasigg}
- # Use SSL/TLS connection
- ssl: true
- # Mail folder to read from
- folder-name: INBOX
- # Delete messages after processing
- delete: false
- # Mark messages as seen after processing (false = peek mode, don't mark as read)
- seen: false
- # Only process unseen messages
- unseen: true
- # Polling delay in milliseconds (1 minute)
- delay: 60000
- # Max messages per poll
- max-messages-per-poll: 100
- # Output directory for processed attachments
- attachment-output-directory: /ted.europe/mail-attachments
- # Enable/disable MIME file input processing
- mime-input-enabled: true
- # Input directory for MIME files (.eml)
- mime-input-directory: /ted.europe/mime-input
- # File pattern for MIME files (regex)
- mime-input-pattern: .*\\.eml
- # Polling interval for MIME input directory (milliseconds)
- mime-input-poll-interval: 1000000
- # solution brief processing
- solution-brief:
- # Enable/disable Solution Brief processing
- enabled: false
- # Input directory for Solution Brief PDF files
- input-directory: C:/work/SolutionBrief
- # Output directory for Excel result files (relative to input or absolute)
- result-directory: ./result
- # Number of top similar documents to include
- top-k: 20
- # Minimum similarity threshold (0.0-1.0)
- similarity-threshold: 0.5
- # Polling interval in milliseconds (30 seconds)
- poll-interval: 30000
- # File pattern for PDF files (regex)
- file-pattern: .*\\.pdf
- # Process files only once (idempotent)
- idempotent: true
- # Idempotent repository file path
- idempotent-repository: ./solution-brief-processed.dat
-
- # Data cleanup configuration
- cleanup:
- # Enable automatic cleanup of old documents
- enabled: false
- # Retention period in years (default: 10)
- retention-years: 10
- # Cron expression for cleanup schedule (default: daily at 2 AM)
- cron: "0 0 2 * * *"
# Actuator endpoints
management: