package at.procon.ted.config; import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; import at.procon.dip.runtime.config.RuntimeMode; import lombok.Data; import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Primary; import org.springframework.validation.annotation.Validated; import jakarta.validation.constraints.Min; import jakarta.validation.constraints.NotBlank; import jakarta.validation.constraints.Positive; /** * Configuration properties for TED Procurement Processor. * * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Configuration @ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @ConfigurationProperties(prefix = "ted") @Data @Validated @Primary public class TedProcessorProperties { private InputProperties input = new InputProperties(); private SchemaProperties schema = new SchemaProperties(); private VectorizationProperties vectorization = new VectorizationProperties(); private SearchProperties search = new SearchProperties(); private DownloadProperties download = new DownloadProperties(); private MailProperties mail = new MailProperties(); private SolutionBriefProperties solutionBrief = new SolutionBriefProperties(); private ProjectionProperties projection = new ProjectionProperties(); private GenericIngestionProperties genericIngestion = new GenericIngestionProperties(); private RepairProperties repair = new RepairProperties(); /** * Input directory configuration for Apache Camel file consumer. */ @Data public static class InputProperties { /** * Base directory for watching incoming TED XML files. */ @NotBlank private String directory = "D:/ted.europe/2025-11.tar/2025-11/11"; /** * File pattern to match (supports Ant-style patterns). */ private String pattern = "**/*.xml"; /** * Directory to move successfully processed files. */ private String processedDirectory = ".processed"; /** * Directory to move failed files. */ private String errorDirectory = ".error"; /** * Polling interval in milliseconds. */ @Positive private long pollInterval = 5000; /** * Maximum number of messages per poll. */ @Positive private int maxMessagesPerPoll = 100; } /** * XML Schema validation configuration. */ @Data public static class SchemaProperties { /** * Enable/disable XSD validation. */ private boolean enabled = true; /** * Path to the eForms XSD schema file. */ private String path = "classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd"; } /** * Document vectorization configuration. */ @Data public static class VectorizationProperties { /** * Enable/disable async vectorization. */ private boolean enabled = true; /** * Use external HTTP API instead of Python subprocess. */ private boolean useHttpApi = false; /** * Embedding service HTTP API URL. */ private String apiUrl = "http://localhost:8001"; /** * Sentence transformer model name. */ private String modelName = "intfloat/multilingual-e5-large"; /** * Vector dimensions (must match model output). */ @Positive private int dimensions = 1024; /** * Batch size for vectorization processing. */ @Min(1) private int batchSize = 16; /** * Thread pool size for async vectorization. */ @Min(1) private int threadPoolSize = 4; /** * Maximum text length for vectorization (characters). */ @Positive private int maxTextLength = 8192; /** * HTTP connection timeout in milliseconds. */ @Positive private int connectTimeout = 10000; /** * HTTP socket/read timeout in milliseconds. */ @Positive private int socketTimeout = 60000; /** * Maximum retries on connection failure. */ @Min(0) private int maxRetries = 5; @Positive private long genericSchedulerPeriodMs = 30000; private String primaryRepresentationBuilderKey = "default-generic"; } /** * Search configuration. */ @Data public static class SearchProperties { /** * Default page size for search results. */ @Positive private int defaultPageSize = 20; /** * Maximum allowed page size. */ @Positive private int maxPageSize = 100; /** * Similarity threshold for vector search (0.0 - 1.0). */ private double similarityThreshold = 0.7; /** * Minimum trigram similarity for fuzzy lexical matches. */ private double trigramSimilarityThreshold = 0.12; /** * Candidate limits per search engine before fusion/collapse. */ @Positive private int fulltextCandidateLimit = 120; @Positive private int trigramCandidateLimit = 120; @Positive private int semanticCandidateLimit = 120; /** * Hybrid fusion weights. */ private double fulltextWeight = 0.35; private double trigramWeight = 0.20; private double semanticWeight = 0.45; /** * Enable chunk representations for long documents. */ private boolean chunkingEnabled = true; /** * Target chunk size in characters for CHUNK representations. */ @Positive private int chunkTargetChars = 1800; /** * Overlap between consecutive chunks in characters. */ @Min(0) private int chunkOverlapChars = 200; /** * Maximum CHUNK representations generated per document. */ @Positive private int maxChunksPerDocument = 12; /** * Additional score weight for recency. */ private double recencyBoostWeight = 0.05; /** * Half-life in days used for recency decay. */ @Positive private int recencyHalfLifeDays = 30; /** * Startup backfill limit for missing DOC lexical vectors. */ @Positive private int startupLexicalBackfillLimit = 500; /** * Number of hits per engine returned by the debug endpoint. */ @Positive private int debugTopHitsPerEngine = 10; } /** * TED Daily Package Download configuration. */ @Data public static class DownloadProperties { /** * Enable/disable automatic package download. */ private boolean enabled = false; /** * Base URL für TED Daily Packages. */ private String baseUrl = "https://ted.europa.eu/packages/daily/"; /** * Download-Verzeichnis für tar.gz Files. */ private String downloadDirectory = "D:/ted.europe/downloads"; /** * Extrahierungs-Verzeichnis für XML-Dateien. */ private String extractDirectory = "D:/ted.europe/extracted"; /** * Start-Jahr für den Download. */ @Positive private int startYear = 2015; /** * Anzahl aufeinanderfolgender 404-Fehler bevor Download stoppt. * HINWEIS: Wird nicht mehr verwendet. System stoppt jetzt sofort bei erstem 404. * @deprecated Nicht mehr verwendet seit Update auf sofortige 404-Behandlung */ @Positive @Deprecated private int maxConsecutive404 = 1; /** * Polling-Interval für neue Packages (Millisekunden). */ @Positive private long pollInterval = 3600000; // 1 Stunde /** * Retry-Intervall für tail-NOT_FOUND Packages. * Current year packages remain retryable indefinitely. */ @Positive private long notFoundRetryInterval = 21600000; // 6 Stunden /** * Grace period for previous years after year end before a tail-NOT_FOUND is treated as final. */ @Min(0) private int previousYearGracePeriodDays = 30; /** * Keep retrying current-year tail NOT_FOUND packages indefinitely. */ private boolean retryCurrentYearNotFoundIndefinitely = true; /** * Download-Timeout (Millisekunden). */ @Positive private long downloadTimeout = 300000; // 5 Minuten /** * Maximale gleichzeitige Downloads. */ @Positive private int maxConcurrentDownloads = 2; /** * Verzögerung zwischen Downloads (Millisekunden) für Rate Limiting. */ @Positive private long delayBetweenDownloads = 5000; // 5 Sekunden /** * Automatisches Löschen von tar.gz nach Extraktion. */ private boolean deleteAfterExtraction = true; /** * Priorisierung: Aktuelles Jahr zuerst, dann rückwärts. * HINWEIS: Wird nicht mehr verwendet. System priorisiert immer das aktuelle Jahr. * @deprecated Nicht mehr verwendet - immer aktiv */ @Deprecated private boolean prioritizeCurrentYear = true; } /** * Legacy TED package repair / re-import configuration. */ @Data public static class RepairProperties { /** * Enable startup repair of incomplete or missing TED packages. */ private boolean enabled = false; /** * If true, only logs the selected package candidates without modifying data. */ private boolean dryRun = false; /** * Maximum number of packages to process in one startup run. */ @Positive private int maxPackages = 100; /** * Optional explicit package identifiers (YYYYSSSSS) to repair. */ private java.util.List packageIdentifiers = new java.util.ArrayList<>(); /** * Optional lower bound package identifier (inclusive). */ private String fromPackageIdentifier; /** * Optional upper bound package identifier (inclusive). */ private String toPackageIdentifier; /** * Include missing package sequence numbers inside the selected range. */ private boolean includeMissingSequenceGaps = true; /** * Re-download the package archive when it is missing locally. */ private boolean redownloadMissingArchives = true; /** * Always re-download the package archive even when a local archive already exists. */ private boolean forceRedownload = false; /** * Refuse startup repair while the automatic legacy package download scheduler is enabled. */ private boolean allowWhileDownloadEnabled = false; } /** * IMAP Mail configuration for email processing. */ @Data public static class MailProperties { /** * Enable/disable mail processing. */ private boolean enabled = false; /** * IMAP server hostname. */ @NotBlank private String host = "mail.mymagenta.business"; /** * IMAP server port. */ @Positive private int port = 993; /** * Mail account username (email address). */ @NotBlank private String username = "archiv@procon.co.at"; /** * Mail account password. */ @NotBlank private String password = ""; /** * Use SSL/TLS connection. */ private boolean ssl = true; /** * Mail folder to read from. */ private String folderName = "INBOX"; /** * Delete messages after processing. */ private boolean delete = false; /** * Mark messages as seen after processing. */ private boolean seen = true; /** * Only process unseen messages. */ private boolean unseen = true; /** * Polling delay in milliseconds. */ @Positive private long delay = 60000; /** * Max messages per poll. */ @Positive private int maxMessagesPerPoll = 10; /** * Output directory for processed attachments. */ private String attachmentOutputDirectory = "D:/ted.europe/mail-attachments"; /** * Enable/disable MIME file input processing. */ private boolean mimeInputEnabled = false; /** * Input directory for MIME files (.eml, .msg). */ private String mimeInputDirectory = "D:/ted.europe/mime-input"; /** * File pattern for MIME files. */ private String mimeInputPattern = "*.eml"; /** * Polling interval for MIME input directory (milliseconds). */ @Positive private long mimeInputPollInterval = 10000; } /** * Phase 3 TED projection configuration. */ @Data public static class ProjectionProperties { /** * Enable/disable Phase 3 TED structured projection dual-write. */ private boolean enabled = true; /** * Optional startup backfill of missing projections from legacy TED documents. */ private boolean startupBackfillEnabled = false; /** * Maximum number of legacy TED documents to backfill during startup. */ @Positive private int startupBackfillLimit = 250; } /** * Solution Brief processing configuration. * Scans PDF files and generates Excel reports with similar TED documents. */ @Data public static class SolutionBriefProperties { /** * Enable/disable Solution Brief processing. */ private boolean enabled = false; /** * Input directory for Solution Brief PDF files. */ private String inputDirectory = "C:/work/SolutionBrief"; /** * Output directory for Excel result files (relative to input or absolute). */ private String resultDirectory = "./result"; /** * Number of top similar documents to include in results. */ @Positive private int topK = 20; /** * Minimum similarity threshold (0.0-1.0). */ private double similarityThreshold = 0.5; /** * Polling interval in milliseconds. */ @Positive private long pollInterval = 30000; /** * File pattern for PDF files. */ private String filePattern = ".*\\.pdf"; /** * Process files only once (idempotent based on filename+size+date). */ private boolean idempotent = true; /** * Idempotent repository file path. */ private String idempotentRepository = "./solution-brief-processed.dat"; } /** * Phase 4 generic ingestion configuration. */ @Data public static class GenericIngestionProperties { /** * Master switch for the generic ingestion pipeline. */ private boolean enabled = false; /** * Enable/disable filesystem import route for arbitrary documents. */ private boolean fileSystemEnabled = false; /** * Enable/disable REST/API upload endpoints for arbitrary documents. */ private boolean restUploadEnabled = true; /** * Input directory for the generic filesystem importer. */ private String inputDirectory = "/ted.europe/generic-input"; /** * Regular-expression file pattern used by the Camel file route. */ private String filePattern = ".*\\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$"; /** * Directory where successfully imported files are moved. */ private String processedDirectory = ".dip-processed"; /** * Directory where failed files are moved. */ private String errorDirectory = ".dip-error"; /** * Polling interval in milliseconds. */ @Positive private long pollInterval = 15000; /** * Maximum number of files per poll. */ @Positive private int maxMessagesPerPoll = 10; /** * Default owner tenant for files imported through the generic route. */ private String defaultOwnerTenantKey; /** * Default visibility for generic imports when not supplied explicitly. */ private at.procon.dip.domain.access.DocumentVisibility defaultVisibility = at.procon.dip.domain.access.DocumentVisibility.PUBLIC; /** * Optional default language code applied to filesystem imports. */ private String defaultLanguageCode; /** * Persist original binary payloads in DB when they are small enough. */ private boolean storeOriginalBinaryInDb = true; /** * Maximum binary size (bytes) stored inline in DOC.doc_content.binary_content. */ @Positive private int maxBinaryBytesInDb = 5242880; /** * Whether an already imported content hash should resolve to the existing document. */ private boolean deduplicateByContentHash = true; /** * Persist ORIGINAL content rows for wrapper/container documents that primarily exist * to group or reference child documents (for example TED packages or expanded ZIP wrappers). * When disabled, wrappers are still classified, extracted and represented, but the raw * ORIGINAL content payload is not stored in DOC.doc_content. */ private boolean storeOriginalContentForWrapperDocuments = true; /** * Queue only the primary text representation for embedding. */ private boolean vectorizePrimaryRepresentationOnly = true; /** * Import batch identifier written to DOC.doc_source.import_batch_id. */ @NotBlank private String importBatchId = "phase4-generic"; /** * Enable the Phase 4.1 TED package adapter built on top of the generic DOC ingestion SPI. */ private boolean tedPackageAdapterEnabled = true; /** * Enable the Phase 4.1 mail/document adapter built on top of the generic DOC ingestion SPI. */ private boolean mailAdapterEnabled = false; /** * Optional dedicated owner tenant key for imported mail messages and attachments. * Falls back to defaultOwnerTenantKey when not configured. */ private String mailDefaultOwnerTenantKey; /** * Default visibility for imported mail messages and attachments. */ private at.procon.dip.domain.access.DocumentVisibility mailDefaultVisibility = at.procon.dip.domain.access.DocumentVisibility.TENANT; /** * Expand ZIP attachments recursively through the mail adapter. */ private boolean expandMailZipAttachments = true; /** * Import batch identifier for TED package roots and extracted TED child documents. */ @NotBlank private String tedPackageImportBatchId = "phase41-ted-package"; /** * When true, TED packages are persisted only through the generic ingestion gateway * and the legacy XML batch persistence path is skipped. */ private boolean gatewayOnlyForTedPackages = false; /** * Import batch identifier for imported mail root messages and child attachments. */ @NotBlank private String mailImportBatchId = "phase41-mail"; } }