package at.procon.ted.config; import lombok.Data; import org.springframework.boot.context.properties.ConfigurationProperties; import org.springframework.context.annotation.Configuration; import org.springframework.validation.annotation.Validated; import jakarta.validation.constraints.Min; import jakarta.validation.constraints.NotBlank; import jakarta.validation.constraints.Positive; /** * Configuration properties for TED Procurement Processor. * * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Configuration @ConfigurationProperties(prefix = "ted") @Data @Validated public class TedProcessorProperties { private InputProperties input = new InputProperties(); private SchemaProperties schema = new SchemaProperties(); private VectorizationProperties vectorization = new VectorizationProperties(); private SearchProperties search = new SearchProperties(); private DownloadProperties download = new DownloadProperties(); private MailProperties mail = new MailProperties(); private SolutionBriefProperties solutionBrief = new SolutionBriefProperties(); private ProjectionProperties projection = new ProjectionProperties(); /** * Input directory configuration for Apache Camel file consumer. */ @Data public static class InputProperties { /** * Base directory for watching incoming TED XML files. */ @NotBlank private String directory = "D:/ted.europe/2025-11.tar/2025-11/11"; /** * File pattern to match (supports Ant-style patterns). */ private String pattern = "**/*.xml"; /** * Directory to move successfully processed files. */ private String processedDirectory = ".processed"; /** * Directory to move failed files. */ private String errorDirectory = ".error"; /** * Polling interval in milliseconds. */ @Positive private long pollInterval = 5000; /** * Maximum number of messages per poll. */ @Positive private int maxMessagesPerPoll = 100; } /** * XML Schema validation configuration. */ @Data public static class SchemaProperties { /** * Enable/disable XSD validation. */ private boolean enabled = true; /** * Path to the eForms XSD schema file. */ private String path = "classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd"; } /** * Document vectorization configuration. */ @Data public static class VectorizationProperties { /** * Enable/disable async vectorization. */ private boolean enabled = true; /** * Use external HTTP API instead of Python subprocess. */ private boolean useHttpApi = false; /** * Embedding service HTTP API URL. */ private String apiUrl = "http://localhost:8001"; /** * Sentence transformer model name. */ private String modelName = "intfloat/multilingual-e5-large"; /** * Vector dimensions (must match model output). */ @Positive private int dimensions = 1024; /** * Batch size for vectorization processing. */ @Min(1) private int batchSize = 16; /** * Thread pool size for async vectorization. */ @Min(1) private int threadPoolSize = 4; /** * Maximum text length for vectorization (characters). */ @Positive private int maxTextLength = 8192; /** * HTTP connection timeout in milliseconds. */ @Positive private int connectTimeout = 10000; /** * HTTP socket/read timeout in milliseconds. */ @Positive private int socketTimeout = 60000; /** * Maximum retries on connection failure. */ @Min(0) private int maxRetries = 5; /** * Enable the Phase 2 generic vectorization pipeline based on DOC text representations * and DOC embeddings instead of the legacy TED document vector columns as the primary * write target. */ private boolean genericPipelineEnabled = true; /** * Keep writing completed TED embeddings back to the legacy ted.procurement_document * vector columns so the existing semantic search stays operational during migration. */ private boolean dualWriteLegacyTedVectors = true; /** * Scheduler interval for generic embedding polling (milliseconds). */ @Positive private long genericSchedulerPeriodMs = 6000; /** * Builder key for the primary TED semantic representation created during Phase 2 dual-write. */ @NotBlank private String primaryRepresentationBuilderKey = "ted-phase2-primary-representation"; /** * Provider key used when registering the configured embedding model in DOC.doc_embedding_model. */ @NotBlank private String embeddingProvider = "http-embedding-service"; } /** * Search configuration. */ @Data public static class SearchProperties { /** * Default page size for search results. */ @Positive private int defaultPageSize = 20; /** * Maximum allowed page size. */ @Positive private int maxPageSize = 100; /** * Similarity threshold for vector search (0.0 - 1.0). */ private double similarityThreshold = 0.7; } /** * TED Daily Package Download configuration. */ @Data public static class DownloadProperties { /** * Enable/disable automatic package download. */ private boolean enabled = false; /** * Base URL für TED Daily Packages. */ private String baseUrl = "https://ted.europa.eu/packages/daily/"; /** * Download-Verzeichnis für tar.gz Files. */ private String downloadDirectory = "D:/ted.europe/downloads"; /** * Extrahierungs-Verzeichnis für XML-Dateien. */ private String extractDirectory = "D:/ted.europe/extracted"; /** * Start-Jahr für den Download. */ @Positive private int startYear = 2015; /** * Anzahl aufeinanderfolgender 404-Fehler bevor Download stoppt. * HINWEIS: Wird nicht mehr verwendet. System stoppt jetzt sofort bei erstem 404. * @deprecated Nicht mehr verwendet seit Update auf sofortige 404-Behandlung */ @Positive @Deprecated private int maxConsecutive404 = 1; /** * Polling-Interval für neue Packages (Millisekunden). */ @Positive private long pollInterval = 3600000; // 1 Stunde /** * Retry-Intervall für tail-NOT_FOUND Packages. * Current year packages remain retryable indefinitely. */ @Positive private long notFoundRetryInterval = 21600000; // 6 Stunden /** * Grace period for previous years after year end before a tail-NOT_FOUND is treated as final. */ @Min(0) private int previousYearGracePeriodDays = 30; /** * Keep retrying current-year tail NOT_FOUND packages indefinitely. */ private boolean retryCurrentYearNotFoundIndefinitely = true; /** * Download-Timeout (Millisekunden). */ @Positive private long downloadTimeout = 300000; // 5 Minuten /** * Maximale gleichzeitige Downloads. */ @Positive private int maxConcurrentDownloads = 2; /** * Verzögerung zwischen Downloads (Millisekunden) für Rate Limiting. */ @Positive private long delayBetweenDownloads = 5000; // 5 Sekunden /** * Automatisches Löschen von tar.gz nach Extraktion. */ private boolean deleteAfterExtraction = true; /** * Priorisierung: Aktuelles Jahr zuerst, dann rückwärts. * HINWEIS: Wird nicht mehr verwendet. System priorisiert immer das aktuelle Jahr. * @deprecated Nicht mehr verwendet - immer aktiv */ @Deprecated private boolean prioritizeCurrentYear = true; } /** * IMAP Mail configuration for email processing. */ @Data public static class MailProperties { /** * Enable/disable mail processing. */ private boolean enabled = false; /** * IMAP server hostname. */ @NotBlank private String host = "mail.mymagenta.business"; /** * IMAP server port. */ @Positive private int port = 993; /** * Mail account username (email address). */ @NotBlank private String username = "archiv@procon.co.at"; /** * Mail account password. */ @NotBlank private String password = ""; /** * Use SSL/TLS connection. */ private boolean ssl = true; /** * Mail folder to read from. */ private String folderName = "INBOX"; /** * Delete messages after processing. */ private boolean delete = false; /** * Mark messages as seen after processing. */ private boolean seen = true; /** * Only process unseen messages. */ private boolean unseen = true; /** * Polling delay in milliseconds. */ @Positive private long delay = 60000; /** * Max messages per poll. */ @Positive private int maxMessagesPerPoll = 10; /** * Output directory for processed attachments. */ private String attachmentOutputDirectory = "D:/ted.europe/mail-attachments"; /** * Enable/disable MIME file input processing. */ private boolean mimeInputEnabled = false; /** * Input directory for MIME files (.eml, .msg). */ private String mimeInputDirectory = "D:/ted.europe/mime-input"; /** * File pattern for MIME files. */ private String mimeInputPattern = "*.eml"; /** * Polling interval for MIME input directory (milliseconds). */ @Positive private long mimeInputPollInterval = 10000; } /** * Phase 3 TED projection configuration. */ @Data public static class ProjectionProperties { /** * Enable/disable Phase 3 TED structured projection dual-write. */ private boolean enabled = true; /** * Optional startup backfill of missing projections from legacy TED documents. */ private boolean startupBackfillEnabled = false; /** * Maximum number of legacy TED documents to backfill during startup. */ @Positive private int startupBackfillLimit = 250; } /** * Solution Brief processing configuration. * Scans PDF files and generates Excel reports with similar TED documents. */ @Data public static class SolutionBriefProperties { /** * Enable/disable Solution Brief processing. */ private boolean enabled = false; /** * Input directory for Solution Brief PDF files. */ private String inputDirectory = "C:/work/SolutionBrief"; /** * Output directory for Excel result files (relative to input or absolute). */ private String resultDirectory = "./result"; /** * Number of top similar documents to include in results. */ @Positive private int topK = 20; /** * Minimum similarity threshold (0.0-1.0). */ private double similarityThreshold = 0.5; /** * Polling interval in milliseconds. */ @Positive private long pollInterval = 30000; /** * File pattern for PDF files. */ private String filePattern = ".*\\.pdf"; /** * Process files only once (idempotent based on filename+size+date). */ private boolean idempotent = true; /** * Idempotent repository file path. */ private String idempotentRepository = "./solution-brief-processed.dat"; } }