You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
488 lines
13 KiB
Java
488 lines
13 KiB
Java
package at.procon.ted.config;
|
|
|
|
import lombok.Data;
import lombok.ToString;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import org.springframework.validation.annotation.Validated;

import jakarta.validation.Valid;
import jakarta.validation.constraints.AssertTrue;
import jakarta.validation.constraints.Min;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.Positive;
|
|
|
|
/**
|
|
* Configuration properties for TED Procurement Processor.
|
|
*
|
|
* @author Martin.Schweitzer@procon.co.at and claude.ai
|
|
*/
|
|
@Configuration
|
|
@ConfigurationProperties(prefix = "ted")
|
|
@Data
|
|
@Validated
|
|
public class TedProcessorProperties {
|
|
|
|
private InputProperties input = new InputProperties();
|
|
private SchemaProperties schema = new SchemaProperties();
|
|
private VectorizationProperties vectorization = new VectorizationProperties();
|
|
private SearchProperties search = new SearchProperties();
|
|
private DownloadProperties download = new DownloadProperties();
|
|
private MailProperties mail = new MailProperties();
|
|
private SolutionBriefProperties solutionBrief = new SolutionBriefProperties();
|
|
private ProjectionProperties projection = new ProjectionProperties();
|
|
|
|
/**
|
|
* Input directory configuration for Apache Camel file consumer.
|
|
*/
|
|
@Data
|
|
public static class InputProperties {
|
|
|
|
/**
|
|
* Base directory for watching incoming TED XML files.
|
|
*/
|
|
@NotBlank
|
|
private String directory = "D:/ted.europe/2025-11.tar/2025-11/11";
|
|
|
|
/**
|
|
* File pattern to match (supports Ant-style patterns).
|
|
*/
|
|
private String pattern = "**/*.xml";
|
|
|
|
/**
|
|
* Directory to move successfully processed files.
|
|
*/
|
|
private String processedDirectory = ".processed";
|
|
|
|
/**
|
|
* Directory to move failed files.
|
|
*/
|
|
private String errorDirectory = ".error";
|
|
|
|
/**
|
|
* Polling interval in milliseconds.
|
|
*/
|
|
@Positive
|
|
private long pollInterval = 5000;
|
|
|
|
/**
|
|
* Maximum number of messages per poll.
|
|
*/
|
|
@Positive
|
|
private int maxMessagesPerPoll = 100;
|
|
}
|
|
|
|
/**
|
|
* XML Schema validation configuration.
|
|
*/
|
|
@Data
|
|
public static class SchemaProperties {
|
|
|
|
/**
|
|
* Enable/disable XSD validation.
|
|
*/
|
|
private boolean enabled = true;
|
|
|
|
/**
|
|
* Path to the eForms XSD schema file.
|
|
*/
|
|
private String path = "classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd";
|
|
}
|
|
|
|
/**
|
|
* Document vectorization configuration.
|
|
*/
|
|
@Data
|
|
public static class VectorizationProperties {
|
|
|
|
/**
|
|
* Enable/disable async vectorization.
|
|
*/
|
|
private boolean enabled = true;
|
|
|
|
/**
|
|
* Use external HTTP API instead of Python subprocess.
|
|
*/
|
|
private boolean useHttpApi = false;
|
|
|
|
/**
|
|
* Embedding service HTTP API URL.
|
|
*/
|
|
private String apiUrl = "http://localhost:8001";
|
|
|
|
/**
|
|
* Sentence transformer model name.
|
|
*/
|
|
private String modelName = "intfloat/multilingual-e5-large";
|
|
|
|
/**
|
|
* Vector dimensions (must match model output).
|
|
*/
|
|
@Positive
|
|
private int dimensions = 1024;
|
|
|
|
/**
|
|
* Batch size for vectorization processing.
|
|
*/
|
|
@Min(1)
|
|
private int batchSize = 16;
|
|
|
|
/**
|
|
* Thread pool size for async vectorization.
|
|
*/
|
|
@Min(1)
|
|
private int threadPoolSize = 4;
|
|
|
|
/**
|
|
* Maximum text length for vectorization (characters).
|
|
*/
|
|
@Positive
|
|
private int maxTextLength = 8192;
|
|
|
|
/**
|
|
* HTTP connection timeout in milliseconds.
|
|
*/
|
|
@Positive
|
|
private int connectTimeout = 10000;
|
|
|
|
/**
|
|
* HTTP socket/read timeout in milliseconds.
|
|
*/
|
|
@Positive
|
|
private int socketTimeout = 60000;
|
|
|
|
/**
|
|
* Maximum retries on connection failure.
|
|
*/
|
|
@Min(0)
|
|
private int maxRetries = 5;
|
|
|
|
/**
|
|
* Enable the Phase 2 generic vectorization pipeline based on DOC text representations
|
|
* and DOC embeddings instead of the legacy TED document vector columns as the primary
|
|
* write target.
|
|
*/
|
|
private boolean genericPipelineEnabled = true;
|
|
|
|
/**
|
|
* Keep writing completed TED embeddings back to the legacy ted.procurement_document
|
|
* vector columns so the existing semantic search stays operational during migration.
|
|
*/
|
|
private boolean dualWriteLegacyTedVectors = true;
|
|
|
|
/**
|
|
* Scheduler interval for generic embedding polling (milliseconds).
|
|
*/
|
|
@Positive
|
|
private long genericSchedulerPeriodMs = 6000;
|
|
|
|
/**
|
|
* Builder key for the primary TED semantic representation created during Phase 2 dual-write.
|
|
*/
|
|
@NotBlank
|
|
private String primaryRepresentationBuilderKey = "ted-phase2-primary-representation";
|
|
|
|
/**
|
|
* Provider key used when registering the configured embedding model in DOC.doc_embedding_model.
|
|
*/
|
|
@NotBlank
|
|
private String embeddingProvider = "http-embedding-service";
|
|
}
|
|
|
|
/**
|
|
* Search configuration.
|
|
*/
|
|
@Data
|
|
public static class SearchProperties {
|
|
|
|
/**
|
|
* Default page size for search results.
|
|
*/
|
|
@Positive
|
|
private int defaultPageSize = 20;
|
|
|
|
/**
|
|
* Maximum allowed page size.
|
|
*/
|
|
@Positive
|
|
private int maxPageSize = 100;
|
|
|
|
/**
|
|
* Similarity threshold for vector search (0.0 - 1.0).
|
|
*/
|
|
private double similarityThreshold = 0.7;
|
|
}
|
|
|
|
/**
|
|
* TED Daily Package Download configuration.
|
|
*/
|
|
@Data
|
|
public static class DownloadProperties {
|
|
|
|
/**
|
|
* Enable/disable automatic package download.
|
|
*/
|
|
private boolean enabled = false;
|
|
|
|
/**
|
|
* Base URL für TED Daily Packages.
|
|
*/
|
|
private String baseUrl = "https://ted.europa.eu/packages/daily/";
|
|
|
|
/**
|
|
* Download-Verzeichnis für tar.gz Files.
|
|
*/
|
|
private String downloadDirectory = "D:/ted.europe/downloads";
|
|
|
|
/**
|
|
* Extrahierungs-Verzeichnis für XML-Dateien.
|
|
*/
|
|
private String extractDirectory = "D:/ted.europe/extracted";
|
|
|
|
/**
|
|
* Start-Jahr für den Download.
|
|
*/
|
|
@Positive
|
|
private int startYear = 2015;
|
|
|
|
/**
|
|
* Anzahl aufeinanderfolgender 404-Fehler bevor Download stoppt.
|
|
* HINWEIS: Wird nicht mehr verwendet. System stoppt jetzt sofort bei erstem 404.
|
|
* @deprecated Nicht mehr verwendet seit Update auf sofortige 404-Behandlung
|
|
*/
|
|
@Positive
|
|
@Deprecated
|
|
private int maxConsecutive404 = 1;
|
|
|
|
/**
|
|
* Polling-Interval für neue Packages (Millisekunden).
|
|
*/
|
|
@Positive
|
|
private long pollInterval = 3600000; // 1 Stunde
|
|
|
|
/**
|
|
* Retry-Intervall für tail-NOT_FOUND Packages.
|
|
* Current year packages remain retryable indefinitely.
|
|
*/
|
|
@Positive
|
|
private long notFoundRetryInterval = 21600000; // 6 Stunden
|
|
|
|
/**
|
|
* Grace period for previous years after year end before a tail-NOT_FOUND is treated as final.
|
|
*/
|
|
@Min(0)
|
|
private int previousYearGracePeriodDays = 30;
|
|
|
|
/**
|
|
* Keep retrying current-year tail NOT_FOUND packages indefinitely.
|
|
*/
|
|
private boolean retryCurrentYearNotFoundIndefinitely = true;
|
|
|
|
/**
|
|
* Download-Timeout (Millisekunden).
|
|
*/
|
|
@Positive
|
|
private long downloadTimeout = 300000; // 5 Minuten
|
|
|
|
/**
|
|
* Maximale gleichzeitige Downloads.
|
|
*/
|
|
@Positive
|
|
private int maxConcurrentDownloads = 2;
|
|
|
|
/**
|
|
* Verzögerung zwischen Downloads (Millisekunden) für Rate Limiting.
|
|
*/
|
|
@Positive
|
|
private long delayBetweenDownloads = 5000; // 5 Sekunden
|
|
|
|
/**
|
|
* Automatisches Löschen von tar.gz nach Extraktion.
|
|
*/
|
|
private boolean deleteAfterExtraction = true;
|
|
|
|
/**
|
|
* Priorisierung: Aktuelles Jahr zuerst, dann rückwärts.
|
|
* HINWEIS: Wird nicht mehr verwendet. System priorisiert immer das aktuelle Jahr.
|
|
* @deprecated Nicht mehr verwendet - immer aktiv
|
|
*/
|
|
@Deprecated
|
|
private boolean prioritizeCurrentYear = true;
|
|
}
|
|
|
|
/**
|
|
* IMAP Mail configuration for email processing.
|
|
*/
|
|
@Data
|
|
public static class MailProperties {
|
|
|
|
/**
|
|
* Enable/disable mail processing.
|
|
*/
|
|
private boolean enabled = false;
|
|
|
|
/**
|
|
* IMAP server hostname.
|
|
*/
|
|
@NotBlank
|
|
private String host = "mail.mymagenta.business";
|
|
|
|
/**
|
|
* IMAP server port.
|
|
*/
|
|
@Positive
|
|
private int port = 993;
|
|
|
|
/**
|
|
* Mail account username (email address).
|
|
*/
|
|
@NotBlank
|
|
private String username = "archiv@procon.co.at";
|
|
|
|
/**
|
|
* Mail account password.
|
|
*/
|
|
@NotBlank
|
|
private String password = "";
|
|
|
|
/**
|
|
* Use SSL/TLS connection.
|
|
*/
|
|
private boolean ssl = true;
|
|
|
|
/**
|
|
* Mail folder to read from.
|
|
*/
|
|
private String folderName = "INBOX";
|
|
|
|
/**
|
|
* Delete messages after processing.
|
|
*/
|
|
private boolean delete = false;
|
|
|
|
/**
|
|
* Mark messages as seen after processing.
|
|
*/
|
|
private boolean seen = true;
|
|
|
|
/**
|
|
* Only process unseen messages.
|
|
*/
|
|
private boolean unseen = true;
|
|
|
|
/**
|
|
* Polling delay in milliseconds.
|
|
*/
|
|
@Positive
|
|
private long delay = 60000;
|
|
|
|
/**
|
|
* Max messages per poll.
|
|
*/
|
|
@Positive
|
|
private int maxMessagesPerPoll = 10;
|
|
|
|
/**
|
|
* Output directory for processed attachments.
|
|
*/
|
|
private String attachmentOutputDirectory = "D:/ted.europe/mail-attachments";
|
|
|
|
/**
|
|
* Enable/disable MIME file input processing.
|
|
*/
|
|
private boolean mimeInputEnabled = false;
|
|
|
|
/**
|
|
* Input directory for MIME files (.eml, .msg).
|
|
*/
|
|
private String mimeInputDirectory = "D:/ted.europe/mime-input";
|
|
|
|
/**
|
|
* File pattern for MIME files.
|
|
*/
|
|
private String mimeInputPattern = "*.eml";
|
|
|
|
/**
|
|
* Polling interval for MIME input directory (milliseconds).
|
|
*/
|
|
@Positive
|
|
private long mimeInputPollInterval = 10000;
|
|
}
|
|
|
|
|
|
/**
|
|
* Phase 3 TED projection configuration.
|
|
*/
|
|
@Data
|
|
public static class ProjectionProperties {
|
|
|
|
/**
|
|
* Enable/disable Phase 3 TED structured projection dual-write.
|
|
*/
|
|
private boolean enabled = true;
|
|
|
|
/**
|
|
* Optional startup backfill of missing projections from legacy TED documents.
|
|
*/
|
|
private boolean startupBackfillEnabled = false;
|
|
|
|
/**
|
|
* Maximum number of legacy TED documents to backfill during startup.
|
|
*/
|
|
@Positive
|
|
private int startupBackfillLimit = 250;
|
|
}
|
|
|
|
/**
|
|
* Solution Brief processing configuration.
|
|
* Scans PDF files and generates Excel reports with similar TED documents.
|
|
*/
|
|
@Data
|
|
public static class SolutionBriefProperties {
|
|
|
|
/**
|
|
* Enable/disable Solution Brief processing.
|
|
*/
|
|
private boolean enabled = false;
|
|
|
|
/**
|
|
* Input directory for Solution Brief PDF files.
|
|
*/
|
|
private String inputDirectory = "C:/work/SolutionBrief";
|
|
|
|
/**
|
|
* Output directory for Excel result files (relative to input or absolute).
|
|
*/
|
|
private String resultDirectory = "./result";
|
|
|
|
/**
|
|
* Number of top similar documents to include in results.
|
|
*/
|
|
@Positive
|
|
private int topK = 20;
|
|
|
|
/**
|
|
* Minimum similarity threshold (0.0-1.0).
|
|
*/
|
|
private double similarityThreshold = 0.5;
|
|
|
|
/**
|
|
* Polling interval in milliseconds.
|
|
*/
|
|
@Positive
|
|
private long pollInterval = 30000;
|
|
|
|
/**
|
|
* File pattern for PDF files.
|
|
*/
|
|
private String filePattern = ".*\\.pdf";
|
|
|
|
/**
|
|
* Process files only once (idempotent based on filename+size+date).
|
|
*/
|
|
private boolean idempotent = true;
|
|
|
|
/**
|
|
* Idempotent repository file path.
|
|
*/
|
|
private String idempotentRepository = "./solution-brief-processed.dat";
|
|
}
|
|
}
|