You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
DIP/src/main/java/at/procon/ted/config/TedProcessorProperties.java

488 lines
13 KiB
Java

package at.procon.ted.config;
import jakarta.validation.Valid;
import jakarta.validation.constraints.AssertTrue;
import jakarta.validation.constraints.Min;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.Positive;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import org.springframework.validation.annotation.Validated;
/**
 * Configuration properties for TED Procurement Processor.
 * <p>
 * Bound from the {@code ted} property prefix. Each nested property group is annotated
 * with {@code @Valid} so that the constraints declared inside the group are enforced by
 * cascade whenever this bean is validated, independent of whether any property of that
 * group was explicitly configured.
 *
 * @author Martin.Schweitzer@procon.co.at and claude.ai
 */
@Configuration
@ConfigurationProperties(prefix = "ted")
@Data
@Validated
public class TedProcessorProperties {

    /** Input directory settings, see {@link InputProperties}. */
    @Valid
    private InputProperties input = new InputProperties();

    /** XML schema validation settings, see {@link SchemaProperties}. */
    @Valid
    private SchemaProperties schema = new SchemaProperties();

    /** Document vectorization settings, see {@link VectorizationProperties}. */
    @Valid
    private VectorizationProperties vectorization = new VectorizationProperties();

    /** Search settings, see {@link SearchProperties}. */
    @Valid
    private SearchProperties search = new SearchProperties();

    /** TED daily package download settings, see {@link DownloadProperties}. */
    @Valid
    private DownloadProperties download = new DownloadProperties();

    /** IMAP mail settings, see {@link MailProperties}. */
    @Valid
    private MailProperties mail = new MailProperties();

    /** Solution Brief processing settings, see {@link SolutionBriefProperties}. */
    @Valid
    private SolutionBriefProperties solutionBrief = new SolutionBriefProperties();

    /** Phase 3 TED projection settings, see {@link ProjectionProperties}. */
    @Valid
    private ProjectionProperties projection = new ProjectionProperties();

    /**
     * Input directory configuration for Apache Camel file consumer.
     */
    @Data
    public static class InputProperties {

        /**
         * Base directory for watching incoming TED XML files.
         */
        @NotBlank
        private String directory = "D:/ted.europe/2025-11.tar/2025-11/11";

        /**
         * File pattern to match (supports Ant-style patterns).
         */
        private String pattern = "**/*.xml";

        /**
         * Directory to move successfully processed files.
         */
        private String processedDirectory = ".processed";

        /**
         * Directory to move failed files.
         */
        private String errorDirectory = ".error";

        /**
         * Polling interval in milliseconds.
         */
        @Positive
        private long pollInterval = 5000;

        /**
         * Maximum number of messages per poll.
         */
        @Positive
        private int maxMessagesPerPoll = 100;
    }

    /**
     * XML Schema validation configuration.
     */
    @Data
    public static class SchemaProperties {

        /**
         * Enable/disable XSD validation.
         */
        private boolean enabled = true;

        /**
         * Path to the eForms XSD schema file.
         */
        private String path = "classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd";
    }

    /**
     * Document vectorization configuration.
     */
    @Data
    public static class VectorizationProperties {

        /**
         * Enable/disable async vectorization.
         */
        private boolean enabled = true;

        /**
         * Use external HTTP API instead of Python subprocess.
         */
        private boolean useHttpApi = false;

        /**
         * Embedding service HTTP API URL.
         */
        private String apiUrl = "http://localhost:8001";

        /**
         * Sentence transformer model name.
         */
        private String modelName = "intfloat/multilingual-e5-large";

        /**
         * Vector dimensions (must match model output).
         */
        @Positive
        private int dimensions = 1024;

        /**
         * Batch size for vectorization processing.
         */
        @Min(1)
        private int batchSize = 16;

        /**
         * Thread pool size for async vectorization.
         */
        @Min(1)
        private int threadPoolSize = 4;

        /**
         * Maximum text length for vectorization (characters).
         */
        @Positive
        private int maxTextLength = 8192;

        /**
         * HTTP connection timeout in milliseconds.
         */
        @Positive
        private int connectTimeout = 10000;

        /**
         * HTTP socket/read timeout in milliseconds.
         */
        @Positive
        private int socketTimeout = 60000;

        /**
         * Maximum retries on connection failure.
         */
        @Min(0)
        private int maxRetries = 5;

        /**
         * Enable the Phase 2 generic vectorization pipeline based on DOC text representations
         * and DOC embeddings instead of the legacy TED document vector columns as the primary
         * write target.
         */
        private boolean genericPipelineEnabled = true;

        /**
         * Keep writing completed TED embeddings back to the legacy ted.procurement_document
         * vector columns so the existing semantic search stays operational during migration.
         */
        private boolean dualWriteLegacyTedVectors = true;

        /**
         * Scheduler interval for generic embedding polling (milliseconds).
         */
        @Positive
        private long genericSchedulerPeriodMs = 6000;

        /**
         * Builder key for the primary TED semantic representation created during Phase 2 dual-write.
         */
        @NotBlank
        private String primaryRepresentationBuilderKey = "ted-phase2-primary-representation";

        /**
         * Provider key used when registering the configured embedding model in DOC.doc_embedding_model.
         */
        @NotBlank
        private String embeddingProvider = "http-embedding-service";
    }

    /**
     * Search configuration.
     */
    @Data
    public static class SearchProperties {

        /**
         * Default page size for search results.
         */
        @Positive
        private int defaultPageSize = 20;

        /**
         * Maximum allowed page size.
         */
        @Positive
        private int maxPageSize = 100;

        /**
         * Similarity threshold for vector search (0.0 - 1.0).
         */
        private double similarityThreshold = 0.7;
    }

    /**
     * TED Daily Package Download configuration.
     */
    @Data
    public static class DownloadProperties {

        /**
         * Enable/disable automatic package download.
         */
        private boolean enabled = false;

        /**
         * Base URL for TED daily packages.
         */
        private String baseUrl = "https://ted.europa.eu/packages/daily/";

        /**
         * Download directory for tar.gz files.
         */
        private String downloadDirectory = "D:/ted.europe/downloads";

        /**
         * Extraction directory for XML files.
         */
        private String extractDirectory = "D:/ted.europe/extracted";

        /**
         * Start year for the download.
         */
        @Positive
        private int startYear = 2015;

        /**
         * Number of consecutive 404 errors before the download stops.
         * NOTE: No longer used. The system now stops immediately on the first 404.
         *
         * @deprecated No longer used since the switch to immediate 404 handling.
         */
        @Positive
        @Deprecated
        private int maxConsecutive404 = 1;

        /**
         * Polling interval for new packages (milliseconds).
         */
        @Positive
        private long pollInterval = 3600000; // 1 hour

        /**
         * Retry interval for tail-NOT_FOUND packages.
         * Current year packages remain retryable indefinitely.
         */
        @Positive
        private long notFoundRetryInterval = 21600000; // 6 hours

        /**
         * Grace period for previous years after year end before a tail-NOT_FOUND is treated as final.
         */
        @Min(0)
        private int previousYearGracePeriodDays = 30;

        /**
         * Keep retrying current-year tail NOT_FOUND packages indefinitely.
         */
        private boolean retryCurrentYearNotFoundIndefinitely = true;

        /**
         * Download timeout (milliseconds).
         */
        @Positive
        private long downloadTimeout = 300000; // 5 minutes

        /**
         * Maximum number of concurrent downloads.
         */
        @Positive
        private int maxConcurrentDownloads = 2;

        /**
         * Delay between downloads (milliseconds) for rate limiting.
         */
        @Positive
        private long delayBetweenDownloads = 5000; // 5 seconds

        /**
         * Automatically delete the tar.gz file after extraction.
         */
        private boolean deleteAfterExtraction = true;

        /**
         * Prioritization: current year first, then backwards.
         * NOTE: No longer used. The system always prioritizes the current year.
         *
         * @deprecated No longer used - always active.
         */
        @Deprecated
        private boolean prioritizeCurrentYear = true;
    }

    /**
     * IMAP Mail configuration for email processing.
     */
    @Data
    public static class MailProperties {

        /**
         * Enable/disable mail processing.
         */
        private boolean enabled = false;

        /**
         * IMAP server hostname.
         */
        @NotBlank
        private String host = "mail.mymagenta.business";

        /**
         * IMAP server port.
         */
        @Positive
        private int port = 993;

        /**
         * Mail account username (email address).
         */
        @NotBlank
        private String username = "archiv@procon.co.at";

        /**
         * Mail account password.
         * <p>
         * The default is empty, so the field itself carries no {@code @NotBlank}; the
         * requirement is enforced conditionally by {@link #isPasswordPresentWhenEnabled()}.
         * NOTE(review): Lombok's {@code @Data} includes every field in the generated
         * {@code toString()}, this one too - confirm the bean is never logged.
         */
        private String password = "";

        /**
         * Use SSL/TLS connection.
         */
        private boolean ssl = true;

        /**
         * Mail folder to read from.
         */
        private String folderName = "INBOX";

        /**
         * Delete messages after processing.
         */
        private boolean delete = false;

        /**
         * Mark messages as seen after processing.
         */
        private boolean seen = true;

        /**
         * Only process unseen messages.
         */
        private boolean unseen = true;

        /**
         * Polling delay in milliseconds.
         */
        @Positive
        private long delay = 60000;

        /**
         * Max messages per poll.
         */
        @Positive
        private int maxMessagesPerPoll = 10;

        /**
         * Output directory for processed attachments.
         */
        private String attachmentOutputDirectory = "D:/ted.europe/mail-attachments";

        /**
         * Enable/disable MIME file input processing.
         */
        private boolean mimeInputEnabled = false;

        /**
         * Input directory for MIME files (.eml, .msg).
         */
        private String mimeInputDirectory = "D:/ted.europe/mime-input";

        /**
         * File pattern for MIME files.
         */
        private String mimeInputPattern = "*.eml";

        /**
         * Polling interval for MIME input directory (milliseconds).
         */
        @Positive
        private long mimeInputPollInterval = 10000;

        /**
         * Cross-field validation: a password is mandatory only while mail processing is
         * enabled. With the shipped defaults (disabled, empty password) the check passes,
         * so a deployment that does not use mail is not forced to configure a password.
         *
         * @return {@code true} if mail processing is disabled or the password is non-blank
         */
        @AssertTrue(message = "password must not be blank when mail processing is enabled")
        public boolean isPasswordPresentWhenEnabled() {
            return !enabled || (password != null && !password.isBlank());
        }
    }

    /**
     * Phase 3 TED projection configuration.
     */
    @Data
    public static class ProjectionProperties {

        /**
         * Enable/disable Phase 3 TED structured projection dual-write.
         */
        private boolean enabled = true;

        /**
         * Optional startup backfill of missing projections from legacy TED documents.
         */
        private boolean startupBackfillEnabled = false;

        /**
         * Maximum number of legacy TED documents to backfill during startup.
         */
        @Positive
        private int startupBackfillLimit = 250;
    }

    /**
     * Solution Brief processing configuration.
     * Scans PDF files and generates Excel reports with similar TED documents.
     */
    @Data
    public static class SolutionBriefProperties {

        /**
         * Enable/disable Solution Brief processing.
         */
        private boolean enabled = false;

        /**
         * Input directory for Solution Brief PDF files.
         */
        private String inputDirectory = "C:/work/SolutionBrief";

        /**
         * Output directory for Excel result files (relative to input or absolute).
         */
        private String resultDirectory = "./result";

        /**
         * Number of top similar documents to include in results.
         */
        @Positive
        private int topK = 20;

        /**
         * Minimum similarity threshold (0.0-1.0).
         */
        private double similarityThreshold = 0.5;

        /**
         * Polling interval in milliseconds.
         */
        @Positive
        private long pollInterval = 30000;

        /**
         * File pattern for PDF files.
         */
        private String filePattern = ".*\\.pdf";

        /**
         * Process files only once (idempotent based on filename+size+date).
         */
        private boolean idempotent = true;

        /**
         * Idempotent repository file path.
         */
        private String idempotentRepository = "./solution-brief-processed.dat";
    }
}