You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
DIP/src/main/java/at/procon/ted/config/TedProcessorProperties.java

745 lines
20 KiB
Java

package at.procon.ted.config;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import jakarta.validation.constraints.Min;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.Positive;
import lombok.Data;
import lombok.ToString;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Primary;
import org.springframework.validation.annotation.Validated;
/**
* Configuration properties for TED Procurement Processor.
*
* @author Martin.Schweitzer@procon.co.at and claude.ai
*/
@Configuration
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@ConfigurationProperties(prefix = "ted")
@Data
@Validated
@Primary
public class TedProcessorProperties {
// Property groups of this holder. Every group is instantiated eagerly, so each one starts
// out with the defaults declared in its nested class further down.
// NOTE(review): none of these fields is annotated with @Valid. Depending on the Spring Boot
// version in use, constraints declared inside the nested classes may only cascade when the
// field carries @Valid - confirm before relying on the nested @NotBlank/@Positive/@Min checks.
// Input directory settings for incoming TED XML files.
private InputProperties input = new InputProperties();
// XSD validation settings.
private SchemaProperties schema = new SchemaProperties();
// Embedding / vectorization settings.
private VectorizationProperties vectorization = new VectorizationProperties();
// Search paging, candidate limits, fusion weights and chunking settings.
private SearchProperties search = new SearchProperties();
// TED daily package download settings.
private DownloadProperties download = new DownloadProperties();
// IMAP mail and MIME file input settings.
private MailProperties mail = new MailProperties();
// Solution Brief (PDF to Excel report) settings.
private SolutionBriefProperties solutionBrief = new SolutionBriefProperties();
// Phase 3 structured projection settings.
private ProjectionProperties projection = new ProjectionProperties();
// Phase 4 generic ingestion settings.
private GenericIngestionProperties genericIngestion = new GenericIngestionProperties();
// Legacy TED package repair / re-import settings.
private RepairProperties repair = new RepairProperties();
/**
* Input directory configuration for Apache Camel file consumer.
* Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
*/
@Data
public static class InputProperties {
/**
* Base directory for watching incoming TED XML files.
* Constrained with {@code @NotBlank}.
* NOTE(review): the default is an absolute Windows drive path - presumably meant to be
* overridden per environment; confirm.
*/
@NotBlank
private String directory = "D:/ted.europe/2025-11.tar/2025-11/11";
/**
* File pattern to match (supports Ant-style patterns).
* The default matches XML files in any sub-directory.
*/
private String pattern = "**/*.xml";
/**
* Directory to move successfully processed files.
* NOTE(review): the default is a relative name - presumably resolved against the input
* directory by the consuming route; confirm.
*/
private String processedDirectory = ".processed";
/**
* Directory to move failed files.
* NOTE(review): relative default, same assumption as for the processed directory.
*/
private String errorDirectory = ".error";
/**
* Polling interval in milliseconds.
* Constrained with {@code @Positive} (must be greater than zero); defaults to 5 seconds.
*/
@Positive
private long pollInterval = 5000;
/**
* Maximum number of messages per poll.
* Constrained with {@code @Positive} (must be greater than zero).
*/
@Positive
private int maxMessagesPerPoll = 100;
}
/**
* XML Schema validation configuration.
* Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
*/
@Data
public static class SchemaProperties {
/**
* Enable/disable XSD validation. Enabled by default.
*/
private boolean enabled = true;
/**
* Path to the eForms XSD schema file.
* The default uses a {@code classpath:} prefix and names the UBL ContractNotice 2.3 schema.
*/
private String path = "classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd";
}
/**
* Document vectorization configuration.
* Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
*/
@Data
public static class VectorizationProperties {
/**
* Enable/disable async vectorization. Enabled by default.
*/
private boolean enabled = true;
/**
* Use external HTTP API instead of Python subprocess. Disabled by default.
*/
private boolean useHttpApi = false;
/**
* Embedding service HTTP API URL.
*/
private String apiUrl = "http://localhost:8001";
/**
* Sentence transformer model name.
*/
private String modelName = "intfloat/multilingual-e5-large";
/**
* Vector dimensions (must match model output).
* Constrained with {@code @Positive}; the match with the model is not checked here.
*/
@Positive
private int dimensions = 1024;
/**
* Batch size for vectorization processing.
* Constrained with {@code @Min(1)}.
*/
@Min(1)
private int batchSize = 16;
/**
* Thread pool size for async vectorization.
* Constrained with {@code @Min(1)}.
*/
@Min(1)
private int threadPoolSize = 4;
/**
* Maximum text length for vectorization (characters).
* Constrained with {@code @Positive}.
*/
@Positive
private int maxTextLength = 8192;
/**
* HTTP connection timeout in milliseconds.
* Constrained with {@code @Positive}; defaults to 10 seconds.
*/
@Positive
private int connectTimeout = 10000;
/**
* HTTP socket/read timeout in milliseconds.
* Constrained with {@code @Positive}; defaults to 60 seconds.
*/
@Positive
private int socketTimeout = 60000;
/**
* Maximum retries on connection failure.
* Constrained with {@code @Min(0)}, so zero is an accepted value.
*/
@Min(0)
private int maxRetries = 5;
/**
* NOTE(review): undocumented in the original. Judging by the name this is the period, in
* milliseconds, of a scheduler for generic vectorization work - confirm against its consumer.
* Constrained with {@code @Positive}; defaults to 30000.
*/
@Positive
private long genericSchedulerPeriodMs = 30000;
/**
* NOTE(review): undocumented in the original. Presumably the key that selects the builder
* for the primary representation - confirm against its consumer.
*/
private String primaryRepresentationBuilderKey = "default-generic";
}
/**
* Search configuration.
* Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
*/
@Data
public static class SearchProperties {
/**
* Default page size for search results.
* Constrained with {@code @Positive}.
*/
@Positive
private int defaultPageSize = 20;
/**
* Maximum allowed page size.
* Constrained with {@code @Positive}; no constraint here relates it to the default page size.
*/
@Positive
private int maxPageSize = 100;
/**
* Similarity threshold for vector search (0.0 - 1.0).
* NOTE(review): no range constraint is declared on this field, so the documented
* 0.0 - 1.0 range is not checked by validation.
*/
private double similarityThreshold = 0.7;
/**
* Minimum trigram similarity for fuzzy lexical matches.
* No range constraint is declared on this field.
*/
private double trigramSimilarityThreshold = 0.12;
/**
* Candidate limit for the full-text engine before fusion/collapse.
* Constrained with {@code @Positive}.
*/
@Positive
private int fulltextCandidateLimit = 120;
/**
* Candidate limit for the trigram engine before fusion/collapse.
* Constrained with {@code @Positive}.
*/
@Positive
private int trigramCandidateLimit = 120;
/**
* Candidate limit for the semantic engine before fusion/collapse.
* Constrained with {@code @Positive}.
*/
@Positive
private int semanticCandidateLimit = 120;
/**
* Hybrid fusion weight of the full-text engine.
* The three default fusion weights (0.35, 0.20, 0.45) sum to 1.0; nothing in this class
* checks that configured values do.
*/
private double fulltextWeight = 0.35;
/**
* Hybrid fusion weight of the trigram engine.
*/
private double trigramWeight = 0.20;
/**
* Hybrid fusion weight of the semantic engine.
*/
private double semanticWeight = 0.45;
/**
* Enable chunk representations for long documents. Enabled by default.
*/
private boolean chunkingEnabled = true;
/**
* Target chunk size in characters for CHUNK representations.
* Constrained with {@code @Positive}.
*/
@Positive
private int chunkTargetChars = 1800;
/**
* Overlap between consecutive chunks in characters.
* Constrained with {@code @Min(0)}; no constraint here relates it to the target chunk size.
*/
@Min(0)
private int chunkOverlapChars = 200;
/**
* Maximum CHUNK representations generated per document.
* Constrained with {@code @Positive}.
*/
@Positive
private int maxChunksPerDocument = 12;
/**
* Additional score weight for recency.
*/
private double recencyBoostWeight = 0.05;
/**
* Half-life in days used for recency decay.
* Constrained with {@code @Positive}.
*/
@Positive
private int recencyHalfLifeDays = 30;
/**
* Startup backfill limit for missing DOC lexical vectors.
* Constrained with {@code @Positive}.
*/
@Positive
private int startupLexicalBackfillLimit = 500;
/**
* Number of hits per engine returned by the debug endpoint.
* Constrained with {@code @Positive}.
*/
@Positive
private int debugTopHitsPerEngine = 10;
}
/**
* TED Daily Package Download configuration.
* Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
*/
@Data
public static class DownloadProperties {
/**
* Enable/disable automatic package download. Disabled by default.
*/
private boolean enabled = false;
/**
* Base URL for TED daily packages.
*/
private String baseUrl = "https://ted.europa.eu/packages/daily/";
/**
* Download directory for tar.gz files.
* NOTE(review): the default is an absolute Windows drive path - presumably overridden per
* environment; confirm.
*/
private String downloadDirectory = "D:/ted.europe/downloads";
/**
* Extraction directory for XML files.
* NOTE(review): absolute Windows drive path as default, same assumption as above.
*/
private String extractDirectory = "D:/ted.europe/extracted";
/**
* Start year for the download.
* Constrained with {@code @Positive}.
*/
@Positive
private int startYear = 2015;
/**
* Number of consecutive 404 errors before the download stops.
* NOTE: No longer used. The system now stops immediately at the first 404.
* @deprecated No longer used since the update to immediate 404 handling
*/
@Positive
@Deprecated
private int maxConsecutive404 = 1;
/**
* Polling interval for new packages (milliseconds).
* Constrained with {@code @Positive}.
*/
@Positive
private long pollInterval = 3600000; // 1 hour
/**
* Retry interval for tail-NOT_FOUND packages.
* Current year packages remain retryable indefinitely.
* The unit is not stated in the original; the default equals 6 hours when read as
* milliseconds. Constrained with {@code @Positive}.
*/
@Positive
private long notFoundRetryInterval = 21600000; // 6 hours
/**
* Grace period for previous years after year end before a tail-NOT_FOUND is treated as final.
* Expressed in days; constrained with {@code @Min(0)}, so zero is an accepted value.
*/
@Min(0)
private int previousYearGracePeriodDays = 30;
/**
* Keep retrying current-year tail NOT_FOUND packages indefinitely. Enabled by default.
*/
private boolean retryCurrentYearNotFoundIndefinitely = true;
/**
* Download timeout (milliseconds).
* Constrained with {@code @Positive}.
*/
@Positive
private long downloadTimeout = 300000; // 5 minutes
/**
* Maximum number of concurrent downloads.
* Constrained with {@code @Positive}.
*/
@Positive
private int maxConcurrentDownloads = 2;
/**
* Delay between downloads (milliseconds) for rate limiting.
* Constrained with {@code @Positive}, so a delay of zero is not an accepted value.
*/
@Positive
private long delayBetweenDownloads = 5000; // 5 seconds
/**
* Automatically delete the tar.gz archive after extraction. Enabled by default.
*/
private boolean deleteAfterExtraction = true;
/**
* Prioritization: current year first, then backwards.
* NOTE: No longer used. The system always prioritizes the current year.
* @deprecated No longer used - always active
*/
@Deprecated
private boolean prioritizeCurrentYear = true;
}
/**
* Legacy TED package repair / re-import configuration.
* Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
*/
@Data
public static class RepairProperties {
/**
* Enable startup repair of incomplete or missing TED packages. Disabled by default.
*/
private boolean enabled = false;
/**
* If true, only logs the selected package candidates without modifying data.
*/
private boolean dryRun = false;
/**
* Maximum number of packages to process in one startup run.
* Constrained with {@code @Positive}.
*/
@Positive
private int maxPackages = 100;
/**
* Optional explicit package identifiers (YYYYSSSSS) to repair.
* Initialised to an empty, mutable list.
*/
private java.util.List<String> packageIdentifiers = new java.util.ArrayList<>();
/**
* Optional lower bound package identifier (inclusive).
* Has no default, i.e. it is {@code null} unless configured.
*/
private String fromPackageIdentifier;
/**
* Optional upper bound package identifier (inclusive).
* Has no default, i.e. it is {@code null} unless configured.
*/
private String toPackageIdentifier;
/**
* Include missing package sequence numbers inside the selected range. Enabled by default.
*/
private boolean includeMissingSequenceGaps = true;
/**
* Re-download the package archive when it is missing locally. Enabled by default.
*/
private boolean redownloadMissingArchives = true;
/**
* Always re-download the package archive even when a local archive already exists.
*/
private boolean forceRedownload = false;
/**
* Whether startup repair may run while the automatic legacy package download scheduler is
* enabled. With the default of {@code false} the repair is refused in that situation.
* NOTE(review): presumably {@code true} lifts that refusal - confirm against the repair service.
*/
private boolean allowWhileDownloadEnabled = false;
}
/**
* IMAP Mail configuration for email processing.
*/
@Data
public static class MailProperties {
/**
* Enable/disable mail processing.
*/
private boolean enabled = false;
/**
* IMAP server hostname.
*/
@NotBlank
private String host = "mail.mymagenta.business";
/**
* IMAP server port.
*/
@Positive
private int port = 993;
/**
* Mail account username (email address).
*/
@NotBlank
private String username = "archiv@procon.co.at";
/**
* Mail account password.
*/
@NotBlank
private String password = "";
/**
* Use SSL/TLS connection.
*/
private boolean ssl = true;
/**
* Mail folder to read from.
*/
private String folderName = "INBOX";
/**
* Delete messages after processing.
*/
private boolean delete = false;
/**
* Mark messages as seen after processing.
*/
private boolean seen = true;
/**
* Only process unseen messages.
*/
private boolean unseen = true;
/**
* Polling delay in milliseconds.
*/
@Positive
private long delay = 60000;
/**
* Max messages per poll.
*/
@Positive
private int maxMessagesPerPoll = 10;
/**
* Output directory for processed attachments.
*/
private String attachmentOutputDirectory = "D:/ted.europe/mail-attachments";
/**
* Enable/disable MIME file input processing.
*/
private boolean mimeInputEnabled = false;
/**
* Input directory for MIME files (.eml, .msg).
*/
private String mimeInputDirectory = "D:/ted.europe/mime-input";
/**
* File pattern for MIME files.
*/
private String mimeInputPattern = "*.eml";
/**
* Polling interval for MIME input directory (milliseconds).
*/
@Positive
private long mimeInputPollInterval = 10000;
}
/**
* Phase 3 TED projection configuration.
* Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
*/
@Data
public static class ProjectionProperties {
/**
* Enable/disable Phase 3 TED structured projection dual-write. Enabled by default.
*/
private boolean enabled = true;
/**
* Optional startup backfill of missing projections from legacy TED documents.
* Disabled by default.
*/
private boolean startupBackfillEnabled = false;
/**
* Maximum number of legacy TED documents to backfill during startup.
* Constrained with {@code @Positive}.
*/
@Positive
private int startupBackfillLimit = 250;
}
/**
* Solution Brief processing configuration.
* Scans PDF files and generates Excel reports with similar TED documents.
* Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
*/
@Data
public static class SolutionBriefProperties {
/**
* Enable/disable Solution Brief processing. Disabled by default.
*/
private boolean enabled = false;
/**
* Input directory for Solution Brief PDF files.
* NOTE(review): the default is an absolute Windows drive path - presumably overridden per
* environment; confirm.
*/
private String inputDirectory = "C:/work/SolutionBrief";
/**
* Output directory for Excel result files (relative to input or absolute).
*/
private String resultDirectory = "./result";
/**
* Number of top similar documents to include in results.
* Constrained with {@code @Positive}.
*/
@Positive
private int topK = 20;
/**
* Minimum similarity threshold (0.0-1.0).
* NOTE(review): no range constraint is declared on this field, so the documented
* 0.0-1.0 range is not checked by validation.
*/
private double similarityThreshold = 0.5;
/**
* Polling interval in milliseconds.
* Constrained with {@code @Positive}; defaults to 30 seconds.
*/
@Positive
private long pollInterval = 30000;
/**
* File pattern for PDF files.
* The default is written in regular-expression form.
*/
private String filePattern = ".*\\.pdf";
/**
* Process files only once (idempotent based on filename+size+date). Enabled by default.
*/
private boolean idempotent = true;
/**
* Idempotent repository file path.
*/
private String idempotentRepository = "./solution-brief-processed.dat";
}
/**
* Phase 4 generic ingestion configuration.
* Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
*/
@Data
public static class GenericIngestionProperties {
/**
* Master switch for the generic ingestion pipeline. Disabled by default.
*/
private boolean enabled = false;
/**
* Enable/disable filesystem import route for arbitrary documents. Disabled by default.
*/
private boolean fileSystemEnabled = false;
/**
* Enable/disable REST/API upload endpoints for arbitrary documents. Enabled by default.
*/
private boolean restUploadEnabled = true;
/**
* Input directory for the generic filesystem importer.
* NOTE(review): unlike the other absolute directory defaults in this file, this one has no
* drive prefix - confirm that this is intended.
*/
private String inputDirectory = "/ted.europe/generic-input";
/**
* Regular-expression file pattern used by the Camel file route.
* The default lists the extensions pdf, txt, html, htm, xml, md, markdown, csv, json,
* yaml and yml.
*/
private String filePattern = ".*\\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$";
/**
* Directory where successfully imported files are moved.
*/
private String processedDirectory = ".dip-processed";
/**
* Directory where failed files are moved.
*/
private String errorDirectory = ".dip-error";
/**
* Polling interval in milliseconds.
* Constrained with {@code @Positive}; defaults to 15 seconds.
*/
@Positive
private long pollInterval = 15000;
/**
* Maximum number of files per poll.
* Constrained with {@code @Positive}.
*/
@Positive
private int maxMessagesPerPoll = 10;
/**
* Default owner tenant for files imported through the generic route.
* Has no default, i.e. it is {@code null} unless configured.
*/
private String defaultOwnerTenantKey;
/**
* Default visibility for generic imports when not supplied explicitly.
* Defaults to {@code PUBLIC}.
*/
private at.procon.dip.domain.access.DocumentVisibility defaultVisibility = at.procon.dip.domain.access.DocumentVisibility.PUBLIC;
/**
* Optional default language code applied to filesystem imports.
* Has no default, i.e. it is {@code null} unless configured.
*/
private String defaultLanguageCode;
/**
* Persist original binary payloads in DB when they are small enough. Enabled by default.
*/
private boolean storeOriginalBinaryInDb = true;
/**
* Maximum binary size (bytes) stored inline in DOC.doc_content.binary_content.
* Constrained with {@code @Positive}; the default equals 5 * 1024 * 1024 bytes (5 MiB).
*/
@Positive
private int maxBinaryBytesInDb = 5242880;
/**
* Whether an already imported content hash should resolve to the existing document.
* Enabled by default.
*/
private boolean deduplicateByContentHash = true;
/**
* Persist ORIGINAL content rows for wrapper/container documents that primarily exist
* to group or reference child documents (for example TED packages or expanded ZIP wrappers).
* When disabled, wrappers are still classified, extracted and represented, but the raw
* ORIGINAL content payload is not stored in DOC.doc_content.
* Enabled by default.
*/
private boolean storeOriginalContentForWrapperDocuments = true;
/**
* Queue only the primary text representation for embedding. Enabled by default.
*/
private boolean vectorizePrimaryRepresentationOnly = true;
/**
* Import batch identifier written to DOC.doc_source.import_batch_id.
* Constrained with {@code @NotBlank}.
*/
@NotBlank
private String importBatchId = "phase4-generic";
/**
* Enable the Phase 4.1 TED package adapter built on top of the generic DOC ingestion SPI.
* Enabled by default.
*/
private boolean tedPackageAdapterEnabled = true;
/**
* Enable the Phase 4.1 mail/document adapter built on top of the generic DOC ingestion SPI.
* Disabled by default.
*/
private boolean mailAdapterEnabled = false;
/**
* Optional dedicated owner tenant key for imported mail messages and attachments.
* Falls back to defaultOwnerTenantKey when not configured.
* The fallback is not implemented in this class; the field itself has no default.
*/
private String mailDefaultOwnerTenantKey;
/**
* Default visibility for imported mail messages and attachments.
* Defaults to {@code TENANT}.
*/
private at.procon.dip.domain.access.DocumentVisibility mailDefaultVisibility = at.procon.dip.domain.access.DocumentVisibility.TENANT;
/**
* Expand ZIP attachments recursively through the mail adapter. Enabled by default.
*/
private boolean expandMailZipAttachments = true;
/**
* Import batch identifier for TED package roots and extracted TED child documents.
* Constrained with {@code @NotBlank}.
*/
@NotBlank
private String tedPackageImportBatchId = "phase41-ted-package";
/**
* When true, TED packages are persisted only through the generic ingestion gateway
* and the legacy XML batch persistence path is skipped.
* Disabled by default.
*/
private boolean gatewayOnlyForTedPackages = false;
/**
* Import batch identifier for imported mail root messages and child attachments.
* Constrained with {@code @NotBlank}.
*/
@NotBlank
private String mailImportBatchId = "phase41-mail";
}
}