// Repository web-view listing metadata for this file: Java source, 745 lines, 20 KiB.
package at.procon.ted.config;

import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import jakarta.validation.constraints.Min;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.Positive;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Primary;
import org.springframework.validation.annotation.Validated;

/**
|
|
* Configuration properties for TED Procurement Processor.
|
|
*
|
|
* @author Martin.Schweitzer@procon.co.at and claude.ai
|
|
*/
|
|
@Configuration
|
|
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
|
|
@ConfigurationProperties(prefix = "ted")
|
|
@Data
|
|
@Validated
|
|
@Primary
|
|
public class TedProcessorProperties {
|
|
|
|
private InputProperties input = new InputProperties();
|
|
private SchemaProperties schema = new SchemaProperties();
|
|
private VectorizationProperties vectorization = new VectorizationProperties();
|
|
private SearchProperties search = new SearchProperties();
|
|
private DownloadProperties download = new DownloadProperties();
|
|
private MailProperties mail = new MailProperties();
|
|
private SolutionBriefProperties solutionBrief = new SolutionBriefProperties();
|
|
private ProjectionProperties projection = new ProjectionProperties();
|
|
private GenericIngestionProperties genericIngestion = new GenericIngestionProperties();
|
|
private RepairProperties repair = new RepairProperties();
|
|
|
|
/**
|
|
* Input directory configuration for Apache Camel file consumer.
|
|
*/
|
|
@Data
|
|
public static class InputProperties {
|
|
|
|
/**
|
|
* Base directory for watching incoming TED XML files.
|
|
*/
|
|
@NotBlank
|
|
private String directory = "D:/ted.europe/2025-11.tar/2025-11/11";
|
|
|
|
/**
|
|
* File pattern to match (supports Ant-style patterns).
|
|
*/
|
|
private String pattern = "**/*.xml";
|
|
|
|
/**
|
|
* Directory to move successfully processed files.
|
|
*/
|
|
private String processedDirectory = ".processed";
|
|
|
|
/**
|
|
* Directory to move failed files.
|
|
*/
|
|
private String errorDirectory = ".error";
|
|
|
|
/**
|
|
* Polling interval in milliseconds.
|
|
*/
|
|
@Positive
|
|
private long pollInterval = 5000;
|
|
|
|
/**
|
|
* Maximum number of messages per poll.
|
|
*/
|
|
@Positive
|
|
private int maxMessagesPerPoll = 100;
|
|
}
|
|
|
|
/**
|
|
* XML Schema validation configuration.
|
|
*/
|
|
@Data
|
|
public static class SchemaProperties {
|
|
|
|
/**
|
|
* Enable/disable XSD validation.
|
|
*/
|
|
private boolean enabled = true;
|
|
|
|
/**
|
|
* Path to the eForms XSD schema file.
|
|
*/
|
|
private String path = "classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd";
|
|
}
|
|
|
|
/**
|
|
* Document vectorization configuration.
|
|
*/
|
|
@Data
|
|
public static class VectorizationProperties {
|
|
|
|
/**
|
|
* Enable/disable async vectorization.
|
|
*/
|
|
private boolean enabled = true;
|
|
|
|
/**
|
|
* Use external HTTP API instead of Python subprocess.
|
|
*/
|
|
private boolean useHttpApi = false;
|
|
|
|
/**
|
|
* Embedding service HTTP API URL.
|
|
*/
|
|
private String apiUrl = "http://localhost:8001";
|
|
|
|
/**
|
|
* Sentence transformer model name.
|
|
*/
|
|
private String modelName = "intfloat/multilingual-e5-large";
|
|
|
|
/**
|
|
* Vector dimensions (must match model output).
|
|
*/
|
|
@Positive
|
|
private int dimensions = 1024;
|
|
|
|
/**
|
|
* Batch size for vectorization processing.
|
|
*/
|
|
@Min(1)
|
|
private int batchSize = 16;
|
|
|
|
/**
|
|
* Thread pool size for async vectorization.
|
|
*/
|
|
@Min(1)
|
|
private int threadPoolSize = 4;
|
|
|
|
/**
|
|
* Maximum text length for vectorization (characters).
|
|
*/
|
|
@Positive
|
|
private int maxTextLength = 8192;
|
|
|
|
/**
|
|
* HTTP connection timeout in milliseconds.
|
|
*/
|
|
@Positive
|
|
private int connectTimeout = 10000;
|
|
|
|
/**
|
|
* HTTP socket/read timeout in milliseconds.
|
|
*/
|
|
@Positive
|
|
private int socketTimeout = 60000;
|
|
|
|
/**
|
|
* Maximum retries on connection failure.
|
|
*/
|
|
@Min(0)
|
|
private int maxRetries = 5;
|
|
@Positive
|
|
private long genericSchedulerPeriodMs = 30000;
|
|
private String primaryRepresentationBuilderKey = "default-generic";
|
|
}
|
|
|
|
/**
|
|
* Search configuration.
|
|
*/
|
|
@Data
|
|
public static class SearchProperties {
|
|
|
|
/**
|
|
* Default page size for search results.
|
|
*/
|
|
@Positive
|
|
private int defaultPageSize = 20;
|
|
|
|
/**
|
|
* Maximum allowed page size.
|
|
*/
|
|
@Positive
|
|
private int maxPageSize = 100;
|
|
|
|
/**
|
|
* Similarity threshold for vector search (0.0 - 1.0).
|
|
*/
|
|
private double similarityThreshold = 0.7;
|
|
|
|
/**
|
|
* Minimum trigram similarity for fuzzy lexical matches.
|
|
*/
|
|
private double trigramSimilarityThreshold = 0.12;
|
|
|
|
/**
|
|
* Candidate limits per search engine before fusion/collapse.
|
|
*/
|
|
@Positive
|
|
private int fulltextCandidateLimit = 120;
|
|
|
|
@Positive
|
|
private int trigramCandidateLimit = 120;
|
|
|
|
@Positive
|
|
private int semanticCandidateLimit = 120;
|
|
|
|
/**
|
|
* Hybrid fusion weights.
|
|
*/
|
|
private double fulltextWeight = 0.35;
|
|
private double trigramWeight = 0.20;
|
|
private double semanticWeight = 0.45;
|
|
|
|
|
|
/**
|
|
* Enable chunk representations for long documents.
|
|
*/
|
|
private boolean chunkingEnabled = true;
|
|
|
|
/**
|
|
* Target chunk size in characters for CHUNK representations.
|
|
*/
|
|
@Positive
|
|
private int chunkTargetChars = 1800;
|
|
|
|
/**
|
|
* Overlap between consecutive chunks in characters.
|
|
*/
|
|
@Min(0)
|
|
private int chunkOverlapChars = 200;
|
|
|
|
/**
|
|
* Maximum CHUNK representations generated per document.
|
|
*/
|
|
@Positive
|
|
private int maxChunksPerDocument = 12;
|
|
|
|
/**
|
|
* Additional score weight for recency.
|
|
*/
|
|
private double recencyBoostWeight = 0.05;
|
|
|
|
/**
|
|
* Half-life in days used for recency decay.
|
|
*/
|
|
@Positive
|
|
private int recencyHalfLifeDays = 30;
|
|
|
|
/**
|
|
* Startup backfill limit for missing DOC lexical vectors.
|
|
*/
|
|
@Positive
|
|
private int startupLexicalBackfillLimit = 500;
|
|
|
|
/**
|
|
* Number of hits per engine returned by the debug endpoint.
|
|
*/
|
|
@Positive
|
|
private int debugTopHitsPerEngine = 10;
|
|
}
|
|
|
|
/**
|
|
* TED Daily Package Download configuration.
|
|
*/
|
|
@Data
|
|
public static class DownloadProperties {
|
|
|
|
/**
|
|
* Enable/disable automatic package download.
|
|
*/
|
|
private boolean enabled = false;
|
|
|
|
/**
|
|
* Base URL für TED Daily Packages.
|
|
*/
|
|
private String baseUrl = "https://ted.europa.eu/packages/daily/";
|
|
|
|
/**
|
|
* Download-Verzeichnis für tar.gz Files.
|
|
*/
|
|
private String downloadDirectory = "D:/ted.europe/downloads";
|
|
|
|
/**
|
|
* Extrahierungs-Verzeichnis für XML-Dateien.
|
|
*/
|
|
private String extractDirectory = "D:/ted.europe/extracted";
|
|
|
|
/**
|
|
* Start-Jahr für den Download.
|
|
*/
|
|
@Positive
|
|
private int startYear = 2015;
|
|
|
|
/**
|
|
* Anzahl aufeinanderfolgender 404-Fehler bevor Download stoppt.
|
|
* HINWEIS: Wird nicht mehr verwendet. System stoppt jetzt sofort bei erstem 404.
|
|
* @deprecated Nicht mehr verwendet seit Update auf sofortige 404-Behandlung
|
|
*/
|
|
@Positive
|
|
@Deprecated
|
|
private int maxConsecutive404 = 1;
|
|
|
|
/**
|
|
* Polling-Interval für neue Packages (Millisekunden).
|
|
*/
|
|
@Positive
|
|
private long pollInterval = 3600000; // 1 Stunde
|
|
|
|
/**
|
|
* Retry-Intervall für tail-NOT_FOUND Packages.
|
|
* Current year packages remain retryable indefinitely.
|
|
*/
|
|
@Positive
|
|
private long notFoundRetryInterval = 21600000; // 6 Stunden
|
|
|
|
/**
|
|
* Grace period for previous years after year end before a tail-NOT_FOUND is treated as final.
|
|
*/
|
|
@Min(0)
|
|
private int previousYearGracePeriodDays = 30;
|
|
|
|
/**
|
|
* Keep retrying current-year tail NOT_FOUND packages indefinitely.
|
|
*/
|
|
private boolean retryCurrentYearNotFoundIndefinitely = true;
|
|
|
|
/**
|
|
* Download-Timeout (Millisekunden).
|
|
*/
|
|
@Positive
|
|
private long downloadTimeout = 300000; // 5 Minuten
|
|
|
|
/**
|
|
* Maximale gleichzeitige Downloads.
|
|
*/
|
|
@Positive
|
|
private int maxConcurrentDownloads = 2;
|
|
|
|
/**
|
|
* Verzögerung zwischen Downloads (Millisekunden) für Rate Limiting.
|
|
*/
|
|
@Positive
|
|
private long delayBetweenDownloads = 5000; // 5 Sekunden
|
|
|
|
/**
|
|
* Automatisches Löschen von tar.gz nach Extraktion.
|
|
*/
|
|
private boolean deleteAfterExtraction = true;
|
|
|
|
/**
|
|
* Priorisierung: Aktuelles Jahr zuerst, dann rückwärts.
|
|
* HINWEIS: Wird nicht mehr verwendet. System priorisiert immer das aktuelle Jahr.
|
|
* @deprecated Nicht mehr verwendet - immer aktiv
|
|
*/
|
|
@Deprecated
|
|
private boolean prioritizeCurrentYear = true;
|
|
}
|
|
|
|
/**
|
|
* Legacy TED package repair / re-import configuration.
|
|
*/
|
|
@Data
|
|
public static class RepairProperties {
|
|
|
|
/**
|
|
* Enable startup repair of incomplete or missing TED packages.
|
|
*/
|
|
private boolean enabled = false;
|
|
|
|
/**
|
|
* If true, only logs the selected package candidates without modifying data.
|
|
*/
|
|
private boolean dryRun = false;
|
|
|
|
/**
|
|
* Maximum number of packages to process in one startup run.
|
|
*/
|
|
@Positive
|
|
private int maxPackages = 100;
|
|
|
|
/**
|
|
* Optional explicit package identifiers (YYYYSSSSS) to repair.
|
|
*/
|
|
private java.util.List<String> packageIdentifiers = new java.util.ArrayList<>();
|
|
|
|
/**
|
|
* Optional lower bound package identifier (inclusive).
|
|
*/
|
|
private String fromPackageIdentifier;
|
|
|
|
/**
|
|
* Optional upper bound package identifier (inclusive).
|
|
*/
|
|
private String toPackageIdentifier;
|
|
|
|
/**
|
|
* Include missing package sequence numbers inside the selected range.
|
|
*/
|
|
private boolean includeMissingSequenceGaps = true;
|
|
|
|
/**
|
|
* Re-download the package archive when it is missing locally.
|
|
*/
|
|
private boolean redownloadMissingArchives = true;
|
|
|
|
/**
|
|
* Always re-download the package archive even when a local archive already exists.
|
|
*/
|
|
private boolean forceRedownload = false;
|
|
|
|
/**
|
|
* Refuse startup repair while the automatic legacy package download scheduler is enabled.
|
|
*/
|
|
private boolean allowWhileDownloadEnabled = false;
|
|
}
|
|
|
|
/**
|
|
* IMAP Mail configuration for email processing.
|
|
*/
|
|
@Data
|
|
public static class MailProperties {
|
|
|
|
/**
|
|
* Enable/disable mail processing.
|
|
*/
|
|
private boolean enabled = false;
|
|
|
|
/**
|
|
* IMAP server hostname.
|
|
*/
|
|
@NotBlank
|
|
private String host = "mail.mymagenta.business";
|
|
|
|
/**
|
|
* IMAP server port.
|
|
*/
|
|
@Positive
|
|
private int port = 993;
|
|
|
|
/**
|
|
* Mail account username (email address).
|
|
*/
|
|
@NotBlank
|
|
private String username = "archiv@procon.co.at";
|
|
|
|
/**
|
|
* Mail account password.
|
|
*/
|
|
@NotBlank
|
|
private String password = "";
|
|
|
|
/**
|
|
* Use SSL/TLS connection.
|
|
*/
|
|
private boolean ssl = true;
|
|
|
|
/**
|
|
* Mail folder to read from.
|
|
*/
|
|
private String folderName = "INBOX";
|
|
|
|
/**
|
|
* Delete messages after processing.
|
|
*/
|
|
private boolean delete = false;
|
|
|
|
/**
|
|
* Mark messages as seen after processing.
|
|
*/
|
|
private boolean seen = true;
|
|
|
|
/**
|
|
* Only process unseen messages.
|
|
*/
|
|
private boolean unseen = true;
|
|
|
|
/**
|
|
* Polling delay in milliseconds.
|
|
*/
|
|
@Positive
|
|
private long delay = 60000;
|
|
|
|
/**
|
|
* Max messages per poll.
|
|
*/
|
|
@Positive
|
|
private int maxMessagesPerPoll = 10;
|
|
|
|
/**
|
|
* Output directory for processed attachments.
|
|
*/
|
|
private String attachmentOutputDirectory = "D:/ted.europe/mail-attachments";
|
|
|
|
/**
|
|
* Enable/disable MIME file input processing.
|
|
*/
|
|
private boolean mimeInputEnabled = false;
|
|
|
|
/**
|
|
* Input directory for MIME files (.eml, .msg).
|
|
*/
|
|
private String mimeInputDirectory = "D:/ted.europe/mime-input";
|
|
|
|
/**
|
|
* File pattern for MIME files.
|
|
*/
|
|
private String mimeInputPattern = "*.eml";
|
|
|
|
/**
|
|
* Polling interval for MIME input directory (milliseconds).
|
|
*/
|
|
@Positive
|
|
private long mimeInputPollInterval = 10000;
|
|
}
|
|
|
|
|
|
/**
|
|
* Phase 3 TED projection configuration.
|
|
*/
|
|
@Data
|
|
public static class ProjectionProperties {
|
|
|
|
/**
|
|
* Enable/disable Phase 3 TED structured projection dual-write.
|
|
*/
|
|
private boolean enabled = true;
|
|
|
|
/**
|
|
* Optional startup backfill of missing projections from legacy TED documents.
|
|
*/
|
|
private boolean startupBackfillEnabled = false;
|
|
|
|
/**
|
|
* Maximum number of legacy TED documents to backfill during startup.
|
|
*/
|
|
@Positive
|
|
private int startupBackfillLimit = 250;
|
|
}
|
|
|
|
/**
|
|
* Solution Brief processing configuration.
|
|
* Scans PDF files and generates Excel reports with similar TED documents.
|
|
*/
|
|
@Data
|
|
public static class SolutionBriefProperties {
|
|
|
|
/**
|
|
* Enable/disable Solution Brief processing.
|
|
*/
|
|
private boolean enabled = false;
|
|
|
|
/**
|
|
* Input directory for Solution Brief PDF files.
|
|
*/
|
|
private String inputDirectory = "C:/work/SolutionBrief";
|
|
|
|
/**
|
|
* Output directory for Excel result files (relative to input or absolute).
|
|
*/
|
|
private String resultDirectory = "./result";
|
|
|
|
/**
|
|
* Number of top similar documents to include in results.
|
|
*/
|
|
@Positive
|
|
private int topK = 20;
|
|
|
|
/**
|
|
* Minimum similarity threshold (0.0-1.0).
|
|
*/
|
|
private double similarityThreshold = 0.5;
|
|
|
|
/**
|
|
* Polling interval in milliseconds.
|
|
*/
|
|
@Positive
|
|
private long pollInterval = 30000;
|
|
|
|
/**
|
|
* File pattern for PDF files.
|
|
*/
|
|
private String filePattern = ".*\\.pdf";
|
|
|
|
/**
|
|
* Process files only once (idempotent based on filename+size+date).
|
|
*/
|
|
private boolean idempotent = true;
|
|
|
|
/**
|
|
* Idempotent repository file path.
|
|
*/
|
|
private String idempotentRepository = "./solution-brief-processed.dat";
|
|
}
|
|
|
|
/**
|
|
* Phase 4 generic ingestion configuration.
|
|
*/
|
|
@Data
|
|
public static class GenericIngestionProperties {
|
|
|
|
/**
|
|
* Master switch for the generic ingestion pipeline.
|
|
*/
|
|
private boolean enabled = false;
|
|
|
|
/**
|
|
* Enable/disable filesystem import route for arbitrary documents.
|
|
*/
|
|
private boolean fileSystemEnabled = false;
|
|
|
|
/**
|
|
* Enable/disable REST/API upload endpoints for arbitrary documents.
|
|
*/
|
|
private boolean restUploadEnabled = true;
|
|
|
|
/**
|
|
* Input directory for the generic filesystem importer.
|
|
*/
|
|
private String inputDirectory = "/ted.europe/generic-input";
|
|
|
|
/**
|
|
* Regular-expression file pattern used by the Camel file route.
|
|
*/
|
|
private String filePattern = ".*\\.(pdf|txt|html|htm|xml|md|markdown|csv|json|yaml|yml)$";
|
|
|
|
/**
|
|
* Directory where successfully imported files are moved.
|
|
*/
|
|
private String processedDirectory = ".dip-processed";
|
|
|
|
/**
|
|
* Directory where failed files are moved.
|
|
*/
|
|
private String errorDirectory = ".dip-error";
|
|
|
|
/**
|
|
* Polling interval in milliseconds.
|
|
*/
|
|
@Positive
|
|
private long pollInterval = 15000;
|
|
|
|
/**
|
|
* Maximum number of files per poll.
|
|
*/
|
|
@Positive
|
|
private int maxMessagesPerPoll = 10;
|
|
|
|
/**
|
|
* Default owner tenant for files imported through the generic route.
|
|
*/
|
|
private String defaultOwnerTenantKey;
|
|
|
|
/**
|
|
* Default visibility for generic imports when not supplied explicitly.
|
|
*/
|
|
private at.procon.dip.domain.access.DocumentVisibility defaultVisibility = at.procon.dip.domain.access.DocumentVisibility.PUBLIC;
|
|
|
|
/**
|
|
* Optional default language code applied to filesystem imports.
|
|
*/
|
|
private String defaultLanguageCode;
|
|
|
|
/**
|
|
* Persist original binary payloads in DB when they are small enough.
|
|
*/
|
|
private boolean storeOriginalBinaryInDb = true;
|
|
|
|
/**
|
|
* Maximum binary size (bytes) stored inline in DOC.doc_content.binary_content.
|
|
*/
|
|
@Positive
|
|
private int maxBinaryBytesInDb = 5242880;
|
|
|
|
/**
|
|
* Whether an already imported content hash should resolve to the existing document.
|
|
*/
|
|
private boolean deduplicateByContentHash = true;
|
|
|
|
/**
|
|
* Persist ORIGINAL content rows for wrapper/container documents that primarily exist
|
|
* to group or reference child documents (for example TED packages or expanded ZIP wrappers).
|
|
* When disabled, wrappers are still classified, extracted and represented, but the raw
|
|
* ORIGINAL content payload is not stored in DOC.doc_content.
|
|
*/
|
|
private boolean storeOriginalContentForWrapperDocuments = true;
|
|
|
|
/**
|
|
* Queue only the primary text representation for embedding.
|
|
*/
|
|
private boolean vectorizePrimaryRepresentationOnly = true;
|
|
|
|
/**
|
|
* Import batch identifier written to DOC.doc_source.import_batch_id.
|
|
*/
|
|
@NotBlank
|
|
private String importBatchId = "phase4-generic";
|
|
|
|
/**
|
|
* Enable the Phase 4.1 TED package adapter built on top of the generic DOC ingestion SPI.
|
|
*/
|
|
private boolean tedPackageAdapterEnabled = true;
|
|
|
|
/**
|
|
* Enable the Phase 4.1 mail/document adapter built on top of the generic DOC ingestion SPI.
|
|
*/
|
|
private boolean mailAdapterEnabled = false;
|
|
|
|
/**
|
|
* Optional dedicated owner tenant key for imported mail messages and attachments.
|
|
* Falls back to defaultOwnerTenantKey when not configured.
|
|
*/
|
|
private String mailDefaultOwnerTenantKey;
|
|
|
|
/**
|
|
* Default visibility for imported mail messages and attachments.
|
|
*/
|
|
private at.procon.dip.domain.access.DocumentVisibility mailDefaultVisibility = at.procon.dip.domain.access.DocumentVisibility.TENANT;
|
|
|
|
/**
|
|
* Expand ZIP attachments recursively through the mail adapter.
|
|
*/
|
|
private boolean expandMailZipAttachments = true;
|
|
|
|
/**
|
|
* Import batch identifier for TED package roots and extracted TED child documents.
|
|
*/
|
|
@NotBlank
|
|
private String tedPackageImportBatchId = "phase41-ted-package";
|
|
|
|
/**
|
|
* When true, TED packages are persisted only through the generic ingestion gateway
|
|
* and the legacy XML batch persistence path is skipped.
|
|
*/
|
|
private boolean gatewayOnlyForTedPackages = false;
|
|
|
|
/**
|
|
* Import batch identifier for imported mail root messages and child attachments.
|
|
*/
|
|
@NotBlank
|
|
private String mailImportBatchId = "phase41-mail";
|
|
}
|
|
|
|
}
|