ted package new import
This commit is contained in:
parent
1aa599b587
commit
fbd249e56b
|
|
@ -0,0 +1,20 @@
|
|||
# NEW TED package import route
|
||||
|
||||
This patch adds a NEW-runtime TED package download path that:
|
||||
|
||||
- reuses the proven package sequencing rules
|
||||
- stores package tracking in `TedDailyPackage`
|
||||
- downloads the package tar.gz
|
||||
- ingests it only through `DocumentIngestionGateway`
|
||||
- never calls the legacy XML batch processing / vectorization flow
|
||||
|
||||
## Added classes
|
||||
|
||||
- `TedPackageSequenceService`
|
||||
- `DefaultTedPackageSequenceService`
|
||||
- `TedPackageDownloadNewProperties`
|
||||
- `TedPackageDownloadNewRoute`
|
||||
|
||||
## Config
|
||||
|
||||
Use the `dip.ingestion.ted-download.*` block in `application-new.yml`.
|
||||
|
|
@ -0,0 +1,189 @@
|
|||
package at.procon.dip.domain.ted.service;
|
||||
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.dip.ingestion.config.TedPackageDownloadProperties;
|
||||
import at.procon.ted.model.entity.TedDailyPackage;
|
||||
import at.procon.ted.repository.TedDailyPackageRepository;
|
||||
import java.time.Duration;
|
||||
import java.time.LocalDate;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.Year;
|
||||
import java.time.ZoneOffset;
|
||||
import java.util.Optional;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* NEW-runtime implementation of TED package sequencing.
|
||||
* <p>
|
||||
* This reuses the same decision rules as the legacy TED package downloader:
|
||||
* <ul>
|
||||
* <li>current year forward crawling first</li>
|
||||
* <li>gap filling by walking backward to package 1</li>
|
||||
* <li>NOT_FOUND retry handling with current-year indefinite retry support</li>
|
||||
* <li>previous-year grace period before a tail NOT_FOUND becomes final</li>
|
||||
* </ul>
|
||||
*/
|
||||
@Service
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class DefaultTedPackageSequenceService implements TedPackageSequenceService {
|
||||
|
||||
private final TedPackageDownloadProperties properties;
|
||||
private final TedDailyPackageRepository packageRepository;
|
||||
|
||||
@Override
|
||||
public PackageInfo getNextPackageToDownload() {
|
||||
int currentYear = Year.now().getValue();
|
||||
|
||||
log.debug("Determining next TED package to download for NEW runtime (current year: {})", currentYear);
|
||||
|
||||
// 1) Current year forward crawling first (newest data first)
|
||||
PackageInfo nextInCurrentYear = getNextForwardPackage(currentYear);
|
||||
if (nextInCurrentYear != null) {
|
||||
log.info("Next TED package: {} (current year {} forward)", nextInCurrentYear.identifier(), currentYear);
|
||||
return nextInCurrentYear;
|
||||
}
|
||||
|
||||
// 2) Walk all years backward and fill gaps / continue unfinished years
|
||||
for (int year = currentYear; year >= properties.getStartYear(); year--) {
|
||||
PackageInfo gapFiller = getGapFillerPackage(year);
|
||||
if (gapFiller != null) {
|
||||
log.info("Next TED package: {} (filling gap in year {})", gapFiller.identifier(), year);
|
||||
return gapFiller;
|
||||
}
|
||||
|
||||
if (!isYearComplete(year)) {
|
||||
PackageInfo forwardPackage = getNextForwardPackage(year);
|
||||
if (forwardPackage != null) {
|
||||
log.info("Next TED package: {} (continuing year {})", forwardPackage.identifier(), year);
|
||||
return forwardPackage;
|
||||
}
|
||||
} else {
|
||||
log.debug("TED package year {} is complete", year);
|
||||
}
|
||||
}
|
||||
|
||||
// 3) Open a new older year if possible
|
||||
int oldestYear = getOldestYearWithData();
|
||||
if (oldestYear > properties.getStartYear()) {
|
||||
int previousYear = oldestYear - 1;
|
||||
if (previousYear >= properties.getStartYear()) {
|
||||
PackageInfo first = new PackageInfo(previousYear, 1);
|
||||
log.info("Next TED package: {} (opening year {})", first.identifier(), previousYear);
|
||||
return first;
|
||||
}
|
||||
}
|
||||
|
||||
log.info("All TED package years from {} to {} appear complete - nothing to download",
|
||||
properties.getStartYear(), currentYear);
|
||||
return null;
|
||||
}
|
||||
|
||||
private PackageInfo getNextForwardPackage(int year) {
|
||||
Optional<TedDailyPackage> latest = packageRepository.findLatestByYear(year);
|
||||
|
||||
if (latest.isEmpty()) {
|
||||
return new PackageInfo(year, 1);
|
||||
}
|
||||
|
||||
TedDailyPackage latestPackage = latest.get();
|
||||
|
||||
if (latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND) {
|
||||
if (shouldRetryNotFoundPackage(latestPackage)) {
|
||||
return new PackageInfo(year, latestPackage.getSerialNumber());
|
||||
}
|
||||
|
||||
if (isNotFoundRetryableForYear(latestPackage)) {
|
||||
log.debug("Year {} still inside NOT_FOUND retry window for package {} until {}",
|
||||
year, latestPackage.getPackageIdentifier(), calculateNextRetryAt(latestPackage));
|
||||
return null;
|
||||
}
|
||||
|
||||
log.debug("Year {} finalized after grace period at tail package {}", year, latestPackage.getPackageIdentifier());
|
||||
return null;
|
||||
}
|
||||
|
||||
return new PackageInfo(year, latestPackage.getSerialNumber() + 1);
|
||||
}
|
||||
|
||||
private PackageInfo getGapFillerPackage(int year) {
|
||||
Optional<TedDailyPackage> first = packageRepository.findFirstByYear(year);
|
||||
|
||||
if (first.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
int minSerial = first.get().getSerialNumber();
|
||||
if (minSerial <= 1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return new PackageInfo(year, minSerial - 1);
|
||||
}
|
||||
|
||||
private boolean isYearComplete(int year) {
|
||||
Optional<TedDailyPackage> first = packageRepository.findFirstByYear(year);
|
||||
Optional<TedDailyPackage> latest = packageRepository.findLatestByYear(year);
|
||||
|
||||
if (first.isEmpty() || latest.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (first.get().getSerialNumber() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
TedDailyPackage latestPackage = latest.get();
|
||||
return latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND
|
||||
&& !isNotFoundRetryableForYear(latestPackage);
|
||||
}
|
||||
|
||||
private boolean shouldRetryNotFoundPackage(TedDailyPackage pkg) {
|
||||
if (!isNotFoundRetryableForYear(pkg)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
OffsetDateTime nextRetryAt = calculateNextRetryAt(pkg);
|
||||
return !nextRetryAt.isAfter(OffsetDateTime.now());
|
||||
}
|
||||
|
||||
private boolean isNotFoundRetryableForYear(TedDailyPackage pkg) {
|
||||
int currentYear = Year.now().getValue();
|
||||
int packageYear = pkg.getYear() != null ? pkg.getYear() : currentYear;
|
||||
|
||||
if (packageYear >= currentYear) {
|
||||
return properties.isRetryCurrentYearNotFoundIndefinitely();
|
||||
}
|
||||
|
||||
return OffsetDateTime.now().isBefore(getYearRetryGraceDeadline(packageYear));
|
||||
}
|
||||
|
||||
private OffsetDateTime calculateNextRetryAt(TedDailyPackage pkg) {
|
||||
OffsetDateTime lastAttemptAt = pkg.getUpdatedAt() != null
|
||||
? pkg.getUpdatedAt()
|
||||
: (pkg.getCreatedAt() != null ? pkg.getCreatedAt() : OffsetDateTime.now());
|
||||
|
||||
return lastAttemptAt.plus(Duration.ofMillis(properties.getNotFoundRetryInterval()));
|
||||
}
|
||||
|
||||
private OffsetDateTime getYearRetryGraceDeadline(int year) {
|
||||
return LocalDate.of(year + 1, 1, 1)
|
||||
.atStartOfDay()
|
||||
.atOffset(ZoneOffset.UTC)
|
||||
.plusDays(properties.getPreviousYearGracePeriodDays());
|
||||
}
|
||||
|
||||
private int getOldestYearWithData() {
|
||||
int currentYear = Year.now().getValue();
|
||||
for (int year = properties.getStartYear(); year <= currentYear; year++) {
|
||||
if (packageRepository.findLatestByYear(year).isPresent()) {
|
||||
return year;
|
||||
}
|
||||
}
|
||||
return currentYear;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
package at.procon.dip.domain.ted.service;
|
||||
|
||||
/**
|
||||
* Shared package sequencing contract used to determine the next TED daily package to download.
|
||||
* <p>
|
||||
* This service encapsulates the proven sequencing rules from the legacy download implementation
|
||||
* so they can also be used by the NEW runtime without depending on the old route/service graph.
|
||||
*/
|
||||
public interface TedPackageSequenceService {
|
||||
|
||||
/**
|
||||
* Returns the next package to download according to the current sequencing strategy,
|
||||
* or {@code null} if nothing should be downloaded right now.
|
||||
*/
|
||||
PackageInfo getNextPackageToDownload();
|
||||
|
||||
/**
|
||||
* Simple year/serial pair with TED package identifier helper.
|
||||
*/
|
||||
record PackageInfo(int year, int serialNumber) {
|
||||
public String identifier() {
|
||||
return "%04d%05d".formatted(year, serialNumber);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -6,11 +6,15 @@ import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
|
|||
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import java.util.List;
|
||||
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
public class FileSystemDocumentIngestionAdapter implements DocumentIngestionAdapter {
|
||||
|
||||
private final GenericDocumentImportService importService;
|
||||
|
|
|
|||
|
|
@ -7,11 +7,15 @@ import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
|
|||
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import java.util.List;
|
||||
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
public class InlineContentDocumentIngestionAdapter implements DocumentIngestionAdapter {
|
||||
|
||||
private final GenericDocumentImportService importService;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
package at.procon.dip.ingestion.adapter;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||
import at.procon.dip.ingestion.service.GenericDocumentImportService;
|
||||
import at.procon.dip.ingestion.service.TedPackageChildImportProcessor;
|
||||
|
|
@ -22,6 +24,7 @@ import java.util.Map;
|
|||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.beans.factory.annotation.Qualifier;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.transaction.annotation.Propagation;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
|
@ -40,7 +43,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap
|
|||
|
||||
@Override
|
||||
public boolean supports(SourceDescriptor sourceDescriptor) {
|
||||
return sourceDescriptor.sourceType() == at.procon.dip.domain.document.SourceType.TED_PACKAGE
|
||||
return sourceDescriptor.sourceType() == SourceType.TED_PACKAGE
|
||||
&& properties.isEnabled()
|
||||
&& properties.isTedPackageAdapterEnabled();
|
||||
}
|
||||
|
|
@ -58,7 +61,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap
|
|||
|
||||
ImportedDocumentResult packageDocument = importService.importDocument(new SourceDescriptor(
|
||||
sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(),
|
||||
at.procon.dip.domain.document.SourceType.TED_PACKAGE,
|
||||
SourceType.TED_PACKAGE,
|
||||
sourceDescriptor.sourceIdentifier(),
|
||||
packageRootSource.sourceUri(),
|
||||
sourceDescriptor.fileName(),
|
||||
|
|
@ -71,7 +74,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap
|
|||
));
|
||||
|
||||
List<String> warnings = new ArrayList<>(packageDocument.warnings());
|
||||
List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents = new ArrayList<>();
|
||||
List<CanonicalDocumentMetadata> documents = new ArrayList<>();
|
||||
documents.add(packageDocument.document().toCanonicalMetadata());
|
||||
|
||||
AtomicInteger sortOrder = new AtomicInteger();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,328 @@
|
|||
package at.procon.dip.ingestion.camel;
|
||||
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import at.procon.dip.ingestion.config.TedPackageDownloadProperties;
|
||||
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
|
||||
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.ted.model.entity.TedDailyPackage;
|
||||
import at.procon.ted.repository.TedDailyPackageRepository;
|
||||
import at.procon.dip.domain.ted.service.TedPackageSequenceService;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.security.MessageDigest;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.camel.Exchange;
|
||||
import org.apache.camel.LoggingLevel;
|
||||
import org.apache.camel.builder.RouteBuilder;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
/**
|
||||
* NEW-runtime TED daily package download route.
|
||||
* <p>
|
||||
* Reuses the proven package sequencing rules through {@link TedPackageSequenceService},
|
||||
* but hands off processing only to the NEW ingestion gateway. No legacy XML batch persistence,
|
||||
* no legacy vectorization route, no old semantic path.
|
||||
*/
|
||||
@Component
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
@ConditionalOnProperty(name = "dip.ingestion.ted-download.enabled", havingValue = "true")
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class TedPackageDownloadRoute extends RouteBuilder {
|
||||
|
||||
private static final String ROUTE_ID_SCHEDULER = "ted-package-new-scheduler";
|
||||
private static final String ROUTE_ID_DOWNLOADER = "ted-package-new-downloader";
|
||||
private static final String ROUTE_ID_ERROR = "ted-package-new-error-handler";
|
||||
|
||||
private final TedPackageDownloadProperties properties;
|
||||
private final TedDailyPackageRepository packageRepository;
|
||||
private final TedPackageSequenceService sequenceService;
|
||||
private final DocumentIngestionGateway documentIngestionGateway;
|
||||
|
||||
@Override
|
||||
public void configure() {
|
||||
errorHandler(deadLetterChannel("direct:ted-package-new-error")
|
||||
.maximumRedeliveries(3)
|
||||
.redeliveryDelay(10_000)
|
||||
.retryAttemptedLogLevel(LoggingLevel.WARN)
|
||||
.logStackTrace(true));
|
||||
|
||||
from("direct:ted-package-new-error")
|
||||
.routeId(ROUTE_ID_ERROR)
|
||||
.process(this::handleError);
|
||||
|
||||
from("timer:ted-package-new-scheduler?period={{dip.ingestion.ted-download.poll-interval:3600000}}&delay=0")
|
||||
.routeId(ROUTE_ID_SCHEDULER)
|
||||
.process(this::checkRunningPackages)
|
||||
.choice()
|
||||
.when(header("tooManyRunning").isEqualTo(true))
|
||||
.log(LoggingLevel.INFO, "Skipping NEW TED package download - already ${header.runningCount} packages in progress")
|
||||
.otherwise()
|
||||
.process(this::determineNextPackage)
|
||||
.choice()
|
||||
.when(header("packageId").isNotNull())
|
||||
.to("direct:download-ted-package-new")
|
||||
.otherwise()
|
||||
.log(LoggingLevel.INFO, "No NEW TED package to download right now")
|
||||
.end()
|
||||
.end();
|
||||
|
||||
from("direct:download-ted-package-new")
|
||||
.routeId(ROUTE_ID_DOWNLOADER)
|
||||
.log(LoggingLevel.INFO, "NEW TED package download started: ${header.packageId}")
|
||||
.setHeader("downloadStartTime", constant(System.currentTimeMillis()))
|
||||
.process(this::createPackageRecord)
|
||||
.delay(simple("{{dip.ingestion.ted-download.delay-between-downloads:5000}}"))
|
||||
.setHeader(Exchange.HTTP_METHOD, constant("GET"))
|
||||
.setHeader("CamelHttpConnectionClose", constant(true))
|
||||
.toD("${header.downloadUrl}?bridgeEndpoint=true&throwExceptionOnFailure=false&socketTimeout={{dip.ingestion.ted-download.download-timeout:300000}}")
|
||||
.choice()
|
||||
.when(header(Exchange.HTTP_RESPONSE_CODE).isEqualTo(200))
|
||||
.process(this::calculateHash)
|
||||
.process(this::checkDuplicateByHash)
|
||||
.choice()
|
||||
.when(header("isDuplicate").isEqualTo(true))
|
||||
.process(this::markDuplicate)
|
||||
.otherwise()
|
||||
.process(this::saveDownloadedPackage)
|
||||
.process(this::ingestThroughGateway)
|
||||
.process(this::markCompleted)
|
||||
.endChoice()
|
||||
.when(header(Exchange.HTTP_RESPONSE_CODE).isEqualTo(404))
|
||||
.process(this::markNotFound)
|
||||
.otherwise()
|
||||
.process(this::markFailed)
|
||||
.end();
|
||||
}
|
||||
|
||||
private void checkRunningPackages(Exchange exchange) {
|
||||
long downloadingCount = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING).size();
|
||||
long processingCount = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.PROCESSING).size();
|
||||
long runningCount = downloadingCount + processingCount;
|
||||
|
||||
exchange.getIn().setHeader("runningCount", runningCount);
|
||||
exchange.getIn().setHeader("tooManyRunning", runningCount >= properties.getMaxRunningPackages());
|
||||
|
||||
if (runningCount > 0) {
|
||||
log.info("Currently {} TED packages in progress in NEW runtime ({} downloading, {} processing)",
|
||||
runningCount, downloadingCount, processingCount);
|
||||
}
|
||||
}
|
||||
|
||||
private void determineNextPackage(Exchange exchange) {
|
||||
List<TedDailyPackage> pendingPackages = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.PENDING);
|
||||
|
||||
if (!pendingPackages.isEmpty()) {
|
||||
TedDailyPackage pkg = pendingPackages.get(0);
|
||||
log.info("Retrying PENDING TED package in NEW runtime: {}", pkg.getPackageIdentifier());
|
||||
setPackageHeaders(exchange, pkg.getYear(), pkg.getSerialNumber());
|
||||
return;
|
||||
}
|
||||
|
||||
TedPackageSequenceService.PackageInfo packageInfo = sequenceService.getNextPackageToDownload();
|
||||
if (packageInfo == null) {
|
||||
exchange.getIn().setHeader("packageId", null);
|
||||
return;
|
||||
}
|
||||
|
||||
setPackageHeaders(exchange, packageInfo.year(), packageInfo.serialNumber());
|
||||
}
|
||||
|
||||
private void setPackageHeaders(Exchange exchange, int year, int serialNumber) {
|
||||
String packageId = "%04d%05d".formatted(year, serialNumber);
|
||||
String downloadUrl = properties.getBaseUrl() + packageId;
|
||||
|
||||
exchange.getIn().setHeader("packageId", packageId);
|
||||
exchange.getIn().setHeader("year", year);
|
||||
exchange.getIn().setHeader("serialNumber", serialNumber);
|
||||
exchange.getIn().setHeader("downloadUrl", downloadUrl);
|
||||
}
|
||||
|
||||
private void createPackageRecord(Exchange exchange) {
|
||||
String packageId = exchange.getIn().getHeader("packageId", String.class);
|
||||
Integer year = exchange.getIn().getHeader("year", Integer.class);
|
||||
Integer serialNumber = exchange.getIn().getHeader("serialNumber", Integer.class);
|
||||
String downloadUrl = exchange.getIn().getHeader("downloadUrl", String.class);
|
||||
|
||||
if (packageRepository.existsByPackageIdentifier(packageId)) {
|
||||
return;
|
||||
}
|
||||
|
||||
TedDailyPackage pkg = TedDailyPackage.builder()
|
||||
.packageIdentifier(packageId)
|
||||
.year(year)
|
||||
.serialNumber(serialNumber)
|
||||
.downloadUrl(downloadUrl)
|
||||
.downloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING)
|
||||
.build();
|
||||
|
||||
packageRepository.save(pkg);
|
||||
}
|
||||
|
||||
private void calculateHash(Exchange exchange) throws Exception {
|
||||
byte[] body = exchange.getIn().getBody(byte[].class);
|
||||
MessageDigest digest = MessageDigest.getInstance("SHA-256");
|
||||
byte[] hashBytes = digest.digest(body);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (byte b : hashBytes) {
|
||||
sb.append(String.format("%02x", b));
|
||||
}
|
||||
|
||||
exchange.getIn().setHeader("fileHash", sb.toString());
|
||||
}
|
||||
|
||||
private void checkDuplicateByHash(Exchange exchange) {
|
||||
String hash = exchange.getIn().getHeader("fileHash", String.class);
|
||||
|
||||
Optional<TedDailyPackage> duplicate = packageRepository.findAll().stream()
|
||||
.filter(p -> hash.equals(p.getFileHash()))
|
||||
.findFirst();
|
||||
|
||||
exchange.getIn().setHeader("isDuplicate", duplicate.isPresent());
|
||||
duplicate.ifPresent(pkg -> exchange.getIn().setHeader("duplicateOf", pkg.getPackageIdentifier()));
|
||||
}
|
||||
|
||||
private void saveDownloadedPackage(Exchange exchange) throws Exception {
|
||||
String packageId = exchange.getIn().getHeader("packageId", String.class);
|
||||
String hash = exchange.getIn().getHeader("fileHash", String.class);
|
||||
byte[] body = exchange.getIn().getBody(byte[].class);
|
||||
|
||||
Path downloadDir = Paths.get(properties.getDownloadDirectory());
|
||||
Files.createDirectories(downloadDir);
|
||||
Path downloadPath = downloadDir.resolve(packageId + ".tar.gz");
|
||||
Files.write(downloadPath, body);
|
||||
|
||||
long downloadDuration = System.currentTimeMillis() -
|
||||
exchange.getIn().getHeader("downloadStartTime", Long.class);
|
||||
|
||||
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
|
||||
pkg.setFileHash(hash);
|
||||
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADED);
|
||||
pkg.setDownloadedAt(OffsetDateTime.now());
|
||||
pkg.setDownloadDurationMs(downloadDuration);
|
||||
packageRepository.save(pkg);
|
||||
});
|
||||
|
||||
exchange.getIn().setHeader("downloadPath", downloadPath.toString());
|
||||
}
|
||||
|
||||
private void ingestThroughGateway(Exchange exchange) {
|
||||
String packageId = exchange.getIn().getHeader("packageId", String.class);
|
||||
String downloadPath = exchange.getIn().getHeader("downloadPath", String.class);
|
||||
|
||||
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
|
||||
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.PROCESSING);
|
||||
packageRepository.save(pkg);
|
||||
});
|
||||
|
||||
IngestionResult ingestionResult = documentIngestionGateway.ingest(new SourceDescriptor(
|
||||
null,
|
||||
SourceType.TED_PACKAGE,
|
||||
packageId,
|
||||
downloadPath,
|
||||
packageId + ".tar.gz",
|
||||
"application/gzip",
|
||||
null,
|
||||
null,
|
||||
OffsetDateTime.now(),
|
||||
OriginalContentStoragePolicy.DEFAULT,
|
||||
Map.of(
|
||||
"packageId", packageId,
|
||||
"title", packageId + ".tar.gz"
|
||||
)
|
||||
));
|
||||
|
||||
int importedChildCount = Math.max(0, ingestionResult.documents().size() - 1);
|
||||
exchange.getIn().setHeader("gatewayImportedChildCount", importedChildCount);
|
||||
exchange.getIn().setHeader("gatewayImportWarnings", ingestionResult.warnings().size());
|
||||
|
||||
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
|
||||
pkg.setXmlFileCount(importedChildCount);
|
||||
pkg.setProcessedCount(importedChildCount);
|
||||
pkg.setFailedCount(0);
|
||||
packageRepository.save(pkg);
|
||||
});
|
||||
}
|
||||
|
||||
private void markCompleted(Exchange exchange) throws Exception {
|
||||
String packageId = exchange.getIn().getHeader("packageId", String.class);
|
||||
String downloadPath = exchange.getIn().getHeader("downloadPath", String.class);
|
||||
|
||||
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
|
||||
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.COMPLETED);
|
||||
pkg.setProcessedAt(OffsetDateTime.now());
|
||||
if (pkg.getDownloadedAt() != null) {
|
||||
long processingDuration = Math.max(0L,
|
||||
java.time.Duration.between(pkg.getDownloadedAt(), OffsetDateTime.now()).toMillis());
|
||||
pkg.setProcessingDurationMs(processingDuration);
|
||||
}
|
||||
packageRepository.save(pkg);
|
||||
});
|
||||
|
||||
if (properties.isDeleteAfterIngestion() && downloadPath != null) {
|
||||
Files.deleteIfExists(Path.of(downloadPath));
|
||||
}
|
||||
|
||||
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
|
||||
long totalDuration = (pkg.getDownloadDurationMs() != null ? pkg.getDownloadDurationMs() : 0L)
|
||||
+ (pkg.getProcessingDurationMs() != null ? pkg.getProcessingDurationMs() : 0L);
|
||||
log.info("NEW TED package {} completed: xmlCount={}, processed={}, failed={}, totalDuration={}ms",
|
||||
packageId, pkg.getXmlFileCount(), pkg.getProcessedCount(), pkg.getFailedCount(), totalDuration);
|
||||
});
|
||||
}
|
||||
|
||||
private void markNotFound(Exchange exchange) {
|
||||
String packageId = exchange.getIn().getHeader("packageId", String.class);
|
||||
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
|
||||
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.NOT_FOUND);
|
||||
pkg.setErrorMessage("Package not found (404)");
|
||||
packageRepository.save(pkg);
|
||||
});
|
||||
}
|
||||
|
||||
private void markFailed(Exchange exchange) {
|
||||
String packageId = exchange.getIn().getHeader("packageId", String.class);
|
||||
Integer httpCode = exchange.getIn().getHeader(Exchange.HTTP_RESPONSE_CODE, Integer.class);
|
||||
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
|
||||
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.FAILED);
|
||||
pkg.setErrorMessage("HTTP " + httpCode);
|
||||
packageRepository.save(pkg);
|
||||
});
|
||||
}
|
||||
|
||||
private void markDuplicate(Exchange exchange) {
|
||||
String packageId = exchange.getIn().getHeader("packageId", String.class);
|
||||
String duplicateOf = exchange.getIn().getHeader("duplicateOf", String.class);
|
||||
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
|
||||
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.COMPLETED);
|
||||
pkg.setErrorMessage("Duplicate of " + duplicateOf);
|
||||
pkg.setProcessedAt(OffsetDateTime.now());
|
||||
packageRepository.save(pkg);
|
||||
});
|
||||
}
|
||||
|
||||
private void handleError(Exchange exchange) {
|
||||
Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class);
|
||||
String packageId = exchange.getIn().getHeader("packageId", String.class);
|
||||
|
||||
if (packageId != null) {
|
||||
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
|
||||
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.FAILED);
|
||||
pkg.setErrorMessage(exception != null ? exception.getMessage() : "Unknown route error");
|
||||
packageRepository.save(pkg);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,8 @@
|
|||
package at.procon.dip.ingestion.config;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import jakarta.validation.constraints.NotBlank;
|
||||
import jakarta.validation.constraints.Positive;
|
||||
import lombok.Data;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,52 @@
|
|||
package at.procon.dip.ingestion.config;
|
||||
|
||||
import jakarta.validation.constraints.Min;
|
||||
import jakarta.validation.constraints.NotBlank;
|
||||
import jakarta.validation.constraints.Positive;
|
||||
import lombok.Data;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
/**
|
||||
* NEW-runtime TED package download configuration.
|
||||
* <p>
|
||||
* This is intentionally separate from the legacy {@code ted.download.*} tree.
|
||||
*/
|
||||
@Configuration
|
||||
@ConfigurationProperties(prefix = "dip.ingestion.ted-download")
|
||||
@Data
|
||||
public class TedPackageDownloadProperties {
|
||||
|
||||
private boolean enabled = false;
|
||||
|
||||
@NotBlank
|
||||
private String baseUrl = "https://ted.europa.eu/packages/daily/";
|
||||
|
||||
@NotBlank
|
||||
private String downloadDirectory = "/ted.europe/downloads-new";
|
||||
|
||||
@Positive
|
||||
private int startYear = 2015;
|
||||
|
||||
@Positive
|
||||
private long pollInterval = 3_600_000L;
|
||||
|
||||
@Positive
|
||||
private long notFoundRetryInterval = 21_600_000L;
|
||||
|
||||
@Min(0)
|
||||
private int previousYearGracePeriodDays = 30;
|
||||
|
||||
private boolean retryCurrentYearNotFoundIndefinitely = true;
|
||||
|
||||
@Positive
|
||||
private long downloadTimeout = 300_000L;
|
||||
|
||||
@Positive
|
||||
private int maxRunningPackages = 2;
|
||||
|
||||
@Positive
|
||||
private long delayBetweenDownloads = 5_000L;
|
||||
|
||||
private boolean deleteAfterIngestion = true;
|
||||
}
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
package at.procon.ted.config;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
/**
|
||||
* Patch A scaffold for the legacy runtime configuration tree.
|
||||
*
|
||||
* The legacy runtime still uses {@link TedProcessorProperties} today. This class is
|
||||
* introduced so the old configuration can be moved gradually from `ted.*` to
|
||||
* `legacy.ted.*` without blocking the runtime split.
|
||||
*/
|
||||
@Configuration
|
||||
@ConfigurationProperties(prefix = "legacy.ted")
|
||||
public class LegacyTedProperties extends TedProcessorProperties {
|
||||
}
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
package at.procon.ted.controller;
|
||||
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.ted.model.dto.DocumentDtos.*;
|
||||
import at.procon.ted.model.entity.ContractNature;
|
||||
import at.procon.ted.model.entity.NoticeType;
|
||||
|
|
@ -38,6 +40,7 @@ import java.util.UUID;
|
|||
@RequestMapping("/v1/documents")
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
|
||||
@Tag(name = "Documents", description = "TED Procurement Document Search API")
|
||||
public class DocumentController {
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package at.procon.ted.controller;
|
||||
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.ted.service.SimilaritySearchService;
|
||||
import at.procon.ted.service.SimilaritySearchService.SimilaritySearchResponse;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
|
|
@ -28,6 +30,7 @@ import java.io.IOException;
|
|||
@RequestMapping("/similarity")
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
|
||||
@Tag(name = "Similarity Search", description = "Vector-based semantic similarity search on TED procurement documents")
|
||||
public class SimilaritySearchController {
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,152 @@
|
|||
spring:
|
||||
config:
|
||||
activate:
|
||||
on-profile: legacy
|
||||
|
||||
dip:
|
||||
runtime:
|
||||
mode: LEGACY
|
||||
# Legacy / shared application properties
|
||||
# New-runtime-only properties are moved to application-new.yml.
|
||||
ted:
|
||||
# Directory configuration for file processing
|
||||
input:
|
||||
# Base directory for watching incoming TED XML files
|
||||
directory: ${TED_INPUT_DIR:/ted.europe/extracted}
|
||||
# File pattern to match (recursive scanning)
|
||||
pattern: "**/*.xml"
|
||||
# Move processed files to this directory
|
||||
processed-directory: ${TED_PROCESSED_DIR:.processed}
|
||||
# Move failed files to this directory
|
||||
error-directory: ${TED_ERROR_DIR:.error}
|
||||
# Polling interval in milliseconds
|
||||
poll-interval: 5000
|
||||
# Maximum messages per poll (reduced to prevent memory issues)
|
||||
max-messages-per-poll: 10
|
||||
|
||||
# Schema validation configuration
|
||||
schema:
|
||||
# Enable/disable XSD validation
|
||||
enabled: true
|
||||
# Path to eForms SDK schemas (from Maven dependency or custom location)
|
||||
path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
|
||||
|
||||
# Vectorization configuration
|
||||
vectorization:
|
||||
# Enable/disable async vectorization
|
||||
enabled: false
|
||||
# Use external HTTP API instead of subprocess
|
||||
use-http-api: true
|
||||
# Embedding service URL
|
||||
api-url: http://172.20.240.18:8001
|
||||
# Model name for sentence-transformers
|
||||
model-name: intfloat/multilingual-e5-large
|
||||
# Vector dimensions (must match model output)
|
||||
dimensions: 1024
|
||||
# Batch size for vectorization
|
||||
batch-size: 16
|
||||
# Thread pool size for async processing
|
||||
thread-pool-size: 4
|
||||
# Maximum text length for vectorization (characters)
|
||||
max-text-length: 8192
|
||||
# HTTP connection timeout (milliseconds)
|
||||
connect-timeout: 10000
|
||||
# HTTP socket/read timeout (milliseconds)
|
||||
socket-timeout: 60000
|
||||
# Maximum retries on connection failure
|
||||
max-retries: 5
|
||||
# Packages download configuration
|
||||
download:
|
||||
# Enable/disable automatic package download
|
||||
enabled: false
|
||||
# User service-based camel route
|
||||
use-service-based: false
|
||||
# Base URL for TED Daily Packages
|
||||
base-url: https://ted.europa.eu/packages/daily/
|
||||
# Download directory for tar.gz files
|
||||
download-directory: /ted.europe/downloads
|
||||
# Extract directory for XML files
|
||||
extract-directory: /ted.europe/extracted
|
||||
# Start year for downloads
|
||||
start-year: 2026
|
||||
# Max consecutive 404 errors before stopping
|
||||
max-consecutive-404: 4
|
||||
# Polling interval (milliseconds) - 2 minutes
|
||||
poll-interval: 300000
|
||||
# Retry interval for tail NOT_FOUND packages - 6 hours
|
||||
not-found-retry-interval: 21600000
|
||||
# Grace period after year end before a previous-year tail 404 is treated as final
|
||||
previous-year-grace-period-days: 30
|
||||
# Keep retrying current-year tail 404 packages indefinitely
|
||||
retry-current-year-not-found-indefinitely: true
|
||||
# Download timeout (milliseconds) - 5 minutes
|
||||
download-timeout: 300000
|
||||
# Max concurrent downloads
|
||||
max-concurrent-downloads: 2
|
||||
# Delay between downloads (milliseconds) for rate limiting - 5 seconds
|
||||
delay-between-downloads: 3000
|
||||
# Delete tar.gz after extraction
|
||||
delete-after-extraction: true
|
||||
# Prioritize current year first
|
||||
prioritize-current-year: false
|
||||
# IMAP Mail configuration
|
||||
mail:
|
||||
# Enable/disable mail processing
|
||||
enabled: false
|
||||
# IMAP server hostname
|
||||
host: mail.mymagenta.business
|
||||
# IMAP server port (993 for IMAPS)
|
||||
port: 993
|
||||
# Mail account username (email address)
|
||||
username: archiv@procon.co.at
|
||||
# Mail account password
|
||||
password: ${MAIL_PASSWORD:worasigg}
|
||||
# Use SSL/TLS connection
|
||||
ssl: true
|
||||
# Mail folder to read from
|
||||
folder-name: INBOX
|
||||
# Delete messages after processing
|
||||
delete: false
|
||||
# Mark messages as seen after processing (false = peek mode, don't mark as read)
|
||||
seen: false
|
||||
# Only process unseen messages
|
||||
unseen: true
|
||||
# Polling delay in milliseconds (1 minute)
|
||||
delay: 60000
|
||||
# Max messages per poll
|
||||
max-messages-per-poll: 100
|
||||
# Output directory for processed attachments
|
||||
attachment-output-directory: /ted.europe/mail-attachments
|
||||
# Enable/disable MIME file input processing
|
||||
mime-input-enabled: true
|
||||
# Input directory for MIME files (.eml)
|
||||
mime-input-directory: /ted.europe/mime-input
|
||||
# File pattern for MIME files (regex)
|
||||
mime-input-pattern: .*\\.eml
|
||||
# Polling interval for MIME input directory (milliseconds)
|
||||
mime-input-poll-interval: 1000000
|
||||
# solution brief processing
|
||||
solution-brief:
|
||||
# Enable/disable Solution Brief processing
|
||||
enabled: false
|
||||
# Input directory for Solution Brief PDF files
|
||||
input-directory: C:/work/SolutionBrief
|
||||
# Output directory for Excel result files (relative to input or absolute)
|
||||
result-directory: ./result
|
||||
# Number of top similar documents to include
|
||||
top-k: 20
|
||||
# Minimum similarity threshold (0.0-1.0)
|
||||
similarity-threshold: 0.5
|
||||
# Polling interval in milliseconds (30 seconds)
|
||||
poll-interval: 30000
|
||||
# File pattern for PDF files (regex)
|
||||
file-pattern: .*\\.pdf
|
||||
# Process files only once (idempotent)
|
||||
idempotent: true
|
||||
# Idempotent repository file path
|
||||
idempotent-repository: ./solution-brief-processed.dat
|
||||
|
||||
# Data cleanup configuration
|
||||
cleanup:
|
||||
# Enable automatic cleanup of old documents
|
||||
enabled: false
|
||||
# Retention period in years (default: 10)
|
||||
retention-years: 10
|
||||
# Cron expression for cleanup schedule (default: daily at 2 AM)
|
||||
cron: "0 0 2 * * *"
|
||||
|
||||
# Legacy runtime uses the existing ted.* property tree.
|
||||
# Move old route/download/mail/vectorization/search settings here over time.
|
||||
|
|
|
|||
|
|
@ -1,16 +1,6 @@
|
|||
# New runtime overrides
|
||||
# Activate with: --spring.profiles.active=new
|
||||
|
||||
# Optional explicit marker; file is profile-specific already
|
||||
spring:
|
||||
config:
|
||||
activate:
|
||||
on-profile: new
|
||||
|
||||
dip:
|
||||
runtime:
|
||||
mode: NEW
|
||||
|
||||
search:
|
||||
# Default page size for search results
|
||||
default-page-size: 20
|
||||
|
|
@ -44,7 +34,6 @@ dip:
|
|||
startup-lexical-backfill-limit: 500
|
||||
# Number of top hits per engine returned by /search/debug
|
||||
debug-top-hits-per-engine: 10
|
||||
|
||||
embedding:
|
||||
enabled: true
|
||||
default-document-model: e5-default
|
||||
|
|
@ -75,7 +64,6 @@ dip:
|
|||
active: true
|
||||
jobs:
|
||||
enabled: true
|
||||
|
||||
# Phase 4 generic ingestion configuration
|
||||
ingestion:
|
||||
# Master switch for arbitrary document ingestion into the DOC model
|
||||
|
|
@ -132,6 +120,32 @@ dip:
|
|||
# Import batch marker for mail roots and attachments
|
||||
mail-import-batch-id: phase41-mail
|
||||
|
||||
# ted packages download configuration
|
||||
ted-download:
|
||||
# Enable/disable automatic package download
|
||||
enabled: true
|
||||
# Base URL for TED Daily Packages
|
||||
base-url: https://ted.europa.eu/packages/daily/
|
||||
# Download directory for tar.gz files
|
||||
download-directory: /ted.europe/downloads-new
|
||||
# Start year for downloads
|
||||
start-year: 2026
|
||||
# Polling interval (milliseconds) - 2 minutes
|
||||
poll-interval: 3600000
|
||||
# Retry interval for tail NOT_FOUND packages - 6 hours
|
||||
not-found-retry-interval: 21600000
|
||||
# Grace period after year end before a previous-year tail 404 is treated as final
|
||||
previous-year-grace-period-days: 30
|
||||
# Keep retrying current-year tail 404 packages indefinitely
|
||||
retry-current-year-not-found-indefinitely: true
|
||||
# Download timeout (milliseconds) - 5 minutes
|
||||
download-timeout: 300000
|
||||
# Max concurrent downloads
|
||||
max-running-packages: 2
|
||||
# Delay between downloads (milliseconds) for rate limiting - 5 seconds
|
||||
delay-between-downloads: 5000
|
||||
# Delete tar.gz after ingestion
|
||||
delete-after-ingestion: true
|
||||
ted: # Phase 3 TED projection configuration
|
||||
projection:
|
||||
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document
|
||||
|
|
|
|||
|
|
@ -7,6 +7,9 @@ server:
|
|||
context-path: /api
|
||||
|
||||
spring:
|
||||
profiles:
|
||||
active: new
|
||||
|
||||
application:
|
||||
name: document-intelligence-platform
|
||||
|
||||
|
|
@ -56,161 +59,6 @@ camel:
|
|||
enabled: true
|
||||
# Weniger strenge Health-Checks für File-Consumer
|
||||
consumers-enabled: false
|
||||
|
||||
# Default runtime mode: legacy / initial implementation
|
||||
# Activate profile 'new' to load application-new.yml and switch to the new runtime.
|
||||
dip:
|
||||
runtime:
|
||||
mode: LEGACY
|
||||
|
||||
# Legacy / shared application properties
|
||||
# New-runtime-only properties are moved to application-new.yml.
|
||||
ted:
|
||||
# Directory configuration for file processing
|
||||
input:
|
||||
# Base directory for watching incoming TED XML files
|
||||
directory: ${TED_INPUT_DIR:/ted.europe/extracted}
|
||||
# File pattern to match (recursive scanning)
|
||||
pattern: "**/*.xml"
|
||||
# Move processed files to this directory
|
||||
processed-directory: ${TED_PROCESSED_DIR:.processed}
|
||||
# Move failed files to this directory
|
||||
error-directory: ${TED_ERROR_DIR:.error}
|
||||
# Polling interval in milliseconds
|
||||
poll-interval: 5000
|
||||
# Maximum messages per poll (reduced to prevent memory issues)
|
||||
max-messages-per-poll: 10
|
||||
|
||||
# Schema validation configuration
|
||||
schema:
|
||||
# Enable/disable XSD validation
|
||||
enabled: true
|
||||
# Path to eForms SDK schemas (from Maven dependency or custom location)
|
||||
path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
|
||||
|
||||
# Vectorization configuration
|
||||
vectorization:
|
||||
# Enable/disable async vectorization
|
||||
enabled: false
|
||||
# Use external HTTP API instead of subprocess
|
||||
use-http-api: true
|
||||
# Embedding service URL
|
||||
api-url: http://172.20.240.18:8001
|
||||
# Model name for sentence-transformers
|
||||
model-name: intfloat/multilingual-e5-large
|
||||
# Vector dimensions (must match model output)
|
||||
dimensions: 1024
|
||||
# Batch size for vectorization
|
||||
batch-size: 16
|
||||
# Thread pool size for async processing
|
||||
thread-pool-size: 4
|
||||
# Maximum text length for vectorization (characters)
|
||||
max-text-length: 8192
|
||||
# HTTP connection timeout (milliseconds)
|
||||
connect-timeout: 10000
|
||||
# HTTP socket/read timeout (milliseconds)
|
||||
socket-timeout: 60000
|
||||
# Maximum retries on connection failure
|
||||
max-retries: 5
|
||||
# Packages download configuration
|
||||
download:
|
||||
# Enable/disable automatic package download
|
||||
enabled: true
|
||||
# User service-based camel route
|
||||
use-service-based: false
|
||||
# Base URL for TED Daily Packages
|
||||
base-url: https://ted.europa.eu/packages/daily/
|
||||
# Download directory for tar.gz files
|
||||
download-directory: /ted.europe/downloads
|
||||
# Extract directory for XML files
|
||||
extract-directory: /ted.europe/extracted
|
||||
# Start year for downloads
|
||||
start-year: 2026
|
||||
# Max consecutive 404 errors before stopping
|
||||
max-consecutive-404: 4
|
||||
# Polling interval (milliseconds) - 2 minutes
|
||||
poll-interval: 300000
|
||||
# Retry interval for tail NOT_FOUND packages - 6 hours
|
||||
not-found-retry-interval: 21600000
|
||||
# Grace period after year end before a previous-year tail 404 is treated as final
|
||||
previous-year-grace-period-days: 30
|
||||
# Keep retrying current-year tail 404 packages indefinitely
|
||||
retry-current-year-not-found-indefinitely: true
|
||||
# Download timeout (milliseconds) - 5 minutes
|
||||
download-timeout: 300000
|
||||
# Max concurrent downloads
|
||||
max-concurrent-downloads: 2
|
||||
# Delay between downloads (milliseconds) for rate limiting - 5 seconds
|
||||
delay-between-downloads: 3000
|
||||
# Delete tar.gz after extraction
|
||||
delete-after-extraction: true
|
||||
# Prioritize current year first
|
||||
prioritize-current-year: false
|
||||
# IMAP Mail configuration
|
||||
mail:
|
||||
# Enable/disable mail processing
|
||||
enabled: false
|
||||
# IMAP server hostname
|
||||
host: mail.mymagenta.business
|
||||
# IMAP server port (993 for IMAPS)
|
||||
port: 993
|
||||
# Mail account username (email address)
|
||||
username: archiv@procon.co.at
|
||||
# Mail account password
|
||||
password: ${MAIL_PASSWORD:worasigg}
|
||||
# Use SSL/TLS connection
|
||||
ssl: true
|
||||
# Mail folder to read from
|
||||
folder-name: INBOX
|
||||
# Delete messages after processing
|
||||
delete: false
|
||||
# Mark messages as seen after processing (false = peek mode, don't mark as read)
|
||||
seen: false
|
||||
# Only process unseen messages
|
||||
unseen: true
|
||||
# Polling delay in milliseconds (1 minute)
|
||||
delay: 60000
|
||||
# Max messages per poll
|
||||
max-messages-per-poll: 100
|
||||
# Output directory for processed attachments
|
||||
attachment-output-directory: /ted.europe/mail-attachments
|
||||
# Enable/disable MIME file input processing
|
||||
mime-input-enabled: true
|
||||
# Input directory for MIME files (.eml)
|
||||
mime-input-directory: /ted.europe/mime-input
|
||||
# File pattern for MIME files (regex)
|
||||
mime-input-pattern: .*\\.eml
|
||||
# Polling interval for MIME input directory (milliseconds)
|
||||
mime-input-poll-interval: 1000000
|
||||
# solution brief processing
|
||||
solution-brief:
|
||||
# Enable/disable Solution Brief processing
|
||||
enabled: false
|
||||
# Input directory for Solution Brief PDF files
|
||||
input-directory: C:/work/SolutionBrief
|
||||
# Output directory for Excel result files (relative to input or absolute)
|
||||
result-directory: ./result
|
||||
# Number of top similar documents to include
|
||||
top-k: 20
|
||||
# Minimum similarity threshold (0.0-1.0)
|
||||
similarity-threshold: 0.5
|
||||
# Polling interval in milliseconds (30 seconds)
|
||||
poll-interval: 30000
|
||||
# File pattern for PDF files (regex)
|
||||
file-pattern: .*\\.pdf
|
||||
# Process files only once (idempotent)
|
||||
idempotent: true
|
||||
# Idempotent repository file path
|
||||
idempotent-repository: ./solution-brief-processed.dat
|
||||
|
||||
# Data cleanup configuration
|
||||
cleanup:
|
||||
# Enable automatic cleanup of old documents
|
||||
enabled: false
|
||||
# Retention period in years (default: 10)
|
||||
retention-years: 10
|
||||
# Cron expression for cleanup schedule (default: daily at 2 AM)
|
||||
cron: "0 0 2 * * *"
|
||||
|
||||
# Actuator endpoints
|
||||
management:
|
||||
|
|
|
|||
Loading…
Reference in New Issue