diff --git a/docs/architecture/TED_PACKAGE_NEW_IMPORT.md b/docs/architecture/TED_PACKAGE_NEW_IMPORT.md new file mode 100644 index 0000000..de0a1d8 --- /dev/null +++ b/docs/architecture/TED_PACKAGE_NEW_IMPORT.md @@ -0,0 +1,20 @@ +# NEW TED package import route + +This patch adds a NEW-runtime TED package download path that: + +- reuses the proven package sequencing rules +- stores package tracking in `TedDailyPackage` +- downloads the package tar.gz +- ingests it only through `DocumentIngestionGateway` +- never calls the legacy XML batch processing / vectorization flow + +## Added classes + +- `TedPackageSequenceService` +- `DefaultTedPackageSequenceService` +- `TedPackageDownloadNewProperties` +- `TedPackageDownloadNewRoute` + +## Config + +Use the `dip.ingestion.ted-download.*` block in `application-new.yml`. diff --git a/src/main/java/at/procon/dip/domain/ted/service/DefaultTedPackageSequenceService.java b/src/main/java/at/procon/dip/domain/ted/service/DefaultTedPackageSequenceService.java new file mode 100644 index 0000000..19ed138 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/ted/service/DefaultTedPackageSequenceService.java @@ -0,0 +1,189 @@ +package at.procon.dip.domain.ted.service; + +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import at.procon.dip.ingestion.config.TedPackageDownloadProperties; +import at.procon.ted.model.entity.TedDailyPackage; +import at.procon.ted.repository.TedDailyPackageRepository; +import java.time.Duration; +import java.time.LocalDate; +import java.time.OffsetDateTime; +import java.time.Year; +import java.time.ZoneOffset; +import java.util.Optional; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; + +/** + * NEW-runtime implementation of TED package sequencing. + *

+ * This reuses the same decision rules as the legacy TED package downloader: + *

+ */ +@Service +@ConditionalOnRuntimeMode(RuntimeMode.NEW) +@RequiredArgsConstructor +@Slf4j +public class DefaultTedPackageSequenceService implements TedPackageSequenceService { + + private final TedPackageDownloadProperties properties; + private final TedDailyPackageRepository packageRepository; + + @Override + public PackageInfo getNextPackageToDownload() { + int currentYear = Year.now().getValue(); + + log.debug("Determining next TED package to download for NEW runtime (current year: {})", currentYear); + + // 1) Current year forward crawling first (newest data first) + PackageInfo nextInCurrentYear = getNextForwardPackage(currentYear); + if (nextInCurrentYear != null) { + log.info("Next TED package: {} (current year {} forward)", nextInCurrentYear.identifier(), currentYear); + return nextInCurrentYear; + } + + // 2) Walk all years backward and fill gaps / continue unfinished years + for (int year = currentYear; year >= properties.getStartYear(); year--) { + PackageInfo gapFiller = getGapFillerPackage(year); + if (gapFiller != null) { + log.info("Next TED package: {} (filling gap in year {})", gapFiller.identifier(), year); + return gapFiller; + } + + if (!isYearComplete(year)) { + PackageInfo forwardPackage = getNextForwardPackage(year); + if (forwardPackage != null) { + log.info("Next TED package: {} (continuing year {})", forwardPackage.identifier(), year); + return forwardPackage; + } + } else { + log.debug("TED package year {} is complete", year); + } + } + + // 3) Open a new older year if possible + int oldestYear = getOldestYearWithData(); + if (oldestYear > properties.getStartYear()) { + int previousYear = oldestYear - 1; + if (previousYear >= properties.getStartYear()) { + PackageInfo first = new PackageInfo(previousYear, 1); + log.info("Next TED package: {} (opening year {})", first.identifier(), previousYear); + return first; + } + } + + log.info("All TED package years from {} to {} appear complete - nothing to download", + properties.getStartYear(), currentYear); + return null; + } + + private PackageInfo getNextForwardPackage(int year) { + Optional latest = packageRepository.findLatestByYear(year); + + if (latest.isEmpty()) { + return new PackageInfo(year, 1); + } + + TedDailyPackage latestPackage = latest.get(); + + if (latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND) { + if (shouldRetryNotFoundPackage(latestPackage)) { + return new PackageInfo(year, latestPackage.getSerialNumber()); + } + + if (isNotFoundRetryableForYear(latestPackage)) { + log.debug("Year {} still inside NOT_FOUND retry window for package {} until {}", + year, latestPackage.getPackageIdentifier(), calculateNextRetryAt(latestPackage)); + return null; + } + + log.debug("Year {} finalized after grace period at tail package {}", year, latestPackage.getPackageIdentifier()); + return null; + } + + return new PackageInfo(year, latestPackage.getSerialNumber() + 1); + } + + private PackageInfo getGapFillerPackage(int year) { + Optional first = packageRepository.findFirstByYear(year); + + if (first.isEmpty()) { + return null; + } + + int minSerial = first.get().getSerialNumber(); + if (minSerial <= 1) { + return null; + } + + return new PackageInfo(year, minSerial - 1); + } + + private boolean isYearComplete(int year) { + Optional first = packageRepository.findFirstByYear(year); + Optional latest = packageRepository.findLatestByYear(year); + + if (first.isEmpty() || latest.isEmpty()) { + return false; + } + + if (first.get().getSerialNumber() != 1) { + return false; + } + + TedDailyPackage latestPackage = latest.get(); + return latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND + && !isNotFoundRetryableForYear(latestPackage); + } + + private boolean shouldRetryNotFoundPackage(TedDailyPackage pkg) { + if (!isNotFoundRetryableForYear(pkg)) { + return false; + } + + OffsetDateTime nextRetryAt = calculateNextRetryAt(pkg); + return !nextRetryAt.isAfter(OffsetDateTime.now()); + } + + private boolean isNotFoundRetryableForYear(TedDailyPackage pkg) { + int currentYear = Year.now().getValue(); + int packageYear = pkg.getYear() != null ? pkg.getYear() : currentYear; + + if (packageYear >= currentYear) { + return properties.isRetryCurrentYearNotFoundIndefinitely(); + } + + return OffsetDateTime.now().isBefore(getYearRetryGraceDeadline(packageYear)); + } + + private OffsetDateTime calculateNextRetryAt(TedDailyPackage pkg) { + OffsetDateTime lastAttemptAt = pkg.getUpdatedAt() != null + ? pkg.getUpdatedAt() + : (pkg.getCreatedAt() != null ? pkg.getCreatedAt() : OffsetDateTime.now()); + + return lastAttemptAt.plus(Duration.ofMillis(properties.getNotFoundRetryInterval())); + } + + private OffsetDateTime getYearRetryGraceDeadline(int year) { + return LocalDate.of(year + 1, 1, 1) + .atStartOfDay() + .atOffset(ZoneOffset.UTC) + .plusDays(properties.getPreviousYearGracePeriodDays()); + } + + private int getOldestYearWithData() { + int currentYear = Year.now().getValue(); + for (int year = properties.getStartYear(); year <= currentYear; year++) { + if (packageRepository.findLatestByYear(year).isPresent()) { + return year; + } + } + return currentYear; + } +} diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedPackageSequenceService.java b/src/main/java/at/procon/dip/domain/ted/service/TedPackageSequenceService.java new file mode 100644 index 0000000..2ece585 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/ted/service/TedPackageSequenceService.java @@ -0,0 +1,25 @@ +package at.procon.dip.domain.ted.service; + +/** + * Shared package sequencing contract used to determine the next TED daily package to download. + *

+ * This service encapsulates the proven sequencing rules from the legacy download implementation + * so they can also be used by the NEW runtime without depending on the old route/service graph. + */ +public interface TedPackageSequenceService { + + /** + * Returns the next package to download according to the current sequencing strategy, + * or {@code null} if nothing should be downloaded right now. + */ + PackageInfo getNextPackageToDownload(); + + /** + * Simple year/serial pair with TED package identifier helper. + */ + record PackageInfo(int year, int serialNumber) { + public String identifier() { + return "%04d%05d".formatted(year, serialNumber); + } + } +} diff --git a/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java index 91025ec..9592740 100644 --- a/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java +++ b/src/main/java/at/procon/dip/ingestion/adapter/FileSystemDocumentIngestionAdapter.java @@ -6,11 +6,15 @@ import at.procon.dip.ingestion.spi.DocumentIngestionAdapter; import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.SourceDescriptor; import java.util.List; + +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Component; @Component @RequiredArgsConstructor +@ConditionalOnRuntimeMode(RuntimeMode.NEW) public class FileSystemDocumentIngestionAdapter implements DocumentIngestionAdapter { private final GenericDocumentImportService importService; diff --git a/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java index 39088ed..93a20a6 100644 --- a/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java +++ b/src/main/java/at/procon/dip/ingestion/adapter/InlineContentDocumentIngestionAdapter.java @@ -7,11 +7,15 @@ import at.procon.dip.ingestion.spi.DocumentIngestionAdapter; import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.SourceDescriptor; import java.util.List; + +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Component; @Component @RequiredArgsConstructor +@ConditionalOnRuntimeMode(RuntimeMode.NEW) public class InlineContentDocumentIngestionAdapter implements DocumentIngestionAdapter { private final GenericDocumentImportService importService; diff --git a/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java index 45429fa..77dc7b3 100644 --- a/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java +++ b/src/main/java/at/procon/dip/ingestion/adapter/TedPackageDocumentIngestionAdapter.java @@ -1,6 +1,8 @@ package at.procon.dip.ingestion.adapter; import at.procon.dip.domain.access.DocumentAccessContext; +import at.procon.dip.domain.document.CanonicalDocumentMetadata; +import at.procon.dip.domain.document.SourceType; import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.service.GenericDocumentImportService; import at.procon.dip.ingestion.service.TedPackageChildImportProcessor; @@ -22,6 +24,7 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.stereotype.Component; import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; @@ -40,7 +43,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap @Override public boolean supports(SourceDescriptor sourceDescriptor) { - return sourceDescriptor.sourceType() == at.procon.dip.domain.document.SourceType.TED_PACKAGE + return sourceDescriptor.sourceType() == SourceType.TED_PACKAGE && properties.isEnabled() && properties.isTedPackageAdapterEnabled(); } @@ -58,7 +61,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap ImportedDocumentResult packageDocument = importService.importDocument(new SourceDescriptor( sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(), - at.procon.dip.domain.document.SourceType.TED_PACKAGE, + SourceType.TED_PACKAGE, sourceDescriptor.sourceIdentifier(), packageRootSource.sourceUri(), sourceDescriptor.fileName(), @@ -71,7 +74,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap )); List warnings = new ArrayList<>(packageDocument.warnings()); - List documents = new ArrayList<>(); + List documents = new ArrayList<>(); documents.add(packageDocument.document().toCanonicalMetadata()); AtomicInteger sortOrder = new AtomicInteger(); diff --git a/src/main/java/at/procon/dip/ingestion/camel/TedPackageDownloadRoute.java b/src/main/java/at/procon/dip/ingestion/camel/TedPackageDownloadRoute.java new file mode 100644 index 0000000..b579613 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/camel/TedPackageDownloadRoute.java @@ -0,0 +1,328 @@ +package at.procon.dip.ingestion.camel; + +import at.procon.dip.domain.document.SourceType; +import at.procon.dip.ingestion.config.TedPackageDownloadProperties; +import at.procon.dip.ingestion.service.DocumentIngestionGateway; +import at.procon.dip.ingestion.spi.IngestionResult; +import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import at.procon.ted.model.entity.TedDailyPackage; +import at.procon.ted.repository.TedDailyPackageRepository; +import at.procon.dip.domain.ted.service.TedPackageSequenceService; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.security.MessageDigest; +import java.time.OffsetDateTime; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.apache.camel.Exchange; +import org.apache.camel.LoggingLevel; +import org.apache.camel.builder.RouteBuilder; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Component; + +/** + * NEW-runtime TED daily package download route. + *

+ * Reuses the proven package sequencing rules through {@link TedPackageSequenceService}, + * but hands off processing only to the NEW ingestion gateway. No legacy XML batch persistence, + * no legacy vectorization route, no old semantic path. + */ +@Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) +@ConditionalOnProperty(name = "dip.ingestion.ted-download.enabled", havingValue = "true") +@RequiredArgsConstructor +@Slf4j +public class TedPackageDownloadRoute extends RouteBuilder { + + private static final String ROUTE_ID_SCHEDULER = "ted-package-new-scheduler"; + private static final String ROUTE_ID_DOWNLOADER = "ted-package-new-downloader"; + private static final String ROUTE_ID_ERROR = "ted-package-new-error-handler"; + + private final TedPackageDownloadProperties properties; + private final TedDailyPackageRepository packageRepository; + private final TedPackageSequenceService sequenceService; + private final DocumentIngestionGateway documentIngestionGateway; + + @Override + public void configure() { + errorHandler(deadLetterChannel("direct:ted-package-new-error") + .maximumRedeliveries(3) + .redeliveryDelay(10_000) + .retryAttemptedLogLevel(LoggingLevel.WARN) + .logStackTrace(true)); + + from("direct:ted-package-new-error") + .routeId(ROUTE_ID_ERROR) + .process(this::handleError); + + from("timer:ted-package-new-scheduler?period={{dip.ingestion.ted-download.poll-interval:3600000}}&delay=0") + .routeId(ROUTE_ID_SCHEDULER) + .process(this::checkRunningPackages) + .choice() + .when(header("tooManyRunning").isEqualTo(true)) + .log(LoggingLevel.INFO, "Skipping NEW TED package download - already ${header.runningCount} packages in progress") + .otherwise() + .process(this::determineNextPackage) + .choice() + .when(header("packageId").isNotNull()) + .to("direct:download-ted-package-new") + .otherwise() + .log(LoggingLevel.INFO, "No NEW TED package to download right now") + .end() + .end(); + + from("direct:download-ted-package-new") + .routeId(ROUTE_ID_DOWNLOADER) + .log(LoggingLevel.INFO, "NEW TED package download started: ${header.packageId}") + .setHeader("downloadStartTime", constant(System.currentTimeMillis())) + .process(this::createPackageRecord) + .delay(simple("{{dip.ingestion.ted-download.delay-between-downloads:5000}}")) + .setHeader(Exchange.HTTP_METHOD, constant("GET")) + .setHeader("CamelHttpConnectionClose", constant(true)) + .toD("${header.downloadUrl}?bridgeEndpoint=true&throwExceptionOnFailure=false&socketTimeout={{dip.ingestion.ted-download.download-timeout:300000}}") + .choice() + .when(header(Exchange.HTTP_RESPONSE_CODE).isEqualTo(200)) + .process(this::calculateHash) + .process(this::checkDuplicateByHash) + .choice() + .when(header("isDuplicate").isEqualTo(true)) + .process(this::markDuplicate) + .otherwise() + .process(this::saveDownloadedPackage) + .process(this::ingestThroughGateway) + .process(this::markCompleted) + .endChoice() + .when(header(Exchange.HTTP_RESPONSE_CODE).isEqualTo(404)) + .process(this::markNotFound) + .otherwise() + .process(this::markFailed) + .end(); + } + + private void checkRunningPackages(Exchange exchange) { + long downloadingCount = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING).size(); + long processingCount = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.PROCESSING).size(); + long runningCount = downloadingCount + processingCount; + + exchange.getIn().setHeader("runningCount", runningCount); + exchange.getIn().setHeader("tooManyRunning", runningCount >= properties.getMaxRunningPackages()); + + if (runningCount > 0) { + log.info("Currently {} TED packages in progress in NEW runtime ({} downloading, {} processing)", + runningCount, downloadingCount, processingCount); + } + } + + private void determineNextPackage(Exchange exchange) { + List pendingPackages = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.PENDING); + + if (!pendingPackages.isEmpty()) { + TedDailyPackage pkg = pendingPackages.get(0); + log.info("Retrying PENDING TED package in NEW runtime: {}", pkg.getPackageIdentifier()); + setPackageHeaders(exchange, pkg.getYear(), pkg.getSerialNumber()); + return; + } + + TedPackageSequenceService.PackageInfo packageInfo = sequenceService.getNextPackageToDownload(); + if (packageInfo == null) { + exchange.getIn().setHeader("packageId", null); + return; + } + + setPackageHeaders(exchange, packageInfo.year(), packageInfo.serialNumber()); + } + + private void setPackageHeaders(Exchange exchange, int year, int serialNumber) { + String packageId = "%04d%05d".formatted(year, serialNumber); + String downloadUrl = properties.getBaseUrl() + packageId; + + exchange.getIn().setHeader("packageId", packageId); + exchange.getIn().setHeader("year", year); + exchange.getIn().setHeader("serialNumber", serialNumber); + exchange.getIn().setHeader("downloadUrl", downloadUrl); + } + + private void createPackageRecord(Exchange exchange) { + String packageId = exchange.getIn().getHeader("packageId", String.class); + Integer year = exchange.getIn().getHeader("year", Integer.class); + Integer serialNumber = exchange.getIn().getHeader("serialNumber", Integer.class); + String downloadUrl = exchange.getIn().getHeader("downloadUrl", String.class); + + if (packageRepository.existsByPackageIdentifier(packageId)) { + return; + } + + TedDailyPackage pkg = TedDailyPackage.builder() + .packageIdentifier(packageId) + .year(year) + .serialNumber(serialNumber) + .downloadUrl(downloadUrl) + .downloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING) + .build(); + + packageRepository.save(pkg); + } + + private void calculateHash(Exchange exchange) throws Exception { + byte[] body = exchange.getIn().getBody(byte[].class); + MessageDigest digest = MessageDigest.getInstance("SHA-256"); + byte[] hashBytes = digest.digest(body); + + StringBuilder sb = new StringBuilder(); + for (byte b : hashBytes) { + sb.append(String.format("%02x", b)); + } + + exchange.getIn().setHeader("fileHash", sb.toString()); + } + + private void checkDuplicateByHash(Exchange exchange) { + String hash = exchange.getIn().getHeader("fileHash", String.class); + + Optional duplicate = packageRepository.findAll().stream() + .filter(p -> hash.equals(p.getFileHash())) + .findFirst(); + + exchange.getIn().setHeader("isDuplicate", duplicate.isPresent()); + duplicate.ifPresent(pkg -> exchange.getIn().setHeader("duplicateOf", pkg.getPackageIdentifier())); + } + + private void saveDownloadedPackage(Exchange exchange) throws Exception { + String packageId = exchange.getIn().getHeader("packageId", String.class); + String hash = exchange.getIn().getHeader("fileHash", String.class); + byte[] body = exchange.getIn().getBody(byte[].class); + + Path downloadDir = Paths.get(properties.getDownloadDirectory()); + Files.createDirectories(downloadDir); + Path downloadPath = downloadDir.resolve(packageId + ".tar.gz"); + Files.write(downloadPath, body); + + long downloadDuration = System.currentTimeMillis() - + exchange.getIn().getHeader("downloadStartTime", Long.class); + + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + pkg.setFileHash(hash); + pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADED); + pkg.setDownloadedAt(OffsetDateTime.now()); + pkg.setDownloadDurationMs(downloadDuration); + packageRepository.save(pkg); + }); + + exchange.getIn().setHeader("downloadPath", downloadPath.toString()); + } + + private void ingestThroughGateway(Exchange exchange) { + String packageId = exchange.getIn().getHeader("packageId", String.class); + String downloadPath = exchange.getIn().getHeader("downloadPath", String.class); + + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.PROCESSING); + packageRepository.save(pkg); + }); + + IngestionResult ingestionResult = documentIngestionGateway.ingest(new SourceDescriptor( + null, + SourceType.TED_PACKAGE, + packageId, + downloadPath, + packageId + ".tar.gz", + "application/gzip", + null, + null, + OffsetDateTime.now(), + OriginalContentStoragePolicy.DEFAULT, + Map.of( + "packageId", packageId, + "title", packageId + ".tar.gz" + ) + )); + + int importedChildCount = Math.max(0, ingestionResult.documents().size() - 1); + exchange.getIn().setHeader("gatewayImportedChildCount", importedChildCount); + exchange.getIn().setHeader("gatewayImportWarnings", ingestionResult.warnings().size()); + + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + pkg.setXmlFileCount(importedChildCount); + pkg.setProcessedCount(importedChildCount); + pkg.setFailedCount(0); + packageRepository.save(pkg); + }); + } + + private void markCompleted(Exchange exchange) throws Exception { + String packageId = exchange.getIn().getHeader("packageId", String.class); + String downloadPath = exchange.getIn().getHeader("downloadPath", String.class); + + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.COMPLETED); + pkg.setProcessedAt(OffsetDateTime.now()); + if (pkg.getDownloadedAt() != null) { + long processingDuration = Math.max(0L, + java.time.Duration.between(pkg.getDownloadedAt(), OffsetDateTime.now()).toMillis()); + pkg.setProcessingDurationMs(processingDuration); + } + packageRepository.save(pkg); + }); + + if (properties.isDeleteAfterIngestion() && downloadPath != null) { + Files.deleteIfExists(Path.of(downloadPath)); + } + + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + long totalDuration = (pkg.getDownloadDurationMs() != null ? pkg.getDownloadDurationMs() : 0L) + + (pkg.getProcessingDurationMs() != null ? pkg.getProcessingDurationMs() : 0L); + log.info("NEW TED package {} completed: xmlCount={}, processed={}, failed={}, totalDuration={}ms", + packageId, pkg.getXmlFileCount(), pkg.getProcessedCount(), pkg.getFailedCount(), totalDuration); + }); + } + + private void markNotFound(Exchange exchange) { + String packageId = exchange.getIn().getHeader("packageId", String.class); + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.NOT_FOUND); + pkg.setErrorMessage("Package not found (404)"); + packageRepository.save(pkg); + }); + } + + private void markFailed(Exchange exchange) { + String packageId = exchange.getIn().getHeader("packageId", String.class); + Integer httpCode = exchange.getIn().getHeader(Exchange.HTTP_RESPONSE_CODE, Integer.class); + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.FAILED); + pkg.setErrorMessage("HTTP " + httpCode); + packageRepository.save(pkg); + }); + } + + private void markDuplicate(Exchange exchange) { + String packageId = exchange.getIn().getHeader("packageId", String.class); + String duplicateOf = exchange.getIn().getHeader("duplicateOf", String.class); + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.COMPLETED); + pkg.setErrorMessage("Duplicate of " + duplicateOf); + pkg.setProcessedAt(OffsetDateTime.now()); + packageRepository.save(pkg); + }); + } + + private void handleError(Exchange exchange) { + Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class); + String packageId = exchange.getIn().getHeader("packageId", String.class); + + if (packageId != null) { + packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> { + pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.FAILED); + pkg.setErrorMessage(exception != null ? exception.getMessage() : "Unknown route error"); + packageRepository.save(pkg); + }); + } + } +} diff --git a/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java b/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java index 4986c4e..40d29ec 100644 --- a/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java +++ b/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java @@ -1,6 +1,8 @@ package at.procon.dip.ingestion.config; import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import jakarta.validation.constraints.NotBlank; import jakarta.validation.constraints.Positive; import lombok.Data; diff --git a/src/main/java/at/procon/dip/ingestion/config/TedPackageDownloadProperties.java b/src/main/java/at/procon/dip/ingestion/config/TedPackageDownloadProperties.java new file mode 100644 index 0000000..fb68de2 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/config/TedPackageDownloadProperties.java @@ -0,0 +1,52 @@ +package at.procon.dip.ingestion.config; + +import jakarta.validation.constraints.Min; +import jakarta.validation.constraints.NotBlank; +import jakarta.validation.constraints.Positive; +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Configuration; + +/** + * NEW-runtime TED package download configuration. + *

+ * This is intentionally separate from the legacy {@code ted.download.*} tree. + */ +@Configuration +@ConfigurationProperties(prefix = "dip.ingestion.ted-download") +@Data +public class TedPackageDownloadProperties { + + private boolean enabled = false; + + @NotBlank + private String baseUrl = "https://ted.europa.eu/packages/daily/"; + + @NotBlank + private String downloadDirectory = "/ted.europe/downloads-new"; + + @Positive + private int startYear = 2015; + + @Positive + private long pollInterval = 3_600_000L; + + @Positive + private long notFoundRetryInterval = 21_600_000L; + + @Min(0) + private int previousYearGracePeriodDays = 30; + + private boolean retryCurrentYearNotFoundIndefinitely = true; + + @Positive + private long downloadTimeout = 300_000L; + + @Positive + private int maxRunningPackages = 2; + + @Positive + private long delayBetweenDownloads = 5_000L; + + private boolean deleteAfterIngestion = true; +} diff --git a/src/main/java/at/procon/ted/config/LegacyTedProperties.java b/src/main/java/at/procon/ted/config/LegacyTedProperties.java deleted file mode 100644 index b06e2ce..0000000 --- a/src/main/java/at/procon/ted/config/LegacyTedProperties.java +++ /dev/null @@ -1,16 +0,0 @@ -package at.procon.ted.config; - -import org.springframework.boot.context.properties.ConfigurationProperties; -import org.springframework.context.annotation.Configuration; - -/** - * Patch A scaffold for the legacy runtime configuration tree. - * - * The legacy runtime still uses {@link TedProcessorProperties} today. This class is - * introduced so the old configuration can be moved gradually from `ted.*` to - * `legacy.ted.*` without blocking the runtime split. - */ -@Configuration -@ConfigurationProperties(prefix = "legacy.ted") -public class LegacyTedProperties extends TedProcessorProperties { -} diff --git a/src/main/java/at/procon/ted/controller/DocumentController.java b/src/main/java/at/procon/ted/controller/DocumentController.java index a7f19c2..7f0647a 100644 --- a/src/main/java/at/procon/ted/controller/DocumentController.java +++ b/src/main/java/at/procon/ted/controller/DocumentController.java @@ -1,5 +1,7 @@ package at.procon.ted.controller; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import at.procon.ted.model.dto.DocumentDtos.*; import at.procon.ted.model.entity.ContractNature; import at.procon.ted.model.entity.NoticeType; @@ -38,6 +40,7 @@ import java.util.UUID; @RequestMapping("/v1/documents") @RequiredArgsConstructor @Slf4j +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @Tag(name = "Documents", description = "TED Procurement Document Search API") public class DocumentController { diff --git a/src/main/java/at/procon/ted/controller/SimilaritySearchController.java b/src/main/java/at/procon/ted/controller/SimilaritySearchController.java index 5ce2e16..862e2cb 100644 --- a/src/main/java/at/procon/ted/controller/SimilaritySearchController.java +++ b/src/main/java/at/procon/ted/controller/SimilaritySearchController.java @@ -1,5 +1,7 @@ package at.procon.ted.controller; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import at.procon.ted.service.SimilaritySearchService; import at.procon.ted.service.SimilaritySearchService.SimilaritySearchResponse; import io.swagger.v3.oas.annotations.Operation; @@ -28,6 +30,7 @@ import java.io.IOException; @RequestMapping("/similarity") @RequiredArgsConstructor @Slf4j +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) @Tag(name = "Similarity Search", description = "Vector-based semantic similarity search on TED procurement documents") public class SimilaritySearchController { diff --git a/src/main/resources/application-legacy.yml b/src/main/resources/application-legacy.yml index 23a314f..1b13171 100644 --- a/src/main/resources/application-legacy.yml +++ b/src/main/resources/application-legacy.yml @@ -1,11 +1,152 @@ -spring: - config: - activate: - on-profile: legacy -dip: - runtime: - mode: LEGACY +# Legacy / shared application properties +# New-runtime-only properties are moved to application-new.yml. +ted: + # Directory configuration for file processing + input: + # Base directory for watching incoming TED XML files + directory: ${TED_INPUT_DIR:/ted.europe/extracted} + # File pattern to match (recursive scanning) + pattern: "**/*.xml" + # Move processed files to this directory + processed-directory: ${TED_PROCESSED_DIR:.processed} + # Move failed files to this directory + error-directory: ${TED_ERROR_DIR:.error} + # Polling interval in milliseconds + poll-interval: 5000 + # Maximum messages per poll (reduced to prevent memory issues) + max-messages-per-poll: 10 + + # Schema validation configuration + schema: + # Enable/disable XSD validation + enabled: true + # Path to eForms SDK schemas (from Maven dependency or custom location) + path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd + + # Vectorization configuration + vectorization: + # Enable/disable async vectorization + enabled: false + # Use external HTTP API instead of subprocess + use-http-api: true + # Embedding service URL + api-url: http://172.20.240.18:8001 + # Model name for sentence-transformers + model-name: intfloat/multilingual-e5-large + # Vector dimensions (must match model output) + dimensions: 1024 + # Batch size for vectorization + batch-size: 16 + # Thread pool size for async processing + thread-pool-size: 4 + # Maximum text length for vectorization (characters) + max-text-length: 8192 + # HTTP connection timeout (milliseconds) + connect-timeout: 10000 + # HTTP socket/read timeout (milliseconds) + socket-timeout: 60000 + # Maximum retries on connection failure + max-retries: 5 + # Packages download configuration + download: + # Enable/disable automatic package download + enabled: false + # User service-based camel route + use-service-based: false + # Base URL for TED Daily Packages + base-url: https://ted.europa.eu/packages/daily/ + # Download directory for tar.gz files + download-directory: /ted.europe/downloads + # Extract directory for XML files + extract-directory: /ted.europe/extracted + # Start year for downloads + start-year: 2026 + # Max consecutive 404 errors before stopping + max-consecutive-404: 4 + # Polling interval (milliseconds) - 2 minutes + poll-interval: 300000 + # Retry interval for tail NOT_FOUND packages - 6 hours + not-found-retry-interval: 21600000 + # Grace period after year end before a previous-year tail 404 is treated as final + previous-year-grace-period-days: 30 + # Keep retrying current-year tail 404 packages indefinitely + retry-current-year-not-found-indefinitely: true + # Download timeout (milliseconds) - 5 minutes + download-timeout: 300000 + # Max concurrent downloads + max-concurrent-downloads: 2 + # Delay between downloads (milliseconds) for rate limiting - 5 seconds + delay-between-downloads: 3000 + # Delete tar.gz after extraction + delete-after-extraction: true + # Prioritize current year first + prioritize-current-year: false + # IMAP Mail configuration + mail: + # Enable/disable mail processing + enabled: false + # IMAP server hostname + host: mail.mymagenta.business + # IMAP server port (993 for IMAPS) + port: 993 + # Mail account username (email address) + username: archiv@procon.co.at + # Mail account password + password: ${MAIL_PASSWORD:worasigg} + # Use SSL/TLS connection + ssl: true + # Mail folder to read from + folder-name: INBOX + # Delete messages after processing + delete: false + # Mark messages as seen after processing (false = peek mode, don't mark as read) + seen: false + # Only process unseen messages + unseen: true + # Polling delay in milliseconds (1 minute) + delay: 60000 + # Max messages per poll + max-messages-per-poll: 100 + # Output directory for processed attachments + attachment-output-directory: /ted.europe/mail-attachments + # Enable/disable MIME file input processing + mime-input-enabled: true + # Input directory for MIME files (.eml) + mime-input-directory: /ted.europe/mime-input + # File pattern for MIME files (regex) + mime-input-pattern: .*\\.eml + # Polling interval for MIME input directory (milliseconds) + mime-input-poll-interval: 1000000 + # solution brief processing + solution-brief: + # Enable/disable Solution Brief processing + enabled: false + # Input directory for Solution Brief PDF files + input-directory: C:/work/SolutionBrief + # Output directory for Excel result files (relative to input or absolute) + result-directory: ./result + # Number of top similar documents to include + top-k: 20 + # Minimum similarity threshold (0.0-1.0) + similarity-threshold: 0.5 + # Polling interval in milliseconds (30 seconds) + poll-interval: 30000 + # File pattern for PDF files (regex) + file-pattern: .*\\.pdf + # Process files only once (idempotent) + idempotent: true + # Idempotent repository file path + idempotent-repository: ./solution-brief-processed.dat + + # Data cleanup configuration + cleanup: + # Enable automatic cleanup of old documents + enabled: false + # Retention period in years (default: 10) + retention-years: 10 + # Cron expression for cleanup schedule (default: daily at 2 AM) + cron: "0 0 2 * * *" # Legacy runtime uses the existing ted.* property tree. # Move old route/download/mail/vectorization/search settings here over time. diff --git a/src/main/resources/application-new.yml b/src/main/resources/application-new.yml index ab6ee21..f7439ab 100644 --- a/src/main/resources/application-new.yml +++ b/src/main/resources/application-new.yml @@ -1,16 +1,6 @@ -# New runtime overrides -# Activate with: --spring.profiles.active=new - -# Optional explicit marker; file is profile-specific already -spring: - config: - activate: - on-profile: new - dip: runtime: mode: NEW - search: # Default page size for search results default-page-size: 20 @@ -44,7 +34,6 @@ dip: startup-lexical-backfill-limit: 500 # Number of top hits per engine returned by /search/debug debug-top-hits-per-engine: 10 - embedding: enabled: true default-document-model: e5-default @@ -75,7 +64,6 @@ dip: active: true jobs: enabled: true - # Phase 4 generic ingestion configuration ingestion: # Master switch for arbitrary document ingestion into the DOC model @@ -132,6 +120,32 @@ dip: # Import batch marker for mail roots and attachments mail-import-batch-id: phase41-mail + # ted packages download configuration + ted-download: + # Enable/disable automatic package download + enabled: true + # Base URL for TED Daily Packages + base-url: https://ted.europa.eu/packages/daily/ + # Download directory for tar.gz files + download-directory: /ted.europe/downloads-new + # Start year for downloads + start-year: 2026 + # Polling interval (milliseconds) - 2 minutes + poll-interval: 3600000 + # Retry interval for tail NOT_FOUND packages - 6 hours + not-found-retry-interval: 21600000 + # Grace period after year end before a previous-year tail 404 is treated as final + previous-year-grace-period-days: 30 + # Keep retrying current-year tail 404 packages indefinitely + retry-current-year-not-found-indefinitely: true + # Download timeout (milliseconds) - 5 minutes + download-timeout: 300000 + # Max concurrent downloads + max-running-packages: 2 + # Delay between downloads (milliseconds) for rate limiting - 5 seconds + delay-between-downloads: 5000 + # Delete tar.gz after ingestion + delete-after-ingestion: true ted: # Phase 3 TED projection configuration projection: # Enable/disable dual-write into the TED projection model on top of DOC.doc_document diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 283c87d..1581780 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -7,6 +7,9 @@ server: context-path: /api spring: + profiles: + active: new + application: name: document-intelligence-platform @@ -56,161 +59,6 @@ camel: enabled: true # Weniger strenge Health-Checks für File-Consumer consumers-enabled: false - -# Default runtime mode: legacy / initial implementation -# Activate profile 'new' to load application-new.yml and switch to the new runtime. -dip: - runtime: - mode: LEGACY - -# Legacy / shared application properties -# New-runtime-only properties are moved to application-new.yml. -ted: - # Directory configuration for file processing - input: - # Base directory for watching incoming TED XML files - directory: ${TED_INPUT_DIR:/ted.europe/extracted} - # File pattern to match (recursive scanning) - pattern: "**/*.xml" - # Move processed files to this directory - processed-directory: ${TED_PROCESSED_DIR:.processed} - # Move failed files to this directory - error-directory: ${TED_ERROR_DIR:.error} - # Polling interval in milliseconds - poll-interval: 5000 - # Maximum messages per poll (reduced to prevent memory issues) - max-messages-per-poll: 10 - - # Schema validation configuration - schema: - # Enable/disable XSD validation - enabled: true - # Path to eForms SDK schemas (from Maven dependency or custom location) - path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd - - # Vectorization configuration - vectorization: - # Enable/disable async vectorization - enabled: false - # Use external HTTP API instead of subprocess - use-http-api: true - # Embedding service URL - api-url: http://172.20.240.18:8001 - # Model name for sentence-transformers - model-name: intfloat/multilingual-e5-large - # Vector dimensions (must match model output) - dimensions: 1024 - # Batch size for vectorization - batch-size: 16 - # Thread pool size for async processing - thread-pool-size: 4 - # Maximum text length for vectorization (characters) - max-text-length: 8192 - # HTTP connection timeout (milliseconds) - connect-timeout: 10000 - # HTTP socket/read timeout (milliseconds) - socket-timeout: 60000 - # Maximum retries on connection failure - max-retries: 5 - # Packages download configuration - download: - # Enable/disable automatic package download - enabled: true - # User service-based camel route - use-service-based: false - # Base URL for TED Daily Packages - base-url: https://ted.europa.eu/packages/daily/ - # Download directory for tar.gz files - download-directory: /ted.europe/downloads - # Extract directory for XML files - extract-directory: /ted.europe/extracted - # Start year for downloads - start-year: 2026 - # Max consecutive 404 errors before stopping - max-consecutive-404: 4 - # Polling interval (milliseconds) - 2 minutes - poll-interval: 300000 - # Retry interval for tail NOT_FOUND packages - 6 hours - not-found-retry-interval: 21600000 - # Grace period after year end before a previous-year tail 404 is treated as final - previous-year-grace-period-days: 30 - # Keep retrying current-year tail 404 packages indefinitely - retry-current-year-not-found-indefinitely: true - # Download timeout (milliseconds) - 5 minutes - download-timeout: 300000 - # Max concurrent downloads - max-concurrent-downloads: 2 - # Delay between downloads (milliseconds) for rate limiting - 5 seconds - delay-between-downloads: 3000 - # Delete tar.gz after extraction - delete-after-extraction: true - # Prioritize current year first - prioritize-current-year: false - # IMAP Mail configuration - mail: - # Enable/disable mail processing - enabled: false - # IMAP server hostname - host: mail.mymagenta.business - # IMAP server port (993 for IMAPS) - port: 993 - # Mail account username (email address) - username: archiv@procon.co.at - # Mail account password - password: ${MAIL_PASSWORD:worasigg} - # Use SSL/TLS connection - ssl: true - # Mail folder to read from - folder-name: INBOX - # Delete messages after processing - delete: false - # Mark messages as seen after processing (false = peek mode, don't mark as read) - seen: false - # Only process unseen messages - unseen: true - # Polling delay in milliseconds (1 minute) - delay: 60000 - # Max messages per poll - max-messages-per-poll: 100 - # Output directory for processed attachments - attachment-output-directory: /ted.europe/mail-attachments - # Enable/disable MIME file input processing - mime-input-enabled: true - # Input directory for MIME files (.eml) - mime-input-directory: /ted.europe/mime-input - # File pattern for MIME files (regex) - mime-input-pattern: .*\\.eml - # Polling interval for MIME input directory (milliseconds) - mime-input-poll-interval: 1000000 - # solution brief processing - solution-brief: - # Enable/disable Solution Brief processing - enabled: false - # Input directory for Solution Brief PDF files - input-directory: C:/work/SolutionBrief - # Output directory for Excel result files (relative to input or absolute) - result-directory: ./result - # Number of top similar documents to include - top-k: 20 - # Minimum similarity threshold (0.0-1.0) - similarity-threshold: 0.5 - # Polling interval in milliseconds (30 seconds) - poll-interval: 30000 - # File pattern for PDF files (regex) - file-pattern: .*\\.pdf - # Process files only once (idempotent) - idempotent: true - # Idempotent repository file path - idempotent-repository: ./solution-brief-processed.dat - - # Data cleanup configuration - cleanup: - # Enable automatic cleanup of old documents - enabled: false - # Retention period in years (default: 10) - retention-years: 10 - # Cron expression for cleanup schedule (default: daily at 2 AM) - cron: "0 0 2 * * *" # Actuator endpoints management: