ted package new import

master
trifonovt 4 weeks ago
parent 1aa599b587
commit fbd249e56b

@ -0,0 +1,20 @@
# NEW TED package import route
This patch adds a NEW-runtime TED package download path that:
- reuses the proven package sequencing rules
- stores package tracking in `TedDailyPackage`
- downloads the package tar.gz
- ingests it only through `DocumentIngestionGateway`
- never calls the legacy XML batch processing / vectorization flow
## Added classes
- `TedPackageSequenceService`
- `DefaultTedPackageSequenceService`
- `TedPackageDownloadNewProperties`
- `TedPackageDownloadNewRoute`
## Config
Use the `dip.ingestion.ted-download.*` block in `application-new.yml`.

@ -0,0 +1,189 @@
package at.procon.dip.domain.ted.service;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.dip.ingestion.config.TedPackageDownloadProperties;
import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.TedDailyPackageRepository;
import java.time.Duration;
import java.time.LocalDate;
import java.time.OffsetDateTime;
import java.time.Year;
import java.time.ZoneOffset;
import java.util.Optional;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
/**
* NEW-runtime implementation of TED package sequencing.
* <p>
* This reuses the same decision rules as the legacy TED package downloader:
* <ul>
* <li>current year forward crawling first</li>
* <li>gap filling by walking backward to package 1</li>
* <li>NOT_FOUND retry handling with current-year indefinite retry support</li>
* <li>previous-year grace period before a tail NOT_FOUND becomes final</li>
* </ul>
*/
@Service
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@RequiredArgsConstructor
@Slf4j
public class DefaultTedPackageSequenceService implements TedPackageSequenceService {
private final TedPackageDownloadProperties properties;
private final TedDailyPackageRepository packageRepository;
@Override
public PackageInfo getNextPackageToDownload() {
int currentYear = Year.now().getValue();
log.debug("Determining next TED package to download for NEW runtime (current year: {})", currentYear);
// 1) Current year forward crawling first (newest data first)
PackageInfo nextInCurrentYear = getNextForwardPackage(currentYear);
if (nextInCurrentYear != null) {
log.info("Next TED package: {} (current year {} forward)", nextInCurrentYear.identifier(), currentYear);
return nextInCurrentYear;
}
// 2) Walk all years backward and fill gaps / continue unfinished years
for (int year = currentYear; year >= properties.getStartYear(); year--) {
PackageInfo gapFiller = getGapFillerPackage(year);
if (gapFiller != null) {
log.info("Next TED package: {} (filling gap in year {})", gapFiller.identifier(), year);
return gapFiller;
}
if (!isYearComplete(year)) {
PackageInfo forwardPackage = getNextForwardPackage(year);
if (forwardPackage != null) {
log.info("Next TED package: {} (continuing year {})", forwardPackage.identifier(), year);
return forwardPackage;
}
} else {
log.debug("TED package year {} is complete", year);
}
}
// 3) Open a new older year if possible
int oldestYear = getOldestYearWithData();
if (oldestYear > properties.getStartYear()) {
int previousYear = oldestYear - 1;
if (previousYear >= properties.getStartYear()) {
PackageInfo first = new PackageInfo(previousYear, 1);
log.info("Next TED package: {} (opening year {})", first.identifier(), previousYear);
return first;
}
}
log.info("All TED package years from {} to {} appear complete - nothing to download",
properties.getStartYear(), currentYear);
return null;
}
private PackageInfo getNextForwardPackage(int year) {
Optional<TedDailyPackage> latest = packageRepository.findLatestByYear(year);
if (latest.isEmpty()) {
return new PackageInfo(year, 1);
}
TedDailyPackage latestPackage = latest.get();
if (latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND) {
if (shouldRetryNotFoundPackage(latestPackage)) {
return new PackageInfo(year, latestPackage.getSerialNumber());
}
if (isNotFoundRetryableForYear(latestPackage)) {
log.debug("Year {} still inside NOT_FOUND retry window for package {} until {}",
year, latestPackage.getPackageIdentifier(), calculateNextRetryAt(latestPackage));
return null;
}
log.debug("Year {} finalized after grace period at tail package {}", year, latestPackage.getPackageIdentifier());
return null;
}
return new PackageInfo(year, latestPackage.getSerialNumber() + 1);
}
private PackageInfo getGapFillerPackage(int year) {
Optional<TedDailyPackage> first = packageRepository.findFirstByYear(year);
if (first.isEmpty()) {
return null;
}
int minSerial = first.get().getSerialNumber();
if (minSerial <= 1) {
return null;
}
return new PackageInfo(year, minSerial - 1);
}
private boolean isYearComplete(int year) {
Optional<TedDailyPackage> first = packageRepository.findFirstByYear(year);
Optional<TedDailyPackage> latest = packageRepository.findLatestByYear(year);
if (first.isEmpty() || latest.isEmpty()) {
return false;
}
if (first.get().getSerialNumber() != 1) {
return false;
}
TedDailyPackage latestPackage = latest.get();
return latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND
&& !isNotFoundRetryableForYear(latestPackage);
}
private boolean shouldRetryNotFoundPackage(TedDailyPackage pkg) {
if (!isNotFoundRetryableForYear(pkg)) {
return false;
}
OffsetDateTime nextRetryAt = calculateNextRetryAt(pkg);
return !nextRetryAt.isAfter(OffsetDateTime.now());
}
private boolean isNotFoundRetryableForYear(TedDailyPackage pkg) {
int currentYear = Year.now().getValue();
int packageYear = pkg.getYear() != null ? pkg.getYear() : currentYear;
if (packageYear >= currentYear) {
return properties.isRetryCurrentYearNotFoundIndefinitely();
}
return OffsetDateTime.now().isBefore(getYearRetryGraceDeadline(packageYear));
}
private OffsetDateTime calculateNextRetryAt(TedDailyPackage pkg) {
OffsetDateTime lastAttemptAt = pkg.getUpdatedAt() != null
? pkg.getUpdatedAt()
: (pkg.getCreatedAt() != null ? pkg.getCreatedAt() : OffsetDateTime.now());
return lastAttemptAt.plus(Duration.ofMillis(properties.getNotFoundRetryInterval()));
}
private OffsetDateTime getYearRetryGraceDeadline(int year) {
return LocalDate.of(year + 1, 1, 1)
.atStartOfDay()
.atOffset(ZoneOffset.UTC)
.plusDays(properties.getPreviousYearGracePeriodDays());
}
private int getOldestYearWithData() {
int currentYear = Year.now().getValue();
for (int year = properties.getStartYear(); year <= currentYear; year++) {
if (packageRepository.findLatestByYear(year).isPresent()) {
return year;
}
}
return currentYear;
}
}

@ -0,0 +1,25 @@
package at.procon.dip.domain.ted.service;
/**
* Shared package sequencing contract used to determine the next TED daily package to download.
* <p>
* This service encapsulates the proven sequencing rules from the legacy download implementation
* so they can also be used by the NEW runtime without depending on the old route/service graph.
*/
public interface TedPackageSequenceService {
/**
* Returns the next package to download according to the current sequencing strategy,
* or {@code null} if nothing should be downloaded right now.
*/
PackageInfo getNextPackageToDownload();
/**
* Simple year/serial pair with TED package identifier helper.
*/
record PackageInfo(int year, int serialNumber) {
public String identifier() {
return "%04d%05d".formatted(year, serialNumber);
}
}
}

@ -6,11 +6,15 @@ import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List; import java.util.List;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
@Component @Component
@RequiredArgsConstructor @RequiredArgsConstructor
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
public class FileSystemDocumentIngestionAdapter implements DocumentIngestionAdapter { public class FileSystemDocumentIngestionAdapter implements DocumentIngestionAdapter {
private final GenericDocumentImportService importService; private final GenericDocumentImportService importService;

@ -7,11 +7,15 @@ import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
import at.procon.dip.ingestion.spi.IngestionResult; import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.List; import java.util.List;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
@Component @Component
@RequiredArgsConstructor @RequiredArgsConstructor
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
public class InlineContentDocumentIngestionAdapter implements DocumentIngestionAdapter { public class InlineContentDocumentIngestionAdapter implements DocumentIngestionAdapter {
private final GenericDocumentImportService importService; private final GenericDocumentImportService importService;

@ -1,6 +1,8 @@
package at.procon.dip.ingestion.adapter; package at.procon.dip.ingestion.adapter;
import at.procon.dip.domain.access.DocumentAccessContext; import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.GenericDocumentImportService; import at.procon.dip.ingestion.service.GenericDocumentImportService;
import at.procon.dip.ingestion.service.TedPackageChildImportProcessor; import at.procon.dip.ingestion.service.TedPackageChildImportProcessor;
@ -22,6 +24,7 @@ import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional; import org.springframework.transaction.annotation.Transactional;
@ -40,7 +43,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap
@Override @Override
public boolean supports(SourceDescriptor sourceDescriptor) { public boolean supports(SourceDescriptor sourceDescriptor) {
return sourceDescriptor.sourceType() == at.procon.dip.domain.document.SourceType.TED_PACKAGE return sourceDescriptor.sourceType() == SourceType.TED_PACKAGE
&& properties.isEnabled() && properties.isEnabled()
&& properties.isTedPackageAdapterEnabled(); && properties.isTedPackageAdapterEnabled();
} }
@ -58,7 +61,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap
ImportedDocumentResult packageDocument = importService.importDocument(new SourceDescriptor( ImportedDocumentResult packageDocument = importService.importDocument(new SourceDescriptor(
sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(), sourceDescriptor.accessContext() == null ? DocumentAccessContext.publicDocument() : sourceDescriptor.accessContext(),
at.procon.dip.domain.document.SourceType.TED_PACKAGE, SourceType.TED_PACKAGE,
sourceDescriptor.sourceIdentifier(), sourceDescriptor.sourceIdentifier(),
packageRootSource.sourceUri(), packageRootSource.sourceUri(),
sourceDescriptor.fileName(), sourceDescriptor.fileName(),
@ -71,7 +74,7 @@ public class TedPackageDocumentIngestionAdapter implements DocumentIngestionAdap
)); ));
List<String> warnings = new ArrayList<>(packageDocument.warnings()); List<String> warnings = new ArrayList<>(packageDocument.warnings());
List<at.procon.dip.domain.document.CanonicalDocumentMetadata> documents = new ArrayList<>(); List<CanonicalDocumentMetadata> documents = new ArrayList<>();
documents.add(packageDocument.document().toCanonicalMetadata()); documents.add(packageDocument.document().toCanonicalMetadata());
AtomicInteger sortOrder = new AtomicInteger(); AtomicInteger sortOrder = new AtomicInteger();

@ -0,0 +1,328 @@
package at.procon.dip.ingestion.camel;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.ingestion.config.TedPackageDownloadProperties;
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
import at.procon.dip.ingestion.spi.IngestionResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.TedDailyPackageRepository;
import at.procon.dip.domain.ted.service.TedPackageSequenceService;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.time.OffsetDateTime;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.camel.Exchange;
import org.apache.camel.LoggingLevel;
import org.apache.camel.builder.RouteBuilder;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Component;
/**
* NEW-runtime TED daily package download route.
* <p>
* Reuses the proven package sequencing rules through {@link TedPackageSequenceService},
* but hands off processing only to the NEW ingestion gateway. No legacy XML batch persistence,
* no legacy vectorization route, no old semantic path.
*/
@Component
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@ConditionalOnProperty(name = "dip.ingestion.ted-download.enabled", havingValue = "true")
@RequiredArgsConstructor
@Slf4j
public class TedPackageDownloadRoute extends RouteBuilder {
private static final String ROUTE_ID_SCHEDULER = "ted-package-new-scheduler";
private static final String ROUTE_ID_DOWNLOADER = "ted-package-new-downloader";
private static final String ROUTE_ID_ERROR = "ted-package-new-error-handler";
private final TedPackageDownloadProperties properties;
private final TedDailyPackageRepository packageRepository;
private final TedPackageSequenceService sequenceService;
private final DocumentIngestionGateway documentIngestionGateway;
@Override
public void configure() {
errorHandler(deadLetterChannel("direct:ted-package-new-error")
.maximumRedeliveries(3)
.redeliveryDelay(10_000)
.retryAttemptedLogLevel(LoggingLevel.WARN)
.logStackTrace(true));
from("direct:ted-package-new-error")
.routeId(ROUTE_ID_ERROR)
.process(this::handleError);
from("timer:ted-package-new-scheduler?period={{dip.ingestion.ted-download.poll-interval:3600000}}&delay=0")
.routeId(ROUTE_ID_SCHEDULER)
.process(this::checkRunningPackages)
.choice()
.when(header("tooManyRunning").isEqualTo(true))
.log(LoggingLevel.INFO, "Skipping NEW TED package download - already ${header.runningCount} packages in progress")
.otherwise()
.process(this::determineNextPackage)
.choice()
.when(header("packageId").isNotNull())
.to("direct:download-ted-package-new")
.otherwise()
.log(LoggingLevel.INFO, "No NEW TED package to download right now")
.end()
.end();
from("direct:download-ted-package-new")
.routeId(ROUTE_ID_DOWNLOADER)
.log(LoggingLevel.INFO, "NEW TED package download started: ${header.packageId}")
.setHeader("downloadStartTime", constant(System.currentTimeMillis()))
.process(this::createPackageRecord)
.delay(simple("{{dip.ingestion.ted-download.delay-between-downloads:5000}}"))
.setHeader(Exchange.HTTP_METHOD, constant("GET"))
.setHeader("CamelHttpConnectionClose", constant(true))
.toD("${header.downloadUrl}?bridgeEndpoint=true&throwExceptionOnFailure=false&socketTimeout={{dip.ingestion.ted-download.download-timeout:300000}}")
.choice()
.when(header(Exchange.HTTP_RESPONSE_CODE).isEqualTo(200))
.process(this::calculateHash)
.process(this::checkDuplicateByHash)
.choice()
.when(header("isDuplicate").isEqualTo(true))
.process(this::markDuplicate)
.otherwise()
.process(this::saveDownloadedPackage)
.process(this::ingestThroughGateway)
.process(this::markCompleted)
.endChoice()
.when(header(Exchange.HTTP_RESPONSE_CODE).isEqualTo(404))
.process(this::markNotFound)
.otherwise()
.process(this::markFailed)
.end();
}
private void checkRunningPackages(Exchange exchange) {
long downloadingCount = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING).size();
long processingCount = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.PROCESSING).size();
long runningCount = downloadingCount + processingCount;
exchange.getIn().setHeader("runningCount", runningCount);
exchange.getIn().setHeader("tooManyRunning", runningCount >= properties.getMaxRunningPackages());
if (runningCount > 0) {
log.info("Currently {} TED packages in progress in NEW runtime ({} downloading, {} processing)",
runningCount, downloadingCount, processingCount);
}
}
private void determineNextPackage(Exchange exchange) {
List<TedDailyPackage> pendingPackages = packageRepository.findByDownloadStatus(TedDailyPackage.DownloadStatus.PENDING);
if (!pendingPackages.isEmpty()) {
TedDailyPackage pkg = pendingPackages.get(0);
log.info("Retrying PENDING TED package in NEW runtime: {}", pkg.getPackageIdentifier());
setPackageHeaders(exchange, pkg.getYear(), pkg.getSerialNumber());
return;
}
TedPackageSequenceService.PackageInfo packageInfo = sequenceService.getNextPackageToDownload();
if (packageInfo == null) {
exchange.getIn().setHeader("packageId", null);
return;
}
setPackageHeaders(exchange, packageInfo.year(), packageInfo.serialNumber());
}
private void setPackageHeaders(Exchange exchange, int year, int serialNumber) {
String packageId = "%04d%05d".formatted(year, serialNumber);
String downloadUrl = properties.getBaseUrl() + packageId;
exchange.getIn().setHeader("packageId", packageId);
exchange.getIn().setHeader("year", year);
exchange.getIn().setHeader("serialNumber", serialNumber);
exchange.getIn().setHeader("downloadUrl", downloadUrl);
}
private void createPackageRecord(Exchange exchange) {
String packageId = exchange.getIn().getHeader("packageId", String.class);
Integer year = exchange.getIn().getHeader("year", Integer.class);
Integer serialNumber = exchange.getIn().getHeader("serialNumber", Integer.class);
String downloadUrl = exchange.getIn().getHeader("downloadUrl", String.class);
if (packageRepository.existsByPackageIdentifier(packageId)) {
return;
}
TedDailyPackage pkg = TedDailyPackage.builder()
.packageIdentifier(packageId)
.year(year)
.serialNumber(serialNumber)
.downloadUrl(downloadUrl)
.downloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADING)
.build();
packageRepository.save(pkg);
}
private void calculateHash(Exchange exchange) throws Exception {
byte[] body = exchange.getIn().getBody(byte[].class);
MessageDigest digest = MessageDigest.getInstance("SHA-256");
byte[] hashBytes = digest.digest(body);
StringBuilder sb = new StringBuilder();
for (byte b : hashBytes) {
sb.append(String.format("%02x", b));
}
exchange.getIn().setHeader("fileHash", sb.toString());
}
private void checkDuplicateByHash(Exchange exchange) {
String hash = exchange.getIn().getHeader("fileHash", String.class);
Optional<TedDailyPackage> duplicate = packageRepository.findAll().stream()
.filter(p -> hash.equals(p.getFileHash()))
.findFirst();
exchange.getIn().setHeader("isDuplicate", duplicate.isPresent());
duplicate.ifPresent(pkg -> exchange.getIn().setHeader("duplicateOf", pkg.getPackageIdentifier()));
}
private void saveDownloadedPackage(Exchange exchange) throws Exception {
String packageId = exchange.getIn().getHeader("packageId", String.class);
String hash = exchange.getIn().getHeader("fileHash", String.class);
byte[] body = exchange.getIn().getBody(byte[].class);
Path downloadDir = Paths.get(properties.getDownloadDirectory());
Files.createDirectories(downloadDir);
Path downloadPath = downloadDir.resolve(packageId + ".tar.gz");
Files.write(downloadPath, body);
long downloadDuration = System.currentTimeMillis() -
exchange.getIn().getHeader("downloadStartTime", Long.class);
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
pkg.setFileHash(hash);
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADED);
pkg.setDownloadedAt(OffsetDateTime.now());
pkg.setDownloadDurationMs(downloadDuration);
packageRepository.save(pkg);
});
exchange.getIn().setHeader("downloadPath", downloadPath.toString());
}
private void ingestThroughGateway(Exchange exchange) {
String packageId = exchange.getIn().getHeader("packageId", String.class);
String downloadPath = exchange.getIn().getHeader("downloadPath", String.class);
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.PROCESSING);
packageRepository.save(pkg);
});
IngestionResult ingestionResult = documentIngestionGateway.ingest(new SourceDescriptor(
null,
SourceType.TED_PACKAGE,
packageId,
downloadPath,
packageId + ".tar.gz",
"application/gzip",
null,
null,
OffsetDateTime.now(),
OriginalContentStoragePolicy.DEFAULT,
Map.of(
"packageId", packageId,
"title", packageId + ".tar.gz"
)
));
int importedChildCount = Math.max(0, ingestionResult.documents().size() - 1);
exchange.getIn().setHeader("gatewayImportedChildCount", importedChildCount);
exchange.getIn().setHeader("gatewayImportWarnings", ingestionResult.warnings().size());
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
pkg.setXmlFileCount(importedChildCount);
pkg.setProcessedCount(importedChildCount);
pkg.setFailedCount(0);
packageRepository.save(pkg);
});
}
private void markCompleted(Exchange exchange) throws Exception {
String packageId = exchange.getIn().getHeader("packageId", String.class);
String downloadPath = exchange.getIn().getHeader("downloadPath", String.class);
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.COMPLETED);
pkg.setProcessedAt(OffsetDateTime.now());
if (pkg.getDownloadedAt() != null) {
long processingDuration = Math.max(0L,
java.time.Duration.between(pkg.getDownloadedAt(), OffsetDateTime.now()).toMillis());
pkg.setProcessingDurationMs(processingDuration);
}
packageRepository.save(pkg);
});
if (properties.isDeleteAfterIngestion() && downloadPath != null) {
Files.deleteIfExists(Path.of(downloadPath));
}
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
long totalDuration = (pkg.getDownloadDurationMs() != null ? pkg.getDownloadDurationMs() : 0L)
+ (pkg.getProcessingDurationMs() != null ? pkg.getProcessingDurationMs() : 0L);
log.info("NEW TED package {} completed: xmlCount={}, processed={}, failed={}, totalDuration={}ms",
packageId, pkg.getXmlFileCount(), pkg.getProcessedCount(), pkg.getFailedCount(), totalDuration);
});
}
private void markNotFound(Exchange exchange) {
String packageId = exchange.getIn().getHeader("packageId", String.class);
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.NOT_FOUND);
pkg.setErrorMessage("Package not found (404)");
packageRepository.save(pkg);
});
}
private void markFailed(Exchange exchange) {
String packageId = exchange.getIn().getHeader("packageId", String.class);
Integer httpCode = exchange.getIn().getHeader(Exchange.HTTP_RESPONSE_CODE, Integer.class);
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.FAILED);
pkg.setErrorMessage("HTTP " + httpCode);
packageRepository.save(pkg);
});
}
private void markDuplicate(Exchange exchange) {
String packageId = exchange.getIn().getHeader("packageId", String.class);
String duplicateOf = exchange.getIn().getHeader("duplicateOf", String.class);
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.COMPLETED);
pkg.setErrorMessage("Duplicate of " + duplicateOf);
pkg.setProcessedAt(OffsetDateTime.now());
packageRepository.save(pkg);
});
}
private void handleError(Exchange exchange) {
Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class);
String packageId = exchange.getIn().getHeader("packageId", String.class);
if (packageId != null) {
packageRepository.findByPackageIdentifier(packageId).ifPresent(pkg -> {
pkg.setDownloadStatus(TedDailyPackage.DownloadStatus.FAILED);
pkg.setErrorMessage(exception != null ? exception.getMessage() : "Unknown route error");
packageRepository.save(pkg);
});
}
}
}

@ -1,6 +1,8 @@
package at.procon.dip.ingestion.config; package at.procon.dip.ingestion.config;
import at.procon.dip.domain.access.DocumentVisibility; import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import jakarta.validation.constraints.NotBlank; import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.Positive; import jakarta.validation.constraints.Positive;
import lombok.Data; import lombok.Data;

@ -0,0 +1,52 @@
package at.procon.dip.ingestion.config;
import jakarta.validation.constraints.Min;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.Positive;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
/**
* NEW-runtime TED package download configuration.
* <p>
* This is intentionally separate from the legacy {@code ted.download.*} tree.
*/
@Configuration
@ConfigurationProperties(prefix = "dip.ingestion.ted-download")
@Data
public class TedPackageDownloadProperties {
private boolean enabled = false;
@NotBlank
private String baseUrl = "https://ted.europa.eu/packages/daily/";
@NotBlank
private String downloadDirectory = "/ted.europe/downloads-new";
@Positive
private int startYear = 2015;
@Positive
private long pollInterval = 3_600_000L;
@Positive
private long notFoundRetryInterval = 21_600_000L;
@Min(0)
private int previousYearGracePeriodDays = 30;
private boolean retryCurrentYearNotFoundIndefinitely = true;
@Positive
private long downloadTimeout = 300_000L;
@Positive
private int maxRunningPackages = 2;
@Positive
private long delayBetweenDownloads = 5_000L;
private boolean deleteAfterIngestion = true;
}

@ -1,16 +0,0 @@
package at.procon.ted.config;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
/**
* Patch A scaffold for the legacy runtime configuration tree.
*
* The legacy runtime still uses {@link TedProcessorProperties} today. This class is
* introduced so the old configuration can be moved gradually from `ted.*` to
* `legacy.ted.*` without blocking the runtime split.
*/
@Configuration
@ConfigurationProperties(prefix = "legacy.ted")
public class LegacyTedProperties extends TedProcessorProperties {
}

@ -1,5 +1,7 @@
package at.procon.ted.controller; package at.procon.ted.controller;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.model.dto.DocumentDtos.*; import at.procon.ted.model.dto.DocumentDtos.*;
import at.procon.ted.model.entity.ContractNature; import at.procon.ted.model.entity.ContractNature;
import at.procon.ted.model.entity.NoticeType; import at.procon.ted.model.entity.NoticeType;
@ -38,6 +40,7 @@ import java.util.UUID;
@RequestMapping("/v1/documents") @RequestMapping("/v1/documents")
@RequiredArgsConstructor @RequiredArgsConstructor
@Slf4j @Slf4j
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@Tag(name = "Documents", description = "TED Procurement Document Search API") @Tag(name = "Documents", description = "TED Procurement Document Search API")
public class DocumentController { public class DocumentController {

@ -1,5 +1,7 @@
package at.procon.ted.controller; package at.procon.ted.controller;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.service.SimilaritySearchService; import at.procon.ted.service.SimilaritySearchService;
import at.procon.ted.service.SimilaritySearchService.SimilaritySearchResponse; import at.procon.ted.service.SimilaritySearchService.SimilaritySearchResponse;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
@ -28,6 +30,7 @@ import java.io.IOException;
@RequestMapping("/similarity") @RequestMapping("/similarity")
@RequiredArgsConstructor @RequiredArgsConstructor
@Slf4j @Slf4j
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@Tag(name = "Similarity Search", description = "Vector-based semantic similarity search on TED procurement documents") @Tag(name = "Similarity Search", description = "Vector-based semantic similarity search on TED procurement documents")
public class SimilaritySearchController { public class SimilaritySearchController {

@ -1,11 +1,152 @@
spring:
config:
activate:
on-profile: legacy
dip: # Legacy / shared application properties
runtime: # New-runtime-only properties are moved to application-new.yml.
mode: LEGACY ted:
# Directory configuration for file processing
input:
# Base directory for watching incoming TED XML files
directory: ${TED_INPUT_DIR:/ted.europe/extracted}
# File pattern to match (recursive scanning)
pattern: "**/*.xml"
# Move processed files to this directory
processed-directory: ${TED_PROCESSED_DIR:.processed}
# Move failed files to this directory
error-directory: ${TED_ERROR_DIR:.error}
# Polling interval in milliseconds
poll-interval: 5000
# Maximum messages per poll (reduced to prevent memory issues)
max-messages-per-poll: 10
# Schema validation configuration
schema:
# Enable/disable XSD validation
enabled: true
# Path to eForms SDK schemas (from Maven dependency or custom location)
path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
# Vectorization configuration
vectorization:
# Enable/disable async vectorization
enabled: false
# Use external HTTP API instead of subprocess
use-http-api: true
# Embedding service URL
api-url: http://172.20.240.18:8001
# Model name for sentence-transformers
model-name: intfloat/multilingual-e5-large
# Vector dimensions (must match model output)
dimensions: 1024
# Batch size for vectorization
batch-size: 16
# Thread pool size for async processing
thread-pool-size: 4
# Maximum text length for vectorization (characters)
max-text-length: 8192
# HTTP connection timeout (milliseconds)
connect-timeout: 10000
# HTTP socket/read timeout (milliseconds)
socket-timeout: 60000
# Maximum retries on connection failure
max-retries: 5
# Packages download configuration
download:
# Enable/disable automatic package download
enabled: false
# User service-based camel route
use-service-based: false
# Base URL for TED Daily Packages
base-url: https://ted.europa.eu/packages/daily/
# Download directory for tar.gz files
download-directory: /ted.europe/downloads
# Extract directory for XML files
extract-directory: /ted.europe/extracted
# Start year for downloads
start-year: 2026
# Max consecutive 404 errors before stopping
max-consecutive-404: 4
# Polling interval (milliseconds) - 2 minutes
poll-interval: 300000
# Retry interval for tail NOT_FOUND packages - 6 hours
not-found-retry-interval: 21600000
# Grace period after year end before a previous-year tail 404 is treated as final
previous-year-grace-period-days: 30
# Keep retrying current-year tail 404 packages indefinitely
retry-current-year-not-found-indefinitely: true
# Download timeout (milliseconds) - 5 minutes
download-timeout: 300000
# Max concurrent downloads
max-concurrent-downloads: 2
# Delay between downloads (milliseconds) for rate limiting - 5 seconds
delay-between-downloads: 3000
# Delete tar.gz after extraction
delete-after-extraction: true
# Prioritize current year first
prioritize-current-year: false
# IMAP Mail configuration
mail:
# Enable/disable mail processing
enabled: false
# IMAP server hostname
host: mail.mymagenta.business
# IMAP server port (993 for IMAPS)
port: 993
# Mail account username (email address)
username: archiv@procon.co.at
# Mail account password
password: ${MAIL_PASSWORD:worasigg}
# Use SSL/TLS connection
ssl: true
# Mail folder to read from
folder-name: INBOX
# Delete messages after processing
delete: false
# Mark messages as seen after processing (false = peek mode, don't mark as read)
seen: false
# Only process unseen messages
unseen: true
# Polling delay in milliseconds (1 minute)
delay: 60000
# Max messages per poll
max-messages-per-poll: 100
# Output directory for processed attachments
attachment-output-directory: /ted.europe/mail-attachments
# Enable/disable MIME file input processing
mime-input-enabled: true
# Input directory for MIME files (.eml)
mime-input-directory: /ted.europe/mime-input
# File pattern for MIME files (regex)
mime-input-pattern: .*\\.eml
# Polling interval for MIME input directory (milliseconds)
mime-input-poll-interval: 1000000
# solution brief processing
solution-brief:
# Enable/disable Solution Brief processing
enabled: false
# Input directory for Solution Brief PDF files
input-directory: C:/work/SolutionBrief
# Output directory for Excel result files (relative to input or absolute)
result-directory: ./result
# Number of top similar documents to include
top-k: 20
# Minimum similarity threshold (0.0-1.0)
similarity-threshold: 0.5
# Polling interval in milliseconds (30 seconds)
poll-interval: 30000
# File pattern for PDF files (regex)
file-pattern: .*\\.pdf
# Process files only once (idempotent)
idempotent: true
# Idempotent repository file path
idempotent-repository: ./solution-brief-processed.dat
# Data cleanup configuration
cleanup:
# Enable automatic cleanup of old documents
enabled: false
# Retention period in years (default: 10)
retention-years: 10
# Cron expression for cleanup schedule (default: daily at 2 AM)
cron: "0 0 2 * * *"
# Legacy runtime uses the existing ted.* property tree. # Legacy runtime uses the existing ted.* property tree.
# Move old route/download/mail/vectorization/search settings here over time. # Move old route/download/mail/vectorization/search settings here over time.

@ -1,16 +1,6 @@
# New runtime overrides
# Activate with: --spring.profiles.active=new
# Optional explicit marker; file is profile-specific already
spring:
config:
activate:
on-profile: new
dip: dip:
runtime: runtime:
mode: NEW mode: NEW
search: search:
# Default page size for search results # Default page size for search results
default-page-size: 20 default-page-size: 20
@ -44,7 +34,6 @@ dip:
startup-lexical-backfill-limit: 500 startup-lexical-backfill-limit: 500
# Number of top hits per engine returned by /search/debug # Number of top hits per engine returned by /search/debug
debug-top-hits-per-engine: 10 debug-top-hits-per-engine: 10
embedding: embedding:
enabled: true enabled: true
default-document-model: e5-default default-document-model: e5-default
@ -75,7 +64,6 @@ dip:
active: true active: true
jobs: jobs:
enabled: true enabled: true
# Phase 4 generic ingestion configuration # Phase 4 generic ingestion configuration
ingestion: ingestion:
# Master switch for arbitrary document ingestion into the DOC model # Master switch for arbitrary document ingestion into the DOC model
@ -132,6 +120,32 @@ dip:
# Import batch marker for mail roots and attachments # Import batch marker for mail roots and attachments
mail-import-batch-id: phase41-mail mail-import-batch-id: phase41-mail
# ted packages download configuration
ted-download:
# Enable/disable automatic package download
enabled: true
# Base URL for TED Daily Packages
base-url: https://ted.europa.eu/packages/daily/
# Download directory for tar.gz files
download-directory: /ted.europe/downloads-new
# Start year for downloads
start-year: 2026
# Polling interval (milliseconds) - 2 minutes
poll-interval: 3600000
# Retry interval for tail NOT_FOUND packages - 6 hours
not-found-retry-interval: 21600000
# Grace period after year end before a previous-year tail 404 is treated as final
previous-year-grace-period-days: 30
# Keep retrying current-year tail 404 packages indefinitely
retry-current-year-not-found-indefinitely: true
# Download timeout (milliseconds) - 5 minutes
download-timeout: 300000
# Max concurrent downloads
max-running-packages: 2
# Delay between downloads (milliseconds) for rate limiting - 5 seconds
delay-between-downloads: 5000
# Delete tar.gz after ingestion
delete-after-ingestion: true
ted: # Phase 3 TED projection configuration ted: # Phase 3 TED projection configuration
projection: projection:
# Enable/disable dual-write into the TED projection model on top of DOC.doc_document # Enable/disable dual-write into the TED projection model on top of DOC.doc_document

@ -7,6 +7,9 @@ server:
context-path: /api context-path: /api
spring: spring:
profiles:
active: new
application: application:
name: document-intelligence-platform name: document-intelligence-platform
@ -57,161 +60,6 @@ camel:
# Weniger strenge Health-Checks für File-Consumer # Weniger strenge Health-Checks für File-Consumer
consumers-enabled: false consumers-enabled: false
# Default runtime mode: legacy / initial implementation
# Activate profile 'new' to load application-new.yml and switch to the new runtime.
dip:
runtime:
mode: LEGACY
# Legacy / shared application properties
# New-runtime-only properties are moved to application-new.yml.
ted:
# Directory configuration for file processing
input:
# Base directory for watching incoming TED XML files
directory: ${TED_INPUT_DIR:/ted.europe/extracted}
# File pattern to match (recursive scanning)
pattern: "**/*.xml"
# Move processed files to this directory
processed-directory: ${TED_PROCESSED_DIR:.processed}
# Move failed files to this directory
error-directory: ${TED_ERROR_DIR:.error}
# Polling interval in milliseconds
poll-interval: 5000
# Maximum messages per poll (reduced to prevent memory issues)
max-messages-per-poll: 10
# Schema validation configuration
schema:
# Enable/disable XSD validation
enabled: true
# Path to eForms SDK schemas (from Maven dependency or custom location)
path: classpath:schemas/maindoc/UBL-ContractNotice-2.3.xsd
# Vectorization configuration
vectorization:
# Enable/disable async vectorization
enabled: false
# Use external HTTP API instead of subprocess
use-http-api: true
# Embedding service URL
api-url: http://172.20.240.18:8001
# Model name for sentence-transformers
model-name: intfloat/multilingual-e5-large
# Vector dimensions (must match model output)
dimensions: 1024
# Batch size for vectorization
batch-size: 16
# Thread pool size for async processing
thread-pool-size: 4
# Maximum text length for vectorization (characters)
max-text-length: 8192
# HTTP connection timeout (milliseconds)
connect-timeout: 10000
# HTTP socket/read timeout (milliseconds)
socket-timeout: 60000
# Maximum retries on connection failure
max-retries: 5
# Packages download configuration
download:
# Enable/disable automatic package download
enabled: true
# User service-based camel route
use-service-based: false
# Base URL for TED Daily Packages
base-url: https://ted.europa.eu/packages/daily/
# Download directory for tar.gz files
download-directory: /ted.europe/downloads
# Extract directory for XML files
extract-directory: /ted.europe/extracted
# Start year for downloads
start-year: 2026
# Max consecutive 404 errors before stopping
max-consecutive-404: 4
# Polling interval (milliseconds) - 2 minutes
poll-interval: 300000
# Retry interval for tail NOT_FOUND packages - 6 hours
not-found-retry-interval: 21600000
# Grace period after year end before a previous-year tail 404 is treated as final
previous-year-grace-period-days: 30
# Keep retrying current-year tail 404 packages indefinitely
retry-current-year-not-found-indefinitely: true
# Download timeout (milliseconds) - 5 minutes
download-timeout: 300000
# Max concurrent downloads
max-concurrent-downloads: 2
# Delay between downloads (milliseconds) for rate limiting - 5 seconds
delay-between-downloads: 3000
# Delete tar.gz after extraction
delete-after-extraction: true
# Prioritize current year first
prioritize-current-year: false
# IMAP Mail configuration
mail:
# Enable/disable mail processing
enabled: false
# IMAP server hostname
host: mail.mymagenta.business
# IMAP server port (993 for IMAPS)
port: 993
# Mail account username (email address)
username: archiv@procon.co.at
# Mail account password
password: ${MAIL_PASSWORD:worasigg}
# Use SSL/TLS connection
ssl: true
# Mail folder to read from
folder-name: INBOX
# Delete messages after processing
delete: false
# Mark messages as seen after processing (false = peek mode, don't mark as read)
seen: false
# Only process unseen messages
unseen: true
# Polling delay in milliseconds (1 minute)
delay: 60000
# Max messages per poll
max-messages-per-poll: 100
# Output directory for processed attachments
attachment-output-directory: /ted.europe/mail-attachments
# Enable/disable MIME file input processing
mime-input-enabled: true
# Input directory for MIME files (.eml)
mime-input-directory: /ted.europe/mime-input
# File pattern for MIME files (regex)
mime-input-pattern: .*\\.eml
# Polling interval for MIME input directory (milliseconds)
mime-input-poll-interval: 1000000
# solution brief processing
solution-brief:
# Enable/disable Solution Brief processing
enabled: false
# Input directory for Solution Brief PDF files
input-directory: C:/work/SolutionBrief
# Output directory for Excel result files (relative to input or absolute)
result-directory: ./result
# Number of top similar documents to include
top-k: 20
# Minimum similarity threshold (0.0-1.0)
similarity-threshold: 0.5
# Polling interval in milliseconds (30 seconds)
poll-interval: 30000
# File pattern for PDF files (regex)
file-pattern: .*\\.pdf
# Process files only once (idempotent)
idempotent: true
# Idempotent repository file path
idempotent-repository: ./solution-brief-processed.dat
# Data cleanup configuration
cleanup:
# Enable automatic cleanup of old documents
enabled: false
# Retention period in years (default: 10)
retention-years: 10
# Cron expression for cleanup schedule (default: daily at 2 AM)
cron: "0 0 2 * * *"
# Actuator endpoints # Actuator endpoints
management: management:
endpoints: endpoints:

Loading…
Cancel
Save