package at.procon.ted.service;

import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.TedDailyPackageRepository;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.time.LocalDate;
import java.time.OffsetDateTime;
import java.time.Year;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.Stream;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

/**
 * Service for downloading and processing TED Daily Packages.
 *
 * Features:
 * - Automatic download from https://ted.europa.eu/packages/daily/
 * - Hash-based idempotency check
 * - tar.gz extraction
 * - Integration with XML processing
 *
 * @author Martin.Schweitzer@procon.co.at and claude.ai
 */
@Service
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@RequiredArgsConstructor
@Slf4j
public class TedPackageDownloadService {

    private final TedProcessorProperties properties;
    private final TedDailyPackageRepository packageRepository;

    /**
     * Generates the package identifier in YYYYSSSSS format
     * (4-digit year followed by the zero-padded 5-digit serial number).
     *
     * @param year         package year
     * @param serialNumber serial number of the package within the year
     * @return the 9-character package identifier
     */
    public String generatePackageIdentifier(int year, int serialNumber) {
        return String.format("%04d%05d", year, serialNumber);
    }

    /**
     * Determines the next package to download.
     *
     * Strategy (starts with the current year FIRST, then goes backward):
     * 1. Current year: forward from max(serial) so the newest packages are fetched first.
     * 2. All years (current year down to the configured start year): fill gaps
     *    (if min(serial) > 1, go backward to 1); if the year is not complete, try forward crawling.
     * 3. If the oldest year with data is newer than the configured start year, open the previous year.
     *
     * @return the next package to download, or {@code null} if there is currently nothing to download
     */
    public PackageInfo getNextPackageToDownload() {
        int currentYear = Year.now().getValue();
        int startYear = properties.getDownload().getStartYear();

        log.debug("Determining next package to download (current year: {})", currentYear);

        // 1. PRIORITY: current year forward crawling (max+1) - get today's packages first
        PackageInfo nextInCurrentYear = getNextForwardPackage(currentYear);
        if (nextInCurrentYear != null) {
            log.info("Next package: {} (CURRENT YEAR {} forward - newest data first!)",
                    nextInCurrentYear.getIdentifier(), currentYear);
            return nextInCurrentYear;
        }

        log.debug("Current year {} complete or has 404, checking older years backward", currentYear);

        // 2. Go through all years BACKWARD (current year -> startYear) for gaps and completion
        for (int year = currentYear; year >= startYear; year--) {
            // 2a. Check if there are gaps (min > 1)
            PackageInfo gapFiller = getGapFillerPackage(year);
            if (gapFiller != null) {
                log.info("Next package: {} (filling gap in year {})", gapFiller.getIdentifier(), year);
                return gapFiller;
            }

            // 2b. No gap: if the year is not complete yet, try forward crawling
            if (!isYearComplete(year)) {
                PackageInfo forwardPackage = getNextForwardPackage(year);
                if (forwardPackage != null) {
                    log.info("Next package: {} (forward in year {})", forwardPackage.getIdentifier(), year);
                    return forwardPackage;
                }
            } else {
                log.debug("Year {} is complete", year);
            }
        }

        // 3. Check if we can open a new previous year.
        // NOTE(review): the loop above already returns serial 1 for every year in range that has
        // no data, so this branch looks unreachable in practice - confirm before removing.
        int oldestYear = getOldestYearWithData();
        if (oldestYear > startYear) {
            // oldestYear > startYear implies (oldestYear - 1) >= startYear
            int previousYear = oldestYear - 1;
            PackageInfo firstOfPreviousYear = new PackageInfo(previousYear, 1);
            log.info("Next package: {} (opening new year {})",
                    firstOfPreviousYear.getIdentifier(), previousYear);
            return firstOfPreviousYear;
        }

        log.info("All years from {} to {} are complete - no more packages", startYear, currentYear);
        return null; // No more packages
    }

    /**
     * Finds the next package for forward crawling (max+1).
     *
     * Stronger NOT_FOUND handling:
     * - Current year: a tail 404 is treated as "not available yet" and retried
     *   (indefinitely, if so configured)
     * - Older years: a tail 404 remains retryable until the configured grace period
     *   after year end expires
     * - Final year completion is only assumed after that grace period
     *
     * @param year the year to inspect
     * @return the package to try next, or {@code null} if the year is waiting for a retry or is finalized
     */
    private PackageInfo getNextForwardPackage(int year) {
        Optional<TedDailyPackage> latest = packageRepository.findLatestByYear(year);

        if (latest.isEmpty()) {
            // No package for this year -> start with 1
            log.debug("Year {} has no packages, starting with 1", year);
            return new PackageInfo(year, 1);
        }

        TedDailyPackage latestPackage = latest.get();

        if (latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND) {
            if (shouldRetryNotFoundPackage(latestPackage)) {
                log.info("Retrying tail NOT_FOUND package {} for year {}",
                        latestPackage.getPackageIdentifier(), year);
                return new PackageInfo(year, latestPackage.getSerialNumber());
            }

            if (isNotFoundRetryableForYear(latestPackage)) {
                // Still retryable, but the retry interval has not elapsed yet
                log.debug("Year {} waiting until {} before retrying tail package {}",
                        year, calculateNextRetryAt(latestPackage), latestPackage.getPackageIdentifier());
                return null;
            }

            log.debug("Year {} finalized after grace period at tail package {}",
                    year, latestPackage.getPackageIdentifier());
            return null;
        }

        // Next package (max+1)
        log.debug("Year {} continues from package {} to {}",
                year, latestPackage.getSerialNumber(), latestPackage.getSerialNumber() + 1);
        return new PackageInfo(year, latestPackage.getSerialNumber() + 1);
    }

    /**
     * Finds the package for gap filling (min-1).
     *
     * @param year the year to inspect
     * @return the package just below the lowest known serial number, or {@code null}
     *         if the year has no packages or already starts at 1
     */
    private PackageInfo getGapFillerPackage(int year) {
        Optional<TedDailyPackage> first = packageRepository.findFirstByYear(year);

        if (first.isEmpty()) {
            // No package for this year
            return null;
        }

        int minSerial = first.get().getSerialNumber();
        if (minSerial <= 1) {
            // No gap, already starts at 1
            return null;
        }

        // Gap found: get (min-1)
        return new PackageInfo(year, minSerial - 1);
    }

    /**
     * Checks if a year is complete.
     * A year is complete only when:
     * - package numbering starts at 1, and
     * - the current tail package is NOT_FOUND, and
     * - that NOT_FOUND is no longer retryable (grace period expired)
     *
     * @param year the year to inspect
     * @return {@code true} if all three conditions hold
     */
    private boolean isYearComplete(int year) {
        Optional<TedDailyPackage> first = packageRepository.findFirstByYear(year);
        Optional<TedDailyPackage> latest = packageRepository.findLatestByYear(year);

        if (first.isEmpty() || latest.isEmpty()) {
            return false;
        }

        if (first.get().getSerialNumber() != 1) {
            return false;
        }

        TedDailyPackage latestPackage = latest.get();
        return latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND
                && !isNotFoundRetryableForYear(latestPackage);
    }

    /**
     * Tells whether a NOT_FOUND package should be retried right now: it must still be
     * retryable for its year and the retry interval since the last attempt must have elapsed.
     */
    private boolean shouldRetryNotFoundPackage(TedDailyPackage pkg) {
        if (!isNotFoundRetryableForYear(pkg)) {
            return false;
        }
        OffsetDateTime nextRetryAt = calculateNextRetryAt(pkg);
        return !nextRetryAt.isAfter(OffsetDateTime.now());
    }

    /**
     * Tells whether a NOT_FOUND result for the package's year may still be retried.
     * Packages of the current (or a later) year follow the "retry indefinitely" setting;
     * packages of older years are retryable until the year's grace deadline.
     * A package without a year is treated as belonging to the current year.
     */
    private boolean isNotFoundRetryableForYear(TedDailyPackage pkg) {
        int currentYear = Year.now().getValue();
        int packageYear = pkg.getYear() != null ? pkg.getYear() : currentYear;

        if (packageYear >= currentYear) {
            return properties.getDownload().isRetryCurrentYearNotFoundIndefinitely();
        }

        return OffsetDateTime.now().isBefore(getYearRetryGraceDeadline(packageYear));
    }

    /**
     * Computes the earliest time of the next retry: last attempt (updatedAt, else createdAt,
     * else now) plus the configured NOT_FOUND retry interval in milliseconds.
     */
    private OffsetDateTime calculateNextRetryAt(TedDailyPackage pkg) {
        OffsetDateTime lastAttemptAt;
        if (pkg.getUpdatedAt() != null) {
            lastAttemptAt = pkg.getUpdatedAt();
        } else if (pkg.getCreatedAt() != null) {
            lastAttemptAt = pkg.getCreatedAt();
        } else {
            lastAttemptAt = OffsetDateTime.now();
        }

        return lastAttemptAt.plus(Duration.ofMillis(properties.getDownload().getNotFoundRetryInterval()));
    }

    /**
     * Returns the end of the retry grace period for a past year:
     * January 1st (UTC) of the following year plus the configured number of grace days.
     */
    private OffsetDateTime getYearRetryGraceDeadline(int year) {
        return LocalDate.of(year + 1, 1, 1)
                .atStartOfDay()
                .atOffset(ZoneOffset.UTC)
                .plusDays(properties.getDownload().getPreviousYearGracePeriodDays());
    }

    /**
     * Finds the oldest year for which we have data.
     *
     * @return the first year (from the configured start year up to the current year) that has
     *         at least one package, or the current year if none has
     */
    private int getOldestYearWithData() {
        // Start from startYear and go forward to find the first year with data
        int currentYear = Year.now().getValue();
        for (int year = properties.getDownload().getStartYear(); year <= currentYear; year++) {
            if (packageRepository.findLatestByYear(year).isPresent()) {
                // Found the oldest year with data
                return year;
            }
        }
        return currentYear;
    }

    /**
     * Downloads a package and processes it: creates or re-opens the package record, downloads
     * the tar.gz, checks for duplicates by SHA-256 hash, extracts the XML files and records
     * the outcome on the package entity.
     *
     * @param year         package year
     * @param serialNumber serial number of the package within the year
     * @return the download result; failures are reported via {@link DownloadResult#failed}
     *         rather than thrown
     */
    @Transactional
    public DownloadResult downloadPackage(int year, int serialNumber) {
        String packageId = generatePackageIdentifier(year, serialNumber);

        log.debug("Starting download of package: {}", packageId);

        // Check whether the package already exists
        Optional<TedDailyPackage> existing = packageRepository.findByPackageIdentifier(packageId);
        String downloadUrl = buildDownloadUrl(packageId);

        TedDailyPackage packageEntity;
        if (existing.isPresent()) {
            TedDailyPackage existingPackage = existing.get();
            boolean retryableNotFound =
                    existingPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND
                            && isNotFoundRetryableForYear(existingPackage);

            if (!retryableNotFound) {
                log.debug("Package {} already exists with status: {}",
                        packageId, existingPackage.getDownloadStatus());
                return DownloadResult.alreadyExists(existingPackage);
            }

            log.info("Retrying previously NOT_FOUND package: {}", packageId);
            existingPackage.setDownloadUrl(downloadUrl);
            existingPackage.setErrorMessage(null);
            existingPackage.setDownloadStatus(TedDailyPackage.DownloadStatus.PENDING);
            packageEntity = packageRepository.save(existingPackage);
        } else {
            // Create the package record
            packageEntity = TedDailyPackage.builder()
                    .packageIdentifier(packageId)
                    .year(year)
                    .serialNumber(serialNumber)
                    .downloadUrl(downloadUrl)
                    .downloadStatus(TedDailyPackage.DownloadStatus.PENDING)
                    .build();
            packageEntity = packageRepository.save(packageEntity);
        }

        long startTime = System.currentTimeMillis();

        try {
            // Update status: DOWNLOADING
            updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.DOWNLOADING, null);

            // Download tar.gz file
            Path downloadPath = downloadFile(downloadUrl, packageId);

            if (downloadPath == null) {
                // 404 - package does not exist
                updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.NOT_FOUND,
                        "Package not found (404)");
                return DownloadResult.notFound(packageEntity);
            }

            // Calculate hash
            String fileHash = calculateSHA256(downloadPath);

            // Check for a duplicate via hash.
            // NOTE(review): this loads every package row to compare hashes; a dedicated
            // repository query by file hash would avoid that - not visible from this file.
            Optional<TedDailyPackage> duplicateByHash = packageRepository.findAll().stream()
                    .filter(p -> fileHash.equals(p.getFileHash()))
                    .findFirst();

            if (duplicateByHash.isPresent()) {
                String duplicateOf = duplicateByHash.get().getPackageIdentifier();
                log.debug("Duplicate package detected via hash: {} = {}", packageId, duplicateOf);
                cleanupDownload(downloadPath);
                updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.COMPLETED,
                        "Duplicate of " + duplicateOf);
                return DownloadResult.duplicate(packageEntity);
            }

            long downloadDuration = System.currentTimeMillis() - startTime;

            // Update: DOWNLOADED
            packageEntity = packageRepository.findById(packageEntity.getId()).orElseThrow();
            packageEntity.setFileHash(fileHash);
            packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADED);
            packageEntity.setDownloadedAt(OffsetDateTime.now());
            packageEntity.setDownloadDurationMs(downloadDuration);
            packageEntity = packageRepository.save(packageEntity);

            // Extract XML files
            List<Path> xmlFiles = extractTarGz(downloadPath, packageId);

            packageEntity.setXmlFileCount(xmlFiles.size());
            packageEntity = packageRepository.save(packageEntity);

            // Cleanup tar.gz if configured
            if (properties.getDownload().isDeleteAfterExtraction()) {
                cleanupDownload(downloadPath);
            }

            log.debug("Successfully downloaded package {}: {} XML files", packageId, xmlFiles.size());

            return DownloadResult.success(packageEntity, xmlFiles);

        } catch (Exception e) {
            log.error("Failed to download package {}: {}", packageId, e.getMessage(), e);
            // Exceptions without a message would otherwise leave the FAILED record without any reason
            String failureMessage = e.getMessage() != null ? e.getMessage() : e.toString();
            updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.FAILED, failureMessage);
            return DownloadResult.failed(packageEntity, e);
        }
    }

    /**
     * Builds the download URL by appending the package identifier to the configured base URL.
     */
    private String buildDownloadUrl(String packageId) {
        return properties.getDownload().getBaseUrl() + packageId;
    }

    /**
     * Downloads a file into the configured download directory as {@code <packageId>.tar.gz}.
     *
     * @param urlString URL to download from
     * @param packageId package identifier used for the target file name
     * @return path of the downloaded file, or {@code null} if the server answered 404
     * @throws IOException on any other non-200 response or on I/O failure
     */
    private Path downloadFile(String urlString, String packageId) throws IOException {
        URL url = URI.create(urlString).toURL();
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            int timeoutMs = (int) properties.getDownload().getDownloadTimeout();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(timeoutMs);
            connection.setReadTimeout(timeoutMs);
            connection.setInstanceFollowRedirects(true);

            int responseCode = connection.getResponseCode();

            if (responseCode == HttpURLConnection.HTTP_NOT_FOUND) {
                log.info("Package not found (404): {}", urlString);
                return null;
            }

            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new IOException("HTTP " + responseCode + " for URL: " + urlString);
            }

            // Create download directory
            Path downloadDir = Paths.get(properties.getDownload().getDownloadDirectory());
            Files.createDirectories(downloadDir);

            // Download file
            Path targetPath = downloadDir.resolve(packageId + ".tar.gz");
            try (InputStream in = connection.getInputStream()) {
                Files.copy(in, targetPath, StandardCopyOption.REPLACE_EXISTING);
            } catch (IOException e) {
                // Do not leave a truncated archive behind
                try {
                    Files.deleteIfExists(targetPath);
                } catch (IOException cleanupError) {
                    e.addSuppressed(cleanupError);
                }
                throw e;
            }

            log.debug("Downloaded {} bytes to {}", Files.size(targetPath), targetPath);

            return targetPath;
        } finally {
            // Release the connection on every path, including 404 and error responses
            // whose response body is never read
            connection.disconnect();
        }
    }

    /**
     * Calculates the SHA-256 hash of a file.
     *
     * @param file the file to hash
     * @return the hash as a lowercase hexadecimal string
     * @throws IOException              if the file cannot be read
     * @throws NoSuchAlgorithmException if SHA-256 is not available
     */
    private String calculateSHA256(Path file) throws IOException, NoSuchAlgorithmException {
        MessageDigest digest = MessageDigest.getInstance("SHA-256");

        try (InputStream is = Files.newInputStream(file)) {
            byte[] buffer = new byte[8192];
            int read;
            // read() signals end of stream with -1
            while ((read = is.read(buffer)) != -1) {
                digest.update(buffer, 0, read);
            }
        }

        byte[] hashBytes = digest.digest();
        StringBuilder hex = new StringBuilder(hashBytes.length * 2);
        for (byte b : hashBytes) {
            hex.append(Character.forDigit((b >> 4) & 0xF, 16));
            hex.append(Character.forDigit(b & 0xF, 16));
        }
        return hex.toString();
    }

    /**
     * Extracts the tar.gz archive into {@code <extractDirectory>/<packageId>} and returns the
     * list of extracted XML files. Directories and non-XML entries are skipped.
     *
     * @param tarGzFile the downloaded archive
     * @param packageId package identifier used as the extraction sub-directory
     * @return paths of the extracted XML files
     * @throws IOException if reading the archive or writing a file fails
     */
    private List<Path> extractTarGz(Path tarGzFile, String packageId) throws IOException {
        List<Path> xmlFiles = new ArrayList<>();

        Path extractDir = Paths.get(properties.getDownload().getExtractDirectory())
                .resolve(packageId);
        Files.createDirectories(extractDir);

        try (InputStream fis = Files.newInputStream(tarGzFile);
             GzipCompressorInputStream gzis = new GzipCompressorInputStream(fis);
             TarArchiveInputStream tais = new TarArchiveInputStream(gzis)) {

            TarArchiveEntry entry;
            while ((entry = tais.getNextTarEntry()) != null) {
                if (entry.isDirectory()) {
                    continue;
                }

                String name = entry.getName();
                if (!name.toLowerCase(Locale.ROOT).endsWith(".xml")) {
                    continue;
                }

                // Only the base name of the entry is used, so all files land directly in
                // extractDir; entries with the same base name overwrite each other.
                Path outputPath = extractDir.resolve(new File(name).getName());

                try (OutputStream os = Files.newOutputStream(outputPath)) {
                    // Copies the current entry's bytes (the stream reports end-of-entry as EOF)
                    tais.transferTo(os);
                }

                xmlFiles.add(outputPath);
            }
        }

        log.debug("Extracted {} XML files from {}", xmlFiles.size(), tarGzFile.getFileName());

        return xmlFiles;
    }

    /**
     * Updates the status of a package, if it exists.
     *
     * @param packageId    database id of the package
     * @param status       new download status
     * @param errorMessage new error message; {@code null} leaves the stored message untouched
     */
    private void updatePackageStatus(UUID packageId, TedDailyPackage.DownloadStatus status, String errorMessage) {
        packageRepository.findById(packageId).ifPresent(pkg -> {
            pkg.setDownloadStatus(status);
            if (errorMessage != null) {
                pkg.setErrorMessage(errorMessage);
            }
            packageRepository.save(pkg);
        });
    }

    /**
     * Deletes a downloaded file. Failures are logged as warnings and not propagated.
     */
    private void cleanupDownload(Path file) {
        try {
            Files.deleteIfExists(file);
            log.debug("Cleaned up download: {}", file);
        } catch (IOException e) {
            log.warn("Failed to delete file {}: {}", file, e.getMessage());
        }
    }

    /**
     * Package info record: year plus serial number within that year.
     */
    public record PackageInfo(int year, int serialNumber) {

        /**
         * Returns the package identifier in YYYYSSSSS format.
         */
        public String getIdentifier() {
            return String.format("%04d%05d", year, serialNumber);
        }
    }

    /**
     * Download result: the package entity, the outcome status, the extracted XML files
     * (empty unless successful) and the error (only for failures).
     */
    public record DownloadResult(
            TedDailyPackage packageEntity,
            Status status,
            List<Path> xmlFiles,
            Exception error
    ) {
        /** Possible outcomes of a download attempt. */
        public enum Status {
            SUCCESS,
            ALREADY_EXISTS,
            NOT_FOUND,
            DUPLICATE,
            FAILED
        }

        /** Creates a SUCCESS result carrying the extracted XML files. */
        public static DownloadResult success(TedDailyPackage pkg, List<Path> files) {
            return new DownloadResult(pkg, Status.SUCCESS, files, null);
        }

        /** Creates an ALREADY_EXISTS result. */
        public static DownloadResult alreadyExists(TedDailyPackage pkg) {
            return new DownloadResult(pkg, Status.ALREADY_EXISTS, List.of(), null);
        }

        /** Creates a NOT_FOUND result. */
        public static DownloadResult notFound(TedDailyPackage pkg) {
            return new DownloadResult(pkg, Status.NOT_FOUND, List.of(), null);
        }

        /** Creates a DUPLICATE result. */
        public static DownloadResult duplicate(TedDailyPackage pkg) {
            return new DownloadResult(pkg, Status.DUPLICATE, List.of(), null);
        }

        /** Creates a FAILED result carrying the causing exception. */
        public static DownloadResult failed(TedDailyPackage pkg, Exception e) {
            return new DownloadResult(pkg, Status.FAILED, List.of(), e);
        }

        /** Returns {@code true} if the status is SUCCESS. */
        public boolean isSuccess() {
            return status == Status.SUCCESS;
        }
    }
}