// Repository path: DIP/src/main/java/at/procon/ted/service/TedPackageDownloadService.java

package at.procon.ted.service;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.TedDailyPackageRepository;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.security.MessageDigest;
import java.time.Duration;
import java.time.LocalDate;
import java.time.OffsetDateTime;
import java.time.Year;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
/**
* Service for downloading and processing TED Daily Packages.
*
* Features:
* - Automatic download from https://ted.europa.eu/packages/daily/
* - Hash-based idempotency check
* - tar.gz extraction
* - Integration with XML processing
*
* @author Martin.Schweitzer@procon.co.at and claude.ai
*/
@Service
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@RequiredArgsConstructor
@Slf4j
public class TedPackageDownloadService {
private final TedProcessorProperties properties;
private final TedDailyPackageRepository packageRepository;
/**
 * Builds the package identifier in {@code YYYYSSSSS} form: the year zero-padded
 * to four digits followed by the serial number zero-padded to five digits.
 *
 * @param year         year component of the identifier
 * @param serialNumber serial number component of the identifier
 * @return the identifier, e.g. {@code 202400005} for (2024, 5)
 */
public String generatePackageIdentifier(int year, int serialNumber) {
    // Locale.ROOT pins ASCII digits. Without an explicit locale, %d is rendered
    // with the default locale's own digits (e.g. Arabic-Indic), which would
    // corrupt identifiers that are used in download URLs and repository lookups.
    return String.format(java.util.Locale.ROOT, "%04d%05d", year, serialNumber);
}
/**
 * Selects the next package that should be downloaded, newest data first.
 *
 * Order of evaluation:
 * 1. Forward crawl of the current year (serial after the highest known one).
 * 2. Walk from the current year back to the configured start year; per year,
 *    first fill a gap below the lowest known serial, otherwise continue the
 *    forward crawl if the year is not complete yet.
 * 3. Open the year preceding the oldest year that has data, as long as that
 *    oldest year lies after the configured start year.
 *
 * @return the next package to fetch, or {@code null} when nothing is left to do
 */
public PackageInfo getNextPackageToDownload() {
    final int thisYear = Year.now().getValue();
    log.debug("Determining next package to download (current year: {})", thisYear);

    // Step 1: the current year always gets the first chance.
    PackageInfo candidate = getNextForwardPackage(thisYear);
    if (candidate != null) {
        log.info("Next package: {} (CURRENT YEAR {} forward - newest data first!)",
                candidate.getIdentifier(), thisYear);
        return candidate;
    }
    log.debug("Current year {} complete or has 404, checking older years backward", thisYear);

    // Step 2: scan backward, year by year, for gaps and unfinished tails.
    int year = thisYear;
    while (year >= properties.getDownload().getStartYear()) {
        candidate = getGapFillerPackage(year);
        if (candidate != null) {
            log.info("Next package: {} (filling gap in year {})", candidate.getIdentifier(), year);
            return candidate;
        }
        if (isYearComplete(year)) {
            log.debug("Year {} is complete", year);
        } else {
            candidate = getNextForwardPackage(year);
            if (candidate != null) {
                log.info("Next package: {} (forward in year {})", candidate.getIdentifier(), year);
                return candidate;
            }
        }
        year--;
    }

    // Step 3: open the year before the oldest one that has data.
    // oldestYear > startYear already implies (oldestYear - 1) >= startYear.
    // NOTE(review): step 2 proposes serial 1 for every in-range year without data,
    // so this branch looks unreachable in a single-threaded run - confirm.
    int oldestYear = getOldestYearWithData();
    if (oldestYear > properties.getDownload().getStartYear()) {
        PackageInfo opened = new PackageInfo(oldestYear - 1, 1);
        log.info("Next package: {} (opening new year {})", opened.getIdentifier(), opened.year());
        return opened;
    }

    log.info("All years from {} to {} are complete - no more packages",
            properties.getDownload().getStartYear(), thisYear);
    return null;
}
/**
 * Determines the forward-crawl candidate for a year, i.e. the serial following
 * the latest stored package.
 *
 * Outcomes:
 * - no package stored for the year: serial 1
 * - latest package is not NOT_FOUND: latest serial + 1
 * - latest package is NOT_FOUND and its retry is due: the same serial again
 * - latest package is NOT_FOUND and either still waiting for its retry time or
 *   no longer retryable: {@code null}
 *
 * @param year the year to inspect
 * @return the package to try next, or {@code null} if nothing should be tried now
 */
private PackageInfo getNextForwardPackage(int year) {
    TedDailyPackage tail = packageRepository.findLatestByYear(year).orElse(null);
    if (tail == null) {
        log.debug("Year {} has no packages, starting with 1", year);
        return new PackageInfo(year, 1);
    }

    if (tail.getDownloadStatus() != TedDailyPackage.DownloadStatus.NOT_FOUND) {
        // Regular case: continue right behind the highest known serial.
        log.debug("Year {} continues from package {} to {}", year, tail.getSerialNumber(), tail.getSerialNumber() + 1);
        return new PackageInfo(year, tail.getSerialNumber() + 1);
    }

    // From here on the tail is a NOT_FOUND marker.
    if (shouldRetryNotFoundPackage(tail)) {
        log.info("Retrying tail NOT_FOUND package {} for year {}", tail.getPackageIdentifier(), year);
        return new PackageInfo(year, tail.getSerialNumber());
    }

    if (isNotFoundRetryableForYear(tail)) {
        log.debug("Year {} waiting until {} before retrying tail package {}",
                year, calculateNextRetryAt(tail), tail.getPackageIdentifier());
    } else {
        log.debug("Year {} finalized after grace period at tail package {}",
                year, tail.getPackageIdentifier());
    }
    return null;
}
/**
 * Looks for a gap below the lowest stored serial of a year.
 *
 * @param year the year to inspect
 * @return the package directly below the lowest stored serial, or {@code null}
 *         when the year has no packages or its lowest serial is already 1 or less
 */
private PackageInfo getGapFillerPackage(int year) {
    Optional<TedDailyPackage> earliest = packageRepository.findFirstByYear(year);
    if (earliest.isPresent()) {
        int lowestSerial = earliest.get().getSerialNumber();
        if (lowestSerial > 1) {
            // Step one serial down towards 1.
            return new PackageInfo(year, lowestSerial - 1);
        }
    }
    return null;
}
/**
 * Tells whether a year needs no further downloads.
 *
 * All of the following must hold:
 * - the year has stored packages and the lowest serial is 1,
 * - the latest stored package has status NOT_FOUND,
 * - that NOT_FOUND is no longer retryable for the year.
 *
 * @param year the year to inspect
 * @return {@code true} if the year is considered complete
 */
private boolean isYearComplete(int year) {
    Optional<TedDailyPackage> earliest = packageRepository.findFirstByYear(year);
    Optional<TedDailyPackage> newest = packageRepository.findLatestByYear(year);

    boolean startsAtOne = earliest.isPresent()
            && newest.isPresent()
            && earliest.get().getSerialNumber() == 1;
    if (!startsAtOne) {
        return false;
    }

    TedDailyPackage tail = newest.get();
    boolean tailMissing = tail.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND;
    return tailMissing && !isNotFoundRetryableForYear(tail);
}
/**
 * Tells whether a NOT_FOUND package should be attempted again right now.
 *
 * @param pkg the package whose last attempt ended in NOT_FOUND
 * @return {@code true} if the package is still retryable for its year and its
 *         next retry time is not after the current time
 */
private boolean shouldRetryNotFoundPackage(TedDailyPackage pkg) {
    return isNotFoundRetryableForYear(pkg)
            && !calculateNextRetryAt(pkg).isAfter(OffsetDateTime.now());
}
/**
 * Tells whether a NOT_FOUND result for the given package may still be retried.
 *
 * A package without a year is treated as belonging to the current year. For the
 * current year (or later) the configured "retry indefinitely" flag decides; for
 * earlier years a retry is allowed while the current time is before the year's
 * grace deadline.
 *
 * @param pkg the package to evaluate
 * @return {@code true} if a retry is still permitted
 */
private boolean isNotFoundRetryableForYear(TedDailyPackage pkg) {
    final int thisYear = Year.now().getValue();
    Integer declaredYear = pkg.getYear();
    int effectiveYear = (declaredYear == null) ? thisYear : declaredYear;

    if (effectiveYear < thisYear) {
        OffsetDateTime deadline = getYearRetryGraceDeadline(effectiveYear);
        return OffsetDateTime.now().isBefore(deadline);
    }
    return properties.getDownload().isRetryCurrentYearNotFoundIndefinitely();
}
/**
 * Computes the earliest time at which the package may be tried again.
 *
 * The last attempt is taken from the update timestamp, falling back to the
 * creation timestamp and finally to the current time; the configured
 * not-found retry interval (milliseconds) is added to it.
 *
 * @param pkg the package to evaluate
 * @return last attempt time plus the retry interval
 */
private OffsetDateTime calculateNextRetryAt(TedDailyPackage pkg) {
    OffsetDateTime lastAttemptAt = pkg.getUpdatedAt();
    if (lastAttemptAt == null) {
        lastAttemptAt = pkg.getCreatedAt();
    }
    if (lastAttemptAt == null) {
        lastAttemptAt = OffsetDateTime.now();
    }
    Duration retryInterval = Duration.ofMillis(properties.getDownload().getNotFoundRetryInterval());
    return lastAttemptAt.plus(retryInterval);
}
/**
 * Computes the point in time until which NOT_FOUND packages of a past year
 * remain retryable: midnight UTC on 1 January of the following year plus the
 * configured grace period in days.
 *
 * @param year the package year
 * @return the grace deadline for that year
 */
private OffsetDateTime getYearRetryGraceDeadline(int year) {
    OffsetDateTime startOfFollowingYear =
            OffsetDateTime.of(year + 1, 1, 1, 0, 0, 0, 0, ZoneOffset.UTC);
    return startOfFollowingYear.plusDays(properties.getDownload().getPreviousYearGracePeriodDays());
}
/**
 * Finds the earliest year, between the configured start year and the current
 * year, for which the repository holds at least one package.
 *
 * @return that year, or the current year if no year in the range has data
 */
private int getOldestYearWithData() {
    final int thisYear = Year.now().getValue();
    int candidate = properties.getDownload().getStartYear();
    // Ascending scan: the first hit is the oldest year with data.
    while (candidate <= thisYear) {
        if (packageRepository.findLatestByYear(candidate).isPresent()) {
            return candidate;
        }
        candidate++;
    }
    return thisYear;
}
/**
 * Downloads one daily package, tracks its state in the repository and extracts
 * the contained XML files.
 *
 * Flow:
 * 1. Look the package up by identifier. An existing entry is only re-attempted
 *    when it is NOT_FOUND and still retryable; otherwise it is returned as
 *    "already exists".
 * 2. Mark the entry DOWNLOADING and fetch the archive. A 404 marks it NOT_FOUND.
 * 3. Hash the archive; if another stored package has the same hash, delete the
 *    download and mark this entry COMPLETED as a duplicate.
 * 4. Store hash, timestamp and duration, extract the XML files and store their
 *    count; optionally delete the archive.
 *
 * Any exception is caught, logged, recorded as FAILED and returned in the result.
 *
 * @param year         year of the package
 * @param serialNumber serial number of the package within the year
 * @return the outcome of the attempt; never thrown as an exception
 */
@Transactional
public DownloadResult downloadPackage(int year, int serialNumber) {
    String packageId = generatePackageIdentifier(year, serialNumber);
    log.debug("Starting download of package: {}", packageId);

    // Check whether the package is already known.
    Optional<TedDailyPackage> existing = packageRepository.findByPackageIdentifier(packageId);
    String downloadUrl = buildDownloadUrl(packageId);
    TedDailyPackage packageEntity;
    if (existing.isPresent()) {
        TedDailyPackage existingPackage = existing.get();
        if (existingPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND
                && isNotFoundRetryableForYear(existingPackage)) {
            log.info("Retrying previously NOT_FOUND package: {}", packageId);
            existingPackage.setDownloadUrl(downloadUrl);
            existingPackage.setErrorMessage(null);
            existingPackage.setDownloadStatus(TedDailyPackage.DownloadStatus.PENDING);
            packageEntity = packageRepository.save(existingPackage);
        } else {
            log.debug("Package {} already exists with status: {}", packageId, existingPackage.getDownloadStatus());
            return DownloadResult.alreadyExists(existingPackage);
        }
    } else {
        // Create the package entry.
        packageEntity = TedDailyPackage.builder()
                .packageIdentifier(packageId)
                .year(year)
                .serialNumber(serialNumber)
                .downloadUrl(downloadUrl)
                .downloadStatus(TedDailyPackage.DownloadStatus.PENDING)
                .build();
        packageEntity = packageRepository.save(packageEntity);
    }

    long startTime = System.currentTimeMillis();
    try {
        updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.DOWNLOADING, null);

        // Fetch the tar.gz file.
        Path downloadPath = downloadFile(downloadUrl, packageId);
        // Measure right after the transfer so hashing and the duplicate lookup
        // below do not inflate the stored download duration.
        long downloadDuration = System.currentTimeMillis() - startTime;
        if (downloadPath == null) {
            // 404 - the package does not exist.
            updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.NOT_FOUND,
                    "Package not found (404)");
            return DownloadResult.notFound(packageEntity);
        }

        String fileHash = calculateSHA256(downloadPath);

        // Detect duplicates via the content hash.
        // NOTE(review): this loads every stored package to compare hashes; a
        // repository query by file hash would avoid the full scan.
        Optional<TedDailyPackage> duplicateByHash = packageRepository.findAll().stream()
                .filter(p -> fileHash.equals(p.getFileHash()))
                .findFirst();
        if (duplicateByHash.isPresent()) {
            log.debug("Duplicate package detected via hash: {} = {}", packageId, duplicateByHash.get().getPackageIdentifier());
            cleanupDownload(downloadPath);
            updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.COMPLETED,
                    "Duplicate of " + duplicateByHash.get().getPackageIdentifier());
            return DownloadResult.duplicate(packageEntity);
        }

        // Record the successful download.
        packageEntity = packageRepository.findById(packageEntity.getId()).orElseThrow();
        packageEntity.setFileHash(fileHash);
        packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADED);
        packageEntity.setDownloadedAt(OffsetDateTime.now());
        packageEntity.setDownloadDurationMs(downloadDuration);
        packageEntity = packageRepository.save(packageEntity);

        // Extract the XML files.
        List<Path> xmlFiles = extractTarGz(downloadPath, packageId);
        packageEntity.setXmlFileCount(xmlFiles.size());
        packageEntity = packageRepository.save(packageEntity);

        // Remove the tar.gz if configured.
        if (properties.getDownload().isDeleteAfterExtraction()) {
            cleanupDownload(downloadPath);
        }
        log.debug("Successfully downloaded package {}: {} XML files", packageId, xmlFiles.size());
        return DownloadResult.success(packageEntity, xmlFiles);
    } catch (Exception e) {
        log.error("Failed to download package {}: {}", packageId, e.getMessage(), e);
        // Some exceptions carry no message; updatePackageStatus skips a null
        // message, so fall back to toString() to keep a diagnostic on the entry.
        String errorMessage = e.getMessage() != null ? e.getMessage() : e.toString();
        updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.FAILED,
                errorMessage);
        return DownloadResult.failed(packageEntity, e);
    }
}
/**
 * Composes the download URL for a package by appending the package identifier
 * to the configured base URL.
 *
 * @param packageId the package identifier
 * @return base URL followed by the identifier
 */
private String buildDownloadUrl(String packageId) {
    String baseUrl = properties.getDownload().getBaseUrl();
    return baseUrl + packageId;
}
/**
 * Downloads a package archive via HTTP GET into the configured download
 * directory as {@code <packageId>.tar.gz}, replacing an existing file.
 *
 * @param urlString the URL to fetch
 * @param packageId the package identifier, used for the target file name
 * @return the path of the downloaded file, or {@code null} if the server answered 404
 * @throws IOException if the server answers with any status other than 200/404
 *                     or the transfer fails
 */
private Path downloadFile(String urlString, String packageId) throws IOException {
    URL url = URI.create(urlString).toURL();
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    try {
        connection.setRequestMethod("GET");
        // Clamp instead of a bare narrowing cast, which would wrap large values
        // into negative timeouts.
        int timeoutMs = (int) Math.min(Integer.MAX_VALUE, properties.getDownload().getDownloadTimeout());
        connection.setConnectTimeout(timeoutMs);
        connection.setReadTimeout(timeoutMs);
        connection.setInstanceFollowRedirects(true);

        int responseCode = connection.getResponseCode();
        if (responseCode == HttpURLConnection.HTTP_NOT_FOUND) {
            log.info("Package not found (404): {}", urlString);
            return null;
        }
        if (responseCode != HttpURLConnection.HTTP_OK) {
            throw new IOException("HTTP " + responseCode + " for URL: " + urlString);
        }

        // Make sure the download directory exists.
        Path downloadDir = Paths.get(properties.getDownload().getDownloadDirectory());
        Files.createDirectories(downloadDir);

        Path targetPath = downloadDir.resolve(packageId + ".tar.gz");
        try (InputStream in = connection.getInputStream()) {
            Files.copy(in, targetPath, StandardCopyOption.REPLACE_EXISTING);
        } catch (IOException e) {
            // Do not leave a truncated archive behind after a failed transfer.
            try {
                Files.deleteIfExists(targetPath);
            } catch (IOException cleanupFailure) {
                e.addSuppressed(cleanupFailure);
            }
            throw e;
        }
        log.debug("Downloaded {} bytes to {}", Files.size(targetPath), targetPath);
        return targetPath;
    } finally {
        // Release the connection on every path, including 404 and errors.
        connection.disconnect();
    }
}
/**
 * Computes the SHA-256 digest of a file.
 *
 * @param file the file to hash
 * @return the digest as a 64-character lowercase hexadecimal string
 * @throws IOException           if the file cannot be read
 * @throws IllegalStateException if the runtime offers no SHA-256 implementation
 */
private String calculateSHA256(Path file) throws IOException {
    MessageDigest digest;
    try {
        digest = MessageDigest.getInstance("SHA-256");
    } catch (java.security.NoSuchAlgorithmException e) {
        throw new IllegalStateException("SHA-256 digest is not available", e);
    }

    try (InputStream is = Files.newInputStream(file)) {
        byte[] buffer = new byte[8192];
        int read;
        // -1 is the end-of-stream sentinel of InputStream.read(byte[]).
        while ((read = is.read(buffer)) != -1) {
            digest.update(buffer, 0, read);
        }
    }

    byte[] hashBytes = digest.digest();
    StringBuilder hex = new StringBuilder(hashBytes.length * 2);
    for (byte b : hashBytes) {
        // Two lowercase hex digits per byte, high nibble first.
        hex.append(Character.forDigit((b >> 4) & 0xF, 16));
        hex.append(Character.forDigit(b & 0xF, 16));
    }
    return hex.toString();
}
/**
 * Extracts all {@code .xml} entries of a tar.gz archive into
 * {@code <extractDirectory>/<packageId>}.
 *
 * Directory structure inside the archive is not preserved: each entry is
 * written under its bare file name. Directory entries and entries whose name
 * does not end in {@code .xml} (case-insensitive) are skipped.
 *
 * @param tarGzFile the archive to read
 * @param packageId the package identifier, used as the extraction sub-directory
 * @return the paths of the extracted XML files, in archive order
 * @throws IOException if reading the archive or writing a file fails, or if an
 *                     entry would be written outside the extraction directory
 */
private List<Path> extractTarGz(Path tarGzFile, String packageId) throws IOException {
    List<Path> xmlFiles = new ArrayList<>();
    Path extractDir = Paths.get(properties.getDownload().getExtractDirectory())
            .resolve(packageId);
    Files.createDirectories(extractDir);
    Path extractRoot = extractDir.normalize();

    // Buffer the file stream; the gzip decoder otherwise reads it in small chunks.
    try (InputStream fileIn = new BufferedInputStream(Files.newInputStream(tarGzFile));
         GzipCompressorInputStream gzis = new GzipCompressorInputStream(fileIn);
         TarArchiveInputStream tais = new TarArchiveInputStream(gzis)) {
        TarArchiveEntry entry;
        while ((entry = tais.getNextTarEntry()) != null) {
            if (entry.isDirectory()) {
                continue;
            }
            String name = entry.getName();
            // Locale.ROOT keeps the suffix check independent of the default locale.
            if (!name.toLowerCase(java.util.Locale.ROOT).endsWith(".xml")) {
                continue;
            }

            // Entry names come from a downloaded archive: keep only the file name
            // and verify the target stays inside the extraction directory.
            Path outputPath = extractDir.resolve(new File(name).getName());
            if (!outputPath.normalize().startsWith(extractRoot)) {
                throw new IOException("Archive entry escapes extraction directory: " + name);
            }

            try (OutputStream os = Files.newOutputStream(outputPath)) {
                // Copies the current entry's bytes until the entry is exhausted.
                tais.transferTo(os);
            }
            xmlFiles.add(outputPath);
        }
    }
    log.debug("Extracted {} XML files from {}", xmlFiles.size(), tarGzFile.getFileName());
    return xmlFiles;
}
/**
 * Sets the download status of a stored package and saves it.
 *
 * Does nothing when no package with the given id exists. The error message is
 * only overwritten when a non-null message is passed.
 *
 * @param packageId    id of the package entity
 * @param status       the status to store
 * @param errorMessage message to store, or {@code null} to leave the current one
 */
private void updatePackageStatus(java.util.UUID packageId, TedDailyPackage.DownloadStatus status, String errorMessage) {
    Optional<TedDailyPackage> lookup = packageRepository.findById(packageId);
    if (lookup.isEmpty()) {
        return;
    }
    TedDailyPackage stored = lookup.get();
    stored.setDownloadStatus(status);
    if (errorMessage != null) {
        stored.setErrorMessage(errorMessage);
    }
    packageRepository.save(stored);
}
/**
 * Deletes a downloaded file if it exists. A failure to delete is logged as a
 * warning and otherwise ignored.
 *
 * @param file the file to remove
 */
private void cleanupDownload(Path file) {
    try {
        Files.deleteIfExists(file);
    } catch (IOException deleteFailure) {
        log.warn("Failed to delete file {}: {}", file, deleteFailure.getMessage());
        return;
    }
    log.debug("Cleaned up download: {}", file);
}
/**
 * Identifies one daily package by year and serial number.
 *
 * @param year         year of the package
 * @param serialNumber serial number of the package within the year
 */
public record PackageInfo(int year, int serialNumber) {
    /**
     * Returns the identifier in {@code YYYYSSSSS} form: the year zero-padded to
     * four digits followed by the serial number zero-padded to five digits.
     *
     * @return the identifier, e.g. {@code 202400005} for (2024, 5)
     */
    public String getIdentifier() {
        // Locale.ROOT pins ASCII digits; the default locale may render %d with
        // locale-specific digits, which would yield an unusable identifier.
        return String.format(java.util.Locale.ROOT, "%04d%05d", year, serialNumber);
    }
}
/**
 * Outcome of a single package download attempt.
 *
 * @param packageEntity the package entity the attempt refers to
 * @param status        classification of the outcome
 * @param xmlFiles      extracted XML files; empty for every non-success factory
 * @param error         the exception of a failed attempt, otherwise {@code null}
 */
public record DownloadResult(
        TedDailyPackage packageEntity,
        Status status,
        List<Path> xmlFiles,
        Exception error
) {
    /** Possible outcomes of a download attempt. */
    public enum Status {
        SUCCESS,
        ALREADY_EXISTS,
        NOT_FOUND,
        DUPLICATE,
        FAILED
    }

    /** Creates a SUCCESS result carrying the extracted files. */
    public static DownloadResult success(TedDailyPackage pkg, List<Path> files) {
        return new DownloadResult(pkg, Status.SUCCESS, files, null);
    }

    /** Creates an ALREADY_EXISTS result without files. */
    public static DownloadResult alreadyExists(TedDailyPackage pkg) {
        return withoutFiles(pkg, Status.ALREADY_EXISTS, null);
    }

    /** Creates a NOT_FOUND result without files. */
    public static DownloadResult notFound(TedDailyPackage pkg) {
        return withoutFiles(pkg, Status.NOT_FOUND, null);
    }

    /** Creates a DUPLICATE result without files. */
    public static DownloadResult duplicate(TedDailyPackage pkg) {
        return withoutFiles(pkg, Status.DUPLICATE, null);
    }

    /** Creates a FAILED result without files, carrying the causing exception. */
    public static DownloadResult failed(TedDailyPackage pkg, Exception e) {
        return withoutFiles(pkg, Status.FAILED, e);
    }

    /** Shared construction for all outcomes that carry an empty file list. */
    private static DownloadResult withoutFiles(TedDailyPackage pkg, Status outcome, Exception cause) {
        return new DownloadResult(pkg, outcome, List.of(), cause);
    }

    /** @return {@code true} only for a SUCCESS result */
    public boolean isSuccess() {
        return Status.SUCCESS.equals(status);
    }
}
}