You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
562 lines
21 KiB
Java
562 lines
21 KiB
Java
package at.procon.ted.service;
|
|
|
|
import at.procon.ted.config.TedProcessorProperties;
|
|
import at.procon.ted.model.entity.TedDailyPackage;
|
|
import at.procon.ted.repository.TedDailyPackageRepository;
|
|
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
|
import at.procon.dip.runtime.config.RuntimeMode;
|
|
import lombok.RequiredArgsConstructor;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import org.springframework.stereotype.Service;
|
|
import org.springframework.transaction.annotation.Transactional;
|
|
|
|
import java.io.*;
|
|
import java.net.HttpURLConnection;
|
|
import java.net.URI;
|
|
import java.net.URL;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.nio.file.Paths;
|
|
import java.nio.file.StandardCopyOption;
|
|
import java.security.MessageDigest;
|
|
import java.time.Duration;
|
|
import java.time.LocalDate;
|
|
import java.time.OffsetDateTime;
|
|
import java.time.Year;
|
|
import java.time.ZoneOffset;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.Optional;
|
|
import java.util.stream.Stream;
|
|
|
|
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
|
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
|
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
|
|
|
/**
|
|
* Service for downloading and processing TED Daily Packages.
|
|
*
|
|
* Features:
|
|
* - Automatic download from https://ted.europa.eu/packages/daily/
|
|
* - Hash-based idempotency check
|
|
* - tar.gz extraction
|
|
* - Integration with XML processing
|
|
*
|
|
* @author Martin.Schweitzer@procon.co.at and claude.ai
|
|
*/
|
|
@Service
|
|
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
|
|
@RequiredArgsConstructor
|
|
@Slf4j
|
|
public class TedPackageDownloadService {
|
|
|
|
private final TedProcessorProperties properties;
|
|
private final TedDailyPackageRepository packageRepository;
|
|
|
|
/**
|
|
* Generates package identifier in YYYYSSSSS format.
|
|
*/
|
|
public String generatePackageIdentifier(int year, int serialNumber) {
|
|
return String.format("%04d%05d", year, serialNumber);
|
|
}
|
|
|
|
/**
|
|
* Determines the next package to download.
|
|
*
|
|
* Strategy (starts with current year FIRST, then goes backward):
|
|
* 1. CURRENT YEAR (2026): Forward from max(nr) until 404 (get today's packages first!)
|
|
* 2. All years (2026 -> 2025 -> 2024...): Fill gaps (if min(nr) > 1, then backward to 1)
|
|
* 3. If current year complete (min=1 and 404 after max) -> previous year
|
|
* 4. Repeat backward until startYear
|
|
*
|
|
* This ensures we always get the newest data first!
|
|
*/
|
|
public PackageInfo getNextPackageToDownload() {
|
|
int currentYear = Year.now().getValue();
|
|
|
|
log.debug("Determining next package to download (current year: {})", currentYear);
|
|
|
|
// 1. PRIORITY: Current year forward crawling (max+1) - GET TODAY'S PACKAGES FIRST!
|
|
PackageInfo nextInCurrentYear = getNextForwardPackage(currentYear);
|
|
if (nextInCurrentYear != null) {
|
|
log.info("Next package: {} (CURRENT YEAR {} forward - newest data first!)",
|
|
nextInCurrentYear.getIdentifier(), currentYear);
|
|
return nextInCurrentYear;
|
|
}
|
|
|
|
log.debug("Current year {} complete or has 404, checking older years backward", currentYear);
|
|
|
|
// 2. Go through all years BACKWARD (current year -> startYear) for gaps and completion
|
|
for (int year = currentYear; year >= properties.getDownload().getStartYear(); year--) {
|
|
// 2a. Check if there are gaps (min > 1)
|
|
PackageInfo gapFiller = getGapFillerPackage(year);
|
|
if (gapFiller != null) {
|
|
log.info("Next package: {} (filling gap in year {})", gapFiller.getIdentifier(), year);
|
|
return gapFiller;
|
|
}
|
|
|
|
// 2b. If no gap filler, check if year is complete
|
|
if (!isYearComplete(year)) {
|
|
// Year not complete, try forward crawling
|
|
PackageInfo forwardPackage = getNextForwardPackage(year);
|
|
if (forwardPackage != null) {
|
|
log.info("Next package: {} (forward in year {})", forwardPackage.getIdentifier(), year);
|
|
return forwardPackage;
|
|
}
|
|
} else {
|
|
log.debug("Year {} is complete", year);
|
|
}
|
|
}
|
|
|
|
// 3. Check if we can open a new previous year
|
|
int oldestYear = getOldestYearWithData();
|
|
if (oldestYear > properties.getDownload().getStartYear()) {
|
|
int previousYear = oldestYear - 1;
|
|
if (previousYear >= properties.getDownload().getStartYear()) {
|
|
// Open new year, start with 1
|
|
log.info("Next package: {} (opening new year {})", String.format("%04d%05d", previousYear, 1), previousYear);
|
|
return new PackageInfo(previousYear, 1);
|
|
}
|
|
}
|
|
|
|
log.info("All years from {} to {} are complete - no more packages",
|
|
properties.getDownload().getStartYear(), currentYear);
|
|
return null; // No more packages
|
|
}
|
|
|
|
/**
|
|
* Finds the next package for forward crawling (max+1).
|
|
*
|
|
* Stronger NOT_FOUND handling:
|
|
* - Current year: a tail 404 is treated as "not available yet" and retried indefinitely
|
|
* - Older years: a tail 404 remains retryable until the configured grace period after year end expires
|
|
* - Final year completion is only assumed after that grace period
|
|
*/
|
|
private PackageInfo getNextForwardPackage(int year) {
|
|
Optional<TedDailyPackage> latest = packageRepository.findLatestByYear(year);
|
|
|
|
if (latest.isEmpty()) {
|
|
// No package for this year -> start with 1
|
|
log.debug("Year {} has no packages, starting with 1", year);
|
|
return new PackageInfo(year, 1);
|
|
}
|
|
|
|
TedDailyPackage latestPackage = latest.get();
|
|
|
|
if (latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND) {
|
|
if (shouldRetryNotFoundPackage(latestPackage)) {
|
|
log.info("Retrying tail NOT_FOUND package {} for year {}", latestPackage.getPackageIdentifier(), year);
|
|
return new PackageInfo(year, latestPackage.getSerialNumber());
|
|
}
|
|
|
|
if (isNotFoundRetryableForYear(latestPackage)) {
|
|
log.debug("Year {} waiting until {} before retrying tail package {}",
|
|
year, calculateNextRetryAt(latestPackage), latestPackage.getPackageIdentifier());
|
|
return null;
|
|
}
|
|
|
|
log.debug("Year {} finalized after grace period at tail package {}",
|
|
year, latestPackage.getPackageIdentifier());
|
|
return null;
|
|
}
|
|
|
|
// Next package (max+1)
|
|
log.debug("Year {} continues from package {} to {}", year, latestPackage.getSerialNumber(), latestPackage.getSerialNumber() + 1);
|
|
return new PackageInfo(year, latestPackage.getSerialNumber() + 1);
|
|
}
|
|
|
|
/**
|
|
* Finds package for gap filling (min-1).
|
|
* Returns null if no gap exists (min = 1).
|
|
*/
|
|
private PackageInfo getGapFillerPackage(int year) {
|
|
Optional<TedDailyPackage> first = packageRepository.findFirstByYear(year);
|
|
|
|
if (first.isEmpty()) {
|
|
// No package for this year
|
|
return null;
|
|
}
|
|
|
|
int minSerial = first.get().getSerialNumber();
|
|
if (minSerial <= 1) {
|
|
// No gap, already starts at 1
|
|
return null;
|
|
}
|
|
|
|
// Gap found: Get (min-1)
|
|
return new PackageInfo(year, minSerial - 1);
|
|
}
|
|
|
|
/**
|
|
* Checks if a year is complete.
|
|
* A year is complete only when:
|
|
* - package numbering starts at 1, and
|
|
* - the current tail package is NOT_FOUND, and
|
|
* - that NOT_FOUND is no longer retryable (grace period expired)
|
|
*/
|
|
private boolean isYearComplete(int year) {
|
|
Optional<TedDailyPackage> first = packageRepository.findFirstByYear(year);
|
|
Optional<TedDailyPackage> latest = packageRepository.findLatestByYear(year);
|
|
|
|
if (first.isEmpty() || latest.isEmpty()) {
|
|
return false;
|
|
}
|
|
|
|
if (first.get().getSerialNumber() != 1) {
|
|
return false;
|
|
}
|
|
|
|
TedDailyPackage latestPackage = latest.get();
|
|
return latestPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND
|
|
&& !isNotFoundRetryableForYear(latestPackage);
|
|
}
|
|
|
|
|
|
private boolean shouldRetryNotFoundPackage(TedDailyPackage pkg) {
|
|
if (!isNotFoundRetryableForYear(pkg)) {
|
|
return false;
|
|
}
|
|
|
|
OffsetDateTime nextRetryAt = calculateNextRetryAt(pkg);
|
|
return !nextRetryAt.isAfter(OffsetDateTime.now());
|
|
}
|
|
|
|
private boolean isNotFoundRetryableForYear(TedDailyPackage pkg) {
|
|
int currentYear = Year.now().getValue();
|
|
int packageYear = pkg.getYear() != null ? pkg.getYear() : currentYear;
|
|
|
|
if (packageYear >= currentYear) {
|
|
return properties.getDownload().isRetryCurrentYearNotFoundIndefinitely();
|
|
}
|
|
|
|
return OffsetDateTime.now().isBefore(getYearRetryGraceDeadline(packageYear));
|
|
}
|
|
|
|
private OffsetDateTime calculateNextRetryAt(TedDailyPackage pkg) {
|
|
OffsetDateTime lastAttemptAt = pkg.getUpdatedAt() != null
|
|
? pkg.getUpdatedAt()
|
|
: (pkg.getCreatedAt() != null ? pkg.getCreatedAt() : OffsetDateTime.now());
|
|
|
|
return lastAttemptAt.plus(Duration.ofMillis(properties.getDownload().getNotFoundRetryInterval()));
|
|
}
|
|
|
|
private OffsetDateTime getYearRetryGraceDeadline(int year) {
|
|
return LocalDate.of(year + 1, 1, 1)
|
|
.atStartOfDay()
|
|
.atOffset(ZoneOffset.UTC)
|
|
.plusDays(properties.getDownload().getPreviousYearGracePeriodDays());
|
|
}
|
|
|
|
/**
|
|
* Finds the oldest year for which we have data.
|
|
*/
|
|
private int getOldestYearWithData() {
|
|
// Start from startYear and go forward to find the first year with data
|
|
int currentYear = Year.now().getValue();
|
|
for (int year = properties.getDownload().getStartYear(); year <= currentYear; year++) {
|
|
if (packageRepository.findLatestByYear(year).isPresent()) {
|
|
// Found the oldest year with data
|
|
return year;
|
|
}
|
|
}
|
|
return currentYear;
|
|
}
|
|
|
|
/**
|
|
* Lädt ein Package herunter und verarbeitet es.
|
|
*/
|
|
@Transactional
|
|
public DownloadResult downloadPackage(int year, int serialNumber) {
|
|
String packageId = generatePackageIdentifier(year, serialNumber);
|
|
|
|
log.debug("Starting download of package: {}", packageId);
|
|
|
|
// Prüfe ob Package bereits existiert
|
|
Optional<TedDailyPackage> existing = packageRepository.findByPackageIdentifier(packageId);
|
|
|
|
String downloadUrl = buildDownloadUrl(packageId);
|
|
TedDailyPackage packageEntity;
|
|
|
|
if (existing.isPresent()) {
|
|
TedDailyPackage existingPackage = existing.get();
|
|
|
|
if (existingPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND
|
|
&& isNotFoundRetryableForYear(existingPackage)) {
|
|
log.info("Retrying previously NOT_FOUND package: {}", packageId);
|
|
existingPackage.setDownloadUrl(downloadUrl);
|
|
existingPackage.setErrorMessage(null);
|
|
existingPackage.setDownloadStatus(TedDailyPackage.DownloadStatus.PENDING);
|
|
packageEntity = packageRepository.save(existingPackage);
|
|
} else {
|
|
log.debug("Package {} already exists with status: {}", packageId, existingPackage.getDownloadStatus());
|
|
return DownloadResult.alreadyExists(existingPackage);
|
|
}
|
|
} else {
|
|
// Erstelle Package-Eintrag
|
|
packageEntity = TedDailyPackage.builder()
|
|
.packageIdentifier(packageId)
|
|
.year(year)
|
|
.serialNumber(serialNumber)
|
|
.downloadUrl(downloadUrl)
|
|
.downloadStatus(TedDailyPackage.DownloadStatus.PENDING)
|
|
.build();
|
|
|
|
packageEntity = packageRepository.save(packageEntity);
|
|
}
|
|
|
|
long startTime = System.currentTimeMillis();
|
|
|
|
try {
|
|
// Update Status: DOWNLOADING
|
|
updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.DOWNLOADING, null);
|
|
|
|
// Download tar.gz file
|
|
Path downloadPath = downloadFile(downloadUrl, packageId);
|
|
|
|
if (downloadPath == null) {
|
|
// 404 - Package existiert nicht
|
|
updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.NOT_FOUND,
|
|
"Package not found (404)");
|
|
return DownloadResult.notFound(packageEntity);
|
|
}
|
|
|
|
// Berechne Hash
|
|
String fileHash = calculateSHA256(downloadPath);
|
|
|
|
// Prüfe auf Duplikat via Hash
|
|
Optional<TedDailyPackage> duplicateByHash = packageRepository.findAll().stream()
|
|
.filter(p -> fileHash.equals(p.getFileHash()))
|
|
.findFirst();
|
|
|
|
if (duplicateByHash.isPresent()) {
|
|
log.debug("Duplicate package detected via hash: {} = {}", packageId, duplicateByHash.get().getPackageIdentifier());
|
|
cleanupDownload(downloadPath);
|
|
updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.COMPLETED,
|
|
"Duplicate of " + duplicateByHash.get().getPackageIdentifier());
|
|
return DownloadResult.duplicate(packageEntity);
|
|
}
|
|
|
|
long downloadDuration = System.currentTimeMillis() - startTime;
|
|
|
|
// Update: DOWNLOADED
|
|
packageEntity = packageRepository.findById(packageEntity.getId()).orElseThrow();
|
|
packageEntity.setFileHash(fileHash);
|
|
packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.DOWNLOADED);
|
|
packageEntity.setDownloadedAt(OffsetDateTime.now());
|
|
packageEntity.setDownloadDurationMs(downloadDuration);
|
|
packageEntity = packageRepository.save(packageEntity);
|
|
|
|
// Extrahiere XML-Dateien
|
|
List<Path> xmlFiles = extractTarGz(downloadPath, packageId);
|
|
|
|
packageEntity.setXmlFileCount(xmlFiles.size());
|
|
packageEntity = packageRepository.save(packageEntity);
|
|
|
|
// Cleanup tar.gz if configured
|
|
if (properties.getDownload().isDeleteAfterExtraction()) {
|
|
cleanupDownload(downloadPath);
|
|
}
|
|
|
|
log.debug("Successfully downloaded package {}: {} XML files", packageId, xmlFiles.size());
|
|
|
|
return DownloadResult.success(packageEntity, xmlFiles);
|
|
|
|
} catch (Exception e) {
|
|
log.error("Failed to download package {}: {}", packageId, e.getMessage(), e);
|
|
updatePackageStatus(packageEntity.getId(), TedDailyPackage.DownloadStatus.FAILED,
|
|
e.getMessage());
|
|
return DownloadResult.failed(packageEntity, e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Baut die Download-URL.
|
|
*/
|
|
private String buildDownloadUrl(String packageId) {
|
|
return properties.getDownload().getBaseUrl() + packageId;
|
|
}
|
|
|
|
/**
|
|
* Lädt eine Datei herunter.
|
|
* Gibt null zurück bei 404.
|
|
*/
|
|
private Path downloadFile(String urlString, String packageId) throws IOException {
|
|
URL url = URI.create(urlString).toURL();
|
|
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
|
|
connection.setRequestMethod("GET");
|
|
connection.setConnectTimeout((int) properties.getDownload().getDownloadTimeout());
|
|
connection.setReadTimeout((int) properties.getDownload().getDownloadTimeout());
|
|
connection.setInstanceFollowRedirects(true);
|
|
|
|
int responseCode = connection.getResponseCode();
|
|
|
|
if (responseCode == 404) {
|
|
log.info("Package not found (404): {}", urlString);
|
|
return null;
|
|
}
|
|
|
|
if (responseCode != 200) {
|
|
throw new IOException("HTTP " + responseCode + " for URL: " + urlString);
|
|
}
|
|
|
|
// Erstelle Download-Verzeichnis
|
|
Path downloadDir = Paths.get(properties.getDownload().getDownloadDirectory());
|
|
Files.createDirectories(downloadDir);
|
|
|
|
// Download file
|
|
Path targetPath = downloadDir.resolve(packageId + ".tar.gz");
|
|
|
|
try (InputStream in = connection.getInputStream()) {
|
|
Files.copy(in, targetPath, StandardCopyOption.REPLACE_EXISTING);
|
|
}
|
|
|
|
log.debug("Downloaded {} bytes to {}", Files.size(targetPath), targetPath);
|
|
|
|
return targetPath;
|
|
}
|
|
|
|
/**
|
|
* Berechnet SHA-256 Hash einer Datei.
|
|
*/
|
|
private String calculateSHA256(Path file) throws Exception {
|
|
MessageDigest digest = MessageDigest.getInstance("SHA-256");
|
|
|
|
try (InputStream is = Files.newInputStream(file)) {
|
|
byte[] buffer = new byte[8192];
|
|
int read;
|
|
while ((read = is.read(buffer)) > 0) {
|
|
digest.update(buffer, 0, read);
|
|
}
|
|
}
|
|
|
|
byte[] hashBytes = digest.digest();
|
|
StringBuilder sb = new StringBuilder();
|
|
for (byte b : hashBytes) {
|
|
sb.append(String.format("%02x", b));
|
|
}
|
|
|
|
return sb.toString();
|
|
}
|
|
|
|
/**
|
|
* Extrahiert tar.gz und gibt Liste der XML-Dateien zurück.
|
|
*/
|
|
private List<Path> extractTarGz(Path tarGzFile, String packageId) throws IOException {
|
|
List<Path> xmlFiles = new ArrayList<>();
|
|
|
|
Path extractDir = Paths.get(properties.getDownload().getExtractDirectory())
|
|
.resolve(packageId);
|
|
Files.createDirectories(extractDir);
|
|
|
|
try (FileInputStream fis = new FileInputStream(tarGzFile.toFile());
|
|
GzipCompressorInputStream gzis = new GzipCompressorInputStream(fis);
|
|
TarArchiveInputStream tais = new TarArchiveInputStream(gzis)) {
|
|
|
|
TarArchiveEntry entry;
|
|
while ((entry = tais.getNextTarEntry()) != null) {
|
|
if (entry.isDirectory()) {
|
|
continue;
|
|
}
|
|
|
|
String name = entry.getName();
|
|
if (!name.toLowerCase().endsWith(".xml")) {
|
|
continue;
|
|
}
|
|
|
|
// Extrahiere XML-Datei
|
|
Path outputPath = extractDir.resolve(new File(name).getName());
|
|
|
|
try (OutputStream os = Files.newOutputStream(outputPath)) {
|
|
byte[] buffer = new byte[8192];
|
|
int read;
|
|
while ((read = tais.read(buffer)) > 0) {
|
|
os.write(buffer, 0, read);
|
|
}
|
|
}
|
|
|
|
xmlFiles.add(outputPath);
|
|
}
|
|
}
|
|
|
|
log.debug("Extracted {} XML files from {}", xmlFiles.size(), tarGzFile.getFileName());
|
|
|
|
return xmlFiles;
|
|
}
|
|
|
|
/**
|
|
* Aktualisiert den Package-Status.
|
|
*/
|
|
private void updatePackageStatus(java.util.UUID packageId, TedDailyPackage.DownloadStatus status, String errorMessage) {
|
|
packageRepository.findById(packageId).ifPresent(pkg -> {
|
|
pkg.setDownloadStatus(status);
|
|
if (errorMessage != null) {
|
|
pkg.setErrorMessage(errorMessage);
|
|
}
|
|
packageRepository.save(pkg);
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Löscht heruntergeladene Datei.
|
|
*/
|
|
private void cleanupDownload(Path file) {
|
|
try {
|
|
Files.deleteIfExists(file);
|
|
log.debug("Cleaned up download: {}", file);
|
|
} catch (IOException e) {
|
|
log.warn("Failed to delete file {}: {}", file, e.getMessage());
|
|
}
|
|
}
|
|
|
|
/**
 * Identifies a daily package by year and serial number.
 *
 * @param year         package year
 * @param serialNumber serial number within the year
 */
public record PackageInfo(int year, int serialNumber) {

    /**
     * Returns the identifier in {@code YYYYSSSSS} form: the year zero-padded to four digits
     * followed by the serial number zero-padded to five digits.
     */
    public String getIdentifier() {
        return "%04d%05d".formatted(year, serialNumber);
    }
}
|
|
|
|
/**
|
|
* Download-Ergebnis.
|
|
*/
|
|
public record DownloadResult(
|
|
TedDailyPackage packageEntity,
|
|
Status status,
|
|
List<Path> xmlFiles,
|
|
Exception error
|
|
) {
|
|
public enum Status {
|
|
SUCCESS,
|
|
ALREADY_EXISTS,
|
|
NOT_FOUND,
|
|
DUPLICATE,
|
|
FAILED
|
|
}
|
|
|
|
public static DownloadResult success(TedDailyPackage pkg, List<Path> files) {
|
|
return new DownloadResult(pkg, Status.SUCCESS, files, null);
|
|
}
|
|
|
|
public static DownloadResult alreadyExists(TedDailyPackage pkg) {
|
|
return new DownloadResult(pkg, Status.ALREADY_EXISTS, List.of(), null);
|
|
}
|
|
|
|
public static DownloadResult notFound(TedDailyPackage pkg) {
|
|
return new DownloadResult(pkg, Status.NOT_FOUND, List.of(), null);
|
|
}
|
|
|
|
public static DownloadResult duplicate(TedDailyPackage pkg) {
|
|
return new DownloadResult(pkg, Status.DUPLICATE, List.of(), null);
|
|
}
|
|
|
|
public static DownloadResult failed(TedDailyPackage pkg, Exception e) {
|
|
return new DownloadResult(pkg, Status.FAILED, List.of(), e);
|
|
}
|
|
|
|
public boolean isSuccess() {
|
|
return status == Status.SUCCESS;
|
|
}
|
|
}
|
|
}
|