ted legacy documents: audit and repair

master
trifonovt 2 weeks ago
parent 6ae39b4ea5
commit 00ad3aad38

@ -16,8 +16,8 @@ import org.springframework.scheduling.annotation.EnableAsync;
*/ */
@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"}) @SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"})
@EnableAsync @EnableAsync
@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity"}) @EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity"})
@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository"}) @EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository"})
public class DocumentIntelligencePlatformApplication { public class DocumentIntelligencePlatformApplication {
public static void main(String[] args) { public static void main(String[] args) {

@ -55,10 +55,10 @@ public class TedNoticeOrganization {
@Column(name = "company_id", length = 1000) @Column(name = "company_id", length = 1000)
private String companyId; private String companyId;
@Column(name = "country_code", length = 10) @Column(name = "country_code", columnDefinition = "TEXT")
private String countryCode; private String countryCode;
@Column(name = "city", length = 255) @Column(name = "city", columnDefinition = "TEXT")
private String city; private String city;
@Column(name = "postal_code", length = 255) @Column(name = "postal_code", length = 255)

@ -108,7 +108,7 @@ public class TedNoticeProjection {
@Column(name = "buyer_country_code", length = 10) @Column(name = "buyer_country_code", length = 10)
private String buyerCountryCode; private String buyerCountryCode;
@Column(name = "buyer_city", length = 255) @Column(name = "buyer_city", columnDefinition = "TEXT")
private String buyerCity; private String buyerCity;
@Column(name = "buyer_postal_code", length = 100) @Column(name = "buyer_postal_code", length = 100)
@ -129,7 +129,7 @@ public class TedNoticeProjection {
@Column(name = "project_description", columnDefinition = "TEXT") @Column(name = "project_description", columnDefinition = "TEXT")
private String projectDescription; private String projectDescription;
@Column(name = "internal_reference", length = 500) @Column(name = "internal_reference", columnDefinition = "TEXT")
private String internalReference; private String internalReference;
@Enumerated(EnumType.STRING) @Enumerated(EnumType.STRING)

@ -0,0 +1,47 @@
package at.procon.dip.migration.audit.config;
import jakarta.validation.constraints.Min;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
@Configuration
@ConfigurationProperties(prefix = "dip.migration.legacy-audit")
@Data
public class LegacyTedAuditProperties {
/**
* Enables the Wave 1 / Milestone A legacy TED audit subsystem.
*/
private boolean enabled = true;
/**
* Automatically runs the read-only audit on application startup.
*/
private boolean startupRunEnabled = false;
/**
* Maximum number of legacy TED documents to scan during startup.
* 0 means no limit.
*/
@Min(0)
private int startupRunLimit = 500;
/**
* Batch size for legacy TED document paging.
*/
@Min(1)
private int pageSize = 100;
/**
* Hard cap for persisted findings in a single run to avoid runaway audit volume.
*/
@Min(1)
private int maxFindingsPerRun = 10000;
/**
* Maximum number of duplicate/grouped samples recorded for global aggregate checks.
*/
@Min(1)
private int maxDuplicateSamples = 100;
}

@ -0,0 +1,87 @@
package at.procon.dip.migration.audit.entity;
import at.procon.dip.architecture.SchemaNames;
import jakarta.persistence.Column;
import jakarta.persistence.Entity;
import jakarta.persistence.EnumType;
import jakarta.persistence.Enumerated;
import jakarta.persistence.FetchType;
import jakarta.persistence.GeneratedValue;
import jakarta.persistence.GenerationType;
import jakarta.persistence.Id;
import jakarta.persistence.Index;
import jakarta.persistence.JoinColumn;
import jakarta.persistence.ManyToOne;
import jakarta.persistence.PrePersist;
import jakarta.persistence.Table;
import java.time.OffsetDateTime;
import java.util.UUID;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
/**
* JPA entity for a single finding produced by a legacy TED audit run.
* <p>
* Rows live in table {@code doc_legacy_audit_finding} of the {@code SchemaNames.DOC} schema and
* are indexed by run, finding type, severity, legacy procurement document id and document id.
* The Lombok annotations generate accessors, a no-args and an all-args constructor and a builder;
* the all-args constructor follows the field declaration order below.
*/
@Entity
@Table(schema = SchemaNames.DOC, name = "doc_legacy_audit_finding", indexes = {
@Index(name = "idx_doc_legacy_audit_find_run", columnList = "run_id"),
@Index(name = "idx_doc_legacy_audit_find_type", columnList = "finding_type"),
@Index(name = "idx_doc_legacy_audit_find_severity", columnList = "severity"),
@Index(name = "idx_doc_legacy_audit_find_legacy_doc", columnList = "legacy_procurement_document_id"),
@Index(name = "idx_doc_legacy_audit_find_document", columnList = "document_id")
})
@Getter
@Setter
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class LegacyTedAuditFinding {
// Primary key, generated as a UUID by the persistence provider.
@Id
@GeneratedValue(strategy = GenerationType.UUID)
private UUID id;
// Owning audit run; mandatory and loaded lazily.
@ManyToOne(fetch = FetchType.LAZY, optional = false)
@JoinColumn(name = "run_id", nullable = false)
private LegacyTedAuditRun run;
// Severity, stored by enum name (max 16 characters).
@Enumerated(EnumType.STRING)
@Column(name = "severity", nullable = false, length = 16)
private LegacyTedAuditSeverity severity;
// Kind of finding, stored by enum name (max 64 characters).
@Enumerated(EnumType.STRING)
@Column(name = "finding_type", nullable = false, length = 64)
private LegacyTedAuditFindingType findingType;
// Optional TED package identifier the finding refers to (max 20 characters).
@Column(name = "package_identifier", length = 20)
private String packageIdentifier;
// Optional id of the legacy procurement document the finding refers to.
@Column(name = "legacy_procurement_document_id")
private UUID legacyProcurementDocumentId;
// Optional id of a related document; stored as a plain UUID column, not as an entity association.
@Column(name = "document_id")
private UUID documentId;
// Optional id of a related TED notice projection; also a plain UUID column.
@Column(name = "ted_notice_projection_id")
private UUID tedNoticeProjectionId;
// Optional free-form key identifying the audited subject (max 255 characters).
@Column(name = "reference_key", length = 255)
private String referenceKey;
// Mandatory human-readable description of the finding.
@Column(name = "message", nullable = false, columnDefinition = "TEXT")
private String message;
// Optional additional detail text.
@Column(name = "details_text", columnDefinition = "TEXT")
private String detailsText;
// Creation timestamp; the builder defaults it to "now" and it is never updated afterwards.
@Builder.Default
@Column(name = "created_at", nullable = false, updatable = false)
private OffsetDateTime createdAt = OffsetDateTime.now();
/**
* Lifecycle hook run before the first insert: fills in {@code createdAt} if it is still null.
*/
@PrePersist
protected void onCreate() {
if (createdAt == null) {
createdAt = OffsetDateTime.now();
}
}
}

@ -0,0 +1,28 @@
package at.procon.dip.migration.audit.entity;
/**
* Kinds of findings a legacy TED audit run can record.
* <p>
* Findings persist this value by enum name ({@code EnumType.STRING}), so renaming a constant
* changes what is stored for new rows.
*/
public enum LegacyTedAuditFindingType {
// --- Package-level findings (recorded by the package audit) ---
PACKAGE_SEQUENCE_GAP,
PACKAGE_INCOMPLETE,
PACKAGE_COMPLETED_WITHOUT_PROCESSED_AT,
PACKAGE_COMPLETED_COUNT_MISMATCH,
PACKAGE_MISSING_XML_FILE_COUNT,
PACKAGE_MISSING_FILE_HASH,
PACKAGE_FAILED_WITHOUT_ERROR_MESSAGE,
// --- Global duplicate checks ---
LEGACY_PUBLICATION_ID_DUPLICATE,
// NOTE(review): no check visible in this change records DOC_DEDUP_HASH_DUPLICATE — confirm where it is used.
DOC_DEDUP_HASH_DUPLICATE,
// --- Per legacy document findings ---
LEGACY_DOCUMENT_MISSING_HASH,
LEGACY_DOCUMENT_MISSING_XML,
LEGACY_DOCUMENT_MISSING_TEXT,
LEGACY_DOCUMENT_MISSING_PUBLICATION_ID,
// --- DOC_* and TED_PROJECTION_* findings ---
// NOTE(review): not referenced by the audit service visible in this change; presumably
// reserved for checks against the new document model — TODO confirm.
DOC_DOCUMENT_MISSING,
DOC_DOCUMENT_DUPLICATE,
DOC_SOURCE_MISSING,
DOC_ORIGINAL_CONTENT_MISSING,
DOC_ORIGINAL_CONTENT_DUPLICATE,
DOC_PRIMARY_REPRESENTATION_MISSING,
DOC_PRIMARY_REPRESENTATION_DUPLICATE,
TED_PROJECTION_MISSING,
TED_PROJECTION_MISSING_LEGACY_LINK,
TED_PROJECTION_DOCUMENT_MISMATCH,
// Marker recorded once when the per-run finding limit is reached and further findings are suppressed.
FINDINGS_TRUNCATED
}

@ -0,0 +1,110 @@
package at.procon.dip.migration.audit.entity;
import at.procon.dip.architecture.SchemaNames;
import jakarta.persistence.Column;
import jakarta.persistence.Entity;
import jakarta.persistence.EnumType;
import jakarta.persistence.Enumerated;
import jakarta.persistence.GeneratedValue;
import jakarta.persistence.GenerationType;
import jakarta.persistence.Id;
import jakarta.persistence.Index;
import jakarta.persistence.PrePersist;
import jakarta.persistence.PreUpdate;
import jakarta.persistence.Table;
import java.time.OffsetDateTime;
import java.util.UUID;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
/**
* JPA entity describing one execution of the legacy TED audit.
* <p>
* Rows live in table {@code doc_legacy_audit_run} of the {@code SchemaNames.DOC} schema and are
* indexed by status and start time. The Lombok annotations generate accessors, a no-args and an
* all-args constructor and a builder; the all-args constructor follows the field declaration
* order below.
*/
@Entity
@Table(schema = SchemaNames.DOC, name = "doc_legacy_audit_run", indexes = {
@Index(name = "idx_doc_legacy_audit_run_status", columnList = "status"),
@Index(name = "idx_doc_legacy_audit_run_started", columnList = "started_at")
})
@Getter
@Setter
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class LegacyTedAuditRun {
// Primary key, generated as a UUID by the persistence provider.
@Id
@GeneratedValue(strategy = GenerationType.UUID)
private UUID id;
// Lifecycle state of the run, stored by enum name (max 32 characters).
@Enumerated(EnumType.STRING)
@Column(name = "status", nullable = false, length = 32)
private LegacyTedAuditRunStatus status;
// Document limit the run was started with; nullable (the audit service stores null for "no limit").
@Column(name = "requested_limit")
private Integer requestedLimit;
// Page size the run was started with.
@Column(name = "page_size", nullable = false)
private Integer pageSize;
// Counters below default to 0 when the entity is created through the builder.
@Column(name = "scanned_packages", nullable = false)
@Builder.Default
private Integer scannedPackages = 0;
@Column(name = "scanned_legacy_documents", nullable = false)
@Builder.Default
private Integer scannedLegacyDocuments = 0;
// Total number of findings, followed by the per-severity breakdown.
@Column(name = "finding_count", nullable = false)
@Builder.Default
private Integer findingCount = 0;
@Column(name = "info_count", nullable = false)
@Builder.Default
private Integer infoCount = 0;
@Column(name = "warning_count", nullable = false)
@Builder.Default
private Integer warningCount = 0;
@Column(name = "error_count", nullable = false)
@Builder.Default
private Integer errorCount = 0;
// Start of the run; filled in by onCreate() when not set explicitly.
@Column(name = "started_at", nullable = false)
private OffsetDateTime startedAt;
// End of the run; nullable, so it can stay empty until the run finishes.
@Column(name = "completed_at")
private OffsetDateTime completedAt;
// Optional free-text summary of the run.
@Column(name = "summary_text", columnDefinition = "TEXT")
private String summaryText;
// Optional failure description for runs that did not complete.
@Column(name = "failure_message", columnDefinition = "TEXT")
private String failureMessage;
// Row creation timestamp; never updated after insert.
@Builder.Default
@Column(name = "created_at", nullable = false, updatable = false)
private OffsetDateTime createdAt = OffsetDateTime.now();
// Timestamp of the last modification; refreshed by onUpdate().
@Builder.Default
@Column(name = "updated_at", nullable = false)
private OffsetDateTime updatedAt = OffsetDateTime.now();
/**
* Lifecycle hook run before the first insert: fills in any of {@code startedAt},
* {@code createdAt} and {@code updatedAt} that are still null.
*/
@PrePersist
protected void onCreate() {
if (startedAt == null) {
startedAt = OffsetDateTime.now();
}
if (createdAt == null) {
createdAt = OffsetDateTime.now();
}
if (updatedAt == null) {
updatedAt = OffsetDateTime.now();
}
}
/**
* Lifecycle hook run before every update: stamps {@code updatedAt} with the current time.
*/
@PreUpdate
protected void onUpdate() {
updatedAt = OffsetDateTime.now();
}
}

@ -0,0 +1,7 @@
package at.procon.dip.migration.audit.entity;
/**
* Lifecycle states of a legacy TED audit run; persisted by enum name.
*/
public enum LegacyTedAuditRunStatus {
// Set when the run row is first created by the audit service.
RUNNING,
// Set when all audit steps finished without throwing.
COMPLETED,
// Set when an audit step threw a runtime exception.
FAILED
}

@ -0,0 +1,7 @@
package at.procon.dip.migration.audit.entity;
/**
* Severity of a legacy TED audit finding; persisted by enum name and counted per run.
*/
public enum LegacyTedAuditSeverity {
// Informational only; used for the "findings truncated" marker.
INFO,
// Something looks inconsistent.
WARNING,
// A definite data problem.
ERROR
}

@ -0,0 +1,8 @@
package at.procon.dip.migration.audit.repository;
import at.procon.dip.migration.audit.entity.LegacyTedAuditFinding;
import java.util.UUID;
import org.springframework.data.jpa.repository.JpaRepository;
/**
* Spring Data JPA repository for {@link LegacyTedAuditFinding} rows, keyed by UUID.
* Declares no custom queries; only the inherited {@link JpaRepository} operations are available.
*/
public interface LegacyTedAuditFindingRepository extends JpaRepository<LegacyTedAuditFinding, UUID> {
}

@ -0,0 +1,8 @@
package at.procon.dip.migration.audit.repository;
import at.procon.dip.migration.audit.entity.LegacyTedAuditRun;
import java.util.UUID;
import org.springframework.data.jpa.repository.JpaRepository;
/**
* Spring Data JPA repository for {@link LegacyTedAuditRun} rows, keyed by UUID.
* Declares no custom queries; only the inherited {@link JpaRepository} operations are available.
*/
public interface LegacyTedAuditRunRepository extends JpaRepository<LegacyTedAuditRun, UUID> {
}

@ -0,0 +1,610 @@
package at.procon.dip.migration.audit.service;
import at.procon.dip.migration.audit.config.LegacyTedAuditProperties;
import at.procon.dip.migration.audit.entity.LegacyTedAuditFinding;
import at.procon.dip.migration.audit.entity.LegacyTedAuditFindingType;
import at.procon.dip.migration.audit.entity.LegacyTedAuditRun;
import at.procon.dip.migration.audit.entity.LegacyTedAuditRunStatus;
import at.procon.dip.migration.audit.entity.LegacyTedAuditSeverity;
import at.procon.dip.migration.audit.repository.LegacyTedAuditFindingRepository;
import at.procon.dip.migration.audit.repository.LegacyTedAuditRunRepository;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.model.entity.ProcurementDocument;
import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.ProcurementDocumentRepository;
import at.procon.ted.repository.TedDailyPackageRepository;
import java.time.OffsetDateTime;
import java.time.Year;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.UUID;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Sort;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
@Service
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@RequiredArgsConstructor
@Slf4j
public class LegacyTedAuditService {
private final LegacyTedAuditProperties properties;
private final TedDailyPackageRepository tedDailyPackageRepository;
private final ProcurementDocumentRepository procurementDocumentRepository;
private final LegacyTedAuditRunRepository runRepository;
private final LegacyTedAuditFindingRepository findingRepository;
private final JdbcTemplate jdbcTemplate;
/**
 * Runs the audit using the configured startup run limit as the requested limit.
 *
 * @return the persisted audit run
 * @throws IllegalStateException if the audit is disabled by configuration
 */
public LegacyTedAuditRun executeAudit() {
    int configuredLimit = properties.getStartupRunLimit();
    return executeAudit(configuredLimit);
}
/**
 * Executes one audit run and persists its outcome.
 *
 * <p>A run row is saved in RUNNING state first, then the package audit and the global duplicate
 * check are executed. On success the run is saved as COMPLETED with the collected counters.
 * If any step throws a runtime exception the run is saved as FAILED with the counters gathered
 * so far, and the original exception is rethrown.
 *
 * @param requestedLimit maximum number of legacy documents to scan; values of 0 or less mean
 *                       "no limit" and are stored as {@code null} on the run
 * @return the persisted run in COMPLETED state
 * @throws IllegalStateException if the audit is disabled by configuration
 * @throws RuntimeException      whatever an audit step threw, after the run was marked FAILED
 */
public LegacyTedAuditRun executeAudit(int requestedLimit) {
    if (!properties.isEnabled()) {
        throw new IllegalStateException("Legacy TED audit is disabled by configuration");
    }
    Integer effectiveLimit = requestedLimit > 0 ? requestedLimit : null;
    int pageSize = properties.getPageSize();
    AuditAccumulator accumulator = new AuditAccumulator();
    LegacyTedAuditRun run = LegacyTedAuditRun.builder()
            .status(LegacyTedAuditRunStatus.RUNNING)
            .requestedLimit(effectiveLimit)
            .pageSize(pageSize)
            .startedAt(OffsetDateTime.now())
            .build();
    run = runRepository.save(run);
    try {
        int scannedPackages = auditPackages(run, accumulator);
        auditGlobalDuplicates(run, accumulator);
        // The per-document scan is currently switched off, so no legacy documents are counted.
        // To re-enable it: auditLegacyDocuments(run, accumulator, effectiveLimit, pageSize)
        int scannedLegacyDocuments = 0;
        applyRunOutcome(run, LegacyTedAuditRunStatus.COMPLETED,
                scannedPackages, scannedLegacyDocuments, accumulator, null);
        run = runRepository.save(run);
        log.info("Wave 1 / Milestone A legacy-only audit completed: runId={}, packages={}, documents={}, findings={}, warnings={}, errors={}",
                run.getId(), scannedPackages, scannedLegacyDocuments, accumulator.totalFindings(),
                accumulator.warningCount(), accumulator.errorCount());
        return run;
    } catch (RuntimeException ex) {
        // Exceptions without a message would otherwise leave a FAILED run with no explanation.
        String failureMessage = ex.getMessage() != null ? ex.getMessage() : ex.toString();
        applyRunOutcome(run, LegacyTedAuditRunStatus.FAILED,
                accumulator.scannedPackages(), accumulator.scannedLegacyDocuments(), accumulator, failureMessage);
        try {
            runRepository.save(run);
        } catch (RuntimeException saveFailure) {
            // Never let a bookkeeping failure hide the exception that actually broke the audit.
            ex.addSuppressed(saveFailure);
        }
        log.error("Wave 1 / Milestone A legacy-only audit failed: runId={}", run.getId(), ex);
        throw ex;
    }
}

/**
 * Copies the final status, completion time, counters, summary and failure message onto the run.
 */
private void applyRunOutcome(LegacyTedAuditRun run,
                             LegacyTedAuditRunStatus status,
                             int scannedPackages,
                             int scannedLegacyDocuments,
                             AuditAccumulator accumulator,
                             String failureMessage) {
    run.setStatus(status);
    run.setCompletedAt(OffsetDateTime.now());
    run.setScannedPackages(scannedPackages);
    run.setScannedLegacyDocuments(scannedLegacyDocuments);
    run.setFindingCount(accumulator.totalFindings());
    run.setInfoCount(accumulator.infoCount());
    run.setWarningCount(accumulator.warningCount());
    run.setErrorCount(accumulator.errorCount());
    run.setSummaryText(buildSummary(scannedPackages, scannedLegacyDocuments, accumulator));
    run.setFailureMessage(failureMessage);
}
/**
 * Audits every TED daily package row.
 *
 * <p>Packages are grouped by year. For each year from the earliest package year up to the
 * current calendar year, either a "no rows for this year" gap finding is recorded or the
 * year's serial sequence and each of its packages are audited. Packages dated after the
 * current year are audited as well (without year-gap findings for the years in between), so
 * that every row counted in the return value has actually been inspected.
 *
 * @param run         the run findings are attached to
 * @param accumulator running counters for this run
 * @return the number of package rows loaded
 */
private int auditPackages(LegacyTedAuditRun run, AuditAccumulator accumulator) {
    List<TedDailyPackage> packages = tedDailyPackageRepository.findAll(Sort.by(Sort.Direction.ASC, "year", "serialNumber"));
    if (packages.isEmpty()) {
        return 0;
    }
    TreeMap<Integer, List<TedDailyPackage>> packagesByYear = new TreeMap<>();
    for (TedDailyPackage dailyPackage : packages) {
        packagesByYear.computeIfAbsent(dailyPackage.getYear(), ignored -> new ArrayList<>()).add(dailyPackage);
    }
    int firstYear = packagesByYear.firstKey();
    int currentYear = Year.now().getValue();
    for (int year = firstYear; year <= currentYear; year++) {
        List<TedDailyPackage> yearPackages = packagesByYear.get(year);
        if (yearPackages == null || yearPackages.isEmpty()) {
            recordFinding(run, accumulator,
                    LegacyTedAuditSeverity.WARNING,
                    LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP,
                    null,
                    null,
                    null,
                    null,
                    "year:" + year,
                    "No TED package rows exist for this year inside the audited interval",
                    "year=" + year + ", intervalStartYear=" + firstYear + ", intervalEndYear=" + currentYear);
            continue;
        }
        auditPackagesOfYear(run, accumulator, year, yearPackages);
    }
    // Rows dated after the current year fall outside the gap interval but must still be inspected.
    for (Map.Entry<Integer, List<TedDailyPackage>> laterYear : packagesByYear.tailMap(currentYear, false).entrySet()) {
        auditPackagesOfYear(run, accumulator, laterYear.getKey(), laterYear.getValue());
    }
    return packages.size();
}

/**
 * Audits the serial sequence of one year and then each package of that year.
 */
private void auditPackagesOfYear(LegacyTedAuditRun run,
                                 AuditAccumulator accumulator,
                                 int year,
                                 List<TedDailyPackage> yearPackages) {
    auditYearPackageSequence(run, accumulator, year, yearPackages);
    for (TedDailyPackage dailyPackage : yearPackages) {
        accumulator.incrementScannedPackages();
        auditSinglePackage(run, accumulator, dailyPackage);
    }
}
/**
 * Checks the serial numbers of one year's packages for holes.
 *
 * <p>The list is sorted in place by serial number (a null serial counts as 0). A finding is
 * recorded when the lowest serial is above 1, and for every jump of more than one between
 * neighbouring serials.
 *
 * @param run          the run findings are attached to
 * @param accumulator  running counters for this run
 * @param year         the package year being checked
 * @param yearPackages the non-empty list of packages of that year; reordered by this method
 */
private void auditYearPackageSequence(LegacyTedAuditRun run,
                                      AuditAccumulator accumulator,
                                      int year,
                                      List<TedDailyPackage> yearPackages) {
    yearPackages.sort((a, b) -> Integer.compare(safeInt(a.getSerialNumber()), safeInt(b.getSerialNumber())));

    int previousSerial = safeInt(yearPackages.getFirst().getSerialNumber());
    if (previousSerial > 1) {
        recordMissingPackageRange(run, accumulator, year, 1, previousSerial - 1,
                "TED package year starts after serial 1");
    }

    // Walk the remaining packages, remembering the serial seen just before.
    for (TedDailyPackage following : yearPackages.subList(1, yearPackages.size())) {
        int serial = safeInt(following.getSerialNumber());
        if (serial > previousSerial + 1) {
            recordMissingPackageRange(run, accumulator, year, previousSerial + 1, serial - 1,
                    "TED package sequence gap detected");
        }
        previousSerial = serial;
    }
}
/**
 * Records a WARNING sequence-gap finding for a range of missing package serials.
 *
 * <p>For a single missing serial the package identifier is used both as the finding's package
 * identifier and as its reference key. For a longer range the package identifier stays empty
 * and the reference key is {@code <first>-<last>}.
 *
 * @param run         the run the finding is attached to
 * @param accumulator running counters for this run
 * @param year        the package year
 * @param startSerial first missing serial (inclusive)
 * @param endSerial   last missing serial (inclusive)
 * @param message     finding message
 */
private void recordMissingPackageRange(LegacyTedAuditRun run,
                                       AuditAccumulator accumulator,
                                       int year,
                                       int startSerial,
                                       int endSerial,
                                       String message) {
    String firstMissingId = formatPackageIdentifier(year, startSerial);
    String lastMissingId = formatPackageIdentifier(year, endSerial);

    String singlePackageId;
    String rangeKey;
    if (startSerial == endSerial) {
        singlePackageId = firstMissingId;
        rangeKey = firstMissingId;
    } else {
        singlePackageId = null;
        rangeKey = firstMissingId + "-" + lastMissingId;
    }

    String details = "year=" + year + ", missingStartSerial=" + startSerial + ", missingEndSerial=" + endSerial;
    recordFinding(run, accumulator,
            LegacyTedAuditSeverity.WARNING,
            LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP,
            singlePackageId,
            null,
            null,
            null,
            rangeKey,
            message,
            details);
}
/**
 * Runs all per-package consistency checks on one TED daily package and records a finding for
 * each check that fails. Checks, in order: COMPLETED without processedAt, COMPLETED without
 * xmlFileCount, missing file hash (for DOWNLOADED / PROCESSING / COMPLETED), FAILED without
 * error message, processed+failed count versus xmlFileCount, and overall completeness.
 *
 * @param run          the run findings are attached to
 * @param accumulator  running counters for this run
 * @param dailyPackage the package to inspect
 */
private void auditSinglePackage(LegacyTedAuditRun run,
                                AuditAccumulator accumulator,
                                TedDailyPackage dailyPackage) {
    String packageId = dailyPackage.getPackageIdentifier();
    int processed = safeInt(dailyPackage.getProcessedCount());
    int failed = safeInt(dailyPackage.getFailedCount());
    int accounted = processed + failed;
    TedDailyPackage.DownloadStatus status = dailyPackage.getDownloadStatus();
    Integer xmlFileCount = dailyPackage.getXmlFileCount();
    boolean completed = status == TedDailyPackage.DownloadStatus.COMPLETED;

    if (completed && dailyPackage.getProcessedAt() == null) {
        recordPackageFinding(run, accumulator,
                LegacyTedAuditSeverity.WARNING,
                LegacyTedAuditFindingType.PACKAGE_COMPLETED_WITHOUT_PROCESSED_AT,
                packageId,
                "TED package is marked COMPLETED but processedAt is null",
                null);
    }

    if (completed && xmlFileCount == null) {
        recordPackageFinding(run, accumulator,
                LegacyTedAuditSeverity.WARNING,
                LegacyTedAuditFindingType.PACKAGE_MISSING_XML_FILE_COUNT,
                packageId,
                "TED package is marked COMPLETED but xmlFileCount is null",
                null);
    }

    boolean hashChecked = status == TedDailyPackage.DownloadStatus.DOWNLOADED
            || status == TedDailyPackage.DownloadStatus.PROCESSING
            || completed;
    if (hashChecked && !StringUtils.hasText(dailyPackage.getFileHash())) {
        recordPackageFinding(run, accumulator,
                LegacyTedAuditSeverity.WARNING,
                LegacyTedAuditFindingType.PACKAGE_MISSING_FILE_HASH,
                packageId,
                "TED package has no file hash recorded",
                "downloadStatus=" + status);
    }

    if (status == TedDailyPackage.DownloadStatus.FAILED
            && !StringUtils.hasText(dailyPackage.getErrorMessage())) {
        recordPackageFinding(run, accumulator,
                LegacyTedAuditSeverity.WARNING,
                LegacyTedAuditFindingType.PACKAGE_FAILED_WITHOUT_ERROR_MESSAGE,
                packageId,
                "TED package is marked FAILED but has no error message",
                null);
    }

    if (xmlFileCount != null) {
        if (accounted > xmlFileCount) {
            // More documents accounted for than the package contains: always an error.
            recordPackageFinding(run, accumulator,
                    LegacyTedAuditSeverity.ERROR,
                    LegacyTedAuditFindingType.PACKAGE_COMPLETED_COUNT_MISMATCH,
                    packageId,
                    "TED package accounting exceeds xmlFileCount",
                    "xmlFileCount=" + xmlFileCount
                            + ", processedCount=" + processed
                            + ", failedCount=" + failed);
        } else if (completed && accounted < xmlFileCount) {
            // Fewer accounted for only matters once the package claims to be complete.
            recordPackageFinding(run, accumulator,
                    LegacyTedAuditSeverity.WARNING,
                    LegacyTedAuditFindingType.PACKAGE_COMPLETED_COUNT_MISMATCH,
                    packageId,
                    "TED package accounting is below xmlFileCount",
                    "xmlFileCount=" + xmlFileCount
                            + ", processedCount=" + processed
                            + ", failedCount=" + failed);
        }
    }

    if (isPackageIncompleteForReimport(dailyPackage, processed, failed, accounted)) {
        LegacyTedAuditSeverity incompleteSeverity = status == TedDailyPackage.DownloadStatus.FAILED
                ? LegacyTedAuditSeverity.ERROR
                : LegacyTedAuditSeverity.WARNING;
        recordPackageFinding(run, accumulator,
                incompleteSeverity,
                LegacyTedAuditFindingType.PACKAGE_INCOMPLETE,
                packageId,
                "TED package is not fully imported and should be considered for re-import",
                buildIncompletePackageDetails(dailyPackage, processed, failed, accounted));
    }
}

/**
 * Records a finding about one package, using its identifier as both package identifier and
 * reference key and leaving all document references empty.
 */
private void recordPackageFinding(LegacyTedAuditRun run,
                                  AuditAccumulator accumulator,
                                  LegacyTedAuditSeverity severity,
                                  LegacyTedAuditFindingType findingType,
                                  String packageId,
                                  String message,
                                  String detailsText) {
    recordFinding(run, accumulator,
            severity,
            findingType,
            packageId,
            null,
            null,
            null,
            packageId,
            message,
            detailsText);
}
/**
 * Decides whether a package should be reported as not fully imported.
 *
 * <p>NOT_FOUND packages are never incomplete. Any other status except COMPLETED (including a
 * missing status) is incomplete. A COMPLETED package is incomplete when its xmlFileCount is
 * unknown, when any document failed, or when the processed / accounted counts do not match
 * xmlFileCount.
 *
 * @param dailyPackage       the package to inspect
 * @param processedCount     number of processed documents (null-safe value)
 * @param failedCount        number of failed documents (null-safe value)
 * @param accountedDocuments processed plus failed documents
 * @return {@code true} if the package should be considered for re-import
 */
private boolean isPackageIncompleteForReimport(TedDailyPackage dailyPackage,
                                               int processedCount,
                                               int failedCount,
                                               int accountedDocuments) {
    TedDailyPackage.DownloadStatus downloadStatus = dailyPackage.getDownloadStatus();
    if (downloadStatus == TedDailyPackage.DownloadStatus.NOT_FOUND) {
        return false;
    }
    // Covers a null status as well as PENDING, DOWNLOADING, DOWNLOADED, PROCESSING and FAILED.
    if (downloadStatus != TedDailyPackage.DownloadStatus.COMPLETED) {
        return true;
    }

    Integer declaredXmlFiles = dailyPackage.getXmlFileCount();
    if (declaredXmlFiles == null || failedCount > 0) {
        return true;
    }
    int expected = declaredXmlFiles;
    return processedCount < expected || accountedDocuments != expected;
}
/**
 * Builds the comma-separated {@code key=value} detail text attached to an "incomplete package"
 * finding: status, xmlFileCount, processedCount, failedCount and accountedDocuments.
 *
 * @param dailyPackage       the package the finding is about
 * @param processedCount     number of processed documents
 * @param failedCount        number of failed documents
 * @param accountedDocuments processed plus failed documents
 * @return the detail text
 */
private String buildIncompletePackageDetails(TedDailyPackage dailyPackage,
                                             int processedCount,
                                             int failedCount,
                                             int accountedDocuments) {
    return String.join(", ",
            "status=" + dailyPackage.getDownloadStatus(),
            "xmlFileCount=" + dailyPackage.getXmlFileCount(),
            "processedCount=" + processedCount,
            "failedCount=" + failedCount,
            "accountedDocuments=" + accountedDocuments);
}
/**
 * Looks for publication ids that occur more than once in {@code ted.procurement_document} and
 * records an ERROR finding for each, most frequent first, up to the configured sample limit.
 *
 * @param run         the run findings are attached to
 * @param accumulator running counters for this run
 */
private void auditGlobalDuplicates(LegacyTedAuditRun run, AuditAccumulator accumulator) {
    int sampleLimit = properties.getMaxDuplicateSamples();
    String duplicatePublicationSql = """
            SELECT publication_id, COUNT(*) AS duplicate_count
            FROM ted.procurement_document
            WHERE publication_id IS NOT NULL AND publication_id <> ''
            GROUP BY publication_id
            HAVING COUNT(*) > 1
            ORDER BY duplicate_count DESC, publication_id ASC
            LIMIT ?
            """;
    jdbcTemplate.query(
            duplicatePublicationSql,
            statement -> statement.setInt(1, sampleLimit),
            (row, index) -> {
                recordDuplicatePublicationId(run, accumulator,
                        row.getString("publication_id"),
                        row.getLong("duplicate_count"));
                // The mapped value is not used; findings are recorded as a side effect per row.
                return null;
            });
}

/**
 * Records one ERROR finding for a publication id that appears multiple times.
 */
private void recordDuplicatePublicationId(LegacyTedAuditRun run,
                                          AuditAccumulator accumulator,
                                          String publicationId,
                                          long duplicateCount) {
    recordFinding(run, accumulator,
            LegacyTedAuditSeverity.ERROR,
            LegacyTedAuditFindingType.LEGACY_PUBLICATION_ID_DUPLICATE,
            null,
            null,
            null,
            null,
            publicationId,
            "Legacy TED publicationId appears multiple times",
            "publicationId=" + publicationId + ", duplicateCount=" + duplicateCount);
}
/**
 * Pages through the legacy procurement documents (ordered by createdAt, then id) and audits
 * each one until the limit is reached or no further page exists.
 *
 * @param run            the run findings are attached to
 * @param accumulator    running counters for this run
 * @param requestedLimit maximum number of documents to audit, or {@code null} for no limit
 * @param pageSize       number of documents fetched per page
 * @return the number of documents audited
 */
private int auditLegacyDocuments(LegacyTedAuditRun run,
                                 AuditAccumulator accumulator,
                                 Integer requestedLimit,
                                 int pageSize) {
    Sort scanOrder = Sort.by(Sort.Direction.ASC, "createdAt", "id");
    int audited = 0;
    for (int pageIndex = 0; requestedLimit == null || audited < requestedLimit; pageIndex++) {
        Page<ProcurementDocument> currentPage =
                procurementDocumentRepository.findAll(PageRequest.of(pageIndex, pageSize, scanOrder));
        if (currentPage.isEmpty()) {
            break;
        }
        for (ProcurementDocument document : currentPage.getContent()) {
            auditSingleLegacyDocument(run, accumulator, document);
            accumulator.incrementScannedLegacyDocuments();
            audited++;
            boolean limitReached = requestedLimit != null && audited >= requestedLimit;
            if (limitReached) {
                return audited;
            }
        }
        if (!currentPage.hasNext()) {
            break;
        }
    }
    return audited;
}
/**
 * Checks one legacy TED document for missing essentials and records a finding per problem.
 *
 * <p>A missing document hash is reported as an ERROR and ends the check for this document.
 * Otherwise a missing XML payload (ERROR), missing text content (WARNING) and missing
 * publication id (WARNING) are each reported, with the document hash in the details.
 *
 * @param run            the run findings are attached to
 * @param accumulator    running counters for this run
 * @param legacyDocument the document to inspect
 */
private void auditSingleLegacyDocument(LegacyTedAuditRun run,
                                       AuditAccumulator accumulator,
                                       ProcurementDocument legacyDocument) {
    UUID documentId = legacyDocument.getId();
    String key = buildReferenceKey(legacyDocument);
    String hash = legacyDocument.getDocumentHash();

    if (!StringUtils.hasText(hash)) {
        recordLegacyDocumentFinding(run, accumulator,
                LegacyTedAuditSeverity.ERROR,
                LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_HASH,
                documentId,
                key,
                "Legacy TED document has no documentHash",
                null);
        return;
    }

    String hashDetails = "documentHash=" + hash;
    if (!StringUtils.hasText(legacyDocument.getXmlDocument())) {
        recordLegacyDocumentFinding(run, accumulator,
                LegacyTedAuditSeverity.ERROR,
                LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_XML,
                documentId,
                key,
                "Legacy TED document has no xmlDocument payload",
                hashDetails);
    }
    if (!StringUtils.hasText(legacyDocument.getTextContent())) {
        recordLegacyDocumentFinding(run, accumulator,
                LegacyTedAuditSeverity.WARNING,
                LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_TEXT,
                documentId,
                key,
                "Legacy TED document has no normalized textContent",
                hashDetails);
    }
    if (!StringUtils.hasText(legacyDocument.getPublicationId())) {
        recordLegacyDocumentFinding(run, accumulator,
                LegacyTedAuditSeverity.WARNING,
                LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_PUBLICATION_ID,
                documentId,
                key,
                "Legacy TED document has no publicationId",
                hashDetails);
    }
}

/**
 * Records a finding about one legacy document, leaving the package identifier and the other
 * document references empty.
 */
private void recordLegacyDocumentFinding(LegacyTedAuditRun run,
                                         AuditAccumulator accumulator,
                                         LegacyTedAuditSeverity severity,
                                         LegacyTedAuditFindingType findingType,
                                         UUID legacyDocumentId,
                                         String referenceKey,
                                         String message,
                                         String detailsText) {
    recordFinding(run, accumulator,
            severity,
            findingType,
            null,
            legacyDocumentId,
            null,
            null,
            referenceKey,
            message,
            detailsText);
}
/**
 * Persists one finding and updates the run counters, honouring the per-run finding limit.
 *
 * <p>While the number of findings is below the configured maximum the finding is saved as
 * given. Once the maximum is reached the run is marked truncated, a single INFO
 * "findings truncated" marker is saved the first time this happens, and every further finding
 * is dropped.
 *
 * @param run                         the run the finding belongs to
 * @param accumulator                 running counters for this run
 * @param severity                    finding severity
 * @param findingType                 finding type
 * @param packageIdentifier           related package identifier, may be {@code null}
 * @param legacyProcurementDocumentId related legacy document id, may be {@code null}
 * @param genericDocumentId           related document id, may be {@code null}
 * @param tedProjectionId             related TED notice projection id, may be {@code null}
 * @param referenceKey                key identifying the audited subject, may be {@code null}
 * @param message                     finding message
 * @param detailsText                 additional details, may be {@code null}
 */
private void recordFinding(LegacyTedAuditRun run,
                           AuditAccumulator accumulator,
                           LegacyTedAuditSeverity severity,
                           LegacyTedAuditFindingType findingType,
                           String packageIdentifier,
                           UUID legacyProcurementDocumentId,
                           UUID genericDocumentId,
                           UUID tedProjectionId,
                           String referenceKey,
                           String message,
                           String detailsText) {
    int findingCap = properties.getMaxFindingsPerRun();

    if (accumulator.totalFindings() < findingCap) {
        findingRepository.save(LegacyTedAuditFinding.builder()
                .run(run)
                .severity(severity)
                .findingType(findingType)
                .packageIdentifier(packageIdentifier)
                .legacyProcurementDocumentId(legacyProcurementDocumentId)
                .documentId(genericDocumentId)
                .tedNoticeProjectionId(tedProjectionId)
                .referenceKey(referenceKey)
                .message(message)
                .detailsText(detailsText)
                .build());
        accumulator.recordFinding(severity, false);
        return;
    }

    // Cap reached: remember that, and emit the marker finding only once per run.
    accumulator.markTruncated();
    if (accumulator.truncationRecorded()) {
        return;
    }
    String markerKey = referenceKey == null ? "max-findings-per-run" : referenceKey;
    findingRepository.save(LegacyTedAuditFinding.builder()
            .run(run)
            .severity(LegacyTedAuditSeverity.INFO)
            .findingType(LegacyTedAuditFindingType.FINDINGS_TRUNCATED)
            .referenceKey(markerKey)
            .message("Legacy TED audit finding limit reached; additional findings were suppressed")
            .detailsText("maxFindingsPerRun=" + findingCap)
            .build());
    accumulator.recordFinding(LegacyTedAuditSeverity.INFO, true);
}
/**
 * Picks the reference key for a legacy document: the first of publication id, notice id and
 * source filename that contains text, falling back to the document id rendered as a string.
 *
 * @param legacyDocument the document to derive the key from
 * @return the reference key
 */
private String buildReferenceKey(ProcurementDocument legacyDocument) {
    String publicationId = legacyDocument.getPublicationId();
    if (StringUtils.hasText(publicationId)) {
        return publicationId;
    }
    String noticeId = legacyDocument.getNoticeId();
    if (StringUtils.hasText(noticeId)) {
        return noticeId;
    }
    String sourceFilename = legacyDocument.getSourceFilename();
    return StringUtils.hasText(sourceFilename)
            ? sourceFilename
            : String.valueOf(legacyDocument.getId());
}
/**
 * Unboxes an {@code Integer}, treating {@code null} as 0.
 *
 * @param value the possibly-null value
 * @return the value, or 0 when it is {@code null}
 */
private int safeInt(Integer value) {
    if (value == null) {
        return 0;
    }
    return value;
}
/**
 * Formats a package identifier as the four-digit year followed by the five-digit serial
 * number, both zero-padded (for example year 2024, serial 7 gives {@code 202400007}).
 *
 * <p>Formatting is pinned to {@code Locale.ROOT}: with the default locale, {@code %d} may
 * render locale-specific digits, which would produce identifiers that do not match stored ones.
 *
 * @param year         the package year
 * @param serialNumber the package serial number within the year
 * @return the package identifier
 */
private String formatPackageIdentifier(int year, int serialNumber) {
    return String.format(java.util.Locale.ROOT, "%04d%05d", year, serialNumber);
}
/**
 * Builds the one-line run summary: package and document counts, total findings, warnings and
 * errors, plus a {@code truncated=true} suffix when findings were suppressed.
 *
 * @param scannedPackages        number of packages scanned
 * @param scannedLegacyDocuments number of legacy documents scanned
 * @param accumulator            running counters for this run
 * @return the summary text
 */
private String buildSummary(int scannedPackages,
                            int scannedLegacyDocuments,
                            AuditAccumulator accumulator) {
    StringBuilder summary = new StringBuilder();
    summary.append("packages=").append(scannedPackages);
    summary.append(", legacyDocuments=").append(scannedLegacyDocuments);
    summary.append(", findings=").append(accumulator.totalFindings());
    summary.append(", warnings=").append(accumulator.warningCount());
    summary.append(", errors=").append(accumulator.errorCount());
    if (accumulator.truncated()) {
        summary.append(", truncated=true");
    }
    return summary.toString();
}
/**
 * Mutable in-memory tally for a single audit run: scanned item counts, findings per severity,
 * and whether the finding limit was hit and the truncation marker already written.
 */
private static final class AuditAccumulator {

    private int scannedPackages;
    private int scannedLegacyDocuments;
    private int infoCount;
    private int warningCount;
    private int errorCount;
    private boolean truncated;
    private boolean truncationRecorded;

    // --- mutators ---

    /** Counts one more scanned package. */
    void incrementScannedPackages() {
        this.scannedPackages += 1;
    }

    /** Counts one more scanned legacy document. */
    void incrementScannedLegacyDocuments() {
        this.scannedLegacyDocuments += 1;
    }

    /**
     * Counts one finding of the given severity.
     *
     * @param severity                     severity of the finding just recorded
     * @param truncationFindingRecordedNow {@code true} when the finding is the truncation marker
     */
    void recordFinding(LegacyTedAuditSeverity severity, boolean truncationFindingRecordedNow) {
        switch (severity) {
            case ERROR -> this.errorCount += 1;
            case WARNING -> this.warningCount += 1;
            case INFO -> this.infoCount += 1;
        }
        if (truncationFindingRecordedNow) {
            this.truncationRecorded = true;
        }
    }

    /** Remembers that the finding limit was reached. */
    void markTruncated() {
        this.truncated = true;
    }

    // --- read access ---

    /** Sum of the INFO, WARNING and ERROR counts. */
    int totalFindings() {
        return this.infoCount + this.warningCount + this.errorCount;
    }

    int infoCount() {
        return this.infoCount;
    }

    int warningCount() {
        return this.warningCount;
    }

    int errorCount() {
        return this.errorCount;
    }

    int scannedPackages() {
        return this.scannedPackages;
    }

    int scannedLegacyDocuments() {
        return this.scannedLegacyDocuments;
    }

    boolean truncated() {
        return this.truncated;
    }

    boolean truncationRecorded() {
        return this.truncationRecorded;
    }
}
}

@ -0,0 +1,33 @@
package at.procon.dip.migration.audit.startup;
import at.procon.dip.migration.audit.config.LegacyTedAuditProperties;
import at.procon.dip.migration.audit.service.LegacyTedAuditService;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.stereotype.Component;
/**
 * Application runner that triggers the legacy TED audit once at startup when both the audit
 * subsystem and its startup run are enabled in the configuration.
 *
 * <p>The audit service rethrows any runtime failure and this runner does not catch it, so a
 * failing startup audit propagates out of {@link #run(ApplicationArguments)}.
 */
@Component
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@RequiredArgsConstructor
@Slf4j
public class LegacyTedAuditStartupRunner implements ApplicationRunner {

    private final LegacyTedAuditProperties properties;
    private final LegacyTedAuditService legacyTedAuditService;

    /**
     * Runs the audit with the configured startup limit, or does nothing when the audit or its
     * startup run is disabled.
     *
     * @param args application arguments (unused)
     */
    @Override
    public void run(ApplicationArguments args) {
        boolean startupAuditRequested = properties.isEnabled() && properties.isStartupRunEnabled();
        if (startupAuditRequested) {
            int limit = properties.getStartupRunLimit();
            // A limit of 0 (or less) is reported as "unbounded" in the log line.
            Object limitLabel = limit > 0 ? limit : "unbounded";
            log.info("Wave 1 / Milestone A startup audit enabled - scanning legacy TED data with limit {}",
                    limitLabel);
            legacyTedAuditService.executeAudit(limit);
        }
    }
}

@ -34,6 +34,7 @@ public class TedProcessorProperties {
private SolutionBriefProperties solutionBrief = new SolutionBriefProperties(); private SolutionBriefProperties solutionBrief = new SolutionBriefProperties();
private ProjectionProperties projection = new ProjectionProperties(); private ProjectionProperties projection = new ProjectionProperties();
private GenericIngestionProperties genericIngestion = new GenericIngestionProperties(); private GenericIngestionProperties genericIngestion = new GenericIngestionProperties();
private RepairProperties repair = new RepairProperties();
/** /**
* Input directory configuration for Apache Camel file consumer. * Input directory configuration for Apache Camel file consumer.
@ -356,6 +357,64 @@ public class TedProcessorProperties {
private boolean prioritizeCurrentYear = true; private boolean prioritizeCurrentYear = true;
} }
/**
 * Legacy TED package repair / re-import configuration.
 * Bound as the {@code repair} section of {@code TedProcessorProperties} and read by the
 * repair service and its startup runner.
 */
@Data
public static class RepairProperties {
    /**
     * Enable startup repair of incomplete or missing TED packages.
     * When false the startup runner returns without doing anything.
     */
    private boolean enabled = false;
    /**
     * If true, only logs the selected package candidates without modifying data.
     */
    private boolean dryRun = false;
    /**
     * Maximum number of packages to process in one startup run.
     * Candidates beyond this count are dropped from the end of the selection.
     */
    @Positive
    private int maxPackages = 100;
    /**
     * Optional explicit package identifiers (YYYYSSSSS) to repair.
     * When this list is non-empty it replaces the automatic selection, so the range
     * and gap settings below are not consulted.
     */
    private java.util.List<String> packageIdentifiers = new java.util.ArrayList<>();
    /**
     * Optional lower bound package identifier (inclusive).
     * When blank, the first known package (ordered by year and serial number) is used.
     */
    private String fromPackageIdentifier;
    /**
     * Optional upper bound package identifier (inclusive).
     * When blank, the last known package (ordered by year and serial number) is used.
     */
    private String toPackageIdentifier;
    /**
     * Include missing package sequence numbers inside the selected range.
     */
    private boolean includeMissingSequenceGaps = true;
    /**
     * Re-download the package archive when it is missing locally.
     * When false, a package whose archive is missing locally is marked FAILED instead of
     * being downloaded.
     */
    private boolean redownloadMissingArchives = true;
    /**
     * Always re-download the package archive even when a local archive already exists.
     */
    private boolean forceRedownload = false;
    /**
     * Allow the startup repair to run although the automatic legacy package download is enabled.
     * When false (the default), the startup runner aborts with an {@code IllegalStateException}
     * if {@code ted.download.enabled} is true, to avoid concurrent package processing.
     */
    private boolean allowWhileDownloadEnabled = false;
}
/** /**
* IMAP Mail configuration for email processing. * IMAP Mail configuration for email processing.
*/ */

@ -58,7 +58,7 @@ public class Organization {
@Column(name = "country_code", length = 10) @Column(name = "country_code", length = 10)
private String countryCode; private String countryCode;
@Column(name = "city", length = 255) @Column(name = "city", columnDefinition = "TEXT")
private String city; private String city;
@Column(name = "postal_code", length = 255) @Column(name = "postal_code", length = 255)

@ -102,7 +102,7 @@ public class ProcurementDocument {
@Column(name = "buyer_country_code", length = 10) @Column(name = "buyer_country_code", length = 10)
private String buyerCountryCode; private String buyerCountryCode;
@Column(name = "buyer_city", length = 255) @Column(name = "buyer_city", columnDefinition = "TEXT")
private String buyerCity; private String buyerCity;
@Column(name = "buyer_postal_code", length = 100) @Column(name = "buyer_postal_code", length = 100)
@ -124,7 +124,7 @@ public class ProcurementDocument {
@Column(name = "project_description", columnDefinition = "TEXT") @Column(name = "project_description", columnDefinition = "TEXT")
private String projectDescription; private String projectDescription;
@Column(name = "internal_reference", length = 500) @Column(name = "internal_reference", columnDefinition = "TEXT")
private String internalReference; private String internalReference;
@Enumerated(EnumType.STRING) @Enumerated(EnumType.STRING)

@ -0,0 +1,446 @@
package at.procon.ted.repair;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.TedDailyPackageRepository;
import at.procon.ted.service.BatchDocumentProcessingService;
import at.procon.ted.service.TedPackageDownloadService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Startup tool for repairing / re-importing incomplete legacy TED daily packages.
*
* Strategy:
* - Identify incomplete package rows from {@code ted.ted_daily_package}
* - Optionally include missing sequence numbers inside a configured package range
* - Reuse existing batch XML processing so already-imported XML documents are skipped by hash,
* while missing documents are inserted during the repair run
*/
@Service
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@RequiredArgsConstructor
@Slf4j
public class TedPackageRepairService {
private static final Pattern PACKAGE_IDENTIFIER_PATTERN = Pattern.compile("\\d{9}");
private static final int PROCESSING_CHUNK_SIZE = 25;
private final TedProcessorProperties properties;
private final TedDailyPackageRepository packageRepository;
private final TedPackageDownloadService downloadService;
private final BatchDocumentProcessingService batchProcessingService;
/**
 * Runs one repair pass over the packages selected by the {@code repair} configuration.
 * <p>
 * In dry-run mode the candidates are only logged and reported back. Otherwise each candidate
 * is repaired in turn; an exception raised for one candidate is logged, counted as a failure
 * and does not stop the remaining candidates.
 *
 * @return counters for the run plus the identifiers of the packages that were handled
 *         (in dry-run mode: the identifiers of all selected candidates)
 */
public RepairSummary repairConfiguredPackages() {
    TedProcessorProperties.RepairProperties settings = properties.getRepair();
    List<RepairCandidate> selected = resolveCandidates(settings);
    if (selected.isEmpty()) {
        log.info("TED package repair found no matching incomplete packages");
        return new RepairSummary(0, 0, 0, 0, List.of());
    }

    boolean dryRun = settings.isDryRun();
    log.info("TED package repair selected {} package candidates (dryRun={})", selected.size(), dryRun);
    for (RepairCandidate candidate : selected) {
        log.info("Repair candidate: {} [{}]", candidate.packageIdentifier(), candidate.reason());
    }
    if (dryRun) {
        List<String> selectedIdentifiers = selected.stream()
                .map(RepairCandidate::packageIdentifier)
                .toList();
        return new RepairSummary(selected.size(), 0, 0, 0, selectedIdentifiers);
    }

    int repairedCount = 0;
    int failureCount = 0;
    int missingRemotelyCount = 0;
    List<String> handledIdentifiers = new ArrayList<>();
    for (RepairCandidate candidate : selected) {
        try {
            RepairExecutionResult execution = repairCandidate(candidate, settings);
            // Only candidates that ran through without an exception are listed as processed.
            handledIdentifiers.add(candidate.packageIdentifier());
            switch (execution.outcome()) {
                case FAILED -> failureCount++;
                case NOT_FOUND -> missingRemotelyCount++;
                case COMPLETED -> repairedCount++;
            }
        } catch (Exception e) {
            failureCount++;
            log.error("TED package repair failed for {}: {}", candidate.packageIdentifier(), e.getMessage(), e);
            markExistingPackageFailure(candidate.existingPackage(), "Repair run failed: " + e.getMessage());
        }
    }

    log.info("TED package repair finished: selected={}, succeeded={}, failed={}, notFound={}",
            selected.size(), repairedCount, failureCount, missingRemotelyCount);
    return new RepairSummary(selected.size(), repairedCount, failureCount, missingRemotelyCount, handledIdentifiers);
}
/**
 * Determines which packages a repair run should handle.
 * <ul>
 *   <li>Explicitly configured identifiers take precedence over every other setting.</li>
 *   <li>Without gap inspection and without range bounds, all known incomplete packages
 *       are selected.</li>
 *   <li>Otherwise the inclusive range between the configured (or first/last known) identifiers
 *       is walked year by year; incomplete known packages are selected and, when enabled,
 *       serial numbers without a package row are added as missing candidates.</li>
 * </ul>
 * The result is always capped at {@code maxPackages}.
 *
 * @param repairProperties repair configuration to evaluate
 * @return ordered candidates, possibly empty
 * @throws IllegalArgumentException if a bound is not a valid identifier or from &gt; to
 */
List<RepairCandidate> resolveCandidates(TedProcessorProperties.RepairProperties repairProperties) {
    List<TedDailyPackage> knownPackages =
            packageRepository.findAll(Sort.by(Sort.Direction.ASC, "year", "serialNumber"));
    // On duplicate identifiers the first row (in year/serial order) wins.
    Map<String, TedDailyPackage> knownByIdentifier = knownPackages.stream()
            .collect(Collectors.toMap(
                    TedDailyPackage::getPackageIdentifier,
                    pkg -> pkg,
                    (kept, ignored) -> kept,
                    LinkedHashMap::new));

    if (!repairProperties.getPackageIdentifiers().isEmpty()) {
        return resolveExplicitCandidates(
                repairProperties.getPackageIdentifiers(), knownByIdentifier, repairProperties.getMaxPackages());
    }
    if (knownPackages.isEmpty()) {
        return List.of();
    }

    String configuredFrom = repairProperties.getFromPackageIdentifier();
    String configuredTo = repairProperties.getToPackageIdentifier();
    boolean walkRange = repairProperties.isIncludeMissingSequenceGaps()
            || hasText(configuredFrom)
            || hasText(configuredTo);

    List<RepairCandidate> selected = new ArrayList<>();
    Set<String> alreadySelected = new LinkedHashSet<>();

    if (!walkRange) {
        for (TedDailyPackage known : knownPackages) {
            if (!isIncomplete(known)) {
                continue;
            }
            if (alreadySelected.add(known.getPackageIdentifier())) {
                selected.add(RepairCandidate.existing(known, repairReasonFor(known)));
            }
        }
        return limitCandidates(selected, repairProperties.getMaxPackages());
    }

    // Missing bounds fall back to the first / last known package of the sorted list.
    String lowerIdentifier = hasText(configuredFrom)
            ? configuredFrom
            : knownPackages.getFirst().getPackageIdentifier();
    PackageCoordinates lowerBound = parseIdentifier(lowerIdentifier);
    String upperIdentifier = hasText(configuredTo)
            ? configuredTo
            : knownPackages.getLast().getPackageIdentifier();
    PackageCoordinates upperBound = parseIdentifier(upperIdentifier);
    if (lowerBound.compareTo(upperBound) > 0) {
        throw new IllegalArgumentException("Repair package range is invalid: from > to");
    }

    // Highest serial number seen per year; used as the end of every year except the last one.
    Map<Integer, Integer> highestSerialByYear = knownPackages.stream()
            .collect(Collectors.groupingBy(TedDailyPackage::getYear,
                    LinkedHashMap::new,
                    Collectors.collectingAndThen(
                            Collectors.maxBy(Comparator.comparingInt(TedDailyPackage::getSerialNumber)),
                            newest -> newest.map(TedDailyPackage::getSerialNumber).orElse(0))));

    for (int year = lowerBound.year(); year <= upperBound.year(); year++) {
        int firstSerial = (year == lowerBound.year()) ? lowerBound.serialNumber() : 1;
        int lastSerial = (year == upperBound.year())
                ? upperBound.serialNumber()
                : highestSerialByYear.getOrDefault(year, 0);
        if (lastSerial <= 0 || lastSerial < firstSerial) {
            continue;
        }
        for (int serial = firstSerial; serial <= lastSerial; serial++) {
            String identifier = formatPackageIdentifier(year, serial);
            TedDailyPackage known = knownByIdentifier.get(identifier);
            if (known == null) {
                if (repairProperties.isIncludeMissingSequenceGaps() && alreadySelected.add(identifier)) {
                    selected.add(RepairCandidate.missing(year, serial, identifier, "MISSING_SEQUENCE_GAP"));
                }
                continue;
            }
            if (isIncomplete(known) && alreadySelected.add(identifier)) {
                selected.add(RepairCandidate.existing(known, repairReasonFor(known)));
            }
        }
    }
    return limitCandidates(selected, repairProperties.getMaxPackages());
}
/**
 * Builds candidates for an explicitly configured identifier list.
 * Blank entries and repeated identifiers are skipped; every remaining identifier is validated
 * and becomes a candidate regardless of whether its package row looks complete.
 *
 * @param packageIdentifiers   configured identifiers (surrounding whitespace is trimmed)
 * @param existingByIdentifier known package rows keyed by identifier
 * @param maxPackages          cap applied to the resulting list
 * @return candidates in configuration order, capped at {@code maxPackages}
 * @throws IllegalArgumentException if an identifier is not a valid package identifier
 */
private List<RepairCandidate> resolveExplicitCandidates(Collection<String> packageIdentifiers,
                                                        Map<String, TedDailyPackage> existingByIdentifier,
                                                        int maxPackages) {
    Set<String> handled = new LinkedHashSet<>();
    List<RepairCandidate> selected = new ArrayList<>();
    for (String configured : packageIdentifiers) {
        if (hasText(configured)) {
            String identifier = configured.trim();
            if (handled.add(identifier)) {
                // Parsing happens before the lookup so malformed identifiers are always rejected.
                PackageCoordinates coordinates = parseIdentifier(identifier);
                TedDailyPackage known = existingByIdentifier.get(identifier);
                RepairCandidate candidate = (known == null)
                        ? RepairCandidate.missing(
                                coordinates.year(), coordinates.serialNumber(), identifier, "EXPLICIT_PACKAGE")
                        : RepairCandidate.existing(known, repairReasonFor(known));
                selected.add(candidate);
            }
        }
    }
    return limitCandidates(selected, maxPackages);
}
/**
 * Caps the candidate list at {@code maxPackages} entries.
 *
 * @param candidates  candidates in processing order
 * @param maxPackages maximum number of candidates to keep
 * @return the given list when it is within the limit, otherwise a copy of its first
 *         {@code maxPackages} entries
 */
private List<RepairCandidate> limitCandidates(List<RepairCandidate> candidates, int maxPackages) {
    boolean withinLimit = candidates.size() <= maxPackages;
    return withinLimit
            ? candidates
            : new ArrayList<>(candidates.subList(0, maxPackages));
}
/**
 * Repairs a single package: makes sure the archive is available, extracts it, feeds all XML
 * files through the batch processor in chunks and records the outcome on the package row.
 * <p>
 * Archive handling:
 * <ul>
 *   <li>archive missing locally and re-download disabled: the package is marked FAILED;</li>
 *   <li>archive missing locally, or force re-download enabled: the archive is downloaded;
 *       a {@code null} download result marks the package NOT_FOUND;</li>
 *   <li>otherwise the local archive is reused.</li>
 * </ul>
 * A package without a row yet gets a new row before processing starts.
 * <p>
 * NOTE(review): this method is called through {@code this} from
 * {@code repairConfiguredPackages}; with proxy-based Spring transactions the
 * {@code @Transactional} annotation would presumably not take effect on that call path - confirm.
 *
 * @param candidate        package to repair
 * @param repairProperties repair configuration (download behaviour)
 * @return the outcome together with a short message
 * @throws Exception if downloading, hashing, extracting or processing fails unexpectedly
 */
@Transactional
RepairExecutionResult repairCandidate(RepairCandidate candidate, TedProcessorProperties.RepairProperties repairProperties) throws Exception {
    TedDailyPackage packageEntity = candidate.existingPackage() != null
            ? candidate.existingPackage()
            : createMissingPackageRecord(candidate);
    String packageIdentifier = candidate.packageIdentifier();
    boolean downloadedNow = false;
    long startNanos = System.nanoTime();

    Path archivePath = packageArchivePath(packageIdentifier);
    boolean archiveMissing = !Files.exists(archivePath);
    // Only report a missing archive when it really is missing; previously a forced re-download
    // of an archive that exists locally ended up here with a false "missing locally" failure.
    if (archiveMissing && !repairProperties.isRedownloadMissingArchives()) {
        String message = "Package archive is missing locally and re-download is disabled";
        markFailure(packageEntity, message);
        return new RepairExecutionResult(RepairOutcome.FAILED, message);
    }
    if (archiveMissing || repairProperties.isForceRedownload()) {
        Path downloadedArchive = downloadService.downloadArchive(packageIdentifier);
        if (downloadedArchive == null) {
            packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.NOT_FOUND);
            packageEntity.setErrorMessage("Package not found during repair run");
            packageRepository.save(packageEntity);
            return new RepairExecutionResult(RepairOutcome.NOT_FOUND, "HTTP 404");
        }
        archivePath = downloadedArchive;
        downloadedNow = true;
        packageEntity.setDownloadedAt(OffsetDateTime.now());
        packageEntity.setDownloadUrl(downloadService.buildDownloadUrlForPackage(packageIdentifier));
    }

    // Reset the bookkeeping of any earlier attempt before processing starts.
    packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.PROCESSING);
    packageEntity.setErrorMessage(null);
    packageEntity.setProcessedCount(0);
    packageEntity.setFailedCount(0);
    packageEntity.setFileHash(downloadService.calculateArchiveHash(archivePath));
    packageRepository.save(packageEntity);

    List<Path> xmlFiles = downloadService.extractArchive(archivePath, packageIdentifier);
    packageEntity.setXmlFileCount(xmlFiles.size());
    packageRepository.save(packageEntity);

    int totalProcessed = 0;
    int totalFailed = 0;
    try {
        for (int i = 0; i < xmlFiles.size(); i += PROCESSING_CHUNK_SIZE) {
            int end = Math.min(i + PROCESSING_CHUNK_SIZE, xmlFiles.size());
            List<Path> chunk = xmlFiles.subList(i, end);
            BatchDocumentProcessingService.BatchProcessingResult result = batchProcessingService.processBatch(chunk);
            // Duplicates count as processed: the document is already present.
            totalProcessed += result.insertedCount() + result.duplicateCount();
            totalFailed += result.errorCount();
            // Persist progress after every chunk.
            packageEntity.setProcessedCount(totalProcessed);
            packageEntity.setFailedCount(totalFailed);
            packageRepository.save(packageEntity);
        }
    } finally {
        cleanupExtractedXmlFiles(xmlFiles);
        // Archives that were already present locally are never deleted by the repair run.
        if (downloadedNow && properties.getDownload().isDeleteAfterExtraction()) {
            deleteQuietly(archivePath);
        }
    }

    packageEntity.setProcessedAt(OffsetDateTime.now());
    packageEntity.setProcessingDurationMs((System.nanoTime() - startNanos) / 1_000_000L);
    packageEntity.setProcessedCount(totalProcessed);
    packageEntity.setFailedCount(totalFailed);

    if (totalFailed == 0 && totalProcessed == xmlFiles.size()) {
        packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.COMPLETED);
        packageEntity.setErrorMessage(null);
        packageRepository.save(packageEntity);
        return new RepairExecutionResult(RepairOutcome.COMPLETED, "Package repaired successfully");
    }

    String failureMessage = String.format(Locale.ROOT,
            "Repair incomplete: xmlFiles=%d, processed=%d, failed=%d",
            xmlFiles.size(), totalProcessed, totalFailed);
    markFailure(packageEntity, failureMessage);
    return new RepairExecutionResult(RepairOutcome.FAILED, failureMessage);
}
/**
 * Persists a new PENDING package row for a candidate that has no row yet.
 *
 * @param candidate candidate describing the missing package
 * @return the entity as returned by the repository
 */
private TedDailyPackage createMissingPackageRecord(RepairCandidate candidate) {
    String identifier = candidate.packageIdentifier();
    String downloadUrl = downloadService.buildDownloadUrlForPackage(identifier);
    TedDailyPackage placeholder = TedDailyPackage.builder()
            .downloadStatus(TedDailyPackage.DownloadStatus.PENDING)
            .downloadUrl(downloadUrl)
            .packageIdentifier(identifier)
            .year(candidate.year())
            .serialNumber(candidate.serialNumber())
            .build();
    return packageRepository.save(placeholder);
}
/**
 * Marks the package as FAILED with the given error message and saves it.
 *
 * @param packageEntity package row to update (must not be null)
 * @param message       error message to store
 */
private void markFailure(TedDailyPackage packageEntity, String message) {
    packageEntity.setErrorMessage(message);
    packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.FAILED);
    packageRepository.save(packageEntity);
}

/**
 * Null-tolerant variant of {@link #markFailure}: does nothing when the candidate has no
 * existing package row.
 *
 * @param packageEntity package row to update, or {@code null}
 * @param message       error message to store
 */
private void markExistingPackageFailure(TedDailyPackage packageEntity, String message) {
    if (packageEntity != null) {
        markFailure(packageEntity, message);
    }
}
/**
 * Resolves the expected local archive location, {@code <downloadDirectory>/<identifier>.tar.gz}.
 *
 * @param packageIdentifier package identifier used as the archive base name
 * @return path of the archive inside the configured download directory
 */
private Path packageArchivePath(String packageIdentifier) {
    Path downloadDirectory = Paths.get(properties.getDownload().getDownloadDirectory());
    String archiveName = packageIdentifier + ".tar.gz";
    return downloadDirectory.resolve(archiveName);
}
/**
 * Deletes the extracted XML files and, if it is empty afterwards, the directory that
 * contained the first file. Failures are only logged at debug level.
 *
 * @param xmlFiles extracted files to remove
 */
private void cleanupExtractedXmlFiles(List<Path> xmlFiles) {
    if (xmlFiles.isEmpty()) {
        return;
    }
    // Remember the directory before the files are gone.
    Path packageDirectory = xmlFiles.getFirst().getParent();
    xmlFiles.forEach(this::deleteQuietly);
    if (packageDirectory == null) {
        return;
    }
    try (var remainingEntries = Files.list(packageDirectory)) {
        boolean directoryEmpty = remainingEntries.findAny().isEmpty();
        if (directoryEmpty) {
            deleteQuietly(packageDirectory);
        }
    } catch (IOException e) {
        log.debug("Could not clean extracted package directory {}: {}", packageDirectory, e.getMessage());
    }
}

/**
 * Best-effort delete: removes the path if it exists and logs (debug) instead of failing.
 *
 * @param path file or directory to delete
 */
private void deleteQuietly(Path path) {
    try {
        Files.deleteIfExists(path);
    } catch (IOException deleteFailure) {
        log.debug("Could not delete {}: {}", path, deleteFailure.getMessage());
    }
}
/**
 * Decides whether a known package needs repair.
 * <p>
 * Packages without a status and NOT_FOUND packages are never selected. Every other
 * non-COMPLETED status is incomplete. A COMPLETED package is incomplete when it has no positive
 * XML file count, has failed documents, or its processed count differs from the XML file count.
 *
 * @param pkg package row to inspect, may be {@code null}
 * @return {@code true} if the package should be repaired
 */
boolean isIncomplete(TedDailyPackage pkg) {
    if (pkg == null) {
        return false;
    }
    TedDailyPackage.DownloadStatus status = pkg.getDownloadStatus();
    if (status == null || status == TedDailyPackage.DownloadStatus.NOT_FOUND) {
        return false;
    }
    if (status != TedDailyPackage.DownloadStatus.COMPLETED) {
        return true;
    }

    Integer expectedDocuments = pkg.getXmlFileCount();
    if (expectedDocuments == null || expectedDocuments <= 0) {
        return true;
    }
    Integer failedDocuments = pkg.getFailedCount();
    if (failedDocuments != null && failedDocuments > 0) {
        return true;
    }
    Integer processed = pkg.getProcessedCount();
    int processedDocuments = (processed == null) ? 0 : processed;
    return processedDocuments != expectedDocuments.intValue();
}
/**
 * Derives the reason label attached to a candidate for an existing package.
 *
 * @param pkg existing package row
 * @return {@code STATUS_<status>} for non-COMPLETED packages, otherwise
 *         {@code MISSING_XML_COUNT}, {@code FAILED_DOCUMENTS} or {@code COUNT_MISMATCH}
 */
private String repairReasonFor(TedDailyPackage pkg) {
    TedDailyPackage.DownloadStatus status = pkg.getDownloadStatus();
    if (status != TedDailyPackage.DownloadStatus.COMPLETED) {
        return "STATUS_" + status;
    }

    Integer xmlFileCount = pkg.getXmlFileCount();
    boolean xmlCountUnknown = xmlFileCount == null || xmlFileCount <= 0;
    if (xmlCountUnknown) {
        return "MISSING_XML_COUNT";
    }

    Integer failedCount = pkg.getFailedCount();
    boolean hasFailedDocuments = failedCount != null && failedCount > 0;
    // COUNT_MISMATCH is the fallback label once the other conditions are ruled out.
    return hasFailedDocuments ? "FAILED_DOCUMENTS" : "COUNT_MISMATCH";
}
/**
 * Splits a nine-digit package identifier into year (first four digits) and serial number
 * (remaining five digits). Surrounding whitespace is ignored.
 *
 * @param packageIdentifier identifier to parse, may be {@code null}
 * @return the parsed coordinates
 * @throws IllegalArgumentException if the identifier is {@code null} or not exactly nine digits
 */
private PackageCoordinates parseIdentifier(String packageIdentifier) {
    String candidate = (packageIdentifier == null) ? "" : packageIdentifier.trim();
    boolean wellFormed = PACKAGE_IDENTIFIER_PATTERN.matcher(candidate).matches();
    if (!wellFormed) {
        throw new IllegalArgumentException("Invalid package identifier: " + packageIdentifier);
    }
    int year = Integer.parseInt(candidate.substring(0, 4));
    int serialNumber = Integer.parseInt(candidate.substring(4));
    return new PackageCoordinates(year, serialNumber);
}
/**
 * Formats year and serial number as a package identifier: the year zero-padded to four digits
 * followed by the serial number zero-padded to five digits.
 *
 * @param year         package year
 * @param serialNumber serial number within the year
 * @return the identifier, e.g. {@code 202400005} for year 2024 and serial 5
 */
private String formatPackageIdentifier(int year, int serialNumber) {
    final String layout = "%04d%05d";
    return String.format(Locale.ROOT, layout, year, serialNumber);
}
/**
 * Tells whether a configuration value carries any non-whitespace content.
 *
 * @param value value to check, may be {@code null}
 * @return {@code false} for {@code null} or blank strings, {@code true} otherwise
 */
private boolean hasText(String value) {
    if (value == null) {
        return false;
    }
    return !value.isBlank();
}
/**
 * Year and serial number of a package, ordered by year first and serial number second.
 *
 * @param year         package year
 * @param serialNumber serial number within the year
 */
record PackageCoordinates(int year, int serialNumber) implements Comparable<PackageCoordinates> {

    /** Ordering used by {@link #compareTo}: ascending year, then ascending serial number. */
    private static final Comparator<PackageCoordinates> CHRONOLOGICAL =
            Comparator.comparingInt(PackageCoordinates::year)
                    .thenComparingInt(PackageCoordinates::serialNumber);

    @Override
    public int compareTo(PackageCoordinates other) {
        return CHRONOLOGICAL.compare(this, other);
    }
}
/**
 * A package selected for repair.
 *
 * @param year              package year
 * @param serialNumber      serial number within the year
 * @param packageIdentifier nine-digit package identifier
 * @param existingPackage   the known package row, or {@code null} when no row exists yet
 * @param reason            label explaining why the package was selected
 */
public record RepairCandidate(int year,
                              int serialNumber,
                              String packageIdentifier,
                              TedDailyPackage existingPackage,
                              String reason) {

    /** Creates a candidate backed by an existing package row; coordinates are taken from the row. */
    static RepairCandidate existing(TedDailyPackage pkg, String reason) {
        int year = pkg.getYear();
        int serialNumber = pkg.getSerialNumber();
        String identifier = pkg.getPackageIdentifier();
        return new RepairCandidate(year, serialNumber, identifier, pkg, reason);
    }

    /** Creates a candidate for a package that has no row yet. */
    static RepairCandidate missing(int year, int serialNumber, String packageIdentifier, String reason) {
        TedDailyPackage noExistingRow = null;
        return new RepairCandidate(year, serialNumber, packageIdentifier, noExistingRow, reason);
    }
}
/**
 * Result category of repairing a single package.
 */
enum RepairOutcome {
    /** All extracted XML files were processed without errors. */
    COMPLETED,
    /** The repair could not be completed (missing archive, processing errors, count mismatch). */
    FAILED,
    /** The archive download returned no file for this package. */
    NOT_FOUND
}
/**
 * Outcome of repairing one package together with a short explanatory message.
 *
 * @param outcome result category
 * @param message human-readable detail for the outcome
 */
record RepairExecutionResult(RepairOutcome outcome, String message) {
}
/**
 * Counters describing one repair run.
 *
 * @param selected                    number of candidates selected for the run
 * @param succeeded                   number of packages repaired completely
 * @param failed                      number of packages whose repair failed or threw an exception
 * @param notFound                    number of packages whose archive could not be downloaded
 * @param processedPackageIdentifiers identifiers of the handled packages; in dry-run mode the
 *                                    identifiers of all selected candidates
 */
public record RepairSummary(int selected,
                            int succeeded,
                            int failed,
                            int notFound,
                            List<String> processedPackageIdentifiers) {
}
}

@ -369,6 +369,35 @@ public class TedPackageDownloadService {
} }
} }
/**
 * Builds the download URL for a TED package identifier.
 * Public entry point that delegates to the internal {@code buildDownloadUrl}.
 *
 * @param packageId package identifier (presumably YYYYSSSSS - confirm against callers)
 * @return the download URL for that package
 */
public String buildDownloadUrlForPackage(String packageId) {
    return buildDownloadUrl(packageId);
}
/**
 * Downloads a package archive to the configured download directory.
 * Returns {@code null} when the remote package does not exist (HTTP 404).
 *
 * @param packageId package identifier; also used to build the download URL
 * @return path of the downloaded archive, or {@code null} if the package does not exist remotely
 * @throws IOException if the download fails
 */
public Path downloadArchive(String packageId) throws IOException {
    return downloadFile(buildDownloadUrl(packageId), packageId);
}
/**
 * Calculates the SHA-256 hash for a previously downloaded TED package archive.
 * Public entry point that delegates to the internal {@code calculateSHA256}.
 *
 * @param archivePath path of the archive file to hash
 * @return the hash as returned by {@code calculateSHA256}
 * @throws Exception if the hash cannot be calculated
 */
public String calculateArchiveHash(Path archivePath) throws Exception {
    return calculateSHA256(archivePath);
}
/**
 * Extracts XML files from a previously downloaded TED package archive.
 * Public entry point that delegates to the internal {@code extractTarGz}.
 *
 * @param tarGzFile path of the {@code .tar.gz} archive
 * @param packageId identifier of the package the archive belongs to
 * @return paths of the extracted XML files
 * @throws IOException if the archive cannot be read or extracted
 */
public List<Path> extractArchive(Path tarGzFile, String packageId) throws IOException {
    return extractTarGz(tarGzFile, packageId);
}
/** /**
* Baut die Download-URL. * Baut die Download-URL.
*/ */

@ -0,0 +1,42 @@
package at.procon.ted.startup;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.repair.TedPackageRepairService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
/**
* Optional startup runner that repairs / re-imports incomplete legacy TED packages.
*/
@Component
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@RequiredArgsConstructor
@Slf4j
@Order(50)
public class TedPackageRepairStartupRunner implements ApplicationRunner {
private final TedProcessorProperties properties;
private final TedPackageRepairService repairService;
@Override
public void run(ApplicationArguments args) {
if (!properties.getRepair().isEnabled()) {
return;
}
if (properties.getDownload().isEnabled() && !properties.getRepair().isAllowWhileDownloadEnabled()) {
throw new IllegalStateException(
"ted.repair.enabled=true requires ted.download.enabled=false " +
"or ted.repair.allow-while-download-enabled=true to avoid concurrent package processing");
}
log.info("Starting legacy TED package repair tool...");
repairService.repairConfiguredPackages();
}
}

@ -69,7 +69,7 @@ ted:
# Max consecutive 404 errors before stopping # Max consecutive 404 errors before stopping
max-consecutive-404: 4 max-consecutive-404: 4
# Polling interval (milliseconds) - 2 minutes # Polling interval (milliseconds) - 2 minutes
poll-interval: 300000 poll-interval: 120000
# Retry interval for tail NOT_FOUND packages - 6 hours # Retry interval for tail NOT_FOUND packages - 6 hours
not-found-retry-interval: 21600000 not-found-retry-interval: 21600000
# Grace period after year end before a previous-year tail 404 is treated as final # Grace period after year end before a previous-year tail 404 is treated as final
@ -87,6 +87,27 @@ ted:
# Prioritize current year first # Prioritize current year first
prioritize-current-year: false prioritize-current-year: false
repair:
# Enable one-off repair / re-import of incomplete TED packages on startup
enabled: false
# Only list candidate packages without modifying data
dry-run: false
# Safety cap for one startup run
max-packages: 100
# Optional explicit package identifiers to repair
package-identifiers: []
# Optional inclusive package range
from-package-identifier:
to-package-identifier:
# Also try to fill missing sequence numbers inside the selected range
include-missing-sequence-gaps: true
# Download missing archives when not available locally
redownload-missing-archives: true
# Always refresh the archive from TED before repairing
force-redownload: false
# Keep false so startup repair refuses to run while ted.download.enabled=true; set true only to override that safety check
allow-while-download-enabled: false
# IMAP Mail configuration # IMAP Mail configuration
mail: mail:
# Enable/disable mail processing # Enable/disable mail processing

@ -8,15 +8,15 @@ server:
spring: spring:
profiles: profiles:
active: legacy active: new
application: application:
name: document-intelligence-platform name: document-intelligence-platform
datasource: datasource:
url: jdbc:postgresql://localhost:5432/RELM url: jdbc:postgresql://94.130.218.54:32333/RELM
username: ${DB_USERNAME:postgres} username: ${DB_USERNAME:postgres}
password: ${DB_PASSWORD:P54!pcd#Wi} password: ${DB_PASSWORD:PDmXRx0Rbk9OFOn9qO5Gm/mPCfqW8zwbZ+/YIU1lySc=}
driver-class-name: org.postgresql.Driver driver-class-name: org.postgresql.Driver
hikari: hikari:
maximum-pool-size: 5 maximum-pool-size: 5
@ -28,7 +28,7 @@ spring:
jpa: jpa:
hibernate: hibernate:
ddl-auto: update ddl-auto: validate
show-sql: false show-sql: false
open-in-view: false open-in-view: false
properties: properties:

@ -0,0 +1,57 @@
-- Wave 1 / Milestone A: read-only legacy audit run/finding persistence.
-- Additive tables only; no legacy business data is modified by this migration.
-- All statements use IF NOT EXISTS, so re-running the script does not fail on existing objects.

-- One row per audit execution: status, requested limits, scan counters and timing.
CREATE TABLE IF NOT EXISTS DOC.doc_legacy_audit_run (
    id UUID PRIMARY KEY,
    status VARCHAR(32) NOT NULL,
    -- Nullable: a run may have no requested limit recorded.
    requested_limit INTEGER,
    page_size INTEGER NOT NULL,
    scanned_packages INTEGER NOT NULL DEFAULT 0,
    scanned_legacy_documents INTEGER NOT NULL DEFAULT 0,
    -- Total number of findings plus a breakdown by severity.
    finding_count INTEGER NOT NULL DEFAULT 0,
    info_count INTEGER NOT NULL DEFAULT 0,
    warning_count INTEGER NOT NULL DEFAULT 0,
    error_count INTEGER NOT NULL DEFAULT 0,
    started_at TIMESTAMPTZ NOT NULL,
    -- Nullable: stays empty until the run has a completion time.
    completed_at TIMESTAMPTZ,
    summary_text TEXT,
    failure_message TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Lookup of runs by status.
CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_run_status
    ON DOC.doc_legacy_audit_run(status);

-- Newest-first listing of runs.
CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_run_started
    ON DOC.doc_legacy_audit_run(started_at DESC);

-- One row per finding produced by an audit run.
CREATE TABLE IF NOT EXISTS DOC.doc_legacy_audit_finding (
    id UUID PRIMARY KEY,
    -- Findings are deleted together with their run.
    run_id UUID NOT NULL REFERENCES DOC.doc_legacy_audit_run(id) ON DELETE CASCADE,
    severity VARCHAR(16) NOT NULL,
    finding_type VARCHAR(64) NOT NULL,
    -- Optional references to the affected objects; no foreign keys are declared for them.
    package_identifier VARCHAR(20),
    legacy_procurement_document_id UUID,
    document_id UUID,
    ted_notice_projection_id UUID,
    reference_key VARCHAR(255),
    message TEXT NOT NULL,
    details_text TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Indexes for the common finding filters: by run, type, severity and referenced document.
CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_run
    ON DOC.doc_legacy_audit_finding(run_id);

CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_type
    ON DOC.doc_legacy_audit_finding(finding_type);

CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_severity
    ON DOC.doc_legacy_audit_finding(severity);

CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_legacy_doc
    ON DOC.doc_legacy_audit_finding(legacy_procurement_document_id);

CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_document
    ON DOC.doc_legacy_audit_finding(document_id);

@ -0,0 +1,241 @@
package at.procon.dip.migration.audit.service;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.when;
import at.procon.dip.migration.audit.config.LegacyTedAuditProperties;
import at.procon.dip.migration.audit.entity.LegacyTedAuditFinding;
import at.procon.dip.migration.audit.entity.LegacyTedAuditFindingType;
import at.procon.dip.migration.audit.entity.LegacyTedAuditRun;
import at.procon.dip.migration.audit.entity.LegacyTedAuditRunStatus;
import at.procon.dip.migration.audit.repository.LegacyTedAuditFindingRepository;
import at.procon.dip.migration.audit.repository.LegacyTedAuditRunRepository;
import at.procon.ted.model.entity.NoticeType;
import at.procon.ted.model.entity.ProcurementDocument;
import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.ProcurementDocumentRepository;
import at.procon.ted.repository.TedDailyPackageRepository;
import java.time.OffsetDateTime;
import java.time.Year;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageImpl;
import org.springframework.jdbc.core.JdbcTemplate;
@ExtendWith(MockitoExtension.class)
class LegacyTedAuditServiceTest {
@Mock
private TedDailyPackageRepository tedDailyPackageRepository;
@Mock
private ProcurementDocumentRepository procurementDocumentRepository;
@Mock
private LegacyTedAuditRunRepository runRepository;
@Mock
private LegacyTedAuditFindingRepository findingRepository;
@Mock
private JdbcTemplate jdbcTemplate;
private LegacyTedAuditService service;
private List<LegacyTedAuditFinding> persistedFindings;
/**
 * Creates the service under test with small limits and wires the mocked repositories:
 * saved runs and findings get an id assigned, findings are additionally captured in
 * {@code persistedFindings}, and the legacy document repository returns an empty page by default.
 */
@BeforeEach
void setUp() {
    persistedFindings = new ArrayList<>();

    LegacyTedAuditProperties auditProperties = new LegacyTedAuditProperties();
    auditProperties.setEnabled(true);
    auditProperties.setPageSize(50);
    auditProperties.setMaxFindingsPerRun(100);
    auditProperties.setMaxDuplicateSamples(10);

    service = new LegacyTedAuditService(
            auditProperties,
            tedDailyPackageRepository,
            procurementDocumentRepository,
            runRepository,
            findingRepository,
            jdbcTemplate
    );

    // Default: no legacy documents; individual tests override this stub where needed.
    when(procurementDocumentRepository.findAll(any(org.springframework.data.domain.Pageable.class)))
            .thenReturn(new PageImpl<>(List.of()));

    // Behave like a repository that assigns ids on first save.
    when(runRepository.save(any(LegacyTedAuditRun.class))).thenAnswer(call -> {
        LegacyTedAuditRun savedRun = call.getArgument(0);
        if (savedRun.getId() == null) {
            savedRun.setId(UUID.randomUUID());
        }
        return savedRun;
    });
    when(findingRepository.save(any(LegacyTedAuditFinding.class))).thenAnswer(call -> {
        LegacyTedAuditFinding savedFinding = call.getArgument(0);
        if (savedFinding.getId() == null) {
            savedFinding.setId(UUID.randomUUID());
        }
        persistedFindings.add(savedFinding);
        return savedFinding;
    });
}
/**
 * Packages 1, 3 and 4 of the current year exist: package 2 must be reported as a sequence gap,
 * packages 3 (one failed document) and 4 (FAILED status) as incomplete.
 */
@Test
void executeAudit_should_record_package_sequence_gaps_and_incomplete_packages() {
    int year = Year.now().getValue();
    String completeIdentifier = formatPackageIdentifier(year, 1);
    String gapIdentifier = formatPackageIdentifier(year, 2);
    String partialIdentifier = formatPackageIdentifier(year, 3);
    String failedIdentifier = formatPackageIdentifier(year, 4);

    TedDailyPackage completePackage = TedDailyPackage.builder()
            .packageIdentifier(completeIdentifier)
            .year(year)
            .serialNumber(1)
            .downloadStatus(TedDailyPackage.DownloadStatus.COMPLETED)
            .xmlFileCount(10)
            .processedCount(10)
            .failedCount(0)
            .fileHash("hash-1")
            .processedAt(OffsetDateTime.now())
            .build();
    TedDailyPackage partiallyProcessedPackage = TedDailyPackage.builder()
            .packageIdentifier(partialIdentifier)
            .year(year)
            .serialNumber(3)
            .downloadStatus(TedDailyPackage.DownloadStatus.COMPLETED)
            .xmlFileCount(10)
            .processedCount(9)
            .failedCount(1)
            .fileHash("hash-3")
            .processedAt(OffsetDateTime.now())
            .build();
    TedDailyPackage failedPackage = TedDailyPackage.builder()
            .packageIdentifier(failedIdentifier)
            .year(year)
            .serialNumber(4)
            .downloadStatus(TedDailyPackage.DownloadStatus.FAILED)
            .xmlFileCount(12)
            .processedCount(0)
            .failedCount(0)
            .errorMessage("processing failed")
            .build();
    when(tedDailyPackageRepository.findAll(any(org.springframework.data.domain.Sort.class)))
            .thenReturn(List.of(completePackage, partiallyProcessedPackage, failedPackage));

    LegacyTedAuditRun auditRun = service.executeAudit(0);

    assertThat(auditRun.getStatus()).isEqualTo(LegacyTedAuditRunStatus.COMPLETED);
    assertThat(auditRun.getScannedPackages()).isEqualTo(3);
    assertThat(persistedFindings)
            .extracting(LegacyTedAuditFinding::getFindingType)
            .contains(LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP,
                    LegacyTedAuditFindingType.PACKAGE_INCOMPLETE);
    assertThat(persistedFindings)
            .filteredOn(finding -> finding.getFindingType() == LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP)
            .extracting(LegacyTedAuditFinding::getReferenceKey)
            .contains(gapIdentifier);
    assertThat(persistedFindings)
            .filteredOn(finding -> finding.getFindingType() == LegacyTedAuditFindingType.PACKAGE_INCOMPLETE)
            .extracting(LegacyTedAuditFinding::getPackageIdentifier)
            .contains(partialIdentifier, failedIdentifier);
}
/**
 * Packages exist for two years ago and for the current year only: the year in between must be
 * reported as a sequence gap with reference key {@code year:<missing year>}.
 */
@Test
void executeAudit_should_record_missing_years_inside_audited_interval() {
    int latestYear = Year.now().getValue();
    int earliestYear = latestYear - 2;
    int missingYear = latestYear - 1;

    TedDailyPackage earliestPackage = TedDailyPackage.builder()
            .packageIdentifier(formatPackageIdentifier(earliestYear, 1))
            .year(earliestYear)
            .serialNumber(1)
            .downloadStatus(TedDailyPackage.DownloadStatus.COMPLETED)
            .xmlFileCount(1)
            .processedCount(1)
            .failedCount(0)
            .fileHash("hash-a")
            .processedAt(OffsetDateTime.now())
            .build();
    TedDailyPackage latestPackage = TedDailyPackage.builder()
            .packageIdentifier(formatPackageIdentifier(latestYear, 1))
            .year(latestYear)
            .serialNumber(1)
            .downloadStatus(TedDailyPackage.DownloadStatus.COMPLETED)
            .xmlFileCount(1)
            .processedCount(1)
            .failedCount(0)
            .fileHash("hash-b")
            .processedAt(OffsetDateTime.now())
            .build();
    when(tedDailyPackageRepository.findAll(any(org.springframework.data.domain.Sort.class)))
            .thenReturn(List.of(earliestPackage, latestPackage));

    LegacyTedAuditRun auditRun = service.executeAudit(0);

    assertThat(auditRun.getStatus()).isEqualTo(LegacyTedAuditRunStatus.COMPLETED);
    assertThat(persistedFindings)
            .filteredOn(finding -> finding.getFindingType() == LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP)
            .extracting(LegacyTedAuditFinding::getReferenceKey)
            .contains("year:" + missingYear);
}
/**
 * Two legacy documents with missing XML, missing text and missing publication id must yield
 * exactly the legacy integrity finding types and none of the DOC / projection finding types.
 */
@Test
void executeAudit_should_record_legacy_document_integrity_findings_only() {
    ProcurementDocument documentWithoutXml = ProcurementDocument.builder()
            .id(UUID.randomUUID())
            .documentHash("hash-1")
            .publicationId("2025/S 001-000001")
            .noticeType(NoticeType.CONTRACT_NOTICE)
            .xmlDocument(null)
            .textContent("hello")
            .build();
    ProcurementDocument documentWithoutTextAndPublicationId = ProcurementDocument.builder()
            .id(UUID.randomUUID())
            .documentHash("hash-2")
            .publicationId(null)
            .noticeType(NoticeType.CONTRACT_NOTICE)
            .xmlDocument("<xml/>")
            .textContent(null)
            .build();
    Page<ProcurementDocument> legacyDocuments =
            pageOf(documentWithoutXml, documentWithoutTextAndPublicationId);

    when(tedDailyPackageRepository.findAll(any(org.springframework.data.domain.Sort.class))).thenReturn(List.of());
    when(procurementDocumentRepository.findAll(any(org.springframework.data.domain.Pageable.class)))
            .thenReturn(legacyDocuments);

    LegacyTedAuditRun auditRun = service.executeAudit(10);

    assertThat(auditRun.getStatus()).isEqualTo(LegacyTedAuditRunStatus.COMPLETED);
    assertThat(auditRun.getScannedLegacyDocuments()).isEqualTo(2);
    assertThat(persistedFindings)
            .extracting(LegacyTedAuditFinding::getFindingType)
            .contains(
                    LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_XML,
                    LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_TEXT,
                    LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_PUBLICATION_ID
            )
            .doesNotContain(
                    LegacyTedAuditFindingType.DOC_DOCUMENT_MISSING,
                    LegacyTedAuditFindingType.DOC_SOURCE_MISSING,
                    LegacyTedAuditFindingType.DOC_ORIGINAL_CONTENT_MISSING,
                    LegacyTedAuditFindingType.DOC_PRIMARY_REPRESENTATION_MISSING,
                    LegacyTedAuditFindingType.TED_PROJECTION_MISSING,
                    LegacyTedAuditFindingType.TED_PROJECTION_MISSING_LEGACY_LINK,
                    LegacyTedAuditFindingType.TED_PROJECTION_DOCUMENT_MISMATCH,
                    LegacyTedAuditFindingType.DOC_DEDUP_HASH_DUPLICATE
            );
}
/**
 * Wraps the given documents, in argument order, in a {@link PageImpl}.
 *
 * @param documents the documents that make up the page content
 * @return a page containing exactly the given documents
 */
private Page<ProcurementDocument> pageOf(ProcurementDocument... documents) {
    List<ProcurementDocument> content = List.of(documents);
    return new PageImpl<>(content);
}
/**
 * Builds a package identifier by concatenating the year, zero-padded to at least four digits,
 * with the serial number, zero-padded to at least five digits.
 *
 * @param year the package year
 * @param serialNumber the package serial number within that year
 * @return the concatenated identifier, e.g. {@code 2026} and {@code 1} give {@code "202600001"}
 */
private String formatPackageIdentifier(int year, int serialNumber) {
    return String.format("%04d%05d", year, serialNumber);
}
}

@ -0,0 +1,120 @@
package at.procon.ted.repair;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.model.entity.TedDailyPackage;
import at.procon.ted.repository.TedDailyPackageRepository;
import at.procon.ted.service.BatchDocumentProcessingService;
import at.procon.ted.service.TedPackageDownloadService;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.springframework.data.domain.Sort;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
class TedPackageRepairServiceTest {
@TempDir
Path tempDir;
@Test
void resolveCandidatesIncludesIncompletePackagesAndMissingSequenceGaps() {
TedProcessorProperties properties = new TedProcessorProperties();
properties.getRepair().setEnabled(true);
properties.getRepair().setFromPackageIdentifier("202600001");
properties.getRepair().setToPackageIdentifier("202600003");
properties.getRepair().setIncludeMissingSequenceGaps(true);
properties.getRepair().setMaxPackages(10);
TedDailyPackageRepository repository = mock(TedDailyPackageRepository.class);
TedDailyPackage pkg1 = newPackage("202600001", 2026, 1, TedDailyPackage.DownloadStatus.COMPLETED, 20, 20, 0);
TedDailyPackage pkg3 = newPackage("202600003", 2026, 3, TedDailyPackage.DownloadStatus.PROCESSING, 20, 5, 0);
when(repository.findAll(any(Sort.class))).thenReturn(List.of(pkg1, pkg3));
TedPackageRepairService service = new TedPackageRepairService(
properties,
repository,
mock(TedPackageDownloadService.class),
mock(BatchDocumentProcessingService.class));
List<TedPackageRepairService.RepairCandidate> candidates = service.resolveCandidates(properties.getRepair());
assertThat(candidates).extracting(TedPackageRepairService.RepairCandidate::packageIdentifier)
.containsExactly("202600002", "202600003");
assertThat(candidates).extracting(TedPackageRepairService.RepairCandidate::reason)
.containsExactly("MISSING_SEQUENCE_GAP", "STATUS_PROCESSING");
}
@Test
void repairCandidateProcessesExistingArchiveAndMarksPackageCompleted() throws Exception {
TedProcessorProperties properties = new TedProcessorProperties();
properties.getRepair().setEnabled(true);
properties.getRepair().setRedownloadMissingArchives(false);
properties.getDownload().setDownloadDirectory(tempDir.toString());
properties.getDownload().setDeleteAfterExtraction(false);
Path archive = tempDir.resolve("202600003.tar.gz");
Files.writeString(archive, "dummy");
TedDailyPackageRepository repository = mock(TedDailyPackageRepository.class);
TedDailyPackage pkg = newPackage("202600003", 2026, 3, TedDailyPackage.DownloadStatus.PROCESSING, 3, 0, 0);
when(repository.save(any(TedDailyPackage.class))).thenAnswer(invocation -> invocation.getArgument(0));
when(repository.findByPackageIdentifier("202600003")).thenReturn(Optional.of(pkg));
TedPackageDownloadService downloadService = mock(TedPackageDownloadService.class);
Path extractedDir = Files.createDirectory(tempDir.resolve("extracted"));
Path xml1 = Files.writeString(extractedDir.resolve("a.xml"), "<a/>");
Path xml2 = Files.writeString(extractedDir.resolve("b.xml"), "<b/>");
Path xml3 = Files.writeString(extractedDir.resolve("c.xml"), "<c/>");
when(downloadService.calculateArchiveHash(eq(archive))).thenReturn("hash-1");
when(downloadService.extractArchive(eq(archive), eq("202600003"))).thenReturn(List.of(xml1, xml2, xml3));
BatchDocumentProcessingService batchService = mock(BatchDocumentProcessingService.class);
when(batchService.processBatch(any())).thenReturn(new BatchDocumentProcessingService.BatchProcessingResult(
1, 2, 0, 5L, List.of(UUID.randomUUID()), List.of()));
TedPackageRepairService service = new TedPackageRepairService(properties, repository, downloadService, batchService);
TedPackageRepairService.RepairCandidate candidate = TedPackageRepairService.RepairCandidate.existing(pkg, "STATUS_PROCESSING");
var result = service.repairCandidate(candidate, properties.getRepair());
assertThat(result.outcome()).isEqualTo(TedPackageRepairService.RepairOutcome.COMPLETED);
assertThat(pkg.getDownloadStatus()).isEqualTo(TedDailyPackage.DownloadStatus.COMPLETED);
assertThat(pkg.getProcessedCount()).isEqualTo(3);
assertThat(pkg.getFailedCount()).isZero();
assertThat(pkg.getFileHash()).isEqualTo("hash-1");
assertThat(pkg.getProcessedAt()).isNotNull();
}
private TedDailyPackage newPackage(String packageIdentifier,
int year,
int serial,
TedDailyPackage.DownloadStatus status,
Integer xmlCount,
Integer processed,
Integer failed) {
TedDailyPackage pkg = new TedDailyPackage();
pkg.setId(UUID.randomUUID());
pkg.setPackageIdentifier(packageIdentifier);
pkg.setYear(year);
pkg.setSerialNumber(serial);
pkg.setDownloadStatus(status);
pkg.setXmlFileCount(xmlCount);
pkg.setProcessedCount(processed);
pkg.setFailedCount(failed);
pkg.setDownloadUrl("https://ted.europa.eu/packages/daily/" + packageIdentifier);
pkg.setCreatedAt(OffsetDateTime.now());
pkg.setUpdatedAt(OffsetDateTime.now());
return pkg;
}
}
Loading…
Cancel
Save