diff --git a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java index 2d7011d..1a475f7 100644 --- a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java +++ b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java @@ -16,8 +16,8 @@ import org.springframework.scheduling.annotation.EnableAsync; */ @SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"}) @EnableAsync -@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity"}) -@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository"}) +@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity", "at.procon.dip.domain.ted.entity", "at.procon.dip.embedding.job.entity", "at.procon.dip.migration.audit.entity"}) +@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository", "at.procon.dip.domain.ted.repository", "at.procon.dip.embedding.job.repository", "at.procon.dip.migration.audit.repository"}) public class DocumentIntelligencePlatformApplication { public static void main(String[] args) { diff --git a/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeOrganization.java b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeOrganization.java index 9f63f9c..d6738ce 100644 --- a/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeOrganization.java +++ b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeOrganization.java @@ -55,10 +55,10 @@ public class 
TedNoticeOrganization { @Column(name = "company_id", length = 1000) private String companyId; - @Column(name = "country_code", length = 10) + @Column(name = "country_code", columnDefinition = "TEXT") private String countryCode; - @Column(name = "city", length = 255) + @Column(name = "city", columnDefinition = "TEXT") private String city; @Column(name = "postal_code", length = 255) diff --git a/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java index b06dea1..704b5b9 100644 --- a/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java +++ b/src/main/java/at/procon/dip/domain/ted/entity/TedNoticeProjection.java @@ -108,7 +108,7 @@ public class TedNoticeProjection { @Column(name = "buyer_country_code", length = 10) private String buyerCountryCode; - @Column(name = "buyer_city", length = 255) + @Column(name = "buyer_city", columnDefinition = "TEXT") private String buyerCity; @Column(name = "buyer_postal_code", length = 100) @@ -129,7 +129,7 @@ public class TedNoticeProjection { @Column(name = "project_description", columnDefinition = "TEXT") private String projectDescription; - @Column(name = "internal_reference", length = 500) + @Column(name = "internal_reference", columnDefinition = "TEXT") private String internalReference; @Enumerated(EnumType.STRING) diff --git a/src/main/java/at/procon/dip/migration/audit/config/LegacyTedAuditProperties.java b/src/main/java/at/procon/dip/migration/audit/config/LegacyTedAuditProperties.java new file mode 100644 index 0000000..a4a2793 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/config/LegacyTedAuditProperties.java @@ -0,0 +1,47 @@ +package at.procon.dip.migration.audit.config; + +import jakarta.validation.constraints.Min; +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Configuration; + +@Configuration 
+@ConfigurationProperties(prefix = "dip.migration.legacy-audit") +@Data +public class LegacyTedAuditProperties { + + /** + * Enables the Wave 1 / Milestone A legacy TED audit subsystem. + */ + private boolean enabled = true; + + /** + * Automatically runs the read-only audit on application startup. + */ + private boolean startupRunEnabled = false; + + /** + * Maximum number of legacy TED documents to scan during startup. + * 0 means no limit. + */ + @Min(0) + private int startupRunLimit = 500; + + /** + * Batch size for legacy TED document paging. + */ + @Min(1) + private int pageSize = 100; + + /** + * Hard cap for persisted findings in a single run to avoid runaway audit volume. + */ + @Min(1) + private int maxFindingsPerRun = 10000; + + /** + * Maximum number of duplicate/grouped samples recorded for global aggregate checks. + */ + @Min(1) + private int maxDuplicateSamples = 100; +} diff --git a/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditFinding.java b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditFinding.java new file mode 100644 index 0000000..f794279 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditFinding.java @@ -0,0 +1,87 @@ +package at.procon.dip.migration.audit.entity; + +import at.procon.dip.architecture.SchemaNames; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; 
+import lombok.Setter; + +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_legacy_audit_finding", indexes = { + @Index(name = "idx_doc_legacy_audit_find_run", columnList = "run_id"), + @Index(name = "idx_doc_legacy_audit_find_type", columnList = "finding_type"), + @Index(name = "idx_doc_legacy_audit_find_severity", columnList = "severity"), + @Index(name = "idx_doc_legacy_audit_find_legacy_doc", columnList = "legacy_procurement_document_id"), + @Index(name = "idx_doc_legacy_audit_find_document", columnList = "document_id") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class LegacyTedAuditFinding { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "run_id", nullable = false) + private LegacyTedAuditRun run; + + @Enumerated(EnumType.STRING) + @Column(name = "severity", nullable = false, length = 16) + private LegacyTedAuditSeverity severity; + + @Enumerated(EnumType.STRING) + @Column(name = "finding_type", nullable = false, length = 64) + private LegacyTedAuditFindingType findingType; + + @Column(name = "package_identifier", length = 20) + private String packageIdentifier; + + @Column(name = "legacy_procurement_document_id") + private UUID legacyProcurementDocumentId; + + @Column(name = "document_id") + private UUID documentId; + + @Column(name = "ted_notice_projection_id") + private UUID tedNoticeProjectionId; + + @Column(name = "reference_key", length = 255) + private String referenceKey; + + @Column(name = "message", nullable = false, columnDefinition = "TEXT") + private String message; + + @Column(name = "details_text", columnDefinition = "TEXT") + private String detailsText; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + if (createdAt == null) { + createdAt = 
OffsetDateTime.now(); + } + } +} diff --git a/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditFindingType.java b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditFindingType.java new file mode 100644 index 0000000..5fd7d31 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditFindingType.java @@ -0,0 +1,28 @@ +package at.procon.dip.migration.audit.entity; + +public enum LegacyTedAuditFindingType { + PACKAGE_SEQUENCE_GAP, + PACKAGE_INCOMPLETE, + PACKAGE_COMPLETED_WITHOUT_PROCESSED_AT, + PACKAGE_COMPLETED_COUNT_MISMATCH, + PACKAGE_MISSING_XML_FILE_COUNT, + PACKAGE_MISSING_FILE_HASH, + PACKAGE_FAILED_WITHOUT_ERROR_MESSAGE, + LEGACY_PUBLICATION_ID_DUPLICATE, + DOC_DEDUP_HASH_DUPLICATE, + LEGACY_DOCUMENT_MISSING_HASH, + LEGACY_DOCUMENT_MISSING_XML, + LEGACY_DOCUMENT_MISSING_TEXT, + LEGACY_DOCUMENT_MISSING_PUBLICATION_ID, + DOC_DOCUMENT_MISSING, + DOC_DOCUMENT_DUPLICATE, + DOC_SOURCE_MISSING, + DOC_ORIGINAL_CONTENT_MISSING, + DOC_ORIGINAL_CONTENT_DUPLICATE, + DOC_PRIMARY_REPRESENTATION_MISSING, + DOC_PRIMARY_REPRESENTATION_DUPLICATE, + TED_PROJECTION_MISSING, + TED_PROJECTION_MISSING_LEGACY_LINK, + TED_PROJECTION_DOCUMENT_MISMATCH, + FINDINGS_TRUNCATED +} diff --git a/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditRun.java b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditRun.java new file mode 100644 index 0000000..cd37174 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditRun.java @@ -0,0 +1,110 @@ +package at.procon.dip.migration.audit.entity; + +import at.procon.dip.architecture.SchemaNames; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.PrePersist; 
+import jakarta.persistence.PreUpdate; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_legacy_audit_run", indexes = { + @Index(name = "idx_doc_legacy_audit_run_status", columnList = "status"), + @Index(name = "idx_doc_legacy_audit_run_started", columnList = "started_at") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class LegacyTedAuditRun { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @Enumerated(EnumType.STRING) + @Column(name = "status", nullable = false, length = 32) + private LegacyTedAuditRunStatus status; + + @Column(name = "requested_limit") + private Integer requestedLimit; + + @Column(name = "page_size", nullable = false) + private Integer pageSize; + + @Column(name = "scanned_packages", nullable = false) + @Builder.Default + private Integer scannedPackages = 0; + + @Column(name = "scanned_legacy_documents", nullable = false) + @Builder.Default + private Integer scannedLegacyDocuments = 0; + + @Column(name = "finding_count", nullable = false) + @Builder.Default + private Integer findingCount = 0; + + @Column(name = "info_count", nullable = false) + @Builder.Default + private Integer infoCount = 0; + + @Column(name = "warning_count", nullable = false) + @Builder.Default + private Integer warningCount = 0; + + @Column(name = "error_count", nullable = false) + @Builder.Default + private Integer errorCount = 0; + + @Column(name = "started_at", nullable = false) + private OffsetDateTime startedAt; + + @Column(name = "completed_at") + private OffsetDateTime completedAt; + + @Column(name = "summary_text", columnDefinition = "TEXT") + private String summaryText; + + @Column(name = "failure_message", columnDefinition = "TEXT") + private String 
failureMessage; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @Builder.Default + @Column(name = "updated_at", nullable = false) + private OffsetDateTime updatedAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + if (startedAt == null) { + startedAt = OffsetDateTime.now(); + } + if (createdAt == null) { + createdAt = OffsetDateTime.now(); + } + if (updatedAt == null) { + updatedAt = OffsetDateTime.now(); + } + } + + @PreUpdate + protected void onUpdate() { + updatedAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditRunStatus.java b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditRunStatus.java new file mode 100644 index 0000000..98f3e09 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditRunStatus.java @@ -0,0 +1,7 @@ +package at.procon.dip.migration.audit.entity; + +public enum LegacyTedAuditRunStatus { + RUNNING, + COMPLETED, + FAILED +} diff --git a/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditSeverity.java b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditSeverity.java new file mode 100644 index 0000000..9984553 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/entity/LegacyTedAuditSeverity.java @@ -0,0 +1,7 @@ +package at.procon.dip.migration.audit.entity; + +public enum LegacyTedAuditSeverity { + INFO, + WARNING, + ERROR +} diff --git a/src/main/java/at/procon/dip/migration/audit/repository/LegacyTedAuditFindingRepository.java b/src/main/java/at/procon/dip/migration/audit/repository/LegacyTedAuditFindingRepository.java new file mode 100644 index 0000000..9f017d3 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/repository/LegacyTedAuditFindingRepository.java @@ -0,0 +1,8 @@ +package at.procon.dip.migration.audit.repository; + +import 
at.procon.dip.migration.audit.entity.LegacyTedAuditFinding; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +public interface LegacyTedAuditFindingRepository extends JpaRepository { +} diff --git a/src/main/java/at/procon/dip/migration/audit/repository/LegacyTedAuditRunRepository.java b/src/main/java/at/procon/dip/migration/audit/repository/LegacyTedAuditRunRepository.java new file mode 100644 index 0000000..66f5d8d --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/repository/LegacyTedAuditRunRepository.java @@ -0,0 +1,8 @@ +package at.procon.dip.migration.audit.repository; + +import at.procon.dip.migration.audit.entity.LegacyTedAuditRun; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +public interface LegacyTedAuditRunRepository extends JpaRepository { +} diff --git a/src/main/java/at/procon/dip/migration/audit/service/LegacyTedAuditService.java b/src/main/java/at/procon/dip/migration/audit/service/LegacyTedAuditService.java new file mode 100644 index 0000000..854ab3f --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/service/LegacyTedAuditService.java @@ -0,0 +1,610 @@ +package at.procon.dip.migration.audit.service; + +import at.procon.dip.migration.audit.config.LegacyTedAuditProperties; +import at.procon.dip.migration.audit.entity.LegacyTedAuditFinding; +import at.procon.dip.migration.audit.entity.LegacyTedAuditFindingType; +import at.procon.dip.migration.audit.entity.LegacyTedAuditRun; +import at.procon.dip.migration.audit.entity.LegacyTedAuditRunStatus; +import at.procon.dip.migration.audit.entity.LegacyTedAuditSeverity; +import at.procon.dip.migration.audit.repository.LegacyTedAuditFindingRepository; +import at.procon.dip.migration.audit.repository.LegacyTedAuditRunRepository; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import at.procon.ted.model.entity.ProcurementDocument; 
+import at.procon.ted.model.entity.TedDailyPackage; +import at.procon.ted.repository.ProcurementDocumentRepository; +import at.procon.ted.repository.TedDailyPackageRepository; +import java.time.OffsetDateTime; +import java.time.Year; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.data.domain.Page; +import org.springframework.data.domain.PageRequest; +import org.springframework.data.domain.Sort; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.stereotype.Service; +import org.springframework.util.StringUtils; + +@Service +@ConditionalOnRuntimeMode(RuntimeMode.NEW) +@RequiredArgsConstructor +@Slf4j +public class LegacyTedAuditService { + + private final LegacyTedAuditProperties properties; + private final TedDailyPackageRepository tedDailyPackageRepository; + private final ProcurementDocumentRepository procurementDocumentRepository; + private final LegacyTedAuditRunRepository runRepository; + private final LegacyTedAuditFindingRepository findingRepository; + private final JdbcTemplate jdbcTemplate; + + public LegacyTedAuditRun executeAudit() { + return executeAudit(properties.getStartupRunLimit()); + } + + public LegacyTedAuditRun executeAudit(int requestedLimit) { + if (!properties.isEnabled()) { + throw new IllegalStateException("Legacy TED audit is disabled by configuration"); + } + + Integer effectiveLimit = requestedLimit > 0 ? 
requestedLimit : null; + int pageSize = properties.getPageSize(); + AuditAccumulator accumulator = new AuditAccumulator(); + + LegacyTedAuditRun run = LegacyTedAuditRun.builder() + .status(LegacyTedAuditRunStatus.RUNNING) + .requestedLimit(effectiveLimit) + .pageSize(pageSize) + .startedAt(OffsetDateTime.now()) + .build(); + run = runRepository.save(run); + + try { + int scannedPackages = auditPackages(run, accumulator); + auditGlobalDuplicates(run, accumulator); + int scannedLegacyDocuments = 0;//auditLegacyDocuments(run, accumulator, effectiveLimit, pageSize); + + run.setStatus(LegacyTedAuditRunStatus.COMPLETED); + run.setCompletedAt(OffsetDateTime.now()); + run.setScannedPackages(scannedPackages); + run.setScannedLegacyDocuments(scannedLegacyDocuments); + run.setFindingCount(accumulator.totalFindings()); + run.setInfoCount(accumulator.infoCount()); + run.setWarningCount(accumulator.warningCount()); + run.setErrorCount(accumulator.errorCount()); + run.setSummaryText(buildSummary(scannedPackages, scannedLegacyDocuments, accumulator)); + run.setFailureMessage(null); + run = runRepository.save(run); + + log.info("Wave 1 / Milestone A legacy-only audit completed: runId={}, packages={}, documents={}, findings={}, warnings={}, errors={}", + run.getId(), scannedPackages, scannedLegacyDocuments, accumulator.totalFindings(), + accumulator.warningCount(), accumulator.errorCount()); + return run; + } catch (RuntimeException ex) { + run.setStatus(LegacyTedAuditRunStatus.FAILED); + run.setCompletedAt(OffsetDateTime.now()); + run.setScannedPackages(accumulator.scannedPackages()); + run.setScannedLegacyDocuments(accumulator.scannedLegacyDocuments()); + run.setFindingCount(accumulator.totalFindings()); + run.setInfoCount(accumulator.infoCount()); + run.setWarningCount(accumulator.warningCount()); + run.setErrorCount(accumulator.errorCount()); + run.setFailureMessage(ex.getMessage()); + run.setSummaryText(buildSummary(accumulator.scannedPackages(), 
accumulator.scannedLegacyDocuments(), accumulator)); + runRepository.save(run); + log.error("Wave 1 / Milestone A legacy-only audit failed: runId={}", run.getId(), ex); + throw ex; + } + } + + private int auditPackages(LegacyTedAuditRun run, AuditAccumulator accumulator) { + List packages = tedDailyPackageRepository.findAll(Sort.by(Sort.Direction.ASC, "year", "serialNumber")); + if (packages.isEmpty()) { + return 0; + } + + Map> packagesByYear = new TreeMap<>(); + for (TedDailyPackage dailyPackage : packages) { + packagesByYear.computeIfAbsent(dailyPackage.getYear(), ignored -> new ArrayList<>()).add(dailyPackage); + } + + int firstYear = packagesByYear.keySet().iterator().next(); + int currentYear = Year.now().getValue(); + + for (int year = firstYear; year <= currentYear; year++) { + List yearPackages = packagesByYear.get(year); + if (yearPackages == null || yearPackages.isEmpty()) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.WARNING, + LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP, + null, + null, + null, + null, + "year:" + year, + "No TED package rows exist for this year inside the audited interval", + "year=" + year + ", intervalStartYear=" + firstYear + ", intervalEndYear=" + currentYear); + continue; + } + + auditYearPackageSequence(run, accumulator, year, yearPackages); + + for (TedDailyPackage dailyPackage : yearPackages) { + accumulator.incrementScannedPackages(); + auditSinglePackage(run, accumulator, dailyPackage); + } + } + + return packages.size(); + } + + private void auditYearPackageSequence(LegacyTedAuditRun run, + AuditAccumulator accumulator, + int year, + List yearPackages) { + yearPackages.sort((left, right) -> Integer.compare(safeInt(left.getSerialNumber()), safeInt(right.getSerialNumber()))); + + int firstSerial = safeInt(yearPackages.getFirst().getSerialNumber()); + if (firstSerial > 1) { + recordMissingPackageRange(run, accumulator, year, 1, firstSerial - 1, + "TED package year starts after serial 1"); + } + + for (int i = 
1; i < yearPackages.size(); i++) { + int previousSerial = safeInt(yearPackages.get(i - 1).getSerialNumber()); + int currentSerial = safeInt(yearPackages.get(i).getSerialNumber()); + if (currentSerial > previousSerial + 1) { + recordMissingPackageRange(run, accumulator, year, previousSerial + 1, currentSerial - 1, + "TED package sequence gap detected"); + } + } + } + + private void recordMissingPackageRange(LegacyTedAuditRun run, + AuditAccumulator accumulator, + int year, + int startSerial, + int endSerial, + String message) { + String startPackageId = formatPackageIdentifier(year, startSerial); + String endPackageId = formatPackageIdentifier(year, endSerial); + String referenceKey = startSerial == endSerial ? startPackageId : startPackageId + "-" + endPackageId; + + recordFinding(run, accumulator, + LegacyTedAuditSeverity.WARNING, + LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP, + startSerial == endSerial ? startPackageId : null, + null, + null, + null, + referenceKey, + message, + "year=" + year + ", missingStartSerial=" + startSerial + ", missingEndSerial=" + endSerial); + } + + private void auditSinglePackage(LegacyTedAuditRun run, + AuditAccumulator accumulator, + TedDailyPackage dailyPackage) { + String packageIdentifier = dailyPackage.getPackageIdentifier(); + int processedCount = safeInt(dailyPackage.getProcessedCount()); + int failedCount = safeInt(dailyPackage.getFailedCount()); + int accountedDocuments = processedCount + failedCount; + + if (dailyPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.COMPLETED + && dailyPackage.getProcessedAt() == null) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.WARNING, + LegacyTedAuditFindingType.PACKAGE_COMPLETED_WITHOUT_PROCESSED_AT, + packageIdentifier, + null, + null, + null, + packageIdentifier, + "TED package is marked COMPLETED but processedAt is null", + null); + } + + if (dailyPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.COMPLETED + && dailyPackage.getXmlFileCount() 
== null) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.WARNING, + LegacyTedAuditFindingType.PACKAGE_MISSING_XML_FILE_COUNT, + packageIdentifier, + null, + null, + null, + packageIdentifier, + "TED package is marked COMPLETED but xmlFileCount is null", + null); + } + + if ((dailyPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.DOWNLOADED + || dailyPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.PROCESSING + || dailyPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.COMPLETED) + && !StringUtils.hasText(dailyPackage.getFileHash())) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.WARNING, + LegacyTedAuditFindingType.PACKAGE_MISSING_FILE_HASH, + packageIdentifier, + null, + null, + null, + packageIdentifier, + "TED package has no file hash recorded", + "downloadStatus=" + dailyPackage.getDownloadStatus()); + } + + if (dailyPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.FAILED + && !StringUtils.hasText(dailyPackage.getErrorMessage())) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.WARNING, + LegacyTedAuditFindingType.PACKAGE_FAILED_WITHOUT_ERROR_MESSAGE, + packageIdentifier, + null, + null, + null, + packageIdentifier, + "TED package is marked FAILED but has no error message", + null); + } + + if (dailyPackage.getXmlFileCount() != null) { + if (accountedDocuments > dailyPackage.getXmlFileCount()) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.ERROR, + LegacyTedAuditFindingType.PACKAGE_COMPLETED_COUNT_MISMATCH, + packageIdentifier, + null, + null, + null, + packageIdentifier, + "TED package accounting exceeds xmlFileCount", + "xmlFileCount=" + dailyPackage.getXmlFileCount() + + ", processedCount=" + processedCount + + ", failedCount=" + failedCount); + } else if (dailyPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.COMPLETED + && accountedDocuments < dailyPackage.getXmlFileCount()) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.WARNING, 
+ LegacyTedAuditFindingType.PACKAGE_COMPLETED_COUNT_MISMATCH, + packageIdentifier, + null, + null, + null, + packageIdentifier, + "TED package accounting is below xmlFileCount", + "xmlFileCount=" + dailyPackage.getXmlFileCount() + + ", processedCount=" + processedCount + + ", failedCount=" + failedCount); + } + } + + if (isPackageIncompleteForReimport(dailyPackage, processedCount, failedCount, accountedDocuments)) { + recordFinding(run, accumulator, + dailyPackage.getDownloadStatus() == TedDailyPackage.DownloadStatus.FAILED + ? LegacyTedAuditSeverity.ERROR + : LegacyTedAuditSeverity.WARNING, + LegacyTedAuditFindingType.PACKAGE_INCOMPLETE, + packageIdentifier, + null, + null, + null, + packageIdentifier, + "TED package is not fully imported and should be considered for re-import", + buildIncompletePackageDetails(dailyPackage, processedCount, failedCount, accountedDocuments)); + } + } + + private boolean isPackageIncompleteForReimport(TedDailyPackage dailyPackage, + int processedCount, + int failedCount, + int accountedDocuments) { + TedDailyPackage.DownloadStatus status = dailyPackage.getDownloadStatus(); + if (status == null) { + return true; + } + if (status == TedDailyPackage.DownloadStatus.NOT_FOUND) { + return false; + } + if (status == TedDailyPackage.DownloadStatus.PENDING + || status == TedDailyPackage.DownloadStatus.DOWNLOADING + || status == TedDailyPackage.DownloadStatus.DOWNLOADED + || status == TedDailyPackage.DownloadStatus.PROCESSING + || status == TedDailyPackage.DownloadStatus.FAILED) { + return true; + } + if (status != TedDailyPackage.DownloadStatus.COMPLETED) { + return true; + } + if (dailyPackage.getXmlFileCount() == null) { + return true; + } + if (failedCount > 0) { + return true; + } + return processedCount < dailyPackage.getXmlFileCount() + || accountedDocuments != dailyPackage.getXmlFileCount(); + } + + private String buildIncompletePackageDetails(TedDailyPackage dailyPackage, + int processedCount, + int failedCount, + int 
accountedDocuments) { + return "status=" + dailyPackage.getDownloadStatus() + + ", xmlFileCount=" + dailyPackage.getXmlFileCount() + + ", processedCount=" + processedCount + + ", failedCount=" + failedCount + + ", accountedDocuments=" + accountedDocuments; + } + + private void auditGlobalDuplicates(LegacyTedAuditRun run, AuditAccumulator accumulator) { + int limit = properties.getMaxDuplicateSamples(); + + jdbcTemplate.query( + """ + SELECT publication_id, COUNT(*) AS duplicate_count + FROM ted.procurement_document + WHERE publication_id IS NOT NULL AND publication_id <> '' + GROUP BY publication_id + HAVING COUNT(*) > 1 + ORDER BY duplicate_count DESC, publication_id ASC + LIMIT ? + """, + ps -> ps.setInt(1, limit), + (rs, rowNum) -> { + String publicationId = rs.getString("publication_id"); + long duplicateCount = rs.getLong("duplicate_count"); + recordFinding(run, accumulator, + LegacyTedAuditSeverity.ERROR, + LegacyTedAuditFindingType.LEGACY_PUBLICATION_ID_DUPLICATE, + null, + null, + null, + null, + publicationId, + "Legacy TED publicationId appears multiple times", + "publicationId=" + publicationId + ", duplicateCount=" + duplicateCount); + return null; + }); + } + + private int auditLegacyDocuments(LegacyTedAuditRun run, + AuditAccumulator accumulator, + Integer requestedLimit, + int pageSize) { + int processed = 0; + int pageNumber = 0; + + while (requestedLimit == null || processed < requestedLimit) { + Page page = procurementDocumentRepository.findAll( + PageRequest.of(pageNumber, pageSize, Sort.by(Sort.Direction.ASC, "createdAt", "id"))); + + if (page.isEmpty()) { + break; + } + + for (ProcurementDocument legacyDocument : page.getContent()) { + auditSingleLegacyDocument(run, accumulator, legacyDocument); + accumulator.incrementScannedLegacyDocuments(); + processed++; + if (requestedLimit != null && processed >= requestedLimit) { + return processed; + } + } + + if (!page.hasNext()) { + break; + } + pageNumber++; + } + + return processed; + } + + private 
void auditSingleLegacyDocument(LegacyTedAuditRun run, + AuditAccumulator accumulator, + ProcurementDocument legacyDocument) { + UUID legacyDocumentId = legacyDocument.getId(); + String referenceKey = buildReferenceKey(legacyDocument); + String documentHash = legacyDocument.getDocumentHash(); + + if (!StringUtils.hasText(documentHash)) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.ERROR, + LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_HASH, + null, + legacyDocumentId, + null, + null, + referenceKey, + "Legacy TED document has no documentHash", + null); + return; + } + + if (!StringUtils.hasText(legacyDocument.getXmlDocument())) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.ERROR, + LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_XML, + null, + legacyDocumentId, + null, + null, + referenceKey, + "Legacy TED document has no xmlDocument payload", + "documentHash=" + documentHash); + } + + if (!StringUtils.hasText(legacyDocument.getTextContent())) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.WARNING, + LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_TEXT, + null, + legacyDocumentId, + null, + null, + referenceKey, + "Legacy TED document has no normalized textContent", + "documentHash=" + documentHash); + } + + if (!StringUtils.hasText(legacyDocument.getPublicationId())) { + recordFinding(run, accumulator, + LegacyTedAuditSeverity.WARNING, + LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_PUBLICATION_ID, + null, + legacyDocumentId, + null, + null, + referenceKey, + "Legacy TED document has no publicationId", + "documentHash=" + documentHash); + } + } + + private void recordFinding(LegacyTedAuditRun run, + AuditAccumulator accumulator, + LegacyTedAuditSeverity severity, + LegacyTedAuditFindingType findingType, + String packageIdentifier, + UUID legacyProcurementDocumentId, + UUID genericDocumentId, + UUID tedProjectionId, + String referenceKey, + String message, + String detailsText) { + if 
(accumulator.totalFindings() >= properties.getMaxFindingsPerRun()) { + accumulator.markTruncated(); + if (!accumulator.truncationRecorded()) { + LegacyTedAuditFinding truncatedFinding = LegacyTedAuditFinding.builder() + .run(run) + .severity(LegacyTedAuditSeverity.INFO) + .findingType(LegacyTedAuditFindingType.FINDINGS_TRUNCATED) + .referenceKey(referenceKey != null ? referenceKey : "max-findings-per-run") + .message("Legacy TED audit finding limit reached; additional findings were suppressed") + .detailsText("maxFindingsPerRun=" + properties.getMaxFindingsPerRun()) + .build(); + findingRepository.save(truncatedFinding); + accumulator.recordFinding(LegacyTedAuditSeverity.INFO, true); + } + return; + } + + LegacyTedAuditFinding finding = LegacyTedAuditFinding.builder() + .run(run) + .severity(severity) + .findingType(findingType) + .packageIdentifier(packageIdentifier) + .legacyProcurementDocumentId(legacyProcurementDocumentId) + .documentId(genericDocumentId) + .tedNoticeProjectionId(tedProjectionId) + .referenceKey(referenceKey) + .message(message) + .detailsText(detailsText) + .build(); + findingRepository.save(finding); + accumulator.recordFinding(severity, false); + } + + private String buildReferenceKey(ProcurementDocument legacyDocument) { + if (StringUtils.hasText(legacyDocument.getPublicationId())) { + return legacyDocument.getPublicationId(); + } + if (StringUtils.hasText(legacyDocument.getNoticeId())) { + return legacyDocument.getNoticeId(); + } + if (StringUtils.hasText(legacyDocument.getSourceFilename())) { + return legacyDocument.getSourceFilename(); + } + return String.valueOf(legacyDocument.getId()); + } + + private int safeInt(Integer value) { + return value != null ? 
value : 0; + } + + private String formatPackageIdentifier(int year, int serialNumber) { + return "%04d%05d".formatted(year, serialNumber); + } + + private String buildSummary(int scannedPackages, + int scannedLegacyDocuments, + AuditAccumulator accumulator) { + return "packages=" + scannedPackages + + ", legacyDocuments=" + scannedLegacyDocuments + + ", findings=" + accumulator.totalFindings() + + ", warnings=" + accumulator.warningCount() + + ", errors=" + accumulator.errorCount() + + (accumulator.truncated() ? ", truncated=true" : ""); + } + + private static final class AuditAccumulator { + private int scannedPackages; + private int scannedLegacyDocuments; + private int infoCount; + private int warningCount; + private int errorCount; + private boolean truncated; + private boolean truncationRecorded; + + void incrementScannedPackages() { + scannedPackages++; + } + + void incrementScannedLegacyDocuments() { + scannedLegacyDocuments++; + } + + void recordFinding(LegacyTedAuditSeverity severity, boolean truncationFindingRecordedNow) { + switch (severity) { + case INFO -> infoCount++; + case WARNING -> warningCount++; + case ERROR -> errorCount++; + } + if (truncationFindingRecordedNow) { + truncationRecorded = true; + } + } + + void markTruncated() { + truncated = true; + } + + int totalFindings() { + return infoCount + warningCount + errorCount; + } + + int infoCount() { + return infoCount; + } + + int warningCount() { + return warningCount; + } + + int errorCount() { + return errorCount; + } + + int scannedPackages() { + return scannedPackages; + } + + int scannedLegacyDocuments() { + return scannedLegacyDocuments; + } + + boolean truncated() { + return truncated; + } + + boolean truncationRecorded() { + return truncationRecorded; + } + } +} diff --git a/src/main/java/at/procon/dip/migration/audit/startup/LegacyTedAuditStartupRunner.java b/src/main/java/at/procon/dip/migration/audit/startup/LegacyTedAuditStartupRunner.java new file mode 100644 index 
0000000..4270789 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/audit/startup/LegacyTedAuditStartupRunner.java @@ -0,0 +1,33 @@ +package at.procon.dip.migration.audit.startup; + +import at.procon.dip.migration.audit.config.LegacyTedAuditProperties; +import at.procon.dip.migration.audit.service.LegacyTedAuditService; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.stereotype.Component; + +@Component +@ConditionalOnRuntimeMode(RuntimeMode.NEW) +@RequiredArgsConstructor +@Slf4j +public class LegacyTedAuditStartupRunner implements ApplicationRunner { + + private final LegacyTedAuditProperties properties; + private final LegacyTedAuditService legacyTedAuditService; + + @Override + public void run(ApplicationArguments args) { + if (!properties.isEnabled() || !properties.isStartupRunEnabled()) { + return; + } + + int requestedLimit = properties.getStartupRunLimit(); + log.info("Wave 1 / Milestone A startup audit enabled - scanning legacy TED data with limit {}", + requestedLimit > 0 ? 
requestedLimit : "unbounded"); + legacyTedAuditService.executeAudit(requestedLimit); + } +} diff --git a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java index b624937..f7e2770 100644 --- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java +++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java @@ -34,6 +34,7 @@ public class TedProcessorProperties { private SolutionBriefProperties solutionBrief = new SolutionBriefProperties(); private ProjectionProperties projection = new ProjectionProperties(); private GenericIngestionProperties genericIngestion = new GenericIngestionProperties(); + private RepairProperties repair = new RepairProperties(); /** * Input directory configuration for Apache Camel file consumer. @@ -356,6 +357,64 @@ public class TedProcessorProperties { private boolean prioritizeCurrentYear = true; } + /** + * Legacy TED package repair / re-import configuration. + */ + @Data + public static class RepairProperties { + + /** + * Enable startup repair of incomplete or missing TED packages. + */ + private boolean enabled = false; + + /** + * If true, only logs the selected package candidates without modifying data. + */ + private boolean dryRun = false; + + /** + * Maximum number of packages to process in one startup run. + */ + @Positive + private int maxPackages = 100; + + /** + * Optional explicit package identifiers (YYYYSSSSS) to repair. + */ + private java.util.List packageIdentifiers = new java.util.ArrayList<>(); + + /** + * Optional lower bound package identifier (inclusive). + */ + private String fromPackageIdentifier; + + /** + * Optional upper bound package identifier (inclusive). + */ + private String toPackageIdentifier; + + /** + * Include missing package sequence numbers inside the selected range. + */ + private boolean includeMissingSequenceGaps = true; + + /** + * Re-download the package archive when it is missing locally. 
+ */ + private boolean redownloadMissingArchives = true; + + /** + * Always re-download the package archive even when a local archive already exists. + */ + private boolean forceRedownload = false; + + /** + * Refuse startup repair while the automatic legacy package download scheduler is enabled. + */ + private boolean allowWhileDownloadEnabled = false; + } + /** * IMAP Mail configuration for email processing. */ diff --git a/src/main/java/at/procon/ted/model/entity/Organization.java b/src/main/java/at/procon/ted/model/entity/Organization.java index d7f2652..1972916 100644 --- a/src/main/java/at/procon/ted/model/entity/Organization.java +++ b/src/main/java/at/procon/ted/model/entity/Organization.java @@ -58,7 +58,7 @@ public class Organization { @Column(name = "country_code", length = 10) private String countryCode; - @Column(name = "city", length = 255) + @Column(name = "city", columnDefinition = "TEXT") private String city; @Column(name = "postal_code", length = 255) diff --git a/src/main/java/at/procon/ted/model/entity/ProcurementDocument.java b/src/main/java/at/procon/ted/model/entity/ProcurementDocument.java index b260df9..9e56436 100644 --- a/src/main/java/at/procon/ted/model/entity/ProcurementDocument.java +++ b/src/main/java/at/procon/ted/model/entity/ProcurementDocument.java @@ -102,7 +102,7 @@ public class ProcurementDocument { @Column(name = "buyer_country_code", length = 10) private String buyerCountryCode; - @Column(name = "buyer_city", length = 255) + @Column(name = "buyer_city", columnDefinition = "TEXT") private String buyerCity; @Column(name = "buyer_postal_code", length = 100) @@ -124,7 +124,7 @@ public class ProcurementDocument { @Column(name = "project_description", columnDefinition = "TEXT") private String projectDescription; - @Column(name = "internal_reference", length = 500) + @Column(name = "internal_reference", columnDefinition = "TEXT") private String internalReference; @Enumerated(EnumType.STRING) diff --git 
a/src/main/java/at/procon/ted/repair/TedPackageRepairService.java b/src/main/java/at/procon/ted/repair/TedPackageRepairService.java new file mode 100644 index 0000000..88b4bb5 --- /dev/null +++ b/src/main/java/at/procon/ted/repair/TedPackageRepairService.java @@ -0,0 +1,446 @@ +package at.procon.ted.repair; + +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import at.procon.ted.config.TedProcessorProperties; +import at.procon.ted.model.entity.TedDailyPackage; +import at.procon.ted.repository.TedDailyPackageRepository; +import at.procon.ted.service.BatchDocumentProcessingService; +import at.procon.ted.service.TedPackageDownloadService; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.data.domain.Sort; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.OffsetDateTime; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Startup tool for repairing / re-importing incomplete legacy TED daily packages. 
+ * + * Strategy: + * - Identify incomplete package rows from {@code ted.ted_daily_package} + * - Optionally include missing sequence numbers inside a configured package range + * - Reuse existing batch XML processing so already-imported XML documents are skipped by hash, + * while missing documents are inserted during the repair run + */ +@Service +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) +@RequiredArgsConstructor +@Slf4j +public class TedPackageRepairService { + + private static final Pattern PACKAGE_IDENTIFIER_PATTERN = Pattern.compile("\\d{9}"); + private static final int PROCESSING_CHUNK_SIZE = 25; + + private final TedProcessorProperties properties; + private final TedDailyPackageRepository packageRepository; + private final TedPackageDownloadService downloadService; + private final BatchDocumentProcessingService batchProcessingService; + + public RepairSummary repairConfiguredPackages() { + TedProcessorProperties.RepairProperties repairProperties = properties.getRepair(); + List candidates = resolveCandidates(repairProperties); + + if (candidates.isEmpty()) { + log.info("TED package repair found no matching incomplete packages"); + return new RepairSummary(0, 0, 0, 0, List.of()); + } + + log.info("TED package repair selected {} package candidates (dryRun={})", candidates.size(), repairProperties.isDryRun()); + candidates.forEach(candidate -> log.info("Repair candidate: {} [{}]", candidate.packageIdentifier(), candidate.reason())); + + if (repairProperties.isDryRun()) { + return new RepairSummary(candidates.size(), 0, 0, 0, + candidates.stream().map(RepairCandidate::packageIdentifier).toList()); + } + + int succeeded = 0; + int failed = 0; + int notFound = 0; + List processed = new ArrayList<>(); + + for (RepairCandidate candidate : candidates) { + try { + RepairExecutionResult result = repairCandidate(candidate, repairProperties); + processed.add(candidate.packageIdentifier()); + switch (result.outcome()) { + case COMPLETED -> succeeded++; + case 
NOT_FOUND -> notFound++; + case FAILED -> failed++; + } + } catch (Exception e) { + failed++; + log.error("TED package repair failed for {}: {}", candidate.packageIdentifier(), e.getMessage(), e); + markExistingPackageFailure(candidate.existingPackage(), "Repair run failed: " + e.getMessage()); + } + } + + log.info("TED package repair finished: selected={}, succeeded={}, failed={}, notFound={}", + candidates.size(), succeeded, failed, notFound); + return new RepairSummary(candidates.size(), succeeded, failed, notFound, processed); + } + + List resolveCandidates(TedProcessorProperties.RepairProperties repairProperties) { + List existingPackages = packageRepository.findAll(Sort.by(Sort.Direction.ASC, "year", "serialNumber")); + Map existingByIdentifier = existingPackages.stream() + .collect(Collectors.toMap(TedDailyPackage::getPackageIdentifier, pkg -> pkg, (left, right) -> left, LinkedHashMap::new)); + + if (!repairProperties.getPackageIdentifiers().isEmpty()) { + return resolveExplicitCandidates(repairProperties.getPackageIdentifiers(), existingByIdentifier, repairProperties.getMaxPackages()); + } + + if (existingPackages.isEmpty()) { + return List.of(); + } + + List candidates = new ArrayList<>(); + Set seen = new LinkedHashSet<>(); + + boolean inspectSequenceRange = repairProperties.isIncludeMissingSequenceGaps() + || hasText(repairProperties.getFromPackageIdentifier()) + || hasText(repairProperties.getToPackageIdentifier()); + + if (!inspectSequenceRange) { + for (TedDailyPackage pkg : existingPackages) { + if (isIncomplete(pkg) && seen.add(pkg.getPackageIdentifier())) { + candidates.add(RepairCandidate.existing(pkg, repairReasonFor(pkg))); + } + } + return limitCandidates(candidates, repairProperties.getMaxPackages()); + } + + PackageCoordinates first = parseIdentifier( + hasText(repairProperties.getFromPackageIdentifier()) + ? 
repairProperties.getFromPackageIdentifier() + : existingPackages.getFirst().getPackageIdentifier()); + + PackageCoordinates last = parseIdentifier( + hasText(repairProperties.getToPackageIdentifier()) + ? repairProperties.getToPackageIdentifier() + : existingPackages.getLast().getPackageIdentifier()); + + if (first.compareTo(last) > 0) { + throw new IllegalArgumentException("Repair package range is invalid: from > to"); + } + + Map observedMaxByYear = existingPackages.stream() + .collect(Collectors.groupingBy(TedDailyPackage::getYear, + LinkedHashMap::new, + Collectors.collectingAndThen( + Collectors.maxBy(Comparator.comparingInt(TedDailyPackage::getSerialNumber)), + optional -> optional.map(TedDailyPackage::getSerialNumber).orElse(0)))); + + for (int year = first.year(); year <= last.year(); year++) { + int startSerial = year == first.year() ? first.serialNumber() : 1; + int defaultEndSerial = observedMaxByYear.getOrDefault(year, 0); + int endSerial = year == last.year() ? last.serialNumber() : defaultEndSerial; + + if (endSerial < startSerial || endSerial <= 0) { + continue; + } + + for (int serial = startSerial; serial <= endSerial; serial++) { + String packageIdentifier = formatPackageIdentifier(year, serial); + TedDailyPackage existingPackage = existingByIdentifier.get(packageIdentifier); + if (existingPackage != null) { + if (isIncomplete(existingPackage) && seen.add(packageIdentifier)) { + candidates.add(RepairCandidate.existing(existingPackage, repairReasonFor(existingPackage))); + } + } else if (repairProperties.isIncludeMissingSequenceGaps() && seen.add(packageIdentifier)) { + candidates.add(RepairCandidate.missing(year, serial, packageIdentifier, "MISSING_SEQUENCE_GAP")); + } + } + } + + return limitCandidates(candidates, repairProperties.getMaxPackages()); + } + + private List resolveExplicitCandidates(Collection packageIdentifiers, + Map existingByIdentifier, + int maxPackages) { + List candidates = new ArrayList<>(); + Set seen = new 
LinkedHashSet<>(); + + for (String rawIdentifier : packageIdentifiers) { + if (!hasText(rawIdentifier)) { + continue; + } + String normalized = rawIdentifier.trim(); + if (!seen.add(normalized)) { + continue; + } + PackageCoordinates coordinates = parseIdentifier(normalized); + TedDailyPackage existing = existingByIdentifier.get(normalized); + if (existing != null) { + candidates.add(RepairCandidate.existing(existing, repairReasonFor(existing))); + } else { + candidates.add(RepairCandidate.missing(coordinates.year(), coordinates.serialNumber(), normalized, "EXPLICIT_PACKAGE")); + } + } + + return limitCandidates(candidates, maxPackages); + } + + private List limitCandidates(List candidates, int maxPackages) { + if (candidates.size() <= maxPackages) { + return candidates; + } + return new ArrayList<>(candidates.subList(0, maxPackages)); + } + + @Transactional + RepairExecutionResult repairCandidate(RepairCandidate candidate, TedProcessorProperties.RepairProperties repairProperties) throws Exception { + TedDailyPackage packageEntity = candidate.existingPackage() != null + ? 
candidate.existingPackage() + : createMissingPackageRecord(candidate); + + String packageIdentifier = candidate.packageIdentifier(); + boolean downloadedNow = false; + long startNanos = System.nanoTime(); + + Path archivePath = packageArchivePath(packageIdentifier); + if (repairProperties.isForceRedownload() || !Files.exists(archivePath)) { + if (!repairProperties.isRedownloadMissingArchives()) { + String message = "Package archive is missing locally and re-download is disabled"; + markFailure(packageEntity, message); + return new RepairExecutionResult(RepairOutcome.FAILED, message); + } + + Path downloadedArchive = downloadService.downloadArchive(packageIdentifier); + if (downloadedArchive == null) { + packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.NOT_FOUND); + packageEntity.setErrorMessage("Package not found during repair run"); + packageRepository.save(packageEntity); + return new RepairExecutionResult(RepairOutcome.NOT_FOUND, "HTTP 404"); + } + archivePath = downloadedArchive; + downloadedNow = true; + packageEntity.setDownloadedAt(OffsetDateTime.now()); + packageEntity.setDownloadUrl(downloadService.buildDownloadUrlForPackage(packageIdentifier)); + } + + packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.PROCESSING); + packageEntity.setErrorMessage(null); + packageEntity.setProcessedCount(0); + packageEntity.setFailedCount(0); + packageEntity.setFileHash(downloadService.calculateArchiveHash(archivePath)); + packageRepository.save(packageEntity); + + List xmlFiles = downloadService.extractArchive(archivePath, packageIdentifier); + packageEntity.setXmlFileCount(xmlFiles.size()); + packageRepository.save(packageEntity); + + int totalProcessed = 0; + int totalFailed = 0; + try { + for (int i = 0; i < xmlFiles.size(); i += PROCESSING_CHUNK_SIZE) { + int end = Math.min(i + PROCESSING_CHUNK_SIZE, xmlFiles.size()); + List chunk = xmlFiles.subList(i, end); + BatchDocumentProcessingService.BatchProcessingResult result = 
batchProcessingService.processBatch(chunk); + totalProcessed += result.insertedCount() + result.duplicateCount(); + totalFailed += result.errorCount(); + + packageEntity.setProcessedCount(totalProcessed); + packageEntity.setFailedCount(totalFailed); + packageRepository.save(packageEntity); + } + } finally { + cleanupExtractedXmlFiles(xmlFiles); + if (downloadedNow && properties.getDownload().isDeleteAfterExtraction()) { + deleteQuietly(archivePath); + } + } + + packageEntity.setProcessedAt(OffsetDateTime.now()); + packageEntity.setProcessingDurationMs((System.nanoTime() - startNanos) / 1_000_000L); + packageEntity.setProcessedCount(totalProcessed); + packageEntity.setFailedCount(totalFailed); + + if (totalFailed == 0 && totalProcessed == xmlFiles.size()) { + packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.COMPLETED); + packageEntity.setErrorMessage(null); + packageRepository.save(packageEntity); + return new RepairExecutionResult(RepairOutcome.COMPLETED, "Package repaired successfully"); + } + + String failureMessage = String.format(Locale.ROOT, + "Repair incomplete: xmlFiles=%d, processed=%d, failed=%d", + xmlFiles.size(), totalProcessed, totalFailed); + markFailure(packageEntity, failureMessage); + return new RepairExecutionResult(RepairOutcome.FAILED, failureMessage); + } + + private TedDailyPackage createMissingPackageRecord(RepairCandidate candidate) { + TedDailyPackage pkg = TedDailyPackage.builder() + .packageIdentifier(candidate.packageIdentifier()) + .year(candidate.year()) + .serialNumber(candidate.serialNumber()) + .downloadUrl(downloadService.buildDownloadUrlForPackage(candidate.packageIdentifier())) + .downloadStatus(TedDailyPackage.DownloadStatus.PENDING) + .build(); + return packageRepository.save(pkg); + } + + private void markFailure(TedDailyPackage packageEntity, String message) { + packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.FAILED); + packageEntity.setErrorMessage(message); + 
packageRepository.save(packageEntity); + } + + private void markExistingPackageFailure(TedDailyPackage packageEntity, String message) { + if (packageEntity == null) { + return; + } + packageEntity.setDownloadStatus(TedDailyPackage.DownloadStatus.FAILED); + packageEntity.setErrorMessage(message); + packageRepository.save(packageEntity); + } + + private Path packageArchivePath(String packageIdentifier) { + return Paths.get(properties.getDownload().getDownloadDirectory()).resolve(packageIdentifier + ".tar.gz"); + } + + private void cleanupExtractedXmlFiles(List xmlFiles) { + if (xmlFiles.isEmpty()) { + return; + } + + Path packageDirectory = xmlFiles.getFirst().getParent(); + for (Path xmlFile : xmlFiles) { + deleteQuietly(xmlFile); + } + + if (packageDirectory != null) { + try (var stream = Files.list(packageDirectory)) { + if (stream.findAny().isEmpty()) { + deleteQuietly(packageDirectory); + } + } catch (IOException e) { + log.debug("Could not clean extracted package directory {}: {}", packageDirectory, e.getMessage()); + } + } + } + + private void deleteQuietly(Path path) { + try { + Files.deleteIfExists(path); + } catch (IOException e) { + log.debug("Could not delete {}: {}", path, e.getMessage()); + } + } + + boolean isIncomplete(TedDailyPackage pkg) { + if (pkg == null || pkg.getDownloadStatus() == null) { + return false; + } + + if (pkg.getDownloadStatus() == TedDailyPackage.DownloadStatus.NOT_FOUND) { + return false; + } + + if (pkg.getDownloadStatus() != TedDailyPackage.DownloadStatus.COMPLETED) { + return true; + } + + Integer xmlFileCount = pkg.getXmlFileCount(); + int processedCount = pkg.getProcessedCount() != null ? pkg.getProcessedCount() : 0; + int failedCount = pkg.getFailedCount() != null ? 
pkg.getFailedCount() : 0; + + if (xmlFileCount == null || xmlFileCount <= 0) { + return true; + } + if (failedCount > 0) { + return true; + } + return processedCount != xmlFileCount; + } + + private String repairReasonFor(TedDailyPackage pkg) { + if (pkg.getDownloadStatus() != TedDailyPackage.DownloadStatus.COMPLETED) { + return "STATUS_" + pkg.getDownloadStatus(); + } + if (pkg.getXmlFileCount() == null || pkg.getXmlFileCount() <= 0) { + return "MISSING_XML_COUNT"; + } + if (pkg.getFailedCount() != null && pkg.getFailedCount() > 0) { + return "FAILED_DOCUMENTS"; + } + return "COUNT_MISMATCH"; + } + + private PackageCoordinates parseIdentifier(String packageIdentifier) { + String normalized = packageIdentifier != null ? packageIdentifier.trim() : ""; + if (!PACKAGE_IDENTIFIER_PATTERN.matcher(normalized).matches()) { + throw new IllegalArgumentException("Invalid package identifier: " + packageIdentifier); + } + return new PackageCoordinates( + Integer.parseInt(normalized.substring(0, 4)), + Integer.parseInt(normalized.substring(4))); + } + + private String formatPackageIdentifier(int year, int serialNumber) { + return String.format(Locale.ROOT, "%04d%05d", year, serialNumber); + } + + private boolean hasText(String value) { + return value != null && !value.isBlank(); + } + + record PackageCoordinates(int year, int serialNumber) implements Comparable { + @Override + public int compareTo(PackageCoordinates other) { + int yearCompare = Integer.compare(this.year, other.year); + if (yearCompare != 0) { + return yearCompare; + } + return Integer.compare(this.serialNumber, other.serialNumber); + } + } + + public record RepairCandidate(int year, + int serialNumber, + String packageIdentifier, + TedDailyPackage existingPackage, + String reason) { + static RepairCandidate existing(TedDailyPackage pkg, String reason) { + return new RepairCandidate(pkg.getYear(), pkg.getSerialNumber(), pkg.getPackageIdentifier(), pkg, reason); + } + + static RepairCandidate missing(int year, 
int serialNumber, String packageIdentifier, String reason) { + return new RepairCandidate(year, serialNumber, packageIdentifier, null, reason); + } + } + + enum RepairOutcome { + COMPLETED, + FAILED, + NOT_FOUND + } + + record RepairExecutionResult(RepairOutcome outcome, String message) { + } + + public record RepairSummary(int selected, + int succeeded, + int failed, + int notFound, + List processedPackageIdentifiers) { + } +} diff --git a/src/main/java/at/procon/ted/service/TedPackageDownloadService.java b/src/main/java/at/procon/ted/service/TedPackageDownloadService.java index fafb169..b9cb14d 100644 --- a/src/main/java/at/procon/ted/service/TedPackageDownloadService.java +++ b/src/main/java/at/procon/ted/service/TedPackageDownloadService.java @@ -369,6 +369,35 @@ public class TedPackageDownloadService { } } + /** + * Builds the download URL for a TED package identifier. + */ + public String buildDownloadUrlForPackage(String packageId) { + return buildDownloadUrl(packageId); + } + + /** + * Downloads a package archive to the configured download directory. + * Returns {@code null} when the remote package does not exist (HTTP 404). + */ + public Path downloadArchive(String packageId) throws IOException { + return downloadFile(buildDownloadUrl(packageId), packageId); + } + + /** + * Calculates the SHA-256 hash for a previously downloaded TED package archive. + */ + public String calculateArchiveHash(Path archivePath) throws Exception { + return calculateSHA256(archivePath); + } + + /** + * Extracts XML files from a previously downloaded TED package archive. + */ + public List extractArchive(Path tarGzFile, String packageId) throws IOException { + return extractTarGz(tarGzFile, packageId); + } + /** * Baut die Download-URL. 
*/ diff --git a/src/main/java/at/procon/ted/startup/TedPackageRepairStartupRunner.java b/src/main/java/at/procon/ted/startup/TedPackageRepairStartupRunner.java new file mode 100644 index 0000000..aabf348 --- /dev/null +++ b/src/main/java/at/procon/ted/startup/TedPackageRepairStartupRunner.java @@ -0,0 +1,42 @@ +package at.procon.ted.startup; + +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; +import at.procon.ted.config.TedProcessorProperties; +import at.procon.ted.repair.TedPackageRepairService; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.core.annotation.Order; +import org.springframework.stereotype.Component; + +/** + * Optional startup runner that repairs / re-imports incomplete legacy TED packages. + */ +@Component +@ConditionalOnRuntimeMode(RuntimeMode.LEGACY) +@RequiredArgsConstructor +@Slf4j +@Order(50) +public class TedPackageRepairStartupRunner implements ApplicationRunner { + + private final TedProcessorProperties properties; + private final TedPackageRepairService repairService; + + @Override + public void run(ApplicationArguments args) { + if (!properties.getRepair().isEnabled()) { + return; + } + + if (properties.getDownload().isEnabled() && !properties.getRepair().isAllowWhileDownloadEnabled()) { + throw new IllegalStateException( + "ted.repair.enabled=true requires ted.download.enabled=false " + + "or ted.repair.allow-while-download-enabled=true to avoid concurrent package processing"); + } + + log.info("Starting legacy TED package repair tool..."); + repairService.repairConfiguredPackages(); + } +} diff --git a/src/main/resources/application-legacy.yml b/src/main/resources/application-legacy.yml index fd7c494..05a7529 100644 --- a/src/main/resources/application-legacy.yml +++ b/src/main/resources/application-legacy.yml 
@@ -69,7 +69,7 @@ ted: # Max consecutive 404 errors before stopping max-consecutive-404: 4 # Polling interval (milliseconds) - 2 minutes - poll-interval: 300000 + poll-interval: 120000 # Retry interval for tail NOT_FOUND packages - 6 hours not-found-retry-interval: 21600000 # Grace period after year end before a previous-year tail 404 is treated as final @@ -87,6 +87,27 @@ ted: # Prioritize current year first prioritize-current-year: false + repair: + # Enable one-off repair / re-import of incomplete TED packages on startup + enabled: false + # Only list candidate packages without modifying data + dry-run: false + # Safety cap for one startup run + max-packages: 100 + # Optional explicit package identifiers to repair + package-identifiers: [] + # Optional inclusive package range + from-package-identifier: + to-package-identifier: + # Also try to fill missing sequence numbers inside the selected range + include-missing-sequence-gaps: true + # Download missing archives when not available locally + redownload-missing-archives: true + # Always refresh the archive from TED before repairing + force-redownload: false + # Keep false so startup fails when ted.download.enabled=true; set true only to knowingly run repair alongside the download scheduler + allow-while-download-enabled: false + # IMAP Mail configuration mail: # Enable/disable mail processing diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 0066a1c..470a90c 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -8,15 +8,15 @@ server: spring: profiles: - active: legacy + active: new application: name: document-intelligence-platform datasource: - url: jdbc:postgresql://localhost:5432/RELM + url: ${DB_URL:jdbc:postgresql://localhost:5432/RELM} username: ${DB_USERNAME:postgres} - password: ${DB_PASSWORD:P54!pcd#Wi} + password: ${DB_PASSWORD} driver-class-name: org.postgresql.Driver hikari: maximum-pool-size: 5 @@ -28,7 +28,7 @@ spring: jpa: hibernate: - ddl-auto:
update + ddl-auto: validate show-sql: false open-in-view: false properties: diff --git a/src/main/resources/db/migration/V12__doc_wave1_legacy_audit.sql b/src/main/resources/db/migration/V12__doc_wave1_legacy_audit.sql new file mode 100644 index 0000000..16b8897 --- /dev/null +++ b/src/main/resources/db/migration/V12__doc_wave1_legacy_audit.sql @@ -0,0 +1,57 @@ +-- Wave 1 / Milestone A: read-only legacy audit run/finding persistence. +-- Additive tables only; no legacy business data is modified by this migration. + +CREATE TABLE IF NOT EXISTS DOC.doc_legacy_audit_run ( + id UUID PRIMARY KEY, + status VARCHAR(32) NOT NULL, + requested_limit INTEGER, + page_size INTEGER NOT NULL, + scanned_packages INTEGER NOT NULL DEFAULT 0, + scanned_legacy_documents INTEGER NOT NULL DEFAULT 0, + finding_count INTEGER NOT NULL DEFAULT 0, + info_count INTEGER NOT NULL DEFAULT 0, + warning_count INTEGER NOT NULL DEFAULT 0, + error_count INTEGER NOT NULL DEFAULT 0, + started_at TIMESTAMPTZ NOT NULL, + completed_at TIMESTAMPTZ, + summary_text TEXT, + failure_message TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_run_status + ON DOC.doc_legacy_audit_run(status); + +CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_run_started + ON DOC.doc_legacy_audit_run(started_at DESC); + +CREATE TABLE IF NOT EXISTS DOC.doc_legacy_audit_finding ( + id UUID PRIMARY KEY, + run_id UUID NOT NULL REFERENCES DOC.doc_legacy_audit_run(id) ON DELETE CASCADE, + severity VARCHAR(16) NOT NULL, + finding_type VARCHAR(64) NOT NULL, + package_identifier VARCHAR(20), + legacy_procurement_document_id UUID, + document_id UUID, + ted_notice_projection_id UUID, + reference_key VARCHAR(255), + message TEXT NOT NULL, + details_text TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_run + ON 
DOC.doc_legacy_audit_finding(run_id); + +CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_type + ON DOC.doc_legacy_audit_finding(finding_type); + +CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_severity + ON DOC.doc_legacy_audit_finding(severity); + +CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_legacy_doc + ON DOC.doc_legacy_audit_finding(legacy_procurement_document_id); + +CREATE INDEX IF NOT EXISTS idx_doc_legacy_audit_find_document + ON DOC.doc_legacy_audit_finding(document_id); diff --git a/src/test/java/at/procon/dip/migration/audit/service/LegacyTedAuditServiceTest.java b/src/test/java/at/procon/dip/migration/audit/service/LegacyTedAuditServiceTest.java new file mode 100644 index 0000000..f65ef70 --- /dev/null +++ b/src/test/java/at/procon/dip/migration/audit/service/LegacyTedAuditServiceTest.java @@ -0,0 +1,241 @@ +package at.procon.dip.migration.audit.service; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.when; + +import at.procon.dip.migration.audit.config.LegacyTedAuditProperties; +import at.procon.dip.migration.audit.entity.LegacyTedAuditFinding; +import at.procon.dip.migration.audit.entity.LegacyTedAuditFindingType; +import at.procon.dip.migration.audit.entity.LegacyTedAuditRun; +import at.procon.dip.migration.audit.entity.LegacyTedAuditRunStatus; +import at.procon.dip.migration.audit.repository.LegacyTedAuditFindingRepository; +import at.procon.dip.migration.audit.repository.LegacyTedAuditRunRepository; +import at.procon.ted.model.entity.NoticeType; +import at.procon.ted.model.entity.ProcurementDocument; +import at.procon.ted.model.entity.TedDailyPackage; +import at.procon.ted.repository.ProcurementDocumentRepository; +import at.procon.ted.repository.TedDailyPackageRepository; +import java.time.OffsetDateTime; +import java.time.Year; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; +import 
org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.springframework.data.domain.Page; +import org.springframework.data.domain.PageImpl; +import org.springframework.jdbc.core.JdbcTemplate; + +@ExtendWith(MockitoExtension.class) +class LegacyTedAuditServiceTest { + + @Mock + private TedDailyPackageRepository tedDailyPackageRepository; + @Mock + private ProcurementDocumentRepository procurementDocumentRepository; + @Mock + private LegacyTedAuditRunRepository runRepository; + @Mock + private LegacyTedAuditFindingRepository findingRepository; + @Mock + private JdbcTemplate jdbcTemplate; + + private LegacyTedAuditService service; + private List<LegacyTedAuditFinding> persistedFindings; + + @BeforeEach + void setUp() { + LegacyTedAuditProperties properties = new LegacyTedAuditProperties(); + properties.setEnabled(true); + properties.setPageSize(50); + properties.setMaxFindingsPerRun(100); + properties.setMaxDuplicateSamples(10); + + service = new LegacyTedAuditService( + properties, + tedDailyPackageRepository, + procurementDocumentRepository, + runRepository, + findingRepository, + jdbcTemplate + ); + + persistedFindings = new ArrayList<>(); + + when(runRepository.save(any(LegacyTedAuditRun.class))).thenAnswer(invocation -> { + LegacyTedAuditRun run = invocation.getArgument(0); + if (run.getId() == null) { + run.setId(UUID.randomUUID()); + } + return run; + }); + + when(findingRepository.save(any(LegacyTedAuditFinding.class))).thenAnswer(invocation -> { + LegacyTedAuditFinding finding = invocation.getArgument(0); + if (finding.getId() == null) { + finding.setId(UUID.randomUUID()); + } + persistedFindings.add(finding); + return finding; + }); + + when(procurementDocumentRepository.findAll(any(org.springframework.data.domain.Pageable.class))) + .thenReturn(new PageImpl<>(List.of())); + } + + @Test + void
executeAudit_should_record_package_sequence_gaps_and_incomplete_packages() { + int currentYear = Year.now().getValue(); + + when(tedDailyPackageRepository.findAll(any(org.springframework.data.domain.Sort.class))).thenReturn(List.of( + TedDailyPackage.builder() + .packageIdentifier(formatPackageIdentifier(currentYear, 1)) + .year(currentYear) + .serialNumber(1) + .downloadStatus(TedDailyPackage.DownloadStatus.COMPLETED) + .xmlFileCount(10) + .processedCount(10) + .failedCount(0) + .fileHash("hash-1") + .processedAt(OffsetDateTime.now()) + .build(), + TedDailyPackage.builder() + .packageIdentifier(formatPackageIdentifier(currentYear, 3)) + .year(currentYear) + .serialNumber(3) + .downloadStatus(TedDailyPackage.DownloadStatus.COMPLETED) + .xmlFileCount(10) + .processedCount(9) + .failedCount(1) + .fileHash("hash-3") + .processedAt(OffsetDateTime.now()) + .build(), + TedDailyPackage.builder() + .packageIdentifier(formatPackageIdentifier(currentYear, 4)) + .year(currentYear) + .serialNumber(4) + .downloadStatus(TedDailyPackage.DownloadStatus.FAILED) + .xmlFileCount(12) + .processedCount(0) + .failedCount(0) + .errorMessage("processing failed") + .build() + )); + + LegacyTedAuditRun run = service.executeAudit(0); + + assertThat(run.getStatus()).isEqualTo(LegacyTedAuditRunStatus.COMPLETED); + assertThat(run.getScannedPackages()).isEqualTo(3); + assertThat(persistedFindings) + .extracting(LegacyTedAuditFinding::getFindingType) + .contains(LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP, + LegacyTedAuditFindingType.PACKAGE_INCOMPLETE); + + assertThat(persistedFindings) + .filteredOn(f -> f.getFindingType() == LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP) + .extracting(LegacyTedAuditFinding::getReferenceKey) + .contains(formatPackageIdentifier(currentYear, 2)); + + assertThat(persistedFindings) + .filteredOn(f -> f.getFindingType() == LegacyTedAuditFindingType.PACKAGE_INCOMPLETE) + .extracting(LegacyTedAuditFinding::getPackageIdentifier) + 
.contains(formatPackageIdentifier(currentYear, 3), formatPackageIdentifier(currentYear, 4)); + } + + @Test + void executeAudit_should_record_missing_years_inside_audited_interval() { + int currentYear = Year.now().getValue(); + + when(tedDailyPackageRepository.findAll(any(org.springframework.data.domain.Sort.class))).thenReturn(List.of( + TedDailyPackage.builder() + .packageIdentifier(formatPackageIdentifier(currentYear - 2, 1)) + .year(currentYear - 2) + .serialNumber(1) + .downloadStatus(TedDailyPackage.DownloadStatus.COMPLETED) + .xmlFileCount(1) + .processedCount(1) + .failedCount(0) + .fileHash("hash-a") + .processedAt(OffsetDateTime.now()) + .build(), + TedDailyPackage.builder() + .packageIdentifier(formatPackageIdentifier(currentYear, 1)) + .year(currentYear) + .serialNumber(1) + .downloadStatus(TedDailyPackage.DownloadStatus.COMPLETED) + .xmlFileCount(1) + .processedCount(1) + .failedCount(0) + .fileHash("hash-b") + .processedAt(OffsetDateTime.now()) + .build() + )); + + LegacyTedAuditRun run = service.executeAudit(0); + + assertThat(run.getStatus()).isEqualTo(LegacyTedAuditRunStatus.COMPLETED); + assertThat(persistedFindings) + .filteredOn(f -> f.getFindingType() == LegacyTedAuditFindingType.PACKAGE_SEQUENCE_GAP) + .extracting(LegacyTedAuditFinding::getReferenceKey) + .contains("year:" + (currentYear - 1)); + } + + @Test + void executeAudit_should_record_legacy_document_integrity_findings_only() { + ProcurementDocument missingXml = ProcurementDocument.builder() + .id(UUID.randomUUID()) + .documentHash("hash-1") + .publicationId("2025/S 001-000001") + .noticeType(NoticeType.CONTRACT_NOTICE) + .xmlDocument(null) + .textContent("hello") + .build(); + + ProcurementDocument missingTextAndPublicationId = ProcurementDocument.builder() + .id(UUID.randomUUID()) + .documentHash("hash-2") + .publicationId(null) + .noticeType(NoticeType.CONTRACT_NOTICE) + .xmlDocument("") + .textContent(null) + .build(); + + 
when(tedDailyPackageRepository.findAll(any(org.springframework.data.domain.Sort.class))).thenReturn(List.of()); + when(procurementDocumentRepository.findAll(any(org.springframework.data.domain.Pageable.class))) + .thenReturn(pageOf(missingXml, missingTextAndPublicationId)); + + LegacyTedAuditRun run = service.executeAudit(10); + + assertThat(run.getStatus()).isEqualTo(LegacyTedAuditRunStatus.COMPLETED); + assertThat(run.getScannedLegacyDocuments()).isEqualTo(2); + assertThat(persistedFindings) + .extracting(LegacyTedAuditFinding::getFindingType) + .contains( + LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_XML, + LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_TEXT, + LegacyTedAuditFindingType.LEGACY_DOCUMENT_MISSING_PUBLICATION_ID + ) + .doesNotContain( + LegacyTedAuditFindingType.DOC_DOCUMENT_MISSING, + LegacyTedAuditFindingType.DOC_SOURCE_MISSING, + LegacyTedAuditFindingType.DOC_ORIGINAL_CONTENT_MISSING, + LegacyTedAuditFindingType.DOC_PRIMARY_REPRESENTATION_MISSING, + LegacyTedAuditFindingType.TED_PROJECTION_MISSING, + LegacyTedAuditFindingType.TED_PROJECTION_MISSING_LEGACY_LINK, + LegacyTedAuditFindingType.TED_PROJECTION_DOCUMENT_MISMATCH, + LegacyTedAuditFindingType.DOC_DEDUP_HASH_DUPLICATE + ); + } + + private Page<ProcurementDocument> pageOf(ProcurementDocument...
documents) { + return new PageImpl<>(List.of(documents)); + } + + private String formatPackageIdentifier(int year, int serialNumber) { + return "%04d%05d".formatted(year, serialNumber); + } +} diff --git a/src/test/java/at/procon/ted/repair/TedPackageRepairServiceTest.java b/src/test/java/at/procon/ted/repair/TedPackageRepairServiceTest.java new file mode 100644 index 0000000..433f453 --- /dev/null +++ b/src/test/java/at/procon/ted/repair/TedPackageRepairServiceTest.java @@ -0,0 +1,120 @@ +package at.procon.ted.repair; + +import at.procon.ted.config.TedProcessorProperties; +import at.procon.ted.model.entity.TedDailyPackage; +import at.procon.ted.repository.TedDailyPackageRepository; +import at.procon.ted.service.BatchDocumentProcessingService; +import at.procon.ted.service.TedPackageDownloadService; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.springframework.data.domain.Sort; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.OffsetDateTime; +import java.util.List; +import java.util.Optional; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +class TedPackageRepairServiceTest { + + @TempDir + Path tempDir; + + @Test + void resolveCandidatesIncludesIncompletePackagesAndMissingSequenceGaps() { + TedProcessorProperties properties = new TedProcessorProperties(); + properties.getRepair().setEnabled(true); + properties.getRepair().setFromPackageIdentifier("202600001"); + properties.getRepair().setToPackageIdentifier("202600003"); + properties.getRepair().setIncludeMissingSequenceGaps(true); + properties.getRepair().setMaxPackages(10); + + TedDailyPackageRepository repository = mock(TedDailyPackageRepository.class); + TedDailyPackage pkg1 = newPackage("202600001", 2026, 1, 
TedDailyPackage.DownloadStatus.COMPLETED, 20, 20, 0); + TedDailyPackage pkg3 = newPackage("202600003", 2026, 3, TedDailyPackage.DownloadStatus.PROCESSING, 20, 5, 0); + when(repository.findAll(any(Sort.class))).thenReturn(List.of(pkg1, pkg3)); + + TedPackageRepairService service = new TedPackageRepairService( + properties, + repository, + mock(TedPackageDownloadService.class), + mock(BatchDocumentProcessingService.class)); + + List<TedPackageRepairService.RepairCandidate> candidates = service.resolveCandidates(properties.getRepair()); + + assertThat(candidates).extracting(TedPackageRepairService.RepairCandidate::packageIdentifier) + .containsExactly("202600002", "202600003"); + assertThat(candidates).extracting(TedPackageRepairService.RepairCandidate::reason) + .containsExactly("MISSING_SEQUENCE_GAP", "STATUS_PROCESSING"); + } + + @Test + void repairCandidateProcessesExistingArchiveAndMarksPackageCompleted() throws Exception { + TedProcessorProperties properties = new TedProcessorProperties(); + properties.getRepair().setEnabled(true); + properties.getRepair().setRedownloadMissingArchives(false); + properties.getDownload().setDownloadDirectory(tempDir.toString()); + properties.getDownload().setDeleteAfterExtraction(false); + + Path archive = tempDir.resolve("202600003.tar.gz"); + Files.writeString(archive, "dummy"); + + TedDailyPackageRepository repository = mock(TedDailyPackageRepository.class); + TedDailyPackage pkg = newPackage("202600003", 2026, 3, TedDailyPackage.DownloadStatus.PROCESSING, 3, 0, 0); + when(repository.save(any(TedDailyPackage.class))).thenAnswer(invocation -> invocation.getArgument(0)); + when(repository.findByPackageIdentifier("202600003")).thenReturn(Optional.of(pkg)); + + TedPackageDownloadService downloadService = mock(TedPackageDownloadService.class); + Path extractedDir = Files.createDirectory(tempDir.resolve("extracted")); + Path xml1 = Files.writeString(extractedDir.resolve("a.xml"), ""); + Path xml2 = Files.writeString(extractedDir.resolve("b.xml"), ""); + Path xml3 =
Files.writeString(extractedDir.resolve("c.xml"), ""); + when(downloadService.calculateArchiveHash(eq(archive))).thenReturn("hash-1"); + when(downloadService.extractArchive(eq(archive), eq("202600003"))).thenReturn(List.of(xml1, xml2, xml3)); + + BatchDocumentProcessingService batchService = mock(BatchDocumentProcessingService.class); + when(batchService.processBatch(any())).thenReturn(new BatchDocumentProcessingService.BatchProcessingResult( + 1, 2, 0, 5L, List.of(UUID.randomUUID()), List.of())); + + TedPackageRepairService service = new TedPackageRepairService(properties, repository, downloadService, batchService); + TedPackageRepairService.RepairCandidate candidate = TedPackageRepairService.RepairCandidate.existing(pkg, "STATUS_PROCESSING"); + + var result = service.repairCandidate(candidate, properties.getRepair()); + + assertThat(result.outcome()).isEqualTo(TedPackageRepairService.RepairOutcome.COMPLETED); + assertThat(pkg.getDownloadStatus()).isEqualTo(TedDailyPackage.DownloadStatus.COMPLETED); + assertThat(pkg.getProcessedCount()).isEqualTo(3); + assertThat(pkg.getFailedCount()).isZero(); + assertThat(pkg.getFileHash()).isEqualTo("hash-1"); + assertThat(pkg.getProcessedAt()).isNotNull(); + } + + private TedDailyPackage newPackage(String packageIdentifier, + int year, + int serial, + TedDailyPackage.DownloadStatus status, + Integer xmlCount, + Integer processed, + Integer failed) { + TedDailyPackage pkg = new TedDailyPackage(); + pkg.setId(UUID.randomUUID()); + pkg.setPackageIdentifier(packageIdentifier); + pkg.setYear(year); + pkg.setSerialNumber(serial); + pkg.setDownloadStatus(status); + pkg.setXmlFileCount(xmlCount); + pkg.setProcessedCount(processed); + pkg.setFailedCount(failed); + pkg.setDownloadUrl("https://ted.europa.eu/packages/daily/" + packageIdentifier); + pkg.setCreatedAt(OffsetDateTime.now()); + pkg.setUpdatedAt(OffsetDateTime.now()); + return pkg; + } +}