DIP/src/main/java/at/procon/ted/service/XmlParserService.java

950 lines
42 KiB
Java

package at.procon.ted.service;
import at.procon.ted.model.entity.*;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.w3c.dom.*;
import org.xml.sax.InputSource;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.*;
import java.io.StringReader;
import java.time.LocalDate;
import java.time.LocalTime;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeParseException;
import java.util.*;
/**
* Service for parsing EU eForms XML documents.
* Extracts structured data from TED procurement notices.
*
* Uses XPath for navigation through the UBL 2.3 document structure with eForms extensions.
*
* @author Martin.Schweitzer@procon.co.at and claude.ai
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class XmlParserService {
// Namespace URIs for eForms/UBL documents.
// They are bound to the XPath prefixes cn/cac/cbc/efac/efbc/efext/ext in createNamespaceContext();
// NS_CAC and NS_CBC are additionally used for the namespace-aware direct-child DOM lookups.
private static final String NS_CN = "urn:oasis:names:specification:ubl:schema:xsd:ContractNotice-2";
private static final String NS_CAC = "urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2";
private static final String NS_CBC = "urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2";
private static final String NS_EFAC = "http://data.europa.eu/p27/eforms-ubl-extension-aggregate-components/1";
private static final String NS_EFBC = "http://data.europa.eu/p27/eforms-ubl-extension-basic-components/1";
private static final String NS_EFEXT = "http://data.europa.eu/p27/eforms-ubl-extensions/1";
private static final String NS_EXT = "urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2";
// Factories are created once in the no-arg constructor. newDocumentBuilder() and newXPath()
// synchronize on them and hand out a fresh DocumentBuilder / XPath per call.
// NOTE(review): both fields are final and uninitialized, so the class-level @RequiredArgsConstructor
// presumably generates a second, two-argument constructor next to the explicit one - confirm intended.
private final DocumentBuilderFactory documentBuilderFactory;
private final XPathFactory xPathFactory;
/**
 * Creates the service with a namespace-aware DOM factory and a default XPath factory.
 *
 * <p>The DOM factory is hardened against XML External Entity (XXE) attacks because the
 * parsed documents are supplied from outside: secure processing is enabled, and external
 * general entities, external parameter entities and external DTD loading are switched off.
 * Documents that merely contain a DOCTYPE are still accepted.
 *
 * @throws IllegalStateException if the XML parser implementation rejects a hardening feature
 */
public XmlParserService() {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    try {
        // Enables the implementation's processing limits (e.g. entity expansion limits).
        factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        // Never resolve entities or DTDs that point outside the document itself.
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    } catch (javax.xml.parsers.ParserConfigurationException e) {
        // Fail closed: running with an unhardened parser would silently re-open the XXE hole.
        throw new IllegalStateException("XML parser does not support the required XXE hardening features", e);
    }
    this.documentBuilderFactory = factory;
    this.xPathFactory = XPathFactory.newInstance();
}
/**
 * Parses an eForms XML notice and maps it onto a {@link ProcurementDocument}.
 *
 * <p>The raw XML is stored on the entity, the individual sections are filled by the
 * dedicated {@code parse*} steps in a fixed order, and finally the text used for
 * vectorization is derived from the populated entity.
 *
 * @param xmlContent The XML content as string
 * @return Populated ProcurementDocument entity (without ID or hash)
 * @throws XmlParsingException if anything goes wrong while parsing or mapping
 */
public ProcurementDocument parseDocument(String xmlContent) {
    try {
        final Document dom = newDocumentBuilder().parse(new InputSource(new StringReader(xmlContent)));
        final XPath xp = newXPath();
        final ProcurementDocument result = ProcurementDocument.builder().xmlDocument(xmlContent).build();

        parseNoticeMetadata(dom, xp, result);     // basic notice information
        parseContractingParty(dom, xp, result);   // contracting party (buyer)
        parseProcurementProject(dom, xp, result); // procurement project
        parseTenderingProcess(dom, xp, result);   // tendering process
        parseOrganizations(dom, xp, result);      // organizations from extensions
        parseLots(dom, xp, result);               // lots
        parsePublication(dom, xp, result);        // publication information

        // Text representation for vectorization is built last, from the populated entity.
        final var vectorText = generateTextContent(result);
        result.setTextContent(vectorText);
        return result;
    } catch (Exception failure) {
        log.error("Error parsing XML document: {}", failure.getMessage(), failure);
        throw new XmlParsingException("Failed to parse XML document", failure);
    }
}
/**
 * Reads the notice-level metadata (versions, identifiers, issue date/time, language,
 * notice type, regulatory domain and notice subtype) into the entity.
 *
 * <p>Root-level fields are addressed through the document element wildcard instead of a
 * fixed {@code ContractNotice} element name. With the fixed name, every field here stayed
 * {@code null} for documents with another root element, which also made the
 * prior-information and award branches of {@code mapNoticeType} unreachable. For
 * {@code ContractNotice} documents the result is unchanged.
 *
 * @param doc parsed notice
 * @param xpath XPath instance with the eForms namespace bindings
 * @param document entity to populate
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 */
private void parseNoticeMetadata(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
    // UBL Version
    document.setUblVersion(getTextContent(xpath, doc, "/*/cbc:UBLVersionID"));
    // SDK Version (customization ID)
    document.setSdkVersion(getTextContent(xpath, doc, "/*/cbc:CustomizationID"));
    // Notice ID
    document.setNoticeId(getTextContent(xpath, doc, "/*/cbc:ID"));
    // Contract Folder ID
    document.setContractFolderId(getTextContent(xpath, doc, "/*/cbc:ContractFolderID"));

    // Issue Date and Time - combined into single OffsetDateTime
    String issueDateStr = getTextContent(xpath, doc, "/*/cbc:IssueDate");
    String issueTimeStr = getTextContent(xpath, doc, "/*/cbc:IssueTime");
    if (issueDateStr != null) {
        document.setIssueDateTime(parseDateTime(issueDateStr, issueTimeStr));
    }

    // Notice Language
    document.setLanguageCode(getTextContent(xpath, doc, "/*/cbc:NoticeLanguageCode"));

    // Notice Type Code
    String noticeTypeCode = getTextContent(xpath, doc, "/*/cbc:NoticeTypeCode");
    document.setNoticeType(mapNoticeType(noticeTypeCode));

    // Regulatory Domain
    document.setRegulatoryDomain(getTextContent(xpath, doc, "/*/cbc:RegulatoryDomain"));

    // Notice Subtype from extensions
    document.setNoticeSubtypeCode(getTextContent(xpath, doc,
            "//efext:EformsExtension/efac:NoticeSubType/cbc:SubTypeCode"));
}
/**
 * Reads the buyer's activity type and legal type from the first
 * {@code cac:ContractingParty} element.
 *
 * <p>Buyer name and address are not set here; they are filled in from the organization
 * list by {@code parseOrganizations}. The previously evaluated but never used
 * organization-reference lookup has been removed, as it cost a document-wide XPath
 * evaluation without any effect.
 *
 * @param doc parsed notice
 * @param xpath XPath instance with the eForms namespace bindings
 * @param document entity to populate
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 */
private void parseContractingParty(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
    // Activity type
    document.setBuyerActivityType(getTextContent(xpath, doc,
            "//cac:ContractingParty/cac:ContractingActivity/cbc:ActivityTypeCode"));
    // Legal type
    document.setBuyerLegalType(getTextContent(xpath, doc,
            "//cac:ContractingParty/cac:ContractingPartyType/cbc:PartyTypeCode"));
}
/**
 * Reads the notice-level procurement project: title, description, internal reference,
 * contract nature, CPV codes and the place of performance.
 *
 * <p>The project is addressed as a direct child of the document element, whatever that
 * element is called. The former fixed {@code ContractNotice} root name left all of these
 * fields empty for documents with another root element; results for
 * {@code ContractNotice} documents are unchanged.
 *
 * <p>Note that the "buyer" country/NUTS/city fields are taken from the project's
 * realized location here; {@code parseOrganizations} only fills them when they are
 * still {@code null} afterwards.
 *
 * @param doc parsed notice
 * @param xpath XPath instance with the eForms namespace bindings
 * @param document entity to populate
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 */
private void parseProcurementProject(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
    final String project = "/*/cac:ProcurementProject";
    final String address = project + "/cac:RealizedLocation/cac:Address";

    // Project title
    document.setProjectTitle(getTextContent(xpath, doc, project + "/cbc:Name"));
    // Project description
    document.setProjectDescription(getTextContent(xpath, doc, project + "/cbc:Description"));
    // Internal reference
    document.setInternalReference(getTextContent(xpath, doc, project + "/cbc:ID"));

    // Contract nature
    String contractNature = getTextContent(xpath, doc, project + "/cbc:ProcurementTypeCode");
    document.setContractNature(mapContractNature(contractNature));

    // CPV codes: main classification first, then the additional ones
    List<String> cpvCodes = getTextContents(xpath, doc,
            project + "/cac:MainCommodityClassification/cbc:ItemClassificationCode");
    cpvCodes.addAll(getTextContents(xpath, doc,
            project + "/cac:AdditionalCommodityClassification/cbc:ItemClassificationCode"));
    document.setCpvCodes(cpvCodes.toArray(new String[0]));

    // Location - country and NUTS codes
    document.setBuyerCountryCode(getTextContent(xpath, doc, address + "/cac:Country/cbc:IdentificationCode"));
    document.setBuyerNutsCode(getTextContent(xpath, doc, address + "/cbc:CountrySubentityCode"));
    document.setBuyerCity(getTextContent(xpath, doc, address + "/cbc:CityName"));

    // All NUTS codes from project and lots, de-duplicated in document order
    List<String> nutsCodes = getTextContents(xpath, doc,
            "//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode");
    document.setNutsCodes(nutsCodes.stream().distinct().toArray(String[]::new));
}
/**
 * Reads the procedure type and the lot distribution limits (maximum lots awarded /
 * submitted) from the notice level.
 *
 * <p>Two defects of the earlier version are addressed:
 * <ul>
 *   <li>An empty or non-numeric lot limit no longer aborts parsing of the whole document;
 *       the value is skipped with a warning, mirroring how lot durations are handled.</li>
 *   <li>Elements are addressed below the document element regardless of its name instead
 *       of only below {@code ContractNotice}.</li>
 * </ul>
 *
 * @param doc parsed notice
 * @param xpath XPath instance with the eForms namespace bindings
 * @param document entity to populate
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 */
private void parseTenderingProcess(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
    // Procedure type
    String procedureCode = getTextContent(xpath, doc, "/*/cac:TenderingProcess/cbc:ProcedureCode");
    document.setProcedureType(mapProcedureType(procedureCode));

    // Lot distribution
    Integer maxLotsAwarded = parseOptionalInt(
            getTextContent(xpath, doc, "/*/cac:TenderingTerms/cac:LotDistribution/cbc:MaximumLotsAwardedNumeric"),
            "MaximumLotsAwardedNumeric");
    if (maxLotsAwarded != null) {
        document.setMaxLotsAwarded(maxLotsAwarded);
    }
    Integer maxLotsSubmitted = parseOptionalInt(
            getTextContent(xpath, doc, "/*/cac:TenderingTerms/cac:LotDistribution/cbc:MaximumLotsSubmittedNumeric"),
            "MaximumLotsSubmittedNumeric");
    if (maxLotsSubmitted != null) {
        document.setMaxLotsSubmitted(maxLotsSubmitted);
    }
}

/** Parses an optional integer element value; returns {@code null} for absent, blank or invalid input. */
private Integer parseOptionalInt(String rawValue, String fieldName) {
    if (rawValue == null || rawValue.isBlank()) {
        return null;
    }
    try {
        return Integer.valueOf(rawValue.trim());
    } catch (NumberFormatException e) {
        log.warn("Invalid integer value '{}' for {}, skipping", rawValue, fieldName);
        return null;
    }
}
/**
 * Reads all {@code efac:Organization} entries, adds them to the document and copies the
 * buyer's name and address onto the document.
 *
 * <p>The buyer is the organization whose reference equals the ID given in
 * {@code cac:ContractingParty/cac:Party/cac:PartyIdentification}. Earlier the buyer was
 * hard-wired to the reference {@code ORG-0001} and the contracting-party reference was
 * ignored, so notices whose buyer carries another ID got the wrong buyer or none.
 * {@code ORG-0001} is kept as the fallback when the notice has no contracting-party
 * reference or the reference matches no organization.
 *
 * <p>Country, city and NUTS code are only taken from the organization when they have not
 * already been set; name and postal code are always taken from it.
 *
 * @param doc parsed notice
 * @param xpath XPath instance with the eForms namespace bindings
 * @param document entity to populate
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 */
private void parseOrganizations(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
    NodeList orgNodes = (NodeList) xpath.evaluate(
            "//efac:Organizations/efac:Organization", doc, XPathConstants.NODESET);

    // Reference of the organization acting as buyer (first contracting party), if declared.
    String buyerReference = trimToNull(getTextContent(xpath, doc,
            "//cac:ContractingParty/cac:Party/cac:PartyIdentification/cbc:ID"));

    Organization referencedBuyer = null;
    Organization defaultBuyer = null;

    for (int i = 0; i < orgNodes.getLength(); i++) {
        Node orgNode = orgNodes.item(i);
        Organization org = Organization.builder().build();

        // Organization reference
        org.setOrgReference(getTextContent(xpath, orgNode, ".//cac:PartyIdentification/cbc:ID"));
        // Name (never null on the entity)
        org.setName(getTextContent(xpath, orgNode, ".//cac:PartyName/cbc:Name"));
        if (org.getName() == null) {
            org.setName("");
        }
        // Company ID
        org.setCompanyId(getTextContent(xpath, orgNode, ".//cac:PartyLegalEntity/cbc:CompanyID"));
        // Address
        org.setStreetName(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:StreetName"));
        org.setCity(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:CityName"));
        org.setPostalCode(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:PostalZone"));
        org.setNutsCode(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:CountrySubentityCode"));
        org.setCountryCode(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cac:Country/cbc:IdentificationCode"));
        // Contact
        org.setWebsiteUri(getTextContent(xpath, orgNode, ".//cbc:WebsiteURI"));
        org.setEmail(getTextContent(xpath, orgNode, ".//cac:Contact/cbc:ElectronicMail"));
        org.setPhone(getTextContent(xpath, orgNode, ".//cac:Contact/cbc:Telephone"));

        document.addOrganization(org);

        // Remember the first match of each kind; the decision is taken after the loop.
        String reference = org.getOrgReference();
        if (referencedBuyer == null && buyerReference != null && buyerReference.equals(reference)) {
            referencedBuyer = org;
        }
        if (defaultBuyer == null && "ORG-0001".equals(reference)) {
            defaultBuyer = org;
        }
    }

    Organization buyer = referencedBuyer != null ? referencedBuyer : defaultBuyer;
    if (buyer == null) {
        return;
    }
    document.setBuyerName(buyer.getName());
    if (document.getBuyerCountryCode() == null) {
        document.setBuyerCountryCode(buyer.getCountryCode());
    }
    if (document.getBuyerCity() == null) {
        document.setBuyerCity(buyer.getCity());
    }
    document.setBuyerPostalCode(buyer.getPostalCode());
    if (document.getBuyerNutsCode() == null) {
        document.setBuyerNutsCode(buyer.getNutsCode());
    }
}
/**
 * Creates a fresh namespace-aware {@link DocumentBuilder}.
 *
 * <p>The shared factory is only touched while holding its monitor; the returned builder
 * is not shared.
 *
 * @return a new builder for a single parse
 * @throws Exception if the factory cannot create a builder
 */
private DocumentBuilder newDocumentBuilder() throws Exception {
    final DocumentBuilder created;
    synchronized (documentBuilderFactory) {
        documentBuilderFactory.setNamespaceAware(true);
        created = documentBuilderFactory.newDocumentBuilder();
    }
    return created;
}
/**
 * Creates a fresh {@link XPath} whose prefixes are bound through
 * {@code createNamespaceContext()}.
 *
 * <p>Creation and configuration happen while holding the shared factory's monitor.
 *
 * @return a new, namespace-configured XPath instance
 */
private XPath newXPath() {
    final XPath created;
    synchronized (xPathFactory) {
        created = xPathFactory.newXPath();
        created.setNamespaceContext(createNamespaceContext());
    }
    return created;
}
/**
 * Evaluates the expression relative to {@code item} and returns the trimmed text of the
 * first matching node.
 *
 * @return the trimmed text (possibly an empty string), or {@code null} when nothing matches
 * @throws XPathExpressionException if the expression cannot be evaluated
 */
private String getTextContent(XPath xpath, Object item, String expression) throws XPathExpressionException {
    final Node match = (Node) xpath.evaluate(expression, item, XPathConstants.NODE);
    if (match == null) {
        return null;
    }
    return match.getTextContent().trim();
}
/**
 * Evaluates the expression relative to {@code item} and returns the first matching node.
 *
 * @return the matching node, or {@code null} when nothing matches
 * @throws XPathExpressionException if the expression cannot be evaluated
 */
private Node getNode(XPath xpath, Object item, String expression) throws XPathExpressionException {
    final Object match = xpath.evaluate(expression, item, XPathConstants.NODE);
    return (Node) match;
}
/**
 * Evaluates the expression relative to {@code item} and returns all matching nodes.
 *
 * @return the node set produced by the expression
 * @throws XPathExpressionException if the expression cannot be evaluated
 */
private NodeList getNodes(XPath xpath, Object item, String expression) throws XPathExpressionException {
    final Object matches = xpath.evaluate(expression, item, XPathConstants.NODESET);
    return (NodeList) matches;
}
/**
 * Finds the first direct child element of {@code parent} with the given namespace URI
 * and local name. Descendants below the first level are not considered.
 *
 * @return the first matching child element, or {@code null} if there is none
 */
private Element getDirectChild(Element parent, String namespaceUri, String localName) {
    for (Node current = parent.getFirstChild(); current != null; current = current.getNextSibling()) {
        if (current.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        final Element candidate = (Element) current;
        if (localName.equals(candidate.getLocalName()) && namespaceUri.equals(candidate.getNamespaceURI())) {
            return candidate;
        }
    }
    return null;
}
/**
 * Collects, in document order, every direct child element of {@code parent} with the
 * given namespace URI and local name. Descendants below the first level are not considered.
 *
 * @return a new mutable list of the matching children; empty when there are none
 */
private List<Element> getDirectChildren(Element parent, String namespaceUri, String localName) {
    final List<Element> matches = new ArrayList<>();
    for (Node current = parent.getFirstChild(); current != null; current = current.getNextSibling()) {
        if (current.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        final Element candidate = (Element) current;
        if (localName.equals(candidate.getLocalName()) && namespaceUri.equals(candidate.getNamespaceURI())) {
            matches.add(candidate);
        }
    }
    return matches;
}
/**
 * Returns the trimmed text of the first direct child element with the given namespace
 * URI and local name.
 *
 * @return the trimmed text, or {@code null} when the child is missing or its text is empty
 *         after trimming
 */
private String getDirectChildText(Element parent, String namespaceUri, String localName) {
    final Element match = getDirectChild(parent, namespaceUri, localName);
    return match == null ? null : trimToNull(match.getTextContent());
}
/**
 * Trims the value and maps "nothing left" to {@code null}.
 *
 * @return the trimmed value, or {@code null} when the input is {@code null} or empty
 *         after trimming
 */
private String trimToNull(String value) {
    if (value != null) {
        final String stripped = value.trim();
        if (!stripped.isEmpty()) {
            return stripped;
        }
    }
    return null;
}
/**
 * Lot parser variant that locates the lots with one XPath query and then reads every
 * field through direct-child DOM lookups only (no XPath fallback).
 *
 * NOTE(review): no caller is visible in this file - parseDocument() calls parseLots().
 * Confirm whether this variant is still needed.
 *
 * @param doc parsed notice
 * @param xpath XPath instance with the eForms namespace bindings
 * @param document entity that receives the lots, the lot count, the document-level
 *                 submission deadline (if still unset) and the EU-funded flag
 * @throws XPathExpressionException if the lot query cannot be evaluated
 */
private void parseLotsDOM(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot");
// The count is the size of the node set, taken before any node is skipped below.
document.setTotalLots(lotNodes.getLength());
for (int i = 0; i < lotNodes.getLength(); i++) {
Node lotNode = lotNodes.item(i);
if (lotNode.getNodeType() != Node.ELEMENT_NODE) {
continue;
}
Element lotEl = (Element) lotNode;
ProcurementLot lot = ProcurementLot.builder().build();
// Direct child values on the lot
lot.setLotId(getDirectChildText(lotEl, NS_CBC, "ID"));
Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject");
// Everything in this branch is left untouched on the lot when it has no ProcurementProject child.
if (procurementProjectEl != null) {
lot.setInternalId(getDirectChildText(procurementProjectEl, NS_CBC, "ID"));
lot.setTitle(getDirectChildText(procurementProjectEl, NS_CBC, "Name"));
lot.setDescription(getDirectChildText(procurementProjectEl, NS_CBC, "Description"));
// CPV codes (main classification only; additional classifications are not read here)
List<String> lotCpvCodes = new ArrayList<>();
for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) {
String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode");
if (cpv != null && !cpv.isEmpty()) {
lotCpvCodes.add(cpv);
}
}
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
// NUTS codes
List<String> lotNutsCodes = new ArrayList<>();
for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) {
Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address");
if (addressEl != null) {
String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode");
if (nuts != null && !nuts.isEmpty()) {
lotNutsCodes.add(nuts);
}
}
}
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
// Duration: numeric value from the element text, unit from its unitCode attribute
Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod");
if (plannedPeriodEl != null) {
Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure");
if (durationEl != null) {
String durationValue = trimToNull(durationEl.getTextContent());
if (durationValue != null) {
try {
lot.setDurationValue(Double.parseDouble(durationValue));
} catch (NumberFormatException e) {
// A malformed duration is skipped; the rest of the lot is still parsed.
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
}
}
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
if (unitCode != null) {
lot.setDurationUnit(unitCode);
}
}
}
}
// Submission deadline
Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess");
if (tenderingProcessEl != null) {
Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod");
if (deadlinePeriodEl != null) {
String endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate");
if (endDate != null) {
String endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime");
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
// The document-level deadline is taken from the first lot processed while it is still unset.
if (document.getSubmissionDeadline() == null) {
document.setSubmissionDeadline(lot.getSubmissionDeadline());
}
}
}
}
// EU funded: set only when the lot has a TenderingTerms child; otherwise the flag is not touched
Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms");
if (tenderingTermsEl != null) {
String fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode");
lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds"));
}
document.addLot(lot);
}
// The document counts as EU funded as soon as one lot is explicitly flagged TRUE.
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
}
/**
 * Parses every cac:ProcurementProjectLot into a ProcurementLot and adds it to the document.
 *
 * Each field is first read through direct-child DOM lookups; when that yields nothing,
 * the equivalent XPath expression relative to the lot is evaluated as a fallback.
 * Also sets the lot count, the document-level submission deadline (from the first lot
 * that has one, if still unset) and the document-level EU-funded flag.
 *
 * @param doc parsed notice
 * @param xpath XPath instance with the eForms namespace bindings
 * @param document entity to populate
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 */
private void parseLots(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot");
// The count is the size of the node set, taken before any node is skipped below.
document.setTotalLots(lotNodes.getLength());
for (int i = 0; i < lotNodes.getLength(); i++) {
Node lotNode = lotNodes.item(i);
if (lotNode.getNodeType() != Node.ELEMENT_NODE) {
continue;
}
Element lotEl = (Element) lotNode;
ProcurementLot lot = ProcurementLot.builder().build();
// Fast direct children
Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject");
Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess");
Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms");
// --- Lot ID ---
// Note for all text fallbacks below: getDirectChildText() maps empty text to null, while
// getTextContent() only trims, so the fallback can yield an empty string for an empty element.
String lotId = getDirectChildText(lotEl, NS_CBC, "ID");
if (lotId == null) {
lotId = getTextContent(xpath, lotNode, "cbc:ID");
}
lot.setLotId(lotId);
// --- Internal ID ---
String internalId = null;
if (procurementProjectEl != null) {
internalId = getDirectChildText(procurementProjectEl, NS_CBC, "ID");
}
if (internalId == null) {
internalId = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:ID");
}
lot.setInternalId(internalId);
// --- Title ---
String title = null;
if (procurementProjectEl != null) {
title = getDirectChildText(procurementProjectEl, NS_CBC, "Name");
}
if (title == null) {
title = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Name");
}
lot.setTitle(title);
// --- Description ---
String description = null;
if (procurementProjectEl != null) {
description = getDirectChildText(procurementProjectEl, NS_CBC, "Description");
}
if (description == null) {
description = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Description");
}
lot.setDescription(description);
// --- CPV codes ---
List<String> lotCpvCodes = new ArrayList<>();
if (procurementProjectEl != null) {
for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) {
String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode");
if (cpv != null && !cpv.isEmpty()) {
lotCpvCodes.add(cpv);
}
}
}
// Fallback searches all descendants of the lot (".//"), which is broader than the direct lookup above.
if (lotCpvCodes.isEmpty()) {
NodeList cpvNodes = getNodes(xpath, lotNode,
".//cac:MainCommodityClassification/cbc:ItemClassificationCode");
for (int j = 0; j < cpvNodes.getLength(); j++) {
String cpv = trimToNull(cpvNodes.item(j).getTextContent());
if (cpv != null) {
lotCpvCodes.add(cpv);
}
}
}
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
// --- NUTS codes ---
List<String> lotNutsCodes = new ArrayList<>();
if (procurementProjectEl != null) {
for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) {
Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address");
if (addressEl != null) {
String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode");
if (nuts != null && !nuts.isEmpty()) {
lotNutsCodes.add(nuts);
}
}
}
}
// Fallback again searches all descendants of the lot.
if (lotNutsCodes.isEmpty()) {
NodeList nutsNodes = getNodes(xpath, lotNode,
".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode");
for (int j = 0; j < nutsNodes.getLength(); j++) {
String nuts = trimToNull(nutsNodes.item(j).getTextContent());
if (nuts != null) {
lotNutsCodes.add(nuts);
}
}
}
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
// --- Duration ---
// durationSet records that a DurationMeasure element was found via DOM, even if its value
// turned out to be empty or invalid; the XPath fallback only runs when none was found.
boolean durationSet = false;
if (procurementProjectEl != null) {
Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod");
if (plannedPeriodEl != null) {
Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure");
if (durationEl != null) {
String durationValue = trimToNull(durationEl.getTextContent());
if (durationValue != null) {
try {
lot.setDurationValue(Double.parseDouble(durationValue));
} catch (NumberFormatException e) {
// A malformed duration is skipped; the rest of the lot is still parsed.
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
}
}
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
if (unitCode != null) {
lot.setDurationUnit(unitCode);
}
durationSet = true;
}
}
}
if (!durationSet) {
Node durationNode = getNode(xpath, lotNode,
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure");
if (durationNode != null) {
String durationValue = trimToNull(durationNode.getTextContent());
if (durationValue != null) {
try {
lot.setDurationValue(Double.parseDouble(durationValue));
} catch (NumberFormatException e) {
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
}
}
if (durationNode instanceof Element durationEl) {
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
if (unitCode != null) {
lot.setDurationUnit(unitCode);
}
}
}
}
// --- Submission deadline ---
String endDate = null;
String endTime = null;
if (tenderingProcessEl != null) {
Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod");
if (deadlinePeriodEl != null) {
endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate");
endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime");
}
}
// The fallback replaces both values together, so date and time always come from the same lookup.
if (endDate == null) {
endDate = getTextContent(xpath, lotNode,
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndDate");
endTime = getTextContent(xpath, lotNode,
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndTime");
}
if (endDate != null) {
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
// The document-level deadline is taken from the first lot processed while it is still unset.
if (document.getSubmissionDeadline() == null) {
document.setSubmissionDeadline(lot.getSubmissionDeadline());
}
}
// --- EU funded ---
String fundingProgramCode = null;
if (tenderingTermsEl != null) {
fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode");
}
if (fundingProgramCode == null) {
fundingProgramCode = getTextContent(xpath, lotNode,
"cac:TenderingTerms/cbc:FundingProgramCode");
}
// Always set here: false when no funding code exists or when it contains "no-eu-funds".
lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds"));
document.addLot(lot);
}
// The document counts as EU funded as soon as one lot is flagged TRUE.
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
}
/**
 * Lot parser variant that reads every lot field through XPath expressions evaluated
 * relative to the lot node (no direct-child DOM lookups).
 *
 * NOTE(review): no caller is visible in this file - parseDocument() calls parseLots().
 * Confirm whether this variant is still needed.
 *
 * @param doc parsed notice
 * @param xpath XPath instance with the eForms namespace bindings
 * @param document entity that receives the lots, the lot count, the document-level
 *                 submission deadline (if still unset) and the EU-funded flag
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 */
private void parseLotsOld(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
NodeList lotNodes = (NodeList) xpath.evaluate(
"//cac:ProcurementProjectLot", doc, XPathConstants.NODESET);
document.setTotalLots(lotNodes.getLength());
for (int i = 0; i < lotNodes.getLength(); i++) {
Node lotNode = lotNodes.item(i);
ProcurementLot lot = ProcurementLot.builder().build();
// Lot ID
lot.setLotId(getTextContent(xpath, lotNode, "cbc:ID"));
// Internal ID
lot.setInternalId(getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:ID"));
// Title and description
lot.setTitle(getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Name"));
lot.setDescription(getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Description"));
// CPV codes for this lot (all descendants of the lot; the text is added as-is, without trimming)
List<String> lotCpvCodes = new ArrayList<>();
NodeList cpvNodes = (NodeList) xpath.evaluate(
".//cac:MainCommodityClassification/cbc:ItemClassificationCode",
lotNode, XPathConstants.NODESET);
for (int j = 0; j < cpvNodes.getLength(); j++) {
lotCpvCodes.add(cpvNodes.item(j).getTextContent());
}
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
// NUTS codes for this lot (all descendants of the lot; the text is added as-is, without trimming)
List<String> lotNutsCodes = new ArrayList<>();
NodeList nutsNodes = (NodeList) xpath.evaluate(
".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode",
lotNode, XPathConstants.NODESET);
for (int j = 0; j < nutsNodes.getLength(); j++) {
lotNutsCodes.add(nutsNodes.item(j).getTextContent());
}
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
// Duration
String durationValue = getTextContent(xpath, lotNode,
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure");
if (durationValue != null) {
try {
lot.setDurationValue(Double.parseDouble(durationValue));
} catch (NumberFormatException e) {
// A malformed duration is skipped; the rest of the lot is still parsed.
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
}
}
// Unit comes from the unitCode attribute; the setter receives null when it is missing or blank.
lot.setDurationUnit(getAttributeValue(xpath, lotNode,
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure", "unitCode"));
// Submission deadline
String endDate = getTextContent(xpath, lotNode,
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndDate");
String endTime = getTextContent(xpath, lotNode,
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndTime");
if (endDate != null) {
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
// Set document-level deadline from first lot if not set
if (document.getSubmissionDeadline() == null) {
document.setSubmissionDeadline(lot.getSubmissionDeadline());
}
}
// EU funded: false when no funding code exists or when it contains "no-eu-funds"
String euFunded = getTextContent(xpath, lotNode,
"cac:TenderingTerms/cbc:FundingProgramCode");
lot.setEuFunded(euFunded != null && !euFunded.contains("no-eu-funds"));
document.addLot(lot);
}
// Check if any lot is EU funded
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
}
/**
 * Reads the publication data: OJS notice publication ID, gazette ID and publication date.
 *
 * <p>When the publication extension carries no usable date, the requested publication
 * date from the notice root is used instead. That fallback is now read from the document
 * element regardless of its name; previously it only worked when the root element was
 * called {@code ContractNotice}.
 *
 * @param doc parsed notice
 * @param xpath XPath instance with the eForms namespace bindings
 * @param document entity to populate
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 */
private void parsePublication(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
    // Publication ID (OJS notice ID)
    document.setPublicationId(getTextContent(xpath, doc, "//efac:Publication/efbc:NoticePublicationID"));
    // OJS ID (gazette ID)
    document.setOjsId(getTextContent(xpath, doc, "//efac:Publication/efbc:GazetteID"));

    // Publication date
    String pubDate = getTextContent(xpath, doc, "//efac:Publication/efbc:PublicationDate");
    if (pubDate != null) {
        document.setPublicationDate(parseDate(pubDate));
    }

    // Fallback to requested publication date (also covers an unparseable publication date)
    if (document.getPublicationDate() == null) {
        String requestedPubDate = getTextContent(xpath, doc, "/*/cbc:RequestedPublicationDate");
        if (requestedPubDate != null) {
            document.setPublicationDate(parseDate(requestedPubDate));
        }
    }
}
/**
 * Builds the plain-text summary of a parsed notice that is used for vectorization.
 *
 * <p>Sections appear in a fixed order - title, description, contracting authority,
 * contract type, procedure, CPV codes, lots - and each one is emitted only when its
 * data is present. Lots without a title are left out of the listing.
 *
 * @param document populated entity
 * @return the summary with leading and trailing whitespace removed
 */
private String generateTextContent(ProcurementDocument document) {
    final StringBuilder text = new StringBuilder();

    // Title (most important)
    if (document.getProjectTitle() != null) {
        text.append("Title: ");
        text.append(document.getProjectTitle());
        text.append("\n\n");
    }

    // Description
    if (document.getProjectDescription() != null) {
        text.append("Description: ");
        text.append(document.getProjectDescription());
        text.append("\n\n");
    }

    // Buyer information: name, optionally followed by city and country
    if (document.getBuyerName() != null) {
        text.append("Contracting Authority: ");
        text.append(document.getBuyerName());
        if (document.getBuyerCity() != null) {
            text.append(", ");
            text.append(document.getBuyerCity());
        }
        if (document.getBuyerCountryCode() != null) {
            text.append(" (");
            text.append(document.getBuyerCountryCode());
            text.append(")");
        }
        text.append("\n\n");
    }

    // Contract type and procedure
    if (document.getContractNature() != null) {
        text.append("Contract Type: ");
        text.append(document.getContractNature());
        text.append("\n");
    }
    if (document.getProcedureType() != null) {
        text.append("Procedure: ");
        text.append(document.getProcedureType());
        text.append("\n");
    }

    // CPV classification
    final boolean hasCpvCodes = document.getCpvCodes() != null && document.getCpvCodes().length > 0;
    if (hasCpvCodes) {
        text.append("CPV Codes: ");
        text.append(String.join(", ", document.getCpvCodes()));
        text.append("\n");
    }

    // Lot information
    final boolean hasLots = document.getLots() != null && !document.getLots().isEmpty();
    if (hasLots) {
        text.append("\nLots (");
        text.append(document.getLots().size());
        text.append("):\n");
        for (ProcurementLot entry : document.getLots()) {
            if (entry.getTitle() == null) {
                continue;
            }
            text.append("- ");
            text.append(entry.getLotId());
            text.append(": ");
            text.append(entry.getTitle());
            // The description is appended only when it adds something beyond the title.
            if (entry.getDescription() != null && !entry.getDescription().equals(entry.getTitle())) {
                text.append(" - ");
                text.append(entry.getDescription());
            }
            text.append("\n");
        }
    }

    return text.toString().trim();
}
// Helper methods

/**
 * Evaluates the expression relative to {@code item} and returns the trimmed text of
 * every matching node, in result order, leaving out nodes whose text is empty.
 *
 * @return a new mutable list; empty when nothing matches
 * @throws XPathExpressionException if the expression cannot be evaluated
 */
private List<String> getTextContents(XPath xpath, Object item, String expression) throws XPathExpressionException {
    final List<String> texts = new ArrayList<>();
    final NodeList matches = getNodes(xpath, item, expression);
    final int count = matches.getLength();
    for (int index = 0; index < count; index++) {
        final String trimmed = matches.item(index).getTextContent().trim();
        if (trimmed.isEmpty()) {
            continue;
        }
        texts.add(trimmed);
    }
    return texts;
}
/**
 * Returns the trimmed value of an attribute on the first node matched by the expression.
 *
 * @return the attribute value, or {@code null} when nothing matches, the match is not an
 *         element, or the attribute is missing or empty after trimming
 * @throws XPathExpressionException if the expression cannot be evaluated
 */
private String getAttributeValue(XPath xpath, Object item, String expression, String attrName) throws XPathExpressionException {
    final Node match = getNode(xpath, item, expression);
    if (!(match instanceof Element owner)) {
        return null;
    }
    return trimToNull(owner.getAttribute(attrName));
}
/**
 * Parses an ISO date that may carry a zone suffix, discarding the suffix.
 *
 * <p>Handled suffixes: a negative offset directly after the date
 * (e.g. "2025-04-23-03:00"), anything from the first '+' onwards, and a trailing 'Z'.
 *
 * @return the date, or {@code null} when the input is {@code null}, empty or unparseable
 *         (unparseable input is logged at warn level)
 */
private LocalDate parseDate(String dateStr) {
    if (dateStr == null || dateStr.isEmpty()) {
        return null;
    }
    String candidate = dateStr.trim();
    try {
        // Negative offset after the date: keep only the leading YYYY-MM-DD.
        if (candidate.matches("\\d{4}-\\d{2}-\\d{2}-\\d{2}:\\d{2}.*")) {
            candidate = candidate.substring(0, 10);
        }
        // Positive offset: cut at the first '+'.
        final int plusAt = candidate.indexOf("+");
        if (plusAt >= 0) {
            candidate = candidate.substring(0, plusAt);
        }
        // UTC designator.
        if (candidate.endsWith("Z")) {
            candidate = candidate.substring(0, candidate.length() - 1);
        }
        return LocalDate.parse(candidate);
    } catch (DateTimeParseException failure) {
        log.warn("Failed to parse date: {} . Error: {}", candidate, failure.getMessage());
        return null;
    }
}
/**
 * Parses an ISO local time that may carry a zone suffix, discarding the suffix.
 *
 * <p>Any offset introduced by '+' or '-' and a trailing 'Z' are removed before parsing.
 * Earlier a negative offset was only recognised for the exact shape
 * {@code HH:mm:ss-HH:mm}; values such as {@code 12:00-03:00} or
 * {@code 12:00:00.000-03:00} failed to parse and were silently turned into midnight
 * by the caller. A local time never contains '-', so cutting at it is safe.
 *
 * @param timeStr time text, may be {@code null}
 * @return the local time, or {@code null} when the input is {@code null}, blank or
 *         unparseable (unparseable input is logged at warn level)
 */
private LocalTime parseTime(String timeStr) {
    if (timeStr == null) {
        return null;
    }
    String value = timeStr.trim();
    if (value.isEmpty()) {
        return null;
    }
    // Drop a numeric offset, whichever sign it has.
    int plusAt = value.indexOf('+');
    int minusAt = value.indexOf('-');
    if (plusAt >= 0) {
        value = value.substring(0, plusAt);
    } else if (minusAt >= 0) {
        value = value.substring(0, minusAt);
    }
    // Drop the UTC designator.
    if (value.endsWith("Z")) {
        value = value.substring(0, value.length() - 1);
    }
    try {
        return LocalTime.parse(value);
    } catch (DateTimeParseException e) {
        log.warn("Failed to parse time: {} . Error: {}", timeStr, e.getMessage());
        return null;
    }
}
/**
 * Combines a date string and an optional time string into an {@link OffsetDateTime}.
 *
 * <p>The time defaults to midnight when it is missing or unparseable. The offset is taken
 * from the date string if it carries one, otherwise from the time string, otherwise UTC.
 * Earlier only a '+' offset on the date string was honoured, so a negative offset
 * (e.g. "2025-04-23-03:00") or an offset present only on the time was silently replaced
 * by UTC, shifting the instant by the size of the offset.
 *
 * @param dateStr date text, optionally with zone suffix
 * @param timeStr time text, optionally with zone suffix; may be {@code null}
 * @return the combined value, or {@code null} when the date cannot be parsed
 */
private OffsetDateTime parseDateTime(String dateStr, String timeStr) {
    LocalDate date = parseDate(dateStr);
    if (date == null) {
        return null;
    }
    LocalTime time = timeStr != null ? parseTime(timeStr) : null;
    if (time == null) {
        time = LocalTime.MIDNIGHT;
    }
    ZoneOffset offset = extractOffset(dateStr);
    if (offset == null) {
        offset = extractOffset(timeStr);
    }
    if (offset == null) {
        offset = ZoneOffset.UTC;
    }
    return OffsetDateTime.of(date, time, offset);
}

/** Extracts a trailing zone offset ("Z", "+hh:mm"-style or "-hh:mm") from a date or time string; {@code null} if none. */
private ZoneOffset extractOffset(String value) {
    if (value == null) {
        return null;
    }
    String text = value.trim();
    if (text.endsWith("Z")) {
        return ZoneOffset.UTC;
    }
    String offsetText = null;
    int plusAt = text.indexOf('+');
    if (plusAt >= 0) {
        offsetText = text.substring(plusAt);
    } else if (text.matches(".*-\\d{2}:\\d{2}")) {
        // The colon is required so the "-MM" / "-DD" parts of a plain date never match.
        offsetText = text.substring(text.length() - 6);
    }
    if (offsetText == null) {
        return null;
    }
    try {
        return ZoneOffset.of(offsetText);
    } catch (java.time.DateTimeException ignored) {
        // Malformed or out-of-range offset: treated as absent so the caller applies its default.
        return null;
    }
}
/**
 * Maps a notice type code onto {@link NoticeType}, ignoring case.
 *
 * <p>Lower-casing uses {@link Locale#ROOT}: with the default locale an upper-case code
 * containing 'I' (e.g. "PIN-ONLY") would not match under a Turkish locale.
 *
 * @param code notice type code, may be {@code null}
 * @return the mapped type; {@code NoticeType.OTHER} for {@code null} or unlisted codes
 */
private NoticeType mapNoticeType(String code) {
    if (code == null) {
        return NoticeType.OTHER;
    }
    return switch (code.toLowerCase(Locale.ROOT)) {
        case "cn-standard", "cn-social", "cn-defence" -> NoticeType.CONTRACT_NOTICE;
        case "pin-only", "pin-rtl", "pin-cfc-standard" -> NoticeType.PRIOR_INFORMATION_NOTICE;
        case "can-standard", "can-social", "can-modif" -> NoticeType.CONTRACT_AWARD_NOTICE;
        default -> NoticeType.OTHER;
    };
}
/**
 * Maps a procurement type code onto {@link ContractNature}, ignoring case.
 *
 * <p>Lower-casing uses {@link Locale#ROOT}: with the default locale an upper-case code
 * containing 'I' (e.g. "SERVICES", "MIXED") would not match under a Turkish locale.
 *
 * @param code procurement type code, may be {@code null}
 * @return the mapped nature; {@code ContractNature.UNKNOWN} for {@code null} or unlisted codes
 */
private ContractNature mapContractNature(String code) {
    if (code == null) {
        return ContractNature.UNKNOWN;
    }
    return switch (code.toLowerCase(Locale.ROOT)) {
        case "supplies" -> ContractNature.SUPPLIES;
        case "services" -> ContractNature.SERVICES;
        case "works" -> ContractNature.WORKS;
        case "mixed" -> ContractNature.MIXED;
        default -> ContractNature.UNKNOWN;
    };
}
/**
 * Maps a procedure code onto {@link ProcedureType}, ignoring case.
 *
 * <p>Lower-casing uses {@link Locale#ROOT}: with the default locale an upper-case code
 * containing 'I' (e.g. "RESTRICTED", "INNOVATION") would not match under a Turkish locale.
 *
 * @param code procedure code, may be {@code null}
 * @return the mapped type; {@code ProcedureType.OTHER} for {@code null} or unlisted codes
 */
private ProcedureType mapProcedureType(String code) {
    if (code == null) {
        return ProcedureType.OTHER;
    }
    return switch (code.toLowerCase(Locale.ROOT)) {
        case "open" -> ProcedureType.OPEN;
        case "restricted" -> ProcedureType.RESTRICTED;
        case "comp-dial" -> ProcedureType.COMPETITIVE_DIALOGUE;
        case "innovation" -> ProcedureType.INNOVATION_PARTNERSHIP;
        case "neg-wo-pub" -> ProcedureType.NEGOTIATED_WITHOUT_PUBLICATION;
        case "neg-w-pub" -> ProcedureType.NEGOTIATED_WITH_PUBLICATION;
        default -> ProcedureType.OTHER;
    };
}
/**
 * Creates the namespace context that binds the prefixes used in this class's XPath
 * expressions (cn, cac, cbc, efac, efbc, efext, ext) to their namespace URIs.
 *
 * <p>Compared with the earlier version the reverse lookups are implemented:
 * {@code getPrefixes} used to return {@code null} although the {@link NamespaceContext}
 * contract requires an iterator, and {@code getPrefix} never found a bound prefix.
 * A {@code null} argument is rejected with {@link IllegalArgumentException} as the
 * contract specifies. An unknown prefix still resolves to {@code null}, so a typo in an
 * expression keeps failing at evaluation instead of silently matching nothing.
 *
 * @return a new, immutable namespace context
 */
private NamespaceContext createNamespaceContext() {
    final Map<String, String> uriByPrefix = new LinkedHashMap<>();
    uriByPrefix.put("cn", NS_CN);
    uriByPrefix.put("cac", NS_CAC);
    uriByPrefix.put("cbc", NS_CBC);
    uriByPrefix.put("efac", NS_EFAC);
    uriByPrefix.put("efbc", NS_EFBC);
    uriByPrefix.put("efext", NS_EFEXT);
    uriByPrefix.put("ext", NS_EXT);

    return new NamespaceContext() {
        @Override
        public String getNamespaceURI(String prefix) {
            if (prefix == null) {
                throw new IllegalArgumentException("prefix must not be null");
            }
            return uriByPrefix.get(prefix);
        }

        @Override
        public String getPrefix(String namespaceURI) {
            Iterator<String> prefixes = getPrefixes(namespaceURI);
            return prefixes.hasNext() ? prefixes.next() : null;
        }

        @Override
        public Iterator<String> getPrefixes(String namespaceURI) {
            if (namespaceURI == null) {
                throw new IllegalArgumentException("namespaceURI must not be null");
            }
            List<String> matches = new ArrayList<>();
            for (Map.Entry<String, String> binding : uriByPrefix.entrySet()) {
                if (binding.getValue().equals(namespaceURI)) {
                    matches.add(binding.getKey());
                }
            }
            // Read-only view: the contract does not allow removal through the iterator.
            return Collections.unmodifiableList(matches).iterator();
        }
    };
}
/**
 * Unchecked exception signalling that an XML document could not be parsed.
 * {@code parseDocument} wraps every failure in this type and keeps the original cause.
 */
public static class XmlParsingException extends RuntimeException {

    /**
     * @param message description of the failure
     * @param cause the underlying exception
     */
    public XmlParsingException(final String message, final Throwable cause) {
        super(message, cause);
    }
}
}