950 lines
42 KiB
Java
950 lines
42 KiB
Java
package at.procon.ted.service;
|
|
|
|
import at.procon.ted.model.entity.*;
|
|
import lombok.RequiredArgsConstructor;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import org.springframework.stereotype.Service;
|
|
import org.w3c.dom.*;
|
|
import org.xml.sax.InputSource;
|
|
|
|
import javax.xml.namespace.NamespaceContext;
|
|
import javax.xml.parsers.DocumentBuilder;
|
|
import javax.xml.parsers.DocumentBuilderFactory;
|
|
import javax.xml.xpath.*;
|
|
import java.io.StringReader;
|
|
import java.time.LocalDate;
|
|
import java.time.LocalTime;
|
|
import java.time.OffsetDateTime;
|
|
import java.time.ZoneOffset;
|
|
import java.time.format.DateTimeParseException;
|
|
import java.util.*;
|
|
|
|
/**
|
|
* Service for parsing EU eForms XML documents.
|
|
* Extracts structured data from TED procurement notices.
|
|
*
|
|
* Uses XPath for navigation through the UBL 2.3 document structure with eForms extensions.
|
|
*
|
|
* @author Martin.Schweitzer@procon.co.at and claude.ai
|
|
*/
|
|
@Service
|
|
@RequiredArgsConstructor
|
|
@Slf4j
|
|
public class XmlParserService {
|
|
|
|
// Namespace URIs for eForms/UBL documents
|
|
private static final String NS_CN = "urn:oasis:names:specification:ubl:schema:xsd:ContractNotice-2";
|
|
private static final String NS_CAC = "urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2";
|
|
private static final String NS_CBC = "urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2";
|
|
private static final String NS_EFAC = "http://data.europa.eu/p27/eforms-ubl-extension-aggregate-components/1";
|
|
private static final String NS_EFBC = "http://data.europa.eu/p27/eforms-ubl-extension-basic-components/1";
|
|
private static final String NS_EFEXT = "http://data.europa.eu/p27/eforms-ubl-extensions/1";
|
|
private static final String NS_EXT = "urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2";
|
|
|
|
private final DocumentBuilderFactory documentBuilderFactory;
|
|
private final XPathFactory xPathFactory;
|
|
|
|
/**
 * Creates the service with a namespace-aware, hardened DOM parser factory and an XPath factory.
 *
 * <p>{@code parseDocument} feeds caller-supplied XML text into builders created from this
 * factory, so DOCTYPE declarations are rejected and external entity / external DTD loading is
 * switched off (XXE and entity-expansion protection). Notices that carry a DOCTYPE will
 * therefore fail to parse.
 *
 * @throws IllegalStateException if the JAXP implementation does not support a hardening feature
 */
public XmlParserService() {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    try {
        factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        // Primary XXE defence: refuse any document that declares a DOCTYPE.
        factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        // Defence in depth in case the DOCTYPE restriction is ever relaxed.
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    } catch (javax.xml.parsers.ParserConfigurationException e) {
        throw new IllegalStateException("XML parser does not support required security features", e);
    }
    factory.setXIncludeAware(false);
    factory.setExpandEntityReferences(false);

    this.documentBuilderFactory = factory;
    this.xPathFactory = XPathFactory.newInstance();
}
|
|
|
|
/**
|
|
* Parse an eForms XML document and extract structured data.
|
|
*
|
|
* @param xmlContent The XML content as string
|
|
* @return Populated ProcurementDocument entity (without ID or hash)
|
|
*/
|
|
public ProcurementDocument parseDocument(String xmlContent) {
|
|
try {
|
|
DocumentBuilder builder = newDocumentBuilder();
|
|
Document doc = builder.parse(new InputSource(new StringReader(xmlContent)));
|
|
|
|
XPath xpath = newXPath();
|
|
|
|
ProcurementDocument document = ProcurementDocument.builder()
|
|
.xmlDocument(xmlContent)
|
|
.build();
|
|
|
|
// Parse basic notice information
|
|
parseNoticeMetadata(doc, xpath, document);
|
|
|
|
// Parse contracting party (buyer) information
|
|
parseContractingParty(doc, xpath, document);
|
|
|
|
// Parse procurement project information
|
|
parseProcurementProject(doc, xpath, document);
|
|
|
|
// Parse tendering process
|
|
parseTenderingProcess(doc, xpath, document);
|
|
|
|
// Parse organizations from extensions
|
|
parseOrganizations(doc, xpath, document);
|
|
|
|
// Parse lots
|
|
parseLots(doc, xpath, document);
|
|
|
|
// Parse publication information
|
|
parsePublication(doc, xpath, document);
|
|
|
|
// Generate text content for vectorization
|
|
document.setTextContent(generateTextContent(document));
|
|
|
|
return document;
|
|
|
|
} catch (Exception e) {
|
|
log.error("Error parsing XML document: {}", e.getMessage(), e);
|
|
throw new XmlParsingException("Failed to parse XML document", e);
|
|
}
|
|
}
|
|
|
|
private void parseNoticeMetadata(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
// UBL Version
|
|
document.setUblVersion(getTextContent(xpath, doc, "/*[local-name()='ContractNotice']/cbc:UBLVersionID"));
|
|
|
|
// SDK Version (customization ID)
|
|
document.setSdkVersion(getTextContent(xpath, doc, "/*[local-name()='ContractNotice']/cbc:CustomizationID"));
|
|
|
|
// Notice ID
|
|
document.setNoticeId(getTextContent(xpath, doc, "/*[local-name()='ContractNotice']/cbc:ID"));
|
|
|
|
// Contract Folder ID
|
|
document.setContractFolderId(getTextContent(xpath, doc, "/*[local-name()='ContractNotice']/cbc:ContractFolderID"));
|
|
|
|
// Issue Date and Time - combined into single OffsetDateTime
|
|
String issueDateStr = getTextContent(xpath, doc, "/*[local-name()='ContractNotice']/cbc:IssueDate");
|
|
String issueTimeStr = getTextContent(xpath, doc, "/*[local-name()='ContractNotice']/cbc:IssueTime");
|
|
if (issueDateStr != null) {
|
|
document.setIssueDateTime(parseDateTime(issueDateStr, issueTimeStr));
|
|
}
|
|
|
|
// Notice Language
|
|
document.setLanguageCode(getTextContent(xpath, doc, "/*[local-name()='ContractNotice']/cbc:NoticeLanguageCode"));
|
|
|
|
// Notice Type Code
|
|
String noticeTypeCode = getTextContent(xpath, doc, "/*[local-name()='ContractNotice']/cbc:NoticeTypeCode");
|
|
document.setNoticeType(mapNoticeType(noticeTypeCode));
|
|
|
|
// Regulatory Domain
|
|
document.setRegulatoryDomain(getTextContent(xpath, doc, "/*[local-name()='ContractNotice']/cbc:RegulatoryDomain"));
|
|
|
|
// Notice Subtype from extensions
|
|
String subtypeCode = getTextContent(xpath, doc,
|
|
"//efext:EformsExtension/efac:NoticeSubType/cbc:SubTypeCode");
|
|
document.setNoticeSubtypeCode(subtypeCode);
|
|
}
|
|
|
|
private void parseContractingParty(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
// Activity type
|
|
document.setBuyerActivityType(getTextContent(xpath, doc,
|
|
"//cac:ContractingParty/cac:ContractingActivity/cbc:ActivityTypeCode"));
|
|
|
|
// Legal type
|
|
document.setBuyerLegalType(getTextContent(xpath, doc,
|
|
"//cac:ContractingParty/cac:ContractingPartyType/cbc:PartyTypeCode"));
|
|
|
|
// Organization reference to link with organizations
|
|
String orgRef = getTextContent(xpath, doc,
|
|
"//cac:ContractingParty/cac:Party/cac:PartyIdentification/cbc:ID");
|
|
|
|
// Buyer details will be populated from organizations
|
|
}
|
|
|
|
private void parseProcurementProject(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
// Project title
|
|
document.setProjectTitle(getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:ProcurementProject/cbc:Name"));
|
|
|
|
// Project description
|
|
document.setProjectDescription(getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:ProcurementProject/cbc:Description"));
|
|
|
|
// Internal reference
|
|
document.setInternalReference(getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:ProcurementProject/cbc:ID"));
|
|
|
|
// Contract nature
|
|
String contractNature = getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:ProcurementProject/cbc:ProcurementTypeCode");
|
|
document.setContractNature(mapContractNature(contractNature));
|
|
|
|
// CPV codes
|
|
List<String> cpvCodes = getTextContents(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:ProcurementProject/cac:MainCommodityClassification/cbc:ItemClassificationCode");
|
|
cpvCodes.addAll(getTextContents(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:ProcurementProject/cac:AdditionalCommodityClassification/cbc:ItemClassificationCode"));
|
|
document.setCpvCodes(cpvCodes.toArray(new String[0]));
|
|
|
|
// Location - country and NUTS codes
|
|
document.setBuyerCountryCode(getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:ProcurementProject/cac:RealizedLocation/cac:Address/cac:Country/cbc:IdentificationCode"));
|
|
document.setBuyerNutsCode(getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:ProcurementProject/cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode"));
|
|
document.setBuyerCity(getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:ProcurementProject/cac:RealizedLocation/cac:Address/cbc:CityName"));
|
|
|
|
// All NUTS codes from project and lots
|
|
List<String> nutsCodes = getTextContents(xpath, doc,
|
|
"//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode");
|
|
document.setNutsCodes(nutsCodes.stream().distinct().toArray(String[]::new));
|
|
}
|
|
|
|
private void parseTenderingProcess(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
// Procedure type
|
|
String procedureCode = getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:TenderingProcess/cbc:ProcedureCode");
|
|
document.setProcedureType(mapProcedureType(procedureCode));
|
|
|
|
// Lot distribution
|
|
String maxLotsAwarded = getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:TenderingTerms/cac:LotDistribution/cbc:MaximumLotsAwardedNumeric");
|
|
if (maxLotsAwarded != null) {
|
|
document.setMaxLotsAwarded(Integer.parseInt(maxLotsAwarded));
|
|
}
|
|
|
|
String maxLotsSubmitted = getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cac:TenderingTerms/cac:LotDistribution/cbc:MaximumLotsSubmittedNumeric");
|
|
if (maxLotsSubmitted != null) {
|
|
document.setMaxLotsSubmitted(Integer.parseInt(maxLotsSubmitted));
|
|
}
|
|
}
|
|
|
|
private void parseOrganizations(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
NodeList orgNodes = (NodeList) xpath.evaluate(
|
|
"//efac:Organizations/efac:Organization", doc, XPathConstants.NODESET);
|
|
|
|
boolean buyerInfoSet = false;
|
|
|
|
for (int i = 0; i < orgNodes.getLength(); i++) {
|
|
Node orgNode = orgNodes.item(i);
|
|
|
|
Organization org = Organization.builder().build();
|
|
|
|
// Organization reference
|
|
org.setOrgReference(getTextContent(xpath, orgNode, ".//cac:PartyIdentification/cbc:ID"));
|
|
|
|
// Name
|
|
org.setName(getTextContent(xpath, orgNode, ".//cac:PartyName/cbc:Name"));
|
|
if(org.getName() == null) org.setName("");
|
|
|
|
// Company ID
|
|
org.setCompanyId(getTextContent(xpath, orgNode, ".//cac:PartyLegalEntity/cbc:CompanyID"));
|
|
|
|
// Address
|
|
org.setStreetName(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:StreetName"));
|
|
org.setCity(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:CityName"));
|
|
org.setPostalCode(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:PostalZone"));
|
|
org.setNutsCode(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:CountrySubentityCode"));
|
|
org.setCountryCode(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cac:Country/cbc:IdentificationCode"));
|
|
|
|
// Contact
|
|
org.setWebsiteUri(getTextContent(xpath, orgNode, ".//cbc:WebsiteURI"));
|
|
org.setEmail(getTextContent(xpath, orgNode, ".//cac:Contact/cbc:ElectronicMail"));
|
|
org.setPhone(getTextContent(xpath, orgNode, ".//cac:Contact/cbc:Telephone"));
|
|
|
|
document.addOrganization(org);
|
|
|
|
// Set buyer info from first organization (typically ORG-0001)
|
|
if (!buyerInfoSet && "ORG-0001".equals(org.getOrgReference())) {
|
|
document.setBuyerName(org.getName());
|
|
if (document.getBuyerCountryCode() == null) {
|
|
document.setBuyerCountryCode(org.getCountryCode());
|
|
}
|
|
if (document.getBuyerCity() == null) {
|
|
document.setBuyerCity(org.getCity());
|
|
}
|
|
document.setBuyerPostalCode(org.getPostalCode());
|
|
if (document.getBuyerNutsCode() == null) {
|
|
document.setBuyerNutsCode(org.getNutsCode());
|
|
}
|
|
buyerInfoSet = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
private DocumentBuilder newDocumentBuilder() throws Exception {
|
|
synchronized (documentBuilderFactory) {
|
|
documentBuilderFactory.setNamespaceAware(true);
|
|
return documentBuilderFactory.newDocumentBuilder();
|
|
}
|
|
}
|
|
|
|
private XPath newXPath() {
|
|
synchronized (xPathFactory) {
|
|
XPath xpath = xPathFactory.newXPath();
|
|
xpath.setNamespaceContext(createNamespaceContext());
|
|
return xpath;
|
|
}
|
|
}
|
|
|
|
private String getTextContent(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
|
Node node = (Node) xpath.evaluate(expression, item, XPathConstants.NODE);
|
|
return node != null ? node.getTextContent().trim() : null;
|
|
}
|
|
|
|
private Node getNode(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
|
return (Node) xpath.evaluate(expression, item, XPathConstants.NODE);
|
|
}
|
|
|
|
private NodeList getNodes(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
|
return (NodeList) xpath.evaluate(expression, item, XPathConstants.NODESET);
|
|
}
|
|
|
|
private Element getDirectChild(Element parent, String namespaceUri, String localName) {
|
|
Node child = parent.getFirstChild();
|
|
while (child != null) {
|
|
if (child.getNodeType() == Node.ELEMENT_NODE) {
|
|
Element el = (Element) child;
|
|
if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) {
|
|
return el;
|
|
}
|
|
}
|
|
child = child.getNextSibling();
|
|
}
|
|
return null;
|
|
}
|
|
|
|
private List<Element> getDirectChildren(Element parent, String namespaceUri, String localName) {
|
|
List<Element> result = new ArrayList<>();
|
|
Node child = parent.getFirstChild();
|
|
while (child != null) {
|
|
if (child.getNodeType() == Node.ELEMENT_NODE) {
|
|
Element el = (Element) child;
|
|
if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) {
|
|
result.add(el);
|
|
}
|
|
}
|
|
child = child.getNextSibling();
|
|
}
|
|
return result;
|
|
}
|
|
|
|
private String getDirectChildText(Element parent, String namespaceUri, String localName) {
|
|
Element child = getDirectChild(parent, namespaceUri, localName);
|
|
if (child == null) {
|
|
return null;
|
|
}
|
|
return trimToNull(child.getTextContent());
|
|
}
|
|
|
|
private String trimToNull(String value) {
|
|
if (value == null) {
|
|
return null;
|
|
}
|
|
String trimmed = value.trim();
|
|
return trimmed.isEmpty() ? null : trimmed;
|
|
}
|
|
|
|
private void parseLotsDOM(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot");
|
|
document.setTotalLots(lotNodes.getLength());
|
|
|
|
for (int i = 0; i < lotNodes.getLength(); i++) {
|
|
Node lotNode = lotNodes.item(i);
|
|
if (lotNode.getNodeType() != Node.ELEMENT_NODE) {
|
|
continue;
|
|
}
|
|
|
|
Element lotEl = (Element) lotNode;
|
|
ProcurementLot lot = ProcurementLot.builder().build();
|
|
|
|
// Direct child values on the lot
|
|
lot.setLotId(getDirectChildText(lotEl, NS_CBC, "ID"));
|
|
|
|
Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject");
|
|
if (procurementProjectEl != null) {
|
|
lot.setInternalId(getDirectChildText(procurementProjectEl, NS_CBC, "ID"));
|
|
lot.setTitle(getDirectChildText(procurementProjectEl, NS_CBC, "Name"));
|
|
lot.setDescription(getDirectChildText(procurementProjectEl, NS_CBC, "Description"));
|
|
|
|
// CPV codes
|
|
List<String> lotCpvCodes = new ArrayList<>();
|
|
for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) {
|
|
String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode");
|
|
if (cpv != null && !cpv.isEmpty()) {
|
|
lotCpvCodes.add(cpv);
|
|
}
|
|
}
|
|
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
|
|
|
|
// NUTS codes
|
|
List<String> lotNutsCodes = new ArrayList<>();
|
|
for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) {
|
|
Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address");
|
|
if (addressEl != null) {
|
|
String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode");
|
|
if (nuts != null && !nuts.isEmpty()) {
|
|
lotNutsCodes.add(nuts);
|
|
}
|
|
}
|
|
}
|
|
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
|
|
|
|
// Duration
|
|
Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod");
|
|
if (plannedPeriodEl != null) {
|
|
Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure");
|
|
if (durationEl != null) {
|
|
String durationValue = trimToNull(durationEl.getTextContent());
|
|
if (durationValue != null) {
|
|
try {
|
|
lot.setDurationValue(Double.parseDouble(durationValue));
|
|
} catch (NumberFormatException e) {
|
|
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
|
}
|
|
}
|
|
|
|
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
|
|
if (unitCode != null) {
|
|
lot.setDurationUnit(unitCode);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Submission deadline
|
|
Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess");
|
|
if (tenderingProcessEl != null) {
|
|
Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod");
|
|
if (deadlinePeriodEl != null) {
|
|
String endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate");
|
|
if (endDate != null) {
|
|
String endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime");
|
|
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
|
|
|
|
if (document.getSubmissionDeadline() == null) {
|
|
document.setSubmissionDeadline(lot.getSubmissionDeadline());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// EU funded
|
|
Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms");
|
|
if (tenderingTermsEl != null) {
|
|
String fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode");
|
|
lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds"));
|
|
}
|
|
|
|
document.addLot(lot);
|
|
}
|
|
|
|
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
|
|
}
|
|
|
|
private void parseLots(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot");
|
|
document.setTotalLots(lotNodes.getLength());
|
|
|
|
for (int i = 0; i < lotNodes.getLength(); i++) {
|
|
Node lotNode = lotNodes.item(i);
|
|
if (lotNode.getNodeType() != Node.ELEMENT_NODE) {
|
|
continue;
|
|
}
|
|
|
|
Element lotEl = (Element) lotNode;
|
|
ProcurementLot lot = ProcurementLot.builder().build();
|
|
|
|
// Fast direct children
|
|
Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject");
|
|
Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess");
|
|
Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms");
|
|
|
|
// --- Lot ID ---
|
|
String lotId = getDirectChildText(lotEl, NS_CBC, "ID");
|
|
if (lotId == null) {
|
|
lotId = getTextContent(xpath, lotNode, "cbc:ID");
|
|
}
|
|
lot.setLotId(lotId);
|
|
|
|
// --- Internal ID ---
|
|
String internalId = null;
|
|
if (procurementProjectEl != null) {
|
|
internalId = getDirectChildText(procurementProjectEl, NS_CBC, "ID");
|
|
}
|
|
if (internalId == null) {
|
|
internalId = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:ID");
|
|
}
|
|
lot.setInternalId(internalId);
|
|
|
|
// --- Title ---
|
|
String title = null;
|
|
if (procurementProjectEl != null) {
|
|
title = getDirectChildText(procurementProjectEl, NS_CBC, "Name");
|
|
}
|
|
if (title == null) {
|
|
title = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Name");
|
|
}
|
|
lot.setTitle(title);
|
|
|
|
// --- Description ---
|
|
String description = null;
|
|
if (procurementProjectEl != null) {
|
|
description = getDirectChildText(procurementProjectEl, NS_CBC, "Description");
|
|
}
|
|
if (description == null) {
|
|
description = getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Description");
|
|
}
|
|
lot.setDescription(description);
|
|
|
|
// --- CPV codes ---
|
|
List<String> lotCpvCodes = new ArrayList<>();
|
|
if (procurementProjectEl != null) {
|
|
for (Element mainCommodityEl : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) {
|
|
String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode");
|
|
if (cpv != null && !cpv.isEmpty()) {
|
|
lotCpvCodes.add(cpv);
|
|
}
|
|
}
|
|
}
|
|
if (lotCpvCodes.isEmpty()) {
|
|
NodeList cpvNodes = getNodes(xpath, lotNode,
|
|
".//cac:MainCommodityClassification/cbc:ItemClassificationCode");
|
|
for (int j = 0; j < cpvNodes.getLength(); j++) {
|
|
String cpv = trimToNull(cpvNodes.item(j).getTextContent());
|
|
if (cpv != null) {
|
|
lotCpvCodes.add(cpv);
|
|
}
|
|
}
|
|
}
|
|
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
|
|
|
|
// --- NUTS codes ---
|
|
List<String> lotNutsCodes = new ArrayList<>();
|
|
if (procurementProjectEl != null) {
|
|
for (Element realizedLocationEl : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) {
|
|
Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address");
|
|
if (addressEl != null) {
|
|
String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode");
|
|
if (nuts != null && !nuts.isEmpty()) {
|
|
lotNutsCodes.add(nuts);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (lotNutsCodes.isEmpty()) {
|
|
NodeList nutsNodes = getNodes(xpath, lotNode,
|
|
".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode");
|
|
for (int j = 0; j < nutsNodes.getLength(); j++) {
|
|
String nuts = trimToNull(nutsNodes.item(j).getTextContent());
|
|
if (nuts != null) {
|
|
lotNutsCodes.add(nuts);
|
|
}
|
|
}
|
|
}
|
|
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
|
|
|
|
// --- Duration ---
|
|
boolean durationSet = false;
|
|
if (procurementProjectEl != null) {
|
|
Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod");
|
|
if (plannedPeriodEl != null) {
|
|
Element durationEl = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure");
|
|
if (durationEl != null) {
|
|
String durationValue = trimToNull(durationEl.getTextContent());
|
|
if (durationValue != null) {
|
|
try {
|
|
lot.setDurationValue(Double.parseDouble(durationValue));
|
|
} catch (NumberFormatException e) {
|
|
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
|
}
|
|
}
|
|
|
|
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
|
|
if (unitCode != null) {
|
|
lot.setDurationUnit(unitCode);
|
|
}
|
|
durationSet = true;
|
|
}
|
|
}
|
|
}
|
|
if (!durationSet) {
|
|
Node durationNode = getNode(xpath, lotNode,
|
|
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure");
|
|
if (durationNode != null) {
|
|
String durationValue = trimToNull(durationNode.getTextContent());
|
|
if (durationValue != null) {
|
|
try {
|
|
lot.setDurationValue(Double.parseDouble(durationValue));
|
|
} catch (NumberFormatException e) {
|
|
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
|
}
|
|
}
|
|
if (durationNode instanceof Element durationEl) {
|
|
String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
|
|
if (unitCode != null) {
|
|
lot.setDurationUnit(unitCode);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// --- Submission deadline ---
|
|
String endDate = null;
|
|
String endTime = null;
|
|
if (tenderingProcessEl != null) {
|
|
Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod");
|
|
if (deadlinePeriodEl != null) {
|
|
endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate");
|
|
endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime");
|
|
}
|
|
}
|
|
if (endDate == null) {
|
|
endDate = getTextContent(xpath, lotNode,
|
|
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndDate");
|
|
endTime = getTextContent(xpath, lotNode,
|
|
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndTime");
|
|
}
|
|
if (endDate != null) {
|
|
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
|
|
if (document.getSubmissionDeadline() == null) {
|
|
document.setSubmissionDeadline(lot.getSubmissionDeadline());
|
|
}
|
|
}
|
|
|
|
// --- EU funded ---
|
|
String fundingProgramCode = null;
|
|
if (tenderingTermsEl != null) {
|
|
fundingProgramCode = getDirectChildText(tenderingTermsEl, NS_CBC, "FundingProgramCode");
|
|
}
|
|
if (fundingProgramCode == null) {
|
|
fundingProgramCode = getTextContent(xpath, lotNode,
|
|
"cac:TenderingTerms/cbc:FundingProgramCode");
|
|
}
|
|
lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds"));
|
|
|
|
document.addLot(lot);
|
|
}
|
|
|
|
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
|
|
}
|
|
|
|
private void parseLotsOld(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
NodeList lotNodes = (NodeList) xpath.evaluate(
|
|
"//cac:ProcurementProjectLot", doc, XPathConstants.NODESET);
|
|
|
|
document.setTotalLots(lotNodes.getLength());
|
|
|
|
for (int i = 0; i < lotNodes.getLength(); i++) {
|
|
Node lotNode = lotNodes.item(i);
|
|
|
|
ProcurementLot lot = ProcurementLot.builder().build();
|
|
|
|
// Lot ID
|
|
lot.setLotId(getTextContent(xpath, lotNode, "cbc:ID"));
|
|
|
|
// Internal ID
|
|
lot.setInternalId(getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:ID"));
|
|
|
|
// Title and description
|
|
lot.setTitle(getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Name"));
|
|
lot.setDescription(getTextContent(xpath, lotNode, "cac:ProcurementProject/cbc:Description"));
|
|
|
|
// CPV codes for this lot
|
|
List<String> lotCpvCodes = new ArrayList<>();
|
|
NodeList cpvNodes = (NodeList) xpath.evaluate(
|
|
".//cac:MainCommodityClassification/cbc:ItemClassificationCode",
|
|
lotNode, XPathConstants.NODESET);
|
|
for (int j = 0; j < cpvNodes.getLength(); j++) {
|
|
lotCpvCodes.add(cpvNodes.item(j).getTextContent());
|
|
}
|
|
lot.setCpvCodes(lotCpvCodes.toArray(new String[0]));
|
|
|
|
// NUTS codes for this lot
|
|
List<String> lotNutsCodes = new ArrayList<>();
|
|
NodeList nutsNodes = (NodeList) xpath.evaluate(
|
|
".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode",
|
|
lotNode, XPathConstants.NODESET);
|
|
for (int j = 0; j < nutsNodes.getLength(); j++) {
|
|
lotNutsCodes.add(nutsNodes.item(j).getTextContent());
|
|
}
|
|
lot.setNutsCodes(lotNutsCodes.toArray(new String[0]));
|
|
|
|
// Duration
|
|
String durationValue = getTextContent(xpath, lotNode,
|
|
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure");
|
|
if (durationValue != null) {
|
|
try {
|
|
lot.setDurationValue(Double.parseDouble(durationValue));
|
|
} catch (NumberFormatException e) {
|
|
log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
|
|
}
|
|
}
|
|
lot.setDurationUnit(getAttributeValue(xpath, lotNode,
|
|
"cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure", "unitCode"));
|
|
|
|
// Submission deadline
|
|
String endDate = getTextContent(xpath, lotNode,
|
|
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndDate");
|
|
String endTime = getTextContent(xpath, lotNode,
|
|
"cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndTime");
|
|
if (endDate != null) {
|
|
lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
|
|
// Set document-level deadline from first lot if not set
|
|
if (document.getSubmissionDeadline() == null) {
|
|
document.setSubmissionDeadline(lot.getSubmissionDeadline());
|
|
}
|
|
}
|
|
|
|
// EU funded
|
|
String euFunded = getTextContent(xpath, lotNode,
|
|
"cac:TenderingTerms/cbc:FundingProgramCode");
|
|
lot.setEuFunded(euFunded != null && !euFunded.contains("no-eu-funds"));
|
|
|
|
document.addLot(lot);
|
|
}
|
|
|
|
// Check if any lot is EU funded
|
|
document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
|
|
}
|
|
|
|
private void parsePublication(Document doc, XPath xpath, ProcurementDocument document) throws XPathExpressionException {
|
|
// Publication ID (OJS notice ID)
|
|
document.setPublicationId(getTextContent(xpath, doc,
|
|
"//efac:Publication/efbc:NoticePublicationID"));
|
|
|
|
// OJS ID (gazette ID)
|
|
document.setOjsId(getTextContent(xpath, doc,
|
|
"//efac:Publication/efbc:GazetteID"));
|
|
|
|
// Publication date
|
|
String pubDate = getTextContent(xpath, doc,
|
|
"//efac:Publication/efbc:PublicationDate");
|
|
if (pubDate != null) {
|
|
document.setPublicationDate(parseDate(pubDate));
|
|
}
|
|
|
|
// Fallback to requested publication date
|
|
if (document.getPublicationDate() == null) {
|
|
String requestedPubDate = getTextContent(xpath, doc,
|
|
"/*[local-name()='ContractNotice']/cbc:RequestedPublicationDate");
|
|
if (requestedPubDate != null) {
|
|
document.setPublicationDate(parseDate(requestedPubDate));
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate a textual representation for vectorization.
|
|
*/
|
|
private String generateTextContent(ProcurementDocument document) {
|
|
StringBuilder sb = new StringBuilder();
|
|
|
|
// Title (most important)
|
|
if (document.getProjectTitle() != null) {
|
|
sb.append("Title: ").append(document.getProjectTitle()).append("\n\n");
|
|
}
|
|
|
|
// Description
|
|
if (document.getProjectDescription() != null) {
|
|
sb.append("Description: ").append(document.getProjectDescription()).append("\n\n");
|
|
}
|
|
|
|
// Buyer information
|
|
if (document.getBuyerName() != null) {
|
|
sb.append("Contracting Authority: ").append(document.getBuyerName());
|
|
if (document.getBuyerCity() != null) {
|
|
sb.append(", ").append(document.getBuyerCity());
|
|
}
|
|
if (document.getBuyerCountryCode() != null) {
|
|
sb.append(" (").append(document.getBuyerCountryCode()).append(")");
|
|
}
|
|
sb.append("\n\n");
|
|
}
|
|
|
|
// Contract type and procedure
|
|
if (document.getContractNature() != null) {
|
|
sb.append("Contract Type: ").append(document.getContractNature()).append("\n");
|
|
}
|
|
if (document.getProcedureType() != null) {
|
|
sb.append("Procedure: ").append(document.getProcedureType()).append("\n");
|
|
}
|
|
|
|
// CPV classification
|
|
if (document.getCpvCodes() != null && document.getCpvCodes().length > 0) {
|
|
sb.append("CPV Codes: ").append(String.join(", ", document.getCpvCodes())).append("\n");
|
|
}
|
|
|
|
// Lot information
|
|
if (document.getLots() != null && !document.getLots().isEmpty()) {
|
|
sb.append("\nLots (").append(document.getLots().size()).append("):\n");
|
|
for (ProcurementLot lot : document.getLots()) {
|
|
if (lot.getTitle() != null) {
|
|
sb.append("- ").append(lot.getLotId()).append(": ").append(lot.getTitle());
|
|
if (lot.getDescription() != null && !lot.getDescription().equals(lot.getTitle())) {
|
|
sb.append(" - ").append(lot.getDescription());
|
|
}
|
|
sb.append("\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
return sb.toString().trim();
|
|
}
|
|
|
|
// Helper methods
|
|
|
|
private List<String> getTextContents(XPath xpath, Object item, String expression) throws XPathExpressionException {
|
|
List<String> results = new ArrayList<>();
|
|
NodeList nodes = getNodes(xpath, item, expression);
|
|
for (int i = 0; i < nodes.getLength(); i++) {
|
|
String text = nodes.item(i).getTextContent().trim();
|
|
if (!text.isEmpty()) {
|
|
results.add(text);
|
|
}
|
|
}
|
|
return results;
|
|
}
|
|
|
|
private String getAttributeValue(XPath xpath, Object item, String expression, String attrName) throws XPathExpressionException {
|
|
Node node = getNode(xpath, item, expression);
|
|
if (node instanceof Element element) {
|
|
String value = element.getAttribute(attrName);
|
|
return trimToNull(value);
|
|
}
|
|
return null;
|
|
}
|
|
|
|
private LocalDate parseDate(String dateStr) {
|
|
if (dateStr == null || dateStr.isEmpty()) return null;
|
|
try {
|
|
// Handle various date formats
|
|
dateStr = dateStr.trim();
|
|
|
|
// Handle datetime with dash separator (e.g. "2025-04-23-03:00")
|
|
// Extract only the date part (first 10 characters: YYYY-MM-DD)
|
|
if (dateStr.matches("\\d{4}-\\d{2}-\\d{2}-\\d{2}:\\d{2}.*")) {
|
|
dateStr = dateStr.substring(0, 10);
|
|
}
|
|
|
|
if (dateStr.contains("+")) {
|
|
dateStr = dateStr.substring(0, dateStr.indexOf("+"));
|
|
}
|
|
if (dateStr.endsWith("Z")) {
|
|
dateStr = dateStr.substring(0, dateStr.length() - 1);
|
|
}
|
|
return LocalDate.parse(dateStr);
|
|
} catch (DateTimeParseException e) {
|
|
log.warn("Failed to parse date: {} . Error: {}", dateStr, e.getMessage());
|
|
return null;
|
|
}
|
|
}
|
|
|
|
private LocalTime parseTime(String timeStr) {
|
|
if (timeStr == null || timeStr.isEmpty()) return null;
|
|
try {
|
|
timeStr = timeStr.trim();
|
|
|
|
// Handle time with offset (e.g. "12:00:00-03:00")
|
|
// Extract only the time part (first 8 characters: HH:mm:ss)
|
|
if (timeStr.matches("\\d{2}:\\d{2}:\\d{2}[+-]\\d{2}:\\d{2}")) {
|
|
timeStr = timeStr.substring(0, 8);
|
|
}
|
|
|
|
if (timeStr.contains("+")) {
|
|
timeStr = timeStr.substring(0, timeStr.indexOf("+"));
|
|
}
|
|
if (timeStr.endsWith("Z")) {
|
|
timeStr = timeStr.substring(0, timeStr.length() - 1);
|
|
}
|
|
return LocalTime.parse(timeStr);
|
|
} catch (DateTimeParseException e) {
|
|
log.warn("Failed to parse time: {} . Error: {}", timeStr, e.getMessage());
|
|
return null;
|
|
}
|
|
}
|
|
|
|
private OffsetDateTime parseDateTime(String dateStr, String timeStr) {
|
|
LocalDate date = parseDate(dateStr);
|
|
if (date == null) return null;
|
|
|
|
LocalTime time = timeStr != null ? parseTime(timeStr) : LocalTime.MIDNIGHT;
|
|
if (time == null) time = LocalTime.MIDNIGHT;
|
|
|
|
// Parse timezone offset if present in date string
|
|
ZoneOffset offset = ZoneOffset.UTC;
|
|
if (dateStr != null && dateStr.contains("+")) {
|
|
try {
|
|
String offsetStr = dateStr.substring(dateStr.indexOf("+"));
|
|
offset = ZoneOffset.of(offsetStr);
|
|
} catch (Exception e) {
|
|
// Default to UTC
|
|
}
|
|
}
|
|
|
|
return OffsetDateTime.of(date, time, offset);
|
|
}
|
|
|
|
private NoticeType mapNoticeType(String code) {
|
|
if (code == null) return NoticeType.OTHER;
|
|
return switch (code.toLowerCase()) {
|
|
case "cn-standard", "cn-social", "cn-defence" -> NoticeType.CONTRACT_NOTICE;
|
|
case "pin-only", "pin-rtl", "pin-cfc-standard" -> NoticeType.PRIOR_INFORMATION_NOTICE;
|
|
case "can-standard", "can-social", "can-modif" -> NoticeType.CONTRACT_AWARD_NOTICE;
|
|
default -> NoticeType.OTHER;
|
|
};
|
|
}
|
|
|
|
private ContractNature mapContractNature(String code) {
|
|
if (code == null) return ContractNature.UNKNOWN;
|
|
return switch (code.toLowerCase()) {
|
|
case "supplies" -> ContractNature.SUPPLIES;
|
|
case "services" -> ContractNature.SERVICES;
|
|
case "works" -> ContractNature.WORKS;
|
|
case "mixed" -> ContractNature.MIXED;
|
|
default -> ContractNature.UNKNOWN;
|
|
};
|
|
}
|
|
|
|
private ProcedureType mapProcedureType(String code) {
|
|
if (code == null) return ProcedureType.OTHER;
|
|
return switch (code.toLowerCase()) {
|
|
case "open" -> ProcedureType.OPEN;
|
|
case "restricted" -> ProcedureType.RESTRICTED;
|
|
case "comp-dial" -> ProcedureType.COMPETITIVE_DIALOGUE;
|
|
case "innovation" -> ProcedureType.INNOVATION_PARTNERSHIP;
|
|
case "neg-wo-pub" -> ProcedureType.NEGOTIATED_WITHOUT_PUBLICATION;
|
|
case "neg-w-pub" -> ProcedureType.NEGOTIATED_WITH_PUBLICATION;
|
|
default -> ProcedureType.OTHER;
|
|
};
|
|
}
|
|
|
|
private NamespaceContext createNamespaceContext() {
|
|
return new NamespaceContext() {
|
|
@Override
|
|
public String getNamespaceURI(String prefix) {
|
|
return switch (prefix) {
|
|
case "cn" -> NS_CN;
|
|
case "cac" -> NS_CAC;
|
|
case "cbc" -> NS_CBC;
|
|
case "efac" -> NS_EFAC;
|
|
case "efbc" -> NS_EFBC;
|
|
case "efext" -> NS_EFEXT;
|
|
case "ext" -> NS_EXT;
|
|
default -> null;
|
|
};
|
|
}
|
|
|
|
@Override
|
|
public String getPrefix(String namespaceURI) {
|
|
return null;
|
|
}
|
|
|
|
@Override
|
|
public Iterator<String> getPrefixes(String namespaceURI) {
|
|
return null;
|
|
}
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Exception thrown when XML parsing fails.
|
|
*/
|
|
public static class XmlParsingException extends RuntimeException {
|
|
public XmlParsingException(String message, Throwable cause) {
|
|
super(message, cause);
|
|
}
|
|
}
|
|
}
|