package at.procon.ted.service;

import at.procon.ted.model.entity.*;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.w3c.dom.*;
import org.xml.sax.InputSource;

import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.*;
import java.io.StringReader;
import java.time.DateTimeException;
import java.time.LocalDate;
import java.time.LocalTime;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeParseException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Service for parsing EU eForms XML documents.
 * Extracts structured data from TED procurement notices.
 *
 * Uses XPath for navigation through the UBL 2.3 document structure with eForms extensions.
 * Lot parsing additionally walks direct DOM children first and only falls back to XPath
 * when the direct lookup yields nothing.
 *
 * NOTE(review): {@code @RequiredArgsConstructor} also generates a two-argument constructor
 * for the final factory fields; factories passed in that way are not hardened by this class.
 *
 * @author Martin.Schweitzer@procon.co.at and claude.ai
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class XmlParserService {

    // Namespace URIs for eForms/UBL documents
    private static final String NS_CN = "urn:oasis:names:specification:ubl:schema:xsd:ContractNotice-2";
    private static final String NS_CAC = "urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2";
    private static final String NS_CBC = "urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2";
    private static final String NS_EFAC = "http://data.europa.eu/p27/eforms-ubl-extension-aggregate-components/1";
    private static final String NS_EFBC = "http://data.europa.eu/p27/eforms-ubl-extension-basic-components/1";
    private static final String NS_EFEXT = "http://data.europa.eu/p27/eforms-ubl-extensions/1";
    private static final String NS_EXT = "urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2";

    // XPath prefixes shared by the notice-level lookups.
    // NOTE(review): only a root element whose local name is ContractNotice is matched, although
    // mapNoticeType also knows award / prior-information codes - confirm whether other roots must be supported.
    private static final String ROOT = "/*[local-name()='ContractNotice']";
    private static final String PROJECT = ROOT + "/cac:ProcurementProject";
    private static final String LOT_DISTRIBUTION = ROOT + "/cac:TenderingTerms/cac:LotDistribution";

    // Date followed by a negative offset, e.g. "2025-04-23-03:00"
    private static final Pattern DATE_WITH_NEGATIVE_OFFSET =
            Pattern.compile("\\d{4}-\\d{2}-\\d{2}-\\d{2}:\\d{2}.*");
    // Trailing negative offset, e.g. the "-03:00" of "12:00:00-03:00" or "2025-04-23-03:00"
    private static final Pattern NEGATIVE_OFFSET_SUFFIX = Pattern.compile("-\\d{2}:\\d{2}$");

    private final DocumentBuilderFactory documentBuilderFactory;
    private final XPathFactory xPathFactory;

    /**
     * Creates the service with a namespace-aware DOM factory that is hardened against
     * external entity / external DTD resolution (XXE), and a default XPath factory.
     */
    public XmlParserService() {
        this.documentBuilderFactory = DocumentBuilderFactory.newInstance();
        this.documentBuilderFactory.setNamespaceAware(true);
        hardenAgainstXxe(this.documentBuilderFactory);
        this.xPathFactory = XPathFactory.newInstance();
    }

    /**
     * Parse an eForms XML document and extract structured data.
     *
     * @param xmlContent The XML content as string
     * @return Populated ProcurementDocument entity (without ID or hash)
     * @throws XmlParsingException if the content cannot be parsed or evaluated
     */
    public ProcurementDocument parseDocument(String xmlContent) {
        try {
            DocumentBuilder builder = newDocumentBuilder();
            Document doc = builder.parse(new InputSource(new StringReader(xmlContent)));
            XPath xpath = newXPath();

            ProcurementDocument document = ProcurementDocument.builder()
                    .xmlDocument(xmlContent)
                    .build();

            // Parse basic notice information
            parseNoticeMetadata(doc, xpath, document);
            // Parse contracting party (buyer) information
            parseContractingParty(doc, xpath, document);
            // Parse procurement project information
            parseProcurementProject(doc, xpath, document);
            // Parse tendering process
            parseTenderingProcess(doc, xpath, document);
            // Parse organizations from extensions
            parseOrganizations(doc, xpath, document);
            // Parse lots
            parseLots(doc, xpath, document);
            // Parse publication information
            parsePublication(doc, xpath, document);

            // Generate text content for vectorization
            document.setTextContent(generateTextContent(document));

            return document;
        } catch (Exception e) {
            log.error("Error parsing XML document: {}", e.getMessage(), e);
            throw new XmlParsingException("Failed to parse XML document", e);
        }
    }

    /** Reads the notice-level header fields (versions, IDs, issue date/time, language, type, subtype). */
    private void parseNoticeMetadata(Document doc, XPath xpath, ProcurementDocument document)
            throws XPathExpressionException {
        // UBL Version
        document.setUblVersion(getTextContent(xpath, doc, ROOT + "/cbc:UBLVersionID"));
        // SDK Version (customization ID)
        document.setSdkVersion(getTextContent(xpath, doc, ROOT + "/cbc:CustomizationID"));
        // Notice ID
        document.setNoticeId(getTextContent(xpath, doc, ROOT + "/cbc:ID"));
        // Contract Folder ID
        document.setContractFolderId(getTextContent(xpath, doc, ROOT + "/cbc:ContractFolderID"));

        // Issue Date and Time - combined into single OffsetDateTime
        String issueDateStr = getTextContent(xpath, doc, ROOT + "/cbc:IssueDate");
        String issueTimeStr = getTextContent(xpath, doc, ROOT + "/cbc:IssueTime");
        if (issueDateStr != null) {
            document.setIssueDateTime(parseDateTime(issueDateStr, issueTimeStr));
        }

        // Notice Language
        document.setLanguageCode(getTextContent(xpath, doc, ROOT + "/cbc:NoticeLanguageCode"));

        // Notice Type Code
        String noticeTypeCode = getTextContent(xpath, doc, ROOT + "/cbc:NoticeTypeCode");
        document.setNoticeType(mapNoticeType(noticeTypeCode));

        // Regulatory Domain
        document.setRegulatoryDomain(getTextContent(xpath, doc, ROOT + "/cbc:RegulatoryDomain"));

        // Notice Subtype from extensions
        String subtypeCode = getTextContent(xpath, doc,
                "//efext:EformsExtension/efac:NoticeSubType/cbc:SubTypeCode");
        document.setNoticeSubtypeCode(subtypeCode);
    }

    /**
     * Reads the buyer's activity and legal type. Name and address of the buyer are filled in
     * later by {@link #parseOrganizations}.
     */
    private void parseContractingParty(Document doc, XPath xpath, ProcurementDocument document)
            throws XPathExpressionException {
        // Activity type
        document.setBuyerActivityType(getTextContent(xpath, doc,
                "//cac:ContractingParty/cac:ContractingActivity/cbc:ActivityTypeCode"));
        // Legal type
        document.setBuyerLegalType(getTextContent(xpath, doc,
                "//cac:ContractingParty/cac:ContractingPartyType/cbc:PartyTypeCode"));
    }

    /** Reads the notice-level procurement project: title, description, nature, CPV and location data. */
    private void parseProcurementProject(Document doc, XPath xpath, ProcurementDocument document)
            throws XPathExpressionException {
        // Project title
        document.setProjectTitle(getTextContent(xpath, doc, PROJECT + "/cbc:Name"));
        // Project description
        document.setProjectDescription(getTextContent(xpath, doc, PROJECT + "/cbc:Description"));
        // Internal reference
        document.setInternalReference(getTextContent(xpath, doc, PROJECT + "/cbc:ID"));

        // Contract nature
        String contractNature = getTextContent(xpath, doc, PROJECT + "/cbc:ProcurementTypeCode");
        document.setContractNature(mapContractNature(contractNature));

        // CPV codes: main classification first, then additional ones
        List<String> cpvCodes = getTextContents(xpath, doc,
                PROJECT + "/cac:MainCommodityClassification/cbc:ItemClassificationCode");
        cpvCodes.addAll(getTextContents(xpath, doc,
                PROJECT + "/cac:AdditionalCommodityClassification/cbc:ItemClassificationCode"));
        document.setCpvCodes(cpvCodes.toArray(new String[0]));

        // Location - country and NUTS codes
        document.setBuyerCountryCode(getTextContent(xpath, doc,
                PROJECT + "/cac:RealizedLocation/cac:Address/cac:Country/cbc:IdentificationCode"));
        document.setBuyerNutsCode(getTextContent(xpath, doc,
                PROJECT + "/cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode"));
        document.setBuyerCity(getTextContent(xpath, doc,
                PROJECT + "/cac:RealizedLocation/cac:Address/cbc:CityName"));

        // All NUTS codes from project and lots
        List<String> nutsCodes = getTextContents(xpath, doc,
                "//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode");
        document.setNutsCodes(nutsCodes.stream().distinct().toArray(String[]::new));
    }

    /** Reads the procedure type and the lot-distribution limits. */
    private void parseTenderingProcess(Document doc, XPath xpath, ProcurementDocument document)
            throws XPathExpressionException {
        // Procedure type
        String procedureCode = getTextContent(xpath, doc, ROOT + "/cac:TenderingProcess/cbc:ProcedureCode");
        document.setProcedureType(mapProcedureType(procedureCode));

        // Lot distribution; blank or malformed numbers are skipped instead of failing the document
        Integer maxLotsAwarded = parseIntOrNull(
                getTextContent(xpath, doc, LOT_DISTRIBUTION + "/cbc:MaximumLotsAwardedNumeric"),
                "MaximumLotsAwardedNumeric");
        if (maxLotsAwarded != null) {
            document.setMaxLotsAwarded(maxLotsAwarded);
        }

        Integer maxLotsSubmitted = parseIntOrNull(
                getTextContent(xpath, doc, LOT_DISTRIBUTION + "/cbc:MaximumLotsSubmittedNumeric"),
                "MaximumLotsSubmittedNumeric");
        if (maxLotsSubmitted != null) {
            document.setMaxLotsSubmitted(maxLotsSubmitted);
        }
    }

    /**
     * Reads every organization from the eForms extension and adds it to the document.
     * The organization referenced as "ORG-0001" supplies the buyer name and postal code, and
     * fills buyer country / city / NUTS code when the project location did not provide them.
     */
    private void parseOrganizations(Document doc, XPath xpath, ProcurementDocument document)
            throws XPathExpressionException {
        NodeList orgNodes = (NodeList) xpath.evaluate(
                "//efac:Organizations/efac:Organization", doc, XPathConstants.NODESET);

        boolean buyerInfoSet = false;

        for (int i = 0; i < orgNodes.getLength(); i++) {
            Node orgNode = orgNodes.item(i);
            Organization org = Organization.builder().build();

            // Organization reference
            org.setOrgReference(getTextContent(xpath, orgNode, ".//cac:PartyIdentification/cbc:ID"));

            // Name (never left null)
            org.setName(getTextContent(xpath, orgNode, ".//cac:PartyName/cbc:Name"));
            if (org.getName() == null) {
                org.setName("");
            }

            // Company ID
            org.setCompanyId(getTextContent(xpath, orgNode, ".//cac:PartyLegalEntity/cbc:CompanyID"));

            // Address
            org.setStreetName(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:StreetName"));
            org.setCity(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:CityName"));
            org.setPostalCode(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:PostalZone"));
            org.setNutsCode(getTextContent(xpath, orgNode, ".//cac:PostalAddress/cbc:CountrySubentityCode"));
            org.setCountryCode(getTextContent(xpath, orgNode,
                    ".//cac:PostalAddress/cac:Country/cbc:IdentificationCode"));

            // Contact
            org.setWebsiteUri(getTextContent(xpath, orgNode, ".//cbc:WebsiteURI"));
            org.setEmail(getTextContent(xpath, orgNode, ".//cac:Contact/cbc:ElectronicMail"));
            org.setPhone(getTextContent(xpath, orgNode, ".//cac:Contact/cbc:Telephone"));

            document.addOrganization(org);

            // Set buyer info from first organization (typically ORG-0001)
            if (!buyerInfoSet && "ORG-0001".equals(org.getOrgReference())) {
                document.setBuyerName(org.getName());
                if (document.getBuyerCountryCode() == null) {
                    document.setBuyerCountryCode(org.getCountryCode());
                }
                if (document.getBuyerCity() == null) {
                    document.setBuyerCity(org.getCity());
                }
                document.setBuyerPostalCode(org.getPostalCode());
                if (document.getBuyerNutsCode() == null) {
                    document.setBuyerNutsCode(org.getNutsCode());
                }
                buyerInfoSet = true;
            }
        }
    }

    /**
     * Applies best-effort XXE hardening: secure processing on, external general/parameter
     * entities off, external DTD loading off. DOCTYPE declarations themselves are still
     * accepted so that existing inputs keep parsing.
     */
    private static void hardenAgainstXxe(DocumentBuilderFactory factory) {
        setFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
        setFeature(factory, "http://xml.org/sax/features/external-general-entities", false);
        setFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
        setFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    }

    /** Sets one parser feature; an unsupported feature is logged and otherwise ignored. */
    private static void setFeature(DocumentBuilderFactory factory, String feature, boolean enabled) {
        try {
            factory.setFeature(feature, enabled);
        } catch (ParserConfigurationException e) {
            log.warn("XML parser does not support feature '{}': {}", feature, e.getMessage());
        }
    }

    /** Creates a namespace-aware builder; access to the shared factory is serialized. */
    private DocumentBuilder newDocumentBuilder() throws Exception {
        synchronized (documentBuilderFactory) {
            documentBuilderFactory.setNamespaceAware(true);
            return documentBuilderFactory.newDocumentBuilder();
        }
    }

    /** Creates an XPath bound to the eForms namespace prefixes; access to the shared factory is serialized. */
    private XPath newXPath() {
        synchronized (xPathFactory) {
            XPath xpath = xPathFactory.newXPath();
            xpath.setNamespaceContext(createNamespaceContext());
            return xpath;
        }
    }

    /**
     * Returns the trimmed text of the first node matching the expression, or {@code null} when
     * nothing matches. An existing but empty element yields an empty string.
     */
    private String getTextContent(XPath xpath, Object item, String expression)
            throws XPathExpressionException {
        Node node = (Node) xpath.evaluate(expression, item, XPathConstants.NODE);
        return node != null ? node.getTextContent().trim() : null;
    }

    /** Returns the first node matching the expression, or {@code null}. */
    private Node getNode(XPath xpath, Object item, String expression) throws XPathExpressionException {
        return (Node) xpath.evaluate(expression, item, XPathConstants.NODE);
    }

    /** Returns all nodes matching the expression. */
    private NodeList getNodes(XPath xpath, Object item, String expression) throws XPathExpressionException {
        return (NodeList) xpath.evaluate(expression, item, XPathConstants.NODESET);
    }

    /** Returns the first direct child element with the given namespace and local name, or {@code null}. */
    private Element getDirectChild(Element parent, String namespaceUri, String localName) {
        for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                Element el = (Element) child;
                if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) {
                    return el;
                }
            }
        }
        return null;
    }

    /** Returns all direct child elements with the given namespace and local name, in document order. */
    private List<Element> getDirectChildren(Element parent, String namespaceUri, String localName) {
        List<Element> result = new ArrayList<>();
        for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                Element el = (Element) child;
                if (localName.equals(el.getLocalName()) && namespaceUri.equals(el.getNamespaceURI())) {
                    result.add(el);
                }
            }
        }
        return result;
    }

    /** Returns the trimmed text of a direct child element, or {@code null} when absent or blank. */
    private String getDirectChildText(Element parent, String namespaceUri, String localName) {
        Element child = getDirectChild(parent, namespaceUri, localName);
        if (child == null) {
            return null;
        }
        return trimToNull(child.getTextContent());
    }

    /** Trims the value and maps {@code null} / empty results to {@code null}. */
    private String trimToNull(String value) {
        if (value == null) {
            return null;
        }
        String trimmed = value.trim();
        return trimmed.isEmpty() ? null : trimmed;
    }

    /**
     * Parses a trimmed integer; returns {@code null} (with a warning for malformed input)
     * when the value is absent, blank or not a valid integer.
     */
    private Integer parseIntOrNull(String value, String fieldName) {
        String trimmed = trimToNull(value);
        if (trimmed == null) {
            return null;
        }
        try {
            return Integer.parseInt(trimmed);
        } catch (NumberFormatException e) {
            log.warn("Invalid integer value '{}' for {}, skipping", trimmed, fieldName);
            return null;
        }
    }

    /**
     * Reads every {@code cac:ProcurementProjectLot}, adds it to the document and derives the
     * document-level lot count, first submission deadline and EU-funded flag.
     * Each field is looked up among direct DOM children first and via XPath as a fallback.
     */
    private void parseLots(Document doc, XPath xpath, ProcurementDocument document)
            throws XPathExpressionException {
        NodeList lotNodes = getNodes(xpath, doc, "//cac:ProcurementProjectLot");
        document.setTotalLots(lotNodes.getLength());

        for (int i = 0; i < lotNodes.getLength(); i++) {
            Node lotNode = lotNodes.item(i);
            if (lotNode.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element lotEl = (Element) lotNode;
            ProcurementLot lot = ProcurementLot.builder().build();

            // Fast direct children
            Element procurementProjectEl = getDirectChild(lotEl, NS_CAC, "ProcurementProject");
            Element tenderingProcessEl = getDirectChild(lotEl, NS_CAC, "TenderingProcess");
            Element tenderingTermsEl = getDirectChild(lotEl, NS_CAC, "TenderingTerms");

            lot.setLotId(childTextOrXPath(xpath, lotNode, lotEl, "ID", "cbc:ID"));
            lot.setInternalId(childTextOrXPath(xpath, lotNode, procurementProjectEl,
                    "ID", "cac:ProcurementProject/cbc:ID"));
            lot.setTitle(childTextOrXPath(xpath, lotNode, procurementProjectEl,
                    "Name", "cac:ProcurementProject/cbc:Name"));
            lot.setDescription(childTextOrXPath(xpath, lotNode, procurementProjectEl,
                    "Description", "cac:ProcurementProject/cbc:Description"));

            lot.setCpvCodes(collectLotCpvCodes(xpath, lotNode, procurementProjectEl).toArray(new String[0]));
            lot.setNutsCodes(collectLotNutsCodes(xpath, lotNode, procurementProjectEl).toArray(new String[0]));

            applyLotDuration(xpath, lotNode, procurementProjectEl, lot);
            applyLotDeadline(xpath, lotNode, tenderingProcessEl, lot, document);

            // EU funded: any funding program code other than a "no-eu-funds" one counts as funded
            String fundingProgramCode = childTextOrXPath(xpath, lotNode, tenderingTermsEl,
                    "FundingProgramCode", "cac:TenderingTerms/cbc:FundingProgramCode");
            lot.setEuFunded(fundingProgramCode != null && !fundingProgramCode.contains("no-eu-funds"));

            document.addLot(lot);
        }

        // Check if any lot is EU funded
        document.setEuFunded(document.getLots().stream().anyMatch(l -> Boolean.TRUE.equals(l.getEuFunded())));
    }

    /** Text of a direct {@code cbc} child of {@code parent}; falls back to the XPath relative to the lot. */
    private String childTextOrXPath(XPath xpath, Node lotNode, Element parent, String localName,
                                    String fallbackExpression) throws XPathExpressionException {
        String value = parent != null ? getDirectChildText(parent, NS_CBC, localName) : null;
        return value != null ? value : getTextContent(xpath, lotNode, fallbackExpression);
    }

    /** Main CPV codes of a lot: direct children first, any descendant classification as fallback. */
    private List<String> collectLotCpvCodes(XPath xpath, Node lotNode, Element procurementProjectEl)
            throws XPathExpressionException {
        List<String> codes = new ArrayList<>();
        if (procurementProjectEl != null) {
            for (Element mainCommodityEl
                    : getDirectChildren(procurementProjectEl, NS_CAC, "MainCommodityClassification")) {
                String cpv = getDirectChildText(mainCommodityEl, NS_CBC, "ItemClassificationCode");
                if (cpv != null) {
                    codes.add(cpv);
                }
            }
        }
        if (codes.isEmpty()) {
            addNonBlankTexts(codes, getNodes(xpath, lotNode,
                    ".//cac:MainCommodityClassification/cbc:ItemClassificationCode"));
        }
        return codes;
    }

    /** NUTS codes of a lot: direct realized locations first, any descendant location as fallback. */
    private List<String> collectLotNutsCodes(XPath xpath, Node lotNode, Element procurementProjectEl)
            throws XPathExpressionException {
        List<String> codes = new ArrayList<>();
        if (procurementProjectEl != null) {
            for (Element realizedLocationEl
                    : getDirectChildren(procurementProjectEl, NS_CAC, "RealizedLocation")) {
                Element addressEl = getDirectChild(realizedLocationEl, NS_CAC, "Address");
                if (addressEl != null) {
                    String nuts = getDirectChildText(addressEl, NS_CBC, "CountrySubentityCode");
                    if (nuts != null) {
                        codes.add(nuts);
                    }
                }
            }
        }
        if (codes.isEmpty()) {
            addNonBlankTexts(codes, getNodes(xpath, lotNode,
                    ".//cac:RealizedLocation/cac:Address/cbc:CountrySubentityCode"));
        }
        return codes;
    }

    /** Appends the trimmed, non-blank text of every node to the target list. */
    private void addNonBlankTexts(List<String> target, NodeList nodes) {
        for (int j = 0; j < nodes.getLength(); j++) {
            String text = trimToNull(nodes.item(j).getTextContent());
            if (text != null) {
                target.add(text);
            }
        }
    }

    /** Sets duration value and unit from the lot's {@code cbc:DurationMeasure}, if one is present. */
    private void applyLotDuration(XPath xpath, Node lotNode, Element procurementProjectEl, ProcurementLot lot)
            throws XPathExpressionException {
        Node durationNode = null;
        if (procurementProjectEl != null) {
            Element plannedPeriodEl = getDirectChild(procurementProjectEl, NS_CAC, "PlannedPeriod");
            if (plannedPeriodEl != null) {
                durationNode = getDirectChild(plannedPeriodEl, NS_CBC, "DurationMeasure");
            }
        }
        if (durationNode == null) {
            durationNode = getNode(xpath, lotNode,
                    "cac:ProcurementProject/cac:PlannedPeriod/cbc:DurationMeasure");
        }
        if (durationNode == null) {
            return;
        }

        String durationValue = trimToNull(durationNode.getTextContent());
        if (durationValue != null) {
            try {
                lot.setDurationValue(Double.parseDouble(durationValue));
            } catch (NumberFormatException e) {
                log.warn("Invalid duration value '{}' in lot {}, skipping", durationValue, lot.getLotId());
            }
        }
        if (durationNode instanceof Element durationEl) {
            String unitCode = trimToNull(durationEl.getAttribute("unitCode"));
            if (unitCode != null) {
                lot.setDurationUnit(unitCode);
            }
        }
    }

    /**
     * Sets the lot's submission deadline and, when the document has none yet, copies it to the
     * document so the first lot with a deadline defines the document-level value.
     */
    private void applyLotDeadline(XPath xpath, Node lotNode, Element tenderingProcessEl,
                                  ProcurementLot lot, ProcurementDocument document)
            throws XPathExpressionException {
        String endDate = null;
        String endTime = null;
        if (tenderingProcessEl != null) {
            Element deadlinePeriodEl = getDirectChild(tenderingProcessEl, NS_CAC, "TenderSubmissionDeadlinePeriod");
            if (deadlinePeriodEl != null) {
                endDate = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndDate");
                endTime = getDirectChildText(deadlinePeriodEl, NS_CBC, "EndTime");
            }
        }
        if (endDate == null) {
            endDate = getTextContent(xpath, lotNode,
                    "cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndDate");
            endTime = getTextContent(xpath, lotNode,
                    "cac:TenderingProcess/cac:TenderSubmissionDeadlinePeriod/cbc:EndTime");
        }
        if (endDate != null) {
            lot.setSubmissionDeadline(parseDateTime(endDate, endTime));
            if (document.getSubmissionDeadline() == null) {
                document.setSubmissionDeadline(lot.getSubmissionDeadline());
            }
        }
    }

    /** Reads publication ID, gazette ID and publication date (falling back to the requested date). */
    private void parsePublication(Document doc, XPath xpath, ProcurementDocument document)
            throws XPathExpressionException {
        // Publication ID (OJS notice ID)
        document.setPublicationId(getTextContent(xpath, doc, "//efac:Publication/efbc:NoticePublicationID"));
        // OJS ID (gazette ID)
        document.setOjsId(getTextContent(xpath, doc, "//efac:Publication/efbc:GazetteID"));

        // Publication date
        String pubDate = getTextContent(xpath, doc, "//efac:Publication/efbc:PublicationDate");
        if (pubDate != null) {
            document.setPublicationDate(parseDate(pubDate));
        }

        // Fallback to requested publication date
        if (document.getPublicationDate() == null) {
            String requestedPubDate = getTextContent(xpath, doc, ROOT + "/cbc:RequestedPublicationDate");
            if (requestedPubDate != null) {
                document.setPublicationDate(parseDate(requestedPubDate));
            }
        }
    }

    /**
     * Generate a textual representation for vectorization.
     */
    private String generateTextContent(ProcurementDocument document) {
        StringBuilder sb = new StringBuilder();

        // Title (most important)
        if (document.getProjectTitle() != null) {
            sb.append("Title: ").append(document.getProjectTitle()).append("\n\n");
        }

        // Description
        if (document.getProjectDescription() != null) {
            sb.append("Description: ").append(document.getProjectDescription()).append("\n\n");
        }

        // Buyer information
        if (document.getBuyerName() != null) {
            sb.append("Contracting Authority: ").append(document.getBuyerName());
            if (document.getBuyerCity() != null) {
                sb.append(", ").append(document.getBuyerCity());
            }
            if (document.getBuyerCountryCode() != null) {
                sb.append(" (").append(document.getBuyerCountryCode()).append(")");
            }
            sb.append("\n\n");
        }

        // Contract type and procedure
        if (document.getContractNature() != null) {
            sb.append("Contract Type: ").append(document.getContractNature()).append("\n");
        }
        if (document.getProcedureType() != null) {
            sb.append("Procedure: ").append(document.getProcedureType()).append("\n");
        }

        // CPV classification
        if (document.getCpvCodes() != null && document.getCpvCodes().length > 0) {
            sb.append("CPV Codes: ").append(String.join(", ", document.getCpvCodes())).append("\n");
        }

        // Lot information (only lots that have a title are listed)
        if (document.getLots() != null && !document.getLots().isEmpty()) {
            sb.append("\nLots (").append(document.getLots().size()).append("):\n");
            for (ProcurementLot lot : document.getLots()) {
                if (lot.getTitle() != null) {
                    sb.append("- ").append(lot.getLotId()).append(": ").append(lot.getTitle());
                    if (lot.getDescription() != null && !lot.getDescription().equals(lot.getTitle())) {
                        sb.append(" - ").append(lot.getDescription());
                    }
                    sb.append("\n");
                }
            }
        }

        return sb.toString().trim();
    }

    // Helper methods

    /** Returns the trimmed, non-empty text of every node matching the expression (mutable list). */
    private List<String> getTextContents(XPath xpath, Object item, String expression)
            throws XPathExpressionException {
        List<String> results = new ArrayList<>();
        NodeList nodes = getNodes(xpath, item, expression);
        for (int i = 0; i < nodes.getLength(); i++) {
            String text = nodes.item(i).getTextContent().trim();
            if (!text.isEmpty()) {
                results.add(text);
            }
        }
        return results;
    }

    /**
     * Parses the date part of an xsd:date-like value, ignoring a trailing zone designator
     * ("+02:00", "-03:00" or "Z"). Returns {@code null} for absent or unparsable input.
     */
    private LocalDate parseDate(String dateStr) {
        if (dateStr == null || dateStr.isEmpty()) {
            return null;
        }
        try {
            // Handle various date formats
            dateStr = dateStr.trim();

            // Handle datetime with dash separator (e.g. "2025-04-23-03:00")
            // Extract only the date part (first 10 characters: YYYY-MM-DD)
            if (DATE_WITH_NEGATIVE_OFFSET.matcher(dateStr).matches()) {
                dateStr = dateStr.substring(0, 10);
            }
            if (dateStr.contains("+")) {
                dateStr = dateStr.substring(0, dateStr.indexOf("+"));
            }
            if (dateStr.endsWith("Z")) {
                dateStr = dateStr.substring(0, dateStr.length() - 1);
            }
            return LocalDate.parse(dateStr);
        } catch (DateTimeParseException e) {
            log.warn("Failed to parse date: {} . Error: {}", dateStr, e.getMessage());
            return null;
        }
    }

    /**
     * Parses the local time part of an xsd:time-like value, ignoring a trailing zone designator
     * ("+02:00", "-03:00" or "Z"). Returns {@code null} for absent or unparsable input.
     */
    private LocalTime parseTime(String timeStr) {
        if (timeStr == null || timeStr.isEmpty()) {
            return null;
        }
        try {
            timeStr = timeStr.trim();

            // Strip a trailing negative offset (e.g. "12:00:00-03:00", "12:00:00.000-03:00");
            // a time value contains no other '-' so the suffix match is unambiguous
            Matcher negativeOffset = NEGATIVE_OFFSET_SUFFIX.matcher(timeStr);
            if (negativeOffset.find()) {
                timeStr = timeStr.substring(0, negativeOffset.start());
            }
            if (timeStr.contains("+")) {
                timeStr = timeStr.substring(0, timeStr.indexOf("+"));
            }
            if (timeStr.endsWith("Z")) {
                timeStr = timeStr.substring(0, timeStr.length() - 1);
            }
            return LocalTime.parse(timeStr);
        } catch (DateTimeParseException e) {
            log.warn("Failed to parse time: {} . Error: {}", timeStr, e.getMessage());
            return null;
        }
    }

    /**
     * Combines a date and an optional time into an {@link OffsetDateTime}.
     * A missing or unparsable time becomes midnight. The offset is taken from the date string
     * if it carries one, otherwise from the time string, otherwise UTC is assumed.
     *
     * @return the combined value, or {@code null} when the date cannot be parsed
     */
    private OffsetDateTime parseDateTime(String dateStr, String timeStr) {
        LocalDate date = parseDate(dateStr);
        if (date == null) {
            return null;
        }

        LocalTime time = timeStr != null ? parseTime(timeStr) : LocalTime.MIDNIGHT;
        if (time == null) {
            time = LocalTime.MIDNIGHT;
        }

        ZoneOffset offset = extractOffset(dateStr);
        if (offset == null) {
            offset = extractOffset(timeStr);
        }
        if (offset == null) {
            offset = ZoneOffset.UTC;
        }

        return OffsetDateTime.of(date, time, offset);
    }

    /**
     * Extracts a zone offset from the end of a date or time string: everything from the first
     * '+', a trailing "Z", or a trailing "-hh:mm". Returns {@code null} when none is present or
     * the designator is not a valid offset.
     */
    private ZoneOffset extractOffset(String value) {
        if (value == null) {
            return null;
        }
        String text = value.trim();
        String designator = null;
        int plus = text.indexOf('+');
        if (plus >= 0) {
            designator = text.substring(plus);
        } else if (text.endsWith("Z")) {
            designator = "Z";
        } else {
            Matcher negativeOffset = NEGATIVE_OFFSET_SUFFIX.matcher(text);
            if (negativeOffset.find()) {
                designator = negativeOffset.group();
            }
        }
        if (designator == null) {
            return null;
        }
        try {
            return ZoneOffset.of(designator);
        } catch (DateTimeException e) {
            log.debug("Ignoring invalid zone offset '{}' in '{}'", designator, text);
            return null;
        }
    }

    /** Maps an eForms notice type code to {@link NoticeType}; unknown or missing codes map to OTHER. */
    private NoticeType mapNoticeType(String code) {
        if (code == null) {
            return NoticeType.OTHER;
        }
        // Locale.ROOT keeps the mapping independent of the JVM default locale
        return switch (code.toLowerCase(Locale.ROOT)) {
            case "cn-standard", "cn-social", "cn-defence" -> NoticeType.CONTRACT_NOTICE;
            case "pin-only", "pin-rtl", "pin-cfc-standard" -> NoticeType.PRIOR_INFORMATION_NOTICE;
            case "can-standard", "can-social", "can-modif" -> NoticeType.CONTRACT_AWARD_NOTICE;
            default -> NoticeType.OTHER;
        };
    }

    /** Maps a procurement type code to {@link ContractNature}; unknown or missing codes map to UNKNOWN. */
    private ContractNature mapContractNature(String code) {
        if (code == null) {
            return ContractNature.UNKNOWN;
        }
        return switch (code.toLowerCase(Locale.ROOT)) {
            case "supplies" -> ContractNature.SUPPLIES;
            case "services" -> ContractNature.SERVICES;
            case "works" -> ContractNature.WORKS;
            case "mixed" -> ContractNature.MIXED;
            default -> ContractNature.UNKNOWN;
        };
    }

    /** Maps a procedure code to {@link ProcedureType}; unknown or missing codes map to OTHER. */
    private ProcedureType mapProcedureType(String code) {
        if (code == null) {
            return ProcedureType.OTHER;
        }
        return switch (code.toLowerCase(Locale.ROOT)) {
            case "open" -> ProcedureType.OPEN;
            case "restricted" -> ProcedureType.RESTRICTED;
            case "comp-dial" -> ProcedureType.COMPETITIVE_DIALOGUE;
            case "innovation" -> ProcedureType.INNOVATION_PARTNERSHIP;
            case "neg-wo-pub" -> ProcedureType.NEGOTIATED_WITHOUT_PUBLICATION;
            case "neg-w-pub" -> ProcedureType.NEGOTIATED_WITH_PUBLICATION;
            default -> ProcedureType.OTHER;
        };
    }

    /**
     * Builds the prefix-to-URI mapping used by all XPath expressions in this class.
     * Only prefix resolution is implemented; reverse lookups return nothing.
     */
    private NamespaceContext createNamespaceContext() {
        return new NamespaceContext() {
            @Override
            public String getNamespaceURI(String prefix) {
                if (prefix == null) {
                    throw new IllegalArgumentException("prefix must not be null");
                }
                return switch (prefix) {
                    case "cn" -> NS_CN;
                    case "cac" -> NS_CAC;
                    case "cbc" -> NS_CBC;
                    case "efac" -> NS_EFAC;
                    case "efbc" -> NS_EFBC;
                    case "efext" -> NS_EFEXT;
                    case "ext" -> NS_EXT;
                    default -> null;
                };
            }

            @Override
            public String getPrefix(String namespaceURI) {
                return null;
            }

            @Override
            public Iterator<String> getPrefixes(String namespaceURI) {
                return Collections.emptyIterator();
            }
        };
    }

    /**
     * Exception thrown when XML parsing fails.
     */
    public static class XmlParsingException extends RuntimeException {
        public XmlParsingException(String message, Throwable cause) {
            super(message, cause);
        }
    }
}