package at.procon.ted.camel; import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.service.attachment.AttachmentExtractor; import at.procon.ted.service.attachment.AttachmentProcessingService; import jakarta.mail.BodyPart; import jakarta.mail.Message; import jakarta.mail.Multipart; import jakarta.mail.Part; import jakarta.mail.Session; import jakarta.mail.internet.MimeMessage; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.camel.Exchange; import org.apache.camel.LoggingLevel; import org.apache.camel.builder.RouteBuilder; import org.jsoup.Jsoup; import org.springframework.stereotype.Component; import java.io.ByteArrayInputStream; import java.io.File; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.Properties; /** * Apache Camel route for IMAP mail processing. * * Features: * - IMAP connection with SSL/TLS to mail server * - MIME message decoding * - Asynchronous attachment processing with idempotency * - PDF text extraction * - ZIP file extraction with recursive processing * - HTML to plain text conversion * * @author Martin.Schweitzer@procon.co.at and claude.ai */ @Component @RequiredArgsConstructor @Slf4j public class MailRoute extends RouteBuilder { private static final String ROUTE_ID_IMAP = "mail-imap-consumer"; private static final String ROUTE_ID_MIME_FILE = "mail-mime-file-consumer"; private static final String ROUTE_ID_MIME = "mail-mime-decoder"; private static final String ROUTE_ID_ATTACHMENT = "mail-attachment-processor"; private static final String ROUTE_ID_ATTACHMENT_ASYNC = "mail-attachment-async"; private final TedProcessorProperties properties; private final AttachmentProcessingService attachmentProcessingService; @Override public void configure() throws Exception { TedProcessorProperties.MailProperties mail = properties.getMail(); if (!mail.isEnabled()) { log.info("Mail processing is disabled, skipping route configuration"); return; } log.info("Configuring mail routes (host={}, port={}, ssl={}, user={})", mail.getHost(), mail.getPort(), mail.isSsl(), mail.getUsername()); // Ensure attachment output directory exists File attachmentDir = new File(mail.getAttachmentOutputDirectory()); if (!attachmentDir.exists()) { attachmentDir.mkdirs(); log.info("Created attachment output directory: {}", attachmentDir.getAbsolutePath()); } // Error handler for mail processing errorHandler(deadLetterChannel("direct:mail-error-handler") .maximumRedeliveries(3) .redeliveryDelay(5000) .retryAttemptedLogLevel(LoggingLevel.WARN) .logStackTrace(true)); // Mail error handler route from("direct:mail-error-handler") .routeId("mail-error-handler") .process(exchange -> { Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class); String subject = exchange.getIn().getHeader("mailSubject", String.class); if (exception != null) { log.error("Mail processing error for subject '{}': {}", subject, exception.getMessage(), exception); } }) .log(LoggingLevel.ERROR, "Mail processing failed: ${exception.message}"); // IMAP consumer route from(buildImapUri()) .routeId(ROUTE_ID_IMAP) .log(LoggingLevel.INFO, "Received email: ${header.subject} from ${header.from}") .to("direct:mime"); // MIME file consumer route - reads .eml files from directory if (mail.isMimeInputEnabled()) { configureMimeFileConsumer(mail); } // MIME decoder route - decodes the email and extracts content/attachments from("direct:mime") .routeId(ROUTE_ID_MIME) .process(exchange -> { Message mailMessage = exchange.getIn().getBody(Message.class); if (mailMessage == null) { log.warn("Received null mail message, skipping"); return; } String subject = mailMessage.getSubject(); String from = mailMessage.getFrom() != null && mailMessage.getFrom().length > 0 ? mailMessage.getFrom()[0].toString() : "unknown"; log.info("Processing MIME message: subject='{}', from='{}'", subject, from); // Store mail metadata in headers exchange.getIn().setHeader("mailSubject", subject); exchange.getIn().setHeader("mailFrom", from); exchange.getIn().setHeader("mailReceivedDate", mailMessage.getReceivedDate()); // Process the content List attachments = new ArrayList<>(); StringBuilder textContent = new StringBuilder(); StringBuilder htmlContent = new StringBuilder(); processMessageContent(mailMessage, textContent, htmlContent, attachments); // Convert HTML to plain text if we have HTML but no plain text String finalTextContent; if (textContent.length() == 0 && htmlContent.length() > 0) { finalTextContent = convertHtmlToText(htmlContent.toString()); log.debug("Converted HTML mail to plain text ({} chars)", finalTextContent.length()); } else { finalTextContent = textContent.toString(); } // Store results exchange.getIn().setHeader("mailTextContent", finalTextContent); exchange.getIn().setHeader("mailHtmlContent", htmlContent.toString()); exchange.getIn().setHeader("mailAttachments", attachments); exchange.getIn().setHeader("mailAttachmentCount", attachments.size()); log.info("MIME decoded: subject='{}', textLength={}, htmlLength={}, attachments={}", subject, finalTextContent.length(), htmlContent.length(), attachments.size()); }) // Queue attachments for async processing .choice() .when(simple("${header.mailAttachmentCount} > 0")) .log(LoggingLevel.INFO, "Queueing ${header.mailAttachmentCount} attachments for async processing") .otherwise() .log(LoggingLevel.DEBUG, "No attachments in email: ${header.mailSubject}") .end() // Process attachments asynchronously via SEDA .filter(simple("${header.mailAttachmentCount} > 0")) .split(header("mailAttachments")) .to("seda:attachment-async?waitForTaskToComplete=Never&size=500") .end() .end() .log(LoggingLevel.INFO, "Mail processing completed: ${header.mailSubject}"); // Async attachment processor route via SEDA from("seda:attachment-async?concurrentConsumers=2&size=500") .routeId(ROUTE_ID_ATTACHMENT_ASYNC) .to("direct:attachment"); // Attachment processor route - handles individual attachments with idempotency from("direct:attachment") .routeId(ROUTE_ID_ATTACHMENT) .process(exchange -> { AttachmentInfo attachment = exchange.getIn().getBody(AttachmentInfo.class); if (attachment == null) { log.warn("Received null attachment info, skipping"); return; } String mailSubject = exchange.getIn().getHeader("mailSubject", String.class); String mailFrom = exchange.getIn().getHeader("mailFrom", String.class); String parentHash = exchange.getIn().getHeader("parentHash", String.class); log.info("Processing attachment: '{}' ({} bytes, type={}) from email '{}'", attachment.getFilename(), attachment.getSize(), attachment.getContentType(), mailSubject); // Process attachment with idempotency check AttachmentProcessingService.ProcessingResult result = attachmentProcessingService.processAttachment( attachment.getData(), attachment.getFilename(), attachment.getContentType(), mailSubject, mailFrom, parentHash ); if (result.isDuplicate()) { log.info("Attachment is duplicate, skipping: '{}'", attachment.getFilename()); exchange.setProperty("isDuplicate", true); return; } if (!result.isSuccess()) { log.warn("Attachment processing failed: '{}' - {}", attachment.getFilename(), result.errorMessage()); return; } // Store result in exchange exchange.getIn().setHeader("attachmentId", result.attachment().getId()); exchange.getIn().setHeader("attachmentHash", result.attachment().getContentHash()); exchange.getIn().setHeader("extractedText", result.attachment().getExtractedText() != null ? result.attachment().getExtractedText().length() + " chars" : "none"); // Queue child attachments (from ZIP) for recursive processing if (result.hasChildren()) { log.info("Queueing {} child attachments from ZIP '{}'", result.childAttachments().size(), attachment.getFilename()); for (AttachmentExtractor.ChildAttachment child : result.childAttachments()) { // Create AttachmentInfo for child and send to SEDA queue AttachmentInfo childInfo = new AttachmentInfo( child.filename(), child.contentType(), child.data(), child.data().length ); // Send to SEDA for async processing with parent hash getContext().createProducerTemplate().sendBodyAndHeaders( "seda:attachment-async?waitForTaskToComplete=Never", childInfo, java.util.Map.of( "mailSubject", mailSubject != null ? mailSubject : "", "mailFrom", mailFrom != null ? mailFrom : "", "parentHash", result.attachment().getContentHash(), "pathInArchive", child.pathInArchive() ) ); } } }) .choice() .when(exchangeProperty("isDuplicate").isEqualTo(true)) .log(LoggingLevel.DEBUG, "Skipped duplicate attachment") .otherwise() .log(LoggingLevel.INFO, "Attachment processed: ${header.attachmentId}, extracted=${header.extractedText}") .end(); } /** * Configure the MIME file consumer route. */ private void configureMimeFileConsumer(TedProcessorProperties.MailProperties mail) throws Exception { // Ensure MIME input directory exists File mimeInputDir = new File(mail.getMimeInputDirectory()); if (!mimeInputDir.exists()) { mimeInputDir.mkdirs(); log.info("Created MIME input directory: {}", mimeInputDir.getAbsolutePath()); } String mimeFileUri = buildMimeFileUri(mail); log.info("Configuring MIME file consumer: {}", mimeFileUri); // MIME file consumer - reads .eml files and sends to direct:mime from(mimeFileUri) .routeId(ROUTE_ID_MIME_FILE) .log(LoggingLevel.INFO, "Reading MIME file: ${header.CamelFileName}") .process(exchange -> { // Read file content as bytes byte[] fileContent = exchange.getIn().getBody(byte[].class); String filename = exchange.getIn().getHeader(Exchange.FILE_NAME, String.class); if (fileContent == null || fileContent.length == 0) { log.warn("Empty MIME file: {}", filename); throw new RuntimeException("Empty MIME file: " + filename); } log.debug("Parsing MIME file: {} ({} bytes)", filename, fileContent.length); // Parse the file as a MimeMessage Session session = Session.getDefaultInstance(new Properties()); try (ByteArrayInputStream bais = new ByteArrayInputStream(fileContent)) { MimeMessage mimeMessage = new MimeMessage(session, bais); // Set the parsed message as body for direct:mime exchange.getIn().setBody(mimeMessage); log.info("Parsed MIME file: {} -> subject='{}'", filename, mimeMessage.getSubject()); } }) .to("direct:mime") .log(LoggingLevel.INFO, "MIME file processed successfully: ${header.CamelFileName}"); } /** * Build the file URI for MIME file consumer. */ private String buildMimeFileUri(TedProcessorProperties.MailProperties mail) { String directory = mail.getMimeInputDirectory().replace("\\", "/"); StringBuilder uri = new StringBuilder("file:"); uri.append(directory); uri.append("?"); // File pattern uri.append("include=").append(mail.getMimeInputPattern()); // Polling interval uri.append("&delay=").append(mail.getMimeInputPollInterval()); // Move to .processed after successful processing uri.append("&move=.processed"); // Move to .error on failure uri.append("&moveFailed=.error"); // Read lock to prevent processing incomplete files uri.append("&readLock=changed"); uri.append("&readLockCheckInterval=1000"); uri.append("&readLockTimeout=30000"); // Sort by name for consistent ordering uri.append("&sortBy=file:name"); // Don't process hidden files uri.append("&exclude=^\\..*"); // Recursive scanning disabled by default uri.append("&recursive=false"); return uri.toString(); } /** * Build the IMAP URI for the mail consumer. */ private String buildImapUri() { TedProcessorProperties.MailProperties mail = properties.getMail(); StringBuilder uri = new StringBuilder(); uri.append(mail.isSsl() ? "imaps://" : "imap://"); uri.append(mail.getHost()); uri.append(":").append(mail.getPort()); uri.append("?username=").append(encodeUriComponent(mail.getUsername())); uri.append("&password=").append(encodeUriComponent(mail.getPassword())); uri.append("&folderName=").append(mail.getFolderName()); uri.append("&delete=").append(mail.isDelete()); // peek=false means messages will be marked as SEEN after fetch // peek=true means messages will NOT be marked as SEEN (peek only) uri.append("&peek=").append(!mail.isSeen()); uri.append("&unseen=").append(mail.isUnseen()); uri.append("&delay=").append(mail.getDelay()); uri.append("&maxMessagesPerPoll=").append(mail.getMaxMessagesPerPoll()); // Connection settings uri.append("&connectionTimeout=30000"); uri.append("&fetchSize=-1"); // Fetch entire message uri.append("&debugMode=false"); log.info("IMAP URI configured (password hidden): {}://{}:{}?username={}&folderName={}", mail.isSsl() ? "imaps" : "imap", mail.getHost(), mail.getPort(), mail.getUsername(), mail.getFolderName()); return uri.toString(); } /** * URL-encode a URI component. */ private String encodeUriComponent(String value) { if (value == null) return ""; try { return java.net.URLEncoder.encode(value, StandardCharsets.UTF_8); } catch (Exception e) { return value; } } /** * Recursively process message content to extract text, HTML, and attachments. */ private void processMessageContent(Part part, StringBuilder textContent, StringBuilder htmlContent, List attachments) throws Exception { String contentType = part.getContentType().toLowerCase(); String disposition = part.getDisposition(); // Check if this is an attachment if (disposition != null && (disposition.equalsIgnoreCase(Part.ATTACHMENT) || disposition.equalsIgnoreCase(Part.INLINE))) { extractAttachment(part, attachments); return; } Object content = part.getContent(); if (content instanceof Multipart multipart) { // Process each part of the multipart message for (int i = 0; i < multipart.getCount(); i++) { BodyPart bodyPart = multipart.getBodyPart(i); processMessageContent(bodyPart, textContent, htmlContent, attachments); } } else if (contentType.contains("text/plain")) { // Plain text content String text = content.toString(); textContent.append(text); } else if (contentType.contains("text/html")) { // HTML content String html = content.toString(); htmlContent.append(html); } else if (part.getFileName() != null) { // Has filename - treat as attachment extractAttachment(part, attachments); } } /** * Extract attachment data from a message part. */ private void extractAttachment(Part part, List attachments) throws Exception { String filename = part.getFileName(); if (filename == null) { filename = "unnamed_attachment"; } // Decode filename if necessary (might be MIME-encoded) try { filename = jakarta.mail.internet.MimeUtility.decodeText(filename); } catch (Exception e) { log.debug("Could not decode filename: {}", filename); } String contentType = part.getContentType(); // Read attachment data byte[] data; try (InputStream is = part.getInputStream()) { data = is.readAllBytes(); } AttachmentInfo info = new AttachmentInfo(filename, contentType, data, data.length); attachments.add(info); log.debug("Extracted attachment: '{}' ({} bytes, type={})", filename, data.length, contentType); } /** * Convert HTML content to plain text using JSoup. */ private String convertHtmlToText(String html) { if (html == null || html.isBlank()) { return ""; } try { // Parse HTML and extract text org.jsoup.nodes.Document doc = Jsoup.parse(html); // Remove script and style elements doc.select("script, style").remove(); // Get text with whitespace preservation String text = doc.text(); // Clean up excessive whitespace text = text.replaceAll("\\s+", " ").trim(); return text; } catch (Exception e) { log.warn("Failed to convert HTML to text: {}", e.getMessage()); // Fallback: strip HTML tags with regex return html.replaceAll("<[^>]+>", " ").replaceAll("\\s+", " ").trim(); } } /** * DTO for attachment information. */ @lombok.Data @lombok.AllArgsConstructor public static class AttachmentInfo { private String filename; private String contentType; private byte[] data; private int size; } }