package at.procon.ted.service.attachment;

import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

/**
 * Service for extracting files from ZIP archives.
 * Extracts all contained files as child attachments for recursive processing.
 *
 * <p>The archive is read entirely in memory. Extraction is bounded by the limits declared
 * below (entry count, per-entry size, total size, entry path length) so that a hostile
 * archive cannot exhaust the heap.
 *
 * @author Martin.Schweitzer@procon.co.at and claude.ai
 */
@Service
@Slf4j
public class ZipExtractionService implements AttachmentExtractor {

    private static final String DEFAULT_CONTENT_TYPE = "application/octet-stream";

    private static final Set<String> SUPPORTED_EXTENSIONS = Set.of("zip");

    private static final Set<String> SUPPORTED_MIME_TYPES = Set.of(
            "application/zip",
            "application/x-zip",
            "application/x-zip-compressed",
            "application/octet-stream" // Often used for ZIP files
    );

    /**
     * MIME types that unambiguously denote a ZIP archive. Used by {@link #canHandle} instead of
     * {@link #SUPPORTED_MIME_TYPES} because "application/octet-stream" says nothing about the
     * actual format.
     */
    private static final Set<String> ZIP_MIME_TYPES = Set.of(
            "application/zip",
            "application/x-zip",
            "application/x-zip-compressed"
    );

    /** Content types for common file extensions (keys are lower-case, without the dot). */
    private static final Map<String, String> CONTENT_TYPE_BY_EXTENSION = Map.ofEntries(
            Map.entry("pdf", "application/pdf"),
            Map.entry("xml", "application/xml"),
            Map.entry("zip", "application/zip"),
            Map.entry("txt", "text/plain"),
            Map.entry("html", "text/html"),
            Map.entry("htm", "text/html"),
            Map.entry("json", "application/json"),
            Map.entry("doc", "application/msword"),
            Map.entry("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
            Map.entry("xls", "application/vnd.ms-excel"),
            Map.entry("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
            Map.entry("png", "image/png"),
            Map.entry("jpg", "image/jpeg"),
            Map.entry("jpeg", "image/jpeg")
    );

    // Security limits
    private static final long MAX_TOTAL_SIZE = 500L * 1024 * 1024;       // 500 MB total extracted size
    private static final long MAX_SINGLE_FILE_SIZE = 100L * 1024 * 1024; // 100 MB per file
    private static final int MAX_FILES = 1000;                           // Maximum number of files in archive
    private static final int MAX_PATH_LENGTH = 500;                      // Maximum path length

    private static final int READ_BUFFER_SIZE = 8192;

    @Override
    public Set<String> getSupportedExtensions() {
        return SUPPORTED_EXTENSIONS;
    }

    @Override
    public Set<String> getSupportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }

    /**
     * Decides whether this extractor is responsible for an attachment.
     *
     * <p>An attachment is accepted when its filename ends with a supported extension, or when
     * its content type (parameters such as {@code ; charset=...} are ignored) is one of the
     * explicit ZIP media types. The generic "application/octet-stream" type and other
     * compression formats whose names merely contain "zip" (gzip, bzip2, ...) are not accepted
     * on content type alone.
     *
     * @param filename    attachment filename, may be {@code null}
     * @param contentType attachment content type, may be {@code null}
     * @return {@code true} if the attachment looks like a ZIP archive
     */
    @Override
    public boolean canHandle(String filename, String contentType) {
        if (filename != null) {
            // Locale.ROOT: the default locale must not influence extension matching
            // (e.g. Turkish lower-cases 'I' to a dotless 'ı').
            String lowerFilename = filename.toLowerCase(Locale.ROOT);
            if (SUPPORTED_EXTENSIONS.stream().anyMatch(ext -> lowerFilename.endsWith("." + ext))) {
                return true;
            }
        }

        // Only use MIME type if it's explicitly zip, not application/octet-stream
        if (contentType != null) {
            String lowerContentType = contentType.toLowerCase(Locale.ROOT);
            // indexOf instead of split(";")[0]: split returns an empty array for input made up
            // solely of ';' characters, which would throw on index 0.
            int parametersStart = lowerContentType.indexOf(';');
            String mediaType = parametersStart >= 0
                    ? lowerContentType.substring(0, parametersStart)
                    : lowerContentType;
            return ZIP_MIME_TYPES.contains(mediaType.trim());
        }

        return false;
    }

    /**
     * Extracts every regular file entry of the given ZIP archive as a child attachment.
     *
     * <p>Directories, entries with suspicious or overlong paths and entries larger than the
     * per-file limit are skipped. Extraction stops early (still reporting success with the
     * children collected so far) once the file-count or total-size limit is reached.
     *
     * @param data        raw bytes of the ZIP archive
     * @param filename    name of the archive, used for logging only
     * @param contentType content type of the archive (not used by this implementation)
     * @return a success result carrying the extracted children, or a failure result if the data
     *         is empty or reading the archive throws
     */
    @Override
    public ExtractionResult extract(byte[] data, String filename, String contentType) {
        if (data == null || data.length == 0) {
            return ExtractionResult.failure("Empty ZIP data");
        }

        log.debug("Extracting files from ZIP: {} ({} bytes)", filename, data.length);

        List<ChildAttachment> children = new ArrayList<>();
        long totalExtractedSize = 0;
        int fileCount = 0;
        // One buffer is reused for all entries.
        byte[] buffer = new byte[READ_BUFFER_SIZE];

        try (ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(data))) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                // Security check: skip directories
                if (entry.isDirectory()) {
                    zis.closeEntry();
                    continue;
                }

                String entryName = entry.getName();

                // Security check: path traversal protection
                if (isUnsafeEntryName(entryName)) {
                    log.warn("Skipping potentially malicious ZIP entry: {}", entryName);
                    zis.closeEntry();
                    continue;
                }

                // Security check: path length
                if (entryName.length() > MAX_PATH_LENGTH) {
                    log.warn("Skipping ZIP entry with too long path: {}", entryName.substring(0, 100) + "...");
                    zis.closeEntry();
                    continue;
                }

                // Security check: maximum files
                if (fileCount >= MAX_FILES) {
                    log.warn("ZIP file contains too many files, stopping at {} files", MAX_FILES);
                    break;
                }

                // Read entry content. Sizes are checked while reading because the size declared
                // in the entry header cannot be trusted (and may be unknown).
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                long entrySize = 0;
                int len;
                while ((len = zis.read(buffer)) > 0) {
                    entrySize += len;

                    // Security check: single file size
                    if (entrySize > MAX_SINGLE_FILE_SIZE) {
                        log.warn("Skipping ZIP entry exceeding max file size: {} (> {} MB)",
                                entryName, MAX_SINGLE_FILE_SIZE / 1024 / 1024);
                        break;
                    }

                    // Security check: total extracted size (zip bomb protection)
                    if (totalExtractedSize + entrySize > MAX_TOTAL_SIZE) {
                        log.warn("ZIP extraction stopped: total extracted size exceeds limit ({} MB)",
                                MAX_TOTAL_SIZE / 1024 / 1024);
                        return ExtractionResult.successWithChildren(children);
                    }

                    baos.write(buffer, 0, len);
                }

                // The read loop was aborted for an oversized entry: drop it and move on.
                if (entrySize > MAX_SINGLE_FILE_SIZE) {
                    zis.closeEntry();
                    continue;
                }

                byte[] entryData = baos.toByteArray();
                totalExtractedSize += entryData.length;
                fileCount++;

                // Determine content type from filename
                String childContentType = guessContentType(entryName);

                // Extract just the filename from the path
                String childFilename = extractFilename(entryName);

                ChildAttachment child = new ChildAttachment(
                        childFilename,
                        childContentType,
                        entryData,
                        entryName
                );
                children.add(child);

                log.debug("Extracted from ZIP: {} ({} bytes, type={})",
                        entryName, entryData.length, childContentType);

                zis.closeEntry();
            }

            log.info("Successfully extracted {} files ({} bytes total) from ZIP: {}",
                    children.size(), totalExtractedSize, filename);

            return ExtractionResult.successWithChildren(children);

        } catch (Exception e) {
            // Boundary catch: corrupt archives surface as various exception types; all of them
            // are reported to the caller as a failed extraction instead of propagating.
            log.error("Failed to extract ZIP '{}': {}", filename, e.getMessage(), e);
            return ExtractionResult.failure("ZIP extraction failed: " + e.getMessage());
        }
    }

    /**
     * Tells whether an entry name looks like a path traversal attempt: it contains ".." or
     * starts with a path separator.
     */
    private static boolean isUnsafeEntryName(String entryName) {
        return entryName.contains("..")
                || entryName.startsWith("/")
                || entryName.startsWith("\\");
    }

    /**
     * Guess the MIME content type from a filename.
     *
     * <p>Common extensions are resolved from a fixed table; anything else is delegated to
     * {@link URLConnection#guessContentTypeFromName(String)}, with "application/octet-stream"
     * as the final fallback.
     */
    private String guessContentType(String filename) {
        if (filename == null) {
            return DEFAULT_CONTENT_TYPE;
        }

        // Common types, looked up by the text after the last dot (locale-independent).
        int lastDot = filename.lastIndexOf('.');
        if (lastDot >= 0) {
            String extension = filename.substring(lastDot + 1).toLowerCase(Locale.ROOT);
            String known = CONTENT_TYPE_BY_EXTENSION.get(extension);
            if (known != null) {
                return known;
            }
        }

        // Try to guess from URLConnection
        String guessed = URLConnection.guessContentTypeFromName(filename);
        return guessed != null ? guessed : DEFAULT_CONTENT_TYPE;
    }

    /**
     * Extract just the filename from a path (handles both / and \ separators).
     *
     * <p>Returns "unnamed" for {@code null}. A path without a separator, or one that ends with
     * a separator, is returned unchanged.
     */
    private String extractFilename(String path) {
        if (path == null) {
            return "unnamed";
        }
        int lastSlash = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        if (lastSlash >= 0 && lastSlash < path.length() - 1) {
            return path.substring(lastSlash + 1);
        }
        return path;
    }
}