DIP/src/main/java/at/procon/ted/service/attachment/ZipExtractionService.java

package at.procon.ted.service.attachment;

import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

/**
 * Service for extracting files from ZIP archives.
 * Extracts all contained files as child attachments for recursive processing.
 *
 * <p>The archive is read entirely in memory. Extraction is bounded by the
 * security limits declared below (entry count, per-entry size, total size,
 * path length); entries violating a per-entry limit are skipped, while hitting
 * a global limit stops extraction and returns what was extracted so far.
 *
 * @author Martin.Schweitzer@procon.co.at and claude.ai
 */
@Service
@Slf4j
public class ZipExtractionService implements AttachmentExtractor {

    private static final Set<String> SUPPORTED_EXTENSIONS = Set.of("zip");

    // Note: application/octet-stream is advertised here, but canHandle() does not
    // accept an attachment on that generic type alone.
    private static final Set<String> SUPPORTED_MIME_TYPES = Set.of(
            "application/zip",
            "application/x-zip",
            "application/x-zip-compressed",
            "application/octet-stream" // Often used for ZIP files
    );

    // Security limits
    private static final long MAX_TOTAL_SIZE = 500L * 1024 * 1024; // 500 MB total extracted size
    private static final long MAX_SINGLE_FILE_SIZE = 100L * 1024 * 1024; // 100 MB per file
    private static final int MAX_FILES = 1000; // Maximum number of files in archive
    private static final int MAX_PATH_LENGTH = 500; // Maximum path length

    private static final int BUFFER_SIZE = 8192;
    private static final String DEFAULT_CONTENT_TYPE = "application/octet-stream";

    // Well-known extensions (lower case, without the dot) mapped to their MIME type.
    private static final Map<String, String> CONTENT_TYPES_BY_EXTENSION = Map.ofEntries(
            Map.entry("pdf", "application/pdf"),
            Map.entry("xml", "application/xml"),
            Map.entry("zip", "application/zip"),
            Map.entry("txt", "text/plain"),
            Map.entry("html", "text/html"),
            Map.entry("htm", "text/html"),
            Map.entry("json", "application/json"),
            Map.entry("doc", "application/msword"),
            Map.entry("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
            Map.entry("xls", "application/vnd.ms-excel"),
            Map.entry("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
            Map.entry("png", "image/png"),
            Map.entry("jpg", "image/jpeg"),
            Map.entry("jpeg", "image/jpeg")
    );

    @Override
    public Set<String> getSupportedExtensions() {
        return SUPPORTED_EXTENSIONS;
    }

    @Override
    public Set<String> getSupportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }

    /**
     * Decides whether this extractor is responsible for an attachment.
     *
     * @param filename    attachment file name, may be {@code null}
     * @param contentType declared content type (parameters after {@code ;} are ignored),
     *                    may be {@code null}
     * @return {@code true} if the file name ends with a supported extension or the
     *         MIME type explicitly names a ZIP format
     */
    @Override
    public boolean canHandle(String filename, String contentType) {
        if (filename != null) {
            // Locale.ROOT: under e.g. a Turkish default locale "ZIP" would lower-case to "zıp".
            String lowerFilename = filename.toLowerCase(Locale.ROOT);
            for (String ext : SUPPORTED_EXTENSIONS) {
                if (lowerFilename.endsWith("." + ext)) {
                    return true;
                }
            }
        }

        if (contentType == null) {
            return false;
        }

        // Cut off parameters such as "; charset=..." without split(): split(";") yields an
        // empty array for inputs like ";" and indexing [0] would then throw.
        int paramStart = contentType.indexOf(';');
        String mimeType = (paramStart >= 0 ? contentType.substring(0, paramStart) : contentType)
                .trim()
                .toLowerCase(Locale.ROOT);

        // Only use MIME type if it's explicitly zip, not application/octet-stream.
        // gzip/bzip types contain "zip" as a substring but are not ZIP archives.
        return mimeType.contains("zip")
                && !mimeType.contains("gzip")
                && !mimeType.contains("bzip");
    }

    /**
     * Extracts all regular file entries of a ZIP archive into child attachments.
     *
     * @param data        raw bytes of the ZIP archive
     * @param filename    name of the archive, used for logging only
     * @param contentType declared content type of the archive (not used by this method)
     * @return a failure result if {@code data} is {@code null}/empty or reading the archive
     *         throws; otherwise a success result with the extracted children. The list may
     *         be partial when the file-count or total-size limit was reached.
     */
    @Override
    public ExtractionResult extract(byte[] data, String filename, String contentType) {
        if (data == null || data.length == 0) {
            return ExtractionResult.failure("Empty ZIP data");
        }

        log.debug("Extracting files from ZIP: {} ({} bytes)", filename, data.length);

        List<ChildAttachment> children = new ArrayList<>();
        long totalExtractedSize = 0;
        int fileCount = 0;
        byte[] buffer = new byte[BUFFER_SIZE]; // reused for every entry

        try (ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(data))) {
            ZipEntry entry;

            while ((entry = zis.getNextEntry()) != null) {
                // Security check: skip directories
                if (entry.isDirectory()) {
                    zis.closeEntry();
                    continue;
                }

                String entryName = entry.getName();

                // Security check: path traversal protection
                if (isUnsafePath(entryName)) {
                    log.warn("Skipping potentially malicious ZIP entry: {}", entryName);
                    zis.closeEntry();
                    continue;
                }

                // Security check: path length
                if (entryName.length() > MAX_PATH_LENGTH) {
                    log.warn("Skipping ZIP entry with too long path: {}", entryName.substring(0, 100) + "...");
                    zis.closeEntry();
                    continue;
                }

                // Security check: maximum files
                if (fileCount >= MAX_FILES) {
                    log.warn("ZIP file contains too many files, stopping at {} files", MAX_FILES);
                    break;
                }

                // Read entry content, enforcing the size limits on the actually
                // decompressed byte count while streaming.
                ByteArrayOutputStream content = new ByteArrayOutputStream();
                long entrySize = 0;
                boolean entryTooLarge = false;
                int len;

                while ((len = zis.read(buffer)) != -1) {
                    entrySize += len;

                    // Security check: single file size
                    if (entrySize > MAX_SINGLE_FILE_SIZE) {
                        log.warn("Skipping ZIP entry exceeding max file size: {} (> {} MB)",
                                entryName, MAX_SINGLE_FILE_SIZE / 1024 / 1024);
                        entryTooLarge = true;
                        break;
                    }

                    // Security check: total extracted size (zip bomb protection)
                    if (totalExtractedSize + entrySize > MAX_TOTAL_SIZE) {
                        log.warn("ZIP extraction stopped: total extracted size exceeds limit ({} MB)",
                                MAX_TOTAL_SIZE / 1024 / 1024);
                        return ExtractionResult.successWithChildren(children);
                    }

                    content.write(buffer, 0, len);
                }

                if (entryTooLarge) {
                    // Discard the partial content; the entry does not count towards the limits.
                    zis.closeEntry();
                    continue;
                }

                byte[] entryData = content.toByteArray();
                totalExtractedSize += entryData.length;
                fileCount++;

                // Determine content type from filename
                String childContentType = guessContentType(entryName);

                // Extract just the filename from the path
                String childFilename = extractFilename(entryName);

                children.add(new ChildAttachment(
                        childFilename,
                        childContentType,
                        entryData,
                        entryName
                ));

                log.debug("Extracted from ZIP: {} ({} bytes, type={})",
                        entryName, entryData.length, childContentType);

                zis.closeEntry();
            }

            log.info("Successfully extracted {} files ({} bytes total) from ZIP: {}",
                    children.size(), totalExtractedSize, filename);

            return ExtractionResult.successWithChildren(children);

        } catch (Exception e) {
            // Boundary catch: any failure while reading the archive is reported as a failure result.
            log.error("Failed to extract ZIP '{}': {}", filename, e.getMessage(), e);
            return ExtractionResult.failure("ZIP extraction failed: " + e.getMessage());
        }
    }

    /**
     * Checks whether an entry name is absolute or contains a parent-directory segment.
     *
     * <p>The comparison is done per path segment (both {@code /} and {@code \} count as
     * separators), so ordinary names that merely contain two consecutive dots, such as
     * {@code report..final.pdf}, are not rejected. Whitespace around a segment is ignored
     * so that padded variants of {@code ..} are rejected as well.
     *
     * @param entryName entry name as stored in the archive, not {@code null}
     * @return {@code true} if the entry should be skipped
     */
    private static boolean isUnsafePath(String entryName) {
        if (entryName.startsWith("/") || entryName.startsWith("\\")) {
            return true;
        }
        for (String segment : entryName.replace('\\', '/').split("/")) {
            if ("..".equals(segment.trim())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Guess the MIME content type from a filename.
     *
     * @param filename file name or archive path, may be {@code null}
     * @return the type registered for the file extension, otherwise the result of
     *         {@link URLConnection#guessContentTypeFromName(String)}, otherwise
     *         {@code application/octet-stream}
     */
    private static String guessContentType(String filename) {
        if (filename == null) {
            return DEFAULT_CONTENT_TYPE;
        }

        // Common types, looked up by the text after the last dot
        int lastDot = filename.lastIndexOf('.');
        if (lastDot >= 0) {
            String extension = filename.substring(lastDot + 1).toLowerCase(Locale.ROOT);
            String knownType = CONTENT_TYPES_BY_EXTENSION.get(extension);
            if (knownType != null) {
                return knownType;
            }
        }

        // Try to guess from URLConnection
        String guessed = URLConnection.guessContentTypeFromName(filename);
        return guessed != null ? guessed : DEFAULT_CONTENT_TYPE;
    }

    /**
     * Extract just the filename from a path (handles both / and \ separators).
     *
     * @param path archive path, may be {@code null}
     * @return {@code "unnamed"} for {@code null}; the text after the last separator if one
     *         exists and is not the final character; otherwise {@code path} unchanged
     */
    private static String extractFilename(String path) {
        if (path == null) {
            return "unnamed";
        }
        int lastSlash = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        if (lastSlash >= 0 && lastSlash < path.length() - 1) {
            return path.substring(lastSlash + 1);
        }
        return path;
    }
}