You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

235 lines
8.6 KiB
Java

package at.procon.ted.service.attachment;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
/**
 * Service for extracting files from ZIP archives.
 * Extracts all contained files as child attachments for recursive processing.
 *
 * <p>The archive is processed entirely in memory. Extraction is bounded by the
 * security limits declared below (entry count, per-entry size, total size and
 * entry path length); entries violating a limit are skipped or extraction is
 * stopped, and whatever was extracted up to that point is returned.
 *
 * @author Martin.Schweitzer@procon.co.at and claude.ai
 */
@Service
@Slf4j
public class ZipExtractionService implements AttachmentExtractor {

    private static final Set<String> SUPPORTED_EXTENSIONS = Set.of("zip");

    private static final Set<String> SUPPORTED_MIME_TYPES = Set.of(
            "application/zip",
            "application/x-zip",
            "application/x-zip-compressed",
            "application/octet-stream" // Often used for ZIP files
    );

    /**
     * Substrings identifying compression formats whose MIME type contains "zip"
     * but which are not ZIP archives (e.g. application/gzip, application/x-bzip2).
     */
    private static final List<String> NON_ZIP_MIME_MARKERS = List.of("gzip", "bzip", "lzip");

    private static final String FALLBACK_CONTENT_TYPE = "application/octet-stream";

    // Security limits (long arithmetic so raising a limit cannot overflow int)
    private static final long MAX_TOTAL_SIZE = 500L * 1024 * 1024; // 500 MB total extracted size
    private static final long MAX_SINGLE_FILE_SIZE = 100L * 1024 * 1024; // 100 MB per file
    private static final int MAX_FILES = 1000; // Maximum number of files in archive
    private static final int MAX_PATH_LENGTH = 500; // Maximum path length

    /**
     * @return the file extensions (without leading dot) this extractor declares support for
     */
    @Override
    public Set<String> getSupportedExtensions() {
        return SUPPORTED_EXTENSIONS;
    }

    /**
     * @return the MIME types this extractor declares support for; note that
     *         {@link #canHandle(String, String)} does not accept
     *         {@code application/octet-stream} on its own
     */
    @Override
    public Set<String> getSupportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }

    /**
     * Decides whether the given attachment looks like a ZIP archive.
     *
     * <p>A supported file extension is sufficient. Otherwise the MIME type (with
     * any {@code ;parameters} removed) must contain "zip" without belonging to
     * the gzip/bzip/lzip families, which are different container formats.
     *
     * @param filename    attachment filename, may be {@code null}
     * @param contentType declared content type, may be {@code null}
     * @return {@code true} if this extractor should process the attachment
     */
    @Override
    public boolean canHandle(String filename, String contentType) {
        if (filename != null) {
            // Locale.ROOT: a Turkish default locale would lower-case "ZIP" to "zıp".
            String lowerFilename = filename.toLowerCase(Locale.ROOT);
            for (String ext : SUPPORTED_EXTENSIONS) {
                if (lowerFilename.endsWith("." + ext)) {
                    return true;
                }
            }
        }

        // Only use MIME type if it's explicitly zip, not application/octet-stream
        if (contentType == null) {
            return false;
        }
        // indexOf/substring instead of split(";")[0]: split drops trailing empty
        // strings, so a value such as ";" would yield an empty array.
        int paramStart = contentType.indexOf(';');
        String bareType = paramStart >= 0 ? contentType.substring(0, paramStart) : contentType;
        String mimeType = bareType.trim().toLowerCase(Locale.ROOT);

        if (!mimeType.contains("zip")) {
            return false;
        }
        return NON_ZIP_MIME_MARKERS.stream().noneMatch(mimeType::contains);
    }

    /**
     * Extracts every regular file of the archive into an in-memory child attachment.
     *
     * <p>Directories, entries with unsafe or overlong paths and entries larger
     * than the per-file limit are skipped. Extraction stops after
     * {@code MAX_FILES} extracted files. If the total extracted size would exceed
     * its limit, the children collected so far are returned immediately.
     *
     * @param data        raw archive bytes
     * @param filename    archive filename, used for logging only
     * @param contentType declared content type (not used by this implementation)
     * @return a failure result for empty input or when reading the archive throws,
     *         otherwise a success result carrying the extracted children
     */
    @Override
    public ExtractionResult extract(byte[] data, String filename, String contentType) {
        if (data == null || data.length == 0) {
            return ExtractionResult.failure("Empty ZIP data");
        }
        log.debug("Extracting files from ZIP: {} ({} bytes)", filename, data.length);

        List<ChildAttachment> children = new ArrayList<>();
        long totalExtractedSize = 0;
        int fileCount = 0;

        try (ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(data))) {
            byte[] buffer = new byte[8192];
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                // Security check: skip directories
                if (entry.isDirectory()) {
                    zis.closeEntry();
                    continue;
                }

                String entryName = entry.getName();

                // Security check: path traversal protection
                if (isUnsafeEntryName(entryName)) {
                    log.warn("Skipping potentially malicious ZIP entry: {}", entryName);
                    zis.closeEntry();
                    continue;
                }

                // Security check: path length
                if (entryName.length() > MAX_PATH_LENGTH) {
                    log.warn("Skipping ZIP entry with too long path: {}", entryName.substring(0, 100) + "...");
                    zis.closeEntry();
                    continue;
                }

                // Security check: maximum files
                if (fileCount >= MAX_FILES) {
                    log.warn("ZIP file contains too many files, stopping at {} files", MAX_FILES);
                    break;
                }

                // Read entry content; sizes are measured on the decompressed bytes
                // because the sizes declared in the ZIP header cannot be trusted.
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                long entrySize = 0;
                boolean oversized = false;
                int len;
                while ((len = zis.read(buffer)) != -1) {
                    entrySize += len;

                    // Security check: single file size
                    if (entrySize > MAX_SINGLE_FILE_SIZE) {
                        log.warn("Skipping ZIP entry exceeding max file size: {} (> {} MB)",
                                entryName, MAX_SINGLE_FILE_SIZE / 1024 / 1024);
                        oversized = true;
                        break;
                    }

                    // Security check: total extracted size (zip bomb protection)
                    if (totalExtractedSize + entrySize > MAX_TOTAL_SIZE) {
                        log.warn("ZIP extraction stopped: total extracted size exceeds limit ({} MB)",
                                MAX_TOTAL_SIZE / 1024 / 1024);
                        return ExtractionResult.successWithChildren(children);
                    }

                    baos.write(buffer, 0, len);
                }

                if (oversized) {
                    zis.closeEntry();
                    continue;
                }

                byte[] entryData = baos.toByteArray();
                totalExtractedSize += entryData.length;
                fileCount++;

                // Determine content type from filename
                String childContentType = guessContentType(entryName);
                // Extract just the filename from the path
                String childFilename = extractFilename(entryName);

                children.add(new ChildAttachment(
                        childFilename,
                        childContentType,
                        entryData,
                        entryName
                ));
                log.debug("Extracted from ZIP: {} ({} bytes, type={})",
                        entryName, entryData.length, childContentType);

                zis.closeEntry();
            }

            log.info("Successfully extracted {} files ({} bytes total) from ZIP: {}",
                    children.size(), totalExtractedSize, filename);
            return ExtractionResult.successWithChildren(children);
        } catch (Exception e) {
            // Broad catch on purpose: this is the extraction boundary and any
            // failure while reading the archive is reported as a failure result.
            log.error("Failed to extract ZIP '{}': {}", filename, e.getMessage(), e);
            return ExtractionResult.failure("ZIP extraction failed: " + e.getMessage());
        }
    }

    /**
     * Checks whether an entry name is absolute or contains a {@code ..} path segment.
     *
     * <p>Only whole segments are compared, so a name that merely contains two
     * consecutive dots (for example {@code report..final.pdf}) is not rejected.
     *
     * @param entryName entry name as stored in the archive
     * @return {@code true} if the entry should be skipped as potentially malicious
     */
    private static boolean isUnsafeEntryName(String entryName) {
        if (entryName.startsWith("/") || entryName.startsWith("\\")) {
            return true;
        }
        // Treat both separator styles alike before splitting into segments.
        for (String segment : entryName.replace('\\', '/').split("/")) {
            if ("..".equals(segment)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Guess the MIME content type from a filename.
     *
     * @param filename file name or archive path, may be {@code null}
     * @return the type for a known extension, otherwise the guess of
     *         {@link URLConnection#guessContentTypeFromName(String)}, otherwise
     *         {@code application/octet-stream}
     */
    private String guessContentType(String filename) {
        if (filename == null) {
            return FALLBACK_CONTENT_TYPE;
        }
        // Locale.ROOT keeps extension matching independent of the default locale.
        String lowerFilename = filename.toLowerCase(Locale.ROOT);

        // Common types
        if (lowerFilename.endsWith(".pdf")) {
            return "application/pdf";
        } else if (lowerFilename.endsWith(".xml")) {
            return "application/xml";
        } else if (lowerFilename.endsWith(".zip")) {
            return "application/zip";
        } else if (lowerFilename.endsWith(".txt")) {
            return "text/plain";
        } else if (lowerFilename.endsWith(".html") || lowerFilename.endsWith(".htm")) {
            return "text/html";
        } else if (lowerFilename.endsWith(".json")) {
            return "application/json";
        } else if (lowerFilename.endsWith(".doc")) {
            return "application/msword";
        } else if (lowerFilename.endsWith(".docx")) {
            return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        } else if (lowerFilename.endsWith(".xls")) {
            return "application/vnd.ms-excel";
        } else if (lowerFilename.endsWith(".xlsx")) {
            return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
        } else if (lowerFilename.endsWith(".png")) {
            return "image/png";
        } else if (lowerFilename.endsWith(".jpg") || lowerFilename.endsWith(".jpeg")) {
            return "image/jpeg";
        }

        // Try to guess from URLConnection
        String guessed = URLConnection.guessContentTypeFromName(filename);
        return guessed != null ? guessed : FALLBACK_CONTENT_TYPE;
    }

    /**
     * Extract just the filename from a path (handles both / and \ separators).
     *
     * @param path archive path, may be {@code null}
     * @return the text after the last separator; the unchanged path when it has
     *         no separator or ends with one; {@code "unnamed"} for {@code null}
     */
    private String extractFilename(String path) {
        if (path == null) {
            return "unnamed";
        }
        int lastSlash = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        if (lastSlash >= 0 && lastSlash < path.length() - 1) {
            return path.substring(lastSlash + 1);
        }
        return path;
    }
}