You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
235 lines
8.6 KiB
Java
235 lines
8.6 KiB
Java
package at.procon.ted.service.attachment;
|
|
|
|
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
|
|
|
|
/**
|
|
* Service for extracting files from ZIP archives.
|
|
* Extracts all contained files as child attachments for recursive processing.
|
|
*
|
|
* @author Martin.Schweitzer@procon.co.at and claude.ai
|
|
*/
|
|
@Service
|
|
@Slf4j
|
|
public class ZipExtractionService implements AttachmentExtractor {
|
|
|
|
private static final Set<String> SUPPORTED_EXTENSIONS = Set.of("zip");
|
|
private static final Set<String> SUPPORTED_MIME_TYPES = Set.of(
|
|
"application/zip",
|
|
"application/x-zip",
|
|
"application/x-zip-compressed",
|
|
"application/octet-stream" // Often used for ZIP files
|
|
);
|
|
|
|
// Security limits
|
|
private static final long MAX_TOTAL_SIZE = 500 * 1024 * 1024; // 500 MB total extracted size
|
|
private static final long MAX_SINGLE_FILE_SIZE = 100 * 1024 * 1024; // 100 MB per file
|
|
private static final int MAX_FILES = 1000; // Maximum number of files in archive
|
|
private static final int MAX_PATH_LENGTH = 500; // Maximum path length
|
|
|
|
@Override
|
|
public Set<String> getSupportedExtensions() {
|
|
return SUPPORTED_EXTENSIONS;
|
|
}
|
|
|
|
@Override
|
|
public Set<String> getSupportedMimeTypes() {
|
|
return SUPPORTED_MIME_TYPES;
|
|
}
|
|
|
|
@Override
|
|
public boolean canHandle(String filename, String contentType) {
|
|
if (filename != null) {
|
|
String lowerFilename = filename.toLowerCase();
|
|
if (SUPPORTED_EXTENSIONS.stream().anyMatch(ext -> lowerFilename.endsWith("." + ext))) {
|
|
return true;
|
|
}
|
|
}
|
|
// Only use MIME type if it's explicitly zip, not application/octet-stream
|
|
if (contentType != null) {
|
|
String lowerContentType = contentType.toLowerCase().split(";")[0].trim();
|
|
if (lowerContentType.contains("zip")) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
@Override
|
|
public ExtractionResult extract(byte[] data, String filename, String contentType) {
|
|
if (data == null || data.length == 0) {
|
|
return ExtractionResult.failure("Empty ZIP data");
|
|
}
|
|
|
|
log.debug("Extracting files from ZIP: {} ({} bytes)", filename, data.length);
|
|
|
|
List<ChildAttachment> children = new ArrayList<>();
|
|
long totalExtractedSize = 0;
|
|
int fileCount = 0;
|
|
|
|
try (ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(data))) {
|
|
ZipEntry entry;
|
|
|
|
while ((entry = zis.getNextEntry()) != null) {
|
|
// Security check: skip directories
|
|
if (entry.isDirectory()) {
|
|
zis.closeEntry();
|
|
continue;
|
|
}
|
|
|
|
String entryName = entry.getName();
|
|
|
|
// Security check: path traversal protection
|
|
if (entryName.contains("..") || entryName.startsWith("/") || entryName.startsWith("\\")) {
|
|
log.warn("Skipping potentially malicious ZIP entry: {}", entryName);
|
|
zis.closeEntry();
|
|
continue;
|
|
}
|
|
|
|
// Security check: path length
|
|
if (entryName.length() > MAX_PATH_LENGTH) {
|
|
log.warn("Skipping ZIP entry with too long path: {}", entryName.substring(0, 100) + "...");
|
|
zis.closeEntry();
|
|
continue;
|
|
}
|
|
|
|
// Security check: maximum files
|
|
if (fileCount >= MAX_FILES) {
|
|
log.warn("ZIP file contains too many files, stopping at {} files", MAX_FILES);
|
|
break;
|
|
}
|
|
|
|
// Read entry content
|
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
|
byte[] buffer = new byte[8192];
|
|
int len;
|
|
long entrySize = 0;
|
|
|
|
while ((len = zis.read(buffer)) > 0) {
|
|
entrySize += len;
|
|
|
|
// Security check: single file size
|
|
if (entrySize > MAX_SINGLE_FILE_SIZE) {
|
|
log.warn("Skipping ZIP entry exceeding max file size: {} (> {} MB)",
|
|
entryName, MAX_SINGLE_FILE_SIZE / 1024 / 1024);
|
|
break;
|
|
}
|
|
|
|
// Security check: total extracted size (zip bomb protection)
|
|
if (totalExtractedSize + entrySize > MAX_TOTAL_SIZE) {
|
|
log.warn("ZIP extraction stopped: total extracted size exceeds limit ({} MB)",
|
|
MAX_TOTAL_SIZE / 1024 / 1024);
|
|
return ExtractionResult.successWithChildren(children);
|
|
}
|
|
|
|
baos.write(buffer, 0, len);
|
|
}
|
|
|
|
if (entrySize > MAX_SINGLE_FILE_SIZE) {
|
|
zis.closeEntry();
|
|
continue;
|
|
}
|
|
|
|
byte[] entryData = baos.toByteArray();
|
|
totalExtractedSize += entryData.length;
|
|
fileCount++;
|
|
|
|
// Determine content type from filename
|
|
String childContentType = guessContentType(entryName);
|
|
|
|
// Extract just the filename from the path
|
|
String childFilename = extractFilename(entryName);
|
|
|
|
ChildAttachment child = new ChildAttachment(
|
|
childFilename,
|
|
childContentType,
|
|
entryData,
|
|
entryName
|
|
);
|
|
children.add(child);
|
|
|
|
log.debug("Extracted from ZIP: {} ({} bytes, type={})",
|
|
entryName, entryData.length, childContentType);
|
|
|
|
zis.closeEntry();
|
|
}
|
|
|
|
log.info("Successfully extracted {} files ({} bytes total) from ZIP: {}",
|
|
children.size(), totalExtractedSize, filename);
|
|
|
|
return ExtractionResult.successWithChildren(children);
|
|
|
|
} catch (Exception e) {
|
|
log.error("Failed to extract ZIP '{}': {}", filename, e.getMessage(), e);
|
|
return ExtractionResult.failure("ZIP extraction failed: " + e.getMessage());
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Guess the MIME content type from a filename.
|
|
*/
|
|
private String guessContentType(String filename) {
|
|
if (filename == null) {
|
|
return "application/octet-stream";
|
|
}
|
|
|
|
String lowerFilename = filename.toLowerCase();
|
|
|
|
// Common types
|
|
if (lowerFilename.endsWith(".pdf")) {
|
|
return "application/pdf";
|
|
} else if (lowerFilename.endsWith(".xml")) {
|
|
return "application/xml";
|
|
} else if (lowerFilename.endsWith(".zip")) {
|
|
return "application/zip";
|
|
} else if (lowerFilename.endsWith(".txt")) {
|
|
return "text/plain";
|
|
} else if (lowerFilename.endsWith(".html") || lowerFilename.endsWith(".htm")) {
|
|
return "text/html";
|
|
} else if (lowerFilename.endsWith(".json")) {
|
|
return "application/json";
|
|
} else if (lowerFilename.endsWith(".doc")) {
|
|
return "application/msword";
|
|
} else if (lowerFilename.endsWith(".docx")) {
|
|
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
|
|
} else if (lowerFilename.endsWith(".xls")) {
|
|
return "application/vnd.ms-excel";
|
|
} else if (lowerFilename.endsWith(".xlsx")) {
|
|
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
|
|
} else if (lowerFilename.endsWith(".png")) {
|
|
return "image/png";
|
|
} else if (lowerFilename.endsWith(".jpg") || lowerFilename.endsWith(".jpeg")) {
|
|
return "image/jpeg";
|
|
}
|
|
|
|
// Try to guess from URLConnection
|
|
String guessed = URLConnection.guessContentTypeFromName(filename);
|
|
return guessed != null ? guessed : "application/octet-stream";
|
|
}
|
|
|
|
/**
|
|
* Extract just the filename from a path (handles both / and \ separators).
|
|
*/
|
|
private String extractFilename(String path) {
|
|
if (path == null) {
|
|
return "unnamed";
|
|
}
|
|
int lastSlash = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
|
|
if (lastSlash >= 0 && lastSlash < path.length() - 1) {
|
|
return path.substring(lastSlash + 1);
|
|
}
|
|
return path;
|
|
}
|
|
}
|