Read and write ZIP files.
import struct, os, time, sys, shutil
import binascii, cStringIO, stat
import zlib # We may need its compression method
__all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile",
"ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile" ]
class BadZipfile(Exception):
class LargeZipFile(Exception):
Raised when writing a zipfile, the zipfile requires ZIP64 extensions
and those extensions are disabled.
error = BadZipfile # The exception raised by this module
ZIP64_LIMIT = (1 << 31) - 1
ZIP_FILECOUNT_LIMIT = (1 << 16) - 1
ZIP_MAX_COMMENT = (1 << 16) - 1
# constants for Zip file compression methods
# Other ZIP compression methods not supported
# Below are some formats and associated data for reading/writing headers using
# the struct module. The names and structures of headers/records are those used
# in the PKWARE description of the ZIP file format:
# http://www.pkware.com/documents/casestudies/APPNOTE.TXT
# (URL valid as of January 2008)
# The "end of central directory" structure, magic number, size, and indices
# (section V.I in the format document)
structEndArchive = "<4s4H2LH"
stringEndArchive = "PK\005\006"
sizeEndCentDir = struct.calcsize(structEndArchive)
_ECD_ENTRIES_THIS_DISK = 3
# These last two indices are not part of the structure as defined in the
# spec, but they are used internally by this module as a convenience
# The "central directory" structure, magic number, size, and indices
# of entries in the structure (section V.F in the format document)
structCentralDir = "<4s4B4HL2L5H2L"
stringCentralDir = "PK\001\002"
sizeCentralDir = struct.calcsize(structCentralDir)
# indexes of entries in the central directory structure
_CD_UNCOMPRESSED_SIZE = 11
_CD_EXTRA_FIELD_LENGTH = 13
_CD_DISK_NUMBER_START = 15
_CD_INTERNAL_FILE_ATTRIBUTES = 16
_CD_EXTERNAL_FILE_ATTRIBUTES = 17
_CD_LOCAL_HEADER_OFFSET = 18
# The "local file header" structure, magic number, size, and indices
# (section V.A in the format document)
structFileHeader = "<4s2B4HL2L2H"
stringFileHeader = "PK\003\004"
sizeFileHeader = struct.calcsize(structFileHeader)
_FH_GENERAL_PURPOSE_FLAG_BITS = 3
_FH_COMPRESSION_METHOD = 4
_FH_UNCOMPRESSED_SIZE = 9
_FH_EXTRA_FIELD_LENGTH = 11
# The "Zip64 end of central directory locator" structure, magic number, and size
structEndArchive64Locator = "<4sLQL"
stringEndArchive64Locator = "PK\x06\x07"
sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator)
# The "Zip64 end of central directory" record, magic number, size, and indices
# (section V.G in the format document)
structEndArchive64 = "<4sQ2H2L4Q"
stringEndArchive64 = "PK\x06\x06"
sizeEndCentDir64 = struct.calcsize(structEndArchive64)
_CD64_DIRECTORY_RECSIZE = 1
_CD64_EXTRACT_VERSION = 3
_CD64_DISK_NUMBER_START = 5
_CD64_NUMBER_ENTRIES_THIS_DISK = 6
_CD64_NUMBER_ENTRIES_TOTAL = 7
_CD64_OFFSET_START_CENTDIR = 9
_DD_SIGNATURE = 0x08074b50
_EXTRA_FIELD_STRUCT = struct.Struct('<HH')
def _strip_extra(extra, xids):
# Remove Extra Fields with specified IDs.
unpack = _EXTRA_FIELD_STRUCT.unpack
while i + 4 <= len(extra):
xid, xlen = unpack(extra[i : i + 4])
buffer.append(extra[start : i])
return True # file has correct magic number
def is_zipfile(filename):
"""Quickly see if a file is a ZIP file by checking the magic number.
The filename argument may be a file or file-like object too.
if hasattr(filename, "read"):
result = _check_zipfile(fp=filename)
with open(filename, "rb") as fp:
result = _check_zipfile(fp)
def _EndRecData64(fpin, offset, endrec):
Read the ZIP64 end-of-archive records and use that to update endrec
fpin.seek(offset - sizeEndCentDir64Locator, 2)
# If the seek fails, the file is not large enough to contain a ZIP64
# end-of-archive record, so just return the end record we were given.
data = fpin.read(sizeEndCentDir64Locator)
if len(data) != sizeEndCentDir64Locator:
sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)
if sig != stringEndArchive64Locator:
if diskno != 0 or disks != 1:
raise BadZipfile("zipfiles that span multiple disks are not supported")
# Assume no 'zip64 extensible data'
fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2)
data = fpin.read(sizeEndCentDir64)
if len(data) != sizeEndCentDir64:
sig, sz, create_version, read_version, disk_num, disk_dir, \
dircount, dircount2, dirsize, diroffset = \
struct.unpack(structEndArchive64, data)
if sig != stringEndArchive64:
# Update the original endrec using data from the ZIP64 record
endrec[_ECD_SIGNATURE] = sig
endrec[_ECD_DISK_NUMBER] = disk_num
endrec[_ECD_DISK_START] = disk_dir
endrec[_ECD_ENTRIES_THIS_DISK] = dircount
endrec[_ECD_ENTRIES_TOTAL] = dircount2
endrec[_ECD_SIZE] = dirsize
endrec[_ECD_OFFSET] = diroffset
"""Return data from the "End of Central Directory" record, or None.
The data is a list of the nine items in the ZIP "End of central dir"
record followed by a tenth item, the file seek offset of this record."""
# Check to see if this is ZIP file with no archive comment (the
# "end of central directory" structure should be the last item in the
# file if this is the case).
fpin.seek(-sizeEndCentDir, 2)
if (len(data) == sizeEndCentDir and
data[0:4] == stringEndArchive and
data[-2:] == b"\000\000"):
# the signature is correct and there's no comment, unpack structure
endrec = struct.unpack(structEndArchive, data)
# Append a blank comment and record start offset
endrec.append(filesize - sizeEndCentDir)
# Try to read the "Zip64 end of central directory" structure
return _EndRecData64(fpin, -sizeEndCentDir, endrec)
# Either this is not a ZIP file, or it is a ZIP file with an archive
# comment. Search the end of the file for the "end of central directory"
# record signature. The comment is the last item in the ZIP file and may be
# up to 64K long. It is assumed that the "end of central directory" magic
# number does not appear in the comment.
maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0)
fpin.seek(maxCommentStart, 0)
start = data.rfind(stringEndArchive)
# found the magic number; attempt to unpack and interpret
recData = data[start:start+sizeEndCentDir]
if len(recData) != sizeEndCentDir:
endrec = list(struct.unpack(structEndArchive, recData))
commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file
comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize]
endrec.append(maxCommentStart + start)
# Try to read the "Zip64 end of central directory" structure
return _EndRecData64(fpin, maxCommentStart + start - filesize,
# Unable to find a valid end of central directory structure
"""Class with attributes describing each file in the ZIP archive."""
def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
self.orig_filename = filename # Original file name in archive
# Terminate the file name at the first null byte. Null bytes in file
# names are used as tricks by viruses in archives.
null_byte = filename.find(chr(0))
filename = filename[0:null_byte]
# This is used to ensure paths in generated ZIP files always use
# forward slashes as the directory separator, as required by the
# ZIP format specification.
if os.sep != "/" and os.sep in filename:
filename = filename.replace(os.sep, "/")
self.filename = filename # Normalized file name
self.date_time = date_time # year, month, day, hour, min, sec
raise ValueError('ZIP does not support timestamps before 1980')
self.compress_type = ZIP_STORED # Type of compression for the file
self.comment = "" # Comment for each file
self.extra = "" # ZIP extra data
if sys.platform == 'win32':
self.create_system = 0 # System which created ZIP archive
# Assume everything else is unix-y
self.create_system = 3 # System which created ZIP archive
self.create_version = 20 # Version which created ZIP archive
self.extract_version = 20 # Version needed to extract archive
self.reserved = 0 # Must be zero
self.flag_bits = 0 # ZIP flag bits
self.volume = 0 # Volume number of file header
self.internal_attr = 0 # Internal attributes
self.external_attr = 0 # External file attributes
# Other attributes are set by class ZipFile:
# header_offset Byte offset to the file header
# CRC CRC-32 of the uncompressed file
# compress_size Size of the compressed file
# file_size Size of the uncompressed file
def FileHeader(self, zip64=None):
"""Return the per-file header as a string."""
dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
if self.flag_bits & 0x08:
# Set these to zero because we write them after the file data
CRC = compress_size = file_size = 0
compress_size = self.compress_size
file_size = self.file_size
zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT
extra = extra + struct.pack(fmt,
1, struct.calcsize(fmt)-4, file_size, compress_size)
if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT:
raise LargeZipFile("Filesize would require ZIP64 extensions")
# File is larger than what fits into a 4 byte integer,
# fall back to the ZIP64 extension
compress_size = 0xffffffff
self.extract_version = max(45, self.extract_version)
self.create_version = max(45, self.extract_version)
filename, flag_bits = self._encodeFilenameFlags()
header = struct.pack(structFileHeader, stringFileHeader,
self.extract_version, self.reserved, flag_bits,
self.compress_type, dostime, dosdate, CRC,
compress_size, file_size,
len(filename), len(extra))
return header + filename + extra
def _encodeFilenameFlags(self):
if isinstance(self.filename, unicode):
return self.filename.encode('ascii'), self.flag_bits
except UnicodeEncodeError:
return self.filename.encode('utf-8'), self.flag_bits | 0x800
return self.filename, self.flag_bits
def _decodeFilename(self):
if self.flag_bits & 0x800:
return self.filename.decode('utf-8')
# Try to decode the extra field.
tp, ln = unpack('<HH', extra[:4])
counts = unpack('<QQQ', extra[4:28])
counts = unpack('<QQ', extra[4:20])
counts = unpack('<Q', extra[4:12])
raise RuntimeError, "Corrupt extra field %s"%(ln,)
# ZIP64 extension (large files and/or large archives)
if self.file_size in (0xffffffffffffffffL, 0xffffffffL):
self.file_size = counts[idx]
if self.compress_size == 0xFFFFFFFFL:
self.compress_size = counts[idx]
if self.header_offset == 0xffffffffL:
self.header_offset = counts[idx]
"""Class to handle decryption of files stored within a ZIP archive.
ZIP supports a password-based form of encryption. Even though known
plaintext attacks have been found against it, it is still useful
to be able to get data out of such a file.
zd = _ZipDecrypter(mypwd)
plain_char = zd(cypher_char)
plain_text = map(zd, cypher_text)
"""Generate a CRC-32 table.
ZIP encryption uses the CRC32 one-byte primitive for scrambling some
internal keys. We noticed that a direct implementation is faster than
relying on binascii.crc32().
crc = ((crc >> 1) & 0x7FFFFFFF) ^ poly
crc = ((crc >> 1) & 0x7FFFFFFF)
crctable = _GenerateCRCTable()
def _crc32(self, ch, crc):
"""Compute the CRC32 primitive on one byte."""
return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ord(ch)) & 0xff]
def _UpdateKeys(self, c):
self.key0 = self._crc32(c, self.key0)
self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295
self.key1 = (self.key1 * 134775813 + 1) & 4294967295
self.key2 = self._crc32(chr((self.key1 >> 24) & 255), self.key2)
"""Decrypt a single character."""
c = c ^ (((k * (k^1)) >> 8) & 255)