Update calibre zipfile module with the patches from python trunk

This commit is contained in:
Kovid Goyal 2011-03-23 10:29:22 -06:00
parent fec408de43
commit 24663f7853
2 changed files with 118 additions and 65 deletions

View File

@ -99,7 +99,7 @@ def sanitize_file_name_unicode(name, substitute='_'):
**WARNING:** This function also replaces path separators, so only pass file names **WARNING:** This function also replaces path separators, so only pass file names
and not full paths to it. and not full paths to it.
''' '''
if not isinstance(name, unicode): if isbytestring(name):
return sanitize_file_name(name, substitute=substitute, as_unicode=True) return sanitize_file_name(name, substitute=substitute, as_unicode=True)
chars = [substitute if c in _filename_sanitize_unicode else c for c in chars = [substitute if c in _filename_sanitize_unicode else c for c in
name] name]
@ -115,6 +115,14 @@ def sanitize_file_name_unicode(name, substitute='_'):
one = '_' + one[1:] one = '_' + one[1:]
return one return one
def sanitize_file_name2(name, substitute='_'):
    '''
    Replace invalid characters in a file name, preserving the string type.

    Bytestring names are handed to :func:`sanitize_file_name`; every other
    name is handed to :func:`sanitize_file_name_unicode`. In both cases
    ``substitute`` is forwarded unchanged as the replacement text.
    '''
    # Choose the sanitizer from the input type, then make a single call.
    if isbytestring(name):
        sanitizer = sanitize_file_name
    else:
        sanitizer = sanitize_file_name_unicode
    return sanitizer(name, substitute=substitute)
def prints(*args, **kwargs): def prints(*args, **kwargs):
''' '''

View File

@ -1,14 +1,13 @@
""" """
Read and write ZIP files. Modified by Kovid Goyal to support replacing files in Read and write ZIP files. Modified by Kovid Goyal to support replacing files in
a zip archive. a zip archive, detecting filename encoding, updating zip files, etc.
""" """
from __future__ import with_statement import struct, os, time, sys, shutil, stat
import struct, os, time, sys, shutil
import binascii, cStringIO import binascii, cStringIO
from contextlib import closing from contextlib import closing
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from calibre import sanitize_file_name from calibre import sanitize_file_name2
from calibre.constants import filesystem_encoding from calibre.constants import filesystem_encoding
from calibre.ebooks.chardet import detect from calibre.ebooks.chardet import detect
@ -34,7 +33,7 @@ class LargeZipFile(Exception):
error = BadZipfile # The exception raised by this module error = BadZipfile # The exception raised by this module
ZIP64_LIMIT= (1 << 31) - 1 ZIP64_LIMIT = (1 << 31) - 1
ZIP_FILECOUNT_LIMIT = 1 << 16 ZIP_FILECOUNT_LIMIT = 1 << 16
ZIP_MAX_COMMENT = (1 << 16) - 1 ZIP_MAX_COMMENT = (1 << 16) - 1
@ -150,23 +149,41 @@ def decode_arcname(name):
return name return name
def is_zipfile(filename): def _check_zipfile(fp):
"""Quickly see if file is a ZIP file by checking the magic number."""
try: try:
fpin = open(filename, "rb") if _EndRecData(fp):
endrec = _EndRecData(fpin) return True # file has correct magic number
fpin.close()
if endrec:
return True # file has correct magic number
except IOError: except IOError:
pass pass
return False return False
def is_zipfile(filename):
    """Quickly report whether a file is a ZIP file (magic number check).

    ``filename`` may be a path, or a file / file-like object. Anything with
    a ``read`` attribute is treated as an already-open file and is passed
    straight to ``_check_zipfile``; it is not closed by this function.
    Otherwise ``filename`` is opened in binary mode for the duration of the
    check.

    Returns the result of ``_check_zipfile``, or False if an IOError is
    raised while opening or checking the file.
    """
    is_zip = False
    try:
        if not hasattr(filename, "read"):
            # A path: open it ourselves so it is closed again afterwards.
            with open(filename, "rb") as stream:
                is_zip = _check_zipfile(stream)
        else:
            is_zip = _check_zipfile(filename)
    except IOError:
        # An unreadable or missing file is simply "not a zip file".
        pass
    return is_zip
def _EndRecData64(fpin, offset, endrec): def _EndRecData64(fpin, offset, endrec):
""" """
Read the ZIP64 end-of-archive records and use that to update endrec Read the ZIP64 end-of-archive records and use that to update endrec
""" """
fpin.seek(offset - sizeEndCentDir64Locator, 2) try:
fpin.seek(offset - sizeEndCentDir64Locator, 2)
except IOError:
# If the seek fails, the file is not large enough to contain a ZIP64
# end-of-archive record, so just return the end record we were given.
return endrec
data = fpin.read(sizeEndCentDir64Locator) data = fpin.read(sizeEndCentDir64Locator)
sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)
if sig != stringEndArchive64Locator: if sig != stringEndArchive64Locator:
@ -185,6 +202,7 @@ def _EndRecData64(fpin, offset, endrec):
return endrec return endrec
# Update the original endrec using data from the ZIP64 record # Update the original endrec using data from the ZIP64 record
endrec[_ECD_SIGNATURE] = sig
endrec[_ECD_DISK_NUMBER] = disk_num endrec[_ECD_DISK_NUMBER] = disk_num
endrec[_ECD_DISK_START] = disk_dir endrec[_ECD_DISK_START] = disk_dir
endrec[_ECD_ENTRIES_THIS_DISK] = dircount endrec[_ECD_ENTRIES_THIS_DISK] = dircount
@ -207,7 +225,10 @@ def _EndRecData(fpin):
# Check to see if this is ZIP file with no archive comment (the # Check to see if this is ZIP file with no archive comment (the
# "end of central directory" structure should be the last item in the # "end of central directory" structure should be the last item in the
# file if this is the case). # file if this is the case).
fpin.seek(-sizeEndCentDir, 2) try:
fpin.seek(-sizeEndCentDir, 2)
except IOError:
return None
data = fpin.read() data = fpin.read()
if data[0:4] == stringEndArchive and data[-2:] == "\000\000": if data[0:4] == stringEndArchive and data[-2:] == "\000\000":
# the signature is correct and there's no comment, unpack structure # the signature is correct and there's no comment, unpack structure
@ -217,13 +238,9 @@ def _EndRecData(fpin):
# Append a blank comment and record start offset # Append a blank comment and record start offset
endrec.append("") endrec.append("")
endrec.append(filesize - sizeEndCentDir) endrec.append(filesize - sizeEndCentDir)
if endrec[_ECD_OFFSET] == 0xffffffff:
# the value for the "offset of the start of the central directory"
# indicates that there is a "Zip64 end of central directory"
# structure present, so go look for it
return _EndRecData64(fpin, -sizeEndCentDir, endrec)
return endrec # Try to read the "Zip64 end of central directory" structure
return _EndRecData64(fpin, -sizeEndCentDir, endrec)
# Either this is not a ZIP file, or it is a ZIP file with an archive # Either this is not a ZIP file, or it is a ZIP file with an archive
# comment. Search the end of the file for the "end of central directory" # comment. Search the end of the file for the "end of central directory"
@ -245,11 +262,10 @@ def _EndRecData(fpin):
# Append the archive comment and start offset # Append the archive comment and start offset
endrec.append(comment) endrec.append(comment)
endrec.append(maxCommentStart + start) endrec.append(maxCommentStart + start)
if endrec[_ECD_OFFSET] == 0xffffffff:
# There is apparently a "Zip64 end of central directory" # Try to read the "Zip64 end of central directory" structure
# structure present, so go look for it return _EndRecData64(fpin, maxCommentStart + start - filesize,
return _EndRecData64(fpin, start - filesize, endrec) endrec)
return endrec
# Unable to find a valid end of central directory structure # Unable to find a valid end of central directory structure
return return
@ -733,21 +749,35 @@ class ZipFile:
if key == 'r': if key == 'r':
self._GetContents() self._GetContents()
elif key == 'w': elif key == 'w':
pass # set the modified flag so central directory gets written
# even if no files are added to the archive
self._didModify = True
elif key == 'a': elif key == 'a':
try: # See if file is a zip file try:
# See if file is a zip file
self._RealGetContents() self._RealGetContents()
self._calculate_file_offsets() self._calculate_file_offsets()
# seek to start of directory and overwrite # seek to start of directory and overwrite
self.fp.seek(self.start_dir, 0) self.fp.seek(self.start_dir, 0)
except BadZipfile: # file is not a zip file, just append except BadZipfile:
# file is not a zip file, just append
self.fp.seek(0, 2) self.fp.seek(0, 2)
# set the modified flag so central directory gets written
# even if no files are added to the archive
self._didModify = True
else: else:
if not self._filePassed: if not self._filePassed:
self.fp.close() self.fp.close()
self.fp = None self.fp = None
raise RuntimeError, 'Mode must be "r", "w" or "a"' raise RuntimeError, 'Mode must be "r", "w" or "a"'
def __enter__(self):
    """Enter a ``with`` block; the object bound by ``as`` is this instance."""
    return self
def __exit__(self, type, value, traceback):
    """Leave a ``with`` block by calling ``self.close()``.

    The exception details (``type``, ``value``, ``traceback``) are ignored
    and nothing is returned, so an exception raised inside the block is
    not suppressed.
    """
    self.close()
def _GetContents(self): def _GetContents(self):
"""Read the directory, making sure we close the file if the format """Read the directory, making sure we close the file if the format
is bad.""" is bad."""
@ -762,9 +792,12 @@ class ZipFile:
def _RealGetContents(self): def _RealGetContents(self):
"""Read in the table of contents for the ZIP file.""" """Read in the table of contents for the ZIP file."""
fp = self.fp fp = self.fp
endrec = _EndRecData(fp) try:
endrec = _EndRecData(fp)
except IOError:
raise BadZipfile("File is not a zip file")
if not endrec: if not endrec:
raise BadZipfile, "File is not a zip file" raise BadZipfile("File is not a zip file")
if self.debug > 1: if self.debug > 1:
print endrec print endrec
size_cd = endrec[_ECD_SIZE] # bytes in central directory size_cd = endrec[_ECD_SIZE] # bytes in central directory
@ -773,9 +806,8 @@ class ZipFile:
# "concat" is zero, unless zip was concatenated to another file # "concat" is zero, unless zip was concatenated to another file
concat = endrec[_ECD_LOCATION] - size_cd - offset_cd concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
if endrec[_ECD_LOCATION] > ZIP64_LIMIT: if endrec[_ECD_SIGNATURE] == stringEndArchive64:
# If the offset of the "End of Central Dir" record requires Zip64 # If Zip64 extension structures are present, account for them
# extension structures, account for them
concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
if self.debug > 2: if self.debug > 2:
@ -918,9 +950,14 @@ class ZipFile:
def testzip(self): def testzip(self):
"""Read all the files and check the CRC.""" """Read all the files and check the CRC."""
chunk_size = 2 ** 20
for zinfo in self.filelist: for zinfo in self.filelist:
try: try:
self.read(zinfo.filename) # Check CRC-32 # Read by chunks, to avoid an OverflowError or a
# MemoryError with very large embedded files.
f = self.open(zinfo.filename, "r")
while f.read(chunk_size): # Check CRC-32
pass
except BadZipfile: except BadZipfile:
return zinfo.filename return zinfo.filename
@ -982,9 +1019,9 @@ class ZipFile:
zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])
if fname != zinfo.orig_filename: if fname != zinfo.orig_filename:
print ('WARNING: Header (%r) and directory (%r) filenames do not' print (('WARNING: Header (%r) and directory (%r) filenames do not'
' match inside ZipFile')%(fname, zinfo.orig_filename) ' match inside ZipFile')%(fname, zinfo.orig_filename))
print 'Using directory filename %r'%zinfo.orig_filename print ('Using directory filename %r'%zinfo.orig_filename)
#raise BadZipfile, \ #raise BadZipfile, \
# 'File name in directory "%r" and header "%r" differ.' % ( # 'File name in directory "%r" and header "%r" differ.' % (
# zinfo.orig_filename, fname) # zinfo.orig_filename, fname)
@ -1059,13 +1096,13 @@ class ZipFile:
""" """
# build the destination pathname, replacing # build the destination pathname, replacing
# forward slashes to platform specific separators. # forward slashes to platform specific separators.
if targetpath[-1:] == "/": # Strip trailing path separator, unless it represents the root.
if (targetpath[-1:] in (os.path.sep, os.path.altsep)
and len(os.path.splitdrive(targetpath)[1]) > 1):
targetpath = targetpath[:-1] targetpath = targetpath[:-1]
# don't include leading "/" from file name if present # don't include leading "/" from file name if present
fname = member.filename fname = member.filename
if isinstance(fname, unicode):
fname = fname.encode(filesystem_encoding, 'replace')
if fname.startswith('/'): if fname.startswith('/'):
fname = fname[1:] fname = fname[1:]
targetpath = os.path.join(targetpath, fname) targetpath = os.path.join(targetpath, fname)
@ -1074,13 +1111,6 @@ class ZipFile:
# Create all upper directories if necessary. # Create all upper directories if necessary.
upperdirs = os.path.dirname(targetpath) upperdirs = os.path.dirname(targetpath)
while upperdirs:
if os.path.exists(upperdirs):
if os.path.isdir(upperdirs):
break
os.remove(upperdirs)
upperdirs = os.path.dirname(upperdirs)
upperdirs = os.path.dirname(targetpath)
if upperdirs and not os.path.exists(upperdirs): if upperdirs and not os.path.exists(upperdirs):
os.makedirs(upperdirs) os.makedirs(upperdirs)
@ -1090,8 +1120,9 @@ class ZipFile:
with open(targetpath, 'wb') as target: with open(targetpath, 'wb') as target:
shutil.copyfileobj(source, target) shutil.copyfileobj(source, target)
except: except:
# Try sanitizing the file name to remove invalid characters
components = list(os.path.split(targetpath)) components = list(os.path.split(targetpath))
components[-1] = sanitize_file_name(components[-1]) components[-1] = sanitize_file_name2(components[-1])
targetpath = os.sep.join(components) targetpath = os.sep.join(components)
with open(targetpath, 'wb') as target: with open(targetpath, 'wb') as target:
shutil.copyfileobj(source, target) shutil.copyfileobj(source, target)
@ -1129,6 +1160,7 @@ class ZipFile:
"Attempt to write to ZIP archive that was already closed") "Attempt to write to ZIP archive that was already closed")
st = os.stat(filename) st = os.stat(filename)
isdir = stat.S_ISDIR(st.st_mode)
mtime = time.localtime(st.st_mtime) mtime = time.localtime(st.st_mtime)
date_time = mtime[0:6] date_time = mtime[0:6]
# Create ZipInfo instance to store file information # Create ZipInfo instance to store file information
@ -1139,6 +1171,8 @@ class ZipFile:
arcname = arcname[1:] arcname = arcname[1:]
if not isinstance(arcname, unicode): if not isinstance(arcname, unicode):
arcname = arcname.decode(filesystem_encoding) arcname = arcname.decode(filesystem_encoding)
if isdir and not arcname.endswith('/'):
arcname += '/'
zinfo = ZipInfo(arcname, date_time) zinfo = ZipInfo(arcname, date_time)
zinfo.external_attr = (st[0] & 0xFFFF) << 16L # Unix attributes zinfo.external_attr = (st[0] & 0xFFFF) << 16L # Unix attributes
if compress_type is None: if compress_type is None:
@ -1152,6 +1186,16 @@ class ZipFile:
self._writecheck(zinfo) self._writecheck(zinfo)
self._didModify = True self._didModify = True
if isdir:
zinfo.file_size = 0
zinfo.compress_size = 0
zinfo.CRC = 0
self.filelist.append(zinfo)
self.NameToInfo[zinfo.filename] = zinfo
self.fp.write(zinfo.FileHeader())
return
with open(filename, "rb") as fp: with open(filename, "rb") as fp:
# Must overwrite CRC and sizes with correct data later # Must overwrite CRC and sizes with correct data later
zinfo.CRC = CRC = 0 zinfo.CRC = CRC = 0
@ -1261,12 +1305,6 @@ class ZipFile:
"""Call the "close()" method in case the user forgot.""" """Call the "close()" method in case the user forgot."""
self.close() self.close()
def __enter__(self):
return self
def __exit__(self, typ, value, traceback):
self.close()
def close(self): def close(self):
"""Close the file, and for mode "w" and "a" write the ending """Close the file, and for mode "w" and "a" write the ending
records.""" records."""
@ -1338,19 +1376,26 @@ class ZipFile:
pos2 = self.fp.tell() pos2 = self.fp.tell()
# Write end-of-zip-archive record # Write end-of-zip-archive record
centDirCount = count
centDirSize = pos2 - pos1
centDirOffset = pos1 centDirOffset = pos1
if pos1 > ZIP64_LIMIT: if (centDirCount >= ZIP_FILECOUNT_LIMIT or
centDirOffset > ZIP64_LIMIT or
centDirSize > ZIP64_LIMIT):
# Need to write the ZIP64 end-of-archive records # Need to write the ZIP64 end-of-archive records
zip64endrec = struct.pack( zip64endrec = struct.pack(
structEndArchive64, stringEndArchive64, structEndArchive64, stringEndArchive64,
44, 45, 45, 0, 0, count, count, pos2 - pos1, pos1) 44, 45, 45, 0, 0, centDirCount, centDirCount,
centDirSize, centDirOffset)
self.fp.write(zip64endrec) self.fp.write(zip64endrec)
zip64locrec = struct.pack( zip64locrec = struct.pack(
structEndArchive64Locator, structEndArchive64Locator,
stringEndArchive64Locator, 0, pos2, 1) stringEndArchive64Locator, 0, pos2, 1)
self.fp.write(zip64locrec) self.fp.write(zip64locrec)
centDirOffset = 0xFFFFFFFF centDirCount = min(centDirCount, 0xFFFF)
centDirSize = min(centDirSize, 0xFFFFFFFF)
centDirOffset = min(centDirOffset, 0xFFFFFFFF)
# check for valid comment length # check for valid comment length
if len(self.comment) >= ZIP_MAX_COMMENT: if len(self.comment) >= ZIP_MAX_COMMENT:
@ -1361,9 +1406,8 @@ class ZipFile:
self.comment = self.comment[:ZIP_MAX_COMMENT] self.comment = self.comment[:ZIP_MAX_COMMENT]
endrec = struct.pack(structEndArchive, stringEndArchive, endrec = struct.pack(structEndArchive, stringEndArchive,
0, 0, count % ZIP_FILECOUNT_LIMIT, 0, 0, centDirCount, centDirCount,
count % ZIP_FILECOUNT_LIMIT, pos2 - pos1, centDirSize, centDirOffset, len(self.comment))
centDirOffset, len(self.comment))
self.fp.write(endrec) self.fp.write(endrec)
self.fp.write(self.comment) self.fp.write(self.comment)
self.fp.flush() self.fp.flush()
@ -1544,7 +1588,9 @@ def main(args = None):
print USAGE print USAGE
sys.exit(1) sys.exit(1)
zf = ZipFile(args[1], 'r') zf = ZipFile(args[1], 'r')
zf.testzip() badfile = zf.testzip()
if badfile:
print("The following enclosed file is corrupted: {!r}".format(badfile))
print "Done testing" print "Done testing"
elif args[0] == '-e': elif args[0] == '-e':
@ -1563,9 +1609,8 @@ def main(args = None):
tgtdir = os.path.dirname(tgt) tgtdir = os.path.dirname(tgt)
if not os.path.exists(tgtdir): if not os.path.exists(tgtdir):
os.makedirs(tgtdir) os.makedirs(tgtdir)
fp = open(tgt, 'wb') with open(tgt, 'wb') as fp:
fp.write(zf.read(path)) fp.write(zf.read(path))
fp.close()
zf.close() zf.close()
elif args[0] == '-c': elif args[0] == '-c':