From 24663f78537dd4909ebf47b0e8af4094bcc142a9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 23 Mar 2011 10:29:22 -0600
Subject: [PATCH] Update calibre zipfile module with the patches from python
 trunk

---
 src/calibre/__init__.py      |  10 +-
 src/calibre/utils/zipfile.py | 173 ++++++++++++++++++++++-------------
 2 files changed, 118 insertions(+), 65 deletions(-)

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index ab578d8ae6..4150b4b339 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -99,7 +99,7 @@ def sanitize_file_name_unicode(name, substitute='_'):
     **WARNING:** This function also replaces path separators, so only pass file names
     and not full paths to it.
     '''
-    if not isinstance(name, unicode):
+    if isbytestring(name):
         return sanitize_file_name(name, substitute=substitute, as_unicode=True)
     chars = [substitute if c in _filename_sanitize_unicode else c for c in
             name]
@@ -115,6 +115,14 @@ def sanitize_file_name_unicode(name, substitute='_'):
         one = '_' + one[1:]
     return one
 
+def sanitize_file_name2(name, substitute='_'):
+    '''
+    Sanitize filenames removing invalid chars. Keeps unicode names as unicode
+    and bytestrings as bytestrings
+    '''
+    if isbytestring(name):
+        return sanitize_file_name(name, substitute=substitute)
+    return sanitize_file_name_unicode(name, substitute=substitute)
 
 def prints(*args, **kwargs):
     '''
diff --git a/src/calibre/utils/zipfile.py b/src/calibre/utils/zipfile.py
index c230b9dfa7..c279825fd2 100644
--- a/src/calibre/utils/zipfile.py
+++ b/src/calibre/utils/zipfile.py
@@ -1,14 +1,13 @@
 """
 Read and write ZIP files. Modified by Kovid Goyal to support replacing files in
-a zip archive.
+a zip archive, detecting filename encoding, updating zip files, etc.
 """
-from __future__ import with_statement
-import struct, os, time, sys, shutil
+import struct, os, time, sys, shutil, stat
 import binascii, cStringIO
 from contextlib import closing
 from tempfile import SpooledTemporaryFile
 
-from calibre import sanitize_file_name
+from calibre import sanitize_file_name2
 from calibre.constants import filesystem_encoding
 from calibre.ebooks.chardet import detect
 
@@ -34,7 +33,7 @@ class LargeZipFile(Exception):
 
 error = BadZipfile      # The exception raised by this module
 
-ZIP64_LIMIT= (1 << 31) - 1
+ZIP64_LIMIT = (1 << 31) - 1
 ZIP_FILECOUNT_LIMIT = 1 << 16
 ZIP_MAX_COMMENT = (1 << 16) - 1
 
@@ -150,23 +149,41 @@ def decode_arcname(name):
     return name
 
 
-def is_zipfile(filename):
-    """Quickly see if file is a ZIP file by checking the magic number."""
+def _check_zipfile(fp):
     try:
-        fpin = open(filename, "rb")
-        endrec = _EndRecData(fpin)
-        fpin.close()
-        if endrec:
-            return True                 # file has correct magic number
+        if _EndRecData(fp):
+            return True         # file has correct magic number
     except IOError:
         pass
     return False
 
+def is_zipfile(filename):
+    """Quickly see if a file is a ZIP file by checking the magic number.
+
+    The filename argument may be a file or file-like object too.
+    """
+    result = False
+    try:
+        if hasattr(filename, "read"):
+            result = _check_zipfile(filename)
+        else:
+            with open(filename, "rb") as fp:
+                result = _check_zipfile(fp)
+    except IOError:
+        pass
+    return result
+
 def _EndRecData64(fpin, offset, endrec):
     """
     Read the ZIP64 end-of-archive records and use that to update endrec
     """
-    fpin.seek(offset - sizeEndCentDir64Locator, 2)
+    try:
+        fpin.seek(offset - sizeEndCentDir64Locator, 2)
+    except IOError:
+        # If the seek fails, the file is not large enough to contain a ZIP64
+        # end-of-archive record, so just return the end record we were given.
+        return endrec
+
     data = fpin.read(sizeEndCentDir64Locator)
     sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)
     if sig != stringEndArchive64Locator:
@@ -185,6 +202,7 @@ def _EndRecData64(fpin, offset, endrec):
         return endrec
 
     # Update the original endrec using data from the ZIP64 record
+    endrec[_ECD_SIGNATURE] = sig
     endrec[_ECD_DISK_NUMBER] = disk_num
     endrec[_ECD_DISK_START] = disk_dir
     endrec[_ECD_ENTRIES_THIS_DISK] = dircount
@@ -207,7 +225,10 @@ def _EndRecData(fpin):
     # Check to see if this is ZIP file with no archive comment (the
     # "end of central directory" structure should be the last item in the
     # file if this is the case).
-    fpin.seek(-sizeEndCentDir, 2)
+    try:
+        fpin.seek(-sizeEndCentDir, 2)
+    except IOError:
+        return None
     data = fpin.read()
     if data[0:4] == stringEndArchive and data[-2:] == "\000\000":
         # the signature is correct and there's no comment, unpack structure
@@ -217,13 +238,9 @@ def _EndRecData(fpin):
         # Append a blank comment and record start offset
         endrec.append("")
         endrec.append(filesize - sizeEndCentDir)
-        if endrec[_ECD_OFFSET] == 0xffffffff:
-            # the value for the "offset of the start of the central directory"
-            # indicates that there is a "Zip64 end of central directory"
-            # structure present, so go look for it
-            return _EndRecData64(fpin, -sizeEndCentDir, endrec)
 
-        return endrec
+        # Try to read the "Zip64 end of central directory" structure
+        return _EndRecData64(fpin, -sizeEndCentDir, endrec)
 
     # Either this is not a ZIP file, or it is a ZIP file with an archive
     # comment.  Search the end of the file for the "end of central directory"
@@ -245,11 +262,10 @@ def _EndRecData(fpin):
             # Append the archive comment and start offset
             endrec.append(comment)
             endrec.append(maxCommentStart + start)
-            if endrec[_ECD_OFFSET] == 0xffffffff:
-                # There is apparently a "Zip64 end of central directory"
-                # structure present, so go look for it
-                return _EndRecData64(fpin, start - filesize, endrec)
-            return endrec
+
+            # Try to read the "Zip64 end of central directory" structure
+            return _EndRecData64(fpin, maxCommentStart + start - filesize,
+                                 endrec)
 
     # Unable to find a valid end of central directory structure
     return
@@ -733,21 +749,35 @@ class ZipFile:
         if key == 'r':
             self._GetContents()
         elif key == 'w':
-            pass
+            # set the modified flag so central directory gets written
+            # even if no files are added to the archive
+            self._didModify = True
         elif key == 'a':
-            try:                        # See if file is a zip file
+            try:
+                # See if file is a zip file
                 self._RealGetContents()
                 self._calculate_file_offsets()
                 # seek to start of directory and overwrite
                 self.fp.seek(self.start_dir, 0)
-            except BadZipfile:          # file is not a zip file, just append
+            except BadZipfile:
+                # file is not a zip file, just append
                 self.fp.seek(0, 2)
+
+                # set the modified flag so central directory gets written
+                # even if no files are added to the archive
+                self._didModify = True
         else:
             if not self._filePassed:
                 self.fp.close()
                 self.fp = None
             raise RuntimeError, 'Mode must be "r", "w" or "a"'
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.close()
+
     def _GetContents(self):
         """Read the directory, making sure we close the file if the format
         is bad."""
@@ -762,9 +792,12 @@ class ZipFile:
     def _RealGetContents(self):
         """Read in the table of contents for the ZIP file."""
         fp = self.fp
-        endrec = _EndRecData(fp)
+        try:
+            endrec = _EndRecData(fp)
+        except IOError:
+            raise BadZipfile("File is not a zip file")
         if not endrec:
-            raise BadZipfile, "File is not a zip file"
+            raise BadZipfile("File is not a zip file")
         if self.debug > 1:
             print endrec
         size_cd = endrec[_ECD_SIZE]             # bytes in central directory
@@ -773,9 +806,8 @@ class ZipFile:
 
         # "concat" is zero, unless zip was concatenated to another file
         concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
-        if endrec[_ECD_LOCATION] > ZIP64_LIMIT:
-            # If the offset of the "End of Central Dir" record requires Zip64
-            # extension structures, account for them
+        if endrec[_ECD_SIGNATURE] == stringEndArchive64:
+            # If Zip64 extension structures are present, account for them
             concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
 
         if self.debug > 2:
@@ -918,9 +950,14 @@ class ZipFile:
 
     def testzip(self):
         """Read all the files and check the CRC."""
+        chunk_size = 2 ** 20
         for zinfo in self.filelist:
             try:
-                self.read(zinfo.filename)       # Check CRC-32
+                # Read by chunks, to avoid an OverflowError or a
+                # MemoryError with very large embedded files.
+                f = self.open(zinfo.filename, "r")
+                while f.read(chunk_size):     # Check CRC-32
+                    pass
             except BadZipfile:
                 return zinfo.filename
 
@@ -982,9 +1019,9 @@ class ZipFile:
             zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])
 
         if fname != zinfo.orig_filename:
-            print ('WARNING: Header (%r) and directory (%r) filenames do not'
-                    ' match inside ZipFile')%(fname, zinfo.orig_filename)
-            print 'Using directory filename %r'%zinfo.orig_filename
+            print (('WARNING: Header (%r) and directory (%r) filenames do not'
+                    ' match inside ZipFile')%(fname, zinfo.orig_filename))
+            print ('Using directory filename %r'%zinfo.orig_filename)
             #raise BadZipfile, \
             #          'File name in directory "%r" and header "%r" differ.' % (
             #              zinfo.orig_filename, fname)
@@ -1059,13 +1096,13 @@ class ZipFile:
         """
         # build the destination pathname, replacing
         # forward slashes to platform specific separators.
-        if targetpath[-1:] == "/":
+        # Strip trailing path separator, unless it represents the root.
+        if (targetpath[-1:] in (os.path.sep, os.path.altsep)
+            and len(os.path.splitdrive(targetpath)[1]) > 1):
             targetpath = targetpath[:-1]
 
         # don't include leading "/" from file name if present
         fname = member.filename
-        if isinstance(fname, unicode):
-            fname = fname.encode(filesystem_encoding, 'replace')
         if fname.startswith('/'):
             fname = fname[1:]
         targetpath = os.path.join(targetpath, fname)
@@ -1074,13 +1111,6 @@ class ZipFile:
 
         # Create all upper directories if necessary.
         upperdirs = os.path.dirname(targetpath)
-        while upperdirs:
-            if os.path.exists(upperdirs):
-                if os.path.isdir(upperdirs):
-                    break
-                os.remove(upperdirs)
-            upperdirs = os.path.dirname(upperdirs)
-        upperdirs = os.path.dirname(targetpath)
         if upperdirs and not os.path.exists(upperdirs):
             os.makedirs(upperdirs)
 
@@ -1090,8 +1120,9 @@ class ZipFile:
                     with open(targetpath, 'wb') as target:
                         shutil.copyfileobj(source, target)
                 except:
+                    # Try sanitizing the file name to remove invalid characters
                     components = list(os.path.split(targetpath))
-                    components[-1] = sanitize_file_name(components[-1])
+                    components[-1] = sanitize_file_name2(components[-1])
                     targetpath = os.sep.join(components)
                     with open(targetpath, 'wb') as target:
                         shutil.copyfileobj(source, target)
@@ -1129,6 +1160,7 @@ class ZipFile:
                   "Attempt to write to ZIP archive that was already closed")
 
         st = os.stat(filename)
+        isdir = stat.S_ISDIR(st.st_mode)
         mtime = time.localtime(st.st_mtime)
         date_time = mtime[0:6]
         # Create ZipInfo instance to store file information
@@ -1139,6 +1171,8 @@ class ZipFile:
             arcname = arcname[1:]
         if not isinstance(arcname, unicode):
             arcname = arcname.decode(filesystem_encoding)
+        if isdir and not arcname.endswith('/'):
+            arcname += '/'
         zinfo = ZipInfo(arcname, date_time)
         zinfo.external_attr = (st[0] & 0xFFFF) << 16L      # Unix attributes
         if compress_type is None:
@@ -1152,6 +1186,16 @@ class ZipFile:
 
         self._writecheck(zinfo)
         self._didModify = True
+
+        if isdir:
+            zinfo.file_size = 0
+            zinfo.compress_size = 0
+            zinfo.CRC = 0
+            self.filelist.append(zinfo)
+            self.NameToInfo[zinfo.filename] = zinfo
+            self.fp.write(zinfo.FileHeader())
+            return
+
         with open(filename, "rb") as fp:
             # Must overwrite CRC and sizes with correct data later
             zinfo.CRC = CRC = 0
@@ -1261,12 +1305,6 @@ class ZipFile:
         """Call the "close()" method in case the user forgot."""
         self.close()
 
-    def __enter__(self):
-        return self
-
-    def __exit__(self, typ, value, traceback):
-        self.close()
-
     def close(self):
         """Close the file, and for mode "w" and "a" write the ending
         records."""
@@ -1338,19 +1376,26 @@ class ZipFile:
 
             pos2 = self.fp.tell()
             # Write end-of-zip-archive record
+            centDirCount = count
+            centDirSize = pos2 - pos1
             centDirOffset = pos1
-            if pos1 > ZIP64_LIMIT:
+            if (centDirCount >= ZIP_FILECOUNT_LIMIT or
+                centDirOffset > ZIP64_LIMIT or
+                centDirSize > ZIP64_LIMIT):
                 # Need to write the ZIP64 end-of-archive records
                 zip64endrec = struct.pack(
                         structEndArchive64, stringEndArchive64,
-                        44, 45, 45, 0, 0, count, count, pos2 - pos1, pos1)
+                        44, 45, 45, 0, 0, centDirCount, centDirCount,
+                        centDirSize, centDirOffset)
                 self.fp.write(zip64endrec)
 
                 zip64locrec = struct.pack(
                         structEndArchive64Locator,
                         stringEndArchive64Locator, 0, pos2, 1)
                 self.fp.write(zip64locrec)
-                centDirOffset = 0xFFFFFFFF
+                centDirCount = min(centDirCount, 0xFFFF)
+                centDirSize = min(centDirSize, 0xFFFFFFFF)
+                centDirOffset = min(centDirOffset, 0xFFFFFFFF)
 
             # check for valid comment length
             if len(self.comment) >= ZIP_MAX_COMMENT:
@@ -1361,9 +1406,8 @@ class ZipFile:
                 self.comment = self.comment[:ZIP_MAX_COMMENT]
 
             endrec = struct.pack(structEndArchive, stringEndArchive,
-                                 0, 0, count % ZIP_FILECOUNT_LIMIT,
-                                 count % ZIP_FILECOUNT_LIMIT, pos2 - pos1,
-                                 centDirOffset, len(self.comment))
+                                 0, 0, centDirCount, centDirCount,
+                                 centDirSize, centDirOffset, len(self.comment))
             self.fp.write(endrec)
             self.fp.write(self.comment)
             self.fp.flush()
@@ -1544,7 +1588,9 @@ def main(args = None):
             print USAGE
             sys.exit(1)
         zf = ZipFile(args[1], 'r')
-        zf.testzip()
+        badfile = zf.testzip()
+        if badfile:
+            print("The following enclosed file is corrupted: {!r}".format(badfile))
         print "Done testing"
 
     elif args[0] == '-e':
@@ -1563,9 +1609,8 @@ def main(args = None):
             tgtdir = os.path.dirname(tgt)
             if not os.path.exists(tgtdir):
                 os.makedirs(tgtdir)
-            fp = open(tgt, 'wb')
-            fp.write(zf.read(path))
-            fp.close()
+            with open(tgt, 'wb') as fp:
+                fp.write(zf.read(path))
         zf.close()
 
     elif args[0] == '-c':