From d42c3031ae69eaff839a44279508621d475d0c66 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 28 Jul 2008 15:03:28 -0700 Subject: [PATCH] Add support for writing metadata to epub-meta --- src/calibre/ebooks/metadata/epub.py | 55 +- src/calibre/ebooks/metadata/meta.py | 3 + src/calibre/linux.py | 3 +- src/calibre/utils/zipfile.py | 914 +++++++++++++++++++++------- 4 files changed, 747 insertions(+), 228 deletions(-) diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py index 6f8f211552..44eaedaf26 100644 --- a/src/calibre/ebooks/metadata/epub.py +++ b/src/calibre/ebooks/metadata/epub.py @@ -7,13 +7,13 @@ __copyright__ = '2008, Kovid Goyal ' import sys, os -from zipfile import ZipFile, BadZipfile +from calibre.utils.zipfile import ZipFile, BadZipfile from cStringIO import StringIO from contextlib import closing from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup -from calibre.ebooks.metadata.opf import OPF, OPFReader - +from calibre.ebooks.metadata.opf import OPF, OPFReader, OPFCreator +from calibre.ebooks.metadata import get_parser, MetaInformation class EPubException(Exception): pass @@ -71,9 +71,9 @@ class OCFReader(OCF): raise EPubException("missing OPF package file") class OCFZipReader(OCFReader): - def __init__(self, stream): + def __init__(self, stream, mode='r'): try: - self.archive = ZipFile(stream, 'r') + self.archive = ZipFile(stream, mode) except BadZipfile: raise EPubException("not a ZIP .epub OCF container") self.root = getattr(stream, 'name', os.getcwd()) @@ -81,6 +81,19 @@ class OCFZipReader(OCFReader): def open(self, name, mode='r'): return StringIO(self.archive.read(name)) + +class OCFZipWriter(OCFZipReader): + + def __init__(self, stream): + OCFZipReader.__init__(self, stream, mode='a') + + def set_metadata(self, mi): + name = self.container[OPF.MIMETYPE] + stream = StringIO() + opf = OPFCreator(self.root, mi) + opf.render(stream) + self.archive.delete(name) + self.archive.writestr(name, stream.getvalue()) class OCFDirReader(OCFReader): def __init__(self, path): @@ -95,13 +108,35 @@ def get_metadata(stream): """ Return metadata as a L{MetaInfo} object """ return OCFZipReader(stream).opf +def set_metadata(stream, mi): + OCFZipWriter(stream).set_metadata(mi) + +def option_parser(): + parser = get_parser('epub') + parser.remove_option('--category') + parser.add_option('--tags', default=None, help=_('A comma separated list of tags to set')) + return parser + def main(args=sys.argv): - if len(args) != 2 or '--help' in args or '-h' in args: - print >>sys.stderr, _('Usage:'), args[0], _('mybook.epub') + parser = option_parser() + opts, args = parser.parse_args(args) + if len(args) != 2: + parser.print_help() return 1 - - path = os.path.abspath(os.path.expanduser(args[1])) - print unicode(get_metadata(open(path, 'rb'))) + stream = open(args[1], 'r+b') + mi = MetaInformation(OCFZipReader(stream).opf) + if opts.title: + mi.title = opts.title + if opts.authors: + mi.authors = opts.authors.split(',') + if opts.tags: + mi.tags = opts.tags.split(',') + if opts.comment: + mi.comments = opts.comment + + set_metadata(stream, mi) + + print unicode(mi) return 0 if __name__ == '__main__': diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index f05a31654e..15db121cb0 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -14,6 +14,7 @@ from calibre.ebooks.mobi.reader import get_metadata as mobi_metadata from calibre.ebooks.metadata.opf import OPFReader from calibre.ebooks.metadata.rtf import set_metadata as set_rtf_metadata from calibre.ebooks.lrf.meta import set_metadata as set_lrf_metadata +from calibre.ebooks.metadata.epub import set_metadata as set_epub_metadata from calibre.ebooks.metadata import MetaInformation @@ -88,6 +89,8 @@ def set_metadata(stream, mi, stream_type='lrf'): if stream_type: stream_type = stream_type.lower() if stream_type == 'lrf': set_lrf_metadata(stream, mi) + elif stream_type == 'epub': + set_epub_metadata(stream, mi) elif stream_type == 'rtf': set_rtf_metadata(stream, mi) diff --git a/src/calibre/linux.py b/src/calibre/linux.py index eaaeabcb4c..1cb0a1ff81 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -165,6 +165,7 @@ def setup_completion(fatal_errors): from calibre.web.feeds.main import option_parser as feeds2disk from calibre.web.feeds.recipes import titles as feed_titles from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf + from calibre.ebooks.metadata.epub import option_parser as epub_meta f = open_file('/etc/bash_completion.d/libprs500') f.close() @@ -192,7 +193,7 @@ def setup_completion(fatal_errors): f.write(opts_and_exts('pdf-meta', metaop, ['pdf'])) f.write(opts_and_exts('lit-meta', metaop, ['lit'])) f.write(opts_and_exts('opf-meta', metaop, ['opf'])) - f.write(opts_and_exts('epub-meta', metaop, ['epub'])) + f.write(opts_and_exts('epub-meta', epub_meta, ['epub'])) f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf'])) f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf'])) f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc'])) diff --git a/src/calibre/utils/zipfile.py b/src/calibre/utils/zipfile.py index 56fca9349b..02f7eac5c4 100644 --- a/src/calibre/utils/zipfile.py +++ b/src/calibre/utils/zipfile.py @@ -1,14 +1,16 @@ """ -Read and write ZIP files. Modified by Kovid Goyal to allow replacing of files -in the ZIP archive. +Read and write ZIP files. Modified by Kovid Goyal to support replacing files in +a zip archive. """ -import struct, os, time, sys +import struct, os, time, sys, shutil import binascii, cStringIO try: import zlib # We may need its compression method + crc32 = zlib.crc32 except ImportError: zlib = None + crc32 = binascii.crc32 __all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile" ] @@ -26,31 +28,51 @@ class LargeZipFile(Exception): error = BadZipfile # The exception raised by this module ZIP64_LIMIT= (1 << 31) - 1 +ZIP_FILECOUNT_LIMIT = 1 << 16 +ZIP_MAX_COMMENT = (1 << 16) - 1 # constants for Zip file compression methods ZIP_STORED = 0 ZIP_DEFLATED = 8 # Other ZIP compression methods not supported -# Here are some struct module formats for reading headers -structEndArchive = "<4s4H2LH" # 9 items, end of archive, 22 bytes -stringEndArchive = "PK\005\006" # magic number for end of archive record -structCentralDir = "<4s4B4HlLL5HLL"# 19 items, central directory, 46 bytes -stringCentralDir = "PK\001\002" # magic number for central directory -structFileHeader = "<4s2B4HlLL2H" # 12 items, file header record, 30 bytes -stringFileHeader = "PK\003\004" # magic number for file header -structEndArchive64Locator = "<4slql" # 4 items, locate Zip64 header, 20 bytes -stringEndArchive64Locator = "PK\x06\x07" # magic token for locator header -structEndArchive64 = "<4sqhhllqqqq" # 10 items, end of archive (Zip64), 56 bytes -stringEndArchive64 = "PK\x06\x06" # magic token for Zip64 header +# Below are some formats and associated data for reading/writing headers using +# the struct module. The names and structures of headers/records are those used +# in the PKWARE description of the ZIP file format: +# http://www.pkware.com/documents/casestudies/APPNOTE.TXT +# (URL valid as of January 2008) +# The "end of central directory" structure, magic number, size, and indices +# (section V.I in the format document) +structEndArchive = "<4s4H2LH" +stringEndArchive = "PK\005\006" +sizeEndCentDir = struct.calcsize(structEndArchive) + +_ECD_SIGNATURE = 0 +_ECD_DISK_NUMBER = 1 +_ECD_DISK_START = 2 +_ECD_ENTRIES_THIS_DISK = 3 +_ECD_ENTRIES_TOTAL = 4 +_ECD_SIZE = 5 +_ECD_OFFSET = 6 +_ECD_COMMENT_SIZE = 7 +# These last two indices are not part of the structure as defined in the +# spec, but they are used internally by this module as a convenience +_ECD_COMMENT = 8 +_ECD_LOCATION = 9 + +# The "central directory" structure, magic number, size, and indices +# of entries in the structure (section V.F in the format document) +structCentralDir = "<4s4B4HL2L5H2L" +stringCentralDir = "PK\001\002" +sizeCentralDir = struct.calcsize(structCentralDir) # indexes of entries in the central directory structure _CD_SIGNATURE = 0 _CD_CREATE_VERSION = 1 _CD_CREATE_SYSTEM = 2 _CD_EXTRACT_VERSION = 3 -_CD_EXTRACT_SYSTEM = 4 # is this meaningful? +_CD_EXTRACT_SYSTEM = 4 _CD_FLAG_BITS = 5 _CD_COMPRESS_TYPE = 6 _CD_TIME = 7 @@ -66,10 +88,15 @@ _CD_INTERNAL_FILE_ATTRIBUTES = 16 _CD_EXTERNAL_FILE_ATTRIBUTES = 17 _CD_LOCAL_HEADER_OFFSET = 18 -# indexes of entries in the local file header structure +# The "local file header" structure, magic number, size, and indices +# (section V.A in the format document) +structFileHeader = "<4s2B4HL2L2H" +stringFileHeader = "PK\003\004" +sizeFileHeader = struct.calcsize(structFileHeader) + _FH_SIGNATURE = 0 _FH_EXTRACT_VERSION = 1 -_FH_EXTRACT_SYSTEM = 2 # is this meaningful? +_FH_EXTRACT_SYSTEM = 2 _FH_GENERAL_PURPOSE_FLAG_BITS = 3 _FH_COMPRESSION_METHOD = 4 _FH_LAST_MOD_TIME = 5 @@ -80,6 +107,28 @@ _FH_UNCOMPRESSED_SIZE = 9 _FH_FILENAME_LENGTH = 10 _FH_EXTRA_FIELD_LENGTH = 11 +# The "Zip64 end of central directory locator" structure, magic number, and size +structEndArchive64Locator = "<4sLQL" +stringEndArchive64Locator = "PK\x06\x07" +sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator) + +# The "Zip64 end of central directory" record, magic number, size, and indices +# (section V.G in the format document) +structEndArchive64 = "<4sQ2H2L4Q" +stringEndArchive64 = "PK\x06\x06" +sizeEndCentDir64 = struct.calcsize(structEndArchive64) + +_CD64_SIGNATURE = 0 +_CD64_DIRECTORY_RECSIZE = 1 +_CD64_CREATE_VERSION = 2 +_CD64_EXTRACT_VERSION = 3 +_CD64_DISK_NUMBER = 4 +_CD64_DISK_NUMBER_START = 5 +_CD64_NUMBER_ENTRIES_THIS_DISK = 6 +_CD64_NUMBER_ENTRIES_TOTAL = 7 +_CD64_DIRECTORY_SIZE = 8 +_CD64_OFFSET_START_CENTDIR = 9 + def is_zipfile(filename): """Quickly see if file is a ZIP file by checking the magic number.""" try: @@ -96,9 +145,8 @@ def _EndRecData64(fpin, offset, endrec): """ Read the ZIP64 end-of-archive records and use that to update endrec """ - locatorSize = struct.calcsize(structEndArchive64Locator) - fpin.seek(offset - locatorSize, 2) - data = fpin.read(locatorSize) + fpin.seek(offset - sizeEndCentDir64Locator, 2) + data = fpin.read(sizeEndCentDir64Locator) sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) if sig != stringEndArchive64Locator: return endrec @@ -107,9 +155,8 @@ def _EndRecData64(fpin, offset, endrec): raise BadZipfile("zipfiles that span multiple disks are not supported") # Assume no 'zip64 extensible data' - endArchiveSize = struct.calcsize(structEndArchive64) - fpin.seek(offset - locatorSize - endArchiveSize, 2) - data = fpin.read(endArchiveSize) + fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) + data = fpin.read(sizeEndCentDir64) sig, sz, create_version, read_version, disk_num, disk_dir, \ dircount, dircount2, dirsize, diroffset = \ struct.unpack(structEndArchive64, data) @@ -117,12 +164,12 @@ def _EndRecData64(fpin, offset, endrec): return endrec # Update the original endrec using data from the ZIP64 record - endrec[1] = disk_num - endrec[2] = disk_dir - endrec[3] = dircount - endrec[4] = dircount2 - endrec[5] = dirsize - endrec[6] = diroffset + endrec[_ECD_DISK_NUMBER] = disk_num + endrec[_ECD_DISK_START] = disk_dir + endrec[_ECD_ENTRIES_THIS_DISK] = dircount + endrec[_ECD_ENTRIES_TOTAL] = dircount2 + endrec[_ECD_SIZE] = dirsize + endrec[_ECD_OFFSET] = diroffset return endrec @@ -131,38 +178,59 @@ def _EndRecData(fpin): The data is a list of the nine items in the ZIP "End of central dir" record followed by a tenth item, the file seek offset of this record.""" - fpin.seek(-22, 2) # Assume no archive comment. - filesize = fpin.tell() + 22 # Get file size + + # Determine file size + fpin.seek(0, 2) + filesize = fpin.tell() + + # Check to see if this is ZIP file with no archive comment (the + # "end of central directory" structure should be the last item in the + # file if this is the case). + fpin.seek(-sizeEndCentDir, 2) data = fpin.read() if data[0:4] == stringEndArchive and data[-2:] == "\000\000": + # the signature is correct and there's no comment, unpack structure endrec = struct.unpack(structEndArchive, data) - endrec = list(endrec) - endrec.append("") # Append the archive comment - endrec.append(filesize - 22) # Append the record start offset - if endrec[-4] == -1 or endrec[-4] == 0xffffffff: - return _EndRecData64(fpin, -22, endrec) + endrec=list(endrec) + + # Append a blank comment and record start offset + endrec.append("") + endrec.append(filesize - sizeEndCentDir) + if endrec[_ECD_OFFSET] == 0xffffffff: + # the value for the "offset of the start of the central directory" + # indicates that there is a "Zip64 end of central directory" + # structure present, so go look for it + return _EndRecData64(fpin, -sizeEndCentDir, endrec) + return endrec - # Search the last END_BLOCK bytes of the file for the record signature. - # The comment is appended to the ZIP file and has a 16 bit length. - # So the comment may be up to 64K long. We limit the search for the - # signature to a few Kbytes at the end of the file for efficiency. - # also, the signature must not appear in the comment. - END_BLOCK = min(filesize, 1024 * 4) - fpin.seek(filesize - END_BLOCK, 0) + + # Either this is not a ZIP file, or it is a ZIP file with an archive + # comment. Search the end of the file for the "end of central directory" + # record signature. The comment is the last item in the ZIP file and may be + # up to 64K long. It is assumed that the "end of central directory" magic + # number does not appear in the comment. + maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0) + fpin.seek(maxCommentStart, 0) data = fpin.read() start = data.rfind(stringEndArchive) - if start >= 0: # Correct signature string was found - endrec = struct.unpack(structEndArchive, data[start:start+22]) - endrec = list(endrec) - comment = data[start+22:] - if endrec[7] == len(comment): # Comment length checks out + if start >= 0: + # found the magic number; attempt to unpack and interpret + recData = data[start:start+sizeEndCentDir] + endrec = list(struct.unpack(structEndArchive, recData)) + comment = data[start+sizeEndCentDir:] + # check that comment length is correct + if endrec[_ECD_COMMENT_SIZE] == len(comment): # Append the archive comment and start offset endrec.append(comment) - endrec.append(filesize - END_BLOCK + start) - if endrec[-4] == -1 or endrec[-4] == 0xffffffff: - return _EndRecData64(fpin, - END_BLOCK + start, endrec) + endrec.append(maxCommentStart + start) + if endrec[_ECD_OFFSET] == 0xffffffff: + # There is apparently a "Zip64 end of central directory" + # structure present, so go look for it + return _EndRecData64(fpin, start - filesize, endrec) return endrec - return # Error, return None + + # Unable to find a valid end of central directory structure + return class ZipInfo (object): @@ -187,6 +255,7 @@ class ZipInfo (object): 'CRC', 'compress_size', 'file_size', + '_raw_time', 'file_offset', ) @@ -247,34 +316,50 @@ class ZipInfo (object): if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT: # File is larger than what fits into a 4 byte integer, # fall back to the ZIP64 extension - fmt = '= 24: - counts = unpack('> 1) & 0x7FFFFFFF) ^ poly + else: + crc = ((crc >> 1) & 0x7FFFFFFF) + table[i] = crc + return table + crctable = _GenerateCRCTable() + + def _crc32(self, ch, crc): + """Compute the CRC32 primitive on one byte.""" + return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ord(ch)) & 0xff] + + def __init__(self, pwd): + self.key0 = 305419896 + self.key1 = 591751049 + self.key2 = 878082192 + for p in pwd: + self._UpdateKeys(p) + + def _UpdateKeys(self, c): + self.key0 = self._crc32(c, self.key0) + self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295 + self.key1 = (self.key1 * 134775813 + 1) & 4294967295 + self.key2 = self._crc32(chr((self.key1 >> 24) & 255), self.key2) + + def __call__(self, c): + """Decrypt a single character.""" + c = ord(c) + k = self.key2 | 2 + c = c ^ (((k * (k^1)) >> 8) & 255) + c = chr(c) + self._UpdateKeys(c) + return c + +class ZipExtFile: + """File-like object for reading an archive member. + Is returned by ZipFile.open(). + """ + + def __init__(self, fileobj, zipinfo, decrypt=None): + self.fileobj = fileobj + self.decrypter = decrypt + self.bytes_read = 0L + self.rawbuffer = '' + self.readbuffer = '' + self.linebuffer = '' + self.eof = False + self.univ_newlines = False + self.nlSeps = ("\n", ) + self.lastdiscard = '' + + self.compress_type = zipinfo.compress_type + self.compress_size = zipinfo.compress_size + + self.closed = False + self.mode = "r" + self.name = zipinfo.filename + + # read from compressed files in 64k blocks + self.compreadsize = 64*1024 + if self.compress_type == ZIP_DEFLATED: + self.dc = zlib.decompressobj(-15) + + def set_univ_newlines(self, univ_newlines): + self.univ_newlines = univ_newlines + + # pick line separator char(s) based on universal newlines flag + self.nlSeps = ("\n", ) + if self.univ_newlines: + self.nlSeps = ("\r\n", "\r", "\n") + + def __iter__(self): + return self + + def next(self): + nextline = self.readline() + if not nextline: + raise StopIteration() + + return nextline + + def close(self): + self.closed = True + + def _checkfornewline(self): + nl, nllen = -1, -1 + if self.linebuffer: + # ugly check for cases where half of an \r\n pair was + # read on the last pass, and the \r was discarded. In this + # case we just throw away the \n at the start of the buffer. + if (self.lastdiscard, self.linebuffer[0]) == ('\r','\n'): + self.linebuffer = self.linebuffer[1:] + + for sep in self.nlSeps: + nl = self.linebuffer.find(sep) + if nl >= 0: + nllen = len(sep) + return nl, nllen + + return nl, nllen + + def readline(self, size = -1): + """Read a line with approx. size. If size is negative, + read a whole line. + """ + if size < 0: + size = sys.maxint + elif size == 0: + return '' + + # check for a newline already in buffer + nl, nllen = self._checkfornewline() + + if nl >= 0: + # the next line was already in the buffer + nl = min(nl, size) + else: + # no line break in buffer - try to read more + size -= len(self.linebuffer) + while nl < 0 and size > 0: + buf = self.read(min(size, 100)) + if not buf: + break + self.linebuffer += buf + size -= len(buf) + + # check for a newline in buffer + nl, nllen = self._checkfornewline() + + # we either ran out of bytes in the file, or + # met the specified size limit without finding a newline, + # so return current buffer + if nl < 0: + s = self.linebuffer + self.linebuffer = '' + return s + + buf = self.linebuffer[:nl] + self.lastdiscard = self.linebuffer[nl:nl + nllen] + self.linebuffer = self.linebuffer[nl + nllen:] + + # line is always returned with \n as newline char (except possibly + # for a final incomplete line in the file, which is handled above). + return buf + "\n" + + def readlines(self, sizehint = -1): + """Return a list with all (following) lines. The sizehint parameter + is ignored in this implementation. + """ + result = [] + while True: + line = self.readline() + if not line: break + result.append(line) + return result + + def read(self, size = None): + # act like file() obj and return empty string if size is 0 + if size == 0: + return '' + + # determine read size + bytesToRead = self.compress_size - self.bytes_read + + # adjust read size for encrypted files since the first 12 bytes + # are for the encryption/password information + if self.decrypter is not None: + bytesToRead -= 12 + + if size is not None and size >= 0: + if self.compress_type == ZIP_STORED: + lr = len(self.readbuffer) + bytesToRead = min(bytesToRead, size - lr) + elif self.compress_type == ZIP_DEFLATED: + if len(self.readbuffer) > size: + # the user has requested fewer bytes than we've already + # pulled through the decompressor; don't read any more + bytesToRead = 0 + else: + # user will use up the buffer, so read some more + lr = len(self.rawbuffer) + bytesToRead = min(bytesToRead, self.compreadsize - lr) + + # avoid reading past end of file contents + if bytesToRead + self.bytes_read > self.compress_size: + bytesToRead = self.compress_size - self.bytes_read + + # try to read from file (if necessary) + if bytesToRead > 0: + bytes = self.fileobj.read(bytesToRead) + self.bytes_read += len(bytes) + self.rawbuffer += bytes + + # handle contents of raw buffer + if self.rawbuffer: + newdata = self.rawbuffer + self.rawbuffer = '' + + # decrypt new data if we were given an object to handle that + if newdata and self.decrypter is not None: + newdata = ''.join(map(self.decrypter, newdata)) + + # decompress newly read data if necessary + if newdata and self.compress_type == ZIP_DEFLATED: + newdata = self.dc.decompress(newdata) + self.rawbuffer = self.dc.unconsumed_tail + if self.eof and len(self.rawbuffer) == 0: + # we're out of raw bytes (both from the file and + # the local buffer); flush just to make sure the + # decompressor is done + newdata += self.dc.flush() + # prevent decompressor from being used again + self.dc = None + + self.readbuffer += newdata + + + # return what the user asked for + if size is None or len(self.readbuffer) <= size: + bytes = self.readbuffer + self.readbuffer = '' + else: + bytes = self.readbuffer[:size] + self.readbuffer = self.readbuffer[size:] + + return bytes + + +class ZipFile: + """ Class with methods to open, read, write, close, list and update zip files. + + z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=False) file: Either the path to the file, or a file-like object. If it is a path, the file will be opened and closed by ZipFile. @@ -318,8 +655,9 @@ class ZipFile: def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False): """Open the ZIP file with mode read "r", write "w" or append "a".""" - self._allowZip64 = allowZip64 - self._didModify = False + if mode not in ("r", "w", "a"): + raise RuntimeError('ZipFile() requires mode "r", "w", or "a"') + if compression == ZIP_STORED: pass elif compression == ZIP_DEFLATED: @@ -328,18 +666,30 @@ class ZipFile: "Compression requires the (missing) zlib module" else: raise RuntimeError, "That compression method is not supported" + + self._allowZip64 = allowZip64 + self._didModify = False self.debug = 0 # Level of printing: 0 through 3 self.NameToInfo = {} # Find file info given name self.filelist = [] # List of ZipInfo instances for archive self.compression = compression # Method of compression self.mode = key = mode.replace('b', '')[0] + self.pwd = None + self.comment = '' # Check if we were passed a file-like object if isinstance(file, basestring): self._filePassed = 0 self.filename = file modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'} - self.fp = open(file, modeDict[mode]) + try: + self.fp = open(file, modeDict[mode]) + except IOError: + if mode == 'a': + mode = key = 'w' + self.fp = open(file, modeDict[mode]) + else: + raise else: self._filePassed = 1 self.fp = file @@ -382,18 +732,20 @@ class ZipFile: raise BadZipfile, "File is not a zip file" if self.debug > 1: print endrec - size_cd = endrec[5] # bytes in central directory - offset_cd = endrec[6] # offset of central directory - self.comment = endrec[8] # archive comment - # endrec[9] is the offset of the "End of Central Dir" record - if endrec[9] > ZIP64_LIMIT: - x = endrec[9] - size_cd - 56 - 20 - else: - x = endrec[9] - size_cd + size_cd = endrec[_ECD_SIZE] # bytes in central directory + offset_cd = endrec[_ECD_OFFSET] # offset of central directory + self.comment = endrec[_ECD_COMMENT] # archive comment + # "concat" is zero, unless zip was concatenated to another file - concat = x - offset_cd + concat = endrec[_ECD_LOCATION] - size_cd - offset_cd + if endrec[_ECD_LOCATION] > ZIP64_LIMIT: + # If the offset of the "End of Central Dir" record requires Zip64 + # extension structures, account for them + concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) + if self.debug > 2: - print "given, inferred, offset", offset_cd, x, concat + inferred = concat + offset_cd + print "given, inferred, offset", offset_cd, inferred, concat # self.start_dir: Position of start of central directory self.start_dir = offset_cd + concat fp.seek(self.start_dir, 0) @@ -401,8 +753,7 @@ class ZipFile: fp = cStringIO.StringIO(data) total = 0 while total < size_cd: - centdir = fp.read(46) - total = total + 46 + centdir = fp.read(sizeCentralDir) if centdir[0:4] != stringCentralDir: raise BadZipfile, "Bad magic number for central directory" centdir = struct.unpack(structCentralDir, centdir) @@ -413,100 +764,103 @@ class ZipFile: x = ZipInfo(filename) x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) - total = (total + centdir[_CD_FILENAME_LENGTH] - + centdir[_CD_EXTRA_FIELD_LENGTH] - + centdir[_CD_COMMENT_LENGTH]) x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] (x.create_version, x.create_system, x.extract_version, x.reserved, x.flag_bits, x.compress_type, t, d, x.CRC, x.compress_size, x.file_size) = centdir[1:12] x.volume, x.internal_attr, x.external_attr = centdir[15:18] # Convert date/time code to (year, month, day, hour, min, sec) + x._raw_time = t x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) x._decodeExtra() x.header_offset = x.header_offset + concat + x.filename = x._decodeFilename() self.filelist.append(x) self.NameToInfo[x.filename] = x + + # update total bytes read from central directory + total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] + + centdir[_CD_EXTRA_FIELD_LENGTH] + + centdir[_CD_COMMENT_LENGTH]) + if self.debug > 2: print "total", total - - + def _calculate_file_offsets(self): - for zip_info in self.filelist: - self.fp.seek(zip_info.header_offset, 0) - fheader = self.fp.read(30) - if fheader[0:4] != stringFileHeader: - raise BadZipfile, "Bad magic number for file header" - fheader = struct.unpack(structFileHeader, fheader) - # file_offset is computed here, since the extra field for - # the central directory and for the local file header - # refer to different fields, and they can have different - # lengths - file_offset = (zip_info.header_offset + 30 - + fheader[_FH_FILENAME_LENGTH] - + fheader[_FH_EXTRA_FIELD_LENGTH]) - fname = self.fp.read(fheader[_FH_FILENAME_LENGTH]) - if fname != zip_info.orig_filename: - raise RuntimeError, \ - 'File name in directory "%s" and header "%s" differ.' % ( - zip_info.orig_filename, fname) - + for zip_info in self.filelist: + self.fp.seek(zip_info.header_offset, 0) + fheader = self.fp.read(30) + if fheader[0:4] != stringFileHeader: + raise BadZipfile, "Bad magic number for file header" + fheader = struct.unpack(structFileHeader, fheader) + # file_offset is computed here, since the extra field for + # the central directory and for the local file header + # refer to different fields, and they can have different + # lengths + file_offset = (zip_info.header_offset + 30 + + fheader[_FH_FILENAME_LENGTH] + + fheader[_FH_EXTRA_FIELD_LENGTH]) + fname = self.fp.read(fheader[_FH_FILENAME_LENGTH]) + if fname != zip_info.orig_filename: + raise RuntimeError( + 'File name in directory "%s" and header "%s" differ.' % ( + zip_info.orig_filename, fname)) + zip_info.file_offset = file_offset + + def replace(self, filename, arcname=None, compress_type=None): + """Delete arcname, and put the bytes from filename into the + archive under the name arcname.""" + deleteName = arcname + if deleteName is None: + deleteName = filename + self.delete(deleteName) + self.write(filename, arcname, compress_type) + + def replacestr(self, zinfo, bytes): + """Delete zinfo.filename, and write a new file into the archive. The + contents is the string 'bytes'.""" + self.delete(zinfo.filename) + self.writestr(zinfo, bytes) - - def replace(self, filename, arcname=None, compress_type=None): - """Delete arcname, and put the bytes from filename into the - archive under the name arcname.""" - deleteName = arcname - if deleteName is None: - deleteName = filename - self.delete(deleteName) - self.write(filename, arcname, compress_type) - - def replacestr(self, zinfo, bytes): - """Delete zinfo.filename, and write a new file into the archive. The - contents is the string 'bytes'.""" - self.delete(zinfo.filename) - self.writestr(zinfo, bytes) - - def delete(self, name): - """Delete the file from the archive. If it appears multiple - times only the first instance will be deleted.""" - for i in range (0, len(self.filelist)): - if self.filelist[i].filename == name: - if self.debug: - print "Removing", name - deleted_offset = self.filelist[i].header_offset + def delete(self, name): + """Delete the file from the archive. If it appears multiple + times only the first instance will be deleted.""" + for i in range (0, len(self.filelist)): + if self.filelist[i].filename == name: + if self.debug: + print "Removing", name + deleted_offset = self.filelist[i].header_offset deleted_size = (self.filelist[i].file_offset - self.filelist[i].header_offset) + self.filelist[i].compress_size - zinfo_size = struct.calcsize(structCentralDir) + len(self.filelist[i].filename) + len(self.filelist[i].extra) - # Remove the file's data from the archive. - current_offset = self.fp.tell() - self.fp.seek(0, 2) - archive_size = self.fp.tell() - self.fp.seek(deleted_offset + deleted_size) - buf = self.fp.read() - self.fp.seek(deleted_offset) - self.fp.write(buf) - self.fp.truncate(archive_size - deleted_size - zinfo_size) - if current_offset > deleted_offset + deleted_size: - current_offset -= deleted_size - elif current_offset > deleted_offset: - current_offset = deleted_offset - self.fp.seek(current_offset, 0) - # Remove file from central directory. - del self.filelist[i] - # Adjust the remaining offsets in the central directory. - for j in range (i, len(self.filelist)): - if self.filelist[j].header_offset > deleted_offset: - self.filelist[j].header_offset -= deleted_size - if self.filelist[j].file_offset > deleted_offset: - self.filelist[j].file_offset -= deleted_size - return - if self.debug: - print name, "not in archive" - + zinfo_size = struct.calcsize(structCentralDir) + len(self.filelist[i].filename) + len(self.filelist[i].extra) + # Remove the file's data from the archive. + current_offset = self.fp.tell() + self.fp.seek(0, 2) + archive_size = self.fp.tell() + self.fp.seek(deleted_offset + deleted_size) + buf = self.fp.read() + self.fp.seek(deleted_offset) + self.fp.write(buf) + self.fp.truncate(archive_size - deleted_size - zinfo_size) + if current_offset > deleted_offset + deleted_size: + current_offset -= deleted_size + elif current_offset > deleted_offset: + current_offset = deleted_offset + self.fp.seek(current_offset, 0) + # Remove file from central directory. + del self.filelist[i] + # Adjust the remaining offsets in the central directory. + for j in range (i, len(self.filelist)): + if self.filelist[j].header_offset > deleted_offset: + self.filelist[j].header_offset -= deleted_size + if self.filelist[j].file_offset > deleted_offset: + self.filelist[j].file_offset -= deleted_size + return + if self.debug: + print name, "not in archive" + def namelist(self): """Return a list of file names in the archive.""" l = [] @@ -534,61 +888,156 @@ class ZipFile: except BadZipfile: return zinfo.filename - def getinfo(self, name): """Return the instance of ZipInfo given 'name'.""" - return self.NameToInfo[name] + info = self.NameToInfo.get(name) + if info is None: + raise KeyError( + 'There is no item named %r in the archive' % name) - def read(self, name): + return info + + def setpassword(self, pwd): + """Set default password for encrypted files.""" + self.pwd = pwd + + def read(self, name, pwd=None): """Return file bytes (as a string) for name.""" - if self.mode not in ("r", "a"): - raise RuntimeError, 'read() requires mode "r" or "a"' + return self.open(name, "r", pwd).read() + + def open(self, name, mode="r", pwd=None): + """Return file-like object for 'name'.""" + if mode not in ("r", "U", "rU"): + raise RuntimeError, 'open() requires mode "r", "U", or "rU"' if not self.fp: raise RuntimeError, \ "Attempt to read ZIP archive that was already closed" - zinfo = self.getinfo(name) - filepos = self.fp.tell() - self.fp.seek(zinfo.header_offset, 0) + # Only open a new file for instances where we were not + # given a file object in the constructor + if self._filePassed: + zef_file = self.fp + else: + zef_file = open(self.filename, 'rb') + + # Make sure we have an info object + if isinstance(name, ZipInfo): + # 'name' is already an info object + zinfo = name + else: + # Get info object for name + zinfo = self.getinfo(name) + + zef_file.seek(zinfo.header_offset, 0) # Skip the file header: - fheader = self.fp.read(30) + fheader = zef_file.read(sizeFileHeader) if fheader[0:4] != stringFileHeader: raise BadZipfile, "Bad magic number for file header" fheader = struct.unpack(structFileHeader, fheader) - fname = self.fp.read(fheader[_FH_FILENAME_LENGTH]) + fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) if fheader[_FH_EXTRA_FIELD_LENGTH]: - self.fp.read(fheader[_FH_EXTRA_FIELD_LENGTH]) + zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) if fname != zinfo.orig_filename: raise BadZipfile, \ 'File name in directory "%s" and header "%s" differ.' % ( zinfo.orig_filename, fname) - bytes = self.fp.read(zinfo.compress_size) - self.fp.seek(filepos, 0) - if zinfo.compress_type == ZIP_STORED: - pass - elif zinfo.compress_type == ZIP_DEFLATED: - if not zlib: - raise RuntimeError, \ - "De-compression requires the (missing) zlib module" - # zlib compress/decompress code by Jeremy Hylton of CNRI - dc = zlib.decompressobj(-15) - bytes = dc.decompress(bytes) - # need to feed in unused pad byte so that zlib won't choke - ex = dc.decompress('Z') + dc.flush() - if ex: - bytes = bytes + ex + # check for encrypted flag & handle password + is_encrypted = zinfo.flag_bits & 0x1 + zd = None + if is_encrypted: + if not pwd: + pwd = self.pwd + if not pwd: + raise RuntimeError, "File %s is encrypted, " \ + "password required for extraction" % name + + zd = _ZipDecrypter(pwd) + # The first 12 bytes in the cypher stream is an encryption header + # used to strengthen the algorithm. The first 11 bytes are + # completely random, while the 12th contains the MSB of the CRC, + # or the MSB of the file time depending on the header type + # and is used to check the correctness of the password. + bytes = zef_file.read(12) + h = map(zd, bytes[0:12]) + if zinfo.flag_bits & 0x8: + # compare against the file type from extended local headers + check_byte = (zinfo._raw_time >> 8) & 0xff + else: + # compare against the CRC otherwise + check_byte = (zinfo.CRC >> 24) & 0xff + if ord(h[11]) != check_byte: + raise RuntimeError("Bad password for file", name) + + # build and return a ZipExtFile + if zd is None: + zef = ZipExtFile(zef_file, zinfo) else: - raise BadZipfile, \ - "Unsupported compression method %d for file %s" % \ - (zinfo.compress_type, name) - crc = binascii.crc32(bytes) - if crc != zinfo.CRC: - raise BadZipfile, "Bad CRC-32 for file %s" % name - return bytes + zef = ZipExtFile(zef_file, zinfo, zd) + + # set universal newlines on ZipExtFile if necessary + if "U" in mode: + zef.set_univ_newlines(True) + return zef + + def extract(self, member, path=None, pwd=None): + """Extract a member from the archive to the current working directory, + using its full name. Its file information is extracted as accurately + as possible. `member' may be a filename or a ZipInfo object. You can + specify a different directory using `path'. + """ + if not isinstance(member, ZipInfo): + member = self.getinfo(member) + + if path is None: + path = os.getcwd() + + return self._extract_member(member, path, pwd) + + def extractall(self, path=None, members=None, pwd=None): + """Extract all members from the archive to the current working + directory. `path' specifies a different directory to extract to. + `members' is optional and must be a subset of the list returned + by namelist(). + """ + if members is None: + members = self.namelist() + + for zipinfo in members: + self.extract(zipinfo, path, pwd) + + def _extract_member(self, member, targetpath, pwd): + """Extract the ZipInfo object 'member' to a physical + file on the path targetpath. + """ + # build the destination pathname, replacing + # forward slashes to platform specific separators. + if targetpath[-1:] == "/": + targetpath = targetpath[:-1] + + # don't include leading "/" from file name if present + if os.path.isabs(member.filename): + targetpath = os.path.join(targetpath, member.filename[1:]) + else: + targetpath = os.path.join(targetpath, member.filename) + + targetpath = os.path.normpath(targetpath) + + # Create all upper directories if necessary. + upperdirs = os.path.dirname(targetpath) + if upperdirs and not os.path.exists(upperdirs): + os.makedirs(upperdirs) + + source = self.open(member, pwd=pwd) + target = file(targetpath, "wb") + shutil.copyfileobj(source, target) + source.close() + target.close() + + return targetpath def _writecheck(self, zinfo): """Check for errors before writing a file to the archive.""" @@ -616,6 +1065,10 @@ class ZipFile: def write(self, filename, arcname=None, compress_type=None): """Put the bytes from filename into the archive under the name arcname.""" + if not self.fp: + raise RuntimeError( + "Attempt to write to ZIP archive that was already closed") + st = os.stat(filename) mtime = time.localtime(st.st_mtime) date_time = mtime[0:6] @@ -654,7 +1107,7 @@ class ZipFile: if not buf: break file_size = file_size + len(buf) - CRC = binascii.crc32(buf, CRC) + CRC = crc32(buf, CRC) & 0xffffffff if cmpr: buf = cmpr.compress(buf) compress_size = compress_size + len(buf) @@ -672,7 +1125,7 @@ class ZipFile: # Seek backwards and write CRC and file sizes position = self.fp.tell() # Preserve current position in file self.fp.seek(zinfo.header_offset + 14, 0) - self.fp.write(struct.pack(" ZIP64_LIMIT: extra.append(zinfo.file_size) extra.append(zinfo.compress_size) - file_size = 0xffffffff #-1 - compress_size = 0xffffffff #-1 + file_size = 0xffffffff + compress_size = 0xffffffff else: file_size = zinfo.file_size compress_size = zinfo.compress_size if zinfo.header_offset > ZIP64_LIMIT: extra.append(zinfo.header_offset) - header_offset = -1 # struct "l" format: 32 one bits + header_offset = 0xffffffffL else: header_offset = zinfo.header_offset @@ -750,7 +1209,7 @@ class ZipFile: if extra: # Append a ZIP64 field to the extra's extra_data = struct.pack( - '>sys.stderr, (structCentralDir, + stringCentralDir, create_version, + zinfo.create_system, extract_version, zinfo.reserved, + zinfo.flag_bits, zinfo.compress_type, dostime, dosdate, + zinfo.CRC, compress_size, file_size, + len(zinfo.filename), len(extra_data), len(zinfo.comment), + 0, zinfo.internal_attr, zinfo.external_attr, + header_offset) + raise self.fp.write(centdir) - self.fp.write(zinfo.filename) + self.fp.write(filename) self.fp.write(extra_data) self.fp.write(zinfo.comment) pos2 = self.fp.tell() # Write end-of-zip-archive record + centDirOffset = pos1 if pos1 > ZIP64_LIMIT: # Need to write the ZIP64 end-of-archive records zip64endrec = struct.pack( @@ -785,16 +1257,24 @@ class ZipFile: structEndArchive64Locator, stringEndArchive64Locator, 0, pos2, 1) self.fp.write(zip64locrec) + centDirOffset = 0xFFFFFFFF - endrec = struct.pack(structEndArchive, stringEndArchive, - 0, 0, count, count, pos2 - pos1, -1, 0) - self.fp.write(endrec) + # check for valid comment length + if len(self.comment) >= ZIP_MAX_COMMENT: + if self.debug > 0: + msg = 'Archive comment is too long; truncating to %d bytes' \ + % ZIP_MAX_COMMENT + print msg + self.comment = self.comment[:ZIP_MAX_COMMENT] - else: - endrec = struct.pack(structEndArchive, stringEndArchive, - 0, 0, count, count, pos2 - pos1, pos1, 0) - self.fp.write(endrec) + endrec = struct.pack(structEndArchive, stringEndArchive, + 0, 0, count % ZIP_FILECOUNT_LIMIT, + count % ZIP_FILECOUNT_LIMIT, pos2 - pos1, + centDirOffset, len(self.comment)) + self.fp.write(endrec) + self.fp.write(self.comment) self.fp.flush() + if not self._filePassed: self.fp.close() self.fp = None