From 9f570abd9b039dffe043904a7c6bbd710f398c6c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 7 Nov 2012 17:23:03 +0530 Subject: [PATCH] EPUB Input: Try to workaround EPUBs that have missing or damaged ZIP central directories. calibre should now be able to read/convert such an EPUB file, provided it does not suffer from further corruption. --- .../ebooks/conversion/plugins/epub_input.py | 11 +- src/calibre/ebooks/metadata/epub.py | 39 ++- src/calibre/utils/localunzip.py | 267 ++++++++++++++++++ 3 files changed, 307 insertions(+), 10 deletions(-) create mode 100644 src/calibre/utils/localunzip.py diff --git a/src/calibre/ebooks/conversion/plugins/epub_input.py b/src/calibre/ebooks/conversion/plugins/epub_input.py index f0af2d28c5..70a561226d 100644 --- a/src/calibre/ebooks/conversion/plugins/epub_input.py +++ b/src/calibre/ebooks/conversion/plugins/epub_input.py @@ -150,8 +150,15 @@ class EPUBInput(InputFormatPlugin): from calibre import walk from calibre.ebooks import DRMError from calibre.ebooks.metadata.opf2 import OPF - zf = ZipFile(stream) - zf.extractall(os.getcwdu()) + try: + zf = ZipFile(stream) + zf.extractall(os.getcwdu()) + except: + log.exception('EPUB appears to be invalid ZIP file, trying a' + ' more forgiving ZIP parser') + from calibre.utils.localunzip import extractall + stream.seek(0) + extractall(stream) encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) opf = self.find_opf() if opf is None: diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py index c62f265633..bc81df5a79 100644 --- a/src/calibre/ebooks/metadata/epub.py +++ b/src/calibre/ebooks/metadata/epub.py @@ -10,6 +10,7 @@ from cStringIO import StringIO from contextlib import closing from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace +from calibre.utils.localunzip import LocalZipFile from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPF @@ -105,10 +106,13 @@ class OCFReader(OCF): class OCFZipReader(OCFReader): def __init__(self, stream, mode='r', root=None): - try: - self.archive = ZipFile(stream, mode=mode) - except BadZipfile: - raise EPubException("not a ZIP .epub OCF container") + if isinstance(stream, (LocalZipFile, ZipFile)): + self.archive = stream + else: + try: + self.archive = ZipFile(stream, mode=mode) + except BadZipfile: + raise EPubException("not a ZIP .epub OCF container") self.root = root if self.root is None: name = getattr(stream, 'name', False) @@ -119,8 +123,18 @@ class OCFZipReader(OCFReader): super(OCFZipReader, self).__init__() def open(self, name, mode='r'): + if isinstance(self.archive, LocalZipFile): + return self.archive.open(name) return StringIO(self.archive.read(name)) +def get_zip_reader(stream, root=None): + try: + zf = ZipFile(stream, mode='r') + except: + stream.seek(0) + zf = LocalZipFile(stream) + return OCFZipReader(zf, root=root) + class OCFDirReader(OCFReader): def __init__(self, path): self.root = path @@ -184,7 +198,12 @@ def render_cover(opf, opf_path, zf, reader=None): def get_cover(opf, opf_path, stream, reader=None): raster_cover = opf.raster_cover stream.seek(0) - zf = ZipFile(stream) + try: + zf = ZipFile(stream) + except: + stream.seek(0) + zf = LocalZipFile(stream) + if raster_cover: base = posixpath.dirname(opf_path) cpath = posixpath.normpath(posixpath.join(base, raster_cover)) @@ -207,7 +226,7 @@ def get_cover(opf, opf_path, stream, reader=None): def get_metadata(stream, extract_cover=True): """ Return metadata as a :class:`Metadata` object """ stream.seek(0) - reader = OCFZipReader(stream) + reader = get_zip_reader(stream) mi = reader.opf.to_book_metadata() if extract_cover: try: @@ -232,7 +251,7 @@ def _write_new_cover(new_cdata, cpath): def set_metadata(stream, mi, apply_null=False, update_timestamp=False): stream.seek(0) - reader = OCFZipReader(stream, root=os.getcwdu()) + reader = get_zip_reader(stream, root=os.getcwdu()) raster_cover = reader.opf.raster_cover mi = MetaInformation(mi) new_cdata = None @@ -283,7 +302,11 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False): reader.opf.timestamp = mi.timestamp newopf = StringIO(reader.opf.render()) - safe_replace(stream, reader.container[OPF.MIMETYPE], newopf, + if isinstance(reader.archive, LocalZipFile): + reader.archive.safe_replace(reader.container[OPF.MIMETYPE], newopf, + extra_replacements=replacements) + else: + safe_replace(stream, reader.container[OPF.MIMETYPE], newopf, extra_replacements=replacements) try: if cpath is not None: diff --git a/src/calibre/utils/localunzip.py b/src/calibre/utils/localunzip.py new file mode 100644 index 0000000000..48c51b7af6 --- /dev/null +++ b/src/calibre/utils/localunzip.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +''' +Try to read invalid zip files with missing or damaged central directories. +These are apparently produced in large numbers by the fruitcakes over at B&N. + +Tries to only use the local headers to extract data from the damaged zip file. +''' + +import os, sys, zlib, shutil +from struct import calcsize, unpack, pack +from collections import namedtuple, OrderedDict +from tempfile import SpooledTemporaryFile + +HEADER_SIG = 0x04034b50 +HEADER_BYTE_SIG = pack(b' 20: + raise ValueError('This ZIP file uses unsupported features') + if header.flags & 0b1: + raise ValueError('This ZIP file is encrypted') + if header.flags & (1 << 3): + raise ValueError('This ZIP file uses data descriptors. This is unsupported') + if header.flags & (1 << 13): + raise ValueError('This ZIP file uses masking, unsupported.') + if header.compression_method not in {ZIP_STORED, ZIP_DEFLATED}: + raise ValueError('This ZIP file uses an unsupported compression method') + fname = extra = None + if header.filename_length > 0: + fname = f.read(header.filename_length) + if len(fname) != header.filename_length: + return + try: + fname = fname.decode('ascii') + except UnicodeDecodeError: + if header.flags & (1 << 11): + try: + fname = fname.decode('utf-8') + except UnicodeDecodeError: + pass + fname = decode_arcname(fname).replace('\\', '/') + if header.extra_length > 0: + extra = f.read(header.extra_length) + if len(extra) != header.extra_length: + return + return LocalHeader(*( + header[:-2] + (fname, extra) + )) + +def read_compressed_data(f, header): + cdata = f.read(header.compressed_size) + return cdata + +def copy_stored_file(src, size, dest): + read = 0 + amt = min(size, 20*1024) + while read < size: + raw = src.read(min(size-read, amt)) + if not raw: + raise ValueError('Premature end of file') + dest.write(raw) + read += len(raw) + +def copy_compressed_file(src, size, dest): + d = zlib.decompressobj(-15) + read = 0 + amt = min(size, 20*1024) + while read < size: + raw = src.read(min(size-read, amt)) + read += len(raw) + dest.write(d.decompress(raw, 200*1024)) + count = 0 + while d.unconsumed_tail: + count += 1 + dest.write(d.decompress(d.unconsumed_tail, 200*1024)) + + if count > 100: + raise ValueError('This ZIP file contains a ZIP bomb in %s'% + os.path.basename(dest.name)) + +def _extractall(f, path=None, file_info=None): + found = False + while True: + header = read_local_file_header(f) + if not header: + break + found = True + parts = header.filename.split('/') + if header.uncompressed_size == 0: + # Directory + f.seek(f.tell() + header.compressed_size) + if path is not None: + bdir = os.path.join(path, *parts) + if not os.path.exists(bdir): + os.makedirs(bdir) + continue + + # File + if file_info is not None: + file_info[header.filename] = (f.tell(), header) + if path is not None: + bdir = os.path.join(path, *(parts[:-1])) + if not os.path.exists(bdir): + os.makedirs(bdir) + dest = os.path.join(path, *parts) + with open(dest, 'wb') as o: + if header.compression_method == ZIP_STORED: + copy_stored_file(f, header.compressed_size, o) + else: + copy_compressed_file(f, header.compressed_size, o) + else: + f.seek(f.tell() + header.compressed_size) + + if not found: + raise ValueError('Not a ZIP file') + + +def extractall(path_or_stream, path=None): + f = path_or_stream + close_at_end = False + if not hasattr(f, 'read'): + f = open(f, 'rb') + close_at_end = True + if path is None: + path = os.getcwdu() + pos = f.tell() + try: + _extractall(f, path) + finally: + f.seek(pos) + if close_at_end: + f.close() + + +class LocalZipFile(object): + + def __init__(self, stream): + self.file_info = OrderedDict() + _extractall(stream, file_info=self.file_info) + self.stream = stream + + def open(self, name, spool_size=5*1024*1024): + if isinstance(name, LocalHeader): + name = name.filename + try: + offset, header = self.file_info.get(name) + except KeyError: + raise ValueError('This ZIP container has no file named: %s'%name) + + self.stream.seek(offset) + dest = SpooledTemporaryFile(max_size=spool_size) + + if header.compression_method == ZIP_STORED: + copy_stored_file(self.stream, header.compressed_size, dest) + else: + copy_compressed_file(self.stream, header.compressed_size, dest) + dest.seek(0) + return dest + + def getinfo(self, name): + try: + offset, header = self.file_info.get(name) + except KeyError: + raise ValueError('This ZIP container has no file named: %s'%name) + return header + + def read(self, name, spool_size=5*1024*1024): + with self.open(name, spool_size=spool_size) as f: + return f.read() + + def extractall(self, path=None): + self.stream.seek(0) + _extractall(self.stream, path=(path or os.getcwdu())) + + def close(self): + pass + + def safe_replace(self, name, datastream, extra_replacements={}, + add_missing=False): + from calibre.utils.zipfile import ZipFile, ZipInfo + replacements = {name:datastream} + replacements.update(extra_replacements) + names = frozenset(replacements.keys()) + found = set([]) + with SpooledTemporaryFile(max_size=100*1024*1024) as temp: + ztemp = ZipFile(temp, 'w') + for offset, header in self.file_info.itervalues(): + if header.filename in names: + zi = ZipInfo(header.filename) + zi.compress_type = header.compression_method + ztemp.writestr(zi, replacements[header.filename].read()) + found.add(header.filename) + else: + ztemp.writestr(header.filename, self.read(header.filename, + spool_size=0)) + if add_missing: + for name in names - found: + ztemp.writestr(name, replacements[name].read()) + ztemp.close() + zipstream = self.stream + temp.seek(0) + zipstream.seek(0) + zipstream.truncate() + shutil.copyfileobj(temp, zipstream) + zipstream.flush() + +if __name__ == '__main__': + extractall(sys.argv[-1]) +