EPUB Input: Try to work around EPUBs that have missing or damaged ZIP central directories. calibre should now be able to read/convert such an EPUB file, provided it does not suffer from further corruption.

Kovid Goyal 2012-11-07 17:23:03 +05:30
parent f397686804
commit 9f570abd9b
3 changed files with 307 additions and 10 deletions
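
The changed call sites all follow the same pattern: try the strict ZIP parser first, and only on failure re-read the stream with the new local-header based parser. A minimal sketch of that pattern (extract_epub, stream and dest_dir are placeholder names, not part of this commit):

from calibre.utils.zipfile import ZipFile

def extract_epub(stream, dest_dir):
    try:
        # Normal path: the central directory is intact
        ZipFile(stream).extractall(dest_dir)
    except Exception:
        # Missing/damaged central directory: re-read the stream using
        # only the local file headers (calibre.utils.localunzip)
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream, dest_dir)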

src/calibre/ebooks/conversion/plugins/epub_input.py

@@ -150,8 +150,15 @@ class EPUBInput(InputFormatPlugin):
         from calibre import walk
         from calibre.ebooks import DRMError
         from calibre.ebooks.metadata.opf2 import OPF
-        zf = ZipFile(stream)
-        zf.extractall(os.getcwdu())
+        try:
+            zf = ZipFile(stream)
+            zf.extractall(os.getcwdu())
+        except:
+            log.exception('EPUB appears to be invalid ZIP file, trying a'
+                    ' more forgiving ZIP parser')
+            from calibre.utils.localunzip import extractall
+            stream.seek(0)
+            extractall(stream)
         encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
         opf = self.find_opf()
         if opf is None:

src/calibre/ebooks/metadata/epub.py

@@ -10,6 +10,7 @@ from cStringIO import StringIO
 from contextlib import closing
 
 from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
+from calibre.utils.localunzip import LocalZipFile
 from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.opf2 import OPF
@@ -105,10 +106,13 @@ class OCFReader(OCF):
 
 class OCFZipReader(OCFReader):
     def __init__(self, stream, mode='r', root=None):
-        try:
-            self.archive = ZipFile(stream, mode=mode)
-        except BadZipfile:
-            raise EPubException("not a ZIP .epub OCF container")
+        if isinstance(stream, (LocalZipFile, ZipFile)):
+            self.archive = stream
+        else:
+            try:
+                self.archive = ZipFile(stream, mode=mode)
+            except BadZipfile:
+                raise EPubException("not a ZIP .epub OCF container")
         self.root = root
         if self.root is None:
             name = getattr(stream, 'name', False)
@@ -119,8 +123,18 @@ class OCFZipReader(OCFReader):
         super(OCFZipReader, self).__init__()
 
     def open(self, name, mode='r'):
+        if isinstance(self.archive, LocalZipFile):
+            return self.archive.open(name)
         return StringIO(self.archive.read(name))
 
+def get_zip_reader(stream, root=None):
+    try:
+        zf = ZipFile(stream, mode='r')
+    except:
+        stream.seek(0)
+        zf = LocalZipFile(stream)
+    return OCFZipReader(zf, root=root)
+
 class OCFDirReader(OCFReader):
     def __init__(self, path):
         self.root = path
@@ -184,7 +198,12 @@ def render_cover(opf, opf_path, zf, reader=None):
 def get_cover(opf, opf_path, stream, reader=None):
     raster_cover = opf.raster_cover
     stream.seek(0)
-    zf = ZipFile(stream)
+    try:
+        zf = ZipFile(stream)
+    except:
+        stream.seek(0)
+        zf = LocalZipFile(stream)
     if raster_cover:
         base = posixpath.dirname(opf_path)
         cpath = posixpath.normpath(posixpath.join(base, raster_cover))
@@ -207,7 +226,7 @@ def get_cover(opf, opf_path, stream, reader=None):
 def get_metadata(stream, extract_cover=True):
     """ Return metadata as a :class:`Metadata` object """
     stream.seek(0)
-    reader = OCFZipReader(stream)
+    reader = get_zip_reader(stream)
     mi = reader.opf.to_book_metadata()
     if extract_cover:
         try:
@@ -232,7 +251,7 @@ def _write_new_cover(new_cdata, cpath):
 
 def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
     stream.seek(0)
-    reader = OCFZipReader(stream, root=os.getcwdu())
+    reader = get_zip_reader(stream, root=os.getcwdu())
     raster_cover = reader.opf.raster_cover
     mi = MetaInformation(mi)
     new_cdata = None
@@ -283,7 +302,11 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
         reader.opf.timestamp = mi.timestamp
 
     newopf = StringIO(reader.opf.render())
-    safe_replace(stream, reader.container[OPF.MIMETYPE], newopf,
-            extra_replacements=replacements)
+    if isinstance(reader.archive, LocalZipFile):
+        reader.archive.safe_replace(reader.container[OPF.MIMETYPE], newopf,
+                extra_replacements=replacements)
+    else:
+        safe_replace(stream, reader.container[OPF.MIMETYPE], newopf,
+                extra_replacements=replacements)
     try:
         if cpath is not None:

src/calibre/utils/localunzip.py (new file)

@@ -0,0 +1,267 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
Try to read invalid zip files with missing or damaged central directories.
These are apparently produced in large numbers by the fruitcakes over at B&N.

Tries to only use the local headers to extract data from the damaged zip file.
'''

import os, sys, zlib, shutil
from struct import calcsize, unpack, pack
from collections import namedtuple, OrderedDict
from tempfile import SpooledTemporaryFile

HEADER_SIG = 0x04034b50  # Local file header signature, b'PK\x03\x04'
HEADER_BYTE_SIG = pack(b'<L', HEADER_SIG)
local_header_fmt = b'<L5HL2L2H'
local_header_sz = calcsize(local_header_fmt)
ZIP_STORED, ZIP_DEFLATED = 0, 8

LocalHeader = namedtuple('LocalHeader',
        'signature min_version flags compression_method mod_time mod_date '
        'crc32 compressed_size uncompressed_size filename_length extra_length '
        'filename extra')

def decode_arcname(name):
    # Decode an archive member name, guessing the encoding if it is not UTF-8
    if isinstance(name, bytes):
        from calibre.ebooks.chardet import detect
        try:
            name = name.decode('utf-8')
        except:
            res = detect(name)
            encoding = res['encoding']
            try:
                name = name.decode(encoding)
            except:
                name = name.decode('utf-8', 'replace')
    return name

def find_local_header(f):
    # Scan forward (up to 50KB) for the next local file header signature
    pos = f.tell()
    raw = f.read(50*1024)
    try:
        f.seek(pos + raw.index(HEADER_BYTE_SIG))
    except ValueError:
        f.seek(pos)
        return
    raw = f.read(local_header_sz)
    if len(raw) != local_header_sz:
        f.seek(pos)
        return
    header = LocalHeader(*(unpack(local_header_fmt, raw) + (None, None)))
    if header.signature == HEADER_SIG:
        return header
    f.seek(pos)

def read_local_file_header(f):
    pos = f.tell()
    raw = f.read(local_header_sz)
    if len(raw) != local_header_sz:
        f.seek(pos)
        return
    header = LocalHeader(*(unpack(local_header_fmt, raw) + (None, None)))
    if header.signature != HEADER_SIG:
        # Not positioned at a header, scan for the next one
        f.seek(pos)
        header = find_local_header(f)
        if header is None:
            return
    if header.min_version > 20:
        raise ValueError('This ZIP file uses unsupported features')
    if header.flags & 0b1:
        raise ValueError('This ZIP file is encrypted')
    if header.flags & (1 << 3):
        raise ValueError('This ZIP file uses data descriptors. This is unsupported')
    if header.flags & (1 << 13):
        raise ValueError('This ZIP file uses masking, unsupported.')
    if header.compression_method not in {ZIP_STORED, ZIP_DEFLATED}:
        raise ValueError('This ZIP file uses an unsupported compression method')

    fname = extra = None
    if header.filename_length > 0:
        fname = f.read(header.filename_length)
        if len(fname) != header.filename_length:
            return
        try:
            fname = fname.decode('ascii')
        except UnicodeDecodeError:
            if header.flags & (1 << 11):
                try:
                    fname = fname.decode('utf-8')
                except UnicodeDecodeError:
                    pass
        fname = decode_arcname(fname).replace('\\', '/')

    if header.extra_length > 0:
        extra = f.read(header.extra_length)
        if len(extra) != header.extra_length:
            return

    return LocalHeader(*(
        header[:-2] + (fname, extra)
        ))

def read_compressed_data(f, header):
    cdata = f.read(header.compressed_size)
    return cdata

def copy_stored_file(src, size, dest):
    read = 0
    amt = min(size, 20*1024)
    while read < size:
        raw = src.read(min(size-read, amt))
        if not raw:
            raise ValueError('Premature end of file')
        dest.write(raw)
        read += len(raw)

def copy_compressed_file(src, size, dest):
    # Inflate a raw deflate stream in bounded chunks
    d = zlib.decompressobj(-15)
    read = 0
    amt = min(size, 20*1024)
    while read < size:
        raw = src.read(min(size-read, amt))
        read += len(raw)
        dest.write(d.decompress(raw, 200*1024))
        count = 0
        while d.unconsumed_tail:
            count += 1
            dest.write(d.decompress(d.unconsumed_tail, 200*1024))
            if count > 100:
                raise ValueError('This ZIP file contains a ZIP bomb in %s'%
                        os.path.basename(dest.name))

def _extractall(f, path=None, file_info=None):
    found = False
    while True:
        header = read_local_file_header(f)
        if not header:
            break
        found = True
        parts = header.filename.split('/')
        if header.uncompressed_size == 0:
            # Directory
            f.seek(f.tell() + header.compressed_size)
            if path is not None:
                bdir = os.path.join(path, *parts)
                if not os.path.exists(bdir):
                    os.makedirs(bdir)
            continue

        # File
        if file_info is not None:
            file_info[header.filename] = (f.tell(), header)
        if path is not None:
            bdir = os.path.join(path, *(parts[:-1]))
            if not os.path.exists(bdir):
                os.makedirs(bdir)
            dest = os.path.join(path, *parts)
            with open(dest, 'wb') as o:
                if header.compression_method == ZIP_STORED:
                    copy_stored_file(f, header.compressed_size, o)
                else:
                    copy_compressed_file(f, header.compressed_size, o)
        else:
            f.seek(f.tell() + header.compressed_size)

    if not found:
        raise ValueError('Not a ZIP file')

def extractall(path_or_stream, path=None):
    f = path_or_stream
    close_at_end = False
    if not hasattr(f, 'read'):
        f = open(f, 'rb')
        close_at_end = True
    if path is None:
        path = os.getcwdu()
    pos = f.tell()
    try:
        _extractall(f, path)
    finally:
        f.seek(pos)
        if close_at_end:
            f.close()

class LocalZipFile(object):
    # Minimal read (and safe_replace) interface over a damaged ZIP, driven
    # entirely by the local file headers indexed in __init__()

    def __init__(self, stream):
        self.file_info = OrderedDict()
        _extractall(stream, file_info=self.file_info)
        self.stream = stream

    def open(self, name, spool_size=5*1024*1024):
        if isinstance(name, LocalHeader):
            name = name.filename
        try:
            offset, header = self.file_info[name]
        except KeyError:
            raise ValueError('This ZIP container has no file named: %s'%name)

        self.stream.seek(offset)
        dest = SpooledTemporaryFile(max_size=spool_size)
        if header.compression_method == ZIP_STORED:
            copy_stored_file(self.stream, header.compressed_size, dest)
        else:
            copy_compressed_file(self.stream, header.compressed_size, dest)
        dest.seek(0)
        return dest

    def getinfo(self, name):
        try:
            offset, header = self.file_info[name]
        except KeyError:
            raise ValueError('This ZIP container has no file named: %s'%name)
        return header

    def read(self, name, spool_size=5*1024*1024):
        with self.open(name, spool_size=spool_size) as f:
            return f.read()

    def extractall(self, path=None):
        self.stream.seek(0)
        _extractall(self.stream, path=(path or os.getcwdu()))

    def close(self):
        pass

    def safe_replace(self, name, datastream, extra_replacements={},
            add_missing=False):
        # Rebuild the archive with the named members replaced, then copy the
        # result back over the original stream
        from calibre.utils.zipfile import ZipFile, ZipInfo
        replacements = {name:datastream}
        replacements.update(extra_replacements)
        names = frozenset(replacements.keys())
        found = set([])

        with SpooledTemporaryFile(max_size=100*1024*1024) as temp:
            ztemp = ZipFile(temp, 'w')
            for offset, header in self.file_info.itervalues():
                if header.filename in names:
                    zi = ZipInfo(header.filename)
                    zi.compress_type = header.compression_method
                    ztemp.writestr(zi, replacements[header.filename].read())
                    found.add(header.filename)
                else:
                    ztemp.writestr(header.filename, self.read(header.filename,
                        spool_size=0))

            if add_missing:
                for name in names - found:
                    ztemp.writestr(name, replacements[name].read())
            ztemp.close()
            zipstream = self.stream
            temp.seek(0)
            zipstream.seek(0)
            zipstream.truncate()
            shutil.copyfileobj(temp, zipstream)
            zipstream.flush()

if __name__ == '__main__':
    extractall(sys.argv[-1])
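
The new module is also usable on its own: LocalZipFile gives random access to members of a damaged archive, and the __main__ block makes it a small command-line extractor (python localunzip.py broken.zip unpacks into the current directory). A short usage sketch, with placeholder file names:

from calibre.utils.localunzip import LocalZipFile, extractall

with open('damaged.epub', 'rb') as f:
    zf = LocalZipFile(f)                       # index members via local headers
    raw = zf.read('META-INF/container.xml')    # decompressed bytes of one member

extractall('damaged.epub', '/tmp/unpacked')    # accepts a path or an open stream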