EPUB Input: Try to work around EPUBs that have missing or damaged ZIP central directories. calibre should now be able to read/convert such an EPUB file, provided it does not suffer from further corruption.
commit 9f570abd9b
parent f397686804
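The fix follows a simple try-then-fall-back pattern: use the strict ZipFile parser first, and only if it raises switch to the forgiving local-header extractor added by this commit. Below is a minimal sketch of that pattern; the helper name and destination directory are illustrative, only the calls to ZipFile and calibre.utils.localunzip.extractall come from the diff that follows.

    # Sketch only: mirrors the try/except shape in the EPUB input plugin hunk below.
    import os

    from calibre.utils.zipfile import ZipFile


    def extract_possibly_damaged_epub(stream, dest_dir=None):  # hypothetical helper
        dest_dir = dest_dir or os.getcwdu()
        try:
            zf = ZipFile(stream)
            zf.extractall(dest_dir)
        except Exception:
            # No usable central directory: re-read the stream with the
            # forgiving parser that walks local file headers instead.
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, path=dest_dir)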
EPUB input plugin (class EPUBInput):

@@ -150,8 +150,15 @@ class EPUBInput(InputFormatPlugin):
         from calibre import walk
         from calibre.ebooks import DRMError
         from calibre.ebooks.metadata.opf2 import OPF
-        zf = ZipFile(stream)
-        zf.extractall(os.getcwdu())
+        try:
+            zf = ZipFile(stream)
+            zf.extractall(os.getcwdu())
+        except:
+            log.exception('EPUB appears to be invalid ZIP file, trying a'
+                    ' more forgiving ZIP parser')
+            from calibre.utils.localunzip import extractall
+            stream.seek(0)
+            extractall(stream)
         encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
         opf = self.find_opf()
         if opf is None:
EPUB metadata reader (OCF container classes and metadata helpers):

@@ -10,6 +10,7 @@ from cStringIO import StringIO
 from contextlib import closing

 from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
+from calibre.utils.localunzip import LocalZipFile
 from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.opf2 import OPF
@@ -105,10 +106,13 @@ class OCFReader(OCF):

 class OCFZipReader(OCFReader):
     def __init__(self, stream, mode='r', root=None):
-        try:
-            self.archive = ZipFile(stream, mode=mode)
-        except BadZipfile:
-            raise EPubException("not a ZIP .epub OCF container")
+        if isinstance(stream, (LocalZipFile, ZipFile)):
+            self.archive = stream
+        else:
+            try:
+                self.archive = ZipFile(stream, mode=mode)
+            except BadZipfile:
+                raise EPubException("not a ZIP .epub OCF container")
         self.root = root
         if self.root is None:
             name = getattr(stream, 'name', False)
@@ -119,8 +123,18 @@ class OCFZipReader(OCFReader):
         super(OCFZipReader, self).__init__()

     def open(self, name, mode='r'):
+        if isinstance(self.archive, LocalZipFile):
+            return self.archive.open(name)
         return StringIO(self.archive.read(name))

+def get_zip_reader(stream, root=None):
+    try:
+        zf = ZipFile(stream, mode='r')
+    except:
+        stream.seek(0)
+        zf = LocalZipFile(stream)
+    return OCFZipReader(zf, root=root)
+
 class OCFDirReader(OCFReader):
     def __init__(self, path):
         self.root = path
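The new get_zip_reader() helper is what the metadata functions further below switch to. A short, hedged usage sketch (the file name is illustrative; the calls themselves appear in the hunks):

    # Illustrative: get_zip_reader() hands back an OCFZipReader whether or not
    # the EPUB has a valid central directory.
    with open('book.epub', 'rb') as stream:      # hypothetical file
        reader = get_zip_reader(stream)
        mi = reader.opf.to_book_metadata()       # same call get_metadata() makes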
@@ -184,7 +198,12 @@ def render_cover(opf, opf_path, zf, reader=None):
 def get_cover(opf, opf_path, stream, reader=None):
     raster_cover = opf.raster_cover
     stream.seek(0)
-    zf = ZipFile(stream)
+    try:
+        zf = ZipFile(stream)
+    except:
+        stream.seek(0)
+        zf = LocalZipFile(stream)
+
     if raster_cover:
         base = posixpath.dirname(opf_path)
         cpath = posixpath.normpath(posixpath.join(base, raster_cover))
@@ -207,7 +226,7 @@ def get_cover(opf, opf_path, stream, reader=None):
 def get_metadata(stream, extract_cover=True):
     """ Return metadata as a :class:`Metadata` object """
     stream.seek(0)
-    reader = OCFZipReader(stream)
+    reader = get_zip_reader(stream)
     mi = reader.opf.to_book_metadata()
     if extract_cover:
         try:
@@ -232,7 +251,7 @@ def _write_new_cover(new_cdata, cpath):

 def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
     stream.seek(0)
-    reader = OCFZipReader(stream, root=os.getcwdu())
+    reader = get_zip_reader(stream, root=os.getcwdu())
     raster_cover = reader.opf.raster_cover
     mi = MetaInformation(mi)
     new_cdata = None
@@ -283,7 +302,11 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
         reader.opf.timestamp = mi.timestamp

     newopf = StringIO(reader.opf.render())
-    safe_replace(stream, reader.container[OPF.MIMETYPE], newopf,
+    if isinstance(reader.archive, LocalZipFile):
+        reader.archive.safe_replace(reader.container[OPF.MIMETYPE], newopf,
+                extra_replacements=replacements)
+    else:
+        safe_replace(stream, reader.container[OPF.MIMETYPE], newopf,
             extra_replacements=replacements)
     try:
         if cpath is not None:
src/calibre/utils/localunzip.py (new file, 267 lines):

#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

'''
Try to read invalid zip files with missing or damaged central directories.
These are apparently produced in large numbers by the fruitcakes over at B&N.

Tries to only use the local headers to extract data from the damaged zip file.
'''

import os, sys, zlib, shutil
from struct import calcsize, unpack, pack
from collections import namedtuple, OrderedDict
from tempfile import SpooledTemporaryFile

HEADER_SIG = 0x04034b50
HEADER_BYTE_SIG = pack(b'<L', HEADER_SIG)
local_header_fmt = b'<L5HL2L2H'
local_header_sz = calcsize(local_header_fmt)
ZIP_STORED, ZIP_DEFLATED = 0, 8

LocalHeader = namedtuple('LocalHeader',
        'signature min_version flags compression_method mod_time mod_date '
        'crc32 compressed_size uncompressed_size filename_length extra_length '
        'filename extra')

def decode_arcname(name):
    if isinstance(name, bytes):
        from calibre.ebooks.chardet import detect
        try:
            name = name.decode('utf-8')
        except:
            res = detect(name)
            encoding = res['encoding']
            try:
                name = name.decode(encoding)
            except:
                name = name.decode('utf-8', 'replace')
    return name

def find_local_header(f):
    pos = f.tell()
    raw = f.read(50*1024)
    try:
        f.seek(pos + raw.index(HEADER_BYTE_SIG))
    except ValueError:
        f.seek(pos)
        return
    raw = f.read(local_header_sz)
    if len(raw) != local_header_sz:
        f.seek(pos)
        return
    header = LocalHeader(*(unpack(local_header_fmt, raw) + (None, None)))
    if header.signature == HEADER_SIG:
        return header
    f.seek(pos)

def read_local_file_header(f):
    pos = f.tell()
    raw = f.read(local_header_sz)
    if len(raw) != local_header_sz:
        f.seek(pos)
        return
    header = LocalHeader(*(unpack(local_header_fmt, raw) + (None, None)))
    if header.signature != HEADER_SIG:
        f.seek(pos)
        header = find_local_header(f)
        if header is None:
            return
    if header.min_version > 20:
        raise ValueError('This ZIP file uses unsupported features')
    if header.flags & 0b1:
        raise ValueError('This ZIP file is encrypted')
    if header.flags & (1 << 3):
        raise ValueError('This ZIP file uses data descriptors. This is unsupported')
    if header.flags & (1 << 13):
        raise ValueError('This ZIP file uses masking, unsupported.')
    if header.compression_method not in {ZIP_STORED, ZIP_DEFLATED}:
        raise ValueError('This ZIP file uses an unsupported compression method')
    fname = extra = None
    if header.filename_length > 0:
        fname = f.read(header.filename_length)
        if len(fname) != header.filename_length:
            return
        try:
            fname = fname.decode('ascii')
        except UnicodeDecodeError:
            if header.flags & (1 << 11):
                try:
                    fname = fname.decode('utf-8')
                except UnicodeDecodeError:
                    pass
            fname = decode_arcname(fname).replace('\\', '/')
    if header.extra_length > 0:
        extra = f.read(header.extra_length)
        if len(extra) != header.extra_length:
            return
    return LocalHeader(*(
        header[:-2] + (fname, extra)
        ))

def read_compressed_data(f, header):
    cdata = f.read(header.compressed_size)
    return cdata

def copy_stored_file(src, size, dest):
    read = 0
    amt = min(size, 20*1024)
    while read < size:
        raw = src.read(min(size-read, amt))
        if not raw:
            raise ValueError('Premature end of file')
        dest.write(raw)
        read += len(raw)

def copy_compressed_file(src, size, dest):
    d = zlib.decompressobj(-15)
    read = 0
    amt = min(size, 20*1024)
    while read < size:
        raw = src.read(min(size-read, amt))
        read += len(raw)
        dest.write(d.decompress(raw, 200*1024))
        count = 0
        while d.unconsumed_tail:
            count += 1
            dest.write(d.decompress(d.unconsumed_tail, 200*1024))

            if count > 100:
                raise ValueError('This ZIP file contains a ZIP bomb in %s'%
                        os.path.basename(dest.name))

def _extractall(f, path=None, file_info=None):
    found = False
    while True:
        header = read_local_file_header(f)
        if not header:
            break
        found = True
        parts = header.filename.split('/')
        if header.uncompressed_size == 0:
            # Directory
            f.seek(f.tell() + header.compressed_size)
            if path is not None:
                bdir = os.path.join(path, *parts)
                if not os.path.exists(bdir):
                    os.makedirs(bdir)
            continue

        # File
        if file_info is not None:
            file_info[header.filename] = (f.tell(), header)
        if path is not None:
            bdir = os.path.join(path, *(parts[:-1]))
            if not os.path.exists(bdir):
                os.makedirs(bdir)
            dest = os.path.join(path, *parts)
            with open(dest, 'wb') as o:
                if header.compression_method == ZIP_STORED:
                    copy_stored_file(f, header.compressed_size, o)
                else:
                    copy_compressed_file(f, header.compressed_size, o)
        else:
            f.seek(f.tell() + header.compressed_size)

    if not found:
        raise ValueError('Not a ZIP file')


def extractall(path_or_stream, path=None):
    f = path_or_stream
    close_at_end = False
    if not hasattr(f, 'read'):
        f = open(f, 'rb')
        close_at_end = True
    if path is None:
        path = os.getcwdu()
    pos = f.tell()
    try:
        _extractall(f, path)
    finally:
        f.seek(pos)
        if close_at_end:
            f.close()


class LocalZipFile(object):

    def __init__(self, stream):
        self.file_info = OrderedDict()
        _extractall(stream, file_info=self.file_info)
        self.stream = stream

    def open(self, name, spool_size=5*1024*1024):
        if isinstance(name, LocalHeader):
            name = name.filename
        try:
            offset, header = self.file_info.get(name)
        except KeyError:
            raise ValueError('This ZIP container has no file named: %s'%name)

        self.stream.seek(offset)
        dest = SpooledTemporaryFile(max_size=spool_size)

        if header.compression_method == ZIP_STORED:
            copy_stored_file(self.stream, header.compressed_size, dest)
        else:
            copy_compressed_file(self.stream, header.compressed_size, dest)
        dest.seek(0)
        return dest

    def getinfo(self, name):
        try:
            offset, header = self.file_info.get(name)
        except KeyError:
            raise ValueError('This ZIP container has no file named: %s'%name)
        return header

    def read(self, name, spool_size=5*1024*1024):
        with self.open(name, spool_size=spool_size) as f:
            return f.read()

    def extractall(self, path=None):
        self.stream.seek(0)
        _extractall(self.stream, path=(path or os.getcwdu()))

    def close(self):
        pass

    def safe_replace(self, name, datastream, extra_replacements={},
            add_missing=False):
        from calibre.utils.zipfile import ZipFile, ZipInfo
        replacements = {name:datastream}
        replacements.update(extra_replacements)
        names = frozenset(replacements.keys())
        found = set([])
        with SpooledTemporaryFile(max_size=100*1024*1024) as temp:
            ztemp = ZipFile(temp, 'w')
            for offset, header in self.file_info.itervalues():
                if header.filename in names:
                    zi = ZipInfo(header.filename)
                    zi.compress_type = header.compression_method
                    ztemp.writestr(zi, replacements[header.filename].read())
                    found.add(header.filename)
                else:
                    ztemp.writestr(header.filename, self.read(header.filename,
                        spool_size=0))
            if add_missing:
                for name in names - found:
                    ztemp.writestr(name, replacements[name].read())
            ztemp.close()
            zipstream = self.stream
            temp.seek(0)
            zipstream.seek(0)
            zipstream.truncate()
            shutil.copyfileobj(temp, zipstream)
            zipstream.flush()

if __name__ == '__main__':
    extractall(sys.argv[-1])
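To round off the new module, a hedged example of using it directly (the archive and directory paths are illustrative): extractall() unpacks a damaged archive using only local file headers, and LocalZipFile gives random access to individual members.

    from calibre.utils.localunzip import LocalZipFile, extractall

    # Unpack a damaged EPUB/ZIP into a directory, ignoring the central directory.
    extractall('damaged.epub', path='/tmp/unpacked')        # illustrative paths

    # Random access to a single member without extracting everything.
    with open('damaged.epub', 'rb') as f:
        zf = LocalZipFile(f)
        raw = zf.read('META-INF/container.xml')             # standard OCF entry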