mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speedup unzipping of large ZIP files by 25%
parallelize I/O and decompress/crc
This commit is contained in:
parent
5504e9fdf9
commit
0594f026ba
@ -42,7 +42,7 @@ class KepubifyTests(BaseTest):
|
||||
actual = get_container(outpath, tweak_mode=True)
|
||||
self.assertEqual(
|
||||
tuple(expected.manifest_items_with_property('cover-image')), tuple(actual.manifest_items_with_property('cover-image')))
|
||||
self.assertEqual(tuple(expected.mime_map), tuple(actual.mime_map))
|
||||
self.assertEqual(expected.mime_map, actual.mime_map)
|
||||
for name, mt in expected.mime_map.items():
|
||||
if mt in OEB_DOCS or mt in OEB_STYLES or name.endswith('.opf'):
|
||||
self.assertEqual(expected.open(name, 'rb').read(), actual.open(name, 'rb').read())
|
||||
|
@ -11,7 +11,9 @@ import stat
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from contextlib import closing
|
||||
from threading import Lock
|
||||
|
||||
from calibre import sanitize_file_name
|
||||
from calibre.constants import filesystem_encoding
|
||||
@ -551,6 +553,7 @@ class ZipExtFile(io.BufferedIOBase):
|
||||
|
||||
self.mode = mode
|
||||
self.name = zipinfo.filename
|
||||
self.uncompressed_size = zipinfo.file_size
|
||||
|
||||
if hasattr(zipinfo, 'CRC'):
|
||||
self._expected_crc = zipinfo.CRC
|
||||
@ -713,6 +716,26 @@ class ZipExtFile(io.BufferedIOBase):
|
||||
self._fileobj.seek(pos)
|
||||
return raw
|
||||
|
||||
def decrypt_and_uncompress(self, raw: bytes) -> bytes:
    '''
    Turn the raw stored bytes of this member into its file contents.

    If a decrypter is set and ``raw`` is non-empty, every byte of ``raw``
    is passed through the decrypter. If the member's compression type is
    ZIP_DEFLATED the (possibly decrypted) data is then inflated as a raw
    deflate stream. For any other compression type the data is returned
    without decompression.
    '''
    if self._decrypter is not None and raw:
        # The decrypter is applied one byte at a time; iterating a
        # bytearray yields ints and bytes() collects the int results.
        # Do not wrap this in b''.join(): joining a bytes object iterates
        # over ints and raises TypeError for any non-empty input.
        raw = bytes(map(self._decrypter, bytearray(raw)))
    if self._compress_type == ZIP_DEFLATED:
        # wbits=-15 selects a raw deflate stream (no zlib header/trailer).
        # The initial output buffer is sized from the recorded uncompressed
        # size, but never smaller than zlib's default buffer size.
        raw = zlib.decompress(raw, -15, max(self.uncompressed_size, zlib.DEF_BUF_SIZE))
    return raw
|
||||
|
||||
def check_crc(self, raw: bytes) -> None:
    '''
    Verify ``raw`` against the CRC-32 expected for this member.

    Does nothing when no expected CRC is set. Raises BadZipfile when the
    CRC-32 of ``raw`` differs from the expected value.
    '''
    if self._expected_crc is None:
        return
    # The checksum is computed over the whole of raw in one pass, so the
    # caller must hand in the complete uncompressed data of the member.
    crc = crc32(raw) & 0xffffffff
    if crc != self._expected_crc:
        raise BadZipfile(f'Bad CRC-32 for file {self.name!r}')
|
||||
|
||||
def readall(self):
    '''
    Return the complete contents of this member in a single call.

    The raw bytes are fetched with read_raw(), passed through
    decrypt_and_uncompress() and then verified with check_crc(), which
    raises if the checksum does not match.
    '''
    data = self.decrypt_and_uncompress(self.read_raw())
    self.check_crc(data)
    return data
|
||||
|
||||
|
||||
class ZipFile:
|
||||
''' Class with methods to open, read, write, close, list and update zip files.
|
||||
@ -1005,12 +1028,13 @@ class ZipFile:
|
||||
|
||||
def read(self, name, pwd=None):
|
||||
'''Return file bytes (as a string) for name.'''
|
||||
return self.open(name, 'r', pwd).read()
|
||||
with closing(self.open(name, 'r', pwd)) as f:
|
||||
return f.readall()
|
||||
|
||||
def read_raw(self, name, mode='r', pwd=None):
|
||||
'''Return the raw bytes in the zipfile corresponding to name.'''
|
||||
zef = self.open(name, mode=mode, pwd=pwd)
|
||||
return zef.read_raw()
|
||||
with closing(self.open(name, mode=mode, pwd=pwd)) as zef:
|
||||
return zef.read_raw()
|
||||
|
||||
def open(self, name, mode='r', pwd=None):
|
||||
'''Return file-like object for 'name'.'''
|
||||
@ -1020,13 +1044,7 @@ class ZipFile:
|
||||
raise RuntimeError(
|
||||
'Attempt to read ZIP archive that was already closed')
|
||||
|
||||
# Only open a new file for instances where we were not
|
||||
# given a file object in the constructor
|
||||
if self._filePassed:
|
||||
zef_file = self.fp
|
||||
else:
|
||||
zef_file = open(self.filename, 'rb')
|
||||
|
||||
zef_file = self.fp
|
||||
# Make sure we have an info object
|
||||
if isinstance(name, ZipInfo):
|
||||
# 'name' is already an info object
|
||||
@ -1035,7 +1053,7 @@ class ZipFile:
|
||||
# Get info object for name
|
||||
zinfo = self.getinfo(name)
|
||||
|
||||
zef_file.seek(zinfo.header_offset, 0)
|
||||
zef_file.seek(zinfo.header_offset, os.SEEK_SET)
|
||||
|
||||
# Skip the file header:
|
||||
fheader = zef_file.read(sizeFileHeader)
|
||||
@ -1100,28 +1118,28 @@ class ZipFile:
|
||||
return self._extract_member(member, path, pwd)
|
||||
|
||||
def extractall(self, path=None, members=None, pwd=None):
|
||||
'''Extract all members from the archive to the current working
|
||||
directory. `path' specifies a different directory to extract to.
|
||||
`members' is optional and must be a subset of the list returned
|
||||
by namelist().
|
||||
'''
|
||||
Extract all members from the archive to the current working
|
||||
directory. `path' specifies a different directory to extract to.
|
||||
`members' is optional and must be a subset of the list returned
|
||||
by namelist(). Uses multiple worker threads for max throughput.
|
||||
'''
|
||||
if members is None:
|
||||
members = self.namelist()
|
||||
if path is None:
|
||||
path = os.getcwd()
|
||||
|
||||
# Kovid: Extract longer names first, just in case the zip file has
|
||||
# an entry for a directory without a trailing slash
|
||||
members.sort(key=len, reverse=True)
|
||||
args = []
|
||||
for name in members:
|
||||
zi = self.getinfo(name)
|
||||
dest = self._get_targetpath(zi, path)
|
||||
args.append((zi, dest, pwd))
|
||||
|
||||
lock = Lock()
|
||||
def do_one(a):
|
||||
return self._extract_member_to(*a)
|
||||
tuple(map(do_one, args))
|
||||
return self._extract_member_to(*a, lock=lock)
|
||||
with ThreadPoolExecutor(thread_name_prefix='ZipFile-') as e:
|
||||
tuple(e.map(do_one, args))
|
||||
|
||||
def _get_targetpath(self, member: ZipInfo, targetpath: str) -> str:
|
||||
# build the destination pathname, replacing
|
||||
@ -1162,7 +1180,7 @@ class ZipFile:
|
||||
def _extract_member(self, member, targetpath, pwd):
|
||||
return self._extract_member_to(member, self._get_targetpath(member, targetpath), pwd)
|
||||
|
||||
def _extract_member_to(self, member, targetpath, pwd):
|
||||
def _extract_member_to(self, member, targetpath, pwd, lock=None):
|
||||
'''Extract the ZipInfo object 'member' to a physical
|
||||
file on the path targetpath.
|
||||
'''
|
||||
@ -1175,23 +1193,30 @@ class ZipFile:
|
||||
os.mkdir(targetpath)
|
||||
return targetpath
|
||||
|
||||
if not os.path.exists(targetpath):
|
||||
# Kovid: Could be a previously automatically created directory
|
||||
# in which case it is ignored
|
||||
try:
|
||||
target = open(targetpath, 'wb')
|
||||
except OSError:
|
||||
targetpath = os.path.join(os.path.dirname(targetpath), sanitize_file_name(os.path.basename(targetpath)))
|
||||
target = open(targetpath, 'wb')
|
||||
try:
|
||||
target = open(targetpath, 'wb')
|
||||
except IsADirectoryError:
|
||||
return targetpath
|
||||
except OSError:
|
||||
targetpath = os.path.join(os.path.dirname(targetpath), sanitize_file_name(os.path.basename(targetpath)))
|
||||
target = open(targetpath, 'wb')
|
||||
|
||||
with target, closing(self.open(member, pwd=pwd)) as source:
|
||||
shutil.copyfileobj(source, target)
|
||||
with target:
|
||||
if lock is None:
|
||||
with closing(self.open(member, pwd=pwd)) as source:
|
||||
shutil.copyfileobj(source, target)
|
||||
else:
|
||||
with lock, closing(self.open(member, pwd=pwd)) as source:
|
||||
src = source.read_raw()
|
||||
src = source.decrypt_and_uncompress(src)
|
||||
source.check_crc(src)
|
||||
target.write(src)
|
||||
# Kovid: Try to preserve the timestamps in the ZIP file
|
||||
try:
|
||||
mtime = time.localtime()
|
||||
mtime = time.mktime(member.date_time + (0, 0) + (mtime.tm_isdst,))
|
||||
os.utime(targetpath, (mtime, mtime))
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
return targetpath
|
||||
|
||||
@ -1680,17 +1705,7 @@ def main(args=None):
|
||||
|
||||
zf = ZipFile(args[1], 'r')
|
||||
out = args[2]
|
||||
for path in zf.namelist():
|
||||
if path.startswith('./'):
|
||||
tgt = os.path.join(out, path[2:])
|
||||
else:
|
||||
tgt = os.path.join(out, path)
|
||||
|
||||
tgtdir = os.path.dirname(tgt)
|
||||
if not os.path.exists(tgtdir):
|
||||
os.makedirs(tgtdir)
|
||||
with open(tgt, 'wb') as fp:
|
||||
fp.write(zf.read(path))
|
||||
zf.extractall(out)
|
||||
zf.close()
|
||||
|
||||
elif args[0] == '-c':
|
||||
|
Loading…
x
Reference in New Issue
Block a user