mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speed up unzipping of large ZIP files by 25%
Parallelize file I/O with decompression and CRC checking
This commit is contained in:
parent
5504e9fdf9
commit
0594f026ba
@ -42,7 +42,7 @@ class KepubifyTests(BaseTest):
|
|||||||
actual = get_container(outpath, tweak_mode=True)
|
actual = get_container(outpath, tweak_mode=True)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
tuple(expected.manifest_items_with_property('cover-image')), tuple(actual.manifest_items_with_property('cover-image')))
|
tuple(expected.manifest_items_with_property('cover-image')), tuple(actual.manifest_items_with_property('cover-image')))
|
||||||
self.assertEqual(tuple(expected.mime_map), tuple(actual.mime_map))
|
self.assertEqual(expected.mime_map, actual.mime_map)
|
||||||
for name, mt in expected.mime_map.items():
|
for name, mt in expected.mime_map.items():
|
||||||
if mt in OEB_DOCS or mt in OEB_STYLES or name.endswith('.opf'):
|
if mt in OEB_DOCS or mt in OEB_STYLES or name.endswith('.opf'):
|
||||||
self.assertEqual(expected.open(name, 'rb').read(), actual.open(name, 'rb').read())
|
self.assertEqual(expected.open(name, 'rb').read(), actual.open(name, 'rb').read())
|
||||||
|
@ -11,7 +11,9 @@ import stat
|
|||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
from threading import Lock
|
||||||
|
|
||||||
from calibre import sanitize_file_name
|
from calibre import sanitize_file_name
|
||||||
from calibre.constants import filesystem_encoding
|
from calibre.constants import filesystem_encoding
|
||||||
@ -551,6 +553,7 @@ class ZipExtFile(io.BufferedIOBase):
|
|||||||
|
|
||||||
self.mode = mode
|
self.mode = mode
|
||||||
self.name = zipinfo.filename
|
self.name = zipinfo.filename
|
||||||
|
self.uncompressed_size = zipinfo.file_size
|
||||||
|
|
||||||
if hasattr(zipinfo, 'CRC'):
|
if hasattr(zipinfo, 'CRC'):
|
||||||
self._expected_crc = zipinfo.CRC
|
self._expected_crc = zipinfo.CRC
|
||||||
@ -713,6 +716,26 @@ class ZipExtFile(io.BufferedIOBase):
|
|||||||
self._fileobj.seek(pos)
|
self._fileobj.seek(pos)
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
|
def decrypt_and_uncompress(self, raw: bytes) -> bytes:
|
||||||
|
if self._decrypter is not None and raw:
|
||||||
|
raw = b''.join(bytes(bytearray(map(self._decrypter, bytearray(raw)))))
|
||||||
|
if self._compress_type == ZIP_DEFLATED:
|
||||||
|
raw = zlib.decompress(raw, -15, max(self.uncompressed_size, zlib.DEF_BUF_SIZE))
|
||||||
|
return raw
|
||||||
|
|
||||||
|
def check_crc(self, raw: bytes) -> None:
|
||||||
|
if self._expected_crc is not None:
|
||||||
|
crc = crc32(raw) & 0xffffffff
|
||||||
|
# Compare the CRC of raw against the expected value (raw is presumably the complete member data)
|
||||||
|
if crc != self._expected_crc:
|
||||||
|
raise BadZipfile(f'Bad CRC-32 for file {self.name!r}')
|
||||||
|
|
||||||
|
def readall(self):
|
||||||
|
raw = self.read_raw()
|
||||||
|
raw = self.decrypt_and_uncompress(raw)
|
||||||
|
self.check_crc(raw)
|
||||||
|
return raw
|
||||||
|
|
||||||
|
|
||||||
class ZipFile:
|
class ZipFile:
|
||||||
''' Class with methods to open, read, write, close, list and update zip files.
|
''' Class with methods to open, read, write, close, list and update zip files.
|
||||||
@ -1005,12 +1028,13 @@ class ZipFile:
|
|||||||
|
|
||||||
def read(self, name, pwd=None):
|
def read(self, name, pwd=None):
|
||||||
'''Return file bytes (as a string) for name.'''
|
'''Return file bytes (as a string) for name.'''
|
||||||
return self.open(name, 'r', pwd).read()
|
with closing(self.open(name, 'r', pwd)) as f:
|
||||||
|
return f.readall()
|
||||||
|
|
||||||
def read_raw(self, name, mode='r', pwd=None):
|
def read_raw(self, name, mode='r', pwd=None):
|
||||||
'''Return the raw bytes in the zipfile corresponding to name.'''
|
'''Return the raw bytes in the zipfile corresponding to name.'''
|
||||||
zef = self.open(name, mode=mode, pwd=pwd)
|
with closing(self.open(name, mode=mode, pwd=pwd)) as zef:
|
||||||
return zef.read_raw()
|
return zef.read_raw()
|
||||||
|
|
||||||
def open(self, name, mode='r', pwd=None):
|
def open(self, name, mode='r', pwd=None):
|
||||||
'''Return file-like object for 'name'.'''
|
'''Return file-like object for 'name'.'''
|
||||||
@ -1020,13 +1044,7 @@ class ZipFile:
|
|||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
'Attempt to read ZIP archive that was already closed')
|
'Attempt to read ZIP archive that was already closed')
|
||||||
|
|
||||||
# Only open a new file for instances where we were not
|
zef_file = self.fp
|
||||||
# given a file object in the constructor
|
|
||||||
if self._filePassed:
|
|
||||||
zef_file = self.fp
|
|
||||||
else:
|
|
||||||
zef_file = open(self.filename, 'rb')
|
|
||||||
|
|
||||||
# Make sure we have an info object
|
# Make sure we have an info object
|
||||||
if isinstance(name, ZipInfo):
|
if isinstance(name, ZipInfo):
|
||||||
# 'name' is already an info object
|
# 'name' is already an info object
|
||||||
@ -1035,7 +1053,7 @@ class ZipFile:
|
|||||||
# Get info object for name
|
# Get info object for name
|
||||||
zinfo = self.getinfo(name)
|
zinfo = self.getinfo(name)
|
||||||
|
|
||||||
zef_file.seek(zinfo.header_offset, 0)
|
zef_file.seek(zinfo.header_offset, os.SEEK_SET)
|
||||||
|
|
||||||
# Skip the file header:
|
# Skip the file header:
|
||||||
fheader = zef_file.read(sizeFileHeader)
|
fheader = zef_file.read(sizeFileHeader)
|
||||||
@ -1100,28 +1118,28 @@ class ZipFile:
|
|||||||
return self._extract_member(member, path, pwd)
|
return self._extract_member(member, path, pwd)
|
||||||
|
|
||||||
def extractall(self, path=None, members=None, pwd=None):
|
def extractall(self, path=None, members=None, pwd=None):
|
||||||
'''Extract all members from the archive to the current working
|
'''
|
||||||
directory. `path' specifies a different directory to extract to.
|
Extract all members from the archive to the current working
|
||||||
`members' is optional and must be a subset of the list returned
|
directory. `path' specifies a different directory to extract to.
|
||||||
by namelist().
|
`members' is optional and must be a subset of the list returned
|
||||||
|
by namelist(). Uses multiple worker threads for max throughput.
|
||||||
'''
|
'''
|
||||||
if members is None:
|
if members is None:
|
||||||
members = self.namelist()
|
members = self.namelist()
|
||||||
if path is None:
|
if path is None:
|
||||||
path = os.getcwd()
|
path = os.getcwd()
|
||||||
|
|
||||||
# Kovid: Extract longer names first, just in case the zip file has
|
|
||||||
# an entry for a directory without a trailing slash
|
|
||||||
members.sort(key=len, reverse=True)
|
|
||||||
args = []
|
args = []
|
||||||
for name in members:
|
for name in members:
|
||||||
zi = self.getinfo(name)
|
zi = self.getinfo(name)
|
||||||
dest = self._get_targetpath(zi, path)
|
dest = self._get_targetpath(zi, path)
|
||||||
args.append((zi, dest, pwd))
|
args.append((zi, dest, pwd))
|
||||||
|
|
||||||
|
lock = Lock()
|
||||||
def do_one(a):
|
def do_one(a):
|
||||||
return self._extract_member_to(*a)
|
return self._extract_member_to(*a, lock=lock)
|
||||||
tuple(map(do_one, args))
|
with ThreadPoolExecutor(thread_name_prefix='ZipFile-') as e:
|
||||||
|
tuple(e.map(do_one, args))
|
||||||
|
|
||||||
def _get_targetpath(self, member: ZipInfo, targetpath: str) -> str:
|
def _get_targetpath(self, member: ZipInfo, targetpath: str) -> str:
|
||||||
# build the destination pathname, replacing
|
# build the destination pathname, replacing
|
||||||
@ -1162,7 +1180,7 @@ class ZipFile:
|
|||||||
def _extract_member(self, member, targetpath, pwd):
|
def _extract_member(self, member, targetpath, pwd):
|
||||||
return self._extract_member_to(member, self._get_targetpath(member, targetpath), pwd)
|
return self._extract_member_to(member, self._get_targetpath(member, targetpath), pwd)
|
||||||
|
|
||||||
def _extract_member_to(self, member, targetpath, pwd):
|
def _extract_member_to(self, member, targetpath, pwd, lock=None):
|
||||||
'''Extract the ZipInfo object 'member' to a physical
|
'''Extract the ZipInfo object 'member' to a physical
|
||||||
file on the path targetpath.
|
file on the path targetpath.
|
||||||
'''
|
'''
|
||||||
@ -1175,23 +1193,30 @@ class ZipFile:
|
|||||||
os.mkdir(targetpath)
|
os.mkdir(targetpath)
|
||||||
return targetpath
|
return targetpath
|
||||||
|
|
||||||
if not os.path.exists(targetpath):
|
try:
|
||||||
# Kovid: Could be a previously automatically created directory
|
target = open(targetpath, 'wb')
|
||||||
# in which case it is ignored
|
except IsADirectoryError:
|
||||||
try:
|
return targetpath
|
||||||
target = open(targetpath, 'wb')
|
except OSError:
|
||||||
except OSError:
|
targetpath = os.path.join(os.path.dirname(targetpath), sanitize_file_name(os.path.basename(targetpath)))
|
||||||
targetpath = os.path.join(os.path.dirname(targetpath), sanitize_file_name(os.path.basename(targetpath)))
|
target = open(targetpath, 'wb')
|
||||||
target = open(targetpath, 'wb')
|
|
||||||
|
|
||||||
with target, closing(self.open(member, pwd=pwd)) as source:
|
with target:
|
||||||
shutil.copyfileobj(source, target)
|
if lock is None:
|
||||||
|
with closing(self.open(member, pwd=pwd)) as source:
|
||||||
|
shutil.copyfileobj(source, target)
|
||||||
|
else:
|
||||||
|
with lock, closing(self.open(member, pwd=pwd)) as source:
|
||||||
|
src = source.read_raw()
|
||||||
|
src = source.decrypt_and_uncompress(src)
|
||||||
|
source.check_crc(src)
|
||||||
|
target.write(src)
|
||||||
# Kovid: Try to preserve the timestamps in the ZIP file
|
# Kovid: Try to preserve the timestamps in the ZIP file
|
||||||
try:
|
try:
|
||||||
mtime = time.localtime()
|
mtime = time.localtime()
|
||||||
mtime = time.mktime(member.date_time + (0, 0) + (mtime.tm_isdst,))
|
mtime = time.mktime(member.date_time + (0, 0) + (mtime.tm_isdst,))
|
||||||
os.utime(targetpath, (mtime, mtime))
|
os.utime(targetpath, (mtime, mtime))
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return targetpath
|
return targetpath
|
||||||
|
|
||||||
@ -1680,17 +1705,7 @@ def main(args=None):
|
|||||||
|
|
||||||
zf = ZipFile(args[1], 'r')
|
zf = ZipFile(args[1], 'r')
|
||||||
out = args[2]
|
out = args[2]
|
||||||
for path in zf.namelist():
|
zf.extractall(out)
|
||||||
if path.startswith('./'):
|
|
||||||
tgt = os.path.join(out, path[2:])
|
|
||||||
else:
|
|
||||||
tgt = os.path.join(out, path)
|
|
||||||
|
|
||||||
tgtdir = os.path.dirname(tgt)
|
|
||||||
if not os.path.exists(tgtdir):
|
|
||||||
os.makedirs(tgtdir)
|
|
||||||
with open(tgt, 'wb') as fp:
|
|
||||||
fp.write(zf.read(path))
|
|
||||||
zf.close()
|
zf.close()
|
||||||
|
|
||||||
elif args[0] == '-c':
|
elif args[0] == '-c':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user