Speed up unzipping of large ZIP files by 25%

Parallelize file I/O with decompression and CRC checking
This commit is contained in:
Kovid Goyal 2025-04-08 16:09:46 +05:30
parent 5504e9fdf9
commit 0594f026ba
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 59 additions and 44 deletions

View File

@ -42,7 +42,7 @@ class KepubifyTests(BaseTest):
actual = get_container(outpath, tweak_mode=True) actual = get_container(outpath, tweak_mode=True)
self.assertEqual( self.assertEqual(
tuple(expected.manifest_items_with_property('cover-image')), tuple(actual.manifest_items_with_property('cover-image'))) tuple(expected.manifest_items_with_property('cover-image')), tuple(actual.manifest_items_with_property('cover-image')))
self.assertEqual(tuple(expected.mime_map), tuple(actual.mime_map)) self.assertEqual(expected.mime_map, actual.mime_map)
for name, mt in expected.mime_map.items(): for name, mt in expected.mime_map.items():
if mt in OEB_DOCS or mt in OEB_STYLES or name.endswith('.opf'): if mt in OEB_DOCS or mt in OEB_STYLES or name.endswith('.opf'):
self.assertEqual(expected.open(name, 'rb').read(), actual.open(name, 'rb').read()) self.assertEqual(expected.open(name, 'rb').read(), actual.open(name, 'rb').read())

View File

@ -11,7 +11,9 @@ import stat
import struct import struct
import sys import sys
import time import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing from contextlib import closing
from threading import Lock
from calibre import sanitize_file_name from calibre import sanitize_file_name
from calibre.constants import filesystem_encoding from calibre.constants import filesystem_encoding
@ -551,6 +553,7 @@ class ZipExtFile(io.BufferedIOBase):
self.mode = mode self.mode = mode
self.name = zipinfo.filename self.name = zipinfo.filename
self.uncompressed_size = zipinfo.file_size
if hasattr(zipinfo, 'CRC'): if hasattr(zipinfo, 'CRC'):
self._expected_crc = zipinfo.CRC self._expected_crc = zipinfo.CRC
@ -713,6 +716,26 @@ class ZipExtFile(io.BufferedIOBase):
self._fileobj.seek(pos) self._fileobj.seek(pos)
return raw return raw
def decrypt_and_uncompress(self, raw: bytes) -> bytes:
    '''
    Convert the stored bytes of this member into its actual contents.

    :param raw: The complete stored data of the member, possibly
        encrypted and/or deflate-compressed.
    :return: The data after decryption (when a decrypter is set) and
        decompression (when the member uses ZIP_DEFLATED).
    '''
    if self._decrypter is not None and raw:
        # The decrypter is mapped over the individual byte values and the
        # results are collected into a bytearray, so it has to yield ints.
        # The resulting bytes object is already the decrypted payload:
        # passing it through b''.join() would iterate it as ints and raise
        # TypeError.
        raw = bytes(bytearray(map(self._decrypter, bytearray(raw))))
    if self._compress_type == ZIP_DEFLATED:
        # wbits=-15 selects a raw deflate stream (no zlib header/trailer).
        # The output buffer is pre-sized from the recorded uncompressed size
        # so that zlib does not need to grow it repeatedly.
        raw = zlib.decompress(raw, -15, max(self.uncompressed_size, zlib.DEF_BUF_SIZE))
    return raw
def check_crc(self, raw: bytes) -> None:
    '''
    Verify the CRC-32 of the fully decoded member data.

    Does nothing when no expected CRC is recorded for this member.

    :param raw: The complete decrypted and decompressed member contents.
    :raises BadZipfile: If the computed CRC-32 differs from the expected one.
    '''
    expected = self._expected_crc
    if expected is None:
        return
    actual = crc32(raw) & 0xffffffff
    if actual != expected:
        raise BadZipfile(f'Bad CRC-32 for file {self.name!r}')
def readall(self):
    '''
    Read the whole member in one go and return its contents.

    The stored bytes are fetched with read_raw(), passed through
    decrypt_and_uncompress() and validated with check_crc() before being
    returned.
    '''
    data = self.decrypt_and_uncompress(self.read_raw())
    self.check_crc(data)
    return data
class ZipFile: class ZipFile:
''' Class with methods to open, read, write, close, list and update zip files. ''' Class with methods to open, read, write, close, list and update zip files.
@ -1005,12 +1028,13 @@ class ZipFile:
def read(self, name, pwd=None): def read(self, name, pwd=None):
'''Return file bytes (as a string) for name.''' '''Return file bytes (as a string) for name.'''
return self.open(name, 'r', pwd).read() with closing(self.open(name, 'r', pwd)) as f:
return f.readall()
def read_raw(self, name, mode='r', pwd=None): def read_raw(self, name, mode='r', pwd=None):
'''Return the raw bytes in the zipfile corresponding to name.''' '''Return the raw bytes in the zipfile corresponding to name.'''
zef = self.open(name, mode=mode, pwd=pwd) with closing(self.open(name, mode=mode, pwd=pwd)) as zef:
return zef.read_raw() return zef.read_raw()
def open(self, name, mode='r', pwd=None): def open(self, name, mode='r', pwd=None):
'''Return file-like object for 'name'.''' '''Return file-like object for 'name'.'''
@ -1020,13 +1044,7 @@ class ZipFile:
raise RuntimeError( raise RuntimeError(
'Attempt to read ZIP archive that was already closed') 'Attempt to read ZIP archive that was already closed')
# Only open a new file for instances where we were not zef_file = self.fp
# given a file object in the constructor
if self._filePassed:
zef_file = self.fp
else:
zef_file = open(self.filename, 'rb')
# Make sure we have an info object # Make sure we have an info object
if isinstance(name, ZipInfo): if isinstance(name, ZipInfo):
# 'name' is already an info object # 'name' is already an info object
@ -1035,7 +1053,7 @@ class ZipFile:
# Get info object for name # Get info object for name
zinfo = self.getinfo(name) zinfo = self.getinfo(name)
zef_file.seek(zinfo.header_offset, 0) zef_file.seek(zinfo.header_offset, os.SEEK_SET)
# Skip the file header: # Skip the file header:
fheader = zef_file.read(sizeFileHeader) fheader = zef_file.read(sizeFileHeader)
@ -1100,28 +1118,28 @@ class ZipFile:
return self._extract_member(member, path, pwd) return self._extract_member(member, path, pwd)
def extractall(self, path=None, members=None, pwd=None): def extractall(self, path=None, members=None, pwd=None):
'''Extract all members from the archive to the current working '''
directory. `path' specifies a different directory to extract to. Extract all members from the archive to the current working
`members' is optional and must be a subset of the list returned directory. `path' specifies a different directory to extract to.
by namelist(). `members' is optional and must be a subset of the list returned
by namelist(). Uses multiple worker threads for max throughput.
''' '''
if members is None: if members is None:
members = self.namelist() members = self.namelist()
if path is None: if path is None:
path = os.getcwd() path = os.getcwd()
# Kovid: Extract longer names first, just in case the zip file has
# an entry for a directory without a trailing slash
members.sort(key=len, reverse=True)
args = [] args = []
for name in members: for name in members:
zi = self.getinfo(name) zi = self.getinfo(name)
dest = self._get_targetpath(zi, path) dest = self._get_targetpath(zi, path)
args.append((zi, dest, pwd)) args.append((zi, dest, pwd))
lock = Lock()
def do_one(a): def do_one(a):
return self._extract_member_to(*a) return self._extract_member_to(*a, lock=lock)
tuple(map(do_one, args)) with ThreadPoolExecutor(thread_name_prefix='ZipFile-') as e:
tuple(e.map(do_one, args))
def _get_targetpath(self, member: ZipInfo, targetpath: str) -> str: def _get_targetpath(self, member: ZipInfo, targetpath: str) -> str:
# build the destination pathname, replacing # build the destination pathname, replacing
@ -1162,7 +1180,7 @@ class ZipFile:
def _extract_member(self, member, targetpath, pwd): def _extract_member(self, member, targetpath, pwd):
return self._extract_member_to(member, self._get_targetpath(member, targetpath), pwd) return self._extract_member_to(member, self._get_targetpath(member, targetpath), pwd)
def _extract_member_to(self, member, targetpath, pwd): def _extract_member_to(self, member, targetpath, pwd, lock=None):
'''Extract the ZipInfo object 'member' to a physical '''Extract the ZipInfo object 'member' to a physical
file on the path targetpath. file on the path targetpath.
''' '''
@ -1175,23 +1193,30 @@ class ZipFile:
os.mkdir(targetpath) os.mkdir(targetpath)
return targetpath return targetpath
if not os.path.exists(targetpath): try:
# Kovid: Could be a previously automatically created directory target = open(targetpath, 'wb')
# in which case it is ignored except IsADirectoryError:
try: return targetpath
target = open(targetpath, 'wb') except OSError:
except OSError: targetpath = os.path.join(os.path.dirname(targetpath), sanitize_file_name(os.path.basename(targetpath)))
targetpath = os.path.join(os.path.dirname(targetpath), sanitize_file_name(os.path.basename(targetpath))) target = open(targetpath, 'wb')
target = open(targetpath, 'wb')
with target, closing(self.open(member, pwd=pwd)) as source: with target:
shutil.copyfileobj(source, target) if lock is None:
with closing(self.open(member, pwd=pwd)) as source:
shutil.copyfileobj(source, target)
else:
with lock, closing(self.open(member, pwd=pwd)) as source:
src = source.read_raw()
src = source.decrypt_and_uncompress(src)
source.check_crc(src)
target.write(src)
# Kovid: Try to preserve the timestamps in the ZIP file # Kovid: Try to preserve the timestamps in the ZIP file
try: try:
mtime = time.localtime() mtime = time.localtime()
mtime = time.mktime(member.date_time + (0, 0) + (mtime.tm_isdst,)) mtime = time.mktime(member.date_time + (0, 0) + (mtime.tm_isdst,))
os.utime(targetpath, (mtime, mtime)) os.utime(targetpath, (mtime, mtime))
except: except Exception:
pass pass
return targetpath return targetpath
@ -1680,17 +1705,7 @@ def main(args=None):
zf = ZipFile(args[1], 'r') zf = ZipFile(args[1], 'r')
out = args[2] out = args[2]
for path in zf.namelist(): zf.extractall(out)
if path.startswith('./'):
tgt = os.path.join(out, path[2:])
else:
tgt = os.path.join(out, path)
tgtdir = os.path.dirname(tgt)
if not os.path.exists(tgtdir):
os.makedirs(tgtdir)
with open(tgt, 'wb') as fp:
fp.write(zf.read(path))
zf.close() zf.close()
elif args[0] == '-c': elif args[0] == '-c':