Reuse a single downloaded file for repeated data URIs

This commit is contained in:
Kovid Goyal 2018-06-12 18:18:43 +05:30
parent e8fdd7a83c
commit b5644ec6b0
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 14 additions and 4 deletions

View File

@ -103,9 +103,10 @@ def sanitize_file_name(x):
return make_filename_safe(x)
def download_one(tdir, timeout, progress_report, url):
def download_one(tdir, timeout, progress_report, data_uri_map, url):
try:
purl = urlparse(url)
data_url_key = None
with NamedTemporaryFile(dir=tdir, delete=False) as df:
if purl.scheme == 'file':
src = lopen(purl.path, 'rb')
@ -117,6 +118,10 @@ def download_one(tdir, timeout, progress_report, url):
if parts and parts[-1].lower() == 'base64':
payload = re.sub(r'\s+', '', payload)
payload = standard_b64decode(payload)
seen_before = data_uri_map.get(payload)
if seen_before is not None:
return True, (url, filename, seen_before, guess_type(seen_before))
data_url_key = payload
else:
payload = payload.encode('utf-8')
src = BytesIO(payload)
@ -137,6 +142,8 @@ def download_one(tdir, timeout, progress_report, url):
dest = ProgressTracker(df, url, sz, progress_report)
with closing(src):
shutil.copyfileobj(src, dest)
if data_url_key is not None:
data_uri_map[data_url_key] = dest.name
filename = sanitize_file_name(filename)
mt = guess_type(filename)
if mt in OEB_DOCS:
@ -151,10 +158,11 @@ def download_one(tdir, timeout, progress_report, url):
def download_external_resources(container, urls, timeout=60, progress_report=lambda url, done, total: None):
failures = {}
replacements = {}
data_uri_map = {}
with TemporaryDirectory('editor-download') as tdir:
pool = Pool(10)
with closing(pool):
for ok, result in pool.imap_unordered(partial(download_one, tdir, timeout, progress_report), urls):
for ok, result in pool.imap_unordered(partial(download_one, tdir, timeout, progress_report, data_uri_map), urls):
if ok:
url, suggested_filename, downloaded_file, mt = result
with lopen(downloaded_file, 'rb') as src:

View File

@ -49,11 +49,13 @@ class ChooseResources(QWidget):
self.items.clear()
self.original_resources = resources
dc = 0
for url in resources:
for url, matches in resources.iteritems():
text = url
num = len(matches)
if text.startswith('data:'):
dc += 1
text = _('Data URL ({})').format(dc)
text = _('Data URL #{}').format(dc)
text += ' ({})'.format(ngettext('one instance', '{} instances', num).format(num))
i = QListWidgetItem(text, self.items)
i.setData(Qt.UserRole, url)
i.setCheckState(Qt.Checked)