mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Start work on editor tool to download external resources
This commit is contained in:
parent
7914ada946
commit
5ad2a42f64
122
src/calibre/ebooks/oeb/polish/download.py
Normal file
122
src/calibre/ebooks/oeb/polish/download.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
import shutil, os, posixpath, cgi, mimetypes
|
||||||
|
from collections import defaultdict
|
||||||
|
from contextlib import closing
|
||||||
|
from urlparse import urlparse
|
||||||
|
from multiprocessing.dummy import Pool
|
||||||
|
from functools import partial
|
||||||
|
from tempfile import NamedTemporaryFile
|
||||||
|
from urllib2 import urlopen
|
||||||
|
|
||||||
|
from calibre import as_unicode, sanitize_file_name2
|
||||||
|
from calibre.ebooks.oeb.polish.utils import guess_type
|
||||||
|
from calibre.ebooks.oeb.base import OEB_DOCS, iterlinks, barename, OEB_STYLES
|
||||||
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
|
from calibre.web import get_download_filename_from_response
|
||||||
|
|
||||||
|
|
||||||
|
def is_external(url):
    '''
    Return True if ``url`` parses to a URL whose scheme is one of http,
    https, file or ftp.

    Anything that cannot be parsed by ``urlparse`` is reported as not
    external rather than raising.
    '''
    try:
        scheme = urlparse(url).scheme
    except Exception:
        # An unparseable URL cannot be downloaded, so it is not external
        return False
    return scheme in {'http', 'https', 'file', 'ftp'}
|
||||||
|
|
||||||
|
|
||||||
|
def iterhtmllinks(container, name):
    '''
    Yield ``(element, attribute, link)`` for every link found by
    ``iterlinks`` in the parsed form of ``name`` that is external (see
    :func:`is_external`) and does not come from an ``<a>`` tag.
    '''
    for elem, attribute, href, _pos in iterlinks(container.parsed(name)):
        # Links on <a> tags are skipped, only the remaining links are of
        # interest here
        if barename(elem.tag).lower() == 'a':
            continue
        if is_external(href):
            yield elem, attribute, href
|
||||||
|
|
||||||
|
|
||||||
|
def get_external_resources(container):
    '''
    Find the external resources referenced by the HTML and CSS files in
    ``container``.

    :return: A ``defaultdict(list)`` mapping each external URL to the list of
             names of the files that reference it. A name appears once for
             every reference it contains.
    '''
    ans = defaultdict(list)
    for name, media_type in container.mime_map.iteritems():
        # Skip entries that do not correspond to an actual file
        if not (container.has_name(name) and container.exists(name)):
            continue
        if media_type in OEB_DOCS:
            for el, attr, link in iterhtmllinks(container, name):
                ans[link].append(name)
        elif media_type in OEB_STYLES:
            # Container.iterlinks() yields (link, line, offset) tuples unless
            # line numbers are turned off, and it yields every link in the
            # stylesheet, so the internal ones have to be filtered out here
            # just as iterhtmllinks() does for HTML files.
            for link in container.iterlinks(name, get_line_numbers=False):
                if is_external(link):
                    ans[link].append(name)
    return ans
|
||||||
|
|
||||||
|
def get_filename(original_url_parsed, response):
    '''
    Work out a file name for a downloaded resource.

    The name reported by ``get_download_filename_from_response`` is preferred,
    then the last component of the URL path, then ``'unknown'``. If the
    response has a Content-Type that differs from the type guessed from that
    name, the first extension known for the Content-Type is appended.

    :param original_url_parsed: The parsed form of the requested URL, only its
                                ``path`` attribute is used.
    :param response: The response object for the download.
    '''
    filename = get_download_filename_from_response(response)
    if not filename:
        filename = posixpath.basename(original_url_parsed.path) or 'unknown'

    content_type = response.info().get('Content-Type', '')
    if not content_type:
        return filename
    # Drop any parameters (e.g. charset), keeping only the bare MIME type
    content_type = cgi.parse_header(content_type)[0].lower()
    if not content_type or guess_type(filename) == content_type:
        return filename

    candidates = mimetypes.guess_all_extensions(content_type)
    if candidates:
        filename += candidates[0]
    return filename
|
||||||
|
|
||||||
|
def download_one(tdir, timeout, url):
    '''
    Download ``url`` into a new temporary file inside ``tdir``.

    :param tdir: Directory in which the temporary file is created. The file is
                 not deleted by this function.
    :param timeout: Timeout passed to ``urlopen`` for non ``file:`` URLs.
    :param url: The URL to fetch, ``file:`` URLs are read from ``purl.path``.
    :return: ``(True, (url, filename, path_to_downloaded_file, mime_type))``
             on success and ``(False, (url, error_message))`` on failure. This
             function never raises, every exception is converted into a
             failure result.
    '''
    try:
        purl = urlparse(url)
        is_local = purl.scheme == 'file'
        with NamedTemporaryFile(dir=tdir, delete=False) as dest:
            src = lopen(purl.path, 'rb') if is_local else urlopen(url, timeout=timeout)
            # closing() is used since the object returned by urllib2.urlopen()
            # is not a context manager. The file name is computed inside the
            # block so that src is closed even if that fails.
            with closing(src):
                if is_local:
                    filename = os.path.basename(purl.path)
                else:
                    filename = get_filename(purl, src)
                shutil.copyfileobj(src, dest)
        filename = sanitize_file_name2(filename)
        mt = guess_type(filename)
        if mt in OEB_DOCS:
            raise ValueError('The external resource {} looks like a HTML document ({})'.format(url, filename))
        if not mt or mt == 'application/octet-stream' or '.' not in filename:
            raise ValueError('The external resource {} is not of a known type'.format(url))
        return True, (url, filename, dest.name, mt)
    except Exception as err:
        # Always a 2-tuple, so that callers can unpack success and failure
        # results in the same way: ok, result = download_one(...)
        return False, (url, as_unicode(err))
|
||||||
|
|
||||||
|
|
||||||
|
def download_external_resoures(container, urls, timeout=60):
    '''
    Download ``urls`` using a pool of ten threads and add every successfully
    downloaded file to ``container``.

    :param container: The container the downloaded files are added to.
    :param urls: An iterable of URLs to download.
    :param timeout: Passed on to ``download_one`` for each URL.
    :return: ``(replacements, failures)`` where ``replacements`` maps a URL to
             the name returned by ``container.add_file`` for it and
             ``failures`` maps a URL to its error.
    '''
    replacements, failures = {}, {}
    with TemporaryDirectory('editor_download') as tdir:
        fetch = partial(download_one, tdir, timeout)
        with closing(Pool(10)) as pool:
            # Results are handled in whatever order the downloads finish
            for ok, result in pool.imap_unordered(fetch, urls):
                if not ok:
                    url, err = result
                    failures[url] = err
                    continue
                url, suggested_filename, downloaded_file, mt = result
                with lopen(downloaded_file, 'rb') as src:
                    # The container may pick a different name if the
                    # suggested one cannot be used as is
                    name = container.add_file(suggested_filename, src, mt, modify_name_if_needed=True)
                replacements[url] = name
    return replacements, failures
|
||||||
|
|
||||||
|
def replacer(url_map):
    '''
    Build a link replacement function from ``url_map``.

    The returned function maps a URL to ``url_map[url]`` when there is a
    value other than None for it and returns the URL unchanged otherwise. Its
    ``replaced`` attribute starts out False and becomes True once at least one
    URL has been replaced.
    '''
    def replace(url):
        new_url = url_map.get(url)
        if new_url is None:
            return url
        replace.replaced = True
        return new_url

    replace.replaced = False
    return replace
|
||||||
|
|
||||||
|
def replace_resources(container, urls, replacements):
    '''
    Rewrite the links to downloaded resources so that they point at the
    copies inside ``container``.

    :param container: The container whose files are to be updated.
    :param urls: Mapping of URL to the names of the files that reference it,
                 such as the mapping built by ``get_external_resources``. A
                 single name instead of a collection of names is accepted as
                 well.
    :param replacements: Mapping of URL to the name of the file in the
                         container that replaces it. URLs without a
                         replacement are left alone.
    :return: True if at least one link was replaced.
    '''
    url_maps = defaultdict(dict)
    changed = False
    for url, names in urls.iteritems():
        replacement = replacements.get(url)
        if replacement is None:
            continue
        if not isinstance(names, (list, tuple, set, frozenset)):
            # A bare name rather than a collection of names
            names = (names,)
        for name in names:
            # The href is relative to the referencing file, so it has to be
            # computed separately for every file that uses the URL
            url_maps[name][url] = container.name_to_href(replacement, name)
    for name, url_map in url_maps.iteritems():
        r = replacer(url_map)
        container.replace_links(name, r)
        changed |= r.replaced
    return changed
|
Loading…
x
Reference in New Issue
Block a user