Refactor various ebook download functions

The functions are now easily testable individually. An extra HTTP connection
is avoided when downloading a file directly from a URL. A workaround for
Project Gutenberg refusing to serve ebook files to browsers is added.
See #1354735 [When using "Get Books," all downloaded Project Gutenberg ePub books (and only Pr. Gut. books) are .2 MB, that is, they show up in my Calibre library, but they are empty. This wasn't so before as I have downloaded many Pr. Gut. ePub books in the past using “Get Books.” I have updated Calibre versions twice since this problem began. ePub downloads through other providers (eg., Feedbooks, Legimi, MobileRead) using “Get Books” functions correctly. Mac OS 10.7.5, Calibre 1.48.0.](https://bugs.launchpad.net/calibre/+bug/1354735)
This commit is contained in:
Kovid Goyal 2014-08-18 11:42:08 +05:30
parent 5b9e305d5d
commit b0f52e2a4d
5 changed files with 150 additions and 124 deletions

View File

@ -641,50 +641,6 @@ def url_slash_cleaner(url):
''' '''
return re.sub(r'(?<!:)/{2,}', '/', url) return re.sub(r'(?<!:)/{2,}', '/', url)
def get_download_filename(url, cookie_file=None):
    '''
    Get a local filename for a URL using the content disposition header.

    :param url: The URL to open (an HTTP request is made).
    :param cookie_file: Optional path to a Mozilla format cookie file whose
        cookies are sent with the request.
    :return: The filename from the Content-Disposition header when present,
        otherwise the last path component of the final (post redirect) URL;
        empty string if neither is available.
    '''
    from contextlib import closing
    from urllib2 import unquote as urllib2_unquote
    filename = ''
    br = browser()
    if cookie_file:
        from mechanize import MozillaCookieJar
        cj = MozillaCookieJar()
        cj.load(cookie_file)
        br.set_cookiejar(cj)
    last_part_name = ''
    try:
        with closing(br.open(url)) as r:
            # Fallback name: last component of the URL actually served
            # (after any redirects)
            last_part_name = r.geturl().split('/')[-1]
            disposition = r.info().get('Content-disposition', '')
            for p in disposition.split(';'):
                if 'filename' in p:
                    if '*=' in disposition:
                        # RFC 5987 form: filename*=charset'lang'name
                        parts = disposition.split('*=')[-1]
                        filename = parts.split('\'')[-1]
                    else:
                        filename = disposition.split('=')[-1]
                        # Guard against an empty value before stripping a
                        # single layer of surrounding quotes
                        if filename and filename[0] in ('\'', '"'):
                            filename = filename[1:]
                        if filename and filename[-1] in ('\'', '"'):
                            filename = filename[:-1]
                        filename = urllib2_unquote(filename)
                    break
    except Exception:
        # Best effort: log the failure and fall back to the URL derived name
        import traceback
        traceback.print_exc()
    if not filename:
        filename = last_part_name
    return filename
def human_readable(size, sep=' '): def human_readable(size, sep=' '):
""" Convert a size in bytes into a human readable form """ """ Convert a size in bytes into a human readable form """
divisor, suffix = 1, "B" divisor, suffix = 1, "B"

View File

@ -11,12 +11,43 @@ import shutil
from contextlib import closing from contextlib import closing
from mechanize import MozillaCookieJar from mechanize import MozillaCookieJar
from calibre import browser, get_download_filename from calibre import browser
from calibre.constants import __appname__, __version__
from calibre.ebooks import BOOK_EXTENSIONS from calibre.ebooks import BOOK_EXTENSIONS
from calibre.gui2 import Dispatcher from calibre.gui2 import Dispatcher
from calibre.gui2.threaded_jobs import ThreadedJob from calibre.gui2.threaded_jobs import ThreadedJob
from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.web import get_download_filename_from_response
def get_download_filename(response):
    # Derive a filesystem-safe local name from the HTTP response: take the
    # server-suggested filename, truncate the stem to 60 characters while
    # keeping the extension intact, then make it ASCII-safe.
    raw = get_download_filename_from_response(response)
    stem, ext = os.path.splitext(raw)
    return ascii_filename(stem[:60] + ext)
def download_file(url, cookie_file=None, filename=None):
    '''
    Download the file at ``url`` into a persistent temporary directory.

    :param url: The URL to fetch.
    :param cookie_file: Optional path to a Mozilla format cookie file whose
        cookies are sent with the request.
    :param filename: Optional name for the saved file; when not given the
        name is derived from the server response.
    :return: Path to the downloaded file.
    '''
    user_agent = None
    # Project Gutenberg returns an HTML page instead of the ebook file if
    # the user agent looks like a normal browser, so identify as calibre.
    # Match both http and https forms of the site.
    if url.startswith(('http://www.gutenberg.org', 'https://www.gutenberg.org')):
        user_agent = '%s/%s' % (__appname__, __version__)
    br = browser(user_agent=user_agent)
    if cookie_file:
        cj = MozillaCookieJar()
        cj.load(cookie_file)
        br.set_cookiejar(cj)
    with closing(br.open(url)) as r:
        if not filename:
            filename = get_download_filename(r)
        temp_path = os.path.join(PersistentTemporaryDirectory(), filename)
        with open(temp_path, 'w+b') as tf:
            # Stream in chunks instead of reading the whole body into memory
            shutil.copyfileobj(r, tf)
            dfilename = tf.name
    return dfilename
class EbookDownload(object): class EbookDownload(object):
@ -36,32 +67,12 @@ class EbookDownload(object):
pass pass
def _download(self, cookie_file, url, filename, save_loc, add_to_lib): def _download(self, cookie_file, url, filename, save_loc, add_to_lib):
dfilename = ''
if not url: if not url:
raise Exception(_('No file specified to download.')) raise Exception(_('No file specified to download.'))
if not save_loc and not add_to_lib: if not save_loc and not add_to_lib:
# Nothing to do. # Nothing to do.
return dfilename return ''
return download_file(url, cookie_file, filename)
if not filename:
filename = get_download_filename(url, cookie_file)
filename, ext = os.path.splitext(filename)
filename = filename[:60] + ext
filename = ascii_filename(filename)
br = browser()
if cookie_file:
cj = MozillaCookieJar()
cj.load(cookie_file)
br.set_cookiejar(cj)
with closing(br.open(url)) as r:
temp_path = os.path.join(PersistentTemporaryDirectory(), filename)
tf = open(temp_path, 'w+b')
tf.write(r.read())
dfilename = tf.name
return dfilename
def _add(self, filename, gui, add_to_lib, tags): def _add(self, filename, gui, add_to_lib, tags):
if not add_to_lib or not filename: if not add_to_lib or not filename:
@ -90,7 +101,9 @@ gui_ebook_download = EbookDownload()
def start_ebook_download(callback, job_manager, gui, cookie_file=None, url='', filename='', save_loc='', add_to_lib=True, tags=[]): def start_ebook_download(callback, job_manager, gui, cookie_file=None, url='', filename='', save_loc='', add_to_lib=True, tags=[]):
description = _('Downloading %s') % filename.decode('utf-8', 'ignore') if filename else url.decode('utf-8', 'ignore') description = _('Downloading %s') % filename.decode('utf-8', 'ignore') if filename else url.decode('utf-8', 'ignore')
job = ThreadedJob('ebook_download', description, gui_ebook_download, (gui, cookie_file, url, filename, save_loc, add_to_lib, tags), {}, callback, max_concurrent_count=2, killable=False) job = ThreadedJob('ebook_download', description, gui_ebook_download, (
gui, cookie_file, url, filename, save_loc, add_to_lib, tags), {},
callback, max_concurrent_count=2, killable=False)
job_manager.run_threaded_job(job) job_manager.run_threaded_job(job)

View File

@ -21,10 +21,67 @@ from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.search_result import SearchResult
web_url = 'http://m.gutenberg.org/'


def search(query, max_results=10, timeout=60):
    '''
    Search the Project Gutenberg mobile OPDS catalog.

    Yields at most ``max_results`` SearchResult objects, skipping entries
    that lack a title, an author or any downloadable format.
    '''
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)
    counter = max_results
    # gutenberg.org misbehaves with normal browser user agents, so
    # identify as calibre
    br = browser(user_agent='calibre/' + __version__)
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()

            # We could use the <link rel="alternate" type="text/html" ...> tag
            # from the detail odps page but this is easier.
            id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
            # The numeric part of the atom id is the book id
            s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', id)))
            if not s.detail_item:
                continue

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
            if not s.title or not s.author:
                continue

            # Get the formats and direct download links; the atom id is
            # itself the URL of the detail feed.
            with closing(br.open(id, timeout=timeout/4)) as nf:
                ndoc = etree.fromstring(nf.read())
                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                    mimetype = link.get('type')
                    href = link.get('href')
                    if mimetype:
                        ext = mimetypes.guess_extension(mimetype)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys())
            if not s.formats:
                continue

            # Covers are not real links but base64 encoded PNG thumbnails
            # embedded in the feed.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mimetype = link.get('type')
                if rel and href and mimetype:
                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                        if href.startswith('data:image/png;base64,'):
                            s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
            yield s
class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore): class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml' open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
web_url = 'http://m.gutenberg.org/' web_url = web_url
def search(self, query, max_results=10, timeout=60): def search(self, query, max_results=10, timeout=60):
''' '''
@ -48,57 +105,5 @@ class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
* Images are not links but base64 encoded strings. They are also not * Images are not links but base64 encoded strings. They are also not
real cover images but a little blue book thumbnail. real cover images but a little blue book thumbnail.
''' '''
for result in search(query, max_results, timeout):
url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query) yield result
counter = max_results
br = browser(user_agent='calibre/'+__version__)
with closing(br.open(url, timeout=timeout)) as f:
doc = etree.fromstring(f.read())
for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0:
break
counter -= 1
s = SearchResult()
# We could use the <link rel="alternate" type="text/html" ...> tag from the
# detail odps page but this is easier.
id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (self.web_url, re.sub('[^\d]', '', id)))
if not s.detail_item:
continue
s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
if not s.title or not s.author:
continue
# Get the formats and direct download links.
with closing(br.open(id, timeout=timeout/4)) as nf:
ndoc = etree.fromstring(nf.read())
for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
type = link.get('type')
href = link.get('href')
if type:
ext = mimetypes.guess_extension(type)
if ext:
ext = ext[1:].upper().strip()
s.downloads[ext] = href
s.formats = ', '.join(s.downloads.keys())
if not s.formats:
continue
for link in data.xpath('./*[local-name() = "link"]'):
rel = link.get('rel')
href = link.get('href')
type = link.get('type')
if rel and href and type:
if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
if href.startswith('data:image/png;base64,'):
s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
yield s

View File

@ -12,11 +12,12 @@ from urlparse import urlparse
from PyQt5.Qt import QNetworkCookieJar, QNetworkProxy, QUrl from PyQt5.Qt import QNetworkCookieJar, QNetworkProxy, QUrl
from PyQt5.QtWebKitWidgets import QWebView, QWebPage from PyQt5.QtWebKitWidgets import QWebView, QWebPage
from calibre import USER_AGENT, get_proxies, get_download_filename from calibre import USER_AGENT, get_proxies
from calibre.ebooks import BOOK_EXTENSIONS from calibre.ebooks import BOOK_EXTENSIONS
from calibre.gui2 import choose_save_file from calibre.gui2 import choose_save_file
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.web import get_download_filename
class NPWebView(QWebView): class NPWebView(QWebView):

View File

@ -5,3 +5,54 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
class Recipe(object): class Recipe(object):
pass pass
def get_download_filename_from_response(response):
    '''
    Get a filename for the downloaded file from an HTTP response.

    The Content-Disposition header is used when present; both the plain
    ``filename=`` form and the RFC 5987 ``filename*=`` form are handled,
    and percent-encoding is decoded in both cases.

    :param response: An open urllib2/mechanize style response object
        (must provide ``geturl()`` and ``info()``).
    :return: The filename, falling back to the last path component of the
        final (post redirect) URL; may be the empty string.
    '''
    try:
        from urllib2 import unquote as url_unquote  # Python 2
    except ImportError:
        from urllib.parse import unquote as url_unquote  # Python 3
    filename = last_part_name = ''
    try:
        # Fallback name: last component of the URL actually served
        last_part_name = response.geturl().split('/')[-1]
        disposition = response.info().get('Content-disposition', '')
        for p in disposition.split(';'):
            if 'filename' in p:
                if '*=' in disposition:
                    # RFC 5987 form: filename*=charset'lang'percent-encoded-name
                    parts = disposition.split('*=')[-1]
                    filename = parts.split('\'')[-1]
                else:
                    filename = disposition.split('=')[-1]
                    # Guard against an empty value before stripping a single
                    # layer of surrounding quotes (avoids IndexError)
                    if filename and filename[0] in ('\'', '"'):
                        filename = filename[1:]
                    if filename and filename[-1] in ('\'', '"'):
                        filename = filename[:-1]
                # Both forms may be percent-encoded; the RFC 5987 form
                # always is
                filename = url_unquote(filename)
                break
    except Exception:
        # Best effort: log the failure and fall back to the URL derived name
        import traceback
        traceback.print_exc()
    return filename or last_part_name
def get_download_filename(url, cookie_file=None):
    '''
    Get a local filename for a URL using the content disposition header.

    :param url: The URL to open (an HTTP request is made).
    :param cookie_file: Optional path to a Mozilla format cookie file whose
        cookies are sent with the request.
    :return: The server-suggested filename; empty string if an error occurs.
    '''
    from calibre import browser
    from contextlib import closing
    filename = ''
    br = browser()
    if cookie_file:
        from mechanize import MozillaCookieJar
        cj = MozillaCookieJar()
        cj.load(cookie_file)
        br.set_cookiejar(cj)
    try:
        with closing(br.open(url)) as r:
            filename = get_download_filename_from_response(r)
    except Exception:
        # except Exception rather than a bare except, so that
        # KeyboardInterrupt/SystemExit are not swallowed
        import traceback
        traceback.print_exc()
    return filename