Refactor various ebook download functions

The functions are now easily testable individually. An extra HTTP connection
is no longer made when downloading a file directly from a URL. A workaround is
added for Project Gutenberg refusing to serve ebook files to normal browser
user agents.
See [#1354735](https://bugs.launchpad.net/calibre/+bug/1354735) (When using "Get Books," all downloaded Project Gutenberg ePub books (and only Pr. Gut. books) are .2 MB, that is, they show up in my Calibre library, but they are empty. This wasn't so before as I have downloaded many Pr. Gut. ePub books in the past using “Get Books.” I have updated Calibre versions twice since this problem began. ePub downloads through other providers (eg., Feedbooks, Legimi, MobileRead) using “Get Books” functions correctly. Mac OS 10.7.5, Calibre 1.48.0.)
Kovid Goyal 2014-08-18 11:42:08 +05:30
parent 5b9e305d5d
commit b0f52e2a4d
5 changed files with 150 additions and 124 deletions
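
As a rough, hypothetical illustration of the testability claim in the commit message (this sketch is not part of the commit), the new calibre.web helper can be exercised in isolation with a stub response object. The FakeResponse class and the sample URL and header values below are invented for the example, and Python 2 is assumed, as in the helper itself.

    from calibre.web import get_download_filename_from_response

    class FakeResponse(object):
        # Minimal, hypothetical stand-in for the urllib2/mechanize response
        # object the helper expects: it only needs geturl() and info().
        def __init__(self, url, disposition):
            self._url, self._disposition = url, disposition

        def geturl(self):
            return self._url

        def info(self):
            return {'Content-disposition': self._disposition}

    # The filename comes from the Content-disposition header when present...
    print(get_download_filename_from_response(FakeResponse(
        'http://www.gutenberg.org/ebooks/1342', 'attachment; filename=pride-and-prejudice.epub')))
    # ...and falls back to the last component of the URL when it is absent.
    print(get_download_filename_from_response(FakeResponse(
        'http://example.com/books/some-book.epub', '')))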

View File

@@ -641,50 +641,6 @@ def url_slash_cleaner(url):
     '''
     return re.sub(r'(?<!:)/{2,}', '/', url)
 
-
-def get_download_filename(url, cookie_file=None):
-    '''
-    Get a local filename for a URL using the content disposition header
-    Returns empty string if no content disposition header present
-    '''
-    from contextlib import closing
-    from urllib2 import unquote as urllib2_unquote
-
-    filename = ''
-
-    br = browser()
-    if cookie_file:
-        from mechanize import MozillaCookieJar
-        cj = MozillaCookieJar()
-        cj.load(cookie_file)
-        br.set_cookiejar(cj)
-
-    last_part_name = ''
-    try:
-        with closing(br.open(url)) as r:
-            last_part_name = r.geturl().split('/')[-1]
-            disposition = r.info().get('Content-disposition', '')
-            for p in disposition.split(';'):
-                if 'filename' in p:
-                    if '*=' in disposition:
-                        parts = disposition.split('*=')[-1]
-                        filename = parts.split('\'')[-1]
-                    else:
-                        filename = disposition.split('=')[-1]
-                        if filename[0] in ('\'', '"'):
-                            filename = filename[1:]
-                        if filename[-1] in ('\'', '"'):
-                            filename = filename[:-1]
-                        filename = urllib2_unquote(filename)
-                    break
-    except:
-        import traceback
-        traceback.print_exc()
-
-    if not filename:
-        filename = last_part_name
-
-    return filename
-
-
 def human_readable(size, sep=' '):
     """ Convert a size in bytes into a human readable form """
     divisor, suffix = 1, "B"

View File

@@ -11,12 +11,43 @@ import shutil
 from contextlib import closing
 
 from mechanize import MozillaCookieJar
 
-from calibre import browser, get_download_filename
+from calibre import browser
+from calibre.constants import __appname__, __version__
 from calibre.ebooks import BOOK_EXTENSIONS
 from calibre.gui2 import Dispatcher
 from calibre.gui2.threaded_jobs import ThreadedJob
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.filenames import ascii_filename
+from calibre.web import get_download_filename_from_response
+
+
+def get_download_filename(response):
+    filename = get_download_filename_from_response(response)
+    filename, ext = os.path.splitext(filename)
+    filename = filename[:60] + ext
+    filename = ascii_filename(filename)
+    return filename
+
+
+def download_file(url, cookie_file=None, filename=None):
+    user_agent = None
+    if url.startswith('http://www.gutenberg.org'):
+        # Project Gutenberg returns an HTML page if the user agent is a normal
+        # browser user agent
+        user_agent = '%s/%s' % (__appname__, __version__)
+    br = browser(user_agent=user_agent)
+    if cookie_file:
+        cj = MozillaCookieJar()
+        cj.load(cookie_file)
+        br.set_cookiejar(cj)
+    with closing(br.open(url)) as r:
+        if not filename:
+            filename = get_download_filename(r)
+        temp_path = os.path.join(PersistentTemporaryDirectory(), filename)
+        with open(temp_path, 'w+b') as tf:
+            shutil.copyfileobj(r, tf)
+            dfilename = tf.name
+    return dfilename
+
 
 class EbookDownload(object):
@@ -36,32 +67,12 @@ class EbookDownload(object):
         pass
 
     def _download(self, cookie_file, url, filename, save_loc, add_to_lib):
-        dfilename = ''
         if not url:
             raise Exception(_('No file specified to download.'))
         if not save_loc and not add_to_lib:
             # Nothing to do.
-            return dfilename
-
-        if not filename:
-            filename = get_download_filename(url, cookie_file)
-            filename, ext = os.path.splitext(filename)
-            filename = filename[:60] + ext
-            filename = ascii_filename(filename)
-
-        br = browser()
-        if cookie_file:
-            cj = MozillaCookieJar()
-            cj.load(cookie_file)
-            br.set_cookiejar(cj)
-
-        with closing(br.open(url)) as r:
-            temp_path = os.path.join(PersistentTemporaryDirectory(), filename)
-            tf = open(temp_path, 'w+b')
-            tf.write(r.read())
-            dfilename = tf.name
-
-        return dfilename
+            return ''
+        return download_file(url, cookie_file, filename)
 
     def _add(self, filename, gui, add_to_lib, tags):
         if not add_to_lib or not filename:
@@ -90,7 +101,9 @@ gui_ebook_download = EbookDownload()
 
 def start_ebook_download(callback, job_manager, gui, cookie_file=None, url='', filename='', save_loc='', add_to_lib=True, tags=[]):
     description = _('Downloading %s') % filename.decode('utf-8', 'ignore') if filename else url.decode('utf-8', 'ignore')
-    job = ThreadedJob('ebook_download', description, gui_ebook_download, (gui, cookie_file, url, filename, save_loc, add_to_lib, tags), {}, callback, max_concurrent_count=2, killable=False)
+    job = ThreadedJob('ebook_download', description, gui_ebook_download, (
+        gui, cookie_file, url, filename, save_loc, add_to_lib, tags), {},
+        callback, max_concurrent_count=2, killable=False)
     job_manager.run_threaded_job(job)
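
A hypothetical usage sketch (not part of the commit) for the download_file helper added above; the Project Gutenberg URL is purely illustrative.

    from calibre.gui2.ebook_download import download_file

    # For a www.gutenberg.org URL, download_file sends a 'calibre/<version>'
    # user agent, since Project Gutenberg serves an HTML page instead of the
    # ebook file to ordinary browser user agents.
    path = download_file('http://www.gutenberg.org/ebooks/1342.epub')
    print(path)  # path of the file saved in a persistent temporary directory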

View File

@@ -21,34 +21,9 @@ from calibre.gui2.store.basic_config import BasicStoreConfig
 from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
 from calibre.gui2.store.search_result import SearchResult
 
-class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
-
-    open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
-    web_url = 'http://m.gutenberg.org/'
-
-    def search(self, query, max_results=10, timeout=60):
-        '''
-        Gutenberg's ODPS feed is poorly implmented and has a number of issues
-        which require very special handling to fix the results.
-
-        Issues:
-            * "Sort Alphabetically" and "Sort by Release Date" are returned
-              as book entries.
-            * The author is put into a "content" tag and not the author tag.
-            * The link to the book itself goes to an odps page which we need
-              to turn into a link to a web page.
-            * acquisition links are not part of the search result so we have
-              to go to the odps item itself. Detail item pages have a nasty
-              note saying:
-                  DON'T USE THIS PAGE FOR SCRAPING.
-                  Seriously. You'll only get your IP blocked.
-              We're using the ODPS feed because people are getting blocked with
-              the previous implementation so due to this using ODPS probably
-              won't solve this issue.
-            * Images are not links but base64 encoded strings. They are also not
-              real cover images but a little blue book thumbnail.
-        '''
+web_url = 'http://m.gutenberg.org/'
+
+def search(query, max_results=10, timeout=60):
         url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)
 
         counter = max_results
@@ -66,7 +41,7 @@ class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
                # We could use the <link rel="alternate" type="text/html" ...> tag from the
                # detail odps page but this is easier.
                id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
-                s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (self.web_url, re.sub('[^\d]', '', id)))
+                s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub('[^\d]', '', id)))
                if not s.detail_item:
                    continue
@@ -102,3 +77,33 @@ class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
                        s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
 
                yield s
+
+class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
+
+    open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
+    web_url = web_url
+
+    def search(self, query, max_results=10, timeout=60):
+        '''
+        Gutenberg's ODPS feed is poorly implmented and has a number of issues
+        which require very special handling to fix the results.
+
+        Issues:
+            * "Sort Alphabetically" and "Sort by Release Date" are returned
+              as book entries.
+            * The author is put into a "content" tag and not the author tag.
+            * The link to the book itself goes to an odps page which we need
+              to turn into a link to a web page.
+            * acquisition links are not part of the search result so we have
+              to go to the odps item itself. Detail item pages have a nasty
+              note saying:
+                  DON'T USE THIS PAGE FOR SCRAPING.
+                  Seriously. You'll only get your IP blocked.
+              We're using the ODPS feed because people are getting blocked with
+              the previous implementation so due to this using ODPS probably
+              won't solve this issue.
+            * Images are not links but base64 encoded strings. They are also not
+              real cover images but a little blue book thumbnail.
+        '''
+        for result in search(query, max_results, timeout):
+            yield result

View File

@@ -12,11 +12,12 @@ from urlparse import urlparse
 from PyQt5.Qt import QNetworkCookieJar, QNetworkProxy, QUrl
 from PyQt5.QtWebKitWidgets import QWebView, QWebPage
 
-from calibre import USER_AGENT, get_proxies, get_download_filename
+from calibre import USER_AGENT, get_proxies
 from calibre.ebooks import BOOK_EXTENSIONS
 from calibre.gui2 import choose_save_file
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.utils.filenames import ascii_filename
+from calibre.web import get_download_filename
 
 
 class NPWebView(QWebView):

View File

@@ -5,3 +5,54 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 
 class Recipe(object):
     pass
+
+
+def get_download_filename_from_response(response):
+    from urllib2 import unquote as urllib2_unquote
+    filename = last_part_name = ''
+    try:
+        last_part_name = response.geturl().split('/')[-1]
+        disposition = response.info().get('Content-disposition', '')
+        for p in disposition.split(';'):
+            if 'filename' in p:
+                if '*=' in disposition:
+                    parts = disposition.split('*=')[-1]
+                    filename = parts.split('\'')[-1]
+                else:
+                    filename = disposition.split('=')[-1]
+                    if filename[0] in ('\'', '"'):
+                        filename = filename[1:]
+                    if filename[-1] in ('\'', '"'):
+                        filename = filename[:-1]
+                    filename = urllib2_unquote(filename)
+                break
+    except Exception:
+        import traceback
+        traceback.print_exc()
+    return filename or last_part_name
+
+
+def get_download_filename(url, cookie_file=None):
+    '''
+    Get a local filename for a URL using the content disposition header
+    Returns empty string if an error occurs.
+    '''
+    from calibre import browser
+    from contextlib import closing
+
+    filename = ''
+
+    br = browser()
+    if cookie_file:
+        from mechanize import MozillaCookieJar
+        cj = MozillaCookieJar()
+        cj.load(cookie_file)
+        br.set_cookiejar(cj)
+
+    try:
+        with closing(br.open(url)) as r:
+            filename = get_download_filename_from_response(r)
+    except:
+        import traceback
+        traceback.print_exc()
+    return filename