From b0f52e2a4dcf3d46eda32de3952d892fdf0ea0fa Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 18 Aug 2014 11:42:08 +0530
Subject: [PATCH] Refactor various ebook download functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The functions are now easily testable individually. An extra HTTP connection
when downloading a file from a URL directly is avoided. A workaround for
Project Gutenberg refusing to serve ebook files to browsers is added.
See #1354735 [When using "Get Books," all downloaded Project Gutenberg ePub books (and only Pr. Gut. books) are .2 MB, that is, they show up in my Calibre library, but they are empty. This wasn't so before as I have downloaded many Pr. Gut. ePub books in the past using “Get Books.” I have updated Calibre versions twice since this problem began. ePub downloads through other providers (eg., Feedbooks, Legimi, MobileRead) using “Get Books” functions correctly. Mac OS 10.7.5, Calibre 1.48.0.](https://bugs.launchpad.net/calibre/+bug/1354735)
---
 src/calibre/__init__.py                       |  44 -------
 src/calibre/gui2/ebook_download.py            |  61 ++++++----
 .../gui2/store/stores/gutenberg_plugin.py     | 115 +++++++++---------
 src/calibre/gui2/store/web_control.py         |   3 +-
 src/calibre/web/__init__.py                   |  51 ++++++++
 5 files changed, 150 insertions(+), 124 deletions(-)

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 9d599c9cf0..81c8027bb4 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -641,50 +641,6 @@ def url_slash_cleaner(url):
     '''
     return re.sub(r'(?<!:)/{2,}', '/', url)
 
-def get_download_filename(url, cookie_file=None):
-    '''
-    Get a local filename for a URL using the content disposition header
-    Returns empty string if no content disposition header present
-    '''
-    from contextlib import closing
-    from urllib2 import unquote as urllib2_unquote
-
-    filename = ''
-
-    br = browser()
-    if cookie_file:
-        from mechanize import MozillaCookieJar
-        cj = MozillaCookieJar()
-        cj.load(cookie_file)
-        br.set_cookiejar(cj)
-
-    last_part_name = ''
-    try:
-        with closing(br.open(url)) as r:
-            last_part_name = r.geturl().split('/')[-1]
-            disposition = r.info().get('Content-disposition', '')
-            for p in disposition.split(';'):
-                if 'filename' in p:
-                    if '*=' in disposition:
-                        parts = disposition.split('*=')[-1]
-                        filename = parts.split('\'')[-1]
-                    else:
-                        filename = disposition.split('=')[-1]
-                    if filename[0] in ('\'', '"'):
-                        filename = filename[1:]
-                    if filename[-1] in ('\'', '"'):
-                        filename = filename[:-1]
-                    filename = urllib2_unquote(filename)
-                    break
-    except:
-        import traceback
-        traceback.print_exc()
-
-    if not filename:
-        filename = last_part_name
-
-    return filename
-
 def human_readable(size, sep=' '):
     """ Convert a size in bytes into a human readable form """
     divisor, suffix = 1, "B"
diff --git a/src/calibre/gui2/ebook_download.py b/src/calibre/gui2/ebook_download.py
index facc5016f2..71e8e43480 100644
--- a/src/calibre/gui2/ebook_download.py
+++ b/src/calibre/gui2/ebook_download.py
@@ -11,12 +11,43 @@ import shutil
 from contextlib import closing
 from mechanize import MozillaCookieJar
 
-from calibre import browser, get_download_filename
+from calibre import browser
+from calibre.constants import __appname__, __version__
 from calibre.ebooks import BOOK_EXTENSIONS
 from calibre.gui2 import Dispatcher
 from calibre.gui2.threaded_jobs import ThreadedJob
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.filenames import ascii_filename
+from calibre.web import get_download_filename_from_response
+
+def get_download_filename(response):
+    'Filename for response: stem cut to 60 characters, then ascii_filename()'
+    raw_name = get_download_filename_from_response(response)
+    stem, suffix = os.path.splitext(raw_name)
+    shortened = stem[:60] + suffix
+    return ascii_filename(shortened)
+
+def download_file(url, cookie_file=None, filename=None):
+    '''
+    Fetch url into a file inside a PersistentTemporaryDirectory and return its
+    path. The file is named from the response when filename is empty.
+    '''
+    # Project Gutenberg returns an HTML page if the user agent is a normal
+    # browser user agent, so its URLs get an appname/version agent instead
+    is_gutenberg = url.startswith('http://www.gutenberg.org')
+    agent = '%s/%s' % (__appname__, __version__) if is_gutenberg else None
+    br = browser(user_agent=agent)
+    if cookie_file:
+        jar = MozillaCookieJar()
+        jar.load(cookie_file)
+        br.set_cookiejar(jar)
+    with closing(br.open(url)) as r:
+        name = filename or get_download_filename(r)
+        temp_path = os.path.join(PersistentTemporaryDirectory(), name)
+        with open(temp_path, 'w+b') as tf:
+            shutil.copyfileobj(r, tf)
+    return temp_path
+
 
 class EbookDownload(object):
 
@@ -36,32 +67,12 @@ class EbookDownload(object):
                 pass
 
     def _download(self, cookie_file, url, filename, save_loc, add_to_lib):
-        dfilename = ''
-
         if not url:
             raise Exception(_('No file specified to download.'))
         if not save_loc and not add_to_lib:
             # Nothing to do.
-            return dfilename
-
-        if not filename:
-            filename = get_download_filename(url, cookie_file)
-            filename, ext = os.path.splitext(filename)
-            filename = filename[:60] + ext
-            filename = ascii_filename(filename)
-
-        br = browser()
-        if cookie_file:
-            cj = MozillaCookieJar()
-            cj.load(cookie_file)
-            br.set_cookiejar(cj)
-        with closing(br.open(url)) as r:
-            temp_path = os.path.join(PersistentTemporaryDirectory(), filename)
-            tf = open(temp_path, 'w+b')
-            tf.write(r.read())
-            dfilename = tf.name
-
-        return dfilename
+            return ''
+        return download_file(url, cookie_file, filename)
 
     def _add(self, filename, gui, add_to_lib, tags):
         if not add_to_lib or not filename:
@@ -90,7 +101,9 @@ gui_ebook_download = EbookDownload()
 
 def start_ebook_download(callback, job_manager, gui, cookie_file=None, url='', filename='', save_loc='', add_to_lib=True, tags=[]):
     description = _('Downloading %s') % filename.decode('utf-8', 'ignore') if filename else url.decode('utf-8', 'ignore')
-    job = ThreadedJob('ebook_download', description, gui_ebook_download, (gui, cookie_file, url, filename, save_loc, add_to_lib, tags), {}, callback, max_concurrent_count=2, killable=False)
+    job = ThreadedJob('ebook_download', description, gui_ebook_download, (
+        gui, cookie_file, url, filename, save_loc, add_to_lib, tags), {},
+                      callback, max_concurrent_count=2, killable=False)
     job_manager.run_threaded_job(job)
 
 
diff --git a/src/calibre/gui2/store/stores/gutenberg_plugin.py b/src/calibre/gui2/store/stores/gutenberg_plugin.py
index 7b82c9b017..55593dbfea 100644
--- a/src/calibre/gui2/store/stores/gutenberg_plugin.py
+++ b/src/calibre/gui2/store/stores/gutenberg_plugin.py
@@ -21,10 +21,67 @@ from calibre.gui2.store.basic_config import BasicStoreConfig
 from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
 from calibre.gui2.store.search_result import SearchResult
 
+web_url = 'http://m.gutenberg.org/'  # base for the detail page URLs built in search()
+
+def search(query, max_results=10, timeout=60):  # generator yielding SearchResult objects
+    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)
+
+    counter = max_results
+    br = browser(user_agent='calibre/'+__version__)
+    with closing(br.open(url, timeout=timeout)) as f:
+        doc = etree.fromstring(f.read())
+        for data in doc.xpath('//*[local-name() = "entry"]'):
+            if counter <= 0:
+                break
+
+            counter -= 1  # entries skipped by the continues below still use up the budget
+
+            s = SearchResult()
+
+            # We could use the <link rel="alternate" type="text/html" ...> tag from the
+            # detail OPDS page but this is easier.
+            id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
+            s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub('[^\d]', '', id)))
+            if not s.detail_item:
+                continue
+
+            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
+            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
+            if not s.title or not s.author:
+                continue
+
+            # Get the formats and direct download links from the entry's own feed (its id is opened as a URL).
+            with closing(br.open(id, timeout=timeout/4)) as nf:
+                ndoc = etree.fromstring(nf.read())
+                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
+                    type = link.get('type')
+                    href = link.get('href')
+                    if type:
+                        ext = mimetypes.guess_extension(type)
+                        if ext:
+                            ext = ext[1:].upper().strip()  # '.epub' -> 'EPUB'
+                            s.downloads[ext] = href
+
+            s.formats = ', '.join(s.downloads.keys())
+            if not s.formats:  # no acquisition link had a type that mimetypes could map to an extension
+                continue
+
+            for link in data.xpath('./*[local-name() = "link"]'):
+                rel = link.get('rel')
+                href = link.get('href')
+                type = link.get('type')
+
+                if rel and href and type:
+                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
+                        if href.startswith('data:image/png;base64,'):  # only inline base64 PNG data is used as the cover
+                            s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
+
+            yield s
+
 class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
 
     open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
-    web_url = 'http://m.gutenberg.org/'
+    web_url = web_url
 
     def search(self, query, max_results=10, timeout=60):
         '''
@@ -48,57 +105,5 @@ class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
           * Images are not links but base64 encoded strings. They are also not
             real cover images but a little blue book thumbnail.
         '''
-
-        url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)
-
-        counter = max_results
-        br = browser(user_agent='calibre/'+__version__)
-        with closing(br.open(url, timeout=timeout)) as f:
-            doc = etree.fromstring(f.read())
-            for data in doc.xpath('//*[local-name() = "entry"]'):
-                if counter <= 0:
-                    break
-
-                counter -= 1
-
-                s = SearchResult()
-
-                # We could use the <link rel="alternate" type="text/html" ...> tag from the
-                # detail odps page but this is easier.
-                id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
-                s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (self.web_url, re.sub('[^\d]', '', id)))
-                if not s.detail_item:
-                    continue
-
-                s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
-                s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
-                if not s.title or not s.author:
-                    continue
-
-                # Get the formats and direct download links.
-                with closing(br.open(id, timeout=timeout/4)) as nf:
-                    ndoc = etree.fromstring(nf.read())
-                    for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
-                        type = link.get('type')
-                        href = link.get('href')
-                        if type:
-                            ext = mimetypes.guess_extension(type)
-                            if ext:
-                                ext = ext[1:].upper().strip()
-                                s.downloads[ext] = href
-
-                s.formats = ', '.join(s.downloads.keys())
-                if not s.formats:
-                    continue
-
-                for link in data.xpath('./*[local-name() = "link"]'):
-                    rel = link.get('rel')
-                    href = link.get('href')
-                    type = link.get('type')
-
-                    if rel and href and type:
-                        if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
-                            if href.startswith('data:image/png;base64,'):
-                                s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
-
-                yield s
+        for result in search(query, max_results, timeout):
+            yield result
diff --git a/src/calibre/gui2/store/web_control.py b/src/calibre/gui2/store/web_control.py
index c2f38f2bd4..49f7f5634d 100644
--- a/src/calibre/gui2/store/web_control.py
+++ b/src/calibre/gui2/store/web_control.py
@@ -12,11 +12,12 @@ from urlparse import urlparse
 from PyQt5.Qt import QNetworkCookieJar, QNetworkProxy, QUrl
 from PyQt5.QtWebKitWidgets import QWebView, QWebPage
 
-from calibre import USER_AGENT, get_proxies, get_download_filename
+from calibre import USER_AGENT, get_proxies
 from calibre.ebooks import BOOK_EXTENSIONS
 from calibre.gui2 import choose_save_file
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.utils.filenames import ascii_filename
+from calibre.web import get_download_filename
 
 class NPWebView(QWebView):
 
diff --git a/src/calibre/web/__init__.py b/src/calibre/web/__init__.py
index b14dc0ce28..761da488f5 100644
--- a/src/calibre/web/__init__.py
+++ b/src/calibre/web/__init__.py
@@ -5,3 +5,54 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 class Recipe(object):
     pass
 
+def get_download_filename_from_response(response):
+    '''
+    Return the filename from the Content-Disposition header of response,
+    or the last component of response.geturl() if the header gives none.
+    '''
+    from urllib2 import unquote as urllib2_unquote
+    filename = last_part_name = ''
+    try:
+        last_part_name = response.geturl().split('/')[-1]
+        disposition = response.info().get('Content-disposition', '')
+        for p in disposition.split(';'):
+            key, sep, val = p.partition('=')
+            if 'filename' in key and val.strip():
+                extended = key.strip().endswith('*')
+                # RFC 5987 value is charset'lang'text; keep only the text
+                val = val.split('\'')[-1] if extended else val
+                filename = urllib2_unquote(val.strip().strip('\'"'))
+                if extended:
+                    break  # filename* takes precedence over plain filename
+    except Exception:
+        import traceback
+        traceback.print_exc()
+    return filename or last_part_name
+
+
+def get_download_filename(url, cookie_file=None):
+    '''
+    Get a local filename for a URL using the content disposition header,
+    falling back to the last component of the URL.
+    :param cookie_file: Optional path to a Mozilla format cookie file to use
+    :return: The filename, or an empty string if opening the URL fails
+    '''
+    from calibre import browser
+    from contextlib import closing
+
+    filename = ''
+    br = browser()
+    if cookie_file:
+        from mechanize import MozillaCookieJar
+        cj = MozillaCookieJar()
+        cj.load(cookie_file)
+        br.set_cookiejar(cj)
+    try:
+        with closing(br.open(url)) as r:
+            filename = get_download_filename_from_response(r)
+    except Exception:
+        # Best effort, but do not trap KeyboardInterrupt/SystemExit
+        import traceback
+        traceback.print_exc()
+    return filename
+