From b0f52e2a4dcf3d46eda32de3952d892fdf0ea0fa Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 18 Aug 2014 11:42:08 +0530
Subject: [PATCH] Refactor various ebook download functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The functions are now easily testable individually. An extra HTTP
connection when downloading a file from a URL directly is avoided. A
workaround for Project Gutenberg refusing to serve ebook files to
browsers is added.

See #1354735 (When using "Get Books," all downloaded Project Gutenberg ePub books (and only Pr. Gut. books) are .2 MB, that is, they show up in my Calibre library, but they are empty. This wasn't so before as I have downloaded many Pr. Gut. ePub books in the past using “Get Books.” I have updated Calibre versions twice since this problem began. ePub downloads through other providers (eg., Feedbooks, Legimi, MobileRead) using “Get Books” functions correctly. Mac OS 10.7.5, Calibre 1.48.0.) [When using "Get Books," all downloaded Project Gutenberg ePub books (and only Pr. Gut. books) are .2 MB, that is, they show up in my Calibre library, but they are empty. This wasn't so before as I have downloaded many Pr. Gut. ePub books in the past using “Get Books.” I have updated Calibre versions twice since this problem began. ePub downloads through other providers (eg., Feedbooks, Legimi, MobileRead) using “Get Books” functions correctly. 
Mac OS 10.7.5, Calibre 1.48.0.](https://bugs.launchpad.net/calibre/+bug/1354735) --- src/calibre/__init__.py | 44 ------- src/calibre/gui2/ebook_download.py | 61 ++++++---- .../gui2/store/stores/gutenberg_plugin.py | 115 +++++++++--------- src/calibre/gui2/store/web_control.py | 3 +- src/calibre/web/__init__.py | 51 ++++++++ 5 files changed, 150 insertions(+), 124 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 9d599c9cf0..81c8027bb4 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -641,50 +641,6 @@ def url_slash_cleaner(url): ''' return re.sub(r'(? tag from the + # detail odps page but this is easier. + id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip() + s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub('[^\d]', '', id))) + if not s.detail_item: + continue + + s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip() + s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip() + if not s.title or not s.author: + continue + + # Get the formats and direct download links. 
+ with closing(br.open(id, timeout=timeout/4)) as nf: + ndoc = etree.fromstring(nf.read()) + for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'): + type = link.get('type') + href = link.get('href') + if type: + ext = mimetypes.guess_extension(type) + if ext: + ext = ext[1:].upper().strip() + s.downloads[ext] = href + + s.formats = ', '.join(s.downloads.keys()) + if not s.formats: + continue + + for link in data.xpath('./*[local-name() = "link"]'): + rel = link.get('rel') + href = link.get('href') + type = link.get('type') + + if rel and href and type: + if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'): + if href.startswith('data:image/png;base64,'): + s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', '')) + + yield s + class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore): open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml' - web_url = 'http://m.gutenberg.org/' + web_url = web_url def search(self, query, max_results=10, timeout=60): ''' @@ -48,57 +105,5 @@ class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore): * Images are not links but base64 encoded strings. They are also not real cover images but a little blue book thumbnail. ''' - - url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query) - - counter = max_results - br = browser(user_agent='calibre/'+__version__) - with closing(br.open(url, timeout=timeout)) as f: - doc = etree.fromstring(f.read()) - for data in doc.xpath('//*[local-name() = "entry"]'): - if counter <= 0: - break - - counter -= 1 - - s = SearchResult() - - # We could use the tag from the - # detail odps page but this is easier. 
- id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip() - s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (self.web_url, re.sub('[^\d]', '', id))) - if not s.detail_item: - continue - - s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip() - s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip() - if not s.title or not s.author: - continue - - # Get the formats and direct download links. - with closing(br.open(id, timeout=timeout/4)) as nf: - ndoc = etree.fromstring(nf.read()) - for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'): - type = link.get('type') - href = link.get('href') - if type: - ext = mimetypes.guess_extension(type) - if ext: - ext = ext[1:].upper().strip() - s.downloads[ext] = href - - s.formats = ', '.join(s.downloads.keys()) - if not s.formats: - continue - - for link in data.xpath('./*[local-name() = "link"]'): - rel = link.get('rel') - href = link.get('href') - type = link.get('type') - - if rel and href and type: - if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'): - if href.startswith('data:image/png;base64,'): - s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', '')) - - yield s + for result in search(query, max_results, timeout): + yield result diff --git a/src/calibre/gui2/store/web_control.py b/src/calibre/gui2/store/web_control.py index c2f38f2bd4..49f7f5634d 100644 --- a/src/calibre/gui2/store/web_control.py +++ b/src/calibre/gui2/store/web_control.py @@ -12,11 +12,12 @@ from urlparse import urlparse from PyQt5.Qt import QNetworkCookieJar, QNetworkProxy, QUrl from PyQt5.QtWebKitWidgets import QWebView, QWebPage -from calibre import USER_AGENT, get_proxies, get_download_filename +from calibre import USER_AGENT, get_proxies from calibre.ebooks import BOOK_EXTENSIONS from calibre.gui2 import choose_save_file from calibre.ptempfile import PersistentTemporaryFile from 
calibre.utils.filenames import ascii_filename +from calibre.web import get_download_filename class NPWebView(QWebView): diff --git a/src/calibre/web/__init__.py b/src/calibre/web/__init__.py index b14dc0ce28..761da488f5 100644 --- a/src/calibre/web/__init__.py +++ b/src/calibre/web/__init__.py @@ -5,3 +5,54 @@ __copyright__ = '2008, Kovid Goyal ' class Recipe(object): pass +def get_download_filename_from_response(response): + from urllib2 import unquote as urllib2_unquote + filename = last_part_name = '' + try: + last_part_name = response.geturl().split('/')[-1] + disposition = response.info().get('Content-disposition', '') + for p in disposition.split(';'): + if 'filename' in p: + if '*=' in disposition: + parts = disposition.split('*=')[-1] + filename = parts.split('\'')[-1] + else: + filename = disposition.split('=')[-1] + if filename[0] in ('\'', '"'): + filename = filename[1:] + if filename[-1] in ('\'', '"'): + filename = filename[:-1] + filename = urllib2_unquote(filename) + break + except Exception: + import traceback + traceback.print_exc() + return filename or last_part_name + + +def get_download_filename(url, cookie_file=None): + ''' + Get a local filename for a URL using the content disposition header + Returns empty string if an error occurs. + ''' + from calibre import browser + from contextlib import closing + + filename = '' + + br = browser() + if cookie_file: + from mechanize import MozillaCookieJar + cj = MozillaCookieJar() + cj.load(cookie_file) + br.set_cookiejar(cj) + + try: + with closing(br.open(url)) as r: + filename = get_download_filename_from_response(r) + except: + import traceback + traceback.print_exc() + + return filename +