Refactor various ebook download functions

The functions are now easily testable individually. An extra HTTP connection
is no longer made when downloading a file directly from a URL. A workaround is
added for Project Gutenberg refusing to serve ebook files to normal browser
user agents.
See [#1354735](https://bugs.launchpad.net/calibre/+bug/1354735) (When using "Get Books," all downloaded Project Gutenberg ePub books (and only Pr. Gut. books) are .2 MB, that is, they show up in my Calibre library, but they are empty. This wasn't so before as I have downloaded many Pr. Gut. ePub books in the past using “Get Books.” I have updated Calibre versions twice since this problem began. ePub downloads through other providers (eg., Feedbooks, Legimi, MobileRead) using “Get Books” functions correctly. Mac OS 10.7.5, Calibre 1.48.0.)
Kovid Goyal 2014-08-18 11:42:08 +05:30
parent 5b9e305d5d
commit b0f52e2a4d
5 changed files with 150 additions and 124 deletions
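
As a rough, hypothetical illustration of the testability claim in the commit message (this sketch is not part of the commit), the new calibre.web helper can be exercised in isolation with a stub response object. The FakeResponse class and the sample URL and header values below are invented for the example, and Python 2 is assumed, as in the helper itself.

    from calibre.web import get_download_filename_from_response

    class FakeResponse(object):
        # Minimal, hypothetical stand-in for the urllib2/mechanize response
        # object the helper expects: it only needs geturl() and info().
        def __init__(self, url, disposition):
            self._url, self._disposition = url, disposition

        def geturl(self):
            return self._url

        def info(self):
            return {'Content-disposition': self._disposition}

    # The filename comes from the Content-disposition header when present...
    print(get_download_filename_from_response(FakeResponse(
        'http://www.gutenberg.org/ebooks/1342', 'attachment; filename=pride-and-prejudice.epub')))
    # ...and falls back to the last component of the URL when it is absent.
    print(get_download_filename_from_response(FakeResponse(
        'http://example.com/books/some-book.epub', '')))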

View File

@@ -641,50 +641,6 @@ def url_slash_cleaner(url):
     '''
     return re.sub(r'(?<!:)/{2,}', '/', url)
 
-
-def get_download_filename(url, cookie_file=None):
-    '''
-    Get a local filename for a URL using the content disposition header
-    Returns empty string if no content disposition header present
-    '''
-    from contextlib import closing
-    from urllib2 import unquote as urllib2_unquote
-
-    filename = ''
-
-    br = browser()
-    if cookie_file:
-        from mechanize import MozillaCookieJar
-        cj = MozillaCookieJar()
-        cj.load(cookie_file)
-        br.set_cookiejar(cj)
-
-    last_part_name = ''
-    try:
-        with closing(br.open(url)) as r:
-            last_part_name = r.geturl().split('/')[-1]
-            disposition = r.info().get('Content-disposition', '')
-            for p in disposition.split(';'):
-                if 'filename' in p:
-                    if '*=' in disposition:
-                        parts = disposition.split('*=')[-1]
-                        filename = parts.split('\'')[-1]
-                    else:
-                        filename = disposition.split('=')[-1]
-                        if filename[0] in ('\'', '"'):
-                            filename = filename[1:]
-                        if filename[-1] in ('\'', '"'):
-                            filename = filename[:-1]
-                        filename = urllib2_unquote(filename)
-                    break
-    except:
-        import traceback
-        traceback.print_exc()
-
-    if not filename:
-        filename = last_part_name
-
-    return filename
-
-
 def human_readable(size, sep=' '):
     """ Convert a size in bytes into a human readable form """
     divisor, suffix = 1, "B"

View File

@@ -11,12 +11,43 @@ import shutil
 from contextlib import closing
 
 from mechanize import MozillaCookieJar
 
-from calibre import browser, get_download_filename
+from calibre import browser
+from calibre.constants import __appname__, __version__
 from calibre.ebooks import BOOK_EXTENSIONS
 from calibre.gui2 import Dispatcher
 from calibre.gui2.threaded_jobs import ThreadedJob
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.filenames import ascii_filename
+from calibre.web import get_download_filename_from_response
+
+
+def get_download_filename(response):
+    filename = get_download_filename_from_response(response)
+    filename, ext = os.path.splitext(filename)
+    filename = filename[:60] + ext
+    filename = ascii_filename(filename)
+    return filename
+
+
+def download_file(url, cookie_file=None, filename=None):
+    user_agent = None
+    if url.startswith('http://www.gutenberg.org'):
+        # Project Gutenberg returns an HTML page if the user agent is a normal
+        # browser user agent
+        user_agent = '%s/%s' % (__appname__, __version__)
+    br = browser(user_agent=user_agent)
+    if cookie_file:
+        cj = MozillaCookieJar()
+        cj.load(cookie_file)
+        br.set_cookiejar(cj)
+    with closing(br.open(url)) as r:
+        if not filename:
+            filename = get_download_filename(r)
+        temp_path = os.path.join(PersistentTemporaryDirectory(), filename)
+        with open(temp_path, 'w+b') as tf:
+            shutil.copyfileobj(r, tf)
+            dfilename = tf.name
+    return dfilename
+
 
 class EbookDownload(object):
@@ -36,32 +67,12 @@ class EbookDownload(object):
         pass
 
     def _download(self, cookie_file, url, filename, save_loc, add_to_lib):
-        dfilename = ''
         if not url:
             raise Exception(_('No file specified to download.'))
         if not save_loc and not add_to_lib:
             # Nothing to do.
-            return dfilename
-
-        if not filename:
-            filename = get_download_filename(url, cookie_file)
-            filename, ext = os.path.splitext(filename)
-            filename = filename[:60] + ext
-            filename = ascii_filename(filename)
-
-        br = browser()
-        if cookie_file:
-            cj = MozillaCookieJar()
-            cj.load(cookie_file)
-            br.set_cookiejar(cj)
-
-        with closing(br.open(url)) as r:
-            temp_path = os.path.join(PersistentTemporaryDirectory(), filename)
-            tf = open(temp_path, 'w+b')
-            tf.write(r.read())
-            dfilename = tf.name
-
-        return dfilename
+            return ''
+        return download_file(url, cookie_file, filename)
 
     def _add(self, filename, gui, add_to_lib, tags):
         if not add_to_lib or not filename:
@@ -90,7 +101,9 @@ gui_ebook_download = EbookDownload()
 
 def start_ebook_download(callback, job_manager, gui, cookie_file=None, url='', filename='', save_loc='', add_to_lib=True, tags=[]):
     description = _('Downloading %s') % filename.decode('utf-8', 'ignore') if filename else url.decode('utf-8', 'ignore')
-    job = ThreadedJob('ebook_download', description, gui_ebook_download, (gui, cookie_file, url, filename, save_loc, add_to_lib, tags), {}, callback, max_concurrent_count=2, killable=False)
+    job = ThreadedJob('ebook_download', description, gui_ebook_download, (
+        gui, cookie_file, url, filename, save_loc, add_to_lib, tags), {},
+        callback, max_concurrent_count=2, killable=False)
     job_manager.run_threaded_job(job)
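
A hypothetical usage sketch (not part of the commit) for the download_file helper added above; the Project Gutenberg URL is purely illustrative.

    from calibre.gui2.ebook_download import download_file

    # For a www.gutenberg.org URL, download_file sends a 'calibre/<version>'
    # user agent, since Project Gutenberg serves an HTML page instead of the
    # ebook file to ordinary browser user agents.
    path = download_file('http://www.gutenberg.org/ebooks/1342.epub')
    print(path)  # path of the file saved in a persistent temporary directory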

View File

@@ -21,34 +21,9 @@ from calibre.gui2.store.basic_config import BasicStoreConfig
 from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
 from calibre.gui2.store.search_result import SearchResult
 
-class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
-
-    open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
-    web_url = 'http://m.gutenberg.org/'
-
-    def search(self, query, max_results=10, timeout=60):
-        '''
-        Gutenberg's ODPS feed is poorly implmented and has a number of issues
-        which require very special handling to fix the results.
-
-        Issues:
-            * "Sort Alphabetically" and "Sort by Release Date" are returned
-              as book entries.
-            * The author is put into a "content" tag and not the author tag.
-            * The link to the book itself goes to an odps page which we need
-              to turn into a link to a web page.
-            * acquisition links are not part of the search result so we have
-              to go to the odps item itself. Detail item pages have a nasty
-              note saying:
-                  DON'T USE THIS PAGE FOR SCRAPING.
-                  Seriously. You'll only get your IP blocked.
-              We're using the ODPS feed because people are getting blocked with
-              the previous implementation so due to this using ODPS probably
-              won't solve this issue.
-            * Images are not links but base64 encoded strings. They are also not
-              real cover images but a little blue book thumbnail.
-        '''
+web_url = 'http://m.gutenberg.org/'
+
+def search(query, max_results=10, timeout=60):
         url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)
 
         counter = max_results
@@ -66,7 +41,7 @@ class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
                # We could use the <link rel="alternate" type="text/html" ...> tag from the
                # detail odps page but this is easier.
                id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
-                s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (self.web_url, re.sub('[^\d]', '', id)))
+                s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub('[^\d]', '', id)))
                if not s.detail_item:
                    continue
@@ -102,3 +77,33 @@ class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
                        s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
 
                yield s
+
+class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
+
+    open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
+    web_url = web_url
+
+    def search(self, query, max_results=10, timeout=60):
+        '''
+        Gutenberg's ODPS feed is poorly implmented and has a number of issues
+        which require very special handling to fix the results.
+
+        Issues:
+            * "Sort Alphabetically" and "Sort by Release Date" are returned
+              as book entries.
+            * The author is put into a "content" tag and not the author tag.
+            * The link to the book itself goes to an odps page which we need
+              to turn into a link to a web page.
+            * acquisition links are not part of the search result so we have
+              to go to the odps item itself. Detail item pages have a nasty
+              note saying:
+                  DON'T USE THIS PAGE FOR SCRAPING.
+                  Seriously. You'll only get your IP blocked.
+              We're using the ODPS feed because people are getting blocked with
+              the previous implementation so due to this using ODPS probably
+              won't solve this issue.
+            * Images are not links but base64 encoded strings. They are also not
+              real cover images but a little blue book thumbnail.
+        '''
+        for result in search(query, max_results, timeout):
+            yield result

View File

@@ -12,11 +12,12 @@ from urlparse import urlparse
 from PyQt5.Qt import QNetworkCookieJar, QNetworkProxy, QUrl
 from PyQt5.QtWebKitWidgets import QWebView, QWebPage
 
-from calibre import USER_AGENT, get_proxies, get_download_filename
+from calibre import USER_AGENT, get_proxies
 from calibre.ebooks import BOOK_EXTENSIONS
 from calibre.gui2 import choose_save_file
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.utils.filenames import ascii_filename
+from calibre.web import get_download_filename
 
 
 class NPWebView(QWebView):

View File

@@ -5,3 +5,54 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 
 class Recipe(object):
     pass
+
+
+def get_download_filename_from_response(response):
+    from urllib2 import unquote as urllib2_unquote
+    filename = last_part_name = ''
+    try:
+        last_part_name = response.geturl().split('/')[-1]
+        disposition = response.info().get('Content-disposition', '')
+        for p in disposition.split(';'):
+            if 'filename' in p:
+                if '*=' in disposition:
+                    parts = disposition.split('*=')[-1]
+                    filename = parts.split('\'')[-1]
+                else:
+                    filename = disposition.split('=')[-1]
+                    if filename[0] in ('\'', '"'):
+                        filename = filename[1:]
+                    if filename[-1] in ('\'', '"'):
+                        filename = filename[:-1]
+                    filename = urllib2_unquote(filename)
+                break
+    except Exception:
+        import traceback
+        traceback.print_exc()
+    return filename or last_part_name
+
+
+def get_download_filename(url, cookie_file=None):
+    '''
+    Get a local filename for a URL using the content disposition header
+    Returns empty string if an error occurs.
+    '''
+    from calibre import browser
+    from contextlib import closing
+
+    filename = ''
+
+    br = browser()
+    if cookie_file:
+        from mechanize import MozillaCookieJar
+        cj = MozillaCookieJar()
+        cj.load(cookie_file)
+        br.set_cookiejar(cj)
+
+    try:
+        with closing(br.open(url)) as r:
+            filename = get_download_filename_from_response(r)
+    except:
+        import traceback
+        traceback.print_exc()
+    return filename