mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactor various ebook download functions
The functions are now easily testable individually. An extra HTTP connection is avoided when downloading a file directly from a URL. A workaround is added for Project Gutenberg refusing to serve ebook files to browsers. See [#1354735](https://bugs.launchpad.net/calibre/+bug/1354735): When using "Get Books," all downloaded Project Gutenberg ePub books (and only Pr. Gut. books) are .2 MB, that is, they show up in my Calibre library, but they are empty. This wasn't so before as I have downloaded many Pr. Gut. ePub books in the past using “Get Books.” I have updated Calibre versions twice since this problem began. ePub downloads through other providers (eg., Feedbooks, Legimi, MobileRead) using “Get Books” functions correctly. Mac OS 10.7.5, Calibre 1.48.0.
This commit is contained in:
parent
5b9e305d5d
commit
b0f52e2a4d
@ -641,50 +641,6 @@ def url_slash_cleaner(url):
|
|||||||
'''
|
'''
|
||||||
return re.sub(r'(?<!:)/{2,}', '/', url)
|
return re.sub(r'(?<!:)/{2,}', '/', url)
|
||||||
|
|
||||||
def get_download_filename(url, cookie_file=None):
    '''
    Get a local filename for a URL using the content disposition header.

    The URL is opened (with the cookies from cookie_file, if given) and the
    ``filename*`` parameter, or failing that the ``filename`` parameter, of
    the Content-Disposition header is used. If the header yields no
    filename, the last ``/``-separated component of the URL reported by the
    response is returned instead.

    Returns empty string if the request fails. Errors raised while loading
    cookie_file are not caught.
    '''
    import re
    from contextlib import closing
    try:
        from urllib2 import unquote as urllib2_unquote
    except ImportError:
        from urllib.parse import unquote as urllib2_unquote

    filename = ''

    br = browser()
    if cookie_file:
        from mechanize import MozillaCookieJar
        cj = MozillaCookieJar()
        cj.load(cookie_file)
        br.set_cookiejar(cj)

    # One match per ";name=value" parameter. Quoted values may contain ';'.
    param_pat = re.compile(
        r';\s*([^=;\s]+)\s*=\s*'
        r'("(?:[^"\\]|\\.)*"'
        r"|'[^']*'"
        r'|[^;]*)')

    last_part_name = ''
    try:
        with closing(br.open(url)) as r:
            last_part_name = r.geturl().split('/')[-1]
            disposition = r.info().get('Content-disposition', '')
        plain = extended = ''
        # Look at each parameter on its own so that parameters following
        # filename (or a filename* that precedes filename) cannot leak
        # into the result.
        for name, value in param_pat.findall(';' + disposition):
            value = value.strip()
            if value[:1] in ('\'', '"'):
                value = value[1:]
            if value[-1:] in ('\'', '"'):
                value = value[:-1]
            if not value:
                continue
            name = name.lower()
            if name == 'filename*':
                # Extended form is charset'language'value; keep the value
                extended = value.split('\'', 2)[-1]
            elif name == 'filename':
                plain = value
        filename = urllib2_unquote(extended or plain)
    except Exception:
        # Best effort: report the problem and fall back to the URL
        import traceback
        traceback.print_exc()

    if not filename:
        filename = last_part_name

    return filename
|
|
||||||
|
|
||||||
def human_readable(size, sep=' '):
|
def human_readable(size, sep=' '):
|
||||||
""" Convert a size in bytes into a human readable form """
|
""" Convert a size in bytes into a human readable form """
|
||||||
divisor, suffix = 1, "B"
|
divisor, suffix = 1, "B"
|
||||||
|
@ -11,12 +11,43 @@ import shutil
|
|||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
from mechanize import MozillaCookieJar
|
from mechanize import MozillaCookieJar
|
||||||
|
|
||||||
from calibre import browser, get_download_filename
|
from calibre import browser
|
||||||
|
from calibre.constants import __appname__, __version__
|
||||||
from calibre.ebooks import BOOK_EXTENSIONS
|
from calibre.ebooks import BOOK_EXTENSIONS
|
||||||
from calibre.gui2 import Dispatcher
|
from calibre.gui2 import Dispatcher
|
||||||
from calibre.gui2.threaded_jobs import ThreadedJob
|
from calibre.gui2.threaded_jobs import ThreadedJob
|
||||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||||
from calibre.utils.filenames import ascii_filename
|
from calibre.utils.filenames import ascii_filename
|
||||||
|
from calibre.web import get_download_filename_from_response
|
||||||
|
|
||||||
|
def get_download_filename(response):
    '''
    Return the filename to use on disk for an opened download response.

    The name reported by get_download_filename_from_response() has its
    stem cut to at most 60 characters (the extension is kept whole) and is
    then passed through ascii_filename().
    '''
    raw_name = get_download_filename_from_response(response)
    stem, suffix = os.path.splitext(raw_name)
    return ascii_filename(stem[:60] + suffix)
|
||||||
|
|
||||||
|
def download_file(url, cookie_file=None, filename=None):
    '''
    Download url into a new persistent temporary directory.

    :param url: The URL to fetch.
    :param cookie_file: Optional path to a Mozilla format cookie file that
                        is loaded into the browser before the request.
    :param filename: Name for the downloaded file. When empty, the name is
                     derived from the response via get_download_filename().
    :return: Path of the file that was written.
    '''
    # Project Gutenberg returns an HTML page if the user agent is a normal
    # browser user agent, so identify as the application itself there.
    from_gutenberg = url.startswith('http://www.gutenberg.org')
    user_agent = '%s/%s' % (__appname__, __version__) if from_gutenberg else None

    br = browser(user_agent=user_agent)
    if cookie_file:
        jar = MozillaCookieJar()
        jar.load(cookie_file)
        br.set_cookiejar(jar)

    with closing(br.open(url)) as response:
        name = filename or get_download_filename(response)
        destination = os.path.join(PersistentTemporaryDirectory(), name)
        with open(destination, 'w+b') as out:
            # Stream the body to disk instead of reading it all into memory
            shutil.copyfileobj(response, out)
            return out.name
|
||||||
|
|
||||||
|
|
||||||
class EbookDownload(object):
|
class EbookDownload(object):
|
||||||
|
|
||||||
@ -36,32 +67,12 @@ class EbookDownload(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def _download(self, cookie_file, url, filename, save_loc, add_to_lib):
    '''
    Download url and return the path of the downloaded file.

    Raises an Exception when url is empty. Returns an empty string without
    downloading anything when the file is neither to be saved (save_loc is
    empty) nor to be added to the library (add_to_lib is false). The actual
    transfer is delegated to download_file().
    '''
    if not url:
        raise Exception(_('No file specified to download.'))
    if not save_loc and not add_to_lib:
        # Nothing to do.
        return ''
    return download_file(url, cookie_file, filename)
|
|
||||||
|
|
||||||
def _add(self, filename, gui, add_to_lib, tags):
|
def _add(self, filename, gui, add_to_lib, tags):
|
||||||
if not add_to_lib or not filename:
|
if not add_to_lib or not filename:
|
||||||
@ -90,7 +101,9 @@ gui_ebook_download = EbookDownload()
|
|||||||
|
|
||||||
def start_ebook_download(callback, job_manager, gui, cookie_file=None, url='', filename='', save_loc='', add_to_lib=True, tags=[]):
    '''
    Create an 'ebook_download' ThreadedJob and hand it to job_manager.

    The job runs gui_ebook_download with
    (gui, cookie_file, url, filename, save_loc, add_to_lib, tags) and is
    given callback. Its description is "Downloading <name>", where <name>
    is filename when one is supplied and url otherwise.

    tags is only forwarded to the job, never modified here.
    '''
    # Choose the name first and format afterwards. Written as
    # "fmt % a if filename else b", the conditional binds more loosely than
    # %, so the URL case used to lose the "Downloading" prefix.
    description = _('Downloading %s') % (filename or url).decode('utf-8', 'ignore')
    job = ThreadedJob('ebook_download', description, gui_ebook_download, (
        gui, cookie_file, url, filename, save_loc, add_to_lib, tags), {},
        callback, max_concurrent_count=2, killable=False)
    job_manager.run_threaded_job(job)
|
||||||
|
|
||||||
|
|
||||||
|
@ -21,34 +21,9 @@ from calibre.gui2.store.basic_config import BasicStoreConfig
|
|||||||
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
|
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
|
||||||
from calibre.gui2.store.search_result import SearchResult
|
from calibre.gui2.store.search_result import SearchResult
|
||||||
|
|
||||||
class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
|
|
||||||
|
|
||||||
open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
|
|
||||||
web_url = 'http://m.gutenberg.org/'
|
web_url = 'http://m.gutenberg.org/'
|
||||||
|
|
||||||
def search(self, query, max_results=10, timeout=60):
|
def search(query, max_results=10, timeout=60):
|
||||||
'''
|
|
||||||
Gutenberg's ODPS feed is poorly implmented and has a number of issues
|
|
||||||
which require very special handling to fix the results.
|
|
||||||
|
|
||||||
Issues:
|
|
||||||
* "Sort Alphabetically" and "Sort by Release Date" are returned
|
|
||||||
as book entries.
|
|
||||||
* The author is put into a "content" tag and not the author tag.
|
|
||||||
* The link to the book itself goes to an odps page which we need
|
|
||||||
to turn into a link to a web page.
|
|
||||||
* acquisition links are not part of the search result so we have
|
|
||||||
to go to the odps item itself. Detail item pages have a nasty
|
|
||||||
note saying:
|
|
||||||
DON'T USE THIS PAGE FOR SCRAPING.
|
|
||||||
Seriously. You'll only get your IP blocked.
|
|
||||||
We're using the ODPS feed because people are getting blocked with
|
|
||||||
the previous implementation so due to this using ODPS probably
|
|
||||||
won't solve this issue.
|
|
||||||
* Images are not links but base64 encoded strings. They are also not
|
|
||||||
real cover images but a little blue book thumbnail.
|
|
||||||
'''
|
|
||||||
|
|
||||||
url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)
|
url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)
|
||||||
|
|
||||||
counter = max_results
|
counter = max_results
|
||||||
@ -66,7 +41,7 @@ class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
|
|||||||
# We could use the <link rel="alternate" type="text/html" ...> tag from the
|
# We could use the <link rel="alternate" type="text/html" ...> tag from the
|
||||||
# detail odps page but this is easier.
|
# detail odps page but this is easier.
|
||||||
id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
|
id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
|
||||||
s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (self.web_url, re.sub('[^\d]', '', id)))
|
s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub('[^\d]', '', id)))
|
||||||
if not s.detail_item:
|
if not s.detail_item:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -102,3 +77,33 @@ class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
|
|||||||
s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
|
s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
|
||||||
|
|
||||||
yield s
|
yield s
|
||||||
|
|
||||||
|
class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):

    open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
    # Re-export the module level web_url as a class attribute
    web_url = web_url

    def search(self, query, max_results=10, timeout=60):
        '''
        Yield the results of the module level search() function.

        Gutenberg's OPDS feed is poorly implemented and has a number of
        issues which require very special handling to fix the results.

        Issues:
          * "Sort Alphabetically" and "Sort by Release Date" are returned
            as book entries.
          * The author is put into a "content" tag and not the author tag.
          * The link to the book itself goes to an OPDS page which we need
            to turn into a link to a web page.
          * Acquisition links are not part of the search result so we have
            to go to the OPDS item itself. Detail item pages have a nasty
            note saying:
              DON'T USE THIS PAGE FOR SCRAPING.
              Seriously. You'll only get your IP blocked.
            We're using the OPDS feed because people are getting blocked
            with the previous implementation, so using OPDS probably
            won't solve this issue.
          * Images are not links but base64 encoded strings. They are also
            not real cover images but a little blue book thumbnail.
        '''
        for result in search(query, max_results, timeout):
            yield result
|
||||||
|
@ -12,11 +12,12 @@ from urlparse import urlparse
|
|||||||
from PyQt5.Qt import QNetworkCookieJar, QNetworkProxy, QUrl
|
from PyQt5.Qt import QNetworkCookieJar, QNetworkProxy, QUrl
|
||||||
from PyQt5.QtWebKitWidgets import QWebView, QWebPage
|
from PyQt5.QtWebKitWidgets import QWebView, QWebPage
|
||||||
|
|
||||||
from calibre import USER_AGENT, get_proxies, get_download_filename
|
from calibre import USER_AGENT, get_proxies
|
||||||
from calibre.ebooks import BOOK_EXTENSIONS
|
from calibre.ebooks import BOOK_EXTENSIONS
|
||||||
from calibre.gui2 import choose_save_file
|
from calibre.gui2 import choose_save_file
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
from calibre.utils.filenames import ascii_filename
|
from calibre.utils.filenames import ascii_filename
|
||||||
|
from calibre.web import get_download_filename
|
||||||
|
|
||||||
class NPWebView(QWebView):
|
class NPWebView(QWebView):
|
||||||
|
|
||||||
|
@ -5,3 +5,54 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
class Recipe(object):
|
class Recipe(object):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def get_download_filename_from_response(response):
    '''
    Get a local filename for an already opened response.

    The ``filename*`` parameter, or failing that the ``filename``
    parameter, of the Content-Disposition header is used. Surrounding
    quotes are removed and percent escapes are decoded. If the header is
    absent, carries no filename or cannot be processed, the last
    ``/``-separated component of ``response.geturl()`` is returned.

    :param response: Object providing ``geturl()`` and ``info()``, where
                     ``info()`` returns a mapping of the response headers.
    :return: The filename, possibly an empty string.
    '''
    import re
    try:
        from urllib2 import unquote as urllib2_unquote
    except ImportError:
        from urllib.parse import unquote as urllib2_unquote

    # One match per ";name=value" parameter. Quoted values may contain ';'.
    param_pat = re.compile(
        r';\s*([^=;\s]+)\s*=\s*'
        r'("(?:[^"\\]|\\.)*"'
        r"|'[^']*'"
        r'|[^;]*)')

    filename = last_part_name = ''
    try:
        last_part_name = response.geturl().split('/')[-1]
        disposition = response.info().get('Content-disposition', '')
        plain = extended = ''
        # Look at each parameter on its own so that parameters following
        # filename (or a filename* that precedes filename) cannot leak
        # into the result.
        for name, value in param_pat.findall(';' + disposition):
            value = value.strip()
            # Slices instead of indexing: an empty value must not raise
            if value[:1] in ('\'', '"'):
                value = value[1:]
            if value[-1:] in ('\'', '"'):
                value = value[:-1]
            if not value:
                continue
            name = name.lower()
            if name == 'filename*':
                # Extended form is charset'language'value; keep the value
                extended = value.split('\'', 2)[-1]
            elif name == 'filename':
                plain = value
        filename = urllib2_unquote(extended or plain)
    except Exception:
        # Best effort: report the problem and fall back to the URL
        import traceback
        traceback.print_exc()
    return filename or last_part_name
|
||||||
|
|
||||||
|
|
||||||
|
def get_download_filename(url, cookie_file=None):
    '''
    Get a local filename for a URL using the content disposition header.

    The URL is opened (with the cookies from cookie_file, if given) and the
    response is passed to get_download_filename_from_response().

    Returns empty string if opening the URL or processing the response
    fails. Errors raised while loading cookie_file are not caught.
    '''
    from calibre import browser
    from contextlib import closing

    filename = ''

    br = browser()
    if cookie_file:
        from mechanize import MozillaCookieJar
        cj = MozillaCookieJar()
        cj.load(cookie_file)
        br.set_cookiejar(cj)

    try:
        with closing(br.open(url)) as r:
            filename = get_download_filename_from_response(r)
    except Exception:
        # Not a bare except: KeyboardInterrupt and SystemExit must still
        # propagate. Everything else is reported and '' is returned.
        import traceback
        traceback.print_exc()

    return filename
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user