Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)
News download: Workaround lack of thread safety in python mechanize. Fixes #7321 (Cancelling news downloads causes loss of internet connectivity)
This commit is contained in:
parent 5c1a40534b
commit d3886b3910
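In outline: mechanize browser objects are not thread safe, so rather than sharing one browser across simultaneous download threads, each thread now gets its own clone of the recipe's browser. A minimal sketch of the pattern (Python 2; fetch_worker and fetch_all are hypothetical names, clone_browser is the new API added by this commit):

    from threading import Thread

    def fetch_worker(br, url):
        # Each thread owns its browser clone outright; nothing mechanize
        # touches is shared between threads.
        print url, len(br.open(url).read())

    def fetch_all(master, urls):
        # One clone per thread. The only shared object is the CookieJar,
        # and cookielib's CookieJar does its own locking.
        threads = [Thread(target=fetch_worker, args=(master.clone_browser(), url))
                for url in urls]
        for t in threads:
            t.start()
        for t in threads:
            t.join()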
@@ -60,7 +60,6 @@ class NYTimes(BasicNewsRecipe):
     timefmt = ''
     needs_subscription = True
     masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
     cover_margins = (18,18,'grey99')

@@ -21,8 +21,6 @@ from calibre.constants import iswindows, isosx, islinux, isfreebsd, isfrozen, \
                              filesystem_encoding, plugins, config_dir
 from calibre.startup import winutil, winutilerror

-import mechanize
-
 uuid.uuid4() # Imported before PyQt4 to workaround PyQt4 util-linux conflict on gentoo

 if False:
@@ -269,7 +267,8 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
     :param honor_time: If True honors pause time in refresh requests
     :param max_time: Maximum time in seconds to wait during a refresh request
     '''
-    opener = mechanize.Browser()
+    from calibre.utils.browser import Browser
+    opener = Browser()
     opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
     opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
src/calibre/utils/browser.py (new file, 96 lines)
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import copy
+from cookielib import CookieJar
+
+from mechanize import Browser as B
+
+class Browser(B):
+    'A cloneable mechanize browser'
+
+    def __init__(self):
+        self._clone_actions = {}
+
+        B.__init__(self)
+        self.set_cookiejar(CookieJar())
+
+    def set_handle_refresh(self, *args, **kwargs):
+        B.set_handle_refresh(self, *args, **kwargs)
+        self._clone_actions['set_handle_refresh'] = ('set_handle_refresh',
+                args, kwargs)
+
+    def set_cookiejar(self, *args, **kwargs):
+        B.set_cookiejar(self, *args, **kwargs)
+        self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
+
+    def set_handle_redirect(self, *args, **kwargs):
+        B.set_handle_redirect(self, *args, **kwargs)
+        self._clone_actions['set_handle_redirect'] = ('set_handle_redirect',
+                args, kwargs)
+
+    def set_handle_equiv(self, *args, **kwargs):
+        B.set_handle_equiv(self, *args, **kwargs)
+        self._clone_actions['set_handle_equiv'] = ('set_handle_equiv',
+                args, kwargs)
+
+    def set_handle_gzip(self, *args, **kwargs):
+        B.set_handle_gzip(self, *args, **kwargs)
+        self._clone_actions['set_handle_gzip'] = ('set_handle_gzip',
+                args, kwargs)
+
+    def set_debug_redirect(self, *args, **kwargs):
+        B.set_debug_redirect(self, *args, **kwargs)
+        self._clone_actions['set_debug_redirect'] = ('set_debug_redirect',
+                args, kwargs)
+
+    def set_debug_responses(self, *args, **kwargs):
+        B.set_debug_responses(self, *args, **kwargs)
+        self._clone_actions['set_debug_responses'] = ('set_debug_responses',
+                args, kwargs)
+
+    def set_debug_http(self, *args, **kwargs):
+        B.set_debug_http(self, *args, **kwargs)
+        self._clone_actions['set_debug_http'] = ('set_debug_http',
+                args, kwargs)
+
+    def set_handle_robots(self, *args, **kwargs):
+        B.set_handle_robots(self, *args, **kwargs)
+        self._clone_actions['set_handle_robots'] = ('set_handle_robots',
+                args, kwargs)
+
+    def set_proxies(self, *args, **kwargs):
+        B.set_proxies(self, *args, **kwargs)
+        self._clone_actions['set_proxies'] = ('set_proxies', args, kwargs)
+
+    def add_password(self, *args, **kwargs):
+        B.add_password(self, *args, **kwargs)
+        self._clone_actions['add_password'] = ('add_password', args, kwargs)
+
+    def add_proxy_password(self, *args, **kwargs):
+        B.add_proxy_password(self, *args, **kwargs)
+        self._clone_actions['add_proxy_password'] = ('add_proxy_password', args, kwargs)
+
+    def clone_browser(self):
+        clone = Browser()
+        clone.addheaders = copy.deepcopy(self.addheaders)
+        for func, args, kwargs in self._clone_actions.values():
+            func = getattr(clone, func)
+            func(*args, **kwargs)
+        return clone
+
+if __name__ == '__main__':
+    from calibre import browser
+    from pprint import pprint
+    orig = browser()
+    clone = orig.clone_browser()
+    pprint(orig._ua_handlers)
+    pprint(clone._ua_handlers)
+    assert orig._ua_handlers.keys() == clone._ua_handlers.keys()
+    assert orig._ua_handlers['_cookies'].cookiejar is \
+            clone._ua_handlers['_cookies'].cookiejar
+
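The cloning here is record-and-replay: each overridden setter stores its (method name, args, kwargs) in _clone_actions, and clone_browser() replays those calls on a fresh Browser, plus a deep copy of addheaders. Handlers are therefore rebuilt per clone rather than shared between threads; the one object deliberately shared across clones is the CookieJar (cookielib guards cookie operations with its own lock), which is what the __main__ self-test asserts and what lets a login performed on the master browser carry over to its clones.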
@@ -82,9 +82,6 @@ class BasicNewsRecipe(Recipe):
     #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
     simultaneous_downloads = 5

-    #: If False the remote server is contacted by only one thread at a time
-    multithreaded_fetch = False
-
     #: Timeout for fetching files from server in seconds
     timeout = 120.0

@@ -402,6 +399,23 @@ class BasicNewsRecipe(Recipe):
         '''
         return browser(*args, **kwargs)

+    def clone_browser(self, br):
+        '''
+        Clone the browser br. Cloned browsers are used for multi-threaded
+        downloads, since mechanize is not thread safe. The default cloning
+        routines should capture most browser customization, but if you do
+        something exotic in your recipe, you should override this method in
+        your recipe and clone manually.
+
+        Cloned browser instances use the same, thread-safe CookieJar by
+        default, unless you have customized cookie handling.
+        '''
+        if callable(getattr(br, 'clone_browser', None)):
+            return br.clone_browser()
+
+        # Uh-oh recipe using something exotic, call get_browser
+        return self.get_browser()
+
     def get_article_url(self, article):
         '''
         Override in a subclass to customize extraction of the :term:`URL` that points
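A hedged sketch of the "exotic" case that docstring warns about (MyRecipe and its log_in helper are hypothetical): the default cloning only replays recorded setter calls and copies addheaders, so any extra state a recipe stashes on the browser must be carried over manually.

    class MyRecipe(BasicNewsRecipe):

        def get_browser(self):
            br = BasicNewsRecipe.get_browser(self)
            br.my_token = self.log_in(br)  # hypothetical custom state
            return br

        def clone_browser(self, br):
            clone = BasicNewsRecipe.clone_browser(self, br)
            clone.my_token = br.my_token  # re-attach what the default clone misses
            return clone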
@@ -798,17 +812,22 @@ class BasicNewsRecipe(Recipe):
                     extra_css=css).render(doctype='xhtml')


-    def _fetch_article(self, url, dir, f, a, num_of_feeds):
-        self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
+    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
+        br = self.browser
+        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
+            # We are using the default get_browser, which means no need to
+            # clone
+            br = BasicNewsRecipe.get_browser(self)
+        else:
+            br = self.clone_browser(self.browser)
+        self.web2disk_options.browser = br
         fetcher = RecursiveFetcher(self.web2disk_options, self.log,
                 self.image_map, self.css_map,
                 (url, f, a, num_of_feeds))
-        fetcher.base_dir = dir
-        fetcher.current_dir = dir
+        fetcher.base_dir = dir_
+        fetcher.current_dir = dir_
         fetcher.show_progress = False
         fetcher.image_url_processor = self.image_url_processor
-        if self.multithreaded_fetch:
-            fetcher.browser_lock = fetcher.DUMMY_LOCK
         res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
         if not res or not os.path.exists(res):
             raise Exception(_('Could not fetch article. Run with -vv to see the reason'))
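The im_func test above is the Python 2 idiom for "has get_browser been overridden?": im_func unwraps a bound or unbound method to the underlying function object, which is identical for inherited methods and different for overridden ones. A toy illustration with hypothetical classes:

    class Base(object):
        def get_browser(self):
            pass

    class Plain(Base):
        pass  # inherits get_browser unchanged

    class Exotic(Base):
        def get_browser(self):  # overrides it
            pass

    print Plain().get_browser.im_func is Base.get_browser.im_func   # True
    print Exotic().get_browser.im_func is Base.get_browser.im_func  # False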
@@ -1387,7 +1406,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
     def download(self):
         index = os.path.abspath(self.custom_index())
         url = 'file:'+index if iswindows else 'file://'+index
-        self.web2disk_options.browser = self.browser
+        self.web2disk_options.browser = self.clone_browser(self.browser)
         fetcher = RecursiveFetcher(self.web2disk_options, self.log)
         fetcher.base_dir = self.output_dir
         fetcher.current_dir = self.output_dir
@@ -86,11 +86,6 @@ class response(str):
         obj.newurl = None
         return obj

-class DummyLock(object):
-
-    def __enter__(self, *args): return self
-    def __exit__(self, *args): pass
-
 def default_is_link_wanted(url, tag):
     raise NotImplementedError()

@@ -104,7 +99,6 @@ class RecursiveFetcher(object):
     #   )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
     default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
-    DUMMY_LOCK = DummyLock()

     def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
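With browsers no longer shared across download threads there is nothing left to lock, so the old multithreaded_fetch option and its supporting machinery (DummyLock, RecursiveFetcher.DUMMY_LOCK, fetcher.browser_lock) are removed throughout.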