News download: Work around lack of thread safety in python mechanize. Fixes #7321 (Cancelling news downloads causes loss of internet connectivity)

Kovid Goyal 2010-11-04 13:35:23 -06:00
parent 5c1a40534b
commit d3886b3910
5 changed files with 127 additions and 20 deletions
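At a high level: mechanize Browser objects carry mutable handler state, so sharing one browser across download threads (or tearing it down from another thread on cancel) can leave that shared state broken; the reported symptom in #7321 was loss of connectivity. The commit instead gives each worker thread its own browser, by recording customizations on a cloneable Browser subclass and replaying them onto fresh instances. A minimal sketch of the pattern, assuming the new calibre.utils.browser.Browser added below (fetch_one and the URLs are illustrative, not part of this commit):

from threading import Thread

from calibre.utils.browser import Browser

def fetch_one(br, url, results, i):
    # Each worker owns its browser, so no mechanize handler state is shared
    results[i] = br.open(url).read()

master = Browser()
master.set_handle_robots(False)

urls = ['http://example.com/a', 'http://example.com/b']  # placeholders
results = [None] * len(urls)
threads = [Thread(target=fetch_one, args=(master.clone_browser(), u, results, i))
        for i, u in enumerate(urls)]
for t in threads:
    t.start()
for t in threads:
    t.join()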

View File

@@ -60,7 +60,6 @@ class NYTimes(BasicNewsRecipe):
    timefmt = ''
    needs_subscription = True
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
    cover_margins = (18,18,'grey99')

View File

@@ -21,8 +21,6 @@ from calibre.constants import iswindows, isosx, islinux, isfreebsd, isfrozen, \
        filesystem_encoding, plugins, config_dir
from calibre.startup import winutil, winutilerror
import mechanize
uuid.uuid4() # Imported before PyQt4 to workaround PyQt4 util-linux conflict on gentoo
if False:
@@ -269,7 +267,8 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    '''
    opener = mechanize.Browser()
    from calibre.utils.browser import Browser
    opener = Browser()
    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
    opener.set_handle_robots(False)
    opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
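For reference, this factory is what recipes and the rest of calibre call to get a configured browser; it is used roughly like this (a minimal sketch with a placeholder URL):

from calibre import browser

br = browser(honor_time=True, max_time=2)
# br is now a cloneable calibre.utils.browser.Browser
raw = br.open('http://example.com').read()  # placeholder URL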

View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import copy
from cookielib import CookieJar

from mechanize import Browser as B

class Browser(B):
    'A cloneable mechanize browser'

    def __init__(self):
        self._clone_actions = {}
        B.__init__(self)
        self.set_cookiejar(CookieJar())

    def set_handle_refresh(self, *args, **kwargs):
        B.set_handle_refresh(self, *args, **kwargs)
        self._clone_actions['set_handle_refresh'] = ('set_handle_refresh',
                args, kwargs)

    def set_cookiejar(self, *args, **kwargs):
        B.set_cookiejar(self, *args, **kwargs)
        self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)

    def set_handle_redirect(self, *args, **kwargs):
        B.set_handle_redirect(self, *args, **kwargs)
        self._clone_actions['set_handle_redirect'] = ('set_handle_redirect',
                args, kwargs)

    def set_handle_equiv(self, *args, **kwargs):
        B.set_handle_equiv(self, *args, **kwargs)
        self._clone_actions['set_handle_equiv'] = ('set_handle_equiv',
                args, kwargs)

    def set_handle_gzip(self, *args, **kwargs):
        B.set_handle_gzip(self, *args, **kwargs)
        self._clone_actions['set_handle_gzip'] = ('set_handle_gzip',
                args, kwargs)

    def set_debug_redirect(self, *args, **kwargs):
        B.set_debug_redirect(self, *args, **kwargs)
        self._clone_actions['set_debug_redirect'] = ('set_debug_redirect',
                args, kwargs)

    def set_debug_responses(self, *args, **kwargs):
        B.set_debug_responses(self, *args, **kwargs)
        self._clone_actions['set_debug_responses'] = ('set_debug_responses',
                args, kwargs)

    def set_debug_http(self, *args, **kwargs):
        B.set_debug_http(self, *args, **kwargs)
        self._clone_actions['set_debug_http'] = ('set_debug_http',
                args, kwargs)

    def set_handle_robots(self, *args, **kwargs):
        B.set_handle_robots(self, *args, **kwargs)
        self._clone_actions['set_handle_robots'] = ('set_handle_robots',
                args, kwargs)

    def set_proxies(self, *args, **kwargs):
        B.set_proxies(self, *args, **kwargs)
        self._clone_actions['set_proxies'] = ('set_proxies', args, kwargs)

    def add_password(self, *args, **kwargs):
        B.add_password(self, *args, **kwargs)
        self._clone_actions['add_password'] = ('add_password', args, kwargs)

    def add_proxy_password(self, *args, **kwargs):
        B.add_proxy_password(self, *args, **kwargs)
        self._clone_actions['add_proxy_password'] = ('add_proxy_password', args, kwargs)

    def clone_browser(self):
        clone = Browser()
        clone.addheaders = copy.deepcopy(self.addheaders)
        for func, args, kwargs in self._clone_actions.values():
            func = getattr(clone, func)
            func(*args, **kwargs)
        return clone

if __name__ == '__main__':
    from calibre import browser
    from pprint import pprint
    orig = browser()
    clone = orig.clone_browser()
    pprint(orig._ua_handlers)
    pprint(clone._ua_handlers)
    assert orig._ua_handlers.keys() == clone._ua_handlers.keys()
    assert orig._ua_handlers['_cookies'].cookiejar is \
            clone._ua_handlers['_cookies'].cookiejar
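The idea of the class above: every supported customization method records its (args, kwargs) in _clone_actions, and clone_browser() replays those calls on a fresh instance. The clone therefore gets its own handler objects while sharing the original's CookieJar (cookielib's CookieJar guards itself with an internal lock, so sharing it across threads is safe). A small sketch of the replay behaviour (the proxy is a placeholder):

orig = Browser()
orig.set_handle_robots(False)
orig.set_proxies({'http': 'proxy.example.com:3128'})  # placeholder

clone = orig.clone_browser()
# clone has replayed the recorded set_handle_robots/set_proxies calls on its
# own handlers, and shares orig's (thread-safe) CookieJar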

View File

@@ -82,9 +82,6 @@ class BasicNewsRecipe(Recipe):
    #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
    simultaneous_downloads = 5

    #: If False the remote server is contacted by only one thread at a time
    multithreaded_fetch = False

    #: Timeout for fetching files from server in seconds
    timeout = 120.0
@@ -402,6 +399,23 @@ class BasicNewsRecipe(Recipe):
        '''
        return browser(*args, **kwargs)

    def clone_browser(self, br):
        '''
        Clone the browser br. Cloned browsers are used for multi-threaded
        downloads, since mechanize is not thread safe. The default cloning
        routines should capture most browser customization, but if you do
        something exotic in your recipe, you should override this method
        and clone manually.

        Cloned browser instances use the same, thread-safe CookieJar by
        default, unless you have customized cookie handling.
        '''
        if callable(getattr(br, 'clone_browser', None)):
            return br.clone_browser()
        # Uh-oh recipe using something exotic, call get_browser
        return self.get_browser()
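A recipe whose get_browser() does something the recorded clone actions cannot capture, such as a form-based login, would override clone_browser() to match. A hypothetical sketch (recipe name, URL, form index and credentials are all placeholders):

class MyRecipe(BasicNewsRecipe):

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://example.com/login')  # placeholder URL
        br.select_form(nr=0)
        br['username'] = 'user'              # placeholder credentials
        br['password'] = 'pass'
        br.submit()
        return br

    def clone_browser(self, br):
        # Simplest manual clone: repeat the full login for each new thread
        return self.get_browser()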
    def get_article_url(self, article):
        '''
        Override in a subclass to customize extraction of the :term:`URL` that points
@@ -798,17 +812,22 @@ class BasicNewsRecipe(Recipe):
            extra_css=css).render(doctype='xhtml')

    def _fetch_article(self, url, dir, f, a, num_of_feeds):
        self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
        br = self.browser
        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        fetcher = RecursiveFetcher(self.web2disk_options, self.log,
                self.image_map, self.css_map,
                (url, f, a, num_of_feeds))
        fetcher.base_dir = dir
        fetcher.current_dir = dir
        fetcher.base_dir = dir_
        fetcher.current_dir = dir_
        fetcher.show_progress = False
        fetcher.image_url_processor = self.image_url_processor
        if self.multithreaded_fetch:
            fetcher.browser_lock = fetcher.DUMMY_LOCK
        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
        if not res or not os.path.exists(res):
            raise Exception(_('Could not fetch article. Run with -vv to see the reason'))
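The im_func comparison above is the Python 2 idiom for detecting whether a subclass replaced a method: bound and unbound methods wrap the same underlying function object unless an override exists. A standalone illustration (throwaway class names):

class Base(object):
    def get_browser(self):
        pass

class Sub(Base):
    def get_browser(self):
        pass

assert Base().get_browser.im_func is Base.get_browser.im_func
assert Sub().get_browser.im_func is not Base.get_browser.im_func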
@@ -1387,7 +1406,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
    def download(self):
        index = os.path.abspath(self.custom_index())
        url = 'file:'+index if iswindows else 'file://'+index
        self.web2disk_options.browser = self.browser
        self.web2disk_options.browser = self.clone_browser(self.browser)
        fetcher = RecursiveFetcher(self.web2disk_options, self.log)
        fetcher.base_dir = self.output_dir
        fetcher.current_dir = self.output_dir

View File

@@ -86,11 +86,6 @@ class response(str):
        obj.newurl = None
        return obj

class DummyLock(object):

    def __enter__(self, *args): return self
    def __exit__(self, *args): pass

def default_is_link_wanted(url, tag):
    raise NotImplementedError()
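For context on the deletion here: DummyLock is the usual no-op context-manager stand-in for a real lock, presumably so fetch code could take the browser lock unconditionally. With every thread now holding its own cloned browser, neither the real lock nor the stand-in is needed. The pattern in isolation (a minimal sketch, not from this commit):

import threading

class DummyLock(object):
    def __enter__(self, *args): return self
    def __exit__(self, *args): pass

lock = threading.RLock()   # real lock while a browser is shared
lock = DummyLock()         # swapped in when each thread owns its browser
with lock:
    pass  # network access would go here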
@@ -104,7 +99,6 @@ class RecursiveFetcher(object):
    # )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
    DUMMY_LOCK = DummyLock()

    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
        self.base_dir = os.path.abspath(os.path.expanduser(options.dir))