diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe index c656450990..16ddea9f8c 100644 --- a/resources/recipes/nytimes.recipe +++ b/resources/recipes/nytimes.recipe @@ -60,7 +60,6 @@ class NYTimes(BasicNewsRecipe): timefmt = '' - needs_subscription = True masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' cover_margins = (18,18,'grey99') diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 0579c75eea..1226ab3188 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -21,8 +21,6 @@ from calibre.constants import iswindows, isosx, islinux, isfreebsd, isfrozen, \ filesystem_encoding, plugins, config_dir from calibre.startup import winutil, winutilerror -import mechanize - uuid.uuid4() # Imported before PyQt4 to workaround PyQt4 util-linux conflict on gentoo if False: @@ -269,7 +267,8 @@ def browser(honor_time=True, max_time=2, mobile_browser=False): :param honor_time: If True honors pause time in refresh requests :param max_time: Maximum time in seconds to wait during a refresh request ''' - opener = mechanize.Browser() + from calibre.utils.browser import Browser + opener = Browser() opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time) opener.set_handle_robots(False) opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \ diff --git a/src/calibre/utils/browser.py b/src/calibre/utils/browser.py new file mode 100644 index 0000000000..75799aba5a --- /dev/null +++ b/src/calibre/utils/browser.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import copy +from cookielib import CookieJar + +from mechanize import Browser as B + +class Browser(B): + 'A cloneable mechanize browser' + + def __init__(self): + self._clone_actions = {} + + B.__init__(self) + self.set_cookiejar(CookieJar()) + + def set_handle_refresh(self, *args, **kwargs): + B.set_handle_refresh(self, *args, **kwargs) + self._clone_actions['set_handle_refresh'] = ('set_handle_refresh', + args, kwargs) + + def set_cookiejar(self, *args, **kwargs): + B.set_cookiejar(self, *args, **kwargs) + self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs) + + def set_handle_redirect(self, *args, **kwargs): + B.set_handle_redirect(self, *args, **kwargs) + self._clone_actions['set_handle_redirect'] = ('set_handle_redirect', + args, kwargs) + + def set_handle_equiv(self, *args, **kwargs): + B.set_handle_equiv(self, *args, **kwargs) + self._clone_actions['set_handle_equiv'] = ('set_handle_equiv', + args, kwargs) + + def set_handle_gzip(self, *args, **kwargs): + B.set_handle_gzip(self, *args, **kwargs) + self._clone_actions['set_handle_gzip'] = ('set_handle_gzip', + args, kwargs) + + def set_debug_redirect(self, *args, **kwargs): + B.set_debug_redirect(self, *args, **kwargs) + self._clone_actions['set_debug_redirect'] = ('set_debug_redirect', + args, kwargs) + + def set_debug_responses(self, *args, **kwargs): + B.set_debug_responses(self, *args, **kwargs) + self._clone_actions['set_debug_responses'] = ('set_debug_responses', + args, kwargs) + + def set_debug_http(self, *args, **kwargs): + B.set_debug_http(self, *args, **kwargs) + self._clone_actions['set_debug_http'] = ('set_debug_http', + args, kwargs) + + def set_handle_robots(self, *args, **kwargs): + B.set_handle_robots(self, *args, **kwargs) + self._clone_actions['set_handle_robots'] = ('set_handle_robots', + args, kwargs) + + def set_proxies(self, *args, **kwargs): + B.set_proxies(self, *args, **kwargs) + self._clone_actions['set_proxies'] = ('set_proxies', args, kwargs) + + def add_password(self, *args, **kwargs): + B.add_password(self, *args, **kwargs) + self._clone_actions['add_password'] = ('add_password', args, kwargs) + + def add_proxy_password(self, *args, **kwargs): + B.add_proxy_password(self, *args, **kwargs) + self._clone_actions['add_proxy_password'] = ('add_proxy_password', args, kwargs) + + def clone_browser(self): + clone = Browser() + clone.addheaders = copy.deepcopy(self.addheaders) + for func, args, kwargs in self._clone_actions.values(): + func = getattr(clone, func) + func(*args, **kwargs) + return clone + +if __name__ == '__main__': + from calibre import browser + from pprint import pprint + orig = browser() + clone = orig.clone_browser() + pprint( orig._ua_handlers) + pprint(clone._ua_handlers) + assert orig._ua_handlers.keys() == clone._ua_handlers.keys() + assert orig._ua_handlers['_cookies'].cookiejar is \ + clone._ua_handlers['_cookies'].cookiejar + diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index e081dc678e..4b92ae9b7c 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -82,9 +82,6 @@ class BasicNewsRecipe(Recipe): #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0 simultaneous_downloads = 5 - #: If False the remote server is contacted by only one thread at a time - multithreaded_fetch = False - #: Timeout for fetching files from server in seconds timeout = 120.0 @@ -402,6 +399,23 @@ class BasicNewsRecipe(Recipe): ''' return browser(*args, **kwargs) + def clone_browser(self, br): + ''' + Clone the browser br. Cloned browsers are used for multi-threaded + downloads, since mechanize is not thread safe. The default cloning + routines should capture most browser customization, but if you do + something exotic in your recipe, you should override this method in + your recipe and clone manually. + + Cloned browser instances use the same, thread-safe CookieJar by + default, unless you have customized cookie handling. + ''' + if callable(getattr(br, 'clone_browser', None)): + return br.clone_browser() + + # Uh-oh recipe using something exotic, call get_browser + return self.get_browser() + def get_article_url(self, article): ''' Override in a subclass to customize extraction of the :term:`URL` that points @@ -798,17 +812,22 @@ class BasicNewsRecipe(Recipe): extra_css=css).render(doctype='xhtml') - def _fetch_article(self, url, dir, f, a, num_of_feeds): - self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser + def _fetch_article(self, url, dir_, f, a, num_of_feeds): + br = self.browser + if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func: + # We are using the default get_browser, which means no need to + # clone + br = BasicNewsRecipe.get_browser(self) + else: + br = self.clone_browser(self.browser) + self.web2disk_options.browser = br fetcher = RecursiveFetcher(self.web2disk_options, self.log, self.image_map, self.css_map, (url, f, a, num_of_feeds)) - fetcher.base_dir = dir - fetcher.current_dir = dir + fetcher.base_dir = dir_ + fetcher.current_dir = dir_ fetcher.show_progress = False fetcher.image_url_processor = self.image_url_processor - if self.multithreaded_fetch: - fetcher.browser_lock = fetcher.DUMMY_LOCK res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links if not res or not os.path.exists(res): raise Exception(_('Could not fetch article. Run with -vv to see the reason')) @@ -1387,7 +1406,7 @@ class CustomIndexRecipe(BasicNewsRecipe): def download(self): index = os.path.abspath(self.custom_index()) url = 'file:'+index if iswindows else 'file://'+index - self.web2disk_options.browser = self.browser + self.web2disk_options.browser = self.clone_browser(self.browser) fetcher = RecursiveFetcher(self.web2disk_options, self.log) fetcher.base_dir = self.output_dir fetcher.current_dir = self.output_dir diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 0bfca0fc54..8cbc0bb9d1 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -86,11 +86,6 @@ class response(str): obj.newurl = None return obj -class DummyLock(object): - - def __enter__(self, *args): return self - def __exit__(self, *args): pass - def default_is_link_wanted(url, tag): raise NotImplementedError() @@ -104,7 +99,6 @@ class RecursiveFetcher(object): # ) CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE) default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__ - DUMMY_LOCK = DummyLock() def __init__(self, options, log, image_map={}, css_map={}, job_info=None): self.base_dir = os.path.abspath(os.path.expanduser(options.dir))