News download: Work around lack of thread safety in python mechanize. Fixes #7321 (Cancelling news downloads causes loss of internet connectivity)

Kovid Goyal 2010-11-04 13:35:23 -06:00
parent 5c1a40534b
commit d3886b3910
5 changed files with 127 additions and 20 deletions
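At a high level: mechanize Browser objects carry mutable handler state, so sharing one browser across download threads (or tearing it down from another thread on cancel) can leave that shared state broken; the reported symptom in #7321 was loss of connectivity. The commit instead gives each worker thread its own browser, by recording customizations on a cloneable Browser subclass and replaying them onto fresh instances. A minimal sketch of the pattern, assuming the new calibre.utils.browser.Browser added below (fetch_one and the URLs are illustrative, not part of this commit):

from threading import Thread

from calibre.utils.browser import Browser

def fetch_one(br, url, results, i):
    # Each worker owns its browser, so no mechanize handler state is shared
    results[i] = br.open(url).read()

master = Browser()
master.set_handle_robots(False)

urls = ['http://example.com/a', 'http://example.com/b']  # placeholders
results = [None] * len(urls)
threads = [Thread(target=fetch_one, args=(master.clone_browser(), u, results, i))
        for i, u in enumerate(urls)]
for t in threads:
    t.start()
for t in threads:
    t.join()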

View File

@@ -60,7 +60,6 @@ class NYTimes(BasicNewsRecipe):
    timefmt = ''
    needs_subscription = True
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
    cover_margins = (18,18,'grey99')

View File

@@ -21,8 +21,6 @@ from calibre.constants import iswindows, isosx, islinux, isfreebsd, isfrozen, \
        filesystem_encoding, plugins, config_dir
from calibre.startup import winutil, winutilerror
import mechanize
uuid.uuid4() # Imported before PyQt4 to workaround PyQt4 util-linux conflict on gentoo
if False:
@@ -269,7 +267,8 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    '''
    opener = mechanize.Browser()
    from calibre.utils.browser import Browser
    opener = Browser()
    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
    opener.set_handle_robots(False)
    opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
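For reference, this factory is what recipes and the rest of calibre call to get a configured browser; it is used roughly like this (a minimal sketch with a placeholder URL):

from calibre import browser

br = browser(honor_time=True, max_time=2)
# br is now a cloneable calibre.utils.browser.Browser
raw = br.open('http://example.com').read()  # placeholder URL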

View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import copy
from cookielib import CookieJar

from mechanize import Browser as B

class Browser(B):
    'A cloneable mechanize browser'

    def __init__(self):
        self._clone_actions = {}
        B.__init__(self)
        self.set_cookiejar(CookieJar())

    def set_handle_refresh(self, *args, **kwargs):
        B.set_handle_refresh(self, *args, **kwargs)
        self._clone_actions['set_handle_refresh'] = ('set_handle_refresh',
                args, kwargs)

    def set_cookiejar(self, *args, **kwargs):
        B.set_cookiejar(self, *args, **kwargs)
        self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)

    def set_handle_redirect(self, *args, **kwargs):
        B.set_handle_redirect(self, *args, **kwargs)
        self._clone_actions['set_handle_redirect'] = ('set_handle_redirect',
                args, kwargs)

    def set_handle_equiv(self, *args, **kwargs):
        B.set_handle_equiv(self, *args, **kwargs)
        self._clone_actions['set_handle_equiv'] = ('set_handle_equiv',
                args, kwargs)

    def set_handle_gzip(self, *args, **kwargs):
        B.set_handle_gzip(self, *args, **kwargs)
        self._clone_actions['set_handle_gzip'] = ('set_handle_gzip',
                args, kwargs)

    def set_debug_redirect(self, *args, **kwargs):
        B.set_debug_redirect(self, *args, **kwargs)
        self._clone_actions['set_debug_redirect'] = ('set_debug_redirect',
                args, kwargs)

    def set_debug_responses(self, *args, **kwargs):
        B.set_debug_responses(self, *args, **kwargs)
        self._clone_actions['set_debug_responses'] = ('set_debug_responses',
                args, kwargs)

    def set_debug_http(self, *args, **kwargs):
        B.set_debug_http(self, *args, **kwargs)
        self._clone_actions['set_debug_http'] = ('set_debug_http',
                args, kwargs)

    def set_handle_robots(self, *args, **kwargs):
        B.set_handle_robots(self, *args, **kwargs)
        self._clone_actions['set_handle_robots'] = ('set_handle_robots',
                args, kwargs)

    def set_proxies(self, *args, **kwargs):
        B.set_proxies(self, *args, **kwargs)
        self._clone_actions['set_proxies'] = ('set_proxies', args, kwargs)

    def add_password(self, *args, **kwargs):
        B.add_password(self, *args, **kwargs)
        self._clone_actions['add_password'] = ('add_password', args, kwargs)

    def add_proxy_password(self, *args, **kwargs):
        B.add_proxy_password(self, *args, **kwargs)
        self._clone_actions['add_proxy_password'] = ('add_proxy_password', args, kwargs)

    def clone_browser(self):
        clone = Browser()
        clone.addheaders = copy.deepcopy(self.addheaders)
        for func, args, kwargs in self._clone_actions.values():
            func = getattr(clone, func)
            func(*args, **kwargs)
        return clone

if __name__ == '__main__':
    from calibre import browser
    from pprint import pprint
    orig = browser()
    clone = orig.clone_browser()
    pprint(orig._ua_handlers)
    pprint(clone._ua_handlers)
    assert orig._ua_handlers.keys() == clone._ua_handlers.keys()
    assert orig._ua_handlers['_cookies'].cookiejar is \
            clone._ua_handlers['_cookies'].cookiejar
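The idea of the class above: every supported customization method records its (args, kwargs) in _clone_actions, and clone_browser() replays those calls on a fresh instance. The clone therefore gets its own handler objects while sharing the original's CookieJar (cookielib's CookieJar guards itself with an internal lock, so sharing it across threads is safe). A small sketch of the replay behaviour (the proxy is a placeholder):

orig = Browser()
orig.set_handle_robots(False)
orig.set_proxies({'http': 'proxy.example.com:3128'})  # placeholder

clone = orig.clone_browser()
# clone has replayed the recorded set_handle_robots/set_proxies calls on its
# own handlers, and shares orig's (thread-safe) CookieJar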

View File

@@ -82,9 +82,6 @@ class BasicNewsRecipe(Recipe):
    #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
    simultaneous_downloads = 5

    #: If False the remote server is contacted by only one thread at a time
    multithreaded_fetch = False

    #: Timeout for fetching files from server in seconds
    timeout = 120.0
@@ -402,6 +399,23 @@ class BasicNewsRecipe(Recipe):
        '''
        return browser(*args, **kwargs)

    def clone_browser(self, br):
        '''
        Clone the browser br. Cloned browsers are used for multi-threaded
        downloads, since mechanize is not thread safe. The default cloning
        routines should capture most browser customization, but if you do
        something exotic in your recipe, you should override this method
        and clone manually.

        Cloned browser instances use the same, thread-safe CookieJar by
        default, unless you have customized cookie handling.
        '''
        if callable(getattr(br, 'clone_browser', None)):
            return br.clone_browser()
        # Uh-oh recipe using something exotic, call get_browser
        return self.get_browser()
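A recipe whose get_browser() does something the recorded clone actions cannot capture, such as a form-based login, would override clone_browser() to match. A hypothetical sketch (recipe name, URL, form index and credentials are all placeholders):

class MyRecipe(BasicNewsRecipe):

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://example.com/login')  # placeholder URL
        br.select_form(nr=0)
        br['username'] = 'user'              # placeholder credentials
        br['password'] = 'pass'
        br.submit()
        return br

    def clone_browser(self, br):
        # Simplest manual clone: repeat the full login for each new thread
        return self.get_browser()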
    def get_article_url(self, article):
        '''
        Override in a subclass to customize extraction of the :term:`URL` that points
@@ -798,17 +812,22 @@ class BasicNewsRecipe(Recipe):
            extra_css=css).render(doctype='xhtml')

    def _fetch_article(self, url, dir, f, a, num_of_feeds):
        self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
        br = self.browser
        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        fetcher = RecursiveFetcher(self.web2disk_options, self.log,
                self.image_map, self.css_map,
                (url, f, a, num_of_feeds))
        fetcher.base_dir = dir
        fetcher.current_dir = dir
        fetcher.base_dir = dir_
        fetcher.current_dir = dir_
        fetcher.show_progress = False
        fetcher.image_url_processor = self.image_url_processor
        if self.multithreaded_fetch:
            fetcher.browser_lock = fetcher.DUMMY_LOCK
        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
        if not res or not os.path.exists(res):
            raise Exception(_('Could not fetch article. Run with -vv to see the reason'))
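The im_func comparison above is the Python 2 idiom for detecting whether a subclass replaced a method: bound and unbound methods wrap the same underlying function object unless an override exists. A standalone illustration (throwaway class names):

class Base(object):
    def get_browser(self):
        pass

class Sub(Base):
    def get_browser(self):
        pass

assert Base().get_browser.im_func is Base.get_browser.im_func
assert Sub().get_browser.im_func is not Base.get_browser.im_func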
@@ -1387,7 +1406,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
    def download(self):
        index = os.path.abspath(self.custom_index())
        url = 'file:'+index if iswindows else 'file://'+index
        self.web2disk_options.browser = self.browser
        self.web2disk_options.browser = self.clone_browser(self.browser)
        fetcher = RecursiveFetcher(self.web2disk_options, self.log)
        fetcher.base_dir = self.output_dir
        fetcher.current_dir = self.output_dir

View File

@@ -86,11 +86,6 @@ class response(str):
        obj.newurl = None
        return obj

class DummyLock(object):

    def __enter__(self, *args): return self
    def __exit__(self, *args): pass

def default_is_link_wanted(url, tag):
    raise NotImplementedError()
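For context on the deletion here: DummyLock is the usual no-op context-manager stand-in for a real lock, presumably so fetch code could take the browser lock unconditionally. With every thread now holding its own cloned browser, neither the real lock nor the stand-in is needed. The pattern in isolation (a minimal sketch, not from this commit):

import threading

class DummyLock(object):
    def __enter__(self, *args): return self
    def __exit__(self, *args): pass

lock = threading.RLock()   # real lock while a browser is shared
lock = DummyLock()         # swapped in when each thread owns its browser
with lock:
    pass  # network access would go here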
@@ -104,7 +99,6 @@ class RecursiveFetcher(object):
    # )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
    DUMMY_LOCK = DummyLock()

    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
        self.base_dir = os.path.abspath(os.path.expanduser(options.dir))