Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-11-04 03:27:00 -05:00)
News download: Work around the lack of thread safety in python mechanize. Fixes #7321 (Cancelling news downloads causes loss of internet connectivity)
commit d3886b3910
parent 5c1a40534b
@@ -60,7 +60,6 @@ class NYTimes(BasicNewsRecipe):
 
 
     timefmt = ''
-    needs_subscription = True
     masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
     cover_margins = (18,18,'grey99')
 
@@ -21,8 +21,6 @@ from calibre.constants import iswindows, isosx, islinux, isfreebsd, isfrozen, \
                              filesystem_encoding, plugins, config_dir
 from calibre.startup import winutil, winutilerror
 
-import mechanize
-
 uuid.uuid4() # Imported before PyQt4 to workaround PyQt4 util-linux conflict on gentoo
 
 if False:
@@ -269,7 +267,8 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
     :param honor_time: If True honors pause time in refresh requests
     :param max_time: Maximum time in seconds to wait during a refresh request
     '''
-    opener = mechanize.Browser()
+    from calibre.utils.browser import Browser
+    opener = Browser()
     opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
     opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
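The factory now returns the cloneable Browser but keeps its old signature: honor_time and max_time are forwarded to mechanize's refresh handler, and mobile_browser only switches the User-agent header. A usage sketch (the URL is a placeholder):

from calibre import browser

br = browser(honor_time=False, max_time=5)
raw = br.open('http://example.com').read()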
src/calibre/utils/browser.py: new file, 96 lines
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import copy
+from cookielib import CookieJar
+
+from mechanize import Browser as B
+
+class Browser(B):
+    'A cloneable mechanize browser'
+
+    def __init__(self):
+        self._clone_actions = {}
+
+        B.__init__(self)
+        self.set_cookiejar(CookieJar())
+
+    def set_handle_refresh(self, *args, **kwargs):
+        B.set_handle_refresh(self, *args, **kwargs)
+        self._clone_actions['set_handle_refresh'] = ('set_handle_refresh',
+                args, kwargs)
+
+    def set_cookiejar(self, *args, **kwargs):
+        B.set_cookiejar(self, *args, **kwargs)
+        self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
+
+    def set_handle_redirect(self, *args, **kwargs):
+        B.set_handle_redirect(self, *args, **kwargs)
+        self._clone_actions['set_handle_redirect'] = ('set_handle_redirect',
+                args, kwargs)
+
+    def set_handle_equiv(self, *args, **kwargs):
+        B.set_handle_equiv(self, *args, **kwargs)
+        self._clone_actions['set_handle_equiv'] = ('set_handle_equiv',
+                args, kwargs)
+
+    def set_handle_gzip(self, *args, **kwargs):
+        B.set_handle_gzip(self, *args, **kwargs)
+        self._clone_actions['set_handle_gzip'] = ('set_handle_gzip',
+                args, kwargs)
+
+    def set_debug_redirect(self, *args, **kwargs):
+        B.set_debug_redirect(self, *args, **kwargs)
+        self._clone_actions['set_debug_redirect'] = ('set_debug_redirect',
+                args, kwargs)
+
+    def set_debug_responses(self, *args, **kwargs):
+        B.set_debug_responses(self, *args, **kwargs)
+        self._clone_actions['set_debug_responses'] = ('set_debug_responses',
+                args, kwargs)
+
+    def set_debug_http(self, *args, **kwargs):
+        B.set_debug_http(self, *args, **kwargs)
+        self._clone_actions['set_debug_http'] = ('set_debug_http',
+                args, kwargs)
+
+    def set_handle_robots(self, *args, **kwargs):
+        B.set_handle_robots(self, *args, **kwargs)
+        self._clone_actions['set_handle_robots'] = ('set_handle_robots',
+                args, kwargs)
+
+    def set_proxies(self, *args, **kwargs):
+        B.set_proxies(self, *args, **kwargs)
+        self._clone_actions['set_proxies'] = ('set_proxies', args, kwargs)
+
+    def add_password(self, *args, **kwargs):
+        B.add_password(self, *args, **kwargs)
+        self._clone_actions['add_password'] = ('add_password', args, kwargs)
+
+    def add_proxy_password(self, *args, **kwargs):
+        B.add_proxy_password(self, *args, **kwargs)
+        self._clone_actions['add_proxy_password'] = ('add_proxy_password', args, kwargs)
+
+    def clone_browser(self):
+        clone = Browser()
+        clone.addheaders = copy.deepcopy(self.addheaders)
+        for func, args, kwargs in self._clone_actions.values():
+            func = getattr(clone, func)
+            func(*args, **kwargs)
+        return clone
+
+if __name__ == '__main__':
+    from calibre import browser
+    from pprint import pprint
+    orig = browser()
+    clone = orig.clone_browser()
+    pprint(orig._ua_handlers)
+    pprint(clone._ua_handlers)
+    assert orig._ua_handlers.keys() == clone._ua_handlers.keys()
+    assert orig._ua_handlers['_cookies'].cookiejar is \
+            clone._ua_handlers['_cookies'].cookiejar
+
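Browser records every configuration call in _clone_actions and replays the recorded calls on each clone; addheaders is deep-copied separately. Because __init__ installs the CookieJar through the recorded set_cookiejar() call, all clones end up sharing one jar, which is safe since cookielib's CookieJar locks internally. A sketch of the effect (the proxy address is made up):

from calibre.utils.browser import Browser

orig = Browser()
orig.set_handle_robots(False)
orig.set_proxies({'http': 'http://proxy.example.com:3128'})
orig.addheaders = [('User-agent', 'MyAgent/1.0')]

clone = orig.clone_browser()
# clone has the same handlers, headers and proxies as orig, but shares
# no mutable mechanize state with it apart from the recorded CookieJar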
@@ -82,9 +82,6 @@ class BasicNewsRecipe(Recipe):
     #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
     simultaneous_downloads = 5
 
-    #: If False the remote server is contacted by only one thread at a time
-    multithreaded_fetch = False
-
     #: Timeout for fetching files from server in seconds
     timeout                = 120.0
 
@@ -402,6 +399,23 @@ class BasicNewsRecipe(Recipe):
         '''
         return browser(*args, **kwargs)
 
+    def clone_browser(self, br):
+        '''
+        Clone the browser br. Cloned browsers are used for multi-threaded
+        downloads, since mechanize is not thread safe. The default cloning
+        routines should capture most browser customization, but if you do
+        something exotic in your recipe, you should override this method in
+        your recipe and clone manually.
+
+        Cloned browser instances use the same, thread-safe CookieJar by
+        default, unless you have customized cookie handling.
+        '''
+        if callable(getattr(br, 'clone_browser', None)):
+            return br.clone_browser()
+
+        # Uh-oh recipe using something exotic, call get_browser
+        return self.get_browser()
+
     def get_article_url(self, article):
         '''
         Override in a subclass to customize extraction of the :term:`URL` that points
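A hedged sketch of the kind of manual clone the docstring asks for; the recipe class and login form are invented, but the form calls are the standard mechanize API:

from calibre.web.feeds.news import BasicNewsRecipe

class ExoticRecipe(BasicNewsRecipe):  # hypothetical recipe

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # Exotic customization: a scripted login that mutates browser
        # state the recorded setter calls cannot reproduce
        br.open('http://example.com/login')  # placeholder URL
        br.select_form(nr=0)
        br['username'] = self.username
        br['password'] = self.password
        br.submit()
        return br

    def clone_browser(self, br):
        # Clone manually by building a freshly logged-in browser
        return self.get_browser()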
@@ -798,17 +812,22 @@ class BasicNewsRecipe(Recipe):
                               extra_css=css).render(doctype='xhtml')
 
 
-    def _fetch_article(self, url, dir, f, a, num_of_feeds):
-        self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
+    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
+        br = self.browser
+        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
+            # We are using the default get_browser, which means no need to
+            # clone
+            br = BasicNewsRecipe.get_browser(self)
+        else:
+            br = self.clone_browser(self.browser)
+        self.web2disk_options.browser = br
         fetcher = RecursiveFetcher(self.web2disk_options, self.log,
                 self.image_map, self.css_map,
                 (url, f, a, num_of_feeds))
-        fetcher.base_dir = dir
-        fetcher.current_dir = dir
+        fetcher.base_dir = dir_
+        fetcher.current_dir = dir_
         fetcher.show_progress = False
         fetcher.image_url_processor = self.image_url_processor
-        if self.multithreaded_fetch:
-            fetcher.browser_lock = fetcher.DUMMY_LOCK
         res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
         if not res or not os.path.exists(res):
             raise Exception(_('Could not fetch article. Run with -vv to see the reason'))
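The im_func test above is the Python 2 idiom for detecting whether a subclass overrode a method: bound methods are distinct objects per instance, but their underlying function (im_func) is the same object unless the method was redefined. A standalone sketch:

class Base(object):
    def get_browser(self):
        pass

class Plain(Base):
    pass

class Custom(Base):
    def get_browser(self):
        pass

print Plain().get_browser.im_func is Base.get_browser.im_func   # True
print Custom().get_browser.im_func is Base.get_browser.im_func  # False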
@@ -1387,7 +1406,7 @@ class CustomIndexRecipe(BasicNewsRecipe):
     def download(self):
         index = os.path.abspath(self.custom_index())
         url = 'file:'+index if iswindows else 'file://'+index
-        self.web2disk_options.browser = self.browser
+        self.web2disk_options.browser = self.clone_browser(self.browser)
         fetcher = RecursiveFetcher(self.web2disk_options, self.log)
         fetcher.base_dir = self.output_dir
         fetcher.current_dir = self.output_dir
@@ -86,11 +86,6 @@ class response(str):
         obj.newurl = None
         return obj
 
-class DummyLock(object):
-
-    def __enter__(self, *args): return self
-    def __exit__(self, *args): pass
-
 def default_is_link_wanted(url, tag):
     raise NotImplementedError()
 
@@ -104,7 +99,6 @@ class RecursiveFetcher(object):
     #                       )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
     default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
-    DUMMY_LOCK = DummyLock()
 
     def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))