News download: Fix threading issues in skip_ad_pages() method.

2025-07-07 10:14:46 -04:00 · 2011-12-30 10:38:35 +05:30 · 2011-12-30 10:38:35 +05:30 · f3e85aa26d
commit f3e85aa26d
parent 407be8da53
3 changed files with 18 additions and 3 deletions
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@ -325,7 +325,8 @@ class NYTimes(BasicNewsRecipe):
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False) :
            if re.match(r'\w+://', url_or_raw):
-                f = self.browser.open(url_or_raw)
+                br = self.clone_browser(self.browser)
                f = br.open_novisit(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@ -364,7 +364,8 @@ class NYTimes(BasicNewsRecipe):
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False) :
            if re.match(r'\w+://', url_or_raw):
-                f = self.browser.open(url_or_raw)
+                br = self.clone_browser(self.browser)
                f = br.open_novisit(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -437,6 +437,16 @@ class BasicNewsRecipe(Recipe):
        # Uh-oh recipe using something exotic, call get_browser
        return self.get_browser()
    @property
    def cloned_browser(self):
        '''
        Return a browser object for callers that should not use
        ``self.browser`` directly.

        If this recipe uses the base class ``get_browser`` implementation, the
        browser is obtained by calling ``BasicNewsRecipe.get_browser(self)``.
        Otherwise it is obtained by calling
        ``self.clone_browser(self.browser)``.
        '''
        # Compare the underlying function objects to detect whether the
        # recipe has overridden get_browser. NOTE(review): ``im_func`` is the
        # Python 2 name for a method's underlying function (Python 3 uses
        # ``__func__`` on bound methods), so this check is Python 2 only.
        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
            # We are using the default get_browser, so calling it directly
            # gives us a browser without needing to clone self.browser
            br = BasicNewsRecipe.get_browser(self)
        else:
            # get_browser has been overridden by the recipe, so derive the
            # browser from the existing self.browser via clone_browser
            br = self.clone_browser(self.browser)
        return br
    def get_article_url(self, article):
        '''
        Override in a subclass to customize extraction of the :term:`URL` that points
@ -534,7 +544,10 @@ class BasicNewsRecipe(Recipe):
        `url_or_raw`: Either a URL or the downloaded index page as a string
        '''
        if re.match(r'\w+://', url_or_raw):
-            open_func = getattr(self.browser, 'open_novisit', self.browser.open)
+            # We may be called in a thread (in the skip_ad_pages method), so
            # clone the browser to be safe
            br = self.cloned_browser
            open_func = getattr(br, 'open_novisit', br.open)
            with closing(open_func(url_or_raw)) as f:
                _raw = f.read()
            if not _raw: