News download: Fix threading issues in skip_ad_pages() method.

This commit is contained in:
Kovid Goyal 2011-12-30 10:38:35 +05:30
parent 407be8da53
commit f3e85aa26d
3 changed files with 18 additions and 3 deletions

View File

@ -325,7 +325,8 @@ class NYTimes(BasicNewsRecipe):
''' '''
def get_the_soup(docEncoding, url_or_raw, raw=False) : def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw): if re.match(r'\w+://', url_or_raw):
f = self.browser.open(url_or_raw) br = self.clone_browser(self.browser)
f = br.open_novisit(url_or_raw)
_raw = f.read() _raw = f.read()
f.close() f.close()
if not _raw: if not _raw:

View File

@ -364,7 +364,8 @@ class NYTimes(BasicNewsRecipe):
''' '''
def get_the_soup(docEncoding, url_or_raw, raw=False) : def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw): if re.match(r'\w+://', url_or_raw):
f = self.browser.open(url_or_raw) br = self.clone_browser(self.browser)
f = br.open_novisit(url_or_raw)
_raw = f.read() _raw = f.read()
f.close() f.close()
if not _raw: if not _raw:

View File

@ -437,6 +437,16 @@ class BasicNewsRecipe(Recipe):
# Uh-oh recipe using something exotic, call get_browser # Uh-oh recipe using something exotic, call get_browser
return self.get_browser() return self.get_browser()
@property
def cloned_browser(self):
    """Return a browser instance that is safe to use from another thread.

    When the recipe relies on the stock ``get_browser`` implementation,
    building a brand-new browser is the simplest way to get an
    independent instance; otherwise the recipe has customized its
    browser, so clone the already-configured one instead.
    """
    uses_default_browser = (
        self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func)
    if uses_default_browser:
        # Stock get_browser: a fresh browser is equivalent and cheap.
        return BasicNewsRecipe.get_browser(self)
    # Customized get_browser: preserve its configuration by cloning.
    return self.clone_browser(self.browser)
def get_article_url(self, article): def get_article_url(self, article):
''' '''
Override in a subclass to customize extraction of the :term:`URL` that points Override in a subclass to customize extraction of the :term:`URL` that points
@ -534,7 +544,10 @@ class BasicNewsRecipe(Recipe):
`url_or_raw`: Either a URL or the downloaded index page as a string `url_or_raw`: Either a URL or the downloaded index page as a string
''' '''
if re.match(r'\w+://', url_or_raw): if re.match(r'\w+://', url_or_raw):
open_func = getattr(self.browser, 'open_novisit', self.browser.open) # We may be called in a thread (in the skip_ad_pages method), so
# clone the browser to be safe
br = self.cloned_browser
open_func = getattr(br, 'open_novisit', br.open)
with closing(open_func(url_or_raw)) as f: with closing(open_func(url_or_raw)) as f:
_raw = f.read() _raw = f.read()
if not _raw: if not _raw: