One more fix for NYTimes

Kovid Goyal 2010-06-02 10:46:53 -06:00
parent f4bbf10ee3
commit 3a12b18dc3
2 changed files with 12 additions and 12 deletions


@@ -391,10 +391,14 @@ class NYTimes(BasicNewsRecipe):
         return ans
 
     def preprocess_html(self, soup):
-        # Skip ad pages before actual article
+        # Skip ad pages served before actual article
         skip_tag = soup.find(True, {'name':'skip'})
         if skip_tag is not None:
-            soup = self.index_to_soup(skip_tag.parent['href'])
+            self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url += '?pagewanted=all'
+            self.log.error("Skipping ad to article at '%s'" % url)
+            soup = self.index_to_soup(url)
         return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
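
For context, the core of this change is the URL rewrite applied to the "skip" link: the query string is stripped and ?pagewanted=all is appended, so the recipe fetches the full article instead of the ad interstitial. A minimal standalone sketch of that rewrite (the build_article_url name and the example href are illustrative, not part of the recipe):

    import re

    def build_article_url(skip_href):
        # Drop everything from the first '?' onwards, then request the
        # single-page view of the article.
        url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_href)
        return url + '?pagewanted=all'

    # e.g. the href found on the interstitial's skip link
    print(build_article_url('/2010/06/02/world/example.html?ref=global-home'))
    # -> http://www.nytimes.com/2010/06/02/world/example.html?pagewanted=all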


@@ -280,18 +280,14 @@ class NYTimes(BasicNewsRecipe):
         return ans
 
     def preprocess_html(self, soup):
-        '''
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is None:
-            return soup
-        content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
-        '''
-        # Skip ad pages before actual article
+        # Skip ad pages served before actual article
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
-            soup = self.index_to_soup(skip_tag.parent['href'])
+            self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url += '?pagewanted=all'
+            self.log.error("Skipping ad to article at '%s'" % url)
+            soup = self.index_to_soup(url)
         return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
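
The skip_tag lookup itself predates this commit; for reference, a minimal sketch of how that detection works, assuming bs4's BeautifulSoup (the recipes use calibre's bundled BeautifulSoup) and an invented snippet of interstitial markup, since the real NYTimes ad page isn't shown here:

    from bs4 import BeautifulSoup

    # Invented example of the ad interstitial: a "skip" element wrapped
    # in the link that points at the real article.
    html = '''
    <html><body>
      <a href="/2010/06/02/world/example.html?ref=global-home">
        <span name="skip">Skip this advertisement</span>
      </a>
    </body></html>
    '''

    soup = BeautifulSoup(html, 'html.parser')
    skip_tag = soup.find(True, {'name': 'skip'})   # any tag with a name="skip" attribute
    if skip_tag is not None:
        print(skip_tag.parent['href'])             # the forwarding link to the real article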