mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
One more fix for NYTimes
This commit is contained in:
parent
f4bbf10ee3
commit
3a12b18dc3
@ -391,10 +391,14 @@ class NYTimes(BasicNewsRecipe):
|
||||
return ans
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
# Skip ad pages before actual article
|
||||
# Skip ad pages served before actual article
|
||||
skip_tag = soup.find(True, {'name':'skip'})
|
||||
if skip_tag is not None:
|
||||
soup = self.index_to_soup(skip_tag.parent['href'])
|
||||
self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
|
||||
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
|
||||
url += '?pagewanted=all'
|
||||
self.log.error("Skipping ad to article at '%s'" % url)
|
||||
soup = self.index_to_soup(url)
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
def postprocess_html(self,soup, True):
|
||||
|
@ -280,18 +280,14 @@ class NYTimes(BasicNewsRecipe):
|
||||
return ans
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
'''
|
||||
refresh = soup.find('meta', {'http-equiv':'refresh'})
|
||||
if refresh is None:
|
||||
return soup
|
||||
content = refresh.get('content').partition('=')[2]
|
||||
raw = self.browser.open('http://www.nytimes.com'+content).read()
|
||||
return BeautifulSoup(raw.decode('cp1252', 'replace'))
|
||||
'''
|
||||
# Skip ad pages before actual article
|
||||
# Skip ad pages served before actual article
|
||||
skip_tag = soup.find(True, {'name':'skip'})
|
||||
if skip_tag is not None:
|
||||
soup = self.index_to_soup(skip_tag.parent['href'])
|
||||
self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
|
||||
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
|
||||
url += '?pagewanted=all'
|
||||
self.log.error("Skipping ad to article at '%s'" % url)
|
||||
soup = self.index_to_soup(url)
|
||||
return self.strip_anchors(soup)
|
||||
|
||||
def postprocess_html(self,soup, True):
|
||||
|
Loading…
x
Reference in New Issue
Block a user