mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
One more fix for NYTimes
This commit is contained in:
parent
f4bbf10ee3
commit
3a12b18dc3
@@ -391,10 +391,14 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
# Skip ad pages before actual article
|
# Skip ad pages served before actual article
|
||||||
skip_tag = soup.find(True, {'name':'skip'})
|
skip_tag = soup.find(True, {'name':'skip'})
|
||||||
if skip_tag is not None:
|
if skip_tag is not None:
|
||||||
soup = self.index_to_soup(skip_tag.parent['href'])
|
self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
|
||||||
|
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
|
||||||
|
url += '?pagewanted=all'
|
||||||
|
self.log.error("Skipping ad to article at '%s'" % url)
|
||||||
|
soup = self.index_to_soup(url)
|
||||||
return self.strip_anchors(soup)
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
@@ -280,18 +280,14 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
'''
|
# Skip ad pages served before actual article
|
||||||
refresh = soup.find('meta', {'http-equiv':'refresh'})
|
|
||||||
if refresh is None:
|
|
||||||
return soup
|
|
||||||
content = refresh.get('content').partition('=')[2]
|
|
||||||
raw = self.browser.open('http://www.nytimes.com'+content).read()
|
|
||||||
return BeautifulSoup(raw.decode('cp1252', 'replace'))
|
|
||||||
'''
|
|
||||||
# Skip ad pages before actual article
|
|
||||||
skip_tag = soup.find(True, {'name':'skip'})
|
skip_tag = soup.find(True, {'name':'skip'})
|
||||||
if skip_tag is not None:
|
if skip_tag is not None:
|
||||||
soup = self.index_to_soup(skip_tag.parent['href'])
|
self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
|
||||||
|
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
|
||||||
|
url += '?pagewanted=all'
|
||||||
|
self.log.error("Skipping ad to article at '%s'" % url)
|
||||||
|
soup = self.index_to_soup(url)
|
||||||
return self.strip_anchors(soup)
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user