mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
Fix NYTimes recipe to skip ads
This commit is contained in:
parent
3fcb930777
commit
c83e888bb9
@@ -82,6 +82,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'articleExtras',
|
'articleExtras',
|
||||||
'articleInline',
|
'articleInline',
|
||||||
'blog_sidebar',
|
'blog_sidebar',
|
||||||
|
'businessSearchBar',
|
||||||
'cCol',
|
'cCol',
|
||||||
'entertainmentSearchBar',
|
'entertainmentSearchBar',
|
||||||
'footer',
|
'footer',
|
||||||
@@ -286,9 +287,14 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
raw = self.browser.open('http://www.nytimes.com'+content).read()
|
raw = self.browser.open('http://www.nytimes.com'+content).read()
|
||||||
return BeautifulSoup(raw.decode('cp1252', 'replace'))
|
return BeautifulSoup(raw.decode('cp1252', 'replace'))
|
||||||
'''
|
'''
|
||||||
|
# Skip ad pages before actual article
|
||||||
|
skip_tag = soup.find(True, {'name':'skip'})
|
||||||
|
if skip_tag is not None:
|
||||||
|
soup = self.index_to_soup(skip_tag.parent['href'])
|
||||||
return self.strip_anchors(soup)
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
print "\npostprocess_html()\n"
|
||||||
|
|
||||||
if self.one_picture_per_article:
|
if self.one_picture_per_article:
|
||||||
# Remove all images after first
|
# Remove all images after first
|
||||||
@@ -411,6 +417,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
|
|
||||||
def postprocess_book(self, oeb, opts, log) :
|
def postprocess_book(self, oeb, opts, log) :
|
||||||
|
print "\npostprocess_book()\n"
|
||||||
|
|
||||||
def extract_byline(href) :
|
def extract_byline(href) :
|
||||||
# <meta name="byline" content=
|
# <meta name="byline" content=
|
||||||
|
Loading…
x
Reference in New Issue
Block a user