mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Updated NYTimes Top Stories recipe
This commit is contained in:
parent
2efa863948
commit
9d292633c7
@ -14,8 +14,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
title = 'New York Times Top Stories'
|
title = 'New York Times Top Stories'
|
||||||
__author__ = 'GRiker'
|
__author__ = 'GRiker'
|
||||||
language = 'en'
|
language = _('English')
|
||||||
|
|
||||||
description = 'Top Stories from the New York Times'
|
description = 'Top Stories from the New York Times'
|
||||||
|
|
||||||
# List of sections typically included in Top Stories. Use a keyword from the
|
# List of sections typically included in Top Stories. Use a keyword from the
|
||||||
@ -56,11 +55,14 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
timefmt = ''
|
timefmt = ''
|
||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
keep_only_tags = [ dict(attrs={ 'id':['article']})]
|
keep_only_tags = [ dict(attrs={ 'id':['article']}),
|
||||||
|
dict(attrs={'class':['blog wrap']}) ]
|
||||||
|
|
||||||
remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix',
|
remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix',
|
||||||
'inlineVideo left brightcove']}),
|
'inlineVideo left brightcove', 'entry-meta']}),
|
||||||
dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
|
dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
|
||||||
'portfolioInline','articleInline','readerscomment']}) ]
|
'portfolioInline','articleInline','readerscomment',
|
||||||
|
'nytRating']}) ]
|
||||||
|
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
@ -207,7 +209,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
(i.string.strip() > "") and \
|
(i.string.strip() > "") and \
|
||||||
not isinstance(i,Comment):
|
not isinstance(i,Comment):
|
||||||
contentString = i.strip().encode('utf-8')
|
contentString = i.strip().encode('utf-8')
|
||||||
if contentString[0:3] == 'By ' :
|
if contentString[0:3] == 'By ' and contentString[4].isupper() :
|
||||||
bylines.append(contentString)
|
bylines.append(contentString)
|
||||||
else :
|
else :
|
||||||
descriptions.append(contentString)
|
descriptions.append(contentString)
|
||||||
@ -265,14 +267,31 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
refresh = soup.find('meta', {'http-equiv':'refresh'})
|
# refresh = soup.find('meta', {'http-equiv':'refresh'})
|
||||||
if refresh is None:
|
# if refresh is None:
|
||||||
return self.strip_anchors(soup)
|
# return self.strip_anchors(soup)
|
||||||
|
#
|
||||||
content = refresh.get('content').partition('=')[2]
|
# content = refresh.get('content').partition('=')[2]
|
||||||
raw = self.browser.open('http://www.nytimes.com'+content).read()
|
# raw = self.browser.open('http://www.nytimes.com'+content).read()
|
||||||
soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
|
# soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
|
||||||
return self.strip_anchors(soup)
|
return self.strip_anchors(soup)
|
||||||
|
refresh = soup.find('meta', {'http-equiv':'refresh'})
|
||||||
|
if refresh is not None:
|
||||||
|
content = refresh.get('content').partition('=')[2]
|
||||||
|
raw = self.browser.open('http://www.nytimes.com'+content).read()
|
||||||
|
soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
|
||||||
|
|
||||||
|
soup = self.strip_anchors(soup)
|
||||||
|
|
||||||
|
# Test for empty content
|
||||||
|
body = soup.find('body')
|
||||||
|
tagCount = len(body.findAll(True))
|
||||||
|
if tagCount:
|
||||||
|
# print "%d tags in article" % tagCount
|
||||||
|
return soup
|
||||||
|
else:
|
||||||
|
print "no allowed content found, removing article"
|
||||||
|
raise StringError
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user