Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

commit 2d8dfc3a28
parent a60a80d125

    Update NY Times some more
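The two recipe files changed below share the same set of user-tunable class attributes (recursions, match_regexps, getPopularArticles, popularPeriod, headlinesOnly, getTechBlogs, tech_oldest_article, max_articles_per_feed, use_embedded_content). A minimal sketch of how such a recipe is configured is shown here; the import path is calibre's usual one, but the class name and the values given for getTechBlogs and tech_oldest_article are illustrative assumptions, since their defaults are not visible in this diff.

```python
from calibre.web.feeds.news import BasicNewsRecipe  # assumed standard calibre import path


class NYTimesSketch(BasicNewsRecipe):  # illustrative subclass, not the real recipe
    # Follow one level of links so "Related articles" lists can be fetched;
    # set to 0 to omit them entirely.
    recursions = 1
    # Only follow dated article URLs (e.g. /2012/05/...), which keeps index
    # pages from being crawled and speeds up processing.
    match_regexps = [r'/[12][0-9][0-9][0-9]/[0-9]+/']

    headlinesOnly = True        # headlines-only edition; webEdition is ignored when True
    getPopularArticles = True   # add the Most E-mailed / Most Viewed sections
    popularPeriod = '1'         # popularity window in days, typically '1', '7' or '30'
    getTechBlogs = True         # include the technology blog feeds (value assumed)
    tech_oldest_article = 14    # age limit for blog posts, presumably in days (value assumed)

    max_articles_per_feed = 100
    use_embedded_content = False
```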
@@ -14,8 +14,8 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

recursions=1 # set this to zero to omit Related articles lists
match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed

# set getTechBlogs to True to include the technology blogs
# set tech_oldest_article to control article age
@@ -28,12 +28,11 @@ class NYTimes(BasicNewsRecipe):
# set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
# otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
getPopularArticles = True
popularPeriod = '1' # set this to the number of days to include in the measurement
# e.g. 7 will get the most popular measured over the last 7 days
# and 30 will get the most popular measured over 30 days.
# you still only get up to 20 articles in each category

-
# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
headlinesOnly = True

@@ -82,6 +81,7 @@ class NYTimes(BasicNewsRecipe):

# The maximum number of articles that will be downloaded
max_articles_per_feed = 100
+use_embedded_content = False

# Whether to omit duplicates of articles (typically arsing when articles are indexed in
# more than one section). If True, only the first occurance will be downloaded.
@@ -122,7 +122,6 @@ class NYTimes(BasicNewsRecipe):
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
]

-
if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times'
@@ -155,7 +154,7 @@ class NYTimes(BasicNewsRecipe):
earliest_date = date.today()
else:
earliest_date = date.today() - timedelta(days=oldest_web_article)
oldest_article = 365 # by default, a long time ago

__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
@@ -164,12 +163,11 @@ class NYTimes(BasicNewsRecipe):

timefmt = ''

-#simultaneous_downloads = 1 # no longer required to deal with ads
+# simultaneous_downloads = 1 # no longer required to deal with ads

cover_margins = (18,18,'grey99')

-remove_tags_before = dict(id='article')
-remove_tags_after = dict(id='article')
+keep_only_tags = dict(id=['article', 'story', 'content'])
remove_tags = [
dict(attrs={'class':[
'articleFooter',
@@ -184,6 +182,7 @@ class NYTimes(BasicNewsRecipe):
'entry-response module',
'leftNavTabs',
'metaFootnote',
+'inside-story',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
@@ -192,28 +191,28 @@ class NYTimes(BasicNewsRecipe):
'side_tool',
'singleAd',
'postCategory column',
'refer tagRefer', # added for bits blog post
-'entry entry-utility', #added for DealBook
+'entry entry-utility', # added for DealBook
-'entry-tags', #added for DealBook
+'entry-tags', # added for DealBook
-'footer promos clearfix', #added for DealBook
+'footer promos clearfix', # added for DealBook
-'footer links clearfix', #added for DealBook
+'footer links clearfix', # added for DealBook
-'tabsContainer', #added for other blog downloads
+'tabsContainer', # added for other blog downloads
-'column lastColumn', #added for other blog downloads
+'column lastColumn', # added for other blog downloads
-'pageHeaderWithLabel', #added for other gadgetwise downloads
+'pageHeaderWithLabel', # added for other gadgetwise downloads
-'column two', #added for other blog downloads
+'column two', # added for other blog downloads
-'column two last', #added for other blog downloads
+'column two last', # added for other blog downloads
-'column three', #added for other blog downloads
+'column three', # added for other blog downloads
-'column three last', #added for other blog downloads
+'column three last', # added for other blog downloads
-'column four',#added for other blog downloads
+'column four', # added for other blog downloads
-'column four last',#added for other blog downloads
+'column four last', # added for other blog downloads
-'column last', #added for other blog downloads
+'column last', # added for other blog downloads
'entry entry-related',
-'subNavigation tabContent active', #caucus blog navigation
+'subNavigation tabContent active', # caucus blog navigation
'mediaOverlay slideshow',
'wideThumb',
-'video', #added 02-11-2011
+'video', # added 02-11-2011
-'videoHeader',#added 02-11-2011
+'videoHeader', # added 02-11-2011
-'articleInlineVideoHolder', #added 02-11-2011
+'articleInlineVideoHolder', # added 02-11-2011
'assetCompanionAd',
'nytint-sectionHeader',
re.compile('^subNavigation'),
@@ -223,6 +222,7 @@ class NYTimes(BasicNewsRecipe):
'credit'
]}),
dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
+dict(attrs={'class':lambda x: x and 'interactive' in x.split()}),
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
dict(name='div', attrs={'class':'tweet'}),
@@ -231,8 +231,8 @@ class NYTimes(BasicNewsRecipe):
dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
dict(name='div', attrs={'id':re.compile('respond')}), # open
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
dict(id=[
'adxLeaderboard',
'adxSponLink',
@@ -266,16 +266,18 @@ class NYTimes(BasicNewsRecipe):
'side_index',
'side_tool',
'toolsRight',
-'skybox', #added for DealBook
+'skybox', # added for DealBook
-'TopAd', #added for DealBook
+'TopAd', # added for DealBook
-'related-content', #added for DealBook
+'related-content', # added for DealBook
'whats-next',
]),
-dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta'])]
+dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta', 'footer'])]
no_stylesheets = True
extra_css = '''
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
-.credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+.credit { font-weight: normal; text-align: right; font-size:
+50%; line-height:1em; margin-top:5px; margin-left:0;
+margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
@@ -291,7 +293,6 @@ class NYTimes(BasicNewsRecipe):
.asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
.source {text-align: left; font-size: x-small; }'''

-
articles = {}
key = None
ans = []
@@ -313,22 +314,22 @@ class NYTimes(BasicNewsRecipe):
del ans[idx]
idx_max = idx_max-1
continue
-if True: #self.verbose
+if True: # self.verbose
-self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
for article in ans[idx][1]:
total_article_count += 1
-if True: #self.verbose
+if True: # self.verbose
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
idx = idx+1

-self.log( "Queued %d articles" % total_article_count )
+self.log("Queued %d articles" % total_article_count)
return ans

def exclude_url(self,url):
if not url.startswith("http"):
return True
-if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
+if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: # added for DealBook
return True
if 'nytimes.com' not in url:
return True
@@ -412,7 +413,6 @@ class NYTimes(BasicNewsRecipe):
def short_title(self):
return self.title

-
def article_to_soup(self, url_or_raw, raw=False):
from contextlib import closing
import copy
@@ -446,7 +446,6 @@ class NYTimes(BasicNewsRecipe):
usrc = self.preprocess_raw_html(usrc, url_or_raw)
return BeautifulSoup(usrc, markupMassage=nmassage)

-
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
@@ -478,7 +477,7 @@ class NYTimes(BasicNewsRecipe):
if self.webEdition:
date_tag = self.decode_url_date(url)
if date_tag is not None:
if self.oldest_web_article is not None:
if date_tag < self.earliest_date:
self.log("Skipping article %s" % url)
return
@@ -501,7 +500,7 @@ class NYTimes(BasicNewsRecipe):
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
feed = self.key if self.key is not None else 'Uncategorized'
-if not self.articles.has_key(feed):
+if feed not in self.articles:
self.ans.append(feed)
self.articles[feed] = []
self.articles[feed].append(
@@ -536,7 +535,6 @@ class NYTimes(BasicNewsRecipe):
desc = ''
return(title,url,author,desc)

-
have_emailed = False
emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
for h3tag in emailed_soup.findAll('h3'):
@@ -565,7 +563,7 @@ class NYTimes(BasicNewsRecipe):
dict(title=title, url=url, date=strftime('%a, %d %b'),
description=desc, author=author,
content=''))
-viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
+viewed_ans = [(k, popular_articles[k]) for k in key_list if k in popular_articles]
for x in viewed_ans:
ans.append(x)
return ans
@@ -588,10 +586,10 @@ class NYTimes(BasicNewsRecipe):
tech_articles[f.title] = []
for a in f.articles:
tech_articles[f.title].append(
-dict(title=a.title, url=a.url, date=a.date,
+dict(title=a.title, url=a.url.partition('?')[0], date=a.date,
description=a.summary, author=a.author,
content=a.content))
-tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
+tech_ans = [(k, tech_articles[k]) for k in key_list if k in tech_articles]
for x in tech_ans:
ans.append(x)
return ans
@@ -630,10 +628,9 @@ class NYTimes(BasicNewsRecipe):
for lidiv in div.findAll('li'):
self.handle_article(lidiv)

-self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

-
def parse_todays_index(self):

soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
@@ -663,7 +660,7 @@ class NYTimes(BasicNewsRecipe):
if not skipping:
self.handle_article(lidiv)

-self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

def parse_headline_index(self):
@@ -709,13 +706,13 @@ class NYTimes(BasicNewsRecipe):
description = self.tag_to_string(desc,use_alt=False)
else:
description = ''
-if not self.articles.has_key(section_name):
+if section_name not in self.articles:
self.ans.append(section_name)
self.articles[section_name] = []
print('Title '+title+' author '+author)
self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

-self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
return self.filter_ans(self.ans)

def parse_index(self):
@@ -735,7 +732,7 @@ class NYTimes(BasicNewsRecipe):
if kill_all or (self.recursions==0):
a.replaceWith(self.tag_to_string(a,False))
else:
-if a.has_key('href'):
+if 'href' in a:
if a['href'].startswith('http://www.nytimes'):
if not a['href'].endswith('pagewanted=all'):
url = re.sub(r'\?.*', '', a['href'])
@@ -743,13 +740,13 @@ class NYTimes(BasicNewsRecipe):
a.replaceWith(self.tag_to_string(a,False))
else:
a['href'] = url+'?pagewanted=all'
-elif not (a['href'].startswith('http://pogue') or \
+elif not (a['href'].startswith('http://pogue') or
-a['href'].startswith('http://bits') or \
+a['href'].startswith('http://bits') or
-a['href'].startswith('http://travel') or \
+a['href'].startswith('http://travel') or
-a['href'].startswith('http://business') or \
+a['href'].startswith('http://business') or
-a['href'].startswith('http://tech') or \
+a['href'].startswith('http://tech') or
-a['href'].startswith('http://health') or \
+a['href'].startswith('http://health') or
-a['href'].startswith('http://dealbook') or \
+a['href'].startswith('http://dealbook') or
a['href'].startswith('http://open')):
a.replaceWith(self.tag_to_string(a,False))
return soup
@@ -764,7 +761,7 @@ class NYTimes(BasicNewsRecipe):
return None

## print("HANDLING AD FORWARD:")
-## print(soup)
+# print(soup)
if self.keep_only_tags:
body = Tag(soup, 'body')
try:
@@ -774,7 +771,7 @@ class NYTimes(BasicNewsRecipe):
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
pass

def remove_beyond(tag, next):
@@ -802,7 +799,6 @@ class NYTimes(BasicNewsRecipe):

return soup

-
def preprocess_html(self, soup):
#print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
skip_tag = soup.find(True, {'name':'skip'})
@@ -821,7 +817,7 @@ class NYTimes(BasicNewsRecipe):
old_body = soup.find('body')
new_body=Tag(soup,'body')
new_body.append(soup.find('div',attrs={'id':'content'}))
new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
old_body.replaceWith(new_body)
for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
if divr.find(text=re.compile('Sign up')):
@@ -864,9 +860,9 @@ class NYTimes(BasicNewsRecipe):
img = atag.find('img')
if img is not None:
atag.replaceWith(img)
-elif not atag.has_key('href'):
+elif 'href' not in atag:
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
-elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
+elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or
atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
hdr = soup.find('address')
@@ -879,11 +875,11 @@ class NYTimes(BasicNewsRecipe):
sp.append(span_credit)
sp.append(Tag(soup,'br'))

else: # nytimes article

related = [] # these will be the related articles
first_outer = None # first related outer tag
first_related = None # first related tag
for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
for rdiv in soup.findAll('div','columnGroup doubleRule'):
if rdiv.find('h3') is not None:
@@ -916,19 +912,19 @@ class NYTimes(BasicNewsRecipe):
h6tag.extract()
if related != []:
for r in related:
if r.h6: # don't want the anchor inside a h6 tag
r.h6.replaceWith(r.h6.a)
first_related.ul.append(r)
first_related.insert(0,Tag(soup,'hr'))
first_related.append(Tag(soup,'hr'))
first_related['class'] = 'aside'
first_outer.replaceWith(first_related) # replace the outer tag with the related tag

for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
rdiv.extract()

kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag: # remove Op_Ed author head shots
tagline = self.tag_to_string(kicker_tag)
if tagline=='Op-Ed Columnist':
img_div = soup.find('div','inlineImage module')
@@ -937,7 +933,7 @@ class NYTimes(BasicNewsRecipe):

if self.useHighResImages:
try:
-#open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
+# open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
@@ -956,8 +952,10 @@ class NYTimes(BasicNewsRecipe):
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
-imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
-highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
+imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + \
+len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
+highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \
+month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
popupSoup = BeautifulSoup(popuphtml)
highResTag = popupSoup.find('img', {'src':highResImageLink})
if highResTag:
@@ -979,7 +977,7 @@ class NYTimes(BasicNewsRecipe):
self.log("Error pulling high resolution images")

try:
-#in case pulling images failed, delete the enlarge this text
+# in case pulling images failed, delete the enlarge this text
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
@@ -987,11 +985,10 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("Error removing Enlarge this text")

-
return self.strip_anchors(soup,False)

def postprocess_html(self,soup,first_fetch):
if not first_fetch: # remove Related links
for aside in soup.findAll('div','aside'):
aside.extract()
soup = self.strip_anchors(soup,True)
@@ -1000,7 +997,7 @@ class NYTimes(BasicNewsRecipe):
if soup.find('div',attrs={'id':'blogcontent'}) is None:
if first_fetch:
aside = soup.find('div','aside')
if aside is not None: # move the related list to the end of the article
art = soup.find('div',attrs={'id':'article'})
if art is None:
art = soup.find('div',attrs={'class':'article'})
@@ -1061,7 +1058,7 @@ class NYTimes(BasicNewsRecipe):
try:
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
-blogheadline = str(h1) #added for dealbook
+blogheadline = str(h1) # added for dealbook
if h1:
headline = h1.find("nyt_headline")
if headline:
@@ -1069,11 +1066,11 @@ class NYTimes(BasicNewsRecipe):
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
-elif blogheadline.find('entry-title'):#added for dealbook
+elif blogheadline.find('entry-title'): # added for dealbook
-tag = Tag(soup, "h2")#added for dealbook
+tag = Tag(soup, "h2") # added for dealbook
-tag['class'] = "headline"#added for dealbook
+tag['class'] = "headline" # added for dealbook
-tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
+tag.insert(0, self.fixChars(h1.contents[0])) # added for dealbook
-h1.replaceWith(tag)#added for dealbook
+h1.replaceWith(tag) # added for dealbook

else:
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
@@ -1090,7 +1087,7 @@ class NYTimes(BasicNewsRecipe):
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

try:
-#if this is from a blog (dealbook, fix the byline format
+# if this is from a blog (dealbook, fix the byline format
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
if bylineauthor:
tag = Tag(soup, "h6")
@@ -1101,7 +1098,7 @@ class NYTimes(BasicNewsRecipe):
self.log("ERROR: fixing byline author format")

try:
-#if this is a blog (dealbook) fix the credit style for the pictures
+# if this is a blog (dealbook) fix the credit style for the pictures
blogcredit = soup.find('div',attrs={'class':'credit'})
if blogcredit:
tag = Tag(soup, "h6")
@@ -1111,7 +1108,6 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("ERROR: fixing credit format")

-
try:
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
@@ -1135,7 +1131,7 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
-#remove the <strong> update tag
+# remove the <strong> update tag
blogupdated = soup.find('span', {'class':'update'})
if blogupdated:
blogupdated.replaceWith("")
@@ -1184,9 +1180,9 @@ class NYTimes(BasicNewsRecipe):
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
-#account for blank paragraphs and short paragraphs by appending them to longer ones
+# account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
-if len(refparagraph) > 70: #approximately one line of text
+if len(refparagraph) > 70: # approximately one line of text
newpara = shortparagraph + refparagraph
newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
if newparaEm == '':
@@ -1205,4 +1201,3 @@ class NYTimes(BasicNewsRecipe):
self.log("Error creating article descriptions")
return

-
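Besides the comment-spacing cleanup, the recurring substantive edits in both files replace the Python-2-only dict.has_key() calls with `in` membership tests and strip query strings from blog article URLs with str.partition. A small, self-contained sketch of both idioms follows; the dictionary contents and the URL are made-up sample data.

```python
articles = {}        # section name -> list of article dicts (sample data)
feed = 'Technology'

# dict.has_key() no longer exists in Python 3; membership is tested with `in`.
if feed not in articles:
    articles[feed] = []

# Query/tracking parameters are dropped by keeping only the part before '?'.
url = 'http://bits.blogs.nytimes.com/2012/05/01/example-post/?partner=rss&emc=rss'
clean = url.partition('?')[0]
print(clean)  # http://bits.blogs.nytimes.com/2012/05/01/example-post/
```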
@@ -14,8 +14,8 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

recursions=1 # set this to zero to omit Related articles lists
match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed

# set getTechBlogs to True to include the technology blogs
# set tech_oldest_article to control article age
@@ -28,12 +28,11 @@ class NYTimes(BasicNewsRecipe):
# set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
# otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
getPopularArticles = True
popularPeriod = '1' # set this to the number of days to include in the measurement
# e.g. 7 will get the most popular measured over the last 7 days
# and 30 will get the most popular measured over 30 days.
# you still only get up to 20 articles in each category

-
# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
headlinesOnly = False

@@ -82,6 +81,7 @@ class NYTimes(BasicNewsRecipe):

# The maximum number of articles that will be downloaded
max_articles_per_feed = 100
+use_embedded_content = False

# Whether to omit duplicates of articles (typically arsing when articles are indexed in
# more than one section). If True, only the first occurance will be downloaded.
@@ -122,7 +122,6 @@ class NYTimes(BasicNewsRecipe):
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
]

-
if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times'
@@ -155,7 +154,7 @@ class NYTimes(BasicNewsRecipe):
earliest_date = date.today()
else:
earliest_date = date.today() - timedelta(days=oldest_web_article)
oldest_article = 365 # by default, a long time ago

__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
@@ -164,12 +163,11 @@ class NYTimes(BasicNewsRecipe):

timefmt = ''

-#simultaneous_downloads = 1 # no longer required to deal with ads
+# simultaneous_downloads = 1 # no longer required to deal with ads

cover_margins = (18,18,'grey99')

-remove_tags_before = dict(id='article')
-remove_tags_after = dict(id='article')
+keep_only_tags = dict(id=['article', 'story', 'content'])
remove_tags = [
dict(attrs={'class':[
'articleFooter',
@@ -184,6 +182,7 @@ class NYTimes(BasicNewsRecipe):
'entry-response module',
'leftNavTabs',
'metaFootnote',
+'inside-story',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
@@ -192,28 +191,28 @@ class NYTimes(BasicNewsRecipe):
'side_tool',
'singleAd',
'postCategory column',
'refer tagRefer', # added for bits blog post
-'entry entry-utility', #added for DealBook
+'entry entry-utility', # added for DealBook
-'entry-tags', #added for DealBook
+'entry-tags', # added for DealBook
-'footer promos clearfix', #added for DealBook
+'footer promos clearfix', # added for DealBook
-'footer links clearfix', #added for DealBook
+'footer links clearfix', # added for DealBook
-'tabsContainer', #added for other blog downloads
+'tabsContainer', # added for other blog downloads
-'column lastColumn', #added for other blog downloads
+'column lastColumn', # added for other blog downloads
-'pageHeaderWithLabel', #added for other gadgetwise downloads
+'pageHeaderWithLabel', # added for other gadgetwise downloads
-'column two', #added for other blog downloads
+'column two', # added for other blog downloads
-'column two last', #added for other blog downloads
+'column two last', # added for other blog downloads
-'column three', #added for other blog downloads
+'column three', # added for other blog downloads
-'column three last', #added for other blog downloads
+'column three last', # added for other blog downloads
-'column four',#added for other blog downloads
+'column four', # added for other blog downloads
-'column four last',#added for other blog downloads
+'column four last', # added for other blog downloads
-'column last', #added for other blog downloads
+'column last', # added for other blog downloads
'entry entry-related',
-'subNavigation tabContent active', #caucus blog navigation
+'subNavigation tabContent active', # caucus blog navigation
'mediaOverlay slideshow',
'wideThumb',
-'video', #added 02-11-2011
+'video', # added 02-11-2011
-'videoHeader',#added 02-11-2011
+'videoHeader', # added 02-11-2011
-'articleInlineVideoHolder', #added 02-11-2011
+'articleInlineVideoHolder', # added 02-11-2011
'assetCompanionAd',
'nytint-sectionHeader',
re.compile('^subNavigation'),
@@ -223,6 +222,7 @@ class NYTimes(BasicNewsRecipe):
'credit'
]}),
dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
+dict(attrs={'class':lambda x: x and 'interactive' in x.split()}),
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
dict(name='div', attrs={'class':'tweet'}),
@@ -231,8 +231,8 @@ class NYTimes(BasicNewsRecipe):
dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
dict(name='div', attrs={'id':re.compile('respond')}), # open
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
dict(id=[
'adxLeaderboard',
'adxSponLink',
@@ -254,6 +254,7 @@ class NYTimes(BasicNewsRecipe):
'masthead-nav',
'memberTools',
'navigation', 'navigation-ghost', 'navigation-modal', 'navigation-edge',
+'page-footer',
'portfolioInline',
'readerReviews',
'readerReviewsCount',
@@ -265,16 +266,18 @@ class NYTimes(BasicNewsRecipe):
'side_index',
'side_tool',
'toolsRight',
-'skybox', #added for DealBook
+'skybox', # added for DealBook
-'TopAd', #added for DealBook
+'TopAd', # added for DealBook
-'related-content', #added for DealBook
+'related-content', # added for DealBook
'whats-next',
]),
-dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta'])]
+dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta', 'footer'])]
no_stylesheets = True
extra_css = '''
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
-.credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+.credit { font-weight: normal; text-align: right; font-size:
+50%; line-height:1em; margin-top:5px; margin-left:0;
+margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
@@ -290,7 +293,6 @@ class NYTimes(BasicNewsRecipe):
.asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
.source {text-align: left; font-size: x-small; }'''

-
articles = {}
key = None
ans = []
@@ -312,22 +314,22 @@ class NYTimes(BasicNewsRecipe):
del ans[idx]
idx_max = idx_max-1
continue
-if True: #self.verbose
+if True: # self.verbose
-self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
for article in ans[idx][1]:
total_article_count += 1
-if True: #self.verbose
+if True: # self.verbose
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
idx = idx+1

-self.log( "Queued %d articles" % total_article_count )
+self.log("Queued %d articles" % total_article_count)
return ans

def exclude_url(self,url):
if not url.startswith("http"):
return True
-if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
+if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: # added for DealBook
return True
if 'nytimes.com' not in url:
return True
@ -419,7 +421,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
def short_title(self):
|
def short_title(self):
|
||||||
return self.title
|
return self.title
|
||||||
|
|
||||||
|
|
||||||
def article_to_soup(self, url_or_raw, raw=False):
|
def article_to_soup(self, url_or_raw, raw=False):
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
import copy
|
import copy
|
||||||
@ -453,7 +454,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
usrc = self.preprocess_raw_html(usrc, url_or_raw)
|
usrc = self.preprocess_raw_html(usrc, url_or_raw)
|
||||||
return BeautifulSoup(usrc, markupMassage=nmassage)
|
return BeautifulSoup(usrc, markupMassage=nmassage)
|
||||||
|
|
||||||
|
|
||||||
def massageNCXText(self, description):
|
def massageNCXText(self, description):
|
||||||
# Kindle TOC descriptions won't render certain characters
|
# Kindle TOC descriptions won't render certain characters
|
||||||
if description:
|
if description:
|
||||||
@ -485,7 +485,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
if self.webEdition:
|
if self.webEdition:
|
||||||
date_tag = self.decode_url_date(url)
|
date_tag = self.decode_url_date(url)
|
||||||
if date_tag is not None:
|
if date_tag is not None:
|
||||||
if self.oldest_web_article is not None:
|
if self.oldest_web_article is not None:
|
||||||
if date_tag < self.earliest_date:
|
if date_tag < self.earliest_date:
|
||||||
self.log("Skipping article %s" % url)
|
self.log("Skipping article %s" % url)
|
||||||
return
|
return
|
||||||
@ -508,7 +508,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
if authorAttribution:
|
if authorAttribution:
|
||||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||||
feed = self.key if self.key is not None else 'Uncategorized'
|
feed = self.key if self.key is not None else 'Uncategorized'
|
||||||
if not self.articles.has_key(feed):
|
if feed not in self.articles:
|
||||||
self.ans.append(feed)
|
self.ans.append(feed)
|
||||||
self.articles[feed] = []
|
self.articles[feed] = []
|
||||||
self.articles[feed].append(
|
self.articles[feed].append(
|
||||||
@ -543,7 +543,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
desc = ''
|
desc = ''
|
||||||
return(title,url,author,desc)
|
return(title,url,author,desc)
|
||||||
|
|
||||||
|
|
||||||
have_emailed = False
|
have_emailed = False
|
||||||
emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
|
emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
|
||||||
for h3tag in emailed_soup.findAll('h3'):
|
for h3tag in emailed_soup.findAll('h3'):
|
||||||
@ -572,7 +571,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
dict(title=title, url=url, date=strftime('%a, %d %b'),
|
dict(title=title, url=url, date=strftime('%a, %d %b'),
|
||||||
description=desc, author=author,
|
description=desc, author=author,
|
||||||
content=''))
|
content=''))
|
||||||
viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
|
viewed_ans = [(k, popular_articles[k]) for k in key_list if k in popular_articles]
|
||||||
for x in viewed_ans:
|
for x in viewed_ans:
|
||||||
ans.append(x)
|
ans.append(x)
|
||||||
return ans
|
return ans
|
||||||
@ -595,10 +594,10 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
tech_articles[f.title] = []
|
tech_articles[f.title] = []
|
||||||
for a in f.articles:
|
for a in f.articles:
|
||||||
tech_articles[f.title].append(
|
tech_articles[f.title].append(
|
||||||
dict(title=a.title, url=a.url, date=a.date,
|
dict(title=a.title, url=a.url.partition('?')[0], date=a.date,
|
||||||
description=a.summary, author=a.author,
|
description=a.summary, author=a.author,
|
||||||
content=a.content))
|
content=a.content))
|
||||||
tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
|
tech_ans = [(k, tech_articles[k]) for k in key_list if k in tech_articles]
|
||||||
for x in tech_ans:
|
for x in tech_ans:
|
||||||
ans.append(x)
|
ans.append(x)
|
||||||
return ans
|
return ans
|
||||||
@@ -637,10 +636,9 @@ class NYTimes(BasicNewsRecipe):
 for lidiv in div.findAll('li'):
 self.handle_article(lidiv)
 
-self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
 return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
-
 def parse_todays_index(self):
 
 soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
@@ -670,7 +668,7 @@ class NYTimes(BasicNewsRecipe):
 if not skipping:
 self.handle_article(lidiv)
 
-self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
 return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
 def parse_headline_index(self):
@@ -716,13 +714,13 @@ class NYTimes(BasicNewsRecipe):
 description = self.tag_to_string(desc,use_alt=False)
 else:
 description = ''
-if not self.articles.has_key(section_name):
+if section_name not in self.articles:
 self.ans.append(section_name)
 self.articles[section_name] = []
 print('Title '+title+' author '+author)
 self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
 
-self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
 return self.filter_ans(self.ans)
 
 def parse_index(self):
@@ -742,7 +740,7 @@ class NYTimes(BasicNewsRecipe):
 if kill_all or (self.recursions==0):
 a.replaceWith(self.tag_to_string(a,False))
 else:
-if a.has_key('href'):
+if 'href' in a:
 if a['href'].startswith('http://www.nytimes'):
 if not a['href'].endswith('pagewanted=all'):
 url = re.sub(r'\?.*', '', a['href'])
@@ -750,13 +748,13 @@ class NYTimes(BasicNewsRecipe):
 a.replaceWith(self.tag_to_string(a,False))
 else:
 a['href'] = url+'?pagewanted=all'
-elif not (a['href'].startswith('http://pogue') or \
-a['href'].startswith('http://bits') or \
-a['href'].startswith('http://travel') or \
-a['href'].startswith('http://business') or \
-a['href'].startswith('http://tech') or \
-a['href'].startswith('http://health') or \
-a['href'].startswith('http://dealbook') or \
+elif not (a['href'].startswith('http://pogue') or
+a['href'].startswith('http://bits') or
+a['href'].startswith('http://travel') or
+a['href'].startswith('http://business') or
+a['href'].startswith('http://tech') or
+a['href'].startswith('http://health') or
+a['href'].startswith('http://dealbook') or
 a['href'].startswith('http://open')):
 a.replaceWith(self.tag_to_string(a,False))
 return soup
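strip_anchors normalizes links to nytimes.com articles so the single-page view is fetched: any existing query string is removed and '?pagewanted=all' is appended, while links to the whitelisted blog hosts (pogue, bits, travel, business, tech, health, dealbook, open) are left in place and all other anchors are flattened to plain text. A minimal sketch of the URL normalization, assuming a bare href string (illustrative, not the recipe verbatim):

    import re

    def single_page_url(href):
        # drop any query string, then request the single-page view of the article
        base = re.sub(r'\?.*', '', href)          # equivalent to href.partition('?')[0]
        return base + '?pagewanted=all'

    print(single_page_url('http://www.nytimes.com/2011/01/18/us/politics/story.html?hp'))
    # http://www.nytimes.com/2011/01/18/us/politics/story.html?pagewanted=all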
@@ -771,7 +769,7 @@ class NYTimes(BasicNewsRecipe):
 return None
 
 ## print("HANDLING AD FORWARD:")
-## print(soup)
+# print(soup)
 if self.keep_only_tags:
 body = Tag(soup, 'body')
 try:
@@ -781,7 +779,7 @@ class NYTimes(BasicNewsRecipe):
 for tag in soup.find('body').findAll(**spec):
 body.insert(len(body.contents), tag)
 soup.find('body').replaceWith(body)
 except AttributeError: # soup has no body element
 pass
 
 def remove_beyond(tag, next):
@@ -809,7 +807,6 @@ class NYTimes(BasicNewsRecipe):
 
 return soup
 
-
 def preprocess_html(self, soup):
 #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
 skip_tag = soup.find(True, {'name':'skip'})
@@ -828,7 +825,7 @@ class NYTimes(BasicNewsRecipe):
 old_body = soup.find('body')
 new_body=Tag(soup,'body')
 new_body.append(soup.find('div',attrs={'id':'content'}))
 new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
 old_body.replaceWith(new_body)
 for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
 if divr.find(text=re.compile('Sign up')):
@@ -871,9 +868,9 @@ class NYTimes(BasicNewsRecipe):
 img = atag.find('img')
 if img is not None:
 atag.replaceWith(img)
-elif not atag.has_key('href'):
+elif 'href' not in atag:
 atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
-elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
+elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or
 atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
 atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
 hdr = soup.find('address')
@@ -886,11 +883,11 @@ class NYTimes(BasicNewsRecipe):
 sp.append(span_credit)
 sp.append(Tag(soup,'br'))
 
 else: # nytimes article
 
 related = [] # these will be the related articles
 first_outer = None # first related outer tag
 first_related = None # first related tag
 for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
 for rdiv in soup.findAll('div','columnGroup doubleRule'):
 if rdiv.find('h3') is not None:
@@ -923,19 +920,19 @@ class NYTimes(BasicNewsRecipe):
 h6tag.extract()
 if related != []:
 for r in related:
 if r.h6: # don't want the anchor inside a h6 tag
 r.h6.replaceWith(r.h6.a)
 first_related.ul.append(r)
 first_related.insert(0,Tag(soup,'hr'))
 first_related.append(Tag(soup,'hr'))
 first_related['class'] = 'aside'
 first_outer.replaceWith(first_related) # replace the outer tag with the related tag
 
 for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
 rdiv.extract()
 
 kicker_tag = soup.find(attrs={'class':'kicker'})
 if kicker_tag: # remove Op_Ed author head shots
 tagline = self.tag_to_string(kicker_tag)
 if tagline=='Op-Ed Columnist':
 img_div = soup.find('div','inlineImage module')
@@ -944,7 +941,7 @@ class NYTimes(BasicNewsRecipe):
 
 if self.useHighResImages:
 try:
-#open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
+# open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
 enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
 if enlargeThisList:
 for popupref in enlargeThisList:
@@ -963,8 +960,10 @@ class NYTimes(BasicNewsRecipe):
 year = str(st.tm_year)
 month = "%.2d" % st.tm_mon
 day = "%.2d" % st.tm_mday
-imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
-highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
+imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + \
+    len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
+highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \
+    month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
 popupSoup = BeautifulSoup(popuphtml)
 highResTag = popupSoup.find('img', {'src':highResImageLink})
 if highResTag:
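The imgstartpos/highResImageLink pair above locates the full-resolution JPEG inside the "Enlarge this Image" pop-up by plain string searching on a date-based graphics8.nytimes.com prefix. A rough sketch of the same extraction, assuming popuphtml holds the pop-up markup and using today's date in place of the recipe's parsed article date (illustrative values only):

    import time

    st = time.localtime()
    year, month, day = str(st.tm_year), "%.2d" % st.tm_mon, "%.2d" % st.tm_mday
    prefix = 'http://graphics8.nytimes.com/images/%s/%s/%s/' % (year, month, day)

    popuphtml = '... <img src="%ssample-photo/popup.jpg"> ...' % prefix   # stand-in pop-up markup
    imgstartpos = popuphtml.find(prefix) + len(prefix)     # first character after the date prefix
    end = popuphtml.find('.jpg', imgstartpos) + 4          # include the .jpg extension
    highResImageLink = prefix + popuphtml[imgstartpos:end]
    print(highResImageLink)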
@@ -986,7 +985,7 @@ class NYTimes(BasicNewsRecipe):
 self.log("Error pulling high resolution images")
 
 try:
-#in case pulling images failed, delete the enlarge this text
+# in case pulling images failed, delete the enlarge this text
 enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
 if enlargeThisList:
 for popupref in enlargeThisList:
@@ -994,11 +993,10 @@ class NYTimes(BasicNewsRecipe):
 except:
 self.log("Error removing Enlarge this text")
 
-
 return self.strip_anchors(soup,False)
 
 def postprocess_html(self,soup,first_fetch):
 if not first_fetch: # remove Related links
 for aside in soup.findAll('div','aside'):
 aside.extract()
 soup = self.strip_anchors(soup,True)
@@ -1007,7 +1005,7 @@ class NYTimes(BasicNewsRecipe):
 if soup.find('div',attrs={'id':'blogcontent'}) is None:
 if first_fetch:
 aside = soup.find('div','aside')
 if aside is not None: # move the related list to the end of the article
 art = soup.find('div',attrs={'id':'article'})
 if art is None:
 art = soup.find('div',attrs={'class':'article'})
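On the first fetch, postprocess_html detaches the "aside" block that preprocess_html built from the Related links and re-attaches it at the bottom of the article div, so the related list follows the article text instead of floating inline. A minimal sketch of that move, using the standalone bs4 package in place of calibre's bundled BeautifulSoup and simplified markup (not the recipe's full logic):

    from bs4 import BeautifulSoup   # the recipe imports calibre's bundled BeautifulSoup instead

    html = '<div id="article"><p>Body text</p><div class="aside">Related links</div><p>More text</p></div>'
    soup = BeautifulSoup(html, 'html.parser')
    aside = soup.find('div', 'aside')
    if aside is not None:
        art = soup.find('div', attrs={'id': 'article'})
        art.append(aside.extract())   # detach the aside, then re-attach it at the end of the article
    print(soup)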
@@ -1068,7 +1066,7 @@ class NYTimes(BasicNewsRecipe):
 try:
 # Change <nyt_headline> to <h2>
 h1 = soup.find('h1')
-blogheadline = str(h1) #added for dealbook
+blogheadline = str(h1) # added for dealbook
 if h1:
 headline = h1.find("nyt_headline")
 if headline:
@@ -1076,11 +1074,11 @@ class NYTimes(BasicNewsRecipe):
 tag['class'] = "headline"
 tag.insert(0, self.fixChars(headline.contents[0]))
 h1.replaceWith(tag)
-elif blogheadline.find('entry-title'):#added for dealbook
-tag = Tag(soup, "h2")#added for dealbook
-tag['class'] = "headline"#added for dealbook
-tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
-h1.replaceWith(tag)#added for dealbook
+elif blogheadline.find('entry-title'): # added for dealbook
+tag = Tag(soup, "h2") # added for dealbook
+tag['class'] = "headline" # added for dealbook
+tag.insert(0, self.fixChars(h1.contents[0])) # added for dealbook
+h1.replaceWith(tag) # added for dealbook
 
 else:
 # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
@@ -1097,7 +1095,7 @@ class NYTimes(BasicNewsRecipe):
 self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
 
 try:
-#if this is from a blog (dealbook, fix the byline format
+# if this is from a blog (dealbook, fix the byline format
 bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
 if bylineauthor:
 tag = Tag(soup, "h6")
@@ -1108,7 +1106,7 @@ class NYTimes(BasicNewsRecipe):
 self.log("ERROR: fixing byline author format")
 
 try:
-#if this is a blog (dealbook) fix the credit style for the pictures
+# if this is a blog (dealbook) fix the credit style for the pictures
 blogcredit = soup.find('div',attrs={'class':'credit'})
 if blogcredit:
 tag = Tag(soup, "h6")
@@ -1118,7 +1116,6 @@ class NYTimes(BasicNewsRecipe):
 except:
 self.log("ERROR: fixing credit format")
 
-
 try:
 # Change <h1> to <h3> - used in editorial blogs
 masthead = soup.find("h1")
@@ -1142,7 +1139,7 @@ class NYTimes(BasicNewsRecipe):
 except:
 self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
 try:
-#remove the <strong> update tag
+# remove the <strong> update tag
 blogupdated = soup.find('span', {'class':'update'})
 if blogupdated:
 blogupdated.replaceWith("")
@@ -1191,9 +1188,9 @@ class NYTimes(BasicNewsRecipe):
 paras = articlebody.findAll('p')
 for p in paras:
 refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
-#account for blank paragraphs and short paragraphs by appending them to longer ones
+# account for blank paragraphs and short paragraphs by appending them to longer ones
 if len(refparagraph) > 0:
-if len(refparagraph) > 70: #approximately one line of text
+if len(refparagraph) > 70: # approximately one line of text
 newpara = shortparagraph + refparagraph
 newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
 if newparaEm == '':
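The description-building hunk above skips blank paragraphs, treats anything under roughly 70 characters as too short to stand alone and carries it forward onto the next paragraph, and splits the combined text on an em dash so a leading dateline can be dropped. A rough sketch of that per-paragraph step with made-up text, not the recipe's exact code (the logic that follows the visible hunk is not shown here):

    # shortparagraph holds a preceding too-short paragraph, refparagraph the current one
    shortparagraph = ''
    refparagraph = 'WASHINGTON \u2014 Lawmakers moved on Tuesday to advance the bill after weeks of delay.'

    if len(refparagraph) > 70:                      # roughly one rendered line of text
        newpara = shortparagraph + refparagraph
        dateline, em, desc = newpara.partition('\u2014')
        snippet = newpara if em == '' else desc     # keep the text after the dateline when one is present
        print(snippet.strip())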
@@ -1212,4 +1209,3 @@ class NYTimes(BasicNewsRecipe):
 self.log("Error creating article descriptions")
 return
 
-