Updated recipes for NYTimes and Newsweek. Also added support for setting the author of downloaded articles in the metadata.

commit edf5bcbab6
parent 25538e5c04
Author: Kovid Goyal
Date:   2009-07-09 13:10:29 -06:00
4 changed files with 61 additions and 13 deletions


@@ -16,7 +16,7 @@ class Article(object):

     time_offset = datetime.now() - datetime.utcnow()

-    def __init__(self, id, title, url, summary, published, content):
+    def __init__(self, id, title, url, author, summary, published, content):
         self.downloaded = False
         self.id = id
         self.title = title.strip() if title else title
@@ -26,6 +26,9 @@ class Article(object):
         except:
             pass
         self.url = url
+        self.author = author
+        if author and not isinstance(author, unicode):
+            author = author.decode('utf-8', 'replace')
         self.summary = summary
         if summary and not isinstance(summary, unicode):
             summary = summary.decode('utf-8', 'replace')
@@ -39,6 +42,7 @@ class Article(object):
                 traceback.print_exc()
                 summary = u''
         self.text_summary = summary
+        self.author = author
         self.content = content
         self.date = published
         self.utctime = datetime(*self.date[:6])
@@ -50,10 +54,11 @@ class Article(object):
         (u'''\
 Title       : %s
 URL         : %s
+Author      : %s
 Summary     : %s
 Date        : %s
 Has content : %s
-        '''%(self.title, self.url, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
+        '''%(self.title, self.url, self.author, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
             bool(self.content))).encode('utf-8')

     def __str__(self):
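
With this change an Article can be constructed with its author up front, and a new Author field shows up in its repr. A minimal sketch of the updated constructor, assuming the class is importable as calibre.web.feeds.Article (its module in this tree) and written in the codebase's Python 2 idiom; the id, URL and names are made up:

    import time
    from calibre.web.feeds import Article  # import path assumed from this tree

    art = Article('id-1', 'Sample story', 'http://example.com/story',
                  'Jane Doe', 'A short summary', time.gmtime(), None)
    print repr(art)  # now includes 'Author      : Jane Doe'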
@@ -124,7 +129,8 @@ class Feed(object):
             link = item.get('url', None)
             description = item.get('description', '')
             content = item.get('content', '')
-            article = Article(id, title, link, description, published, content)
+            author = item.get('author', '')
+            article = Article(id, title, link, author, description, published, content)
             delta = datetime.utcnow() - article.utctime
             if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                 self.articles.append(article)
@@ -149,7 +155,9 @@ class Feed(object):
                 self.logger.warning('Failed to get link for %s'%title)
                 self.logger.debug(traceback.format_exc())
                 link = None

         description = item.get('summary', None)
+        author = item.get('author', None)

         content = [i.value for i in item.get('content', []) if i.value]
         content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')
@@ -159,7 +167,7 @@ class Feed(object):
             content = None
         if not link and not content:
             return
-        article = Article(id, title, link, description, published, content)
+        article = Article(id, title, link, author, description, published, content)
         delta = datetime.utcnow() - article.utctime
         if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
             self.articles.append(article)
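
Both Feed code paths above take the author from the parsed feed item, so whatever feedparser exposes under an entry's author key is now picked up without any recipe changes. A quick standalone way to see what a given feed provides (the URL is a placeholder):

    import feedparser

    d = feedparser.parse('http://example.com/rss')  # placeholder feed URL
    for item in d.entries:
        print item.get('title', ''), '--', item.get('author', '(no author)')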


@@ -884,6 +884,9 @@ class BasicNewsRecipe(Recipe):
                 for j, a in enumerate(f):
                     if getattr(a, 'downloaded', False):
                         adir = 'feed_%d/article_%d/'%(num, j)
+                        auth = a.author
+                        if not auth:
+                            auth = None
                         desc = a.text_summary
                         if not desc:
                             desc = None
@@ -893,7 +896,7 @@ class BasicNewsRecipe(Recipe):
                             self.play_order_counter += 1
                             po = self.play_order_counter
                         parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                                        play_order=po, description=desc)
+                                        play_order=po, author=auth, description=desc)
                         last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                         for sp in a.sub_pages:
                             prefix = os.path.commonprefix([opf_path, sp])
@@ -925,11 +928,15 @@ class BasicNewsRecipe(Recipe):
                 if po is None:
                     self.play_order_counter += 1
                     po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
                 desc = getattr(f, 'description', None)
                 if not desc:
                     desc = None
                 feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
-                    f.title, play_order=po, description=desc))
+                    f.title, play_order=po, description=desc, author=auth))
         else:
             entries.append('feed_%d/index.html'%0)
             feed_index(0, toc)
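
The TOC entries built here now carry the author through add_item(). From a recipe's side, the simplest way to feed that pipeline is to include an 'author' key in the article dictionaries returned by parse_index(), exactly as the NYTimes recipe below does. A hypothetical minimal index:

    def parse_index(self):
        # One made-up section with one made-up article; the 'author' key
        # ends up on Article.author and then on the TOC entry above.
        return [('Front Page', [
            {'title': 'Sample story', 'url': 'http://example.com/story',
             'date': 'Thu, 09 Jul', 'author': 'Jane Doe',
             'description': '', 'content': ''},
        ])]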


@@ -9,13 +9,26 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class Newsweek(BasicNewsRecipe):

     title = 'Newsweek'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal and Sujata Raman'
     description = 'Weekly news and current affairs in the US'
     no_stylesheets = True
+    extra_css = '''
+        h1{color:#383733;font-family:Arial,Helvetica,sans-serif;font-size:large;}
+        .deck{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;color:#383733;font-size:small;}
+        .articleInfo{color:#474537;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
+        .authorName{color:#B61900;font-family:Arial,Helvetica,sans-serif;font-size:medium;}
+        .authorInfo{color:#0066CC;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
+        .articleUpdated{font-size:xx-small;color:#73726C;font-family:Arial,Helvetica,sans-serif;}
+        .issueDate{font-family:Arial,Helvetica,sans-serif;font-size:xx-small;font-style:italic;}
+        .story{color:#333333;font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;font-size:small;}
+        .photoCredit{color:#999999;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
+        .photoCaption{color:#0A0A09;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;font-weight:bold;}'''
     encoding = 'utf-8'
     language = _('English')

     remove_tags = [
-        {'class':['navbar', 'ad', 'sponsorLinksArticle', 'mm-content',
+        {'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
             'inline-social-links-wrapper', 'email-article',
             'comments-and-social-links-wrapper', 'EmailArticleBlock']},
         {'id' : ['footer', 'ticker-data', 'topTenVertical',
@@ -24,8 +37,6 @@ class Newsweek(BasicNewsRecipe):
         {'class': re.compile('related-cloud')},
     ]
     keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent']}]
-
-    recursions = 1
-    match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
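
The Newsweek changes are pure recipe configuration: extra_css restyles the downloaded pages (no_stylesheets discards the site's own CSS first), while keep_only_tags and remove_tags prune the markup. The same pattern in a stripped-down sketch, with illustrative class names rather than Newsweek's:

    from calibre.web.feeds.news import BasicNewsRecipe

    class SampleRecipe(BasicNewsRecipe):
        title = 'Sample'
        no_stylesheets = True  # drop site CSS so extra_css wins
        extra_css = 'h1{font-size:large;} .byline{color:#B61900;}'
        keep_only_tags = [{'class': 'articlecontent'}]  # keep just the story body
        remove_tags = [{'class': ['ad', 'navbar']}]     # then prune leftovers inside it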


@@ -16,7 +16,7 @@ class NYTimes(BasicNewsRecipe):
     __author__ = 'Kovid Goyal'
     language = _('English')
     description = 'Daily news from the New York Times (subscription version)'
-    timefmt = ' [%a, %d %b, %Y]'
+    timefmt = ''
     needs_subscription = True
     remove_tags_before = dict(id='article')
     remove_tags_after  = dict(id='article')
@@ -46,11 +46,22 @@ class NYTimes(BasicNewsRecipe):
         articles = {}
         key = None
         ans = []
+        allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
+            'New York','Business Day','Sports','Dining','Arts','Home','Styles']
+        excludeSectionKeywords = ['Sports','Dining','Styles']
+
+        # Find each instance of class="section-headline", class="story", class="story headline"
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline']}):

             if div['class'] == 'section-headline':
                 key = string.capwords(feed_title(div))
+                excluded = re.compile('|'.join(excludeSectionKeywords))
+                if excluded.search(key):
+                    self.log("Skipping section %s" % key)
+                    continue
+
                 articles[key] = []
                 ans.append(key)
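
The skip test uses a regex search rather than an exact match, so a section title only needs to contain one of the keywords to be excluded. The same construction in isolation:

    import re

    excludeSectionKeywords = ['Sports', 'Dining', 'Styles']
    excluded = re.compile('|'.join(excludeSectionKeywords))
    for key in ['Sports Sunday', 'Business Day', 'Dining In, Dining Out']:
        print key, '->', ('skip' if excluded.search(key) else 'keep')
    # Sports Sunday and Dining In, Dining Out are skipped; Business Day is kept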
@@ -61,24 +72,35 @@ class NYTimes(BasicNewsRecipe):
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
                 title = self.tag_to_string(a, use_alt=True).strip()
                 description = ''
                 pubdate = strftime('%a, %d %b')
                 summary = div.find(True, attrs={'class':'summary'})
                 if summary:
                     description = self.tag_to_string(summary, use_alt=False)
+
+                author = ''
+                authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
+                if authorAttribution:
+                    author = self.tag_to_string(authorAttribution, use_alt=False)
+                else:
+                    authorAttribution = div.find(True, attrs={'class':'byline'})
+                    if authorAttribution:
+                        author = self.tag_to_string(authorAttribution, use_alt=False)
+
                 feed = key if key is not None else 'Uncategorized'
                 if not articles.has_key(feed):
                     articles[feed] = []
                 if not 'podcasts' in url:
                     articles[feed].append(
                         dict(title=title, url=url, date=pubdate,
-                            description=description,
+                            description=description, author=author,
                             content=''))
         ans = self.sort_index_by(ans, {'The Front Page':-1,
                 'Dining In, Dining Out':1,
                 'Obituaries':2})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans

     def preprocess_html(self, soup):
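
The author lookup tries a storyheadline-author element first and falls back to a generic byline. The same two-step lookup sketched standalone with the BeautifulSoup 3 API calibre bundled at the time; the HTML snippet is invented:

    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3

    html = '''<div class="story">
    <a href="http://example.com/story">Headline</a>
    <div class="byline">By JANE DOE</div>
    </div>'''
    div = BeautifulSoup(html).find('div', attrs={'class': 'story'})

    author = ''
    attribution = div.find(True, attrs={'class': 'storyheadline-author'})
    if not attribution:
        attribution = div.find(True, attrs={'class': 'byline'})  # fallback
    if attribution:
        author = attribution.string.strip()
    print author  # -> By JANE DOE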