Updated recipes for NYTimes and Newsweek. Also added support for setting the author of downloaded articles in the metadata.
This commit is contained in:

parent 25538e5c04
commit edf5bcbab6
@@ -16,7 +16,7 @@ class Article(object):
 
     time_offset = datetime.now() - datetime.utcnow()
 
-    def __init__(self, id, title, url, summary, published, content):
+    def __init__(self, id, title, url, author, summary, published, content):
         self.downloaded = False
         self.id = id
         self.title = title.strip() if title else title
@@ -26,6 +26,9 @@ class Article(object):
         except:
             pass
         self.url = url
+        self.author = author
+        if author and not isinstance(author, unicode):
+            author = author.decode('utf-8', 'replace')
         self.summary = summary
         if summary and not isinstance(summary, unicode):
             summary = summary.decode('utf-8', 'replace')
@@ -39,6 +42,7 @@ class Article(object):
             traceback.print_exc()
             summary = u''
         self.text_summary = summary
+        self.author = author
         self.content = content
         self.date = published
         self.utctime = datetime(*self.date[:6])
@@ -50,10 +54,11 @@ class Article(object):
 (u'''\
 Title       : %s
 URL         : %s
+Author      : %s
 Summary     : %s
 Date        : %s
 Has content : %s
-'''%(self.title, self.url, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
+'''%(self.title, self.url, self.author, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
      bool(self.content))).encode('utf-8')
 
     def __str__(self):
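Taken together, the Article hunks thread a new author argument through the constructor: it is coerced to unicode with the same decode('utf-8', 'replace') guard already applied to the summary, stored on the instance, and surfaced in the repr banner. A minimal sketch of constructing the updated class, assuming Article is importable from calibre.web.feeds (the import path and all field values here are assumptions for illustration):

    import time
    from calibre.web.feeds import Article  # import path assumed

    article = Article(
        id='example-id',                # illustrative values throughout
        title='Example headline',
        url='http://example.com/story',
        author='Jane Doe',              # the new fourth positional argument
        summary='A short example summary.',
        published=time.localtime(),     # Article builds utctime from this time tuple
        content='',
    )
    print(repr(article))                # the banner now includes an 'Author :' line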
@@ -124,7 +129,8 @@ class Feed(object):
         link = item.get('url', None)
         description = item.get('description', '')
         content = item.get('content', '')
-        article = Article(id, title, link, description, published, content)
+        author = item.get('author', '')
+        article = Article(id, title, link, author, description, published, content)
         delta = datetime.utcnow() - article.utctime
         if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
             self.articles.append(article)
@@ -149,7 +155,9 @@ class Feed(object):
             self.logger.warning('Failed to get link for %s'%title)
             self.logger.debug(traceback.format_exc())
             link = None
 
         description = item.get('summary', None)
+        author = item.get('author', None)
+
         content = [i.value for i in item.get('content', []) if i.value]
         content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')
@@ -159,7 +167,7 @@ class Feed(object):
             content = None
         if not link and not content:
             return
-        article = Article(id, title, link, description, published, content)
+        article = Article(id, title, link, author, description, published, content)
         delta = datetime.utcnow() - article.utctime
         if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
             self.articles.append(article)
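Both Feed population paths now read the author off the parsed item with item.get('author', ...), so feeds that carry author or dc:creator elements flow through without any recipe changes. In calibre these items come from feedparser; a standalone look at what feedparser exposes for that key (the feed URL is illustrative):

    import feedparser

    d = feedparser.parse('http://example.com/rss')  # illustrative feed URL
    for entry in d.entries:
        # feedparser maps <author>/<dc:creator> onto entry['author'];
        # .get() returns the default instead of raising when a feed omits it.
        print('%s -- %s' % (entry.get('title', ''), entry.get('author', None)))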
@@ -884,6 +884,9 @@ class BasicNewsRecipe(Recipe):
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
                     adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
                     desc = a.text_summary
                     if not desc:
                         desc = None
@@ -893,7 +896,7 @@ class BasicNewsRecipe(Recipe):
                         self.play_order_counter += 1
                         po = self.play_order_counter
                     parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                            play_order=po, description=desc)
+                            play_order=po, author=auth, description=desc)
                     last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
@@ -925,11 +928,15 @@ class BasicNewsRecipe(Recipe):
                 if po is None:
                     self.play_order_counter += 1
                     po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
                 desc = getattr(f, 'description', None)
                 if not desc:
                     desc = None
                 feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
-                    f.title, play_order=po, description=desc))
+                    f.title, play_order=po, description=desc, author=auth))
 
         else:
             entries.append('feed_%d/index.html'%0)
             feed_index(0, toc)
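Both hunks normalize the new metadata the same way before it reaches the table of contents: an empty author or description is collapsed to None so the TOC entry omits the field rather than carrying an empty string. That guard in isolation, using a hypothetical add_item stand-in rather than calibre's real TOC class:

    def add_item(href, title, author=None, description=None):
        # Hypothetical stand-in: only keep metadata that is actually present.
        entry = {'href': href, 'title': title}
        if author is not None:
            entry['author'] = author
        if description is not None:
            entry['description'] = description
        return entry

    auth = ''        # e.g. an article whose feed carried no author
    if not auth:     # the same guard as both hunks: falsy collapses to None
        auth = None
    print(add_item('feed_0/index.html', 'World News', author=auth))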
@@ -9,13 +9,26 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class Newsweek(BasicNewsRecipe):
 
     title = 'Newsweek'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal and Sujata Raman'
     description = 'Weekly news and current affairs in the US'
     no_stylesheets = True
+
+    extra_css = '''
+    h1{color:#383733;font-family:Arial,Helvetica,sans-serif;font-size:large;}
+    .deck{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;color:#383733;font-size:small;}
+    .articleInfo{color:#474537;font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+    .authorName{color:#B61900;font-family:Arial,Helvetica,sans-serif;font-size:medium;}
+    .authorInfo{color:#0066CC;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
+    .articleUpdated{ font-size:xx-small; color:#73726C; font-family:Arial,Helvetica,sans-serif;}
+    .issueDate{font-family :Arial,Helvetica,sans-serif;font-size:xx-small;font-style:italic;}
+    .story{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;font-size:small;}
+    .photoCredit{color:#999999;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
+    .photoCaption{color:#0A0A09;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;font-weight:bold;}'''
+
     encoding = 'utf-8'
     language = _('English')
     remove_tags = [
-        {'class':['navbar', 'ad', 'sponsorLinksArticle', 'mm-content',
+        {'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
           'inline-social-links-wrapper', 'email-article',
           'comments-and-social-links-wrapper', 'EmailArticleBlock']},
         {'id' : ['footer', 'ticker-data', 'topTenVertical',
@@ -24,8 +37,6 @@ class Newsweek(BasicNewsRecipe):
               {'class': re.compile('related-cloud')},
           ]
     keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent']}]
-
-
     recursions = 1
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
 
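The Newsweek changes are pure recipe configuration: a shared author credit, an extra_css block that restyles the downloaded pages, and a longer class blacklist in remove_tags. A minimal skeleton showing how those options combine in any BasicNewsRecipe subclass (the feed URL, class names, and id values here are illustrative, not Newsweek's):

    import re
    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title          = 'Example Site'
        __author__     = 'Jane Doe'
        no_stylesheets = True   # discard the site's CSS...
        extra_css      = 'h1{font-family:Arial,Helvetica,sans-serif;font-size:large;}'  # ...and supply our own
        # Drop navigation and social chrome by class or id before conversion;
        # compiled regexes are allowed too, as in the Newsweek list above.
        remove_tags = [
            {'class': ['navbar', 'ad']},
            {'id': ['footer']},
            {'class': re.compile('related-cloud')},
        ]
        feeds = [('News', 'http://example.com/rss')]  # illustrative feed URL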
@@ -16,7 +16,7 @@ class NYTimes(BasicNewsRecipe):
     __author__ = 'Kovid Goyal'
     language = _('English')
     description = 'Daily news from the New York Times (subscription version)'
-    timefmt = ' [%a, %d %b, %Y]'
+    timefmt = ''
     needs_subscription = True
     remove_tags_before = dict(id='article')
     remove_tags_after  = dict(id='article')
@@ -46,11 +46,22 @@ class NYTimes(BasicNewsRecipe):
         articles = {}
         key = None
         ans = []
+        allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
+                              'New York','Business Day','Sports','Dining','Arts','Home','Styles']
+        excludeSectionKeywords = ['Sports','Dining','Styles']
+
+
+        # Find each instance of class="section-headline", class="story", class="story headline"
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline']}):
 
             if div['class'] == 'section-headline':
                 key = string.capwords(feed_title(div))
+                excluded = re.compile('|'.join(excludeSectionKeywords))
+                if excluded.search(key):
+                    self.log("Skipping section %s" % key)
+                    continue
+
                 articles[key] = []
                 ans.append(key)
@@ -61,24 +72,35 @@ class NYTimes(BasicNewsRecipe):
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
                 title = self.tag_to_string(a, use_alt=True).strip()
+
                 description = ''
                 pubdate = strftime('%a, %d %b')
                 summary = div.find(True, attrs={'class':'summary'})
                 if summary:
                     description = self.tag_to_string(summary, use_alt=False)
+
+                author = ''
+                authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
+                if authorAttribution:
+                    author = self.tag_to_string(authorAttribution, use_alt=False)
+                else:
+                    authorAttribution = div.find(True, attrs={'class':'byline'})
+                    if authorAttribution:
+                        author = self.tag_to_string(authorAttribution, use_alt=False)
+
                 feed = key if key is not None else 'Uncategorized'
                 if not articles.has_key(feed):
                     articles[feed] = []
                 if not 'podcasts' in url:
                     articles[feed].append(
                         dict(title=title, url=url, date=pubdate,
-                            description=description,
+                            description=description, author=author,
                             content=''))
         ans = self.sort_index_by(ans, {'The Front Page':-1,
                                        'Dining In, Dining Out':1,
                                        'Obituaries':2})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
 
         return ans
 
     def preprocess_html(self, soup):
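The heart of the new NYTimes index logic is the section filter: excludeSectionKeywords is joined into a single alternation regex, and any section headline it matches is skipped before its articles are collected. (allSectionKeywords is defined alongside it but is not used in the hunks shown.) The same pattern in isolation, with made-up section names:

    import re

    exclude_section_keywords = ['Sports', 'Dining', 'Styles']
    excluded = re.compile('|'.join(exclude_section_keywords))

    for key in ['The Front Page', 'Sports Saturday', 'Obituaries', 'Dining In, Dining Out']:
        if excluded.search(key):
            print('Skipping section %s' % key)
            continue
        print('Keeping section %s' % key)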