mirror of https://github.com/kovidgoyal/calibre.git

commit edf5bcbab6 (parent 25538e5c04)

Updated recipes for NYTimes and Newsweek. Also adds support for setting the author of downloaded articles in the metadata.
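In recipe terms, the new capability looks like this: article dictionaries returned by a recipe's parse_index() may now carry an author key, which is threaded through the Article and Feed classes into the generated book's table of contents. A minimal sketch (the feed and article values are invented for illustration; the NYTimes diff below does exactly this):

    def parse_index(self):
        return [('Front Page', [
            dict(title='Example headline',
                 url='http://example.com/story',
                 date='Sat, 10 Jan',
                 description='Short teaser text.',
                 author='Jane Doe',       # newly supported metadata field
                 content=''),
        ])]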
@@ -16,7 +16,7 @@ class Article(object):
 
     time_offset = datetime.now() - datetime.utcnow()
 
-    def __init__(self, id, title, url, summary, published, content):
+    def __init__(self, id, title, url, author, summary, published, content):
         self.downloaded = False
         self.id = id
         self.title = title.strip() if title else title

@@ -26,6 +26,9 @@ class Article(object):
         except:
             pass
         self.url = url
+        self.author = author
+        if author and not isinstance(author, unicode):
+            author = author.decode('utf-8', 'replace')
         self.summary = summary
         if summary and not isinstance(summary, unicode):
             summary = summary.decode('utf-8', 'replace')

@@ -39,6 +42,7 @@ class Article(object):
             traceback.print_exc()
             summary = u''
         self.text_summary = summary
+        self.author = author
         self.content = content
         self.date = published
         self.utctime = datetime(*self.date[:6])

@@ -50,10 +54,11 @@ class Article(object):
         (u'''\
 Title       : %s
 URL         : %s
+Author      : %s
 Summary     : %s
 Date        : %s
 Has content : %s
-'''%(self.title, self.url, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
+'''%(self.title, self.url, self.author, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
     bool(self.content))).encode('utf-8')
 
     def __str__(self):
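Net effect of the Article changes: the constructor takes a new author argument, byte-string bylines are decoded to unicode, and the author shows up in the repr. A standalone sketch (Python 2, matching the codebase of the period; the field values are made up, and it assumes the class is importable as calibre.web.feeds.Article, where it lives):

    import time
    from calibre.web.feeds import Article

    a = Article(id='1', title='Example headline',
                url='http://example.com/story',
                author='Jos\xc3\xa9 Smith',   # UTF-8 byte string, as feeds often deliver
                summary='Teaser text', published=time.gmtime(),
                content='')
    print repr(a)      # repr now includes an 'Author :' line
    print a.author     # decoded via author.decode('utf-8', 'replace')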
@@ -124,7 +129,8 @@ class Feed(object):
             link = item.get('url', None)
             description = item.get('description', '')
             content = item.get('content', '')
-            article = Article(id, title, link, description, published, content)
+            author = item.get('author', '')
+            article = Article(id, title, link, author, description, published, content)
             delta = datetime.utcnow() - article.utctime
             if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                 self.articles.append(article)

@@ -149,7 +155,9 @@ class Feed(object):
             self.logger.warning('Failed to get link for %s'%title)
             self.logger.debug(traceback.format_exc())
             link = None
 
         description = item.get('summary', None)
+        author = item.get('author', None)
+
         content = [i.value for i in item.get('content', []) if i.value]
         content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')

@@ -159,7 +167,7 @@ class Feed(object):
             content = None
         if not link and not content:
             return
-        article = Article(id, title, link, description, published, content)
+        article = Article(id, title, link, author, description, published, content)
         delta = datetime.utcnow() - article.utctime
         if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
             self.articles.append(article)
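Both Feed code paths read the byline from the parsed feed item before constructing the Article. For context, this is the standard feedparser entry interface; a minimal sketch with a hypothetical feed URL:

    import feedparser

    d = feedparser.parse('http://example.com/rss')    # hypothetical URL
    for item in d.entries:
        author = item.get('author', None)             # None when the feed has no byline
        print item.get('title'), '-', author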
@@ -884,6 +884,9 @@ class BasicNewsRecipe(Recipe):
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
                     adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
                     desc = a.text_summary
                     if not desc:
                         desc = None

@@ -893,7 +896,7 @@ class BasicNewsRecipe(Recipe):
                         self.play_order_counter += 1
                         po = self.play_order_counter
                     parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                            play_order=po, description=desc)
+                            play_order=po, author=auth, description=desc)
                     last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])

@@ -925,11 +928,15 @@ class BasicNewsRecipe(Recipe):
                 if po is None:
                     self.play_order_counter += 1
                     po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
                 desc = getattr(f, 'description', None)
                 if not desc:
                     desc = None
                 feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
-                    f.title, play_order=po, description=desc))
+                    f.title, play_order=po, description=desc, author=auth))
 
         else:
             entries.append('feed_%d/index.html'%0)
             feed_index(0, toc)
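The auth/desc handling above normalises empty strings to None before they reach add_item, so missing metadata is omitted from the TOC rather than written as an empty attribute. An equivalent, terser form of the same normalisation (illustrative only, not the committed code):

    class _Stub(object):                  # stand-in for an Article
        author, text_summary = '', 'Teaser'

    a = _Stub()
    auth = a.author or None               # '' -> None
    desc = a.text_summary or None         # non-empty string passes through
    print auth, desc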
@@ -9,13 +9,26 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class Newsweek(BasicNewsRecipe):
 
     title = 'Newsweek'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal and Sujata Raman'
     description = 'Weekly news and current affairs in the US'
     no_stylesheets = True
 
+    extra_css = '''
+    h1{color:#383733;font-family:Arial,Helvetica,sans-serif;font-size:large;}
+    .deck{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;color:#383733;font-size:small;}
+    .articleInfo{color:#474537;font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+    .authorName{color:#B61900;font-family:Arial,Helvetica,sans-serif;font-size:medium;}
+    .authorInfo{color:#0066CC;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
+    .articleUpdated{ font-size:xx-small; color:#73726C; font-family:Arial,Helvetica,sans-serif;}
+    .issueDate{font-family :Arial,Helvetica,sans-serif;font-size:xx-small;font-style:italic;}
+    .story{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;font-size:small;}
+    .photoCredit{color:#999999;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
+    .photoCaption{color:#0A0A09;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;font-weight:bold;}'''
+
     encoding = 'utf-8'
     language = _('English')
     remove_tags = [
-        {'class':['navbar', 'ad', 'sponsorLinksArticle', 'mm-content',
+        {'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
                   'inline-social-links-wrapper', 'email-article',
                   'comments-and-social-links-wrapper', 'EmailArticleBlock']},
         {'id' : ['footer', 'ticker-data', 'topTenVertical',

@@ -24,8 +37,6 @@ class Newsweek(BasicNewsRecipe):
         {'class': re.compile('related-cloud')},
     ]
     keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent']}]
-
-
     recursions = 1
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
 
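extra_css is a stock BasicNewsRecipe attribute: the string is appended as a stylesheet to every downloaded article, which is how the Newsweek recipe above restyles the site's CSS classes (no_stylesheets = True strips the originals first). A minimal, hypothetical recipe showing the same pattern:

    from calibre.web.feeds.news import BasicNewsRecipe

    class Example(BasicNewsRecipe):
        title          = 'Example'
        no_stylesheets = True             # drop the site's own CSS
        extra_css      = '''
            h1      { font-family: Arial, sans-serif; font-size: large; }
            .byline { font-size: xx-small; color: #666; }
            '''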
@@ -16,7 +16,7 @@ class NYTimes(BasicNewsRecipe):
     __author__ = 'Kovid Goyal'
     language = _('English')
     description = 'Daily news from the New York Times (subscription version)'
-    timefmt = ' [%a, %d %b, %Y]'
+    timefmt = ''
     needs_subscription = True
     remove_tags_before = dict(id='article')
     remove_tags_after = dict(id='article')
@@ -46,39 +46,61 @@ class NYTimes(BasicNewsRecipe):
         articles = {}
         key = None
         ans = []
+        allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
+                              'New York','Business Day','Sports','Dining','Arts','Home','Styles']
+        excludeSectionKeywords = ['Sports','Dining','Styles']
+
+        # Find each instance of class="section-headline", class="story", class="story headline"
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline']}):
 
             if div['class'] == 'section-headline':
                 key = string.capwords(feed_title(div))
+                excluded = re.compile('|'.join(excludeSectionKeywords))
+                if excluded.search(key):
+                    self.log("Skipping section %s" % key)
+                    continue
+
                 articles[key] = []
                 ans.append(key)
 
-            elif div['class'] in ['story', 'story headline']:
+            elif div['class'] in ['story', 'story headline'] :
                 a = div.find('a', href=True)
                 if not a:
                     continue
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
                 title = self.tag_to_string(a, use_alt=True).strip()
 
                 description = ''
                 pubdate = strftime('%a, %d %b')
                 summary = div.find(True, attrs={'class':'summary'})
                 if summary:
                     description = self.tag_to_string(summary, use_alt=False)
 
+                author = ''
+                authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
+                if authorAttribution:
+                    author = self.tag_to_string(authorAttribution, use_alt=False)
+                else:
+                    authorAttribution = div.find(True, attrs={'class':'byline'})
+                    if authorAttribution:
+                        author = self.tag_to_string(authorAttribution, use_alt=False)
+
                 feed = key if key is not None else 'Uncategorized'
                 if not articles.has_key(feed):
                     articles[feed] = []
                 if not 'podcasts' in url:
                     articles[feed].append(
                         dict(title=title, url=url, date=pubdate,
-                            description=description,
+                            description=description, author=author,
                             content=''))
         ans = self.sort_index_by(ans, {'The Front Page':-1,
                                        'Dining In, Dining Out':1,
                                        'Obituaries':2})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
 
         return ans
 
     def preprocess_html(self, soup):
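The byline lookup in parse_index above tries the storyheadline-author class first and falls back to byline. The same two-step pattern in isolation, run against made-up markup (BeautifulSoup 3 API, as bundled with calibre at the time):

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<div class="byline">By JANE DOE</div>')
    tag = soup.find(True, attrs={'class': 'storyheadline-author'}) or \
          soup.find(True, attrs={'class': 'byline'})
    author = tag.string.strip() if tag else ''
    print author    # By JANE DOE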