diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index f1e0e11d63..14ca98f534 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -16,7 +16,7 @@ class Article(object): time_offset = datetime.now() - datetime.utcnow() - def __init__(self, id, title, url, summary, published, content): + def __init__(self, id, title, url, author, summary, published, content): self.downloaded = False self.id = id self.title = title.strip() if title else title @@ -26,6 +26,9 @@ class Article(object): except: pass self.url = url + if author and not isinstance(author, unicode): + author = author.decode('utf-8', 'replace') + self.author = author self.summary = summary if summary and not isinstance(summary, unicode): summary = summary.decode('utf-8', 'replace') @@ -39,6 +42,7 @@ class Article(object): traceback.print_exc() summary = u'' self.text_summary = summary + self.author = author self.content = content self.date = published self.utctime = datetime(*self.date[:6]) @@ -50,10 +54,11 @@ class Article(object): (u'''\ Title : %s URL : %s +Author : %s Summary : %s Date : %s Has content : %s -'''%(self.title, self.url, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'), +'''%(self.title, self.url, self.author, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'), bool(self.content))).encode('utf-8') def __str__(self): @@ -124,7 +129,8 @@ class Feed(object): link = item.get('url', None) description = item.get('description', '') content = item.get('content', '') - article = Article(id, title, link, description, published, content) + author = item.get('author', '') + article = Article(id, title, link, author, description, published, content) delta = datetime.utcnow() - article.utctime if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article: self.articles.append(article) @@ -149,7 +155,9 @@ class Feed(object): self.logger.warning('Failed to get link for %s'%title)
self.logger.debug(traceback.format_exc()) link = None + description = item.get('summary', None) + author = item.get('author', None) content = [i.value for i in item.get('content', []) if i.value] content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace') @@ -159,7 +167,7 @@ class Feed(object): content = None if not link and not content: return - article = Article(id, title, link, description, published, content) + article = Article(id, title, link, author, description, published, content) delta = datetime.utcnow() - article.utctime if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article: self.articles.append(article) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 9f74b6263f..2bf9d04ca6 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -884,6 +884,9 @@ class BasicNewsRecipe(Recipe): for j, a in enumerate(f): if getattr(a, 'downloaded', False): adir = 'feed_%d/article_%d/'%(num, j) + auth = a.author + if not auth: + auth = None desc = a.text_summary if not desc: desc = None @@ -893,7 +896,7 @@ class BasicNewsRecipe(Recipe): self.play_order_counter += 1 po = self.play_order_counter parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), - play_order=po, description=desc) + play_order=po, author=auth, description=desc) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) for sp in a.sub_pages: prefix = os.path.commonprefix([opf_path, sp]) @@ -925,11 +928,15 @@ class BasicNewsRecipe(Recipe): if po is None: self.play_order_counter += 1 po = self.play_order_counter + auth = getattr(f, 'author', None) + if not auth: + auth = None desc = getattr(f, 'description', None) if not desc: desc = None feed_index(i, toc.add_item('feed_%d/index.html'%i, None, - f.title, play_order=po, description=desc)) + f.title, play_order=po, description=desc, author=auth)) + else: entries.append('feed_%d/index.html'%0) feed_index(0, toc) diff 
--git a/src/calibre/web/feeds/recipes/recipe_newsweek.py b/src/calibre/web/feeds/recipes/recipe_newsweek.py index ffeb04f4a5..0146c0541f 100644 --- a/src/calibre/web/feeds/recipes/recipe_newsweek.py +++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py @@ -9,13 +9,26 @@ from calibre.web.feeds.news import BasicNewsRecipe class Newsweek(BasicNewsRecipe): title = 'Newsweek' - __author__ = 'Kovid Goyal' + __author__ = 'Kovid Goyal and Sujata Raman' description = 'Weekly news and current affairs in the US' no_stylesheets = True + + extra_css = ''' + h1{color:#383733;font-family:Arial,Helvetica,sans-serif;font-size:large;} + .deck{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;color:#383733;font-size:small;} + .articleInfo{color:#474537;font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} + .authorName{color:#B61900;font-family:Arial,Helvetica,sans-serif;font-size:medium;} + .authorInfo{color:#0066CC;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} + .articleUpdated{ font-size:xx-small; color:#73726C; font-family:Arial,Helvetica,sans-serif;} + .issueDate{font-family :Arial,Helvetica,sans-serif;font-size:xx-small;font-style:italic;} + .story{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;font-size:small;} + .photoCredit{color:#999999;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} + .photoCaption{color:#0A0A09;font-family:Arial,Helvetica,sans-serif;font-size:xx-small;font-weight:bold;}''' + encoding = 'utf-8' language = _('English') remove_tags = [ - {'class':['navbar', 'ad', 'sponsorLinksArticle', 'mm-content', + {'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content', 'inline-social-links-wrapper', 'email-article', 'comments-and-social-links-wrapper', 'EmailArticleBlock']}, {'id' : ['footer', 'ticker-data', 'topTenVertical', @@ -24,8 +37,6 @@ class Newsweek(BasicNewsRecipe): {'class': 
re.compile('related-cloud')}, ] keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent']}] - - recursions = 1 match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+'] diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py b/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py index 4449ba1aa2..c26b4af735 100644 --- a/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py +++ b/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py @@ -16,7 +16,7 @@ class NYTimes(BasicNewsRecipe): __author__ = 'Kovid Goyal' language = _('English') description = 'Daily news from the New York Times (subscription version)' - timefmt = ' [%a, %d %b, %Y]' + timefmt = '' needs_subscription = True remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') @@ -46,39 +46,61 @@ class NYTimes(BasicNewsRecipe): articles = {} key = None ans = [] + allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials', + 'New York','Business Day','Sports','Dining','Arts','Home','Styles'] + excludeSectionKeywords = ['Sports','Dining','Styles'] + + + # Find each instance of class="section-headline", class="story", class="story headline" for div in soup.findAll(True, attrs={'class':['section-headline', 'story', 'story headline']}): if div['class'] == 'section-headline': key = string.capwords(feed_title(div)) + excluded = re.compile('|'.join(excludeSectionKeywords)) + if excluded.search(key): + self.log("Skipping section %s" % key) + continue + articles[key] = [] ans.append(key) - elif div['class'] in ['story', 'story headline']: + elif div['class'] in ['story', 'story headline'] : a = div.find('a', href=True) if not a: continue url = re.sub(r'\?.*', '', a['href']) url += '?pagewanted=all' title = self.tag_to_string(a, use_alt=True).strip() + description = '' pubdate = strftime('%a, %d %b') summary = div.find(True, attrs={'class':'summary'}) if summary: description = self.tag_to_string(summary, use_alt=False) + author = '' + 
authorAttribution = div.find(True, attrs={'class':'storyheadline-author'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + else: + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + feed = key if key is not None else 'Uncategorized' if not articles.has_key(feed): articles[feed] = [] if not 'podcasts' in url: articles[feed].append( dict(title=title, url=url, date=pubdate, - description=description, + description=description, author=author, content='')) ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans def preprocess_html(self, soup):