# Reconstructed from the git patch "Update New York Times Book Review"
# (commit 6f2c5a1f4d4e82561a68462698af6f944463a02c, Kovid Goyal,
# Thu, 7 Mar 2013 14:35:31 +0530). This is the post-patch state of
# recipes/nytimesbook.recipe: the context and "+" lines of the diff.
# The RSS-based settings removed by the patch are not carried over.

from calibre.web.feeds.news import BasicNewsRecipe


class NewYorkTimesBookReview(BasicNewsRecipe):
    """Recipe for the New York Times Book Review.

    The list of articles is scraped from the Book Review index page by
    ``parse_index`` instead of being read from an RSS feed.
    """

    title = u'New York Times Book Review'
    language = 'en'

    __author__ = 'Kovid Goyal'

    no_stylesheets = True
    no_javascript = True
    # Article content lives in the element with id "article" or in
    # elements whose id starts with "entry-".
    keep_only_tags = [dict(id='article'), dict(id=lambda x: x and x.startswith('entry-'))]
    # Sharing widgets, ads and "enlarge this" boxes are dropped.
    remove_tags = [
        dict(attrs={'class': ['articleBottomExtra', 'shareToolsBox', 'singleAd']}),
        dict(attrs={'class': lambda x: x and ('shareTools' in x or 'enlargeThis' in x)}),
    ]

    def parse_index(self):
        """Build the sections and articles from the Book Review index page.

        Walks the ``abColumn`` div inside the ``main`` div of the index
        page. Every ``sectionHeader`` element starts a new section; every
        ``story`` or ``ledeStory`` element adds one article to the current
        section. Articles found before the first header are filed under
        'Features'.

        Returns a list of ``(section_title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``url``, ``date`` and
        ``description``. Sections without articles are omitted.
        """
        soup = self.index_to_soup('http://www.nytimes.com/pages/books/review/index.html')

        # Find TOC
        toc = soup.find('div', id='main').find(
            'div', attrs={'class': 'abColumn'})
        feeds = []
        articles = []
        section_title = 'Features'
        for x in toc.findAll(['div', 'h3', 'h6'], attrs={'class': ['story', 'sectionHeader', 'ledeStory']}):
            if x['class'] == 'sectionHeader':
                # A header closes the section collected so far.
                if articles:
                    feeds.append((section_title, articles))
                section_title = self.tag_to_string(x)
                articles = []
                self.log('Found section:', section_title)
                continue
            if x['class'] in {'story', 'ledeStory'}:
                article = self._article_from_story(x)
                if article is not None:
                    articles.append(article)

        # The last section has no following header to flush it, so it
        # must be added here or its articles are silently lost.
        if articles:
            feeds.append((section_title, articles))

        return feeds

    def _article_from_story(self, story):
        """Return the article dict for one story element, or None if it has no link."""
        # Regular stories carry their headline in an h3, the lede story in an h1.
        heading = story.find('h3' if story['class'] == 'story' else 'h1')
        link = heading.find('a', href=True) if heading is not None else None
        if link is None:
            # Skip the story rather than abort the whole download.
            self.log('\tSkipping story without a headline link')
            return None

        title = self.tag_to_string(link)
        href = link['href']
        # Request the single-page version; start a query string if the
        # link does not already have one.
        url = href + ('&' if '?' in href else '?') + 'pagewanted=all'
        self.log('\tFound article:', title, url)

        desc = ''
        byline = story.find('h6', attrs={'class': 'byline'})
        if byline is not None:
            desc = self.tag_to_string(byline)
        summary = story.find('p', attrs={'class': 'summary'})
        if summary is not None:
            desc += self.tag_to_string(summary)
        if desc:
            self.log('\t\t', desc)

        return {'title': title, 'url': url, 'date': '',
                'description': desc}