diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes.py b/src/calibre/web/feeds/recipes/recipe_nytimes.py index bd150bffcf..e50702ede5 100644 --- a/src/calibre/web/feeds/recipes/recipe_nytimes.py +++ b/src/calibre/web/feeds/recipes/recipe_nytimes.py @@ -7,23 +7,27 @@ nytimes.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class NYTimes(BasicNewsRecipe): - title = 'NYTimes Top Stories' - __author__ = 'Greg Riker' + title = 'New York Times Top Stories' + __author__ = 'GRiker' language = _('English') description = 'Top Stories from the New York Times' #max_articles_per_feed = 3 timefmt = '' - needs_subscription = False - remove_tags_before = dict(id='article') - remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink', 'clearfix']}), - dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), - dict(name=['script', 'noscript', 'style'])] - encoding = 'cp1252' + needs_subscription = True + remove_tags_after = dict(attrs={'id':['comments']}) + remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink', + 'clearfix', 'nextArticleLink clearfix','inlineSearchControl', + 'columnGroup','entry-meta','entry-response module','jumpLink','nav', + 'columnGroup advertisementColumnGroup']}), + dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', + 'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login', + 'blog-header','searchForm','NYTLogo','insideNYTimes']), + dict(name=['script', 'noscript', 'style','hr'])] + encoding = None no_stylesheets = True #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}' extra_css = '.headline {text-align:left;}\n\ @@ -34,6 +38,16 @@ class NYTimes(BasicNewsRecipe): flatPeriodical = True + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://www.nytimes.com/auth/login') + br.select_form(name='login') + br['USERID'] = self.username + br['PASSWORD'] = self.password + br.submit() + return br + def parse_index(self): soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') @@ -50,18 +64,21 @@ class NYTimes(BasicNewsRecipe): else : key = None - sections = { 'topstories' : 'Top Stories', - 'world' : 'World', - 'us' : 'U.S.', - 'politics' : 'Politics', - 'business' : 'Business', - 'technology' : 'Technology', - 'sports' : 'Sports', - 'arts' : 'Arts', - 'newyorkregion': 'New York/Region', - 'travel' : 'Travel', - 'editorials' : 'Editorials', - 'oped' : 'Op-Ed' + sections = { + 'arts' : 'Arts', + 'business' : 'Business', + 'editorials' : 'Editorials', + 'magazine' : 'Magazine', + 'mediaadvertising' : 'Media & Advertising', + 'newyorkregion' : 'New York/Region', + 'oped' : 'Op-Ed', + 'politics' : 'Politics', + 'sports' : 'Sports', + 'technology' : 'Technology', + 'topstories' : 'Top Stories', + 'travel' : 'Travel', + 'us' : 'U.S.', + 'world' : 'World' } #excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed'] @@ -131,6 +148,11 @@ class NYTimes(BasicNewsRecipe): section = i[i.find('=')+1:-2] if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section]) + if not sections.has_key(section) : + self.log( "Unrecognized section id: %s, skipping" % section ) + skipThisSection = True + break + # Check for excluded section if len(excludeSectionKeywords): key = sections[section] @@ -202,26 +224,65 @@ class NYTimes(BasicNewsRecipe): return ans + def preprocess_html(self, soup): + refresh = soup.find('meta', {'http-equiv':'refresh'}) + if refresh is None: + return soup + content = refresh.get('content').partition('=')[2] + raw = self.browser.open('http://www.nytimes.com'+content).read() + return BeautifulSoup(raw.decode('cp1252', 'replace')) + def postprocess_html(self,soup, True): if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ") + # Change class="kicker" to