diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe
index 8b95c8432b..cb037928b4 100644
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -6,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 nytimes.com
 '''
 import re
+import time
 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
@@ -14,9 +15,9 @@ class NYTimes(BasicNewsRecipe):
 
     title = 'New York Times Top Stories'
     __author__ = 'GRiker'
-    language = 'en'
+    language = _('English')
     description = 'Top Stories from the New York Times'
-
+    
     # List of sections typically included in Top Stories.  Use a keyword from the
     # right column in the excludeSectionKeywords[] list to skip downloading that section
     sections = {
@@ -39,7 +40,7 @@ class NYTimes(BasicNewsRecipe):
             'world' : 'World'
             }
 
-    # By default, no sections are skipped.
+    # By default, no sections are skipped.    
     excludeSectionKeywords = []
 
     # Add section keywords from the right column above to skip that section
@@ -49,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
     # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
     # Fetch only Top Stories
     # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
-
+    
     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 40
 
@@ -63,7 +64,7 @@ class NYTimes(BasicNewsRecipe):
                 dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
                              'portfolioInline','articleInline','readerscomment',
                              'nytRating']}) ]
-
+    
     encoding = 'cp1252'
     no_stylesheets = True
     extra_css = '.headline {text-align: left;}\n \
@@ -79,6 +80,14 @@ class NYTimes(BasicNewsRecipe):
                 .authorId {text-align: left; \
                            font-style: italic;}\n '
 
+#    def get_cover_url(self):
+#        st = time.localtime()
+#        year = str(st.tm_year)
+#        month = "%.2d" % st.tm_mon
+#        day = "%.2d" % st.tm_mday
+#        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/' + 'scan.jpg'
+#        return cover
+
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
@@ -105,13 +114,13 @@ class NYTimes(BasicNewsRecipe):
                 _raw = url_or_raw
             if raw:
                 return _raw
-
+    
             if not isinstance(_raw, unicode) and self.encoding:
                 _raw = _raw.decode(docEncoding, 'replace')
             massage = list(BeautifulSoup.MARKUP_MASSAGE)
             massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
             return BeautifulSoup(_raw, markupMassage=massage)
-
+    
         # Entry point
         soup = get_the_soup( self.encoding, url_or_raw )
         contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
@@ -122,7 +131,7 @@ class NYTimes(BasicNewsRecipe):
         if self.verbose > 2:
             self.log( " document encoding: '%s'" % docEncoding)
         if docEncoding != self.encoding :
-            soup = get_the_soup(docEncoding, url_or_raw)
+            soup = get_the_soup(docEncoding, url_or_raw)    
 
         return soup
 
@@ -133,7 +142,7 @@ class NYTimes(BasicNewsRecipe):
         feed = key = 'All Top Stories'
         articles[key] = []
         ans.append(key)
-
+    
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
         # Fetch the outer table
@@ -179,7 +188,7 @@ class NYTimes(BasicNewsRecipe):
             bylines = []
             descriptions = []
             pubdate = None
-
+    
             # Get the Section title
             for (x,i) in enumerate(sectionblock.contents) :
                 skipThisSection = False
@@ -201,22 +210,26 @@ class NYTimes(BasicNewsRecipe):
                         break
 
             # Get the bylines and descriptions
-            if not skipThisSection :
-                for (x,i) in enumerate(sectionblock.contents) :
-
-                    # Extract the bylines and descriptions
-                    if (i.string is not None) and \
-                       (i.string.strip() > "") and \
-                       not isinstance(i,Comment):
-                        contentString = i.strip().encode('utf-8')
-                        if contentString[0:3] == 'By ' and contentString[4].isupper() :
-                            bylines.append(contentString)
-                        else :
-                            descriptions.append(contentString)
-
+            if not skipThisSection :
+                lines = sectionblock.contents
+                contentStrings = []
+
+                for line in lines:
+                    if not isinstance(line, Comment) and line.strip and line.strip() > "":
+                        contentStrings.append(line.strip())
+
+                # Gather the byline/description pairs
+                bylines = []
+                descriptions = []
+                for contentString in contentStrings:
+                    if contentString[0:3] == 'By ' and contentString[3].isupper() :
+                        bylines.append(contentString)
+                    else:
+                        descriptions.append(contentString)
+
             # Fetch the article titles and URLs
             articleCount = len(sectionblock.findAll('span'))
-            for (i,span) in enumerate(sectionblock.findAll('span')) :
+            for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
                 a = span.find('a', href=True)
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
@@ -228,7 +241,11 @@ class NYTimes(BasicNewsRecipe):
                 if not isinstance(title, unicode):
                     title = title.decode('utf-8', 'replace')
 
-                description = descriptions[i]
+                # Allow for unattributed, undescribed entries "Editor's Note"
+                if i >= len(descriptions) :
+                    description = None
+                else :
+                    description = descriptions[i]
 
                 if len(bylines) == articleCount :
                     author = bylines[i]
@@ -242,10 +259,10 @@ class NYTimes(BasicNewsRecipe):
                         if url == article['url'] :
                             duplicateFound = True
                             break
-
-                    if duplicateFound:
+                    
+                    if duplicateFound:    
                         # Continue fetching, don't add this article
-                        continue
+                        continue    
 
                 if not articles.has_key(feed):
                     articles[feed] = []
@@ -254,7 +271,7 @@ class NYTimes(BasicNewsRecipe):
                                           description=description, author=author, content=''))
 
         ans = self.sort_index_by(ans, {'Top Stories':-1})
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]    
         return ans
 
     def strip_anchors(self,soup):
@@ -270,7 +287,7 @@ class NYTimes(BasicNewsRecipe):
 #        refresh = soup.find('meta', {'http-equiv':'refresh'})
 #        if refresh is None:
 #            return self.strip_anchors(soup)
-#
+#    
 #        content = refresh.get('content').partition('=')[2]
 #        raw = self.browser.open('http://www.nytimes.com'+content).read()
 #        soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
@@ -280,7 +297,7 @@ class NYTimes(BasicNewsRecipe):
         content = refresh.get('content').partition('=')[2]
         raw = self.browser.open('http://www.nytimes.com'+content).read()
         soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
-
+    
         soup = self.strip_anchors(soup)
 
         # Test for empty content
@@ -291,7 +308,7 @@ class NYTimes(BasicNewsRecipe):
             return soup
         else:
             print "no allowed content found, removing article"
-            raise Exception()
+            raise StringError
 
 
     def postprocess_html(self,soup, True):
@@ -334,7 +351,7 @@ class NYTimes(BasicNewsRecipe):
                 bTag = Tag(soup, "b")
                 bTag.insert(0, subhead.contents[0])
                 subhead.replaceWith(bTag)
-
+    
         # Synthesize a section header
         dsk = soup.find('meta', attrs={'name':'dsk'})
         if dsk is not None and dsk.has_key('content'):
@@ -343,12 +360,12 @@ class NYTimes(BasicNewsRecipe):
             hTag.insert(0,NavigableString(dsk['content']))
            articleTag = soup.find(True, attrs={'id':'article'})
            articleTag.insert(0,hTag)
-
+    
        # Add class="articleBody" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag is not None :
            divTag['class'] = divTag['id']
-
+    
        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag is not None :
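
Note (commentary, not part of the patch): the contentString[3] change in the parse_index hunk fixes an off-by-one. Indices 0-2 hold the "By " prefix, so index 3 is the first character of the name itself; the old test peeked at index 4 and so misclassified bylines whose second letter is lowercase. A quick interpreter check:

    >>> s = 'By John Doe'
    >>> s[0:3]
    'By '
    >>> s[3].isupper(), s[4].isupper()
    (True, False)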