Fix #4009 (NYT Top Stories fails)

Kovid Goyal 2009-11-13 15:36:17 -07:00
parent 339df810b8
commit 8e004db71b


@@ -6,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 nytimes.com
 '''
 import re
+import time
 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
@@ -14,9 +15,9 @@ class NYTimes(BasicNewsRecipe):
     title = 'New York Times Top Stories'
     __author__ = 'GRiker'
-    language = 'en'
+    language = _('English')
     description = 'Top Stories from the New York Times'
 
     # List of sections typically included in Top Stories. Use a keyword from the
     # right column in the excludeSectionKeywords[] list to skip downloading that section
     sections = {
@@ -39,7 +40,7 @@ class NYTimes(BasicNewsRecipe):
                  'world'    : 'World'
                }
 
     # By default, no sections are skipped.
     excludeSectionKeywords = []
 
     # Add section keywords from the right column above to skip that section
@@ -49,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
     # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
     # Fetch only Top Stories
     # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
 
     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 40
@@ -63,7 +64,7 @@ class NYTimes(BasicNewsRecipe):
                     dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
                                        'portfolioInline','articleInline','readerscomment',
                                        'nytRating']}) ]
 
     encoding = 'cp1252'
     no_stylesheets = True
     extra_css = '.headline {text-align: left;}\n \
@@ -79,6 +80,14 @@ class NYTimes(BasicNewsRecipe):
                 .authorId {text-align: left; \
                 font-style: italic;}\n '
 
+#    def get_cover_url(self):
+#        st = time.localtime()
+#        year = str(st.tm_year)
+#        month = "%.2d" % st.tm_mon
+#        day = "%.2d" % st.tm_mday
+#        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/' + 'scan.jpg'
+#        return cover
+
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
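The new import time pairs with the commented-out get_cover_url above: if it were enabled, the recipe's cover would be the date-stamped scan of the day's front page. A minimal sketch of the URL it builds, reusing the construction from the commented code (the date values are whatever time.localtime() returns):

    import time

    # Same construction as the commented-out get_cover_url above
    st = time.localtime()
    cover = ('http://graphics8.nytimes.com/images/' + str(st.tm_year) + '/' +
             "%.2d" % st.tm_mon + '/' + "%.2d" % st.tm_mday +
             '/nytfrontpage/' + 'scan.jpg')
    # On this commit's date that evaluates to:
    # http://graphics8.nytimes.com/images/2009/11/13/nytfrontpage/scan.jpg
    print cover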
@@ -105,13 +114,13 @@ class NYTimes(BasicNewsRecipe):
             _raw = url_or_raw
             if raw:
                 return _raw
 
             if not isinstance(_raw, unicode) and self.encoding:
                 _raw = _raw.decode(docEncoding, 'replace')
             massage = list(BeautifulSoup.MARKUP_MASSAGE)
             massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
             return BeautifulSoup(_raw, markupMassage=massage)
 
         # Entry point
         soup = get_the_soup( self.encoding, url_or_raw )
         contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
@@ -122,7 +131,7 @@ class NYTimes(BasicNewsRecipe):
         if self.verbose > 2:
             self.log( " document encoding: '%s'" % docEncoding)
         if docEncoding != self.encoding :
             soup = get_the_soup(docEncoding, url_or_raw)
 
         return soup
@@ -133,7 +142,7 @@ class NYTimes(BasicNewsRecipe):
         feed = key = 'All Top Stories'
         articles[key] = []
         ans.append(key)
 
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
         # Fetch the outer table
@@ -179,7 +188,7 @@ class NYTimes(BasicNewsRecipe):
             bylines = []
             descriptions = []
             pubdate = None
 
             # Get the Section title
             for (x,i) in enumerate(sectionblock.contents) :
                 skipThisSection = False
@@ -201,22 +210,26 @@ class NYTimes(BasicNewsRecipe):
                         break
 
             # Get the bylines and descriptions
             if not skipThisSection :
-                for (x,i) in enumerate(sectionblock.contents) :
-                    # Extract the bylines and descriptions
-                    if (i.string is not None) and \
-                       (i.string.strip() > "") and \
-                       not isinstance(i,Comment):
-                        contentString = i.strip().encode('utf-8')
-                        if contentString[0:3] == 'By ' and contentString[4].isupper() :
-                            bylines.append(contentString)
-                        else :
-                            descriptions.append(contentString)
+                lines = sectionblock.contents
+                contentStrings = []
+                for line in lines:
+                    if not isinstance(line, Comment) and line.strip and line.strip() > "":
+                        contentStrings.append(line.strip())
+
+                # Gather the byline/description pairs
+                bylines = []
+                descriptions = []
+                for contentString in contentStrings:
+                    if contentString[0:3] == 'By ' and contentString[3].isupper() :
+                        bylines.append(contentString)
+                    else:
+                        descriptions.append(contentString)
 
             # Fetch the article titles and URLs
             articleCount = len(sectionblock.findAll('span'))
-            for (i,span) in enumerate(sectionblock.findAll('span')) :
+            for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
                 a = span.find('a', href=True)
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
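Two behavioural changes land in this hunk. Headlines are now located by their headlineWrapper class instead of by every <span> in the section block, which is the core of the #4009 fix, and the byline test now checks the character at index 3, the first character after the 'By ' prefix, where the old test at index 4 looked one character too far. (The line.strip truthiness test appears to rely on BeautifulSoup 3 behaviour: attribute lookup on a Tag falls back to a child-tag search and returns None, so only NavigableStrings survive the filter.) A minimal sketch of the classification rule, using invented byline strings:

    def classify(contentString):
        # 'By ' occupies indices 0-2, so index 3 is the first letter
        # of the name; the old index-4 test skipped it
        if contentString[0:3] == 'By ' and contentString[3].isupper():
            return 'byline'
        return 'description'

    print classify('By JOHN DOE')    # byline
    print classify('By Al Baker')    # byline; the old index-4 test saw 'l' and failed
    print classify('By the way...')  # description: 't' is lowercase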
@@ -228,7 +241,11 @@ class NYTimes(BasicNewsRecipe):
                 if not isinstance(title, unicode):
                     title = title.decode('utf-8', 'replace')
-                description = descriptions[i]
+                # Allow for unattributed, undescribed entries "Editor's Note"
+                if i >= len(descriptions) :
+                    description = None
+                else :
+                    description = descriptions[i]
 
                 if len(bylines) == articleCount :
                     author = bylines[i]
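The added bounds check keeps descriptions[i] from raising an IndexError when a section carries more headlines than description strings, e.g. an unattributed "Editor's Note" entry; missing descriptions are padded with None. A sketch with invented counts:

    # Invented data: three headlines but only two descriptions
    descriptions = ['Summary one', 'Summary two']
    for i in range(3):
        if i >= len(descriptions):
            description = None   # undescribed entry, as in the diff
        else:
            description = descriptions[i]
        print i, description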
@@ -242,10 +259,10 @@ class NYTimes(BasicNewsRecipe):
                         if url == article['url'] :
                             duplicateFound = True
                             break
 
                 if duplicateFound:
                     # Continue fetching, don't add this article
                     continue
 
                 if not articles.has_key(feed):
                     articles[feed] = []
@@ -254,7 +271,7 @@ class NYTimes(BasicNewsRecipe):
                                 description=description, author=author, content=''))
 
         ans = self.sort_index_by(ans, {'Top Stories':-1})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans
 
     def strip_anchors(self,soup):
@@ -270,7 +287,7 @@ class NYTimes(BasicNewsRecipe):
 #        refresh = soup.find('meta', {'http-equiv':'refresh'})
 #        if refresh is None:
 #            return self.strip_anchors(soup)
 #
 #        content = refresh.get('content').partition('=')[2]
 #        raw = self.browser.open('http://www.nytimes.com'+content).read()
 #        soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
@@ -280,7 +297,7 @@ class NYTimes(BasicNewsRecipe):
             content = refresh.get('content').partition('=')[2]
             raw = self.browser.open('http://www.nytimes.com'+content).read()
             soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
 
         soup = self.strip_anchors(soup)
 
         # Test for empty content
@@ -291,7 +308,7 @@ class NYTimes(BasicNewsRecipe):
             return soup
         else:
             print "no allowed content found, removing article"
-            raise Exception()
+            raise StringError
 
     def postprocess_html(self,soup, True):
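One caveat on the new raise: StringError is neither a Python builtin nor defined anywhere in this recipe, so evaluating the name itself raises a NameError at runtime. An exception still propagates and the empty article is dropped, just not under the type the code names. A sketch of the effective behaviour:

    # StringError is undefined, so the raise statement itself fails
    # with NameError; either way the article download is abandoned
    try:
        raise StringError
    except NameError, e:
        print "article removed:", e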
@@ -334,7 +351,7 @@ class NYTimes(BasicNewsRecipe):
                 bTag = Tag(soup, "b")
                 bTag.insert(0, subhead.contents[0])
                 subhead.replaceWith(bTag)
 
         # Synthesize a section header
         dsk = soup.find('meta', attrs={'name':'dsk'})
         if dsk is not None and dsk.has_key('content'):
@@ -343,12 +360,12 @@ class NYTimes(BasicNewsRecipe):
             hTag.insert(0,NavigableString(dsk['content']))
             articleTag = soup.find(True, attrs={'id':'article'})
             articleTag.insert(0,hTag)
 
         # Add class="articleBody" to <div> so we can format with CSS
         divTag = soup.find('div',attrs={'id':'articleBody'})
         if divTag is not None :
             divTag['class'] = divTag['id']
 
         # Add class="authorId" to <div> so we can format with CSS
         divTag = soup.find('div',attrs={'id':'authorId'})
         if divTag is not None :