Updated NYTimes Top Stories recipe

2025-07-09 03:04:10 -04:00 · 2009-09-23 08:25:27 -06:00 · 2009-09-23 08:25:27 -06:00 · 9d292633c7
commit 9d292633c7
parent 2efa863948
1 changed file with 32 additions and 13 deletions
--- a/src/calibre/web/feeds/recipes/recipe_nytimes.py
+++ b/src/calibre/web/feeds/recipes/recipe_nytimes.py
@@ -14,8 +14,7 @@ class NYTimes(BasicNewsRecipe):
    title       = 'New York Times Top Stories'
    __author__  = 'GRiker'
-    language = 'en'
+    language = _('English')
    description = 'Top Stories from the New York Times'
    # List of sections typically included in Top Stories.  Use a keyword from the
@@ -56,11 +55,14 @@ class NYTimes(BasicNewsRecipe):
    timefmt = ''
    needs_subscription = True
-    keep_only_tags          = [ dict(attrs={   'id':['article']})]
+    keep_only_tags          = [ dict(attrs={   'id':['article']}),
                                dict(attrs={'class':['blog wrap']}) ]
    remove_tags             = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix',
-                                                     'inlineVideo left brightcove']}),
+                                                     'inlineVideo left brightcove', 'entry-meta']}),
                                dict(attrs={   'id':['toolsRight','inlineBox','sidebarArticles',
-                                                     'portfolioInline','articleInline','readerscomment']}) ]
+                                                     'portfolioInline','articleInline','readerscomment',
                                                     'nytRating']}) ]
    encoding = 'cp1252'
    no_stylesheets = True
@@ -207,7 +209,7 @@ class NYTimes(BasicNewsRecipe):
                           (i.string.strip() > "") and      \
                           not isinstance(i,Comment):
                            contentString = i.strip().encode('utf-8')
-                            if contentString[0:3] == 'By ' :
+                            if contentString[0:3] == 'By ' and contentString[4].isupper() :
                                bylines.append(contentString)
                            else :
                                descriptions.append(contentString)
@@ -265,14 +267,31 @@ class NYTimes(BasicNewsRecipe):
        return soup
    def preprocess_html(self, soup):
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
+#         refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is None:
+#         if refresh is None:
-            return self.strip_anchors(soup)
+#             return self.strip_anchors(soup)
-
+# 
-        content = refresh.get('content').partition('=')[2]
+#         content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open('http://www.nytimes.com'+content).read()
+#         raw = self.browser.open('http://www.nytimes.com'+content).read()
-        soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
+#         soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
        return self.strip_anchors(soup)
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is not None:
            content = refresh.get('content').partition('=')[2]
            raw = self.browser.open('http://www.nytimes.com'+content).read()
            soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
        soup = self.strip_anchors(soup)
        # Test for empty content
        body = soup.find('body')
        tagCount = len(body.findAll(True))
        if tagCount:
 #            print "%d tags in article" % tagCount
            return soup
        else:
            print "no allowed content found, removing article"
            raise StringError
    def postprocess_html(self,soup, True):