Update New York Times Top Stories

2025-07-09 03:04:10 -04:00 · 2010-03-26 09:00:11 +05:30 · 2010-03-26 09:00:11 +05:30 · 656ce3eac8
commit 656ce3eac8
parent 041ca66102
2 changed files with 260 additions and 128 deletions
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@ -5,7 +5,8 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 nytimes.com
 '''
-import re, time
+import re
+import time
 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
@ -14,7 +15,7 @@ class NYTimes(BasicNewsRecipe):

    title       = 'New York Times Top Stories'
    __author__  = 'GRiker'
-    language = 'en'
+    language = _('English')
    description = 'Top Stories from the New York Times'

    # List of sections typically included in Top Stories.  Use a keyword from the
@ -39,9 +40,6 @@ class NYTimes(BasicNewsRecipe):
                 'world'            :   'World'
               }

-    # By default, no sections are skipped.
-    excludeSectionKeywords = []
-
    # Add section keywords from the right column above to skip that section
    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
    # excludeSectionKeywords = ['Sports', 'Dining']
@ -49,36 +47,138 @@ class NYTimes(BasicNewsRecipe):
    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
    # Fetch only Top Stories
    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
+    # By default, no sections are skipped.
+    excludeSectionKeywords = []
+
+    # one_picture_per_article specifies that calibre should only use the first image
+    # from an article (if one exists).  If one_picture_per_article = True, the image
+    # will be moved to a location between the headline and the byline.
+    # If one_picture_per_article = False, all images from the article will be included
+    # and shown in their original location.
+    one_picture_per_article = True

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 40

    timefmt = ''
    needs_subscription = True
-    keep_only_tags          = [ dict(attrs={   'id':['article']}),
-                                dict(attrs={'class':['blog wrap']}) ]
+    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'

-    remove_tags             = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix',
-                                                     'inlineVideo left brightcove', 'entry-meta']}),
-                                dict(attrs={   'id':['toolsRight','inlineBox','sidebarArticles',
-                                                     'portfolioInline','articleInline','readerscomment',
-                                                     'nytRating']}) ]
+    remove_tags_before = dict(id='article')
+    remove_tags_after  = dict(id='article')
+    remove_tags = [dict(attrs={'class':[
+                            'articleFooter',
+                            'articleTools',
+                            'columnGroup doubleRule',
+                            'columnGroup singleRule',
+                            'columnGroup last',
+                            'columnGroup  last',
+                            'doubleRule',
+                            'dottedLine',
+                            'entry-meta',
+                            'icon enlargeThis',
+                            'leftNavTabs',
+                            'module box nav',
+                            'nextArticleLink',
+                            'nextArticleLink clearfix',
+                            'post-tools',
+                            'relatedSearchesModule',
+                            'side_tool',
+                            'singleAd',
+                            'subNavigation tabContent active clearfix',
+                            ]}),
+                   dict(id=[
+                            'adxLeaderboard',
+                            'archive',
+                            'articleExtras',
+                            'articleInline',
+                            'blog_sidebar',
+                            'cCol',
+                            'entertainmentSearchBar',
+                            'footer',
+                            'header',
+                            'header_search',
+                            'login',
+                            'masthead',
+                            'memberTools',
+                            'navigation',
+                            'portfolioInline',
+                            'relatedArticles',
+                            'side_search',
+                            'side_index',
+                            'side_tool',
+                            'toolsRight',
+                            ]),
+                   dict(name=['script', 'noscript', 'style'])]

-    encoding = 'cp1252'
    no_stylesheets = True
    extra_css = '.headline      {text-align:    left;}\n    \
                 .byline        {font-family:   monospace;  \
                                 text-align:    left;       \
+                                 margin-top:    0px;        \
+                                 margin-bottom: 0px;}\n     \
+                 .dateline      {font-size:     small;      \
+                                 margin-top:    0px;        \
+                                 margin-bottom: 0px;}\n     \
+                 .timestamp     {font-size:     small;      \
+                                 margin-top:    0px;        \
                                 margin-bottom: 0px;}\n     \
-                 .timestamp     {font-size:     smaller;}\n \
                 .source        {text-align:    left;}\n    \
                 .image         {text-align:    center;}\n  \
                 .credit        {text-align:    right;      \
-                                 font-size:     smaller;}\n \
+                                 font-size:     small;      \
+                                 margin-top:    0px;        \
+                                 margin-bottom: 0px;}\n     \
                 .articleBody   {text-align:    left;}\n    \
                 .authorId      {text-align:    left;       \
                                 font-style:    italic;}\n  '

+    def dump_ans(self, ans):
+        '''
+        Log a summary of the index that parse_index() is about to return.
+
+        ans is a list of (section title, article list) pairs.  When
+        self.verbose is set, one line is logged per section and one per
+        article (title and url, cp1252-encoded and truncated by the format
+        string).  The total number of articles is always logged.
+        '''
+        queued = 0
+        for section in ans:
+            if self.verbose:
+                summary = (section[0], len(section[1]))
+                self.log("section %s: %d articles" % summary)
+            for story in section[1]:
+                queued += 1
+                if not self.verbose:
+                    continue
+                story_title = story['title'].encode('cp1252', 'replace')
+                story_url = story['url'].encode('cp1252', 'replace')
+                self.log("\t%-40.40s... \t%-60.60s..." % (story_title, story_url))
+        self.log("Queued %d articles" % queued)
+
+    def fixChars(self,string):
+        '''
+        Return string with the characters 0x91-0x94, 0x96 and 0x97 (the
+        cp1252 curly quotes and dashes) replaced by the matching numeric
+        HTML entities.  All other characters are left untouched.
+        '''
+        replacements = (
+            ("\x91", "&#8216;"),    # lsquo
+            ("\x92", "&#8217;"),    # rsquo
+            ("\x93", "&#8220;"),    # ldquo
+            ("\x94", "&#8221;"),    # rdquo
+            ("\x96", "&#8211;"),    # ndash
+            ("\x97", "&#8212;"),    # mdash
+        )
+        fixed = string
+        for raw_char, entity in replacements:
+            fixed = re.sub(raw_char, entity, fixed)
+        return fixed
+
+    def get_browser(self):
+        '''
+        Return the browser used for downloads.
+
+        When both self.username and self.password are set, the nytimes.com
+        login form is filled in and submitted first.  Login is best-effort:
+        a failure is logged and the browser is returned anyway.
+        '''
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            try:
+                br.open('http://www.nytimes.com/auth/login')
+                br.select_form(name='login')
+                br['USERID']   = self.username
+                br['PASSWORD'] = self.password
+                br.submit()
+            except Exception:
+                # Deliberately not a bare except: KeyboardInterrupt and
+                # SystemExit must still be able to abort the download.
+                self.log("\nFailed to login")
+        return br
+
    def get_cover_url(self):
        cover = None
        st = time.localtime()
@ -94,26 +194,6 @@ class NYTimes(BasicNewsRecipe):
            cover = None
        return cover

-    def get_masthead_url(self):
-        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
-        br = BasicNewsRecipe.get_browser()
-        try:
-            br.open(masthead)
-        except:
-            self.log("\nCover unavailable")
-            masthead = None
-        return masthead
-
-    def get_browser(self):
-        br = BasicNewsRecipe.get_browser()
-        if self.username is not None and self.password is not None:
-            br.open('http://www.nytimes.com/auth/login')
-            br.select_form(name='login')
-            br['USERID']   = self.username
-            br['PASSWORD'] = self.password
-            br.submit()
-        return br
-
    def index_to_soup(self, url_or_raw, raw=False):
        '''
        OVERRIDE of class method
@ -138,6 +218,7 @@ class NYTimes(BasicNewsRecipe):
            return BeautifulSoup(_raw, markupMassage=massage)

        # Entry point
+        print "index_to_soup()"
        soup = get_the_soup( self.encoding, url_or_raw )
        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
        docEncoding =  str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
@ -151,6 +232,16 @@ class NYTimes(BasicNewsRecipe):

        return soup

+    def massageNCXText(self, description):
+        '''
+        Prepare a description for the table of contents.
+
+        HTML entities are converted to characters, every '&' is then
+        rewritten as '&#38;', and the result is passed through fixChars().
+        A false description (None or '') is returned unchanged.
+        '''
+        # Kindle TOC descriptions won't render certain characters
+        if not description:
+            return description
+        # BeautifulStoneSoup is not imported at module level alongside
+        # BeautifulSoup, so bring it into scope here to avoid a NameError.
+        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        # Replace '&' with '&#38;'
+        massaged = re.sub("&","&#38;", massaged)
+        return self.fixChars(massaged)
+
    def parse_index(self):
        articles = {}
        ans = []
@ -158,12 +249,14 @@ class NYTimes(BasicNewsRecipe):
        feed = key = 'All Top Stories'
        articles[key] = []
        ans.append(key)
+        self.log("Scanning 1 section ...")

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the outer table
        table = soup.find('table')
        previousTable = table
+        contentTable = None

        # Find the deepest table containing the stories
        while True :
@ -191,8 +284,9 @@ class NYTimes(BasicNewsRecipe):
                continue

        skipThisSection = False
-
+        todays_article_count = 0
        # Within this table are <font face="times new roman, times, san serif"> entries
+        self.log("Fetching feed Top Stories")
        for tr in storyblock.findAllNext('tr'):
            if tr.find('span') is not None :

@ -244,6 +338,7 @@ class NYTimes(BasicNewsRecipe):

                    # Fetch the article titles and URLs
                    articleCount = len(sectionblock.findAll('span'))
+                    todays_article_count += articleCount
                    for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
                        a = span.find('a', href=True)
                        url = re.sub(r'\?.*', '', a['href'])
@ -277,6 +372,7 @@ class NYTimes(BasicNewsRecipe):

                            if duplicateFound:
                                # Continue fetching, don't add this article
+                                todays_article_count -= 1
                                continue

                        if not articles.has_key(feed):
@ -284,11 +380,138 @@ class NYTimes(BasicNewsRecipe):
                        articles[feed].append(
                            dict(title=title, url=url, date=pubdate,
                                 description=description, author=author, content=''))
+#        self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))

        ans = self.sort_index_by(ans, {'Top Stories':-1})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        self.dump_ans(ans)
        return ans

+    def preprocess_html(self, soup):
+        '''
+        Return soup after passing it through strip_anchors().
+        '''
+        stripped = self.strip_anchors(soup)
+        return stripped
+
+    def postprocess_html(self, soup, first_fetch):
+        '''
+        Restyle a downloaded article in place and return it.
+
+        soup        -- parsed article; tags are moved, replaced and removed
+                       directly on this tree.
+        first_fetch -- second positional argument supplied by the caller;
+                       not used here.  It must not be named True: that would
+                       shadow the built-in inside this method, so that
+                       "headline_found = True" stored the caller's flag.
+
+        Steps: optionally keep one image per article, turn the kicker into
+        <h3>, captions into <em>, the headline into <h2 class="headline">,
+        a remaining <h1> into <h3>, class="bold" elements into <b>, add a
+        section header from the "dsk" meta tag, and tag the article body and
+        author id for CSS.
+        '''
+
+        def fixed_first(node):
+            # fixChars() only accepts text; a child tag is passed through
+            # unchanged instead of raising a TypeError in re.sub().
+            first = node.contents[0]
+            if isinstance(first, NavigableString):
+                return self.fixChars(first)
+            return first
+
+        if self.one_picture_per_article:
+            # Remove all images after first
+            largeImg = soup.find(True, {'class':'articleSpanImage'})
+            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+            if largeImg:
+                for inlineImg in inlineImgs:
+                    inlineImg.extract()
+            elif inlineImgs:
+                firstImg = inlineImgs[0]
+                for inlineImg in inlineImgs[1:]:
+                    inlineImg.extract()
+                # Move firstImg after headline
+                cgFirst = soup.find(True, {'class':'columnGroup  first'})
+                if cgFirst:
+                    # Strip all sibling NavigableStrings: noise
+                    for ns in cgFirst.findAll(text=True, recursive=False):
+                        ns.extract()
+                    # Walk the siblings looking for the headline; insertLoc
+                    # ends up as the index just past it.
+                    headline_found = False
+                    insertLoc = 0
+                    tag = cgFirst.find(True)
+                    while tag is not None:
+                        insertLoc += 1
+                        # Tag.get() tolerates tags without a class attribute,
+                        # where tag['class'] would raise KeyError.
+                        if isinstance(tag, Tag) and tag.get('class') == 'articleHeadline':
+                            headline_found = True
+                            break
+                        tag = tag.nextSibling
+                    if headline_found:
+                        cgFirst.insert(insertLoc,firstImg)
+                else:
+                    self.log(">>> No class:'columnGroup  first' found <<<")
+        # Change class="kicker" to <h3>
+        kicker = soup.find(True, {'class':'kicker'})
+        if kicker and kicker.contents and kicker.contents[0]:
+            h3Tag = Tag(soup, "h3")
+            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
+                             use_alt=False)))
+            kicker.replaceWith(h3Tag)
+
+        # Change captions to italic -1
+        for caption in soup.findAll(True, {'class':'caption'}) :
+            if caption and caption.contents and caption.contents[0]:
+                emTag = Tag(soup, "em")
+                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+                # Drop the trailing "More Photos" link text, if present
+                mp_off = c.find("More Photos")
+                if mp_off >= 0:
+                    c = c[:mp_off]
+                emTag.insert(0, c)
+                hrTag = Tag(soup, 'hr')
+                #hrTag['style'] = "margin-top:0em;margin-bottom:0em"
+                emTag.insert(1, hrTag)
+                caption.replaceWith(emTag)
+
+        # Change <nyt_headline> to <h2>
+        h1 = soup.find('h1')
+        if h1:
+            headline = h1.find("nyt_headline")
+            if headline and headline.contents:
+                tag = Tag(soup, "h2")
+                tag['class'] = "headline"
+                tag.insert(0, fixed_first(headline))
+                h1.replaceWith(tag)
+        else:
+            # Blog entry - replace headline, remove <hr> tags
+            headline = soup.find('title')
+            if headline and headline.contents:
+                tag = Tag(soup, "h2")
+                tag['class'] = "headline"
+                tag.insert(0, fixed_first(headline))
+                soup.insert(0, tag)
+                hrs = soup.findAll('hr')
+                for hr in hrs:
+                    hr.extract()
+
+        # Change <h1> to <h3> - used in editorial blogs
+        masthead = soup.find("h1")
+        if masthead and masthead.contents:
+            # Nuke the href
+            if masthead.a:
+                del(masthead.a['href'])
+            tag = Tag(soup, "h3")
+            tag.insert(0, fixed_first(masthead))
+            masthead.replaceWith(tag)
+
+        # Change <span class="bold"> to <b>
+        for subhead in soup.findAll(True, {'class':'bold'}) :
+            if subhead.contents:
+                bTag = Tag(soup, "b")
+                bTag.insert(0, subhead.contents[0])
+                subhead.replaceWith(bTag)
+
+        # Synthesize a section header
+        dsk = soup.find('meta', attrs={'name':'dsk'})
+        if dsk and dsk.has_key('content'):
+            hTag = Tag(soup,'h3')
+            hTag['class'] = 'section'
+            hTag.insert(0,NavigableString(dsk['content']))
+            articleTag = soup.find(True, attrs={'id':'article'})
+            if articleTag:
+                articleTag.insert(0,hTag)
+
+        # Add class="articleBody" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'articleBody'})
+        if divTag:
+            divTag['class'] = divTag['id']
+
+        # Add class="authorId" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'authorId'})
+        if divTag and divTag.contents and divTag.contents[0]:
+            tag = Tag(soup, "p")
+            tag['class'] = "authorId"
+            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+                             use_alt=False)))
+            divTag.replaceWith(tag)
+
+        return soup
+
    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
@ -297,94 +520,3 @@ class NYTimes(BasicNewsRecipe):
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup
-
-    def preprocess_html(self, soup):
-#         refresh = soup.find('meta', {'http-equiv':'refresh'})
-#         if refresh is None:
-#             return self.strip_anchors(soup)
-#
-#         content = refresh.get('content').partition('=')[2]
-#         raw = self.browser.open('http://www.nytimes.com'+content).read()
-#         soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
-        return self.strip_anchors(soup)
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is not None:
-            content = refresh.get('content').partition('=')[2]
-            raw = self.browser.open('http://www.nytimes.com'+content).read()
-            soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
-
-        soup = self.strip_anchors(soup)
-
-        # Test for empty content
-        body = soup.find('body')
-        tagCount = len(body.findAll(True))
-        if tagCount:
-#            print "%d tags in article" % tagCount
-            return soup
-        else:
-            print "no allowed content found, removing article"
-            raise Exception
-
-    def postprocess_html(self,soup, True):
-
-        # Change class="kicker" to <h3>
-        kicker = soup.find(True, {'class':'kicker'})
-        if kicker is not None :
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, kicker.contents[0])
-            kicker.replaceWith(h3Tag)
-
-        # Change captions to italic -1
-        for caption in soup.findAll(True, {'class':'caption'}) :
-            if caption is not None:
-                emTag = Tag(soup, "em")
-                emTag.insert(0, caption.contents[0])
-                hrTag = Tag(soup, 'hr')
-                emTag.insert(1, hrTag)
-                caption.replaceWith(emTag)
-
-        # Change <nyt_headline> to <h2>
-        headline = soup.find("nyt_headline")
-        if headline is not None :
-            tag = Tag(soup, "h2")
-            tag['class'] = "headline"
-            tag.insert(0, headline.contents[0])
-            soup.h1.replaceWith(tag)
-
-        # Change <h1> to <h3> - used in editorial blogs
-        masthead = soup.find("h1")
-        if masthead is not None :
-            # Nuke the href
-            if masthead.a is not None :
-                del(masthead.a['href'])
-            tag = Tag(soup, "h3")
-            tag.insert(0, masthead.contents[0])
-            soup.h1.replaceWith(tag)
-
-        # Change <span class="bold"> to <b>
-        for subhead in soup.findAll(True, {'class':'bold'}) :
-            bTag = Tag(soup, "b")
-            bTag.insert(0, subhead.contents[0])
-            subhead.replaceWith(bTag)
-
-        # Synthesize a section header
-        dsk = soup.find('meta', attrs={'name':'dsk'})
-        if dsk is not None and dsk.has_key('content'):
-            hTag = Tag(soup,'h3')
-            hTag['class'] = 'section'
-            hTag.insert(0,NavigableString(dsk['content']))
-            articleTag = soup.find(True, attrs={'id':'article'})
-            articleTag.insert(0,hTag)
-
-        # Add class="articleBody" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'articleBody'})
-        if divTag is not None :
-            divTag['class'] = divTag['id']
-
-        # Add class="authorId" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'authorId'})
-        if divTag is not None :
-            divTag['class'] = divTag['id']
-
-        return soup
-
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@ -20,7 +20,7 @@ class ANDROID(USBMS):
    VENDOR_ID   = {
            0x0bb4 : { 0x0c02 : [0x100], 0x0c01 : [0x100]},
            0x22b8 : { 0x41d9 : [0x216]},
-            0x18d1 : { 0x4e11 : [0x0100], 0x4e12: [0x0100]},
+            0x18d1 : { 0x4e11 : [0x0100, 0x226], 0x4e12: [0x0100, 0x226]},
            0x04e8 : { 0x681d : [0x0222]},
            }
    EBOOK_DIR_MAIN = ['wordplayer/calibretransfer', 'eBooks/import', 'Books']