Mirror of https://github.com/kovidgoyal/calibre.git

commit 9ff677f415 (parent ff5490e87f)

    new recipe for the Top Stories from the NY Times by Greg Riker
@@ -766,8 +766,8 @@ class Manifest(object):
         data = self.oeb.html_preprocessor(data)

         # Remove DOCTYPE declaration as it messes up parsing
-        # Inparticular it causes a tostring to insert xmlns
-        # declarations, which messes up the coesrcing logic
+        # Inparticular it causes tostring to insert xmlns
+        # declarations, which messes up the coercing logic
         idx = data.find('<html')
         if idx > -1:
             pre = data[:idx]
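As a reading aid (not part of the commit): a minimal standalone sketch of what the slicing in the hunk above does. Everything before '<html' (including the DOCTYPE declaration) is split off into pre before parsing, so the parser never sees the declaration; the sample markup here is hypothetical.

    # Standalone sketch (not calibre code) of the DOCTYPE-stripping step:
    # cut everything before '<html' so the parser never sees the declaration.
    data = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    <html><body><p>hello</p></body></html>'''

    idx = data.find('<html')
    if idx > -1:
        pre  = data[:idx]   # the stripped DOCTYPE prologue
        data = data[idx:]   # clean markup, starting at <html
    print(data.splitlines()[0])   # -> <html><body><p>hello</p></body></html>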
@@ -15,7 +15,7 @@ recipe_modules = ['recipe_' + r for r in (
     'demorgen_be', 'de_standaard', 'ap', 'barrons', 'chr_mon', 'cnn', 'faznet',
     'jpost', 'jutarnji', 'nasa', 'reuters', 'spiegelde', 'wash_post', 'zeitde',
     'blic', 'novosti', 'danas', 'vreme', 'times_online', 'the_scotsman',
-    'nytimes_sub', 'security_watch', 'cyberpresse', 'st_petersburg_times',
+    'nytimes_sub', 'nytimes', 'security_watch', 'cyberpresse', 'st_petersburg_times',
     'clarin', 'financial_times', 'heise', 'le_monde', 'harpers', 'science_aas',
     'science_news', 'the_nation', 'lrb', 'harpers_full', 'liberation',
     'linux_magazine', 'telegraph_uk', 'utne', 'sciencedaily', 'forbes',
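Again as an aside, not part of the commit: the hunk above registers the new recipe by adding its suffix to the recipe_modules tuple, which holds bare module names like 'recipe_nytimes'. A hedged sketch of how such a list can be turned into imported modules; the loader shown is illustrative, not calibre's actual one.

    # Illustrative sketch (not calibre's real loader): module names are built
    # from suffixes, then imported by name.
    import importlib

    recipe_modules = ['recipe_' + r for r in (
        'nytimes_sub', 'nytimes',   # 'nytimes' is the entry this commit adds
    )]

    for name in recipe_modules:
        try:
            module = importlib.import_module(name)   # hypothetical package layout
        except ImportError:
            module = None                            # recipe not on this path
        print(name, '->', module)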
@@ -1,110 +1,241 @@
 #!/usr/bin/env python
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
 
+__license__ = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
-mobile.nytimes.com
+nytimes.com
 '''
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from lxml import html
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
 
-class NYTimesMobile(BasicNewsRecipe):
+class NYTimes(BasicNewsRecipe):
 
-    title = 'The New York Times'
-    __author__ = 'Kovid Goyal'
+    title = 'NYTimes Top Stories'
+    __author__ = 'Greg Riker'
     language = _('English')
-    description = 'Daily news from the New York Times (mobile version)'
-    timefmt = ' [%a, %d %b, %Y]'
-    multithreaded_fetch = True
-    max_articles_per_feed = 15
+    description = 'Top Stories from the New York Times'
+    #max_articles_per_feed = 3
+    timefmt = ''
+    needs_subscription = False
+    remove_tags_before = dict(id='article')
+    remove_tags_after = dict(id='article')
+    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink', 'clearfix']}),
+                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
+                   dict(name=['script', 'noscript', 'style'])]
+    encoding = 'cp1252'
     no_stylesheets = True
-    extra_css = '''
-        .h1 { font-size: x-large; font-weight: bold; font-family: sans-serif; text-align: left }
-        .h2 { font-size: large; font-weight: bold }
-        .credit { font-size: small }
-        .aut { font-weight: bold }
-        .bodycontent { font-family: serif }
-    '''
-
-    remove_tags = [
-        dict(name='div', attrs={'class':['banner center', 'greyBackBlackTop', 'c bB']}),
-        dict(name='a', href='/main')
-    ]
-
-    remove_tags_after = [
-        dict(name='a', attrs={'name': 'bottom'})
-    ]
-
-    def image_url_processor(self, baseurl, url):
-        return re.sub(r'(&|&amp;).*', '', url)
-
-    def get_browser(self):
-        return BasicNewsRecipe.get_browser(mobile_browser=True)
-
-    def download(self, for_lrf=False):
-        if for_lrf:
-            self.max_articles_per_feed = 10
-        return BasicNewsRecipe.download(self, for_lrf=for_lrf)
-
-    def process_section(self, href):
-        raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
-        articles = []
-        while True:
-            root = html.fromstring(raw)
-            for art in self.find_articles(root):
-                append = True
-                for x in articles:
-                    if x['title'] == art['title']:
-                        append = False
-                        break
-                if append: articles.append(art)
-            more = root.xpath('//a[starts-with(@href, "section") and contains(text(), "MORE")]')
-            if not more:
-                break
-            href = more[0].get('href')
-            raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
-        return articles
-
-    def find_articles(self, root):
-        for a in root.xpath('//a[@accesskey]'):
-            href = a.get('href')
-            if href.startswith('http://'):
-                url = href
-            else:
-                url = 'http://mobile.nytimes.com/article' + href[href.find('?'):]+'&single=1',
-            yield {
-                'title': a.text.strip(),
-                'date' : '',
-                'url' : url,
-                'description': '',
-            }
+    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
+    extra_css = '.headline {text-align:left;}\n\
+                 .byline {font:monospace; margin-bottom:0px;}\n\
+                 .source {align:left;}\n\
+                 .credit {align:right;}\n'
+
+    flatPeriodical = True
 
     def parse_index(self):
-        raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
-        root = html.fromstring(raw)
-        feeds = [('Latest news', list(self.find_articles(root)))]
-        for a in root.xpath('//a[starts-with(@href, "section")]'):
-            title = a.text.replace('&#187;', '').replace(u'\xbb', '').strip()
-            print 'Processing section:', title
-            articles = self.process_section(a.get('href'))
-            feeds.append((title, articles))
-        return feeds
-
-    def postprocess_html(self, soup, first_fetch):
-        for img in soup.findAll('img', width=True):
-            try:
-                width = int(img['width'].replace('px', ''))
-                if width < 5:
-                    img.extract()
-                    continue
-            except:
-                pass
-            del img['width']
-            del img['height']
-            del img.parent['style']
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
+
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+
+        articles = {}
+
+        ans = []
+        if self.flatPeriodical :
+            feed = key = 'All Top Stories'
+            articles[key] = []
+            ans.append(key)
+        else :
+            key = None
+
+        sections = { 'topstories'   : 'Top Stories',
+                     'world'        : 'World',
+                     'us'           : 'U.S.',
+                     'politics'     : 'Politics',
+                     'business'     : 'Business',
+                     'technology'   : 'Technology',
+                     'sports'       : 'Sports',
+                     'arts'         : 'Arts',
+                     'newyorkregion': 'New York/Region',
+                     'travel'       : 'Travel',
+                     'editorials'   : 'Editorials',
+                     'oped'         : 'Op-Ed'
+                   }
+
+        #excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
+        excludeSectionKeywords = []
+
+        # Fetch the outer table
+        table = soup.find('table')
+        previousTable = table
+        contentTable = None
+
+        # Find the deepest table containing the stories
+        while True :
+            table = table.find('table')
+            if table.find(text=re.compile('top stories start')) :
+                if self.verbose > 2 : self.log( "*********** dropping one level deeper **************")
+                previousTable = table
+                continue
+            else :
+                if self.verbose > 2 : self.log( "found table with top stories")
+                table = previousTable
+                if self.verbose > 2 : self.log( "lowest table containing 'top stories start:\n%s" % table)
+                break
+
+        # There are multiple subtables, find the one containing the stories
+        for block in table.findAll('table') :
+            if block.find(text=re.compile('top stories start')) :
+                if self.verbose > 2 : self.log( "found subtable with top stories")
+                table = block
+                if self.verbose > 2 : self.log( "lowest subtable containing 'top stories start:\n%s" % table)
+                break
+            else :
+                if self.verbose > 2 : self.log( "trying next subtable")
+                continue
+
+        # Again there are multiple subtables, find the one containing the stories
+        for storyblock in table.findAll('table') :
+            if storyblock.find(text=re.compile('top stories start')) :
+                if self.verbose > 2 : self.log( "found subsubtable with top stories\n" )
+                # table = storyblock
+                if self.verbose > 2 : self.log( "\nlowest subsubtable containing 'top stories start:\n%s" % storyblock)
+                break
+            else :
+                if self.verbose > 2 : self.log( "trying next subsubtable")
+                continue
+
+        skipThisSection = False
+
+        # Within this table are <font face="times new roman, times, san serif"> entries
+        for tr in storyblock.findAllNext('tr'):
+            if tr.find('span') is not None :
+
+                sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
+                                                            'times new roman,times, sans serif',
+                                                            'times new roman, times, sans serif']})
+                if self.verbose > 2 : self.log( "----------- new tr ----------------")
+                section = None
+                bylines = []
+                descriptions = []
+                pubdate = None
+
+                # Get the Section title
+                for (x,i) in enumerate(sectionblock.contents) :
+                    skipThisSection = False
+                    # Extract the section title
+                    if ('Comment' in str(i.__class__)) :
+                        if 'start(name=' in i :
+                            section = i[i.find('=')+1:-2]
+                            if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
+
+                            # Check for excluded section
+                            if len(excludeSectionKeywords):
+                                key = sections[section]
+                                excluded = re.compile('|'.join(excludeSectionKeywords))
+                                if excluded.search(key) or articles.has_key(key):
+                                    if self.verbose > 2 : self.log("Skipping section %s" % key)
+                                    skipThisSection = True
+                                    break
+
+                            if not self.flatPeriodical :
+                                articles[key] = []
+                                ans.append(key)
+
+                # Get the bylines and descriptions
+                if not skipThisSection :
+                    for (x,i) in enumerate(sectionblock.contents) :
+
+                        # Extract the bylines and descriptions
+                        if (i.string is not None) and \
+                           (i.string.strip() > "") and \
+                           not ('Comment' in str(i.__class__)) :
+
+                            contentString = i.strip().encode('utf-8')
+                            if contentString[0:3] == 'By ' :
+                                bylines.append(contentString)
+                            else :
+                                descriptions.append(contentString)
+
+                    # Fetch the article titles and URLs
+                    articleCount = len(sectionblock.findAll('span'))
+                    for (i,span) in enumerate(sectionblock.findAll('span')) :
+                        a = span.find('a', href=True)
+                        #if not a:
+                            #continue
+                        url = re.sub(r'\?.*', '', a['href'])
+                        url += '?pagewanted=all'
+                        title = self.tag_to_string(a, use_alt=True)
+                        if self.flatPeriodical :
+                            # prepend the section name
+                            title = sections[section] + " : " + title
+                        if not isinstance(title, unicode):
+                            title = title.decode('utf-8', 'replace')
+                        description = descriptions[i]
+                        if len(bylines) == articleCount :
+                            author = bylines[i]
+                        else :
+                            author = None
+
+                        if self.verbose > 2 : self.log( "      title: %s" % title)
+                        if self.verbose > 2 : self.log( "        url: %s" % url)
+                        if self.verbose > 2 : self.log( "     author: %s" % author)
+                        if self.verbose > 2 : self.log( "description: %s" % description)
+
+                        if not self.flatPeriodical :
+                            feed = key
+
+                        if not articles.has_key(feed):
+                            if self.verbose > 2 : self.log( "adding %s to articles[]" % feed)
+                            articles[feed] = []
+                        if self.verbose > 2 : self.log( "     adding: %s to articles[%s]\n" % (title, feed))
+                        articles[feed].append(
+                            dict(title=title, url=url, date=pubdate,
+                                 description=description, author=author, content=''))
+
+        ans = self.sort_index_by(ans, {'Top Stories':-1})
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        #sys.exit(1)
+
+        return ans
+
+    def postprocess_html(self, soup, first_fetch):
+        if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ")
+
+        # Change captions to italic -1
+        for caption in soup.findAll(True, {'class':'caption'}) :
+            emTag = Tag(soup, "em")
+            #emTag['class'] = "caption"
+            #emTag['font-size-adjust'] = "-1"
+            emTag.insert(0, caption.contents[0])
+            hrTag = Tag(soup, 'hr')
+            emTag.insert(1, hrTag)
+            caption.replaceWith(emTag)
+
+        # Change <nyt_headline> to <h2>
+        headline = soup.div.div.div.div.div.h1.nyt_headline
+        tag = Tag(soup, "h2")
+        tag['class'] = "headline"
+        tag.insert(0, headline.contents[0])
+        soup.h1.replaceWith(tag)
         return soup
+
+    def postprocess_book(self, oeb, opts, log) :
+        log( " ********** recipe.postprocess_book ********** ")
+        log( list(oeb.toc) )
+        log( "oeb: %s" % oeb.toc)
+        log( "opts: %s" % opts.verbose)
+        for sections in oeb.toc :
+            log( "section:")
+            for articleTOC in sections:
+                log( "      title: %s" % articleTOC.title)
+                log( "     author: %s" % articleTOC.author)
+                log( "description: %s" % articleTOC.description)
+                log( "       href: %s" % articleTOC.href)
+                log( "    content: %s" % oeb.manifest.hrefs[articleTOC.href])
+        return
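One more aside, not part of the commit: a standalone sketch of the section-marker slicing that the new parse_index() relies on. The Today's Headlines page labels each block with an HTML comment matching the 'start(name=' pattern the recipe searches for; the sample comment text below is hypothetical, and assumes a space before the closing --> (which is what the [:-2] slice implies, since BeautifulSoup Comment nodes yield the raw text between <!-- and -->).

    # Illustration of the comment slicing in parse_index().
    sections = {'topstories': 'Top Stories', 'oped': 'Op-Ed'}

    comment = ' start(name=topstories) '           # a Comment node's text
    if 'start(name=' in comment:
        section = comment[comment.find('=')+1:-2]  # strips the trailing ') '
        print(sections[section])                   # -> Top Stories

The recovered key indexes the sections dict; the flatPeriodical switch then decides whether each key becomes its own feed or is folded into the single 'All Top Stories' feed.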