calibre/resources/recipes/nytimes_sub.recipe

#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class NYTimes(BasicNewsRecipe):
    '''
    Recipe that builds its article index from the nytimes.com "Today's Paper"
    page, logging in first when a username and password are available.
    '''

    title       = 'The New York Times (subscription)'
    __author__  = 'Kovid Goyal'
    language = 'en'

    description = 'Daily news from the New York Times (subscription version)'
    timefmt = ''
    needs_subscription = True
    # Keep only the element with id="article" and what it contains.
    remove_tags_before = dict(id='article')
    remove_tags_after  = dict(id='article')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
                   dict(name=['script', 'noscript', 'style'])]
    encoding = 'cp1252'
    no_stylesheets = True
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    def get_browser(self):
        '''
        Return a browser object, submitting the nytimes.com login form first
        when both ``self.username`` and ``self.password`` are set.
        '''
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.select_form(name='login')
            br['USERID']   = self.username
            br['PASSWORD'] = self.password
            br.submit()
        return br

    def parse_index(self):
        '''
        Scrape the "Today's Paper" index page and group its stories by section.

        Returns a list of ``(section title, [article dict, ...])`` tuples,
        ordered by ``sort_index_by``. Each article dict has the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``.

        Sections whose title matches one of the excluded keywords are skipped
        together with their stories. Stories found before the first section
        headline are returned under an 'Uncategorized' section.
        '''
        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

        def feed_title(div):
            # Only the text directly inside the headline element, not the
            # text of nested tags.
            return ''.join(div.findAll(text=True, recursive=False)).strip()

        def find_author(div):
            # Prefer the headline author element, fall back to the byline.
            for css_class in ('storyheadline-author', 'byline'):
                tag = div.find(True, attrs={'class':css_class})
                if tag:
                    return self.tag_to_string(tag, use_alt=False)
            return ''

        excludeSectionKeywords = ['Dining','Styles']
        # Compiled once: the pattern does not change between headlines.
        excluded = re.compile('|'.join(excludeSectionKeywords))

        articles = {}
        key = None
        ans = []
        # True while walking the stories of an excluded section.
        skip_section = False
        pubdate = strftime('%a, %d %b')

        # Find each instance of class="section-headline", class="story", class="story headline"
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline']}):

            if div['class'] == 'section-headline':
                key = string.capwords(feed_title(div))
                skip_section = bool(excluded.search(key))
                if skip_section:
                    self.log("Skipping section %s" % key)
                    continue

                # A repeated headline continues the existing section instead
                # of discarding its stories and listing the section twice.
                if key not in articles:
                    articles[key] = []
                    ans.append(key)

            elif div['class'] in ['story', 'story headline']:
                if skip_section:
                    continue
                a = div.find('a', href=True)
                if not a:
                    continue
                # Drop any existing query string and request the full article
                # on a single page.
                url = re.sub(r'\?.*', '', a['href'])
                url += '?pagewanted=all'
                if 'podcasts' in url:
                    continue
                title = self.tag_to_string(a, use_alt=True).strip()

                description = ''
                summary = div.find(True, attrs={'class':'summary'})
                if summary:
                    description = self.tag_to_string(summary, use_alt=False)

                author = find_author(div)

                feed = key if key is not None else 'Uncategorized'
                if feed not in articles:
                    # Only reached for 'Uncategorized': register it in the
                    # section list so its stories are not silently dropped.
                    articles[feed] = []
                    ans.append(feed)
                articles[feed].append(
                              dict(title=title, url=url, date=pubdate,
                                   description=description, author=author,
                                   content=''))

        ans = self.sort_index_by(ans, {'The Front Page':-1,
                                       'Dining In, Dining Out':1,
                                       'Obituaries':2})
        ans = [(section, articles[section]) for section in ans
               if section in articles]

        return ans

    def preprocess_html(self, soup):
        '''
        Follow a ``<meta http-equiv="refresh">`` redirect, if the page has one.

        Returns ``soup`` unchanged when there is no refresh tag or the tag
        carries no target; otherwise returns a new soup built from the page
        the refresh points to.
        '''
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        # The content attribute looks like "<delay>;url=<target>"; the target
        # is whatever follows the first '='.
        target = (refresh.get('content') or '').partition('=')[2].strip()
        if not target:
            # Nothing to follow: keep the page we already have rather than
            # fetching the bare site root.
            return soup
        if not re.match(r'https?://', target, re.IGNORECASE):
            # Site-relative target.
            target = 'http://www.nytimes.com' + target
        raw = self.browser.open(target).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))