calibre/recipes/the_new_republic.recipe

import re
from calibre.web.feeds.recipes import BasicNewsRecipe


class TNR(BasicNewsRecipe):
    """Recipe that builds a single feed from The New Republic's current issue.

    The recipe logs in with the user's credentials (see ``get_browser``),
    scrapes the current-issue page for article links and points each one at
    the site's ``/node/<id>/print`` URL (see ``parse_index``).
    """

    title = 'The New Republic'
    __author__ = 'Krittika Goyal'

    description = '''The New Republic is a journal of opinion with an emphasis
    on politics and domestic and international affairs. It carries feature
    articles by staff and contributing editors. The second half of each issue
    is devoted to book and the arts, theater, motion pictures, music and art.'''

    language = 'en'
    encoding = 'UTF-8'
    # get_browser() below logs in with self.username / self.password.
    needs_subscription = True

    # Strip HTML comments and <script> elements. parse_index() also applies
    # these patterns by hand to the index page before parsing it.
    preprocess_regexps = [
        (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
        (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
    ]

    def get_browser(self):
        """Return a browser that has been logged in to newrepublic.com.

        Raises:
            ValueError: if the page returned after submitting the login form
                does not contain the ``SIGN OUT`` marker.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.newrepublic.com/user')
        # NOTE(review): nr=1 picks the form by position on the page -
        # presumably the login form; confirm against the live page.
        br.select_form(nr=1)
        # The username control is tried under the name 'user' first and
        # 'name' second. Only ordinary errors are caught, so Ctrl-C and
        # SystemExit still propagate.
        try:
            br['user'] = self.username
        except Exception:
            br['name'] = self.username
        br['pass'] = self.password
        self.log('Logging in...')
        raw = br.submit().read()
        # The response body may be bytes; match the marker's type to it so
        # the membership test cannot raise TypeError.
        marker = b'SIGN OUT' if isinstance(raw, bytes) else 'SIGN OUT'
        if marker not in raw:
            raise ValueError(
                'Failed to log in to tnr.com, check your username and password')
        self.log('Logged in successfully')
        return br

    def parse_index(self):
        """Scrape the current-issue page and return its articles as one feed.

        Returns:
            A one-element list ``[(feed_title, articles)]`` where each article
            is a dict with the keys ``title``, ``url``, ``description`` and
            ``date`` (the last two are always empty strings).
        """
        raw = self.index_to_soup(
            'http://www.newrepublic.com/current-issue', raw=True)
        # preprocess_regexps are text patterns with text replacements, so
        # make sure the page is text before applying them.
        if isinstance(raw, bytes):
            raw = raw.decode(self.encoding or 'utf-8', 'replace')
        for pat, sub in self.preprocess_regexps:
            raw = pat.sub(sub, raw)
        soup = self.index_to_soup(raw)
        feed_title = 'The New Republic Magazine Articles'

        articles = []
        for div in soup.findAll('div', attrs={'class': lambda x: x and 'field-item' in x.split()}):
            # First link in the div whose class is not exactly 'author'.
            a = div.find('a', href=True, attrs={
                         'class': lambda x: x != 'author'})
            if a is None:
                continue
            art_title = self.tag_to_string(a)
            # Only links with a numeric path segment are kept; that number is
            # used as the node id in the rewritten URL.
            num = re.search(r'/(\d+)/', a.get('href'))
            if num is None:
                continue
            url = 'http://www.newrepublic.com/node/%s/print' % num.group(1)
            self.log.info('\tFound article:', art_title, 'at', url)
            articles.append({'title': art_title, 'url': url,
                             'description': '', 'date': ''})

        return [(feed_title, articles)]