calibre/recipes/the_new_republic.recipe

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict

class TNR(BasicNewsRecipe):
    """Recipe that builds the article index for the current issue of
    The New Republic from the magazine's issue listing page."""

    title       = 'The New Republic'
    __author__  = 'Rick Shang'

    description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})]
    no_javascript = True
    no_stylesheets = True

    # Matches everything up to and including the last "by " in a paragraph,
    # so that only the author name remains. DOTALL lets '.' span newlines;
    # it must be given as a flag at compile time (the 4th positional
    # argument of re.sub is `count`, not `flags`).
    _BYLINE_PREFIX = re.compile(r'.*by\s', re.DOTALL)

    def parse_index(self):
        """Return the table of contents of the current issue.

        Fetches the issue listing page, follows the link found inside the
        ``current_issue`` div and walks the ``<p>`` elements of the issue
        body. A paragraph containing ``<em>`` starts a new section, one
        containing ``<b>`` sets a subsection (used as a title prefix), and
        any other paragraph containing a link is recorded as an article of
        the current section.

        As side effects, sets ``self.timefmt`` from the issue date and
        ``self.cover_url`` from images found in the issue body.

        Returns:
            A list of ``(section_title, articles)`` tuples in page order,
            where ``articles`` is a list of dicts with the keys ``title``,
            ``url``, ``description`` and ``date``.
        """
        # Go to the issue
        listing = self.index_to_soup('http://www.tnr.com/magazine-issues')
        issue = listing.find('div', attrs={'id': 'current_issue'})

        # Find date
        date = self.tag_to_string(issue.find('div', attrs={'class': 'date'})).strip()
        self.timefmt = u' [%s]' % date

        # Go to the main body
        current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)
        div = soup.find('div', attrs={'class': 'article_detail_body'})

        # Find cover: the first image in the body, if there is one. Images
        # inside later paragraphs override it in the loop below.
        first_img = div.find('img', src=True)
        if first_img is not None:
            self.cover_url = first_img['src']

        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('p'):
            em = post.find('em')
            b = post.find('b')
            a = post.find('a', href=True)
            img = post.find('img', src=True)

            if img is not None:
                self.cover_url = img['src'].strip()

            if em is not None:
                # A new section resets any pending subsection.
                section_title = self.tag_to_string(em).strip()
                subsection_title = ''
            elif b is not None:
                subsection_title = self.tag_to_string(b).strip()
            elif a is not None:
                prefix = (subsection_title + ': ') if subsection_title else ''
                # Rewrite the link so that it points at the /print path.
                # This is a literal host substitution, no regex needed.
                url = a['href'].replace('www.tnr.com', 'www.tnr.com/print')
                author = self._BYLINE_PREFIX.sub('', self.tag_to_string(post)).strip()
                title = prefix + self.tag_to_string(a).strip() + u' (%s)' % author
                # Sections only appear in the result once they have at
                # least one article.
                feeds.setdefault(section_title, []).append(
                    {'title': title, 'url': url, 'description': '', 'date': ''})

        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        return list(feeds.items())