calibre/recipes/harpers_full.recipe

__license__   = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
harpers.org - paid subscription/ printed issue articles
This recipe only get's article's published in text format
images and pdf's are ignored
'''

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class Harpers_full(BasicNewsRecipe):
    title                 = "Harper's Magazine - articles from printed edition"
    __author__            = 'Darko Miletic'
    description           = "Harper's Magazine: Founded June 1850."
    publisher             = "Harpers's"
    category              = 'news, politics, USA'
    oldest_article        = 30
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    delay                 = 1
    language              = 'en'
    needs_subscription    = True
    masthead_url          = 'http://www.harpers.org/media/image/Harpers_305x100.gif'
    publication_type      = 'magazine'
    INDEX                 = strftime('http://www.harpers.org/archive/%Y/%m')
    LOGIN                 = 'http://www.harpers.org'
    cover_url             = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')
    extra_css             = ' body{font-family: "Georgia",serif} '

    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : language
                        }

    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
    remove_tags = [
                     dict(name='table', attrs={'class':['rcnt','rcnt topline']})
                    ,dict(name='link')
                  ]
    remove_attributes=['xmlns']

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(nr=1)
            br['handle'  ] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        articles = []
        print 'Processing ' + self.INDEX
        soup = self.index_to_soup(self.INDEX)
        for item in soup.findAll('div', attrs={'class':'title'}):
            text_link = item.parent.find('img',attrs={'alt':'Text'})
            if text_link:
                url   = self.LOGIN + item.a['href']
                title = item.a.contents[0]
                date  = strftime(' %B %Y')
                articles.append({
                                  'title'      :title
                                 ,'date'       :date
                                 ,'url'        :url
                                 ,'description':''
                                })
        return [(soup.head.title.string, articles)]