# calibre/recipes/ba_herald.recipe

__license__   = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.buenosairesherald.com
'''

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class BuenosAiresHerald(BasicNewsRecipe):
    """Recipe for the Buenos Aires Herald (www.buenosairesherald.com).

    The URLs listed in ``feeds`` are fetched and parsed as HTML pages by
    ``parse_index`` (one page per section) rather than being consumed as
    syndication feeds. Every article found there is rewritten to its
    printer-friendly page by ``print_version``.
    """

    # --- Publication metadata -------------------------------------------
    title                 = 'Buenos Aires Herald'
    __author__            = 'Darko Miletic'
    description           = 'A world of information in a few words'
    publisher             = 'Editorial Nefir S.A.'
    category              = 'news, politics, Argentina'
    language              = 'en_AR'
    publication_type      = 'newspaper'

    # --- Fetch / conversion behaviour -------------------------------------
    oldest_article        = 2
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf8'
    use_embedded_content  = False
    remove_empty_feeds    = True

    # --- Site locations ---------------------------------------------------
    masthead_url          = 'http://www.buenosairesherald.com/img/logo.jpg'
    # Site root; article hrefs found on the section pages are appended to it.
    INDEX                 = 'http://www.buenosairesherald.com'

    extra_css             = """
                               body{font-family: Arial,Helvetica,sans-serif }
                               img{margin-bottom: 0.4em; display:block}
                               h1{font-family: Georgia,serif}
                               #fecha{text-align: right; font-size: small}
                            """

    # Metadata handed to the conversion pipeline; mirrors the class
    # attributes defined above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    remove_tags    = [dict(name=['meta', 'link', 'iframe'])]
    keep_only_tags = [dict(attrs={'class': 'nota_texto p'})]

    # (section title, section page URL) pairs walked by parse_index().
    feeds = [
        (u'Argentina', u'http://www.buenosairesherald.com/argentina'),
        (u'World', u'http://www.buenosairesherald.com/world'),
        (u'Latin America', u'http://www.buenosairesherald.com/latin-america'),
        (u'Entertainment', u'http://www.buenosairesherald.com/entertainment'),
        (u'Sports', u'http://www.buenosairesherald.com/sports'),
    ]

    def print_version(self, url):
        """Return the printer-friendly URL for the article at *url*.

        The article id is the path segment that immediately follows the
        last ``/article/`` in *url*. When *url* contains no such segment
        (or the id is empty) the URL is returned unchanged instead of
        building a print link around a meaningless id.
        """
        _, marker, tail = url.rpartition('/article/')
        artid = tail.partition('/')[0]
        if not marker or not artid:
            # Not an article link we know how to rewrite.
            return url
        return 'http://www.buenosairesherald.com/articles/print.aspx?ix=' + artid

    def parse_index(self):
        """Build the article index by scraping every section page.

        Returns a list of ``(section title, articles)`` tuples, where
        ``articles`` is a list of dicts with the keys ``title``, ``date``,
        ``url`` and ``description``. The headline text is used for both
        the title and the description.
        """
        # The date stamp does not depend on the article, so format it once.
        date = strftime(self.timefmt)
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(
                0, 'Fetching feed %s...' % (feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll('div', attrs={'class': 'nota_texto_seccion'}):
                heading = item.h2
                if heading is None:
                    # Teaser block without a headline: nothing to link to.
                    continue
                atag = heading.find('a')
                # Tag.get() is used instead of has_key(), which newer
                # BeautifulSoup releases no longer provide.
                href = atag.get('href') if atag else None
                if not href:
                    continue
                headline = self.tag_to_string(heading)
                articles.append({
                    'title': headline,
                    'date': date,
                    'url': self.INDEX + href,
                    'description': headline,
                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds