# calibre/recipes/le_temps.recipe

#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

# -------------------------------
#   Modified by Roland Kessi - February 2014
# -------------------------------

from calibre.web.feeds.news import BasicNewsRecipe


class LeTemps(BasicNewsRecipe):
    '''
    Calibre news recipe for the newspaper Le Temps (letemps.ch).

    The recipe logs in with the user's subscription credentials, reads the
    RSS feeds listed in ``feeds`` and trims every downloaded page down to
    its article content.
    '''

    # --- Recipe metadata -------------------------------------------------
    title = u'Le Temps'
    # Articles older than this many days are skipped.
    oldest_article = 7
    # Upper bound on the number of articles taken from each feed.
    max_articles_per_feed = 100
    __author__ = 'Kovid Goyal'
    description = 'French news. Needs a subscription from http://www.letemps.ch'
    # Drop the site's own stylesheets and scripts; styling comes from
    # extra_css instead.
    no_stylesheets = True
    remove_javascript = True
    # Follow links found in an article one level deep, restricted to the
    # URLs that match match_regexps.
    # NOTE(review): the pattern looks like it targets the "|<page number>"
    # continuation pages of an article - confirm. The dots in the host name
    # are not escaped, so they match any character.
    recursions = 1
    encoding = 'UTF-8'
    match_regexps = [r'http://www.letemps.ch/Page/Uuid/[-0-9a-f]+\|[1-9]']
    language = 'fr'
    # Credentials are mandatory: get_browser() reads self.username and
    # self.password to log in.
    needs_subscription = True
    simultaneous_downloads = 5
    # Download the linked article pages instead of using the content
    # embedded in the feed entries.
    use_embedded_content = False
    remove_empty_feeds = True

    def get_browser(self):
        '''
        Return a browser that is logged in to letemps.ch.

        The login page is opened, the form at index 1 is filled in with
        ``self.username`` and ``self.password`` and submitted. The login is
        considered failed when the response still contains a ``>Login``
        marker.

        :return: The logged-in browser created by
                 ``BasicNewsRecipe.get_browser``.
        :raises ValueError: If the site rejected the credentials.
        '''
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.letemps.ch/login')
        br.select_form(nr=1)
        br['username'] = self.username
        br['password'] = self.password
        raw = br.submit().read()
        # The response body is a byte string. Decode it before searching so
        # the check works on Python 3 (where ``str in bytes`` raises
        # TypeError) as well as on Python 2.
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8', 'replace')
        if '>Login' in raw:
            raise ValueError('Failed to login to letemps.ch. Check '
                             'your username and password')
        return br

    def get_article_url(self, article):
        '''
            Override in a subclass to customize extraction of the :term:`URL` that points
            to the content for each article. Return the
            article URL. It is called with `article`, an object representing a parsed article
            from a feed. See `feedparser <http://packages.python.org/feedparser/>`_.
            By default it looks for the original link (for feeds syndicated via a
            service like feedburner or pheedo) and if found,
            returns that or else returns
            `article.link <http://packages.python.org/feedparser/reference-entry-link.html>`_.
        '''
        # =======================================================================
        # Avoid going through http://rss.feedsportal.com/...
        # =======================================================================
        for key in article.keys():
            if key.endswith('_origlink'):
                url = article[key]
                if url and url.startswith('http://'):
                    print ('Url is :', url)
                    return url
        ans = article.get('link', None)
        if not ans and getattr(article, 'links', None):
            for item in article.links:
                if item.get('rel', 'alternate') == 'alternate':
                    ans = item['href']
                    break
        pos = ans.find('letemps0Bch')
        if pos > -1:
            ans = 'http://www.' + ans[pos:]
            ans = ans.replace('0A', '0')
            ans = ans.replace('0B', '.')
            ans = ans.replace('0C', '/')
            ans = ans.replace('0E', '-')
        return ans

    # Only the <div id="content"> element of each downloaded page is kept.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'content'}),
    ]
    # Elements stripped from the page, selected by id or class.
    # NOTE(review): judging by their names these are galleries, tab and
    # social-button lists, large images, ad/banner boxes, related-story
    # boxes and video containers - verify against the live markup.
    remove_tags = [
        dict(name='div', attrs={'id': 'html5_gallery'}),
        dict(name='ul', attrs={'class': ['tabs', 'social-buttons cf']}),
        dict(name='img', attrs={'class': ['bigImg']}),
        dict(name='div', attrs={'class': [
                                'box function', 'contentInserts', 'box banner', 'related-stories',
                                'box additional', 'galleryOverview', 'position', 'rightAd', 'bottomAd', 'video',
                                ]}),
    ]

    # Stylesheet applied to the generated book; it replaces the site's own
    # CSS, which is dropped because no_stylesheets is set.
    # NOTE(review): the rule starting with '#h2' is an ID selector, whereas a
    # plain 'h2' rule also exists further down - confirm which was intended.
    extra_css       =   '''
                        h1{font-family:"Georgia","Times New Roman",Times,serif;font-size:large;}
                        .headline{font-family:"Georgia","Times New Roman",Times,serif;font-size:large;color:#990000;}
                        .summary_gal{color:#777777;font-family:"Georgia","Times New Roman",Times,serif;font-size:x-small;}
                        #capt{color:#1B1B1B;font-family:"Georgia","Times New Roman",Times,serif;font-size:x-small;}
                        #content{font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;}
                        .box.article.important{font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;}
                        #h2 {font-size: 24px; line-height: 25px; margin-bottom: 14px; text-transform:uppercase;}
                        .lead {font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;font-weight: bold; margin: 10px 0;font-size:small;}
                        p {margin: 0 0 10px 0;}
                        h3{font-size:small;font-weight:bold;}
                        .description{font-size:x-small;font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;color:white; }
                        a {color:#1B1B1B; font-size:small;}
                        .linkbox{font-size:x-small;color:#1B1B1B;font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;}

                        h2{font-size:small;font-weight:bold;font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;}
                        p.clear{clear:both;}
                        .heading{font-size:x-small;}
                        .heading strong{color:#940026;}
                        .box dd { clear:both; }
                        .box dl { position:relative; }
                        dl.caption {float:left;overflow:hidden;position:relative;margin: 0 10px 12px -40px;}
                        .caption dd p,
                        .caption dt img { margin-right: 0;margin-bottom: 0;}
                        .caption dt img {float: left;}
                        .caption dd {width: 100%;bottom: -1px;position: absolute;}
                        .caption dd .description {z-index: 2;margin-left: 0px;padding: 3px 4px;position: relative;}
                        '''

    # (section title, RSS feed URL) pairs, one per site section and
    # sub-section. Sections that end up without articles are dropped because
    # remove_empty_feeds is set.
    feeds = [
        (u'Actualité', u'http://letemps.ch/rss/site/'),
        (u'Actualité - Monde',
         u'http://letemps.ch/rss/site/actualite/monde'),
        (u'Actualité - Suisse & régions',
         u'http://letemps.ch/rss/site/actualite/suisse_regions'),
        (u'Actualité - Sport',
         u'http://letemps.ch/rss/site/actualite/sports'),
        (u'Actualité - Sciences & Environnement',
         u'http://letemps.ch/rss/site/actualite/sciences_environnement'),
        (u'Actualité - Multimédia',
         u'http://letemps.ch/rss/site/actualite/multimedia'),
        (u'Actualité - Société',
         u'http://letemps.ch/rss/site/actualite/societe'),
        (u'Actualité - Société | Quoi de neuf',
         u'http://letemps.ch/rss/site/actualite/societe/quoi_de_neuf'),
        (u'Economie & Finance',
         u'http://letemps.ch/rss/site/economie_finance'),
        (u'Economie & Finance - Finance',
         u'http://letemps.ch/rss/site/economie_finance/finance'),
        (u'Economie & Finance - Fonds de placement',
         u'http://letemps.ch/rss/site/economie_finance/fonds_placement'),
        (u'Economie & Finance - Carrières',
         u'http://letemps.ch/rss/site/economie_finance/carrieres'),
        (u'Culture', u'http://letemps.ch/rss/site/culture'),
        (u'Culture - Cinémas',
         u'http://letemps.ch/rss/site/culture/cinema'),
        (u'Culture - Musiques',
         u'http://letemps.ch/rss/site/culture/musiques'),
        (u'Culture - Scènes',
         u'http://letemps.ch/rss/site/culture/scenes'),
        (u'Culture - Arts plastiques',
         u'http://letemps.ch/rss/site/culture/arts_plastiques'),
        (u'Culture - Livres',
         u'http://letemps.ch/rss/site/culture/livres'),
        (u'Lifestyle - Luxe',
         u'http://letemps.ch/rss/site/lifestyle/luxe'),
        (u'Lifestyle - Mode',
         u'http://letemps.ch/rss/site/lifestyle/mode'),
        (u'Lifestyle - Horlogerie & Joaillerie',
         u'http://letemps.ch/rss/site/lifestyle/horlogerie_joaillerie'),
        (u'Lifestyle - Design',
         u'http://letemps.ch/rss/site/lifestyle/design'),
        (u'Lifestyle - Voyages',
         u'http://letemps.ch/rss/site/lifestyle/voyages'),
        (u'Lifestyle - Gastronomie',
         u'http://letemps.ch/rss/site/lifestyle/gastronomie'),
        (u'Lifestyle - Architecture & Immobilier',
         u'http://letemps.ch/rss/site/lifestyle/architecture_immobilier'),
        (u'Lifestyle - Automobile',
         u'http://letemps.ch/rss/site/lifestyle/automobile'),
        (u'Opinions', u'http://letemps.ch/rss/site/opinions'),
        (u'Opinions - Editoriaux',
         u'http://letemps.ch/rss/site/opinions/editoriaux'),
        (u'Opinions - Invités',
         u'http://letemps.ch/rss/site/opinions/invites'),
        (u'Opinions - Chroniques',
         u'http://letemps.ch/rss/site/opinions/chroniques'),
        (u'Opinions - Chappatte',
         u'http://letemps.ch/rss/site/opinions/chappatte')
    ]

    def parse_feeds(self):
        '''
        Parse the feeds as the base class does, then discard each feed's
        description.

        For "Le Temps" the feed title says it all and the description
        contains bad characters, so the attribute is deleted outright.

        :return: The feeds returned by ``BasicNewsRecipe.parse_feeds``, each
                 without its ``description`` attribute.
        '''
        parsed = BasicNewsRecipe.parse_feeds(self)
        for section in parsed:
            delattr(section, 'description')
        return parsed

    def postprocess_html(self, soup, first):
        for tag in soup.findAll('div', attrs={'class': 'box pagination'}):
            tag.extract()
        if not first:
            h = soup.find('h1')
            if h is not None:
                h.extract()
        return soup