calibre/recipes/le_monde_sub.recipe

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
__author__ = 'S. Durand <sylvaindurand@users.noreply.github.com>'
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Return a tag matcher selecting elements by CSS class.

    ``classes`` is a space-separated string of class names.  The result is
    a dict of the form ``{'attrs': {'class': predicate}}`` where the
    predicate receives a tag's ``class`` attribute value and yields a
    truthy result when it shares at least one name with ``classes``.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # A missing or empty class attribute is handed back unchanged so
        # the result stays falsy, exactly like ``value and ...`` would.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}


class LeMonde(BasicNewsRecipe):
    """Recipe for the subscriber edition of Le Monde.

    Logs in with the user's credentials, then builds the index by
    scraping the landing page of every section listed in ``lm_sections``.
    """

    title = u'Le Monde: Édition abonnés'
    __author__ = 'Sylvain Durand'
    description = u'La version papier du quotidien Le Monde, disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
    language = 'fr'
    encoding = 'utf8'

    needs_subscription = True

    extra_css = '''
                img{max-width:100%}
                h1{font-size:1.2em !important; line-height:1.2em !important; }
                h2{font-size:1em !important; line-height:1em !important; }
                h3{font-size:1em !important; text-transform:uppercase !important; color:#666;}
                #photo{text-align:center !important; margin:10px 0 -8px;}
                #lgd{font-size:1em !important; line-height:1em !important;  font-style:italic; color:#333;} '''

    # Only the headline/description, the signature block and the article
    # body are kept from each downloaded article page.
    keep_only_tags = [
        dict(itemprop=['Headline', 'description']),
        classes('bloc_signature'),
        dict(itemprop=['articleBody']),
    ]

    # Each entry is '<url slug>:<section title>'; parse_index() splits on
    # the first ':'.
    lm_sections = [
        'international:International',
        'politique:Politique',
        'societe:Société',
        'economie:Éco',
        'culture:Culture',
        'idees:Idées',
        'planete:Planète',
        'sport:Sport',
        'sciences:Sciences',
        'pixels:Pixels',
        'campus:Campus'
    ]

    def get_browser(self):
        """Return a browser that has submitted the Le Monde login form.

        The form named ``connection`` on the login page is filled with
        ``self.username`` and ``self.password`` and submitted.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('https://secure.lemonde.fr/sfuser/connexion')
        br.select_form(name='connection')
        br['connection[mail]'] = self.username
        br['connection[password]'] = self.password
        br.submit()
        return br

    def preprocess_html(self, soup):
        """Clean up an article page before conversion and return ``soup``.

        Drops the last child of every element with id ``lgd`` and copies
        each image's ``data-src`` attribute into ``src``.
        """
        for lgd in soup.findAll(id="lgd"):
            # An empty element has no last child; indexing it would raise
            # IndexError and abort processing of the whole article.
            if lgd.contents:
                lgd.contents[-1].extract()
        # NOTE(review): data-src presumably holds the real image URL for
        # lazily loaded images - confirm against the live markup.
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    def parse_index(self):
        """Return the index as a list of ``(section_title, articles)``.

        Sections for which no article was found are left out.
        """
        ans = []
        for x in self.lm_sections:
            # 'slug:Title' -> ('slug', 'Title'); [::2] skips the separator.
            s, section_title = x.partition(':')[::2]
            self.log('Processing section', section_title, '...')
            articles = list(self.parse_section('http://www.lemonde.fr/%s/' % s))
            if articles:
                ans.append((section_title, articles))
        return ans

    def parse_section(self, url):
        """Yield one article dict per ``<article>`` found at ``url``.

        Each dict has the keys ``title``, ``url`` and ``description``.
        Nothing is yielded when the page lacks the expected container, so
        a single changed or broken section page does not abort the
        download of the remaining sections.
        """
        soup = self.index_to_soup(url)
        container = soup.find(attrs={'class':lambda x: x and 'grid_12 alpha' in x})
        if container is None:
            self.log('\tNo article container found at', url)
            return
        for article in container.findAll('article'):
            # The heading is an <h2>, falling back to an <h3>.
            heading = article.find('h2')
            if heading is None:
                heading = article.find('h3')
                if heading is None:
                    continue
            # The link is either inside the heading or wrapped around it.
            a = heading.find('a', href=True)
            if a is None:
                parents = heading.findParents('a', href=True)
                if not parents:
                    continue
                a = parents[0]
            article_url = a['href']
            if article_url.startswith('/'):
                # Site-relative link: make it absolute.
                article_url = 'http://www.lemonde.fr' + article_url
            title = self.tag_to_string(a)
            desc = ''
            p = article.find('p')
            if p is not None:
                desc = self.tag_to_string(p)
            self.log('\tFound article', title, 'at', article_url)
            yield {'title': title, 'url': article_url, 'description': desc}