calibre/recipes/le_monde_sub.recipe
2017-12-05 15:18:57 +05:30

103 lines
3.5 KiB
Plaintext

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
__author__ = 'S. Durand <sylvaindurand@users.noreply.github.com>'
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
    """Build keyword arguments that match tags by CSS class name.

    ``classes`` is a whitespace-separated string of class names.  The
    returned dict has a single ``attrs`` key whose ``class`` entry is a
    predicate: given a tag's ``class`` attribute value it is truthy when the
    tag carries at least one of the requested class names, and falsy
    otherwise (including when the attribute is missing or empty).
    """
    # split() without an argument tolerates tabs and repeated spaces, so the
    # wanted set never contains empty or whitespace-polluted tokens.
    q = frozenset(classes.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
class LeMonde(BasicNewsRecipe):
    """Recipe for the subscriber edition of Le Monde.

    Logs in with the subscriber's credentials, then walks the section pages
    listed in ``lm_sections`` and collects the articles linked from each.
    """

    title = u'Le Monde: Édition abonnés'
    __author__ = 'Sylvain Durand'
    description = u'La version papier du quotidien Le Monde, disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
    language = 'fr'
    encoding = 'utf8'
    needs_subscription = True
    # The stylesheet text is kept flush-left so the string value is unchanged.
    extra_css = '''
img{max-width:100%}
h1{font-size:1.2em !important; line-height:1.2em !important; }
h2{font-size:1em !important; line-height:1em !important; }
h3{font-size:1em !important; text-transform:uppercase !important; color:#666;}
#photo{text-align:center !important; margin:10px 0 -8px;}
#lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;} '''
    keep_only_tags = [
        dict(itemprop=['Headline', 'description']),
        classes('bloc_signature'),
        dict(itemprop=['articleBody']),
    ]
    # Each entry is '<url slug>:<section title>'; parse_index() splits it on
    # the first colon and builds the section URL from the slug.
    lm_sections = [
        'international:International',
        'politique:Politique',
        'societe:Société',
        'economie:Éco',
        'culture:Culture',
        'idees:Idées',
        'planete:Planète',
        'sport:Sport',
        'sciences:Sciences',
        'pixels:Pixels',
        'campus:Campus'
    ]

    def get_browser(self):
        """Return a browser that has submitted the Le Monde login form.

        The form named ``connection`` on the login page is filled with
        ``self.username`` and ``self.password`` and submitted before the
        browser is handed back.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('https://secure.lemonde.fr/sfuser/connexion')
        br.select_form(name='connection')
        br['connection[mail]'] = self.username
        br['connection[password]'] = self.password
        br.submit()
        return br

    def preprocess_html(self, soup):
        """Clean up an article page and return the same ``soup``.

        Removes the last child node of every element with id ``lgd`` and
        copies ``data-src`` into ``src`` for images that carry a
        ``data-src`` attribute.
        """
        for lgd in soup.findAll(id="lgd"):
            # An empty element has no last child; indexing it would raise
            # IndexError and abort processing of the whole article.
            if lgd.contents:
                lgd.contents[-1].extract()
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    def parse_index(self):
        """Return a list of ``(section_title, articles)`` tuples.

        One section page is fetched per ``lm_sections`` entry; sections that
        yield no articles are left out of the result.
        """
        ans = []
        for x in self.lm_sections:
            # partition() gives (slug, ':', title); [::2] drops the separator.
            s, section_title = x.partition(':')[::2]
            self.log('Processing section', section_title, '...')
            articles = list(self.parse_section('http://www.lemonde.fr/%s/' % s))
            if articles:
                ans.append((section_title, articles))
        return ans

    def parse_section(self, url):
        """Yield one dict per article found on the section page at ``url``.

        Each dict has the keys ``title``, ``url`` and ``description``.
        Articles without an ``h2``/``h3`` heading or without a link around or
        inside that heading are skipped.  If the page does not contain the
        expected article container, nothing is yielded.
        """
        soup = self.index_to_soup(url)
        container = soup.find(attrs={'class':lambda x: x and 'grid_12 alpha' in x})
        if container is None:
            # Without this guard a layout change on one section page would
            # raise AttributeError and abort the entire download.
            self.log.warn('\tNo article container found at', url)
            return
        for article in container.findAll('article'):
            h2 = article.find('h2')
            if h2 is None:
                h2 = article.find('h3')
                if h2 is None:
                    continue
            a = h2.find('a', href=True)
            if a is None:
                # The heading may itself be wrapped in the link.
                a = h2.findParents('a', href=True)
                if not a:
                    continue
                a = a[0]
            article_url = a['href']
            if article_url.startswith('//'):
                # Protocol-relative link: only the scheme is missing.
                article_url = 'http:' + article_url
            elif article_url.startswith('/'):
                article_url = 'http://www.lemonde.fr' + article_url
            title = self.tag_to_string(a)
            desc = ''
            p = article.find('p')
            if p is not None:
                desc = self.tag_to_string(p)
            self.log('\tFound article', title, 'at', article_url)
            yield {'title': title, 'url': article_url, 'description': desc}