# Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00).
# File size: 103 lines, 3.5 KiB, plaintext.
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
|
__author__ = 'S. Durand <sylvaindurand@users.noreply.github.com>'
|
|
__license__ = 'GPL v3'
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class.

    ``classes`` is a whitespace-separated string of class names. The
    returned dict has a single ``attrs`` key whose ``class`` entry is a
    predicate: given an element's class attribute value, it returns the
    (truthy) set of names shared with ``classes``, or a falsy value when
    the attribute is missing/empty or nothing overlaps.
    """
    # split() with no argument tokenises the wanted names the same way the
    # predicate tokenises the attribute value (any whitespace run, no empty
    # tokens), so both sides of the intersection agree.
    wanted = frozenset(classes.split())

    def has_wanted_class(value):
        # ``value and ...`` keeps None / '' falsy without calling split().
        return value and frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': has_wanted_class})
|
|
|
|
|
|
class LeMonde(BasicNewsRecipe):
    """Recipe for the subscriber edition of Le Monde.

    Logs in on secure.lemonde.fr with the configured username/password,
    then builds the index by scraping one listing page per entry in
    ``lm_sections``.
    """

    title = u'Le Monde: Édition abonnés'
    __author__ = 'Sylvain Durand'
    description = u'La version papier du quotidien Le Monde, disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
    language = 'fr'
    encoding = 'utf8'

    # get_browser() below submits self.username / self.password.
    needs_subscription = True

    extra_css = '''
        img{max-width:100%}
        h1{font-size:1.2em !important; line-height:1.2em !important; }
        h2{font-size:1em !important; line-height:1em !important; }
        h3{font-size:1em !important; text-transform:uppercase !important; color:#666;}
        #photo{text-align:center !important; margin:10px 0 -8px;}
        #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;} '''

    # Tag specs: headline/description, the 'bloc_signature' class, and the
    # article body.
    keep_only_tags = [
        dict(itemprop=['Headline', 'description']),
        classes('bloc_signature'),
        dict(itemprop=['articleBody']),
    ]

    # Each entry is '<url slug>:<section title>'; parse_index() splits on
    # the first ':'.
    lm_sections = [
        'international:International',
        'politique:Politique',
        'societe:Société',
        'economie:Éco',
        'culture:Culture',
        'idees:Idées',
        'planete:Planète',
        'sport:Sport',
        'sciences:Sciences',
        'pixels:Pixels',
        'campus:Campus'
    ]

    def get_browser(self):
        """Return the base recipe's browser after submitting the login form."""
        br = BasicNewsRecipe.get_browser(self)
        br.open('https://secure.lemonde.fr/sfuser/connexion')
        br.select_form(name='connection')
        br['connection[mail]'] = self.username
        br['connection[password]'] = self.password
        br.submit()
        return br

    def preprocess_html(self, soup):
        """Clean up an article's soup and return it.

        Drops the last child node of every element with id ``lgd`` and
        copies each image's ``data-src`` attribute into ``src``.
        """
        for lgd in soup.findAll(id="lgd"):
            # An empty element has nothing to drop; indexing [-1] on it
            # would raise IndexError.
            if lgd.contents:
                lgd.contents[-1].extract()
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    def parse_index(self):
        """Return a list of ``(section_title, articles)`` tuples.

        One listing page is scraped per ``lm_sections`` entry; sections
        that yield no articles are left out.
        """
        ans = []
        for entry in self.lm_sections:
            slug, _, section_title = entry.partition(':')
            self.log('Processing section', section_title, '...')
            # parse_section() is a generator; materialise it so an empty
            # section can be detected.
            articles = list(self.parse_section('http://www.lemonde.fr/%s/' % slug))
            if articles:
                ans.append((section_title, articles))
        return ans

    def parse_section(self, url):
        """Yield one ``{'title', 'url', 'description'}`` dict per article.

        ``url`` is the section listing page. Articles are the ``<article>``
        elements inside the first element whose class attribute contains
        'grid_12 alpha'. Yields nothing if that container is absent.
        """
        soup = self.index_to_soup(url)
        container = soup.find(attrs={'class': lambda x: x and 'grid_12 alpha' in x})
        if container is None:
            # Without this guard a changed page layout raises
            # AttributeError here and aborts every remaining section.
            self.log('\tNo article container found at', url)
            return
        for article in container.findAll('article'):
            # The headline is an <h2>, falling back to an <h3>.
            heading = article.find('h2')
            if heading is None:
                heading = article.find('h3')
            if heading is None:
                continue
            # The link is either inside the heading or wrapped around it.
            a = heading.find('a', href=True)
            if a is None:
                parents = heading.findParents('a', href=True)
                if not parents:
                    continue
                a = parents[0]
            article_url = a['href']
            if article_url.startswith('/'):
                article_url = 'http://www.lemonde.fr' + article_url
            title = self.tag_to_string(a)
            desc = ''
            p = article.find('p')
            if p is not None:
                desc = self.tag_to_string(p)
            self.log('\tFound article', title, 'at', article_url)
            yield {'title': title, 'url': article_url, 'description': desc}
|