# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai

__author__ = 'S. Durand'
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    # Build a BeautifulSoup attrs matcher that accepts a tag carrying
    # any of the given space-separated CSS classes.
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class LeMonde(BasicNewsRecipe):

    title = u'Le Monde: Édition abonnés'
    __author__ = 'Sylvain Durand'
    description = u'La version papier du quotidien Le Monde, disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
    language = 'fr'
    encoding = 'utf8'

    needs_subscription = True

    extra_css = '''
        img{max-width:100%}
        h1{font-size:1.2em !important; line-height:1.2em !important;}
        h2{font-size:1em !important; line-height:1em !important;}
        h3{font-size:1em !important; text-transform:uppercase !important; color:#666;}
        #photo{text-align:center !important; margin:10px 0 -8px;}
        #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;}
        '''

    keep_only_tags = [
        dict(itemprop=['Headline', 'description']),
        classes('bloc_signature'),
        dict(itemprop=['articleBody']),
    ]

    # Each entry is 'url-slug:Section title'; the slug is appended to
    # http://www.lemonde.fr/ to get the section front page.
    lm_sections = [
        'international:International',
        'politique:Politique',
        'societe:Société',
        'economie:Éco',
        'culture:Culture',
        'idees:Idées',
        'planete:Planète',
        'sport:Sport',
        'sciences:Sciences',
        'pixels:Pixels',
        'campus:Campus'
    ]

    def get_browser(self):
        # Log in to the subscriber area before any article is fetched.
        br = BasicNewsRecipe.get_browser(self)
        br.open('https://secure.lemonde.fr/sfuser/connexion')
        br.select_form(name='connection')
        br['connection[mail]'] = self.username
        br['connection[password]'] = self.password
        br.submit()
        return br

    def preprocess_html(self, soup):
        # Drop the trailing node of each photo caption (#lgd) block.
        for lgd in soup.findAll(id='lgd'):
            lgd.contents[-1].extract()
        # Resolve lazy-loaded images so their real source gets downloaded.
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    def parse_index(self):
        ans = []
        for x in self.lm_sections:
            s, section_title = x.partition(':')[::2]
            self.log('Processing section', section_title, '...')
            articles = list(self.parse_section('http://www.lemonde.fr/%s/' % s))
            if articles:
                ans.append((section_title, articles))
        return ans

    def parse_section(self, url):
        # Yield one {'title', 'url', 'description'} dict per article found
        # on the section front page.
        soup = self.index_to_soup(url)
        container = soup.find(attrs={'class': lambda x: x and 'grid_12 alpha' in x})
        for article in container.findAll('article'):
            h2 = article.find('h2')
            if h2 is None:
                h2 = article.find('h3')
            if h2 is None:
                continue
            a = h2.find('a', href=True)
            if a is None:
                # Some headlines are wrapped in a link rather than containing one.
                a = h2.findParents('a', href=True)
                if not a:
                    continue
                a = a[0]
            url = a['href']
            if url.startswith('/'):
                url = 'http://www.lemonde.fr' + url
            title = self.tag_to_string(a)
            desc = ''
            p = article.find('p')
            if p is not None:
                desc = self.tag_to_string(p)
            self.log('\tFound article', title, 'at', url)
            yield {'title': title, 'url': url, 'description': desc}