__license__ = 'GPL v3'
__copyright__ = '2012'
'''
lemonde.fr
'''
from calibre.web.feeds.recipes import BasicNewsRecipe


def classes(classes):
    """Build a BeautifulSoup keyword filter matching any of several classes.

    ``classes`` is a space-separated string of CSS class names. The returned
    dict has a single ``attrs`` entry whose ``class`` matcher is truthy when
    the tag's class attribute shares at least one name with ``classes``.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class LeMonde(BasicNewsRecipe):
    """Recipe that builds an e-book from the lemonde.fr RSS feeds."""

    title = 'Le Monde'
    __author__ = 'veezh'
    description = u'Actualités'
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    publisher = 'lemonde.fr'
    category = 'news, France, world'
    language = 'fr'

    # Adjacent literals are concatenated at compile time; the pieces below
    # join into exactly the same single-line stylesheet as before.
    extra_css = (
        ' img{max-width:100%} '
        'h1{font-size:1.2em !important; line-height:1.2em !important; } '
        'h2{font-size:1em !important; line-height:1em !important; } '
        'h3{font-size:1em !important; text-transform:uppercase !important;'
        ' color:#666;} '
        '#photo{text-align:center !important; margin:10px 0 -8px;} '
        '#lgd{font-size:1em !important; line-height:1em !important;'
        ' font-style:italic; color:#333;} '
    )

    keep_only_tags = [
        dict(itemprop=['Headline', 'description']),
        classes('bloc_signature'),
        dict(itemprop=['articleBody']),
    ]

    remove_empty_feeds = True

    # URL fragments identifying content that get_article_url() rejects.
    _skipped_url_fragments = (
        '/chat/', '.blog', '/video/', '/sport/', '/portfolio/', '/visuel/',
    )

    def preprocess_html(self, soup):
        """Clean up the parsed article before conversion.

        Removes the last child of every element whose id is ``lgd`` and
        copies each image's ``data-src`` attribute into ``src``.
        Returns the same (mutated) ``soup``.
        """
        for lgd in soup.findAll(id="lgd"):
            # An empty element has nothing to strip; indexing [-1] on it
            # would raise IndexError and abort the whole article.
            if lgd.contents:
                lgd.contents[-1].extract()
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    def get_article_url(self, article):
        """Return the article's ``guid`` as its URL, or ``None`` to reject it.

        ``None`` is returned when the feed entry has no ``guid`` or when the
        guid contains one of ``_skipped_url_fragments``.
        """
        url = article.get('guid', None)
        if not url:
            # Without this guard the substring tests below raise TypeError
            # on entries that carry no guid.
            return None
        if any(part in url for part in self._skipped_url_fragments):
            return None
        return url

    feeds = [
        ('A la une', 'http://www.lemonde.fr/rss/une.xml'),
        ('International', 'http://www.lemonde.fr/rss/tag/international.xml'),
        ('Europe', 'http://www.lemonde.fr/rss/tag/europe.xml'),
        (u'Société', 'http://www.lemonde.fr/rss/tag/societe.xml'),
        ('Economie', 'http://www.lemonde.fr/rss/tag/economie.xml'),
        (u'Médias', 'http://www.lemonde.fr/rss/tag/actualite-medias.xml'),
        (u'Planète', 'http://www.lemonde.fr/rss/tag/planete.xml'),
        ('Culture', 'http://www.lemonde.fr/rss/tag/culture.xml'),
        ('Technologies', 'http://www.lemonde.fr/rss/tag/technologies.xml'),
        ('Livres', 'http://www.lemonde.fr/rss/tag/livres.xml'),
    ]

    def get_cover_url(self):
        """Return the cover image URL found on the lemonde.fr PDF page.

        Looks for the first ``<img>`` inside the ``div`` with class
        ``pg-gch``. Returns ``None`` when that div, its image, or the
        image's ``src`` attribute is missing.
        """
        cover_url = None
        soup = self.index_to_soup(
            'http://www.lemonde.fr/web/monde_pdf/0,33-0,1-0,0.html')
        link_item = soup.find('div', attrs={'class': 'pg-gch'})
        if link_item and link_item.img:
            # .get() yields None for an <img> without src instead of
            # raising KeyError.
            cover_url = link_item.img.get('src')
        return cover_url