#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'

# -------------------------------
# Modified by Roland Kessi - February 2014
# -------------------------------

import re

from calibre.web.feeds.news import BasicNewsRecipe


class LeTemps(BasicNewsRecipe):
    '''
    Recipe that downloads the RSS sections of letemps.ch.

    The recipe logs in with the user's subscription credentials, rewrites
    feed links that were routed through rss.feedsportal.com back into direct
    letemps.ch URLs, and strips navigation/advertising markup from the
    downloaded article pages.
    '''

    title = u'Le Temps'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'Kovid Goyal'
    description = 'French news. Needs a subscription from http://www.letemps.ch'
    no_stylesheets = True
    remove_javascript = True
    # Follow links one level deep; only links matching ``match_regexps``
    # are followed.
    recursions = 1
    encoding = 'UTF-8'
    match_regexps = [r'http://www.letemps.ch/Page/Uuid/[-0-9a-f]+\|[1-9]']
    language = 'fr'
    needs_subscription = True
    simultaneous_downloads = 5
    use_embedded_content = False
    remove_empty_feeds = True

    # Escape sequences found in feedsportal-encoded links, mapped to the
    # literal character each one is decoded to.
    _FEEDSPORTAL_ESCAPES = {'0A': '0', '0B': '.', '0C': '/', '0E': '-'}
    _FEEDSPORTAL_ESCAPE_RE = re.compile(r'0[ABCE]')

    def get_browser(self):
        '''
        Return a browser that has been logged in to letemps.ch.

        The second form (``nr=1``) of the login page is filled with
        ``self.username`` and ``self.password`` and submitted.

        :raises ValueError: if the page returned after submitting the form
            still contains the ``>Login`` marker.
        '''
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.letemps.ch/login')
        br.select_form(nr=1)
        br['username'] = self.username
        br['password'] = self.password
        raw = br.submit().read()
        # The response body is a byte string; a bytes literal keeps the
        # membership test valid on Python 3 (on Python 2 it is a plain str).
        if b'>Login' in raw:
            raise ValueError('Failed to login to letemps.ch. Check '
                             'your username and password')
        return br

    def get_article_url(self, article):
        '''
        Return the URL that points to the content of `article`.

        `article` is an object representing a parsed article from a feed.
        Lookup order:

        1. Any ``*_origlink`` entry holding an absolute http(s) URL (the
           original link of an article syndicated through a third-party
           feed service).
        2. The ``link`` entry, or failing that the first item of
           ``article.links`` whose ``rel`` is ``alternate``.

        A link that embeds a feedsportal-encoded letemps.ch address is
        rewritten into a direct ``http://www.letemps.ch/...`` URL, to avoid
        going through http://rss.feedsportal.com/...

        :return: the article URL, or ``None`` when the article has no link.
        '''
        for key in article.keys():
            if key.endswith('_origlink'):
                url = article[key]
                if url and url.startswith(('http://', 'https://')):
                    self.log.debug('Url is :', url)
                    return url

        ans = article.get('link', None)
        if not ans and getattr(article, 'links', None):
            for item in article.links:
                if item.get('rel', 'alternate') == 'alternate':
                    ans = item['href']
                    break
        if not ans:
            # Nothing to rewrite; calling ``.find`` on None would raise.
            return ans

        # 'letemps0Bch' is the encoded form of 'letemps.ch'.
        pos = ans.find('letemps0Bch')
        if pos > -1:
            escapes = self._FEEDSPORTAL_ESCAPES
            # Decode in a single pass so that a '0' produced by one escape
            # cannot combine with the following letter into a bogus escape.
            ans = 'http://www.' + self._FEEDSPORTAL_ESCAPE_RE.sub(
                lambda match: escapes[match.group(0)], ans[pos:])
        return ans

    keep_only_tags = [
        dict(name='div', attrs={'id': 'content'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'id': 'html5_gallery'}),
        dict(name='ul', attrs={'class': ['tabs', 'social-buttons cf']}),
        dict(name='img', attrs={'class': ['bigImg']}),
        dict(name='div', attrs={'class': [
            'box function',
            'contentInserts',
            'box banner',
            'related-stories',
            'box additional',
            'galleryOverview',
            'position',
            'rightAd',
            'bottomAd',
            'video',
        ]}),
    ]

    extra_css = '''
        h1{font-family:"Georgia","Times New Roman",Times,serif;font-size:large;}
        .headline{font-family:"Georgia","Times New Roman",Times,serif;font-size:large;color:#990000;}
        .summary_gal{color:#777777;font-family:"Georgia","Times New Roman",Times,serif;font-size:x-small;}
        #capt{color:#1B1B1B;font-family:"Georgia","Times New Roman",Times,serif;font-size:x-small;}
        #content{font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;}
        .box.article.important{font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;}
        #h2 {font-size: 24px; line-height: 25px; margin-bottom: 14px; text-transform:uppercase;}
        .lead {font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;font-weight: bold; margin: 10px 0;font-size:small;}
        p {margin: 0 0 10px 0;}
        h3{font-size:small;font-weight:bold;}
        .description{font-size:x-small;font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;color:white;
        }
        a {color:#1B1B1B; font-size:small;}
        .linkbox{font-size:x-small;color:#1B1B1B;font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;}
        h2{font-size:small;font-weight:bold;font-family:"Lucida Grande","Lucida Sans Unicode",Arial,Verdana,sans-serif;}
        p.clear{clear:both;}
        .heading{font-size:x-small;}
        .heading strong{color:#940026;}
        .box dd { clear:both; }
        .box dl { position:relative; }
        dl.caption {float:left;overflow:hidden;position:relative;margin: 0 10px 12px -40px;}
        .caption dd p, .caption dt img { margin-right: 0;margin-bottom: 0;}
        .caption dt img {float: left;}
        .caption dd {width: 100%;bottom: -1px;position: absolute;}
        .caption dd .description {z-index: 2;margin-left: 0px;padding: 3px 4px;position: relative;}
    '''

    feeds = [
        (u'Actualité', u'http://letemps.ch/rss/site/'),
        (u'Actualité - Monde', u'http://letemps.ch/rss/site/actualite/monde'),
        (u'Actualité - Suisse & régions', u'http://letemps.ch/rss/site/actualite/suisse_regions'),
        (u'Actualité - Sport', u'http://letemps.ch/rss/site/actualite/sports'),
        (u'Actualité - Sciences & Environnement', u'http://letemps.ch/rss/site/actualite/sciences_environnement'),
        (u'Actualité - Multimédia', u'http://letemps.ch/rss/site/actualite/multimedia'),
        (u'Actualité - Société', u'http://letemps.ch/rss/site/actualite/societe'),
        (u'Actualité - Société | Quoi de neuf', u'http://letemps.ch/rss/site/actualite/societe/quoi_de_neuf'),
        (u'Economie & Finance', u'http://letemps.ch/rss/site/economie_finance'),
        (u'Economie & Finance - Finance', u'http://letemps.ch/rss/site/economie_finance/finance'),
        (u'Economie & Finance - Fonds de placement', u'http://letemps.ch/rss/site/economie_finance/fonds_placement'),
        (u'Economie & Finance - Carrières', u'http://letemps.ch/rss/site/economie_finance/carrieres'),
        (u'Culture', u'http://letemps.ch/rss/site/culture'),
        (u'Culture - Cinémas', u'http://letemps.ch/rss/site/culture/cinema'),
        (u'Culture - Musiques', u'http://letemps.ch/rss/site/culture/musiques'),
        (u'Culture - Scènes', u'http://letemps.ch/rss/site/culture/scenes'),
        (u'Culture - Arts plastiques', u'http://letemps.ch/rss/site/culture/arts_plastiques'),
        (u'Culture - Livres', u'http://letemps.ch/rss/site/culture/livres'),
        (u'Lifestyle - Luxe', u'http://letemps.ch/rss/site/lifestyle/luxe'),
        (u'Lifestyle - Mode', u'http://letemps.ch/rss/site/lifestyle/mode'),
        (u'Lifestyle - Horlogerie & Joaillerie', u'http://letemps.ch/rss/site/lifestyle/horlogerie_joaillerie'),
        (u'Lifestyle - Design', u'http://letemps.ch/rss/site/lifestyle/design'),
        (u'Lifestyle - Voyages', u'http://letemps.ch/rss/site/lifestyle/voyages'),
        (u'Lifestyle - Gastronomie', u'http://letemps.ch/rss/site/lifestyle/gastronomie'),
        (u'Lifestyle - Architecture & Immobilier', u'http://letemps.ch/rss/site/lifestyle/architecture_immobilier'),
        (u'Lifestyle - Automobile', u'http://letemps.ch/rss/site/lifestyle/automobile'),
        (u'Opinions', u'http://letemps.ch/rss/site/opinions'),
        (u'Opinions - Editoriaux', u'http://letemps.ch/rss/site/opinions/editoriaux'),
        (u'Opinions - Invités', u'http://letemps.ch/rss/site/opinions/invites'),
        (u'Opinions - Chroniques', u'http://letemps.ch/rss/site/opinions/chroniques'),
        (u'Opinions - Chappatte', u'http://letemps.ch/rss/site/opinions/chappatte')
    ]

    def parse_feeds(self):
        '''
        Parse the feeds with the base implementation, then drop every
        feed's description.

        :return: the list of feeds produced by
            ``BasicNewsRecipe.parse_feeds``, each without its
            ``description`` attribute.
        '''
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            # The title says it all and the description has bad characters
            # for "Le Temps"
            del feed.description
        return feeds

    def postprocess_html(self, soup, first):
        '''
        Clean up a downloaded page before it is added to the book.

        :param soup: parsed HTML of the downloaded page.
        :param first: true when this is the first page of an article.
        :return: the same `soup`, with every ``div.box.pagination`` removed
            and, on pages other than the first, the first ``h1`` removed so
            that the headline is not repeated.
        '''
        for tag in soup.findAll('div', attrs={'class': 'box pagination'}):
            tag.extract()
        if not first:
            h = soup.find('h1')
            if h is not None:
                h.extract()
        return soup