diff --git a/recipes/icons/mediapart.png b/recipes/icons/mediapart.png new file mode 100644 index 0000000000..ab489d3db7 Binary files /dev/null and b/recipes/icons/mediapart.png differ diff --git a/recipes/icons/rue89.png b/recipes/icons/rue89.png new file mode 100644 index 0000000000..55c52bc488 Binary files /dev/null and b/recipes/icons/rue89.png differ diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index a5bc4e96f9..0c9bbb4b01 100644 --- a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -1,11 +1,13 @@ __license__ = 'GPL v3' -__copyright__ = '2009, Mathieu Godlewski ; 2010, 2011, Louis Gesbert ' +__copyright__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' ''' Mediapart ''' +__author__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' + import re -from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.web.feeds.news import BasicNewsRecipe class Mediapart(BasicNewsRecipe): @@ -15,8 +17,9 @@ class Mediapart(BasicNewsRecipe): oldest_article = 7 language = 'fr' needs_subscription = True - max_articles_per_feed = 50 + + use_embedded_content = False no_stylesheets = True cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg' @@ -27,14 +30,9 @@ class Mediapart(BasicNewsRecipe): # -- print-version - preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in - [ - (r'', lambda match : '

'+match.group(1)+'

'), - (r'\'', lambda match: '’') - ] - ] + conversion_options = { 'smarten_punctuation' : True } - remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ] + remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ] def print_version(self, url): raw = self.browser.open(url).read() @@ -55,3 +53,11 @@ class Mediapart(BasicNewsRecipe): br['pass'] = self.password br.submit() return br + + def preprocess_html(self, soup): + for title in soup.findAll('p', {'class':'titre_page'}): + title.name = 'h3' + for legend in soup.findAll('span', {'class':'legend'}): + legend.insert(0, Tag(soup, 'br', [])) + legend.name = 'small' + return soup diff --git a/recipes/rue89.recipe b/recipes/rue89.recipe index 51cf8f6b98..c49712dc32 100644 --- a/recipes/rue89.recipe +++ b/recipes/rue89.recipe @@ -1,10 +1,10 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Louis Gesbert ' +__copyright__ = '2010-2012, Louis Gesbert ' ''' Rue89 ''' -__author__ = '2010, Louis Gesbert ' +__author__ = '2010-2012, Louis Gesbert ' import re from calibre.ebooks.BeautifulSoup import Tag @@ -17,37 +17,45 @@ class Rue89(BasicNewsRecipe): title = u'Rue89' language = 'fr' oldest_article = 7 - max_articles_per_feed = 50 + max_articles_per_feed = 12 - feeds = [(u'La Une', u'http://www.rue89.com/homepage/feed')] + use_embedded_content = False + + # From http://www.rue89.com/les-flux-rss-de-rue89 + feeds = [ + (u'La Une', u'http://www.rue89.com/feed'), + (u'Rue69', u'http://www.rue89.com/rue69/feed'), + (u'Eco', u'http://www.rue89.com/rue89-eco/feed'), + (u'Planète', u'http://www.rue89.com/rue89-planete/feed'), + (u'Sport', u'http://www.rue89.com/rue89-sport/feed'), + (u'Culture', u'http://www.rue89.com/culture/feed'), + (u'Hi-tech', u'http://www.rue89.com/hi-tech/feed'), + (u'Media', u'http://www.rue89.com/medias/feed'), + (u'Monde', u'http://www.rue89.com/monde/feed'), + (u'Politique', u'http://www.rue89.com/politique/feed'), + (u'Societe', u'http://www.rue89.com/societe/feed'), + ] + + # Follow redirection from feedsportal.com + def get_article_url(self,article): + return self.browser.open_novisit(article.link).geturl() + + def print_version(self, url): + return url + '?imprimer=1' no_stylesheets = True - preprocess_regexps = [ - (re.compile(r'<(/?)h2>', re.IGNORECASE|re.DOTALL), - lambda match : '<'+match.group(1)+'h3>'), - (re.compile(r'', re.IGNORECASE|re.DOTALL), - lambda match : '

'+match.group(1)+'

'), - (re.compile(r']+src="[^"]*/numeros/(\d+)[^0-9.">]*.gif"[^>]*/>', re.IGNORECASE|re.DOTALL), - lambda match : ''+match.group(1)+''), - (re.compile(r'\''), lambda match: '’'), - ] + conversion_options = { 'smarten_punctuation' : True } - def preprocess_html(self,soup): - body = Tag(soup, 'body') - title = soup.find('h1', {'class':'title'}) - content = soup.find('div', {'class':'content'}) - soup.body.replaceWith(body) - body.insert(0, title) - body.insert(1, content) - return soup + keep_only_tags = [ + dict(name='div', attrs={'id':'article'}), + ] - remove_tags = [ #dict(name='div', attrs={'class':'print-source_url'}), - #dict(name='div', attrs={'class':'print-links'}), - #dict(name='img', attrs={'class':'print-logo'}), - dict(name='div', attrs={'class':'content_top'}), - dict(name='div', attrs={'id':'sidebar-left'}), ] + remove_tags_after = [ + dict(name='div', attrs={'id':'plus_loin'}), + ] -# -- print-version has poor quality on this website, better do the conversion ourselves -# def print_version(self, url): -# return re.sub('^.*-([0-9]+)$', 'http://www.rue89.com/print/\\1',url) + remove_tags = [ + dict(name='div', attrs={'id':'article_tools'}), + dict(name='div', attrs={'id':'plus_loin'}), + ]