From b7ac30b8904ddeb0c190d3b91d3f7d78b957b02f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 30 Oct 2010 16:02:55 -0600
Subject: [PATCH] Rue89 by Louis Gesbert and update Mediapart

---
 resources/recipes/mediapart.recipe |   82 ++++++++++++++++++++----------
 resources/recipes/rue89.recipe     |   53 +++++++++++++++++++
 2 files changed, 107 insertions(+), 28 deletions(-)
 create mode 100644 resources/recipes/rue89.recipe

diff --git a/resources/recipes/mediapart.recipe b/resources/recipes/mediapart.recipe
index ca5f787747..0cf8f21032 100644
--- a/resources/recipes/mediapart.recipe
+++ b/resources/recipes/mediapart.recipe
@@ -1,53 +1,79 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Mathieu Godlewski '
+__copyright__ = '2009, Mathieu Godlewski ; 2010, Louis Gesbert '
 '''
 Mediapart
 '''

-import re, string
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import Tag
 from calibre.web.feeds.news import BasicNewsRecipe

 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
-    __author__ = 'Mathieu Godlewski '
+    __author__ = 'Mathieu Godlewski'
     description = 'Global news in french from online newspapers'
     oldest_article = 7
     language = 'fr'
+    needs_subscription = True

     max_articles_per_feed = 50
     no_stylesheets = True

-    html2lrf_options = ['--base-font-size', '10']
+    cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg'

     feeds = [
         ('Les articles', 'http://www.mediapart.fr/articles/feed'),
     ]

-    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
-        [
-            (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
-            (r'Mediapart\.fr', lambda match : ''),
-            (r']*>[\s]*', lambda match : ''),
-            (r'[^>]*', lambda match : ''),
+# -- print-version has poor quality on this website, better do the conversion ourselves
+#
+#    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
+#        [
+#            (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
+#            (r'[^>]+]*>([^<]*)[^<]*',
+#             lambda match : ''+match.group(1)+''),
+#            (r'\'', lambda match: '’'),
+#        ]
+#    ]
+#
+#    remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}),
+#                    dict(name='div', attrs={'class':'print-links'}),
+#                    dict(name='img', attrs={'src':'entete_article.png'}),
+#                    dict(name='br') ]
+#
+#    def print_version(self, url):
+#        raw = self.browser.open(url).read()
+#        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
+#        div = soup.find('div', {'id':re.compile('node-\d+')})
+#        if div is None:
+#            return None
+#        article_id = string.replace(div['id'], 'node-', '')
+#        if article_id is None:
+#            return None
+#        return 'http://www.mediapart.fr/print/'+article_id
+
+# -- Non-print version [dict(name='div', attrs={'class':'advert'})]
+
+    keep_only_tags = [
+        dict(name='h1', attrs={'class':'title'}),
+        dict(name='div', attrs={'class':'page_papier_detail'}),
         ]
-    ]

-    remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}),
-                    dict(name='div', attrs={'class':'print-links'}),
-                    dict(name='img', attrs={'src':'entete_article.png'}),
-                  ]

+    def preprocess_html(self,soup):
+        for title in soup.findAll('div', {'class':'titre'}):
+            tag = Tag(soup, 'h3')
+            title.replaceWith(tag)
+            tag.insert(0,title)
+        return soup

+# -- Handle login
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            br.open('http://www.mediapart.fr/')
+            br.select_form(nr=1)
+            br['name'] = self.username
+            br['pass'] = self.password
+            br.submit()
+        return br
-    def print_version(self, url):
-        raw = self.browser.open(url).read()
-        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
-        div = soup.find('div', {'class':'node node-type-article'})
-        if div is None:
-            return None
-        article_id = string.replace(div['id'], 'node-', '')
-        if article_id is None:
-            return None
-        return 'http://www.mediapart.fr/print/'+article_id
diff --git a/resources/recipes/rue89.recipe b/resources/recipes/rue89.recipe
new file mode 100644
index 0000000000..51cf8f6b98
--- /dev/null
+++ b/resources/recipes/rue89.recipe
@@ -0,0 +1,53 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Louis Gesbert '
+'''
+Rue89
+'''
+
+__author__ = '2010, Louis Gesbert '
+
+import re
+from calibre.ebooks.BeautifulSoup import Tag
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Rue89(BasicNewsRecipe):
+    title = 'Rue89'
+    __author__ = 'Louis Gesbert'
+    description = 'Popular free french news website'
+    title = u'Rue89'
+    language = 'fr'
+    oldest_article = 7
+    max_articles_per_feed = 50
+
+    feeds = [(u'La Une', u'http://www.rue89.com/homepage/feed')]
+
+    no_stylesheets = True
+
+    preprocess_regexps = [
+        (re.compile(r'<(/?)h2>', re.IGNORECASE|re.DOTALL),
+         lambda match : '<'+match.group(1)+'h3>'),
+        (re.compile(r'<div class="print-title">([^>]+)</div>', re.IGNORECASE|re.DOTALL),
+         lambda match : '<h2>'+match.group(1)+'</h2>'),
+        (re.compile(r'<img[^>]+src="[^"]*/numeros/(\d+)[^0-9.">]*.gif"[^>]*/>', re.IGNORECASE|re.DOTALL),
+         lambda match : ''+match.group(1)+''),
+        (re.compile(r'\''), lambda match: '’'),
+    ]
+
+    def preprocess_html(self,soup):
+        body = Tag(soup, 'body')
+        title = soup.find('h1', {'class':'title'})
+        content = soup.find('div', {'class':'content'})
+        soup.body.replaceWith(body)
+        body.insert(0, title)
+        body.insert(1, content)
+        return soup
+
+    remove_tags = [ #dict(name='div', attrs={'class':'print-source_url'}),
+                    #dict(name='div', attrs={'class':'print-links'}),
+                    #dict(name='img', attrs={'class':'print-logo'}),
+                    dict(name='div', attrs={'class':'content_top'}),
+                    dict(name='div', attrs={'id':'sidebar-left'}), ]
+
+# -- print-version has poor quality on this website, better do the conversion ourselves
+#    def print_version(self, url):
+#        return re.sub('^.*-([0-9]+)$', 'http://www.rue89.com/print/\\1',url)
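
Both recipes rely on the same preprocess_regexps mechanism: roughly, calibre applies each (compiled pattern, replacement function) pair to the downloaded page with re.sub before the HTML is parsed. A minimal standalone sketch of that step, reusing the Rue89 h2-to-h3 rule on a made-up snippet (the sample HTML is illustrative, not taken from the site):

    import re

    # Same shape as the entries in Rue89.preprocess_regexps:
    # a compiled regex paired with a replacement function.
    preprocess_regexps = [
        (re.compile(r'<(/?)h2>', re.IGNORECASE|re.DOTALL),
         lambda match: '<' + match.group(1) + 'h3>'),
    ]

    raw = "<h2>Une du jour</h2><p>Texte de l'article</p>"  # illustrative sample only
    for pattern, repl in preprocess_regexps:
        raw = pattern.sub(repl, raw)  # roughly what BasicNewsRecipe does with these pairs
    print(raw)  # -> <h3>Une du jour</h3><p>Texte de l'article</p>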