diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index 8277338e18..7ae997f90d 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -31,7 +31,7 @@ recipe_modules = ['recipe_' + r for r in ( 'pobjeda', 'chicago_breaking_news', 'glasgow_herald', 'linuxdevices', 'hindu', 'cincinnati_enquirer', 'physics_world', 'pressonline', 'la_republica', 'physics_today', 'chicago_tribune', 'e_novine', - 'al_jazeera', 'winsupersite', 'borba', + 'al_jazeera', 'winsupersite', 'borba', 'courrierinternational', )] import re, imp, inspect, time, os diff --git a/src/calibre/web/feeds/recipes/recipe_courrierinternational.py b/src/calibre/web/feeds/recipes/recipe_courrierinternational.py new file mode 100644 index 0000000000..153896d4e0 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_courrierinternational.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2009, Mathieu Godlewski ' +''' +Courrier International +''' + +import re +from datetime import date +from calibre.web.feeds.news import BasicNewsRecipe + +class CourrierInternational(BasicNewsRecipe): + title = 'Courrier International' + __author__ = 'Mathieu Godlewski ' + description = 'Global news in french from international newspapers' + oldest_article = 7 + language = _('French') + max_articles_per_feed = 50 + no_stylesheets = True + + html2lrf_options = ['--base-font-size', '10'] + + feeds = [ + # Some articles requiring subscription fails on download. + ('A la Une', 'http://www.courrierinternational.com/rss/rss_a_la_une.xml'), + ] + + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in + [ + #Handle Depeches + (r'.*]*>([0-9][0-9]/.*

).*', lambda match : '
'+match.group(1)+'
'), + #Handle Articles + (r'.*]*>(Courrier international.*?) .*', lambda match : '
'+match.group(1)+''), + ] + ] + + + def print_version(self, url): + return re.sub('/[a-zA-Z]+\.asp','/imprimer.asp' ,url) + diff --git a/src/calibre/web/feeds/recipes/recipe_le_monde.py b/src/calibre/web/feeds/recipes/recipe_le_monde.py index b543650200..0fc05b3aa5 100644 --- a/src/calibre/web/feeds/recipes/recipe_le_monde.py +++ b/src/calibre/web/feeds/recipes/recipe_le_monde.py @@ -7,7 +7,7 @@ lemonde.fr ''' import re - +from datetime import date from calibre.web.feeds.news import BasicNewsRecipe @@ -15,11 +15,15 @@ class LeMonde(BasicNewsRecipe): title = 'LeMonde.fr' __author__ = 'Mathieu Godlewski ' description = 'Global news in french' - oldest_article = 7 + oldest_article = 3 language = _('French') - max_articles_per_feed = 20 + max_articles_per_feed = 30 no_stylesheets = True + cover_url='http://abonnes.lemonde.fr/titresdumonde/'+date.today().strftime("%y%m%d")+'/1.jpg' + + html2lrf_options = ['--base-font-size', '10'] + feeds = [ ('A la Une', 'http://www.lemonde.fr/rss/une.xml'), ('International', 'http://www.lemonde.fr/rss/sequence/0,2-3210,1-0,0.xml'), @@ -38,25 +42,57 @@ class LeMonde(BasicNewsRecipe): ('Examens', 'http://www.lemonde.fr/rss/sequence/0,2-3404,1-0,0.xml'), ('Opinions', 'http://www.lemonde.fr/rss/sequence/0,2-3232,1-0,0.xml') ] - + remove_tags = [dict(name='img', attrs={'src':'http://medias.lemonde.fr/mmpub/img/lgo/lemondefr_pet.gif'}), dict(name='div', attrs={'id':'xiti-logo-noscript'}), dict(name='br', attrs={}), dict(name='iframe', attrs={}), ] - + extra_css = '.ar-tit {font-size: x-large;} \n .dt {font-size: x-small;}' - filter_regexps = [r'xiti\.com'] - - preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in [ + (r'.*?.*?
.*?
).*You can start editing here.*', lambda match : ''+match.group(1)+''), (r'

 

', lambda match : ''), (r']*>
', lambda match : '
'+match.group(1).upper()), + (r']*>
', lambda match : '
"'+match.group(1).upper()), (r'(
.*
).*', lambda match : match.group(1)), ] ] - + + article_match_regexps = [ (re.compile(i)) for i in + [ + (r'http://www\.lemonde\.fr/\S+/article/.*'), + (r'http://www\.lemonde\.fr/\S+/portfolio/.*'), + (r'http://www\.lemonde\.fr/\S+/article_interactif/.*'), + (r'http://\S+\.blog\.lemonde\.fr/.*'), + ] + ] + def print_version(self, url): - return re.sub('http:.*_([0-9]+)_[0-9]+\.html.*','http://www.lemonde.fr/web/imprimer_element/0,40-0,50-\\1,0.html' ,url) + return re.sub(r'http://www\.lemonde\.fr/.*_([0-9]+)_[0-9]+\.html.*','http://www.lemonde.fr/web/imprimer_element/0,40-0,50-\\1,0.html' ,url) + # URLs already accepted by get_article_url(); used to filter duplicated articles + articles_list = [] + + def get_article_url(self, article): + url=article.get('link', None) + url=(url or '').partition("#")[0] # drop any "#fragment"; a link without one (or a missing link) is no longer cut short or crashed on + if url in self.articles_list: + self.log_debug(_('Skipping duplicated article: %s')%url) + return False + if self.is_article_wanted(url): + self.articles_list.append(url) + return url + self.log_debug(_('Skipping filtered article: %s')%url) + return False + + + def is_article_wanted(self, url): + if self.article_match_regexps: + for m in self.article_match_regexps: + if m.search(url): + return True + return False + return False