From 414db0f196635a58d55b5013e80be0b976104fc3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 25 Feb 2011 06:36:30 -0700 Subject: [PATCH] Updated Le Monde --- resources/recipes/le_monde.recipe | 69 +++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/resources/recipes/le_monde.recipe b/resources/recipes/le_monde.recipe index c14b8eeeff..3c47d81ee1 100644 --- a/resources/recipes/le_monde.recipe +++ b/resources/recipes/le_monde.recipe @@ -1,10 +1,15 @@ +__license__ = 'GPL v3' +__copyright__ = '2011' +''' +lemonde.fr +''' import re from calibre.web.feeds.recipes import BasicNewsRecipe class LeMonde(BasicNewsRecipe): title = 'Le Monde' __author__ = 'veezh' - description = u'Actualit\xe9s' + description = 'Actualités' oldest_article = 1 max_articles_per_feed = 100 no_stylesheets = True @@ -12,13 +17,27 @@ class LeMonde(BasicNewsRecipe): use_embedded_content = False encoding = 'cp1252' publisher = 'lemonde.fr' + category = 'news, France, world' language = 'fr' + #publication_type = 'newsportal' + extra_css = ''' + h1{font-size:130%;} + .ariane{font-size:xx-small;} + .source{font-size:xx-small;} + #.href{font-size:xx-small;} + .LM_caption{color:#666666; font-size:x-small;} + #.main-article-info{font-family:Arial,Helvetica,sans-serif;} + #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} + #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} + ''' + #preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] conversion_options = { - 'comments' : description - ,'language' : language - ,'publisher' : publisher - ,'linearize_tables': True - } + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } remove_empty_feeds = True @@ -32,15 +51,28 @@ class LeMonde(BasicNewsRecipe): return soup preprocess_regexps = [ + (re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'), + (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + m.group(2) + m.group(3) + ' ' + m.group(4) + m.group(5) + m.group(6)), + (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + ' ' + m.group(2) + m.group(3) + m.group(4)), + (re.compile(r''), lambda match: ' '), + (re.compile(r'\("'), lambda match: '(« '), + (re.compile(r'"\)'), lambda match: ' »)'), + (re.compile(r'“'), lambda match: '(« '), + (re.compile(r'”'), lambda match: ' »)'), + (re.compile(r'>\''), lambda match: '>‘'), (re.compile(r' \''), lambda match: ' ‘'), (re.compile(r'\''), lambda match: '’'), - (re.compile(r'"<'), lambda match: ' »<'), + (re.compile(r'"'), lambda match: '« '), + (re.compile(r'""'), lambda match: '« '), + (re.compile(r'""'), lambda match: ' »'), + (re.compile(r'"'), lambda match: ' »'), + (re.compile(r'""'), lambda match: '>« '), + (re.compile(r'"<'), lambda match: ' »<'), (re.compile(r'’"'), lambda match: '’« '), (re.compile(r' "'), lambda match: ' « '), (re.compile(r'" '), lambda match: ' » '), - (re.compile(r'\("'), lambda match: '(« '), - (re.compile(r'"\)'), lambda match: ' »)'), (re.compile(r'"\.'), lambda match: ' ».'), (re.compile(r'",'), lambda match: ' »,'), (re.compile(r'"\?'), lambda match: ' »?'), @@ -56,8 +88,14 @@ class LeMonde(BasicNewsRecipe): (re.compile(r' %'), lambda match: ' %'), (re.compile(r'\.jpg » border='), lambda match: '.jpg'), (re.compile(r'\.png » border='), lambda match: '.png'), + (re.compile(r' – '), lambda match: ' – '), + (re.compile(r' – '), lambda match: ' – '), + (re.compile(r' - '), lambda match: ' – '), + (re.compile(r' -,'), lambda match: ' –,'), + (re.compile(r'»:'), lambda match: '» :'), ] + keep_only_tags = [ dict(name='div', attrs={'class':['contenu']}) ] @@ -65,11 +103,15 @@ class LeMonde(BasicNewsRecipe): remove_tags_after = [dict(id='appel_temoignage')] def get_article_url(self, article): - link = article.get('link') - if 'blog' not in link: - return link - + url = article.get('guid', None) + if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url : + url = None + return url +# def get_article_url(self, article): +# link = article.get('link') +# if 'blog' not in link and ('chat' not in link): +# return link feeds = [ ('A la une', 'http://www.lemonde.fr/rss/une.xml'), @@ -94,3 +136,4 @@ class LeMonde(BasicNewsRecipe): cover_url = link_item.img['src'] return cover_url +