From 9915d4b9636ec094f9b501d94ec55b4986df9c7c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 20 Mar 2012 18:49:00 +0530 Subject: [PATCH] Fix Le Monde --- recipes/le_monde.recipe | 79 +++++------------------------------------ 1 file changed, 8 insertions(+), 71 deletions(-) diff --git a/recipes/le_monde.recipe b/recipes/le_monde.recipe index 8fcdf9c870..6c7f15cca7 100644 --- a/recipes/le_monde.recipe +++ b/recipes/le_monde.recipe @@ -3,7 +3,6 @@ __copyright__ = '2011' ''' lemonde.fr ''' -import re from calibre.web.feeds.recipes import BasicNewsRecipe class LeMonde(BasicNewsRecipe): @@ -41,77 +40,8 @@ class LeMonde(BasicNewsRecipe): remove_empty_feeds = True - filterDuplicates = True + auto_cleanup = True - def preprocess_html(self, soup): - for alink in soup.findAll('a'): - if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) - return self.adeify_images(soup) - - preprocess_regexps = [ - (re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'), - (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + m.group(2) + m.group(3) + ' ' + m.group(4) + m.group(5) + m.group(6)), - (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + ' ' + m.group(2) + m.group(3) + m.group(4)), - (re.compile(r''), lambda match: ' '), - (re.compile(r'\("'), lambda match: '(« '), - (re.compile(r'"\)'), lambda match: ' »)'), - (re.compile(r'“'), lambda match: '(« '), - (re.compile(r'”'), lambda match: ' »)'), - (re.compile(r'>\''), lambda match: '>‘'), - (re.compile(r' \''), lambda match: ' ‘'), - (re.compile(r'\''), lambda match: '’'), - (re.compile(r'"'), lambda match: '« '), - (re.compile(r'""'), lambda match: '« '), - (re.compile(r'""'), lambda match: ' »'), - (re.compile(r'"'), lambda match: ' »'), - (re.compile(r'""'), lambda match: '>« '), - (re.compile(r'"<'), lambda match: ' »<'), - (re.compile(r'’"'), lambda match: '’« '), - (re.compile(r' "'), lambda match: ' « '), - (re.compile(r'" '), lambda match: ' » '), - (re.compile(r'"\.'), lambda match: ' ».'), - (re.compile(r'",'), lambda match: ' »,'), - (re.compile(r'"\?'), lambda match: ' »?'), - (re.compile(r'":'), lambda match: ' »:'), - (re.compile(r'";'), lambda match: ' »;'), - (re.compile(r'"\!'), lambda match: ' »!'), - (re.compile(r' :'), lambda match: ' :'), - (re.compile(r' ;'), lambda match: ' ;'), - (re.compile(r' \?'), lambda match: ' ?'), - (re.compile(r' \!'), lambda match: ' !'), - (re.compile(r'\s»'), lambda match: ' »'), - (re.compile(r'«\s'), lambda match: '« '), - (re.compile(r' %'), lambda match: ' %'), - (re.compile(r'\.jpg » border='), lambda match: '.jpg'), - (re.compile(r'\.png » border='), lambda match: '.png'), - (re.compile(r' – '), lambda match: ' – '), - (re.compile(r' – '), lambda match: ' – '), - (re.compile(r' - '), lambda match: ' – '), - (re.compile(r' -,'), lambda match: ' –,'), - (re.compile(r'»:'), lambda match: '» :'), - ] - - - keep_only_tags = [ - dict(name='div', attrs={'class':['contenu']}) - ] - remove_tags = [dict(name='div', attrs={'class':['LM_atome']})] - remove_tags_after = [dict(id='appel_temoignage')] - - def get_article_url(self, article): - url = article.get('guid', None) - if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url : - url = None - return url - -# def get_article_url(self, article): -# link = article.get('link') -# if 'blog' not in link and ('chat' not in link): -# return link feeds = [ ('A la une', 'http://www.lemonde.fr/rss/une.xml'), @@ -137,3 +67,10 @@ class LeMonde(BasicNewsRecipe): return cover_url + def get_article_url(self, article): + url = article.get('guid', None) + if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url : + url = None + return url + +