diff --git a/recipes/le_monde.recipe b/recipes/le_monde.recipe index 6c7f15cca7..afc19e4d86 100644 --- a/recipes/le_monde.recipe +++ b/recipes/le_monde.recipe @@ -1,8 +1,9 @@ __license__ = 'GPL v3' -__copyright__ = '2011' +__copyright__ = '2012' ''' lemonde.fr ''' +import re from calibre.web.feeds.recipes import BasicNewsRecipe class LeMonde(BasicNewsRecipe): @@ -24,7 +25,7 @@ class LeMonde(BasicNewsRecipe): .ariane{font-size:xx-small;} .source{font-size:xx-small;} #.href{font-size:xx-small;} - .LM_caption{color:#666666; font-size:x-small;} + #.figcaption style{color:#666666; font-size:x-small;} #.main-article-info{font-family:Arial,Helvetica,sans-serif;} #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} @@ -40,8 +41,88 @@ class LeMonde(BasicNewsRecipe): remove_empty_feeds = True - auto_cleanup = True + filterDuplicates = True + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup + + preprocess_regexps = [ + (re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'), + (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + m.group(2) + m.group(3) + ' ' + m.group(4) + m.group(5) + m.group(6)), + (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + ' ' + m.group(2) + m.group(3) + m.group(4)), + (re.compile(r''), lambda match: ' '), + (re.compile(r'\("'), lambda match: '(« '), + (re.compile(r'"\)'), lambda match: ' »)'), + (re.compile(r'“'), lambda match: '(« '), + (re.compile(r'”'), lambda match: ' »)'), + (re.compile(r'>\''), lambda match: '>‘'), + (re.compile(r' \''), lambda match: ' ‘'), + (re.compile(r' "'), lambda match: ' « '), + (re.compile(r'>"'), lambda match: '>« '), + (re.compile(r'"<'), lambda match: ' »<'), + (re.compile(r'" '), lambda match: ' » '), + (re.compile(r'",'), lambda match: ' »,'), + (re.compile(r'\''), lambda match: '’'), + (re.compile(r'"'), lambda match: '« '), + (re.compile(r'""'), lambda match: '« '), + (re.compile(r'""'), lambda match: ' »'), + (re.compile(r'"'), lambda match: ' »'), + (re.compile(r'""'), lambda match: '>« '), + (re.compile(r'"<'), lambda match: ' »<'), + (re.compile(r'’"'), lambda match: '’« '), + (re.compile(r' "'), lambda match: ' « '), + (re.compile(r'" '), lambda match: ' » '), + (re.compile(r'"\.'), lambda match: ' ».'), + (re.compile(r'",'), lambda match: ' »,'), + (re.compile(r'"\?'), lambda match: ' »?'), + (re.compile(r'":'), lambda match: ' »:'), + (re.compile(r'";'), lambda match: ' »;'), + (re.compile(r'"\!'), lambda match: ' »!'), + (re.compile(r' :'), lambda match: ' :'), + (re.compile(r' ;'), lambda match: ' ;'), + (re.compile(r' \?'), lambda match: ' ?'), + (re.compile(r' \!'), lambda match: ' !'), + (re.compile(r'\s»'), lambda match: ' »'), + (re.compile(r'«\s'), lambda match: '« '), + (re.compile(r' %'), lambda match: ' %'), + (re.compile(r'\.jpg » width='), lambda match: '.jpg'), + (re.compile(r'\.png » width='), lambda match: '.png'), + (re.compile(r' – '), lambda match: ' – '), + (re.compile(r'figcaption style="display:none"'), lambda match: 'figcaption'), + (re.compile(r' – '), lambda match: ' – '), + (re.compile(r' - '), lambda match: ' – '), + (re.compile(r' -,'), lambda match: ' –,'), + (re.compile(r'»:'), lambda match: '» :'), + ] + + + keep_only_tags = [ + dict(name='div', attrs={'class':['global']}) + ] + + remove_tags = [ + dict(name='div', attrs={'class':['bloc_base meme_sujet']}), + dict(name='p', attrs={'class':['lire']}) + ] + + remove_tags_after = [dict(id='fb-like')] + + def get_article_url(self, article): + url = article.get('guid', None) + if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url : + url = None + return url + +# def get_article_url(self, article): +# link = article.get('link') +# if 'blog' not in link and ('chat' not in link): +# return link feeds = [ ('A la une', 'http://www.lemonde.fr/rss/une.xml'), @@ -66,11 +147,3 @@ class LeMonde(BasicNewsRecipe): cover_url = link_item.img['src'] return cover_url - - def get_article_url(self, article): - url = article.get('guid', None) - if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url : - url = None - return url - -