From 9915d4b9636ec094f9b501d94ec55b4986df9c7c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 20 Mar 2012 18:49:00 +0530
Subject: [PATCH] Fix Le Monde

---
 recipes/le_monde.recipe | 79 +++++------------------------------------
 1 file changed, 8 insertions(+), 71 deletions(-)
diff --git a/recipes/le_monde.recipe b/recipes/le_monde.recipe
index 8fcdf9c870..6c7f15cca7 100644
--- a/recipes/le_monde.recipe
+++ b/recipes/le_monde.recipe
@@ -3,7 +3,6 @@ __copyright__ = '2011'
 '''
 lemonde.fr
 '''
-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 
 class LeMonde(BasicNewsRecipe):
@@ -41,77 +40,8 @@ class LeMonde(BasicNewsRecipe):
 
     remove_empty_feeds = True
 
-    filterDuplicates = True
+    auto_cleanup = True
 
-    def preprocess_html(self, soup):
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-               tstr = alink.string
-               alink.replaceWith(tstr)
-        return self.adeify_images(soup)
-
-    preprocess_regexps = [
-        (re.compile(r'([0-9])%'), lambda m: m.group(1) + '&nbsp;%'),
-        (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + m.group(2) + m.group(3) + '&nbsp;' + m.group(4) + m.group(5) + m.group(6)),
-        (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) + '&nbsp;' + m.group(2) + m.group(3) + m.group(4)),
-        (re.compile(r'<span>'), lambda match: ' <span>'),
-        (re.compile(r'\("'), lambda match: '(&laquo;&nbsp;'),
-        (re.compile(r'"\)'), lambda match: '&nbsp;&raquo;)'),
-        (re.compile(r'&ldquo;'), lambda match: '(&laquo;&nbsp;'),
-        (re.compile(r'&rdquo;'), lambda match: '&nbsp;&raquo;)'),
-        (re.compile(r'>\''), lambda match: '>&lsquo;'),
-        (re.compile(r' \''), lambda match: ' &lsquo;'),
-        (re.compile(r'\''), lambda match: '&rsquo;'),
-        (re.compile(r'"<em>'), lambda match: '<em>&laquo;&nbsp;'),
-        (re.compile(r'"<em>"</em><em>'), lambda match: '<em>&laquo;&nbsp;'),
-        (re.compile(r'"<a href='), lambda match: '&laquo;&nbsp;<a href='),
-        (re.compile(r'</em>"'), lambda match: '&nbsp;&raquo;</em>'),
-        (re.compile(r'</a>"'), lambda match: '&nbsp;&raquo;</a>'),
-        (re.compile(r'"</'), lambda match: '&nbsp;&raquo;</'),
-        (re.compile(r'>"'), lambda match: '>&laquo;&nbsp;'),
-        (re.compile(r'"<'), lambda match: '&nbsp;&raquo;<'),
-        (re.compile(r'&rsquo;"'), lambda match: '&rsquo;«&nbsp;'),
-        (re.compile(r' "'), lambda match: ' &laquo;&nbsp;'),
-        (re.compile(r'" '), lambda match: '&nbsp;&raquo; '),
-        (re.compile(r'"\.'), lambda match: '&nbsp;&raquo;.'),
-        (re.compile(r'",'), lambda match: '&nbsp;&raquo;,'),
-        (re.compile(r'"\?'), lambda match: '&nbsp;&raquo;?'),
-        (re.compile(r'":'), lambda match: '&nbsp;&raquo;:'),
-        (re.compile(r'";'), lambda match: '&nbsp;&raquo;;'),
-        (re.compile(r'"\!'), lambda match: '&nbsp;&raquo;!'),
-        (re.compile(r' :'), lambda match: '&nbsp;:'),
-        (re.compile(r' ;'), lambda match: '&nbsp;;'),
-        (re.compile(r' \?'), lambda match: '&nbsp;?'),
-        (re.compile(r' \!'), lambda match: '&nbsp;!'),
-        (re.compile(r'\s»'), lambda match: '&nbsp;»'),
-        (re.compile(r'«\s'), lambda match: '«&nbsp;'),
-        (re.compile(r' %'), lambda match: '&nbsp;%'),
-        (re.compile(r'\.jpg&nbsp;&raquo; border='), lambda match: '.jpg'),
-        (re.compile(r'\.png&nbsp;&raquo; border='), lambda match: '.png'),
-        (re.compile(r' &ndash; '), lambda match: '&nbsp;&ndash; '),
-        (re.compile(r' – '), lambda match: '&nbsp;&ndash; '),
-        (re.compile(r' - '), lambda match: '&nbsp;&ndash; '),
-        (re.compile(r' -,'), lambda match: '&nbsp;&ndash;,'),
-        (re.compile(r'&raquo;:'), lambda match: '&raquo;&nbsp;:'),
-        ]
-
-
-    keep_only_tags    = [
-                       dict(name='div', attrs={'class':['contenu']})
-                        ]
-    remove_tags = [dict(name='div', attrs={'class':['LM_atome']})]
-    remove_tags_after = [dict(id='appel_temoignage')]
-
-    def get_article_url(self, article):
-          url = article.get('guid', None)
-          if '/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url :
-              url = None
-          return url
-
-#    def get_article_url(self, article):
-#        link = article.get('link')
-#        if 'blog' not in link and ('chat' not in link):
-#             return link
 
     feeds          = [
                       ('A la une', 'http://www.lemonde.fr/rss/une.xml'),
@@ -137,3 +67,10 @@ class LeMonde(BasicNewsRecipe):
 
         return cover_url
 
+    def get_article_url(self, article):
+        '''Return the article guid as its URL, or None if the guid is missing or in an excluded section.'''
+        url = article.get('guid', None)
+        excluded = ('/chat/', '.blog', '/video/', '/sport/', '/portfolio/', '/visuel/')
+        return None if url is None or any(part in url for part in excluded) else url
+
+