From c14fd982e412e67a6ad81fd9cce44b2908b194f9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 15 Dec 2011 08:32:28 +0530
Subject: [PATCH] Update La Republica. Fixes #904387 (Updated recipe for La
 Repubblica)

---
 recipes/la_republica.recipe | 39 ++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 22 deletions(-)
diff --git a/recipes/la_republica.recipe b/recipes/la_republica.recipe
index 05be1955b4..2259f2dc52 100644
--- a/recipes/la_republica.recipe
+++ b/recipes/la_republica.recipe
@@ -1,13 +1,12 @@
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
 __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
-description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
+description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'
 
 '''
 http://www.repubblica.it/
 '''
 
-import re
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe
 
@@ -25,27 +24,21 @@ class LaRepubblica(BasicNewsRecipe):
     use_embedded_content    = False
     no_stylesheets          = True
     publication_type        = 'newspaper'
-    articles_are_obfuscated = True
-    temp_files              = []
+    articles_are_obfuscated = True    
+    temp_files              = []    
     extra_css               = """
                                img{display: block}
                               """
-
+                           
     remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']
-
-    preprocess_regexps = [
-        (re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
-        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
-        (re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
-    ]
-
+    
     def get_article_url(self, article):
         link = BasicNewsRecipe.get_article_url(self, article)
         if link and not '.repubblica.it/' in link:
             link2 = article.get('id', article.get('guid', None))
             if link2:
                 link = link2
-        return link.rpartition('?')[0]
+        return link.rpartition('?')[0]        
 
     def get_obfuscated_article(self, url):
         count = 0
@@ -56,12 +49,12 @@ class LaRepubblica(BasicNewsRecipe):
                 count = 10
             except:
                 print "Retrying download..."
-            count += 1
+            count += 1        
         self.temp_files.append(PersistentTemporaryFile('_fa.html'))
         self.temp_files[-1].write(html)
         self.temp_files[-1].close()
         return self.temp_files[-1].name
-
+        
     keep_only_tags     = [
                           dict(attrs={'class':'articolo'}),
                           dict(attrs={'class':'body-text'}),
@@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
     remove_tags        = [
                             dict(name=['object','link','meta','iframe','embed']),
                             dict(name='span',attrs={'class':'linkindice'}),
-                            dict(name='div', attrs={'class':'bottom-mobile'}),
-                            dict(name='div', attrs={'id':['rssdiv','blocco']}),
-                            dict(name='div', attrs={'class':'utility'}),
+                            dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
+                            dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
+                            dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
                             dict(name='div', attrs={'class':'generalbox'}),
                             dict(name='ul', attrs={'id':'hystory'})
                          ]
 
     feeds          = [
-                       (u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
+                       (u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
                        (u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
                        (u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
                        (u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
@@ -105,8 +98,10 @@ class LaRepubblica(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.findAll(['hgroup','deresponsabilizzazione','per']):
             item.name = 'div'
-            item.attrs = []
+            item.attrs = []            
         for item in soup.findAll(style=True):
-            del item['style']
+            del item['style']           
         return soup
-
+                      
+    def preprocess_raw_html(self, raw, url):  # swap everything before '</head>' for a bare '<html><head>'; pages lacking '</head>' pass through unchanged
+        return '<html><head>'+raw[raw.find('</head>'):] if '</head>' in raw else raw