Update La Repubblica. Fixes #904387 (Updated recipe for La Repubblica)

2026-01-03 10:40:21 -05:00 · 2011-12-15 08:32:28 +05:30 · 2011-12-15 08:32:28 +05:30 · c14fd982e4
commit c14fd982e4
parent 9a2f9517f7
1 changed files with 17 additions and 22 deletions
--- a/recipes/la_republica.recipe
+++ b/recipes/la_republica.recipe
@ -1,13 +1,12 @@
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
 __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
-description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
+description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'

 '''
 http://www.repubblica.it/
 '''

-import re
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe

@ -25,27 +24,21 @@ class LaRepubblica(BasicNewsRecipe):
    use_embedded_content    = False
    no_stylesheets          = True
    publication_type        = 'newspaper'
-    articles_are_obfuscated = True
-    temp_files              = []
+    articles_are_obfuscated = True    
+    temp_files              = []    
    extra_css               = """
                               img{display: block}
                              """
-
+                           
    remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']
-
-    preprocess_regexps = [
-        (re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
-        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
-        (re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
-    ]
-
+    
    def get_article_url(self, article):
        link = BasicNewsRecipe.get_article_url(self, article)
        if link and not '.repubblica.it/' in link:
            link2 = article.get('id', article.get('guid', None))
            if link2:
                link = link2
-        return link.rpartition('?')[0]
+        return link.rpartition('?')[0]        

    def get_obfuscated_article(self, url):
        count = 0
@ -56,12 +49,12 @@ class LaRepubblica(BasicNewsRecipe):
                count = 10
            except:
                print "Retrying download..."
-            count += 1
+            count += 1        
        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(html)
        self.temp_files[-1].close()
        return self.temp_files[-1].name
-
+        
    keep_only_tags     = [
                          dict(attrs={'class':'articolo'}),
                          dict(attrs={'class':'body-text'}),
@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
    remove_tags        = [
                            dict(name=['object','link','meta','iframe','embed']),
                            dict(name='span',attrs={'class':'linkindice'}),
-                            dict(name='div', attrs={'class':'bottom-mobile'}),
-                            dict(name='div', attrs={'id':['rssdiv','blocco']}),
-                            dict(name='div', attrs={'class':'utility'}),
+                            dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
+                            dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
+                            dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
                            dict(name='div', attrs={'class':'generalbox'}),
                            dict(name='ul', attrs={'id':'hystory'})
                         ]

    feeds          = [
-                       (u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
+                       (u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
                       (u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
                       (u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
                       (u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
@ -105,8 +98,10 @@ class LaRepubblica(BasicNewsRecipe):
    def preprocess_html(self, soup):
        for item in soup.findAll(['hgroup','deresponsabilizzazione','per']):
            item.name = 'div'
-            item.attrs = []
+            item.attrs = []            
        for item in soup.findAll(style=True):
-            del item['style']
+            del item['style']           
        return soup
-
+                      
+    def preprocess_raw_html(self, raw, url):
+       return '<html><head>'+raw[raw.find('</head>'):]