From c14fd982e412e67a6ad81fd9cce44b2908b194f9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 15 Dec 2011 08:32:28 +0530 Subject: [PATCH] Update La Republica. Fixes #904387 (Updated recipe for La Repubblica) --- recipes/la_republica.recipe | 39 ++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/recipes/la_republica.recipe b/recipes/la_republica.recipe index 05be1955b4..2259f2dc52 100644 --- a/recipes/la_republica.recipe +++ b/recipes/la_republica.recipe @@ -1,13 +1,12 @@ __license__ = 'GPL v3' __author__ = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini' __copyright__ = '2009-2011, Darko Miletic , Lorenzo Vigentini ' -description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version' +description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version' ''' http://www.repubblica.it/ ''' -import re from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe @@ -25,27 +24,21 @@ class LaRepubblica(BasicNewsRecipe): use_embedded_content = False no_stylesheets = True publication_type = 'newspaper' - articles_are_obfuscated = True - temp_files = [] + articles_are_obfuscated = True + temp_files = [] extra_css = """ img{display: block} """ - + remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb'] - - preprocess_regexps = [ - (re.compile(r'.*?', re.DOTALL|re.IGNORECASE), lambda match: ''), - (re.compile(r'.*?', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'), - (re.compile(r'.*?', re.DOTALL|re.IGNORECASE), lambda match: '') - ] - + def get_article_url(self, article): link = BasicNewsRecipe.get_article_url(self, article) if link and not '.repubblica.it/' in link: link2 = article.get('id', article.get('guid', None)) if link2: link = link2 - return link.rpartition('?')[0] + return link.rpartition('?')[0] def get_obfuscated_article(self, url): count = 0 @@ -56,12 +49,12 @@ class LaRepubblica(BasicNewsRecipe): count = 10 except: print "Retrying download..." - count += 1 + count += 1 self.temp_files.append(PersistentTemporaryFile('_fa.html')) self.temp_files[-1].write(html) self.temp_files[-1].close() return self.temp_files[-1].name - + keep_only_tags = [ dict(attrs={'class':'articolo'}), dict(attrs={'class':'body-text'}), @@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe): remove_tags = [ dict(name=['object','link','meta','iframe','embed']), dict(name='span',attrs={'class':'linkindice'}), - dict(name='div', attrs={'class':'bottom-mobile'}), - dict(name='div', attrs={'id':['rssdiv','blocco']}), - dict(name='div', attrs={'class':'utility'}), + dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}), + dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}), + dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}), dict(name='div', attrs={'class':'generalbox'}), dict(name='ul', attrs={'id':'hystory'}) ] feeds = [ - (u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'), + (u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'), (u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'), (u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'), (u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'), @@ -105,8 +98,10 @@ class LaRepubblica(BasicNewsRecipe): def preprocess_html(self, soup): for item in soup.findAll(['hgroup','deresponsabilizzazione','per']): item.name = 'div' - item.attrs = [] + item.attrs = [] for item in soup.findAll(style=True): - del item['style'] + del item['style'] return soup - + + def preprocess_raw_html(self, raw, url): + return ''+raw[raw.find(''):]