diff --git a/recipes/eltribuno_salta_impreso.recipe b/recipes/eltribuno_salta_impreso.recipe index f768c36add..bf50ffce22 100644 --- a/recipes/eltribuno_salta_impreso.recipe +++ b/recipes/eltribuno_salta_impreso.recipe @@ -1,128 +1,45 @@ -__license__ = 'GPL v3' -__copyright__ = '2013, Darko Miletic ' +__license__ = 'GPL v3' +__copyright__ = '2013 - 2016, Darko Miletic ' ''' http://www.eltribuno.info/salta/edicion_impresa.aspx ''' -import urllib -from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe -from collections import OrderedDict - class ElTribunoSaltaImpreso(BasicNewsRecipe): - title = 'El Tribuno Salta (Edición Impresa)' - __author__ = 'Darko Miletic' - description = "Diario principal de Salta" - publisher = 'Horizontes S.A.' - category = 'news, politics, Salta, Argentina, World' - oldest_article = 2 - language = 'es_AR' - max_articles_per_feed = 250 - no_stylesheets = True - use_embedded_content = False - encoding = 'utf8' - publication_type = 'newspaper' - delay = 1 - articles_are_obfuscated = True - temp_files = [] - PREFIX = 'http://www.eltribuno.info/salta/' - INDEX = PREFIX + 'edicion_impresa.aspx' - PRINTURL = PREFIX + 'nota_print.aspx?%s' + title = 'El Tribuno Salta' + __author__ = 'Darko Miletic' + description = "Diario principal de Salta" + publisher = 'Horizontes S.A.' + category = 'news, politics, Salta, Argentina, World' + oldest_article = 2 + language = 'es_AR' + max_articles_per_feed = 250 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf8' + publication_type = 'newspaper' + remove_javascript = True + auto_cleanup = True conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True - } - - keep_only_tags = [ - dict(name='div', attrs={'class': ['notaHead', 'notaContent']})] - remove_tags = [ - dict(name=['meta', 'iframe', 'base', - 'object', 'embed', 'link', 'img']), - dict(name='ul', attrs={'class': 'Tabs'}) - ] + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language + } extra_css = """ body{font-family: Arial,Helvetica,sans-serif} - .notaHead h4{text-transform: uppercase; color: gray} img{margin-top: 0.8em; display: block} """ - def parse_index(self): - feeds = OrderedDict() - soup = None - count = 0 - while (count < 5): - try: - soup = self.index_to_soup(self.INDEX) - count = 5 - except: - print "Retrying download..." - count += 1 - if not soup: - return [] - alink = soup.find('a', href=True, attrs={'class': 'ZoomTapa'}) - if alink and 'href' in alink: - self.cover_url = alink['href'] - sections = soup.findAll( - 'div', attrs={'id': lambda x: x and x.startswith('Ediciones')}) - for section in sections: - section_title = 'Sin titulo' - sectiont = section.find('h3', attrs={'class': 'NombreSeccion'}) - if sectiont: - section_title = self.tag_to_string(sectiont.span) - - arts = section.findAll( - 'div', attrs={'class': 'Noticia NoticiaAB1'}) - for article in arts: - articles = [] - title = self.tag_to_string(article.div.h3.a) - url = article.div.h3.a['href'] - description = self.tag_to_string(article.p) - articles.append({'title': title, 'url': url, - 'description': description, 'date': ''}) - - if articles: - if section_title not in feeds: - feeds[section_title] = [] - feeds[section_title] += articles - - ans = [(key, val) for key, val in feeds.iteritems()] - return ans - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll('a'): - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - str = self.tag_to_string(item) - item.replaceWith(str) - return soup + feeds = [ + (u'Mas leidas', u'http://www.eltribuno.info/rss/salta/masleidas.xml') + ,(u'El Tribuno', u'http://www.eltribuno.info/rss/salta/home.xml') + ,(u'Salta' , u'http://www.eltribuno.info/rss/salta/salta.xml') + ,(u'Deportes' , u'http://www.eltribuno.info/rss/salta/deportes.xml') + ] def get_masthead_title(self): return 'El Tribuno' - - def get_obfuscated_article(self, url): - count = 0 - while (count < 10): - try: - response = self.browser.open(url) - html = response.read() - count = 10 - except: - print "Retrying download..." - count += 1 - tfile = PersistentTemporaryFile('_fa.html') - tfile.write(html) - tfile.close() - self.temp_files.append(tfile) - return tfile.name - - def print_version(self, url): - right = url.rpartition('/')[2] - artid = right.partition('-')[0] - params = {'Note': artid} - return (self.PRINTURL % urllib.urlencode(params))