From 5f3c10f91df30f2a8d5c2f5d6ca98c572251a4c0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 23 Jul 2013 16:06:32 +0530
Subject: [PATCH] El Tribuno Salta and Jujuy by Darko Miletic

Fixes #1203724 [New recipes for El Tribuno Salta and Jujuy](https://bugs.launchpad.net/calibre/+bug/1203724)
---
 recipes/eltribuno_jujuy_impreso.recipe    | 150 ++++++++++++++++++++++
 recipes/eltribuno_salta_impreso.recipe    | 150 ++++++++++++++++++++++
 recipes/icons/eltribuno_jujuy_impreso.png | Bin 0 -> 592 bytes
 recipes/icons/eltribuno_salta_impreso.png | Bin 0 -> 592 bytes
 4 files changed, 300 insertions(+)
 create mode 100644 recipes/eltribuno_jujuy_impreso.recipe
 create mode 100644 recipes/eltribuno_salta_impreso.recipe
 create mode 100644 recipes/icons/eltribuno_jujuy_impreso.png
 create mode 100644 recipes/icons/eltribuno_salta_impreso.png

diff --git a/recipes/eltribuno_jujuy_impreso.recipe b/recipes/eltribuno_jujuy_impreso.recipe
new file mode 100644
index 0000000000..2b725231c9
--- /dev/null
+++ b/recipes/eltribuno_jujuy_impreso.recipe
@@ -0,0 +1,150 @@
+__license__ = 'GPL v3'
+__copyright__ = '2013, Darko Miletic '
+'''
+http://www.eltribuno.info/jujuy/edicion_impresa.aspx
+'''
+
+import urllib
+from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
+from collections import OrderedDict
+
+class ElTribunoJujuyImpreso(BasicNewsRecipe):
+    """Recipe for the print edition of El Tribuno (Jujuy)."""
+
+    title = 'El Tribuno Jujuy (Edición Impresa)'
+    __author__ = 'Darko Miletic'
+    description = "Diario principal de Jujuy"
+    publisher = 'Horizontes S.A.'
+    category = 'news, politics, Jujuy, Argentina, World'
+    oldest_article = 2
+    language = 'es_AR'
+    max_articles_per_feed = 250
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'cp1252'
+    publication_type = 'newspaper'
+    delay = 1
+    articles_are_obfuscated = True
+    temp_files = []
+    PREFIX = 'http://www.eltribuno.info/jujuy/'
+    INDEX = PREFIX + 'edicion_impresa.aspx'
+    PRINTURL = PREFIX + 'nota_print.aspx?%s'
+
+    conversion_options = {
+        'comment': description,
+        'tags': category,
+        'publisher': publisher,
+        'language': language,
+        'linearize_tables': True,
+    }
+
+    keep_only_tags = [dict(name='div', attrs={'class': ['notaHead', 'notaContent']})]
+    remove_tags = [
+        dict(name=['meta', 'iframe', 'base', 'object', 'embed', 'link', 'img']),
+        dict(name='ul', attrs={'class': 'Tabs'}),
+    ]
+
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif}
+        .notaHead h4{text-transform: uppercase; color: gray}
+        img{margin-top: 0.8em; display: block}
+    """
+
+    def parse_index(self):
+        """Build the feed list from the print-edition index page.
+
+        Returns a list of (section title, article list) tuples in page
+        order, or an empty list when the index cannot be downloaded.
+        """
+        soup = None
+        for _ in range(5):
+            try:
+                soup = self.index_to_soup(self.INDEX)
+                break
+            except Exception:
+                self.log.warn('Retrying download...')
+        if soup is None:
+            return []
+
+        # find() already requires an href attribute, so only a missing
+        # tag needs handling; "'href' in alink" would test the tag's
+        # children, not its attributes.
+        alink = soup.find('a', href=True, attrs={'class': 'ZoomTapa'})
+        if alink is not None:
+            self.cover_url = alink['href']
+
+        feeds = OrderedDict()
+        sections = soup.findAll('div', attrs={'id': lambda x: x and x.startswith('Ediciones')})
+        for section in sections:
+            section_title = 'Sin titulo'
+            sectiont = section.find('h3', attrs={'class': 'NombreSeccion'})
+            if sectiont is not None:
+                section_title = self.tag_to_string(sectiont.span)
+
+            # One list per section, created before the article loop, so
+            # it is defined even when the section has no articles.
+            articles = []
+            for article in section.findAll('div', attrs={'class': 'Noticia NoticiaAB1'}):
+                link = article.div.h3.a
+                articles.append({
+                    'title': self.tag_to_string(link),
+                    'url': link['href'],
+                    'description': self.tag_to_string(article.p),
+                    'date': '',
+                })
+            if articles:
+                feeds.setdefault(section_title, []).extend(articles)
+
+        return list(feeds.items())
+
+    def preprocess_html(self, soup):
+        """Drop inline styles and replace every link with its plain text."""
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('a'):
+            # Named 'text' so the builtin str is not shadowed.
+            text = item.string
+            if text is None:
+                text = self.tag_to_string(item)
+            item.replaceWith(text)
+        return soup
+
+    def get_masthead_title(self):
+        """Return the masthead title."""
+        return 'El Tribuno'
+
+    def get_obfuscated_article(self, url):
+        """Download url and return the path of a temporary copy.
+
+        The download is attempted up to 10 times. If every attempt
+        fails, the last download error is raised.
+        """
+        html = None
+        last_error = None
+        for _ in range(10):
+            try:
+                html = self.browser.open(url).read()
+                break
+            except Exception as err:
+                last_error = err
+                self.log.warn('Retrying download...')
+        if html is None:
+            raise last_error
+        tfile = PersistentTemporaryFile('_fa.html')
+        tfile.write(html)
+        tfile.close()
+        self.temp_files.append(tfile)
+        return tfile.name
+
+    def print_version(self, url):
+        """Return the print-view URL for the article at url.
+
+        The note id is the text between the last '/' of url and the
+        first '-' after it.
+        """
+        right = url.rpartition('/')[2]
+        artid = right.partition('-')[0]
+        params = {'Note': artid}
+        return (self.PRINTURL % urllib.urlencode(params))
+
diff --git a/recipes/eltribuno_salta_impreso.recipe b/recipes/eltribuno_salta_impreso.recipe
new file mode 100644
index 0000000000..67cc073a7e
--- /dev/null
+++ b/recipes/eltribuno_salta_impreso.recipe
@@ -0,0 +1,150 @@
+__license__ = 'GPL v3'
+__copyright__ = '2013, Darko Miletic '
+'''
+http://www.eltribuno.info/salta/edicion_impresa.aspx
+'''
+
+import urllib
+from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
+from collections import OrderedDict
+
+class ElTribunoSaltaImpreso(BasicNewsRecipe):
+    """Recipe for the print edition of El Tribuno (Salta)."""
+
+    title = 'El Tribuno Salta (Edición Impresa)'
+    __author__ = 'Darko Miletic'
+    description = "Diario principal de Salta"
+    publisher = 'Horizontes S.A.'
+    category = 'news, politics, Salta, Argentina, World'
+    oldest_article = 2
+    language = 'es_AR'
+    max_articles_per_feed = 250
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'cp1252'
+    publication_type = 'newspaper'
+    delay = 1
+    articles_are_obfuscated = True
+    temp_files = []
+    PREFIX = 'http://www.eltribuno.info/salta/'
+    INDEX = PREFIX + 'edicion_impresa.aspx'
+    PRINTURL = PREFIX + 'nota_print.aspx?%s'
+
+    conversion_options = {
+        'comment': description,
+        'tags': category,
+        'publisher': publisher,
+        'language': language,
+        'linearize_tables': True,
+    }
+
+    keep_only_tags = [dict(name='div', attrs={'class': ['notaHead', 'notaContent']})]
+    remove_tags = [
+        dict(name=['meta', 'iframe', 'base', 'object', 'embed', 'link', 'img']),
+        dict(name='ul', attrs={'class': 'Tabs'}),
+    ]
+
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif}
+        .notaHead h4{text-transform: uppercase; color: gray}
+        img{margin-top: 0.8em; display: block}
+    """
+
+    def parse_index(self):
+        """Build the feed list from the print-edition index page.
+
+        Returns a list of (section title, article list) tuples in page
+        order, or an empty list when the index cannot be downloaded.
+        """
+        soup = None
+        for _ in range(5):
+            try:
+                soup = self.index_to_soup(self.INDEX)
+                break
+            except Exception:
+                self.log.warn('Retrying download...')
+        if soup is None:
+            return []
+
+        # find() already requires an href attribute, so only a missing
+        # tag needs handling; "'href' in alink" would test the tag's
+        # children, not its attributes.
+        alink = soup.find('a', href=True, attrs={'class': 'ZoomTapa'})
+        if alink is not None:
+            self.cover_url = alink['href']
+
+        feeds = OrderedDict()
+        sections = soup.findAll('div', attrs={'id': lambda x: x and x.startswith('Ediciones')})
+        for section in sections:
+            section_title = 'Sin titulo'
+            sectiont = section.find('h3', attrs={'class': 'NombreSeccion'})
+            if sectiont is not None:
+                section_title = self.tag_to_string(sectiont.span)
+
+            # One list per section, created before the article loop, so
+            # it is defined even when the section has no articles.
+            articles = []
+            for article in section.findAll('div', attrs={'class': 'Noticia NoticiaAB1'}):
+                link = article.div.h3.a
+                articles.append({
+                    'title': self.tag_to_string(link),
+                    'url': link['href'],
+                    'description': self.tag_to_string(article.p),
+                    'date': '',
+                })
+            if articles:
+                feeds.setdefault(section_title, []).extend(articles)
+
+        return list(feeds.items())
+
+    def preprocess_html(self, soup):
+        """Drop inline styles and replace every link with its plain text."""
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('a'):
+            # Named 'text' so the builtin str is not shadowed.
+            text = item.string
+            if text is None:
+                text = self.tag_to_string(item)
+            item.replaceWith(text)
+        return soup
+
+    def get_masthead_title(self):
+        """Return the masthead title."""
+        return 'El Tribuno'
+
+    def get_obfuscated_article(self, url):
+        """Download url and return the path of a temporary copy.
+
+        The download is attempted up to 10 times. If every attempt
+        fails, the last download error is raised.
+        """
+        html = None
+        last_error = None
+        for _ in range(10):
+            try:
+                html = self.browser.open(url).read()
+                break
+            except Exception as err:
+                last_error = err
+                self.log.warn('Retrying download...')
+        if html is None:
+            raise last_error
+        tfile = PersistentTemporaryFile('_fa.html')
+        tfile.write(html)
+        tfile.close()
+        self.temp_files.append(tfile)
+        return tfile.name
+
+    def print_version(self, url):
+        """Return the print-view URL for the article at url.
+
+        The note id is the text between the last '/' of url and the
+        first '-' after it.
+        """
+        right = url.rpartition('/')[2]
+        artid = right.partition('-')[0]
+        params = {'Note': artid}
+        return (self.PRINTURL % urllib.urlencode(params))
+
diff --git a/recipes/icons/eltribuno_jujuy_impreso.png b/recipes/icons/eltribuno_jujuy_impreso.png
new file mode 100644
index 0000000000000000000000000000000000000000..8862b78d0c77b95c435de73b4bf78265bcc95815
GIT binary patch
literal 592
zcmV-W0el
z0jV;e3a3F)l(;f+?f9$by$7X7%GLGFz2}{CFUhh@5kNu+$pk2+sHAC11Y{2qM5?^N
zbzRv_=yNei63n2|>2zMlBu%&5t=H?wQ5u|_eBa!hSzBvdUSfzdrzvGj&QQsXZt!t>
zRUNh4=E{nDbp-{z!^0nYd!UKqcrX}nc4SZ-1Ob$2y6)ZI-)wLH1~(z`)04Tp{OR^K
z2}AJa^LcK__kCoDSrmPposATQ31`r2uC6vOE>M@27-PDw(_AhWh9Nh^*k{i(78fvGIO?Kk|Kk7O^pc$s|E5
zmMP`XYPDLaRG=^RypPAnp*Rq~dZ9oKgXlW)VzKBr4!02tc%D}-mw)wonevJ>2o!OU
z%EAJ=YqgrHszmM+FJgK$8rimu7>d*4M3!X@heLdpN+on-D@59rDsdVlnx;L!kfxK#
eL>G|V(Ek_WCy$~!G;g^80000el
z0jV;e3a3F)l(;f+?f9$by$7X7%GLGFz2}{CFUhh@5kNu+$pk2+sHAC11Y{2qM5?^N
zbzRv_=yNei63n2|>2zMlBu%&5t=H?wQ5u|_eBa!hSzBvdUSfzdrzvGj&QQsXZt!t>
zRUNh4=E{nDbp-{z!^0nYd!UKqcrX}nc4SZ-1Ob$2y6)ZI-)wLH1~(z`)04Tp{OR^K
z2}AJa^LcK__kCoDSrmPposATQ31`r2uC6vOE>M@27-PDw(_AhWh9Nh^*k{i(78fvGIO?Kk|Kk7O^pc$s|E5
zmMP`XYPDLaRG=^RypPAnp*Rq~dZ9oKgXlW)VzKBr4!02tc%D}-mw)wonevJ>2o!OU
z%EAJ=YqgrHszmM+FJgK$8rimu7>d*4M3!X@heLdpN+on-D@59rDsdVlnx;L!kfxK#
eL>G|V(Ek_WCy$~!G;g^80000