diff --git a/resources/recipes/animal_politico.recipe b/resources/recipes/animal_politico.recipe new file mode 100644 index 0000000000..f48587ea94 --- /dev/null +++ b/resources/recipes/animal_politico.recipe @@ -0,0 +1,111 @@ +#!/usr/bin/python +# encoding: utf-8 + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1290663986(BasicNewsRecipe): + title = u'Animal Pol\u00EDtico' + publisher = u'Animal Pol\u00EDtico' + category = u'News, Mexico' + description = u'Noticias Pol\u00EDticas' + __author__ = 'leamsi' + masthead_url = 'http://www.animalpolitico.com/wp-content/themes/animal_mu/images/logo.png' + oldest_article = 1 + max_articles_per_feed = 100 + language = 'es' + + #feeds = [(u'Animal Politico', u'http://www.animalpolitico.com/feed/')] + + remove_tags_before = dict(name='div', id='main') + remove_tags = [dict(name='div', attrs={'class':'fb-like-button'})] + keep_only_tags = [dict(name='h1', attrs={'class':'entry-title'}), + dict(name='div', attrs={'class':'entry-content'})] + remove_javascript = True + INDEX = 'http://www.animalpolitico.com/' + + def generic_parse(self, soup): + articles = [] + for entry in soup.findAll(lambda tag: tag.name == 'li' and tag.has_key('class') and tag['class'].find('hentry') != -1): #soup.findAll('li', 'hentry'): + article_url = entry.a['href'] + '?print=yes' + article_title= entry.find('h3', 'entry-title') + article_title= self.tag_to_string(article_title) + article_date = entry.find('span', 'the-time') + article_date = self.tag_to_string(article_date) + article_desc = self.tag_to_string(entry.find('p')) + + #print 'Article:',article_title, article_date,article_url + #print entry['class'] + + articles.append({'title' : article_title, + 'date' : article_date, + 'description' : article_desc, + 'url' : article_url}) + # Avoid including the multimedia stuff. + if entry['class'].find('last') != -1: + break + + return articles + + def plumaje_parse(self, soup): + articles = [] + blogs_soup = soup.find(lambda tag: tag.name == 'ul' and tag.has_key('class') and tag['class'].find('bloglist-fecha') != -1) + for entry in blogs_soup.findAll('li'): + article_title = entry.p + article_url = article_title.a['href'] + '?print=yes' + article_date = article_title.nextSibling + article_title = self.tag_to_string(article_title) + article_date = self.tag_to_string(article_date).replace(u'Last Updated: ', '') + article_desc = self.tag_to_string(entry.find('h4')) + + #print 'Article:',article_title, article_date,article_url + articles.append({'title' : article_title, + 'date' : article_date, + 'description' : article_desc, + 'url' : article_url}) + + return articles + + def boca_parse(self, soup): + articles = [] + for entry in soup.findAll(lambda tag: tag.name == 'div' and tag.has_key('class') and tag['class'].find('hentry') != -1): #soup.findAll('li', 'hentry'): + article_title= entry.find('h2', 'entry-title') + article_url = article_title.a['href'] + '?print=yes' + article_title= self.tag_to_string(article_title) + article_date = entry.find('span', 'entry-date') + article_date = self.tag_to_string(article_date) + article_desc = self.tag_to_string(entry.find('div', 'entry-content')) + + #print 'Article:',article_title, article_date,article_url + #print entry['class'] + + articles.append({'title' : article_title, + 'date' : article_date, + 'description' : article_desc, + 'url' : article_url}) + # Avoid including the multimedia stuff. + if entry['class'].find('last') != -1: + break + + return articles + + + + + def parse_index(self): + gobierno_soup = self.index_to_soup(self.INDEX+'gobierno/') + congreso_soup = self.index_to_soup(self.INDEX+'congreso/') + seguridad_soup = self.index_to_soup(self.INDEX+'seguridad/') + comunidad_soup = self.index_to_soup(self.INDEX+'comunidad/') + plumaje_soup = self.index_to_soup(self.INDEX+'plumaje/') + la_boca_del_lobo_soup = self.index_to_soup(self.INDEX+'category/la-boca-del-lobo/') + + gobierno_articles = self.generic_parse(gobierno_soup) + congreso_articles = self.generic_parse(congreso_soup) + seguridad_articles = self.generic_parse(seguridad_soup) + comunidad_articles = self.generic_parse(comunidad_soup) + plumaje_articles = self.plumaje_parse(plumaje_soup) + la_boca_del_lobo_articles = self.boca_parse(la_boca_del_lobo_soup) + + + return [ (u'Gobierno', gobierno_articles), (u'Congreso', congreso_articles), (u'Seguridad', seguridad_articles), + (u'Comunidad', comunidad_articles), (u'Plumaje', plumaje_articles), (u'La Boca del Lobo', la_boca_del_lobo_articles), ]