__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic '
'''
www.buenosairesherald.com
'''

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class BuenosAiresHerald(BasicNewsRecipe):
    """Recipe that scrapes the Buenos Aires Herald section pages.

    The entries in ``feeds`` are ordinary HTML section pages, so the index
    is built by hand in ``parse_index`` and every article is fetched
    through the URL produced by ``print_version``.
    """

    title = 'Buenos Aires Herald'
    __author__ = 'Darko Miletic'
    description = 'A world of information in a few words'
    publisher = 'Editorial Nefir S.A.'
    category = 'news, politics, Argentina'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en_AR'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    masthead_url = 'http://www.buenosairesherald.com/img/logo.jpg'
    # Site root; prepended to the hrefs found on the section pages.
    INDEX = 'http://www.buenosairesherald.com'
    extra_css = (
        ' body{font-family: Arial,Helvetica,sans-serif }'
        ' img{margin-bottom: 0.4em; display:block}'
        ' h1{font-family: Georgia,serif}'
        ' #fecha{text-align: right; font-size: small} '
    )

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language
    }

    remove_tags = [dict(name=['meta', 'link', 'iframe'])]
    keep_only_tags = [dict(attrs={'class': 'nota_texto p'})]

    # (section title, section page URL) pairs consumed by parse_index().
    feeds = [
        (u'Argentina', u'http://www.buenosairesherald.com/argentina'),
        (u'World', u'http://www.buenosairesherald.com/world'),
        (u'Latin America', u'http://www.buenosairesherald.com/latin-america'),
        (u'Sports', u'http://www.buenosairesherald.com/sports')
    ]

    def print_version(self, url):
        """Return the print-page URL for an article URL.

        The article id is the path segment that follows the last
        ``/article/`` in ``url``. When ``url`` has no such segment, or the
        segment is empty, ``url`` is returned unchanged instead of building
        a print URL with a meaningless id.
        """
        _, marker, tail = url.rpartition('/article/')
        artid = tail.partition('/')[0]
        if not marker or not artid:
            return url
        return 'http://www.buenosairesherald.com/articles/print.aspx?ix=' + artid

    def parse_index(self):
        """Build the article index from the section pages in ``feeds``.

        Returns a list of ``(feed title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``. Teaser blocks that lack an ``<h2>`` heading or a
        link with an ``href`` are skipped.
        """
        # The stamp does not depend on the article, so compute it once.
        date = strftime(self.timefmt)
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(
                0, 'Fetching feed' + ' %s...' % (feedtitle or feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll('div', attrs={'class': 'nota_texto_seccion'}):
                heading = item.h2
                if heading is None:
                    # No headline in this block; nothing to link to.
                    continue
                atag = heading.find('a', href=True)
                if atag is None:
                    continue
                # The headline text serves as both title and description.
                headline = self.tag_to_string(heading)
                articles.append({
                    'title': headline,
                    'date': date,
                    'url': self.INDEX + atag['href'],
                    'description': headline
                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds