#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal '

from calibre.web.feeds.news import BasicNewsRecipe


class EntrepeneurMagRecipe(BasicNewsRecipe):
    """Recipe that builds an article list from entrepreneur.com's magazine pages.

    The magazine index page is fetched first; the first issue link found
    there is then followed and every ``<h3>`` heading inside the page's
    ``div#latest`` container that carries a link becomes one article.
    """

    __author__ = 'Kovid Goyal'
    language = 'en'
    title = u'Entrepeneur Magazine'
    publisher = u'Entrepreneur Media, Inc'
    category = u'Business'
    description = u'Online magazine on business, small business, management and economy'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True

    keep_only_tags = [
        dict(attrs={'class': ['headline', 'hero topimage']}),
        dict(itemprop='articlebody'),
    ]
    remove_tags = [
        dict(attrs={'class': ['related-content']}),
    ]
    remove_attributes = ['style']

    # Site root; every href scraped from the pages is appended to this.
    INDEX = 'http://www.entrepreneur.com'

    def parse_index(self):
        """Locate the issue page from the magazine index and parse it.

        Returns:
            The feed list produced by :meth:`parse_ent_index` for the first
            issue link found on the magazine index page.

        Raises:
            ValueError: if the index page contains no issue link.
        """
        root = self.index_to_soup(
            self.INDEX + '/magazine/index.html', as_tree=True)
        hrefs = root.xpath('//h2[@class="sectiontitle nb"]/a/@href')
        if not hrefs:
            # Fail loudly here instead of silently returning None, which
            # would only surface later as an unrelated error in the caller.
            raise ValueError(
                'Could not find a link to the magazine issue on the index page')
        # Only the first matching link is used.
        return self.parse_ent_index(self.INDEX + hrefs[0])

    def parse_ent_index(self, url):
        """Parse an issue page into a single-section feed list.

        As a side effect, sets ``self.cover_url`` and ``self.timefmt`` from
        the hero image when that image is present on the page.

        Args:
            url: absolute URL of the issue page to parse.

        Returns:
            ``[('Articles', articles)]`` where each article is a dict with
            the keys ``title``, ``url`` and ``description``.

        Raises:
            ValueError: if the page has no ``div#latest`` container.
        """
        root = self.index_to_soup(url, as_tree=True)

        # The cover image and the date suffix are optional extras: a page
        # without them should still yield its articles.
        covers = root.xpath('//a[@class="hero"]/img[@class="lazy"]')
        if covers:
            img = covers[0]
            self.cover_url = img.get('data-original')
            alt = img.get('alt')
            if alt is not None:
                # The text after the last '-' in the alt attribute is used
                # as the time suffix.
                self.timefmt = ' [%s]' % alt.rpartition('-')[-1].strip()
        else:
            self.log.warn('No cover image found at:', url)

        containers = root.xpath('//div[@id="latest"]')
        if not containers:
            raise ValueError('Could not find the article list at: %s' % url)

        articles = []
        for heading in containers[0].xpath('descendant::h3'):
            links = heading.xpath('./a')
            if not links:
                # Headings without a direct <a> child are not articles.
                continue
            href = links[0].get('href')
            if not href:
                # An anchor without an href cannot be turned into a URL.
                continue
            title = self.tag_to_string(heading)
            article_url = self.INDEX + href
            # The element following the heading, when there is one,
            # supplies the description text.
            sibling = heading.getnext()
            desc = self.tag_to_string(sibling) if sibling is not None else ''
            self.log('\t', title, 'at:', article_url)
            self.log('\t\t', desc)
            articles.append(
                {'title': title, 'url': article_url, 'description': desc})
        return [('Articles', articles)]