from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict


class Fortune(BasicNewsRecipe):

    title = 'Fortune Magazine'
    __author__ = 'Rick Shang'
    description = 'FORTUNE is a global business magazine that has been revered in its content and credibility since 1930. FORTUNE covers the entire field of business, including specific companies and business trends, prominent business leaders, and new ideas shaping the global marketplace.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'id': ['storycontent']})]
    remove_tags = [dict(attrs={'class': ['hed_side', 'socialMediaToolbarContainer']})]
    no_javascript = True
    no_stylesheets = True
    needs_subscription = True

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # Log in through the paywall form found on a subscriber-only article page
        br.open('http://money.cnn.com/2013/03/21/smallbusiness/legal-marijuana-startups.pr.fortune/index.html')
        br.select_form(name='paywall-form')
        br['email'] = self.username
        br['password'] = self.password
        br.submit()
        return br

    def parse_index(self):
        articles = []
        soup0 = self.index_to_soup('http://money.cnn.com/magazines/fortune/')
        # Go to the latest issue
        soup = self.index_to_soup(soup0.find('div', attrs={'class': 'latestissue'}).find('a', href=True)['href'])

        # Find the cover image and the issue date
        cover_item = soup.find('div', attrs={'id': 'cover-story'})
        cover = cover_item.find('img', src=True)
        self.cover_url = cover['src']
        date = self.tag_to_string(cover_item.find('div', attrs={'class': 'tocDate'})).strip()
        self.timefmt = u' [%s]' % date

        feeds = OrderedDict()
        section_title = ''

        # Check out the cover story
        articles = []
        coverstory = soup.find('div', attrs={'class': 'cnnHeadline'})
        title = self.tag_to_string(coverstory.a).strip()
        url = coverstory.a['href']
        desc = self.tag_to_string(coverstory.findNext('p', attrs={'class': 'cnnBlurbTxt'}))
        articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})
        feeds['Cover Story'] = []
        feeds['Cover Story'] += articles

        # Walk each section header and collect the articles listed under it
        for post in soup.findAll('div', attrs={'class': 'cnnheader'}):
            section_title = self.tag_to_string(post).strip()
            articles = []

            ul = post.findNext('ul')
            for link in ul.findAll('li'):
                links = link.find('h2')
                title = self.tag_to_string(links.a).strip()
                url = links.a['href']
                desc = self.tag_to_string(link.find('p', attrs={'class': 'cnnBlurbTxt'}))
                articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})

            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # Use items() rather than the Python 2-only iteritems()
        ans = [(key, val) for key, val in feeds.items()]
        return ans
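
# A possible way to exercise this recipe from the command line (a sketch, assuming a
# standard calibre install; the file names and credentials below are placeholders):
#
#   ebook-convert fortune.recipe fortune.epub --username you@example.com --password secret
#
# The --username/--password options are passed through to needs_subscription recipes,
# so get_browser() above receives them as self.username and self.password.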