#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2014, Nikolas Mangold-Takao '
__version__ = '0.10'

from calibre.web.feeds.recipes import BasicNewsRecipe


class IncMagazineRecipe(BasicNewsRecipe):
    """Recipe that builds an 'Inc Magazine' issue from the inc.com magazine index.

    The recipe logs in with the user's inc.com credentials (when supplied),
    collects article links from the left-hand table-of-contents columns of the
    index page and strips hyperlinks from the downloaded article HTML.
    """

    language = 'en'
    version = 1
    __author__ = 'Nikolas Mangold-Takao '

    title = u'Inc Magazine'
    publisher = u'Mansueto Ventures LLC'
    category = u'News, Business'
    description = u'Free account required to browse website. Handbook of the American Entrepeneur'

    # signup is free, without account browsing is limited and recipe won't work
    needs_subscription = True

    use_embedded_content = False
    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    # Page that lists the articles of the current issue.
    INDEX = 'http://www.inc.com/magazine'

    # Markers used to trim each article page down to its content.
    remove_tags_before = dict(name='div', attrs={'id': 'outercontainer'})
    remove_tags = [dict(name='div', attrs={'id': 'morearticles'})]
    remove_tags_after = dict(name='div', attrs={'id': 'article-share-footer'})

    extra_css = ''' div.byline {font-size: x-small; color: #696969; margin-top: 0.4em;} '''

    def get_browser(self):
        """Return a browser that is logged in to inc.com when credentials exist.

        The login form is located by looking for a form that has a control
        named "email"; when either the username or the password is missing the
        plain browser from ``BasicNewsRecipe`` is returned without logging in.
        """

        def has_login_name(form):
            # The lookup raises when the form has no such control (mechanize
            # documents ControlNotFoundError).  Catch Exception rather than
            # using a bare ``except`` so that KeyboardInterrupt and SystemExit
            # still propagate.
            try:
                form.find_control(name="email")
            except Exception:
                return False
            return True

        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.inc.com/login')
            br.select_form(predicate=has_login_name)
            br['email'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Collect the article links of the current issue.

        Returns a list holding a single ``('Articles', articles)`` tuple, where
        ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts in the
        order the links appear on the index page.
        """

        def is_left_column(div_id):
            return div_id and div_id.endswith('-l-column')

        def is_toc_heading(css_class):
            # h1-h2-h3
            return css_class and css_class.startswith('toc-h')

        soup = self.index_to_soup(self.INDEX)
        articles = []

        # We need for sure 'toc-top-l-column' and 'toc-bot-l-column'
        # I am not sure about 'toc-top-r-column' and 'toc-bot-r-column',
        # seems to be mostly ads
        for toc in soup.findAll('div', id=is_left_column):
            for article in toc.findAll('div', {'class': is_toc_heading}):
                link = article.find('a')
                if link is None:
                    continue
                article_link = link.get('href')
                if not article_link:
                    # An anchor without a target cannot be downloaded.
                    continue
                articles.append({
                    'title': self.tag_to_string(link),
                    'url': article_link,
                })

        # TODO figure out sections
        return [('Articles', articles)]

    def preprocess_html(self, soup):
        """Replace text-only ``<a>`` tags with their text and return the soup.

        Anchors whose ``.string`` is ``None`` are left untouched.
        """
        for alink in soup.findAll('a'):
            text = alink.string
            if text is not None:
                alink.replaceWith(text)
        return soup