import re

from calibre.web.feeds.news import BasicNewsRecipe


class HBR(BasicNewsRecipe):
    """Calibre recipe for the Harvard Business Review blogs.

    Articles come from the RSS feed listed in ``feeds``; the cover image is
    scraped from the page at ``INDEX``. No login is performed
    (``needs_subscription`` is False), so ``logout_url`` is always ``None``
    and ``cleanup`` has nothing to do.
    """

    title = 'Harvard Business Review Blogs'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    __author__ = 'Kovid Goyal'
    language = 'en'
    no_stylesheets = True

    # Kept for reference: the message used when the recipe was disabled
    # because the site's login started requiring JavaScript.
    #recipe_disabled = ('hbr.org has started requiring the use of javascript'
    #        ' to log into their website. This is unsupported in calibre, so'
    #        ' this recipe has been disabled. If you would like to see '
    #        ' HBR supported in calibre, contact hbr.org and ask them'
    #        ' to provide a javascript free login method.')
    needs_subscription = False

    # LOGIN_URL / LOGOUT_URL are not used by any method in this class while
    # login is skipped; they are retained as part of the class's attributes.
    LOGIN_URL = 'http://hbr.org/login?request_url=/'
    LOGOUT_URL = 'http://hbr.org/logout?request_url=/'

    # Page that get_cover_url() scrapes for the cover image.
    INDEX = 'http://hbr.org/current'

    remove_tags_after = dict(id='articleBody')
    remove_tags_before = dict(id='pageFeature')
    feeds = [('Blog', 'http://feeds.harvardbusiness.org/harvardbusiness')]
    oldest_article = 30
    max_articles_per_feed = 100
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', id='pageContainer'),
    ]

    remove_tags = [
        dict(id=[
            'mastheadContainer', 'magazineHeadline',
            'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
            'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
            'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD',
            'mailingListTout', 'partnerCenter', 'pageFooter',
            'shareWidgetTop',
        ]),
        dict(name=['iframe', 'style']),
    ]

    def get_browser(self):
        """Return the base recipe's browser without logging in.

        ``self.logout_url`` is reset to ``None`` so that ``cleanup`` does not
        attempt a logout request. The former form-based login sequence sat
        after an unconditional ``return`` and could never run, so it has
        been removed.
        """
        br = BasicNewsRecipe.get_browser(self)
        self.logout_url = None
        return br

    def cleanup(self):
        """Open the logout URL, if one was recorded by ``get_browser``."""
        if self.logout_url is not None:
            self.browser.open(self.logout_url)

    def map_url(self, url):
        """Rewrite a URL ending in ``/ar/1`` so that it ends in ``/ar/pr``.

        Returns ``None`` for any URL that does not end in ``/ar/1``.
        """
        if url.endswith('/ar/1'):
            # Drop the trailing '1' and append 'pr'.
            return url[:-1] + 'pr'
        return None

    def get_cover_url(self):
        """Return the cover image URL found on the ``INDEX`` page.

        Looks for the first ``<img>`` that has a ``src`` attribute and whose
        ``alt`` text matches "Current Issue", and prefixes its ``src`` with
        ``http://hbr.org``. Returns ``None`` when no such image is found.
        """
        soup = self.index_to_soup(self.INDEX)
        link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)
        if not link_item:
            return None
        return 'http://hbr.org' + link_item['src']