from calibre.web.feeds.news import BasicNewsRecipe
import re
from datetime import date, timedelta


class HBR(BasicNewsRecipe):
    """Recipe that downloads the current Harvard Business Review issue.

    The recipe logs in at ``LOGIN_URL`` with the subscriber's credentials,
    looks up the table of contents of the newest available issue under
    ``INDEX`` and returns its sections and articles from ``parse_index``.
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman'
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True

    recipe_disabled = ('hbr.org has started requiring the use of javascript'
            ' to log into their website. This is unsupported in calibre, so'
            ' this recipe has been disabled. If you would like to see '
            ' HBR supported in calibre, contact hbr.org and ask them'
            ' to provide a javascript free login method.')

    LOGIN_URL = 'https://hbr.org/login?request_url=/'
    LOGOUT_URL = 'https://hbr.org/logout?request_url=/'
    # Prefix of the per-issue table of contents; hbr_get_toc appends a
    # two-digit year and two-digit month ('%y%m') to it.
    INDEX = 'http://hbr.org/archive-toc/BR'

    # Set by get_browser() after a successful login. The class-level default
    # keeps cleanup() safe when get_browser() was never called.
    logout_url = None

    keep_only_tags = [dict(name='div', id='pageContainer')]
    remove_tags = [
        dict(id=[
            'mastheadContainer', 'magazineHeadline', 'articleToolbarTopRD',
            'pageRightSubColumn', 'pageRightColumn', 'todayOnHBRListWidget',
            'mostWidget', 'keepUpWithHBR', 'mailingListTout', 'partnerCenter',
            'pageFooter', 'superNavHeadContainer', 'hbrDisqus',
            'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD',
        ]),
        dict(name='iframe'),
    ]

    # Built from adjacent literals so the resulting string is unchanged.
    extra_css = (
        ' a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }'
        ' .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}'
        ' h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }'
        ' h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; }'
        ' #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}'
        ' #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} '
    )

    def get_browser(self):
        """Return a browser that is logged in to hbr.org.

        Submits ``self.username`` / ``self.password`` through the
        ``signin-form`` form and records the logout link in
        ``self.logout_url`` for ``cleanup``.

        Raises:
            Exception: if the response does not contain a "Sign out" link,
                which is taken to mean the login failed.
        """
        br = BasicNewsRecipe.get_browser(self)
        self.logout_url = None

        #'''
        br.open(self.LOGIN_URL)
        br.select_form(name='signin-form')
        br['signin-form:username'] = self.username
        br['signin-form:password'] = self.password
        raw = br.submit().read()
        # The response may be bytes; decode it so the text marker below can
        # be compared without a TypeError.
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8', 'replace')
        if '>Sign out<' not in raw:
            raise Exception('Failed to login, are you sure your username and password are correct?')
        try:
            link = br.find_link(text='Sign out')
            if link:
                self.logout_url = link.absolute_url
        except Exception:
            # Best effort: fall back to the well-known logout URL when the
            # link cannot be resolved.
            self.logout_url = self.LOGOUT_URL
        #'''
        return br

    def cleanup(self):
        """Log out of hbr.org if ``get_browser`` recorded a logout URL."""
        if self.logout_url is not None:
            self.browser.open(self.logout_url)

    def map_url(self, url):
        """Map an article URL ending in ``/ar/1`` to its ``/ar/pr`` variant.

        URLs that do not end in ``/ar/1`` are returned unchanged (previously
        the method fell through and returned ``None`` for them).
        """
        if url.endswith('/ar/1'):
            return url[:-1] + 'pr'
        return url

    def hbr_get_toc(self):
        """Return the parsed table of contents of the newest available issue.

        Tries the issue dated 30 days from today first, then the issue for
        the current month, and returns the first page that does not contain
        the text "Issue Not Found".

        Raises:
            Exception: if neither candidate issue exists.
        """
        # Debugging aid: load a saved copy instead of hitting the network.
        #return self.index_to_soup(open('/t/hbr.html').read())

        today = date.today()
        future = today + timedelta(days=30)
        for when in (future, today):
            soup = self.index_to_soup(self.INDEX + when.strftime('%y%m'))
            if not soup.find(text='Issue Not Found'):
                return soup
        raise Exception('Could not find current issue')

    def hbr_parse_toc(self, soup):
        """Extract ``(section title, articles)`` tuples from a TOC page.

        Every ``h3`` inside the element with id ``archiveToc`` starts a new
        section; every ``h4`` containing a link whose href includes ``/ar/``
        becomes an article dict with the keys ``title``, ``url``,
        ``description`` and ``date``. Sections without articles are skipped.

        Raises:
            Exception: if the page has no ``archiveToc`` element.
        """
        toc = soup.find(id='archiveToc')
        if toc is None:
            raise Exception('Could not find the table of contents (id="archiveToc")')

        feeds = []
        current_section = None
        articles = []
        for x in toc.findAll(['h3', 'h4']):
            if x.name == 'h3':
                if current_section is not None and articles:
                    feeds.append((current_section, articles))
                current_section = self.tag_to_string(x).capitalize()
                articles = []
                self.log('\tFound section:', current_section)
                continue

            a = x.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            url = a['href']
            if '/ar/' not in url:
                continue
            if url.startswith('/'):
                url = 'http://hbr.org' + url
            url = self.map_url(url)
            p = x.parent.find('p')
            desc = ''
            if p is not None:
                desc = self.tag_to_string(p)
            self.log('\t\tFound article:', title)
            self.log('\t\t\t', url)
            self.log('\t\t\t', desc)
            articles.append({'title':title, 'url':url, 'description':desc,
                'date':''})

        # Flush the final section: inside the loop a section is only stored
        # when the *next* h3 is seen, so the last one would otherwise be lost.
        if current_section is not None and articles:
            feeds.append((current_section, articles))
        return feeds

    def parse_index(self):
        """Return the sections and articles of the current issue."""
        soup = self.hbr_get_toc()
        # Debugging aid: save the fetched TOC for offline inspection.
        #open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8'))
        feeds = self.hbr_parse_toc(soup)
        return feeds

    def get_cover_url(self):
        """Return the URL of the current issue's cover image, or ``None``.

        Looks on ``http://hbr.org/current`` for an ``img`` whose ``alt``
        matches "Current Issue". A ``src`` that is already an absolute
        http(s) URL is returned as is; anything else is prefixed with the
        site root.
        """
        index = 'http://hbr.org/current'
        soup = self.index_to_soup(index)
        link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)
        if link_item is None:
            return None
        src = link_item['src']
        if src.startswith(('http://', 'https://')):
            return src
        return 'http://hbr.org' + src