import re
from datetime import date, timedelta

from calibre.web.feeds.news import BasicNewsRecipe


class HBR(BasicNewsRecipe):
    """Recipe that downloads an issue of the Harvard Business Review.

    The table of contents is fetched from ``INDEX`` plus a ``yymm`` issue
    suffix (see ``hbr_get_toc``), split into sections and articles (see
    ``hbr_parse_toc``) and returned from ``parse_index``.  Logging in is
    done through ``javascript_login``.
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman'
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True
    # recipe_disabled = ('hbr.org has started requiring the use of javascript'
    #         ' to log into their website. This is unsupported in calibre, so'
    #         ' this recipe has been disabled. If you would like to see '
    #         ' HBR supported in calibre, contact hbr.org and ask them'
    #         ' to provide a javascript free login method.')

    LOGIN_URL = 'https://hbr.org/login?request_url=/'
    LOGOUT_URL = 'https://hbr.org/logout?request_url=/'

    # Base URL of an issue's table of contents; hbr_get_toc() appends the
    # issue's two-digit year and month ('%y%m').
    INDEX = 'http://hbr.org/archive-toc/BR'

    # Only the main page container is kept ...
    keep_only_tags = [dict(name='div', id='pageContainer')]

    # ... and, within it, these elements (by id) and all iframes are removed.
    remove_tags = [
        dict(id=[
            'mastheadContainer', 'magazineHeadline', 'articleToolbarTopRD',
            'pageRightSubColumn', 'pageRightColumn', 'todayOnHBRListWidget',
            'mostWidget', 'keepUpWithHBR', 'mailingListTout', 'partnerCenter',
            'pageFooter', 'superNavHeadContainer', 'hbrDisqus',
            'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD',
        ]),
        dict(name='iframe'),
    ]

    extra_css = '''
                a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
                .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
                h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
                h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; }
                #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
                #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
                '''

    use_javascript_to_login = True

    def javascript_login(self, br, username, password):
        """Fill in and submit the hbr.org sign-in form.

        :param br: browser object used to load the page and drive the form.
        :param username: value written to the ``signin-form:username`` field.
        :param password: value written to the ``signin-form:password`` field.
        """
        from calibre.web.jsbrowser.browser import Timeout
        try:
            br.visit(self.LOGIN_URL, timeout=20)
        except Timeout:
            # Deliberately ignored: the form is used even if the page load
            # did not finish within the timeout (presumably the page keeps
            # loading secondary resources -- TODO confirm).
            pass
        br.click('#accordion div[tabindex="0"]', wait_for_load=False)
        f = br.select_form('#signin-form')
        f['signin-form:username'] = username
        f['signin-form:password'] = password
        br.submit(wait_for_load=False)
        # Neither the click nor the submit waits for a page load, so give
        # the browser some time to run after submitting.
        br.run_for_a_time(30)

    def map_url(self, url):
        """Return the URL that should be downloaded for an article.

        A URL ending in ``/ar/1`` is rewritten to end in ``/ar/pr``
        (presumably the single-page print view -- TODO confirm).  Any other
        URL is returned unchanged; previously such URLs were mapped to
        ``None``, which left the article without a usable address.
        """
        if url.endswith('/ar/1'):
            return url[:-1] + 'pr'
        return url

    def hbr_get_toc(self):
        """Return the parsed table-of-contents page of the newest issue found.

        The issues dated 30 days ahead, today and 30 days ago are tried in
        that order; the first page that contains none of the known error
        markers is returned.

        :raises Exception: if none of the three candidates is a usable page.
        """
        today = date.today()
        future = today + timedelta(days=30)
        past = today - timedelta(days=30)
        for when in (future, today, past):
            soup = self.index_to_soup(self.INDEX + when.strftime('%y%m'))
            if soup.find(text='Issue Not Found'):
                continue
            if soup.find(
                    text="We're Sorry. There was an error processing your request"):
                continue
            # type(u'') is `unicode` on Python 2 and `str` on Python 3, so
            # the page is converted to text without relying on the
            # Python 2-only `unicode` builtin.
            if 'Exception: java.io.FileNotFoundException' in type(u'')(soup):
                continue
            return soup
        raise Exception('Could not find current issue')

    def hbr_parse_toc(self, soup):
        """Split a table-of-contents page into sections and their articles.

        Walks the ``li`` and ``h4`` elements inside the element with id
        ``issueFeaturesContent``.  An ``h4`` (other than one whose class is
        ``basic``) starts a new section; an ``li`` containing a link whose
        href includes ``/ar/`` contributes an article to the current
        section.

        :param soup: parsed table-of-contents page.
        :return: list of ``(section_title, articles)`` tuples, where each
            article is a dict with the keys ``title``, ``url``,
            ``description`` and ``date``.  Sections without articles, and
            articles listed before the first section heading, are omitted.
        """
        feeds = []
        current_section = None
        articles = []
        for x in soup.find(id='issueFeaturesContent').findAll(['li', 'h4']):
            if x.name == 'h4':
                css_class = x.get('class', None)
                # Some parser versions return `class` as a list of names
                # rather than a single string; normalise before comparing.
                if isinstance(css_class, (list, tuple)):
                    css_class = ' '.join(css_class)
                if css_class == 'basic':
                    continue
                if current_section is not None and articles:
                    feeds.append((current_section, articles))
                current_section = self.tag_to_string(x).capitalize()
                articles = []
                self.log('\tFound section:', current_section)
            else:
                a = x.find('a', href=True)
                if a is None:
                    continue
                title = self.tag_to_string(a)
                url = a['href']
                if '/ar/' not in url:
                    continue
                if url.startswith('/'):
                    url = 'http://hbr.org' + url
                url = self.map_url(url)
                p = x.find('p', attrs={'class': 'author'})
                desc = ''
                if p is not None:
                    desc = self.tag_to_string(p)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                self.log('\t\t\t', desc)
                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': ''})
        # A section is only flushed when the next heading is seen, so the
        # final section has to be added explicitly once the loop is done.
        if current_section is not None and articles:
            feeds.append((current_section, articles))
        return feeds

    def parse_index(self):
        """Return the feeds of the current issue.

        :return: the ``(section_title, articles)`` list produced by
            ``hbr_parse_toc`` for the page found by ``hbr_get_toc``.
        """
        soup = self.hbr_get_toc()
        return self.hbr_parse_toc(soup)

    def get_cover_url(self):
        """Return the URL of the current issue's cover image, or ``None``.

        Looks on ``http://hbr.org/current`` for an ``img`` that has a
        ``src`` and whose ``alt`` text matches "Current Issue".
        """
        cover_url = None
        index = 'http://hbr.org/current'
        soup = self.index_to_soup(index)
        link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)
        if link_item:
            src = link_item['src']
            if src.startswith(('http://', 'https://')):
                # Already absolute: prefixing the site would corrupt it.
                cover_url = src
            else:
                cover_url = 'http://hbr.org' + src
        return cover_url