from calibre.web.feeds.news import BasicNewsRecipe


class HBR(BasicNewsRecipe):
    """Calibre recipe that downloads an issue of Harvard Business Review.

    The recipe logs in through the site's JavaScript login form, follows the
    issue link found on http://hbr.org/magazine and turns that issue's table
    of contents into calibre feeds (one feed per section heading).
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman'
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True

    LOGIN_URL = 'https://hbr.org/login?request_url=/'
    LOGOUT_URL = 'https://hbr.org/logout?request_url=/'
    INDEX = 'http://hbr.org'

    # Only the main page container is kept; the ids below are then removed
    # from inside it, together with every iframe.
    keep_only_tags = [dict(name='div', id='pageContainer')]
    remove_tags = [
        dict(id=[
            'mastheadContainer',
            'magazineHeadline',
            'articleToolbarTopRD',
            'pageRightSubColumn',
            'pageRightColumn',
            'todayOnHBRListWidget',
            'mostWidget',
            'keepUpWithHBR',
            'mailingListTout',
            'partnerCenter',
            'pageFooter',
            'superNavHeadContainer',
            'hbrDisqus',
            'article-toolbox',
            'articleToolbarTop',
            'articleToolbarBottom',
            'articleToolbarRD',
        ]),
        dict(name='iframe'),
    ]

    # Built from adjacent literals purely to keep the lines readable; the
    # resulting string is one run of CSS rules separated by single spaces.
    extra_css = (
        ' a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }'
        ' .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}'
        ' h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }'
        ' h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; }'
        ' #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}'
        ' #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}'
        ' '
    )

    use_javascript_to_login = True

    def javascript_login(self, br, username, password):
        """Fill in and submit the login form using the JavaScript browser.

        :param br: browser object supplied by calibre; must provide ``visit``,
            ``click``, ``select_form``, ``submit`` and ``run_for_a_time``.
        :param username: value written to the form's ``username`` field.
        :param password: value written to the form's ``password`` field.
        """
        from calibre.web.jsbrowser.browser import Timeout
        try:
            br.visit(self.LOGIN_URL, timeout=20)
        except Timeout:
            # Deliberately best-effort: carry on with whatever has loaded
            # after 20 seconds and let the form lookup below decide.
            pass
        br.click('#form-wrapper h3[tabindex="0"]', wait_for_load=False)
        f = br.select_form('#login-form')
        f['username'] = username
        f['password'] = password
        br.submit(wait_for_load=False)
        # Nothing above waits for a page load, so give the browser a fixed
        # amount of time to process the submission.
        br.run_for_a_time(30)

    def map_url(self, url):
        """Return the URL to download for an article link.

        A URL ending in ``/ar/1`` has its trailing ``1`` replaced by ``pr``
        (presumably the single-page print view -- confirm against the site).
        Any other URL is returned unchanged; previously this fell through and
        returned ``None``, which left such articles without a usable URL.
        """
        if url.endswith('/ar/1'):
            return url[:-1] + 'pr'
        return url

    def hbr_parse_toc(self, soup):
        """Build the feed list from an issue's table-of-contents page.

        Walks the ``li`` and ``h4`` tags inside the element with id
        ``issueFeaturesContent`` in document order. Each ``h4`` starts a new
        section; each ``li`` containing an ``/ar/`` link becomes an article
        of the current section.

        :param soup: parsed table-of-contents page.
        :return: list of ``(section_title, articles)`` tuples where each
            article is a dict with ``title``, ``url``, ``description`` and
            ``date`` keys. Sections without articles are omitted.
        """
        feeds = []
        current_section = None
        articles = []
        for x in soup.find(id='issueFeaturesContent').findAll(['li', 'h4']):
            if x.name == 'h4':
                # NOTE(review): this compares the class attribute with a
                # plain string; confirm the soup in use returns a string
                # here rather than a list of classes.
                if x.get('class', None) == 'basic':
                    continue
                # A new heading closes the section collected so far.
                if current_section is not None and articles:
                    feeds.append((current_section, articles))
                current_section = self.tag_to_string(x).capitalize()
                articles = []
                self.log('\tFound section:', current_section)
            else:
                a = x.find('a', href=True)
                if a is None:
                    continue
                title = self.tag_to_string(a)
                url = a['href']
                # Only links containing '/ar/' are treated as articles.
                if '/ar/' not in url:
                    continue
                if url.startswith('/'):
                    url = self.INDEX + url
                url = self.map_url(url)
                p = x.find('p', attrs={'class': 'author'})
                desc = ''
                if p is not None:
                    desc = self.tag_to_string(p)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                self.log('\t\t\t', desc)
                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': ''})

        # Flush the final section, which has no following heading.
        if current_section is not None and articles:
            feeds.append((current_section, articles))
        return feeds

    def parse_index(self):
        """Locate the issue page, record its cover and date, return feeds.

        Sets ``self.cover_url`` and ``self.timefmt`` from the last ``li`` of
        the ``magazineArchiveCarousel`` list on http://hbr.org/magazine, then
        parses the page linked from the ``magazine_page`` div.

        :return: the feed list produced by :meth:`hbr_parse_toc`.
        """
        soup0 = self.index_to_soup('http://hbr.org/magazine')
        # The last carousel entry supplies both the cover image and, via its
        # alt text, the issue date (presumably the newest issue -- confirm).
        datencover = soup0.find(
            'ul', attrs={'id': 'magazineArchiveCarousel'}).findAll('li')[-1]
        self.cover_url = datencover.img['src']
        dates = self.tag_to_string(datencover.img['alt'])
        self.timefmt = u' [%s]' % dates
        soup = self.index_to_soup(
            self.INDEX +
            soup0.find('div', attrs={'class': 'magazine_page'}).a['href'])
        feeds = self.hbr_parse_toc(soup)
        return feeds