import re

from calibre.web.feeds.news import BasicNewsRecipe


class HBR(BasicNewsRecipe):
    """Recipe that downloads the current Harvard Business Review issue.

    The recipe logs in with the user's subscription credentials, reads the
    issue's full table of contents from ``INDEX`` and builds one feed per
    section heading found there.
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman'
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True

    LOGIN_URL = 'http://hbr.org/login?request_url=/'
    INDEX = 'http://hbr.org/current'

    # Set by get_browser() after a successful login.  Defined at class level
    # so cleanup() is safe even when get_browser() never ran or failed early.
    logout_url = None

    keep_only_tags = [dict(name='div', id='pageContainer')]
    remove_tags = [
        dict(id=[
            'mastheadContainer', 'magazineHeadline', 'articleToolbarTopRD',
            'pageRightSubColumn', 'pageRightColumn', 'todayOnHBRListWidget',
            'mostWidget', 'keepUpWithHBR', 'mailingListTout', 'partnerCenter',
            'pageFooter', 'articleToolbarTop', 'articleToolbarBottom',
            'articleToolbarRD',
        ]),
        dict(name='iframe'),
    ]

    # Adjacent literals are concatenated at compile time; the resulting value
    # is a single-line stylesheet with one leading and one trailing space.
    extra_css = (
        ' a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; } '
        '.article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;} '
        'h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; } '
        'h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; } '
        '#articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;} '
        '#summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} '
    )

    def get_browser(self):
        """Return a browser that has submitted the hbr.org sign-in form.

        Also records the "Sign out" link (if one is present on the page
        returned after login) in ``self.logout_url`` for use by cleanup().

        Raises:
            Exception: if the post-login page does not contain the text
                'My Account', which is taken to mean the login failed.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.LOGIN_URL)
        br.select_form(name='signin-form')
        br['signin-form:username'] = self.username
        br['signin-form:password'] = self.password
        raw = br.submit().read()
        # The response body may be bytes; decode so the substring test below
        # compares text with text instead of raising TypeError.
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8', 'replace')
        if 'My Account' not in raw:
            raise Exception('Failed to login, are you sure your username and password are correct?')

        self.logout_url = None
        try:
            link = br.find_link(text='Sign out')
        except Exception:
            # find_link signals "no such link" by raising rather than by
            # returning None.  Logging out is best-effort, so a missing link
            # must not turn a successful login into a failure.
            link = None
        if link:
            self.logout_url = link.absolute_url
        return br

    def cleanup(self):
        """Open the recorded sign-out URL, if a login recorded one."""
        if self.logout_url is not None:
            self.browser.open(self.logout_url)

    def map_url(self, url):
        """Map an article URL ending in '/ar/1' to its '/ar/pr' variant.

        Returns None for any other URL; hbr_parse_section() skips articles
        whose mapped URL is empty, so this doubles as a filter.
        """
        if url.endswith('/ar/1'):
            return url[:-1] + 'pr'
        return None

    def hbr_get_toc(self):
        """Fetch and return the soup of the issue's full table of contents.

        Raises:
            Exception: if the 'Full Table of Contents' link cannot be located
                on the INDEX page.
        """
        soup = self.index_to_soup(self.INDEX)
        match = soup.find(
            'a', text=lambda t: bool(t) and 'Full Table of Contents' in t)
        # NOTE(review): the href is read from the *parent* of the match, i.e.
        # the match is expected to be the link's text node - confirm against
        # the soup implementation in use.
        url = match.parent.get('href') if match is not None else None
        if not url:
            raise Exception(
                'Could not find the "Full Table of Contents" link on %s'
                % self.INDEX)
        return self.index_to_soup('http://hbr.org' + url)

    def hbr_parse_section(self, container, feeds):
        """Append (section title, articles) tuples found in *container* to *feeds*.

        Walks the li/h3/h4 descendants of *container* in document order.  An
        h3/h4 without child tags starts a new section; each li holding a link
        whose href contains '/ar/' contributes an article to the current one.
        Sections with no articles, and articles seen before any section
        heading, are not added to *feeds*.

        Args:
            container: soup element to scan; None is tolerated and ignored.
            feeds: list that is extended in place.
        """
        if container is None:
            # The expected element is missing from the page; skip it instead
            # of failing with AttributeError on None.
            self.log('\tSection container not found, skipping')
            return

        section = None
        articles = []
        for node in container.findAll(name=['li', 'h3', 'h4']):
            if node.name in ['h3', 'h4']:
                if node.findAll(True):
                    # Headings that wrap other tags are not section titles.
                    continue
                if section and articles:
                    feeds.append((section, articles))
                section = self.tag_to_string(node)
                articles = []
                self.log('\tFound section:', section)
                continue

            # Remaining nodes are <li> elements.
            anchor = node.find('a', href=True)
            if anchor is None:
                continue
            title = self.tag_to_string(anchor)
            url = anchor.get('href')
            if '/ar/' not in url:
                continue
            if url.startswith('/'):
                url = 'http://hbr.org' + url
            url = self.map_url(url)
            if not title or not url:
                continue

            summary = node.find('p')
            desc = self.tag_to_string(summary) if summary is not None else ''

            self.log('\t\tFound article:', title)
            self.log('\t\t\t', url)
            self.log('\t\t\t', desc)
            articles.append({
                'title': title,
                'url': url,
                'description': desc,
                'date': '',
            })

        # Flush the section that was still open when the loop ended.
        if section and articles:
            feeds.append((section, articles))

    def hbr_parse_toc(self, soup):
        """Return the feeds parsed from the features and departments blocks."""
        feeds = []
        for container_id in ('issueFeaturesContent', 'issueDepartments'):
            self.hbr_parse_section(soup.find(id=container_id), feeds)
        return feeds

    def parse_index(self):
        """Return the list of (section title, article dicts) for the issue."""
        return self.hbr_parse_toc(self.hbr_get_toc())

    def get_cover_url(self):
        """Return the cover image URL from the INDEX page, or None.

        The cover is the first <img> with a src attribute whose alt text
        matches 'Current Issue'.
        """
        soup = self.index_to_soup(self.INDEX)
        img = soup.find('img', alt=re.compile("Current Issue"), src=True)
        if img is None:
            return None
        src = img['src']
        # Only prefix the host when the src is not already an absolute URL.
        if src.startswith('http://') or src.startswith('https://'):
            return src
        return 'http://hbr.org' + src