from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select


class HBR(BasicNewsRecipe):
    """Recipe that fetches the current Harvard Business Review issue.

    The recipe signs in through the site's JavaScript sign-in form, locates
    the current issue from the magazine landing page and builds the feed
    list from that issue's table of contents.
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman'
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True

    # Prefix for the site-relative links found in the scraped pages.
    BASE_URL = 'https://hbr.org'
    LOGIN_URL = 'https://hbr.org/login?request_url=/'
    LOGOUT_URL = 'https://hbr.org/logout?request_url=/'

    keep_only_tags = [
        dict(attrs={'class': ['article-hed', 'byline']}),
        # Matches any element whose class list contains the token "article".
        dict(attrs={'class': lambda x: x and 'article' in x.split()}),
    ]
    remove_tags = [
        dict(name='personalization-placement'),
    ]
    use_javascript_to_login = True

    def javascript_login(self, br, username, password):
        """Fill in and submit the sign-in form using the browser *br*.

        The browser is run for a fixed 15 seconds after loading the page and
        again after submitting, instead of waiting for a page-load event
        (the submit is issued with ``wait_for_load=False``).
        """
        br.visit('https://hbr.org/sign-in')
        br.run_for_a_time(15)
        f = br.select_form('sign-in form')
        f['login-email'] = username
        f['login-password'] = password
        br.submit('[js-target="submit-sign-in"]', wait_for_load=False)
        br.run_for_a_time(15)

    def hbr_parse_toc(self, url):
        """Parse the issue page at *url* into a list of feeds.

        Returns a list of ``(section_title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``url`` and
        ``description``. Articles that appear before the first ``h4``
        heading are filed under the section ``'Unknown'``.

        Raises ``ValueError`` if the table of contents cannot be found.
        """
        root = self.index_to_soup(url, as_tree=True)
        select = Select(root)
        # Use a default so a missing TOC yields a clear error rather than a
        # bare StopIteration escaping from next().
        toc = next(
            select('stream-content[data-stream-name="table-of-contents"] ul'),
            None)
        if toc is None:
            raise ValueError(
                'Could not find the table of contents at: %s' % url)

        section = 'Unknown'
        articles = []
        feeds = []
        for x in toc.xpath(
                'descendant::*[local-name()="h4" or local-name()="stream-item"]'):
            if 'h4' in x.tag:
                # A heading starts a new section; flush the previous one.
                if articles:
                    feeds.append((section, articles))
                section = self.tag_to_string(x)
                articles = []
                self.log('Found section:', section)
                continue

            title, href = x.get('data-title'), x.get('data-url')
            if not href:
                # Without a link there is nothing to download; skip the item
                # instead of failing on the URL concatenation below.
                self.log('\tSkipping article without a URL:', title)
                continue
            # The description is the text trailing each child element.
            desc = ''.join(c.tail or '' for c in x).strip()
            authors = x.get('data-authors')
            if authors:
                desc = 'by ' + authors + ': ' + desc
            self.log('\tFound article:', title, href, desc)
            articles.append({
                'title': title,
                'url': self.BASE_URL + href,
                'description': desc,
            })

        if articles:
            feeds.append((section, articles))
        return feeds

    def parse_index(self):
        """Locate the current issue and return its feeds.

        Sets ``self.cover_url`` from the current-issue image and replaces
        ``self.timefmt`` with that image's ``alt`` text (presumably the
        issue name/date -- confirm against the live page).

        Raises ``ValueError`` if the current-issue figure cannot be found.
        """
        soup = self.index_to_soup(self.BASE_URL + '/magazine')
        fig = soup.find(
            'figure',
            attrs={'class': lambda x: x and 'current-issue' in x.split()})
        if fig is None:
            raise ValueError(
                'Could not find the current issue on the magazine page')
        url = self.BASE_URL + fig.find('a', href=True)['href']
        img = fig.find('img')
        self.cover_url = self.BASE_URL + img['src']
        self.timefmt = img['alt']
        return self.hbr_parse_toc(url)