diff --git a/resources/recipes/hbr_blogs.recipe b/resources/recipes/hbr_blogs.recipe new file mode 100644 index 0000000000..b15203fce1 --- /dev/null +++ b/resources/recipes/hbr_blogs.recipe @@ -0,0 +1,197 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +# Needed for BLOGs +from calibre.web.feeds import Feed + +class HBR(BasicNewsRecipe): + + title = 'Harvard Business Review Blogs' + description = 'To subscribe go to http://hbr.harvardbusiness.org' + needs_subscription = True + __author__ = 'Kovid Goyal and Sujata Raman, enhanced by BrianG' + language = 'en' + no_stylesheets = True + + LOGIN_URL = 'http://hbr.org/login?request_url=/' + INDEX = 'http://hbr.org/current' + + # + # Blog Stuff + # + + + INCLUDE_BLOGS = True + INCLUDE_ARTICLES = False + + # option-specific settings. + + if INCLUDE_BLOGS == True: + remove_tags_after = dict(id='articleBody') + remove_tags_before = dict(id='pageFeature') + feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')] + oldest_article = 30 + max_articles_per_feed = 100 + else: + timefmt = ' [%B %Y]' + + + keep_only_tags = [ dict(name='div', id='pageContainer') + ] + + remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline', + 'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn', + 'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR', + 'articleToolbarTop','articleToolbarBottom', 'articleToolbarRD', + 'mailingListTout', 'partnerCenter', 'pageFooter']), + dict(name='iframe')] + + extra_css = ''' + a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; } + .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;} + h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; } + h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; } + #articleBody{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;} + #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} + ''' +#------------------------------------------------------------------------------------------------- + + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + br.open(self.LOGIN_URL) + br.select_form(name='signInForm') + br['signInForm:username'] = self.username + br['signInForm:password'] = self.password + raw = br.submit().read() + if 'My Account' not in raw: + raise Exception('Failed to login, are you sure your username and password are correct?') + self.logout_url = None + link = br.find_link(text='Sign out') + if link: + self.logout_url = link.absolute_url + return br +#------------------------------------------------------------------------------------------------- + def cleanup(self): + if self.logout_url is not None: + self.browser.open(self.logout_url) +#------------------------------------------------------------------------------------------------- + def map_url(self, url): + if url.endswith('/ar/1'): + return url[:-1]+'pr' +#------------------------------------------------------------------------------------------------- + + def hbr_get_toc(self): + soup = self.index_to_soup(self.INDEX) + url = soup.find('a', text=lambda t:'Full Table of Contents' in t).parent.get('href') + return self.index_to_soup('http://hbr.org'+url) + +#------------------------------------------------------------------------------------------------- + + def hbr_parse_section(self, container, feeds): + current_section = None + current_articles = [] + for x in container.findAll(name=['li', 'h3', 'h4']): + if x.name in ['h3', 'h4'] and not x.findAll(True): + if current_section and current_articles: + feeds.append((current_section, current_articles)) + current_section = self.tag_to_string(x) + current_articles = [] + self.log('\tFound section:', current_section) + if x.name == 'li': + a = x.find('a', href=True) + if a is not None: + title = self.tag_to_string(a) + url = a.get('href') + if '/ar/' not in url: + continue + if url.startswith('/'): + url = 'http://hbr.org'+url + url = self.map_url(url) + p = x.find('p') + desc = '' + if p is not None: + desc = self.tag_to_string(p) + if not title or not url: + continue + self.log('\t\tFound article:', title) + self.log('\t\t\t', url) + self.log('\t\t\t', desc) + current_articles.append({'title':title, 'url':url, + 'description':desc, 'date':''}) + if current_section and current_articles: + feeds.append((current_section, current_articles)) + +#------------------------------------------------------------------------------------------------- + + def hbr_parse_toc(self, soup): + feeds = [] + features = soup.find(id='issueFeaturesContent') + self.hbr_parse_section(features, feeds) + departments = soup.find(id='issueDepartments') + self.hbr_parse_section(departments, feeds) + return feeds +#------------------------------------------------------------------------------------------------- + def feed_to_index_append(self, feedObject, masterFeed): + # Loop thru the feed object and build the correct type of article list + for feed in feedObject: + # build the correct structure from the feed object + newArticles = [] + for article in feed.articles: + newArt = { + 'title' : article.title, + 'url' : article.url, + 'date' : article.date, + 'description' : article.text_summary + } + newArticles.append(newArt) + + # Append the earliest/latest dates of the feed to the feed title + startDate, endDate = self.get_feed_dates(feed, '%d-%b') + newFeedTitle = feed.title + ' (' + startDate + ' thru ' + endDate + ')' + + # append the newly-built list object to the index object passed in + # as masterFeed. + masterFeed.append( (newFeedTitle,newArticles) ) + +#------------------------------------------------------------------------------------------------- + def get_feed_dates(self, feedObject, dateMask): + startDate = feedObject.articles[len(feedObject.articles)-1].localtime.strftime(dateMask) + endDate = feedObject.articles[0].localtime.strftime(dateMask) + + return startDate, endDate + +#------------------------------------------------------------------------------------------------- + def hbr_parse_blogs(self, feeds): + # Do the "official" parse_feeds first + rssFeeds = Feed() + + # Use the PARSE_FEEDS method to get a Feeds object of the articles + rssFeeds = BasicNewsRecipe.parse_feeds(self) + + # Create a new feed of the right configuration and append to existing afeeds + self.feed_to_index_append(rssFeeds[:], feeds) + +#------------------------------------------------------------------------------------------------- + def parse_index(self): + if self.INCLUDE_ARTICLES == True: + soup = self.hbr_get_toc() + feeds = self.hbr_parse_toc(soup) + else: + feeds = [] + + # blog stuff + if self.INCLUDE_BLOGS == True: + self.hbr_parse_blogs(feeds) + + return feeds +#------------------------------------------------------------------------------------------------- + def get_cover_url(self): + cover_url = None + index = 'http://hbr.org/current' + soup = self.index_to_soup(index) + link_item = soup.find('img', alt=re.compile("Current Issue"), src=True) + + if link_item: + cover_url = 'http://hbr.org' + link_item['src'] + + return cover_url