From 7a78fb5e9ac8e17cc718000ab9d56afdcd6dd98a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Aug 2011 21:18:29 -0600 Subject: [PATCH] Fix HBR Blogs --- recipes/hbr.recipe | 3 +- recipes/hbr_blogs.recipe | 148 ++++++--------------------------------- 2 files changed, 23 insertions(+), 128 deletions(-) diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe index 30cf54bf8d..214ae14f33 100644 --- a/recipes/hbr.recipe +++ b/recipes/hbr.recipe @@ -36,6 +36,8 @@ class HBR(BasicNewsRecipe): def get_browser(self): br = BasicNewsRecipe.get_browser(self) + self.logout_url = None + #''' br.open(self.LOGIN_URL) br.select_form(name='signin-form') @@ -44,7 +46,6 @@ class HBR(BasicNewsRecipe): raw = br.submit().read() if 'My Account' not in raw: raise Exception('Failed to login, are you sure your username and password are correct?') - self.logout_url = None try: link = br.find_link(text='Sign out') if link: diff --git a/recipes/hbr_blogs.recipe b/recipes/hbr_blogs.recipe index acee567d8d..0ca205ab5c 100644 --- a/recipes/hbr_blogs.recipe +++ b/recipes/hbr_blogs.recipe @@ -11,28 +11,16 @@ class HBR(BasicNewsRecipe): no_stylesheets = True LOGIN_URL = 'http://hbr.org/login?request_url=/' + LOGOUT_URL = 'http://hbr.org/logout?request_url=/' + INDEX = 'http://hbr.org/current' - # - # Blog Stuff - # - - - INCLUDE_BLOGS = True - INCLUDE_ARTICLES = False - - # option-specific settings. - - if INCLUDE_BLOGS == True: - remove_tags_after = dict(id='articleBody') - remove_tags_before = dict(id='pageFeature') - feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')] - oldest_article = 30 - max_articles_per_feed = 100 - use_embedded_content = False - else: - timefmt = ' [%B %Y]' - + remove_tags_after = dict(id='articleBody') + remove_tags_before = dict(id='pageFeature') + feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')] + oldest_article = 30 + max_articles_per_feed = 100 + use_embedded_content = False keep_only_tags = [ dict(name='div', id='pageContainer') ] @@ -41,21 +29,15 @@ class HBR(BasicNewsRecipe): 'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn', 'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR', 'articleToolbarTop','articleToolbarBottom', 'articleToolbarRD', - 'mailingListTout', 'partnerCenter', 'pageFooter']), - dict(name='iframe')] + 'mailingListTout', 'partnerCenter', 'pageFooter', 'shareWidgetTop']), + dict(name=['iframe', 'style'])] - extra_css = ''' - a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; } - .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;} - h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; } - h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; } - #articleBody{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;} - #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} - ''' -#------------------------------------------------------------------------------------------------- def get_browser(self): br = BasicNewsRecipe.get_browser(self) + self.logout_url = None + + #''' br.open(self.LOGIN_URL) br.select_form(name='signin-form') br['signin-form:username'] = self.username @@ -63,11 +45,15 @@ class HBR(BasicNewsRecipe): raw = br.submit().read() if 'My Account' not in raw: raise Exception('Failed to login, are you sure your username and password are correct?') - self.logout_url = None - link = br.find_link(text='Sign out') - if link: - self.logout_url = link.absolute_url + try: + link = br.find_link(text='Sign out') + if link: + self.logout_url = link.absolute_url + except: + self.logout_url = self.LOGOUT_URL + #''' return br + #------------------------------------------------------------------------------------------------- def cleanup(self): if self.logout_url is not None: @@ -76,99 +62,7 @@ class HBR(BasicNewsRecipe): def map_url(self, url): if url.endswith('/ar/1'): return url[:-1]+'pr' -#------------------------------------------------------------------------------------------------- - def hbr_get_toc(self): - soup = self.index_to_soup(self.INDEX) - url = soup.find('a', text=lambda t:'Full Table of Contents' in t).parent.get('href') - return self.index_to_soup('http://hbr.org'+url) - -#------------------------------------------------------------------------------------------------- - - def hbr_parse_section(self, container, feeds): - current_section = None - current_articles = [] - for x in container.findAll(name=['li', 'h3', 'h4']): - if x.name in ['h3', 'h4'] and not x.findAll(True): - if current_section and current_articles: - feeds.append((current_section, current_articles)) - current_section = self.tag_to_string(x) - current_articles = [] - self.log('\tFound section:', current_section) - if x.name == 'li': - a = x.find('a', href=True) - if a is not None: - title = self.tag_to_string(a) - url = a.get('href') - if '/ar/' not in url: - continue - if url.startswith('/'): - url = 'http://hbr.org'+url - url = self.map_url(url) - p = x.find('p') - desc = '' - if p is not None: - desc = self.tag_to_string(p) - if not title or not url: - continue - self.log('\t\tFound article:', title) - self.log('\t\t\t', url) - self.log('\t\t\t', desc) - current_articles.append({'title':title, 'url':url, - 'description':desc, 'date':''}) - if current_section and current_articles: - feeds.append((current_section, current_articles)) - -#------------------------------------------------------------------------------------------------- - - def hbr_parse_toc(self, soup): - feeds = [] - features = soup.find(id='issueFeaturesContent') - self.hbr_parse_section(features, feeds) - departments = soup.find(id='issueDepartments') - self.hbr_parse_section(departments, feeds) - return feeds -#------------------------------------------------------------------------------------------------- - def feed_to_index_append(self, feedObject, masterFeed): - # Loop thru the feed object and build the correct type of article list - for feed in feedObject: - # build the correct structure from the feed object - newArticles = [] - for article in feed.articles: - newArt = { - 'title' : article.title, - 'url' : article.url, - 'date' : article.date, - 'description' : article.text_summary - } - newArticles.append(newArt) - - # Append the earliest/latest dates of the feed to the feed title - startDate, endDate = self.get_feed_dates(feed, '%d-%b') - newFeedTitle = feed.title + ' (' + startDate + ' thru ' + endDate + ')' - - # append the newly-built list object to the index object passed in - # as masterFeed. - masterFeed.append( (newFeedTitle,newArticles) ) - -#------------------------------------------------------------------------------------------------- - def get_feed_dates(self, feedObject, dateMask): - startDate = feedObject.articles[len(feedObject.articles)-1].localtime.strftime(dateMask) - endDate = feedObject.articles[0].localtime.strftime(dateMask) - - return startDate, endDate - -#------------------------------------------------------------------------------------------------- - - def parse_index(self): - if self.INCLUDE_ARTICLES == True: - soup = self.hbr_get_toc() - feeds = self.hbr_parse_toc(soup) - else: - return BasicNewsRecipe.parse_index(self) - - return feeds -#------------------------------------------------------------------------------------------------- def get_cover_url(self): cover_url = None index = 'http://hbr.org/current'