From 05bc40b53fd143c0a03895033de1987eaf08374e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 18 Nov 2009 07:14:15 -0700 Subject: [PATCH] IGN:Various improved recipes --- resources/recipes/hbr.recipe | 18 ++++++------ resources/recipes/kellog_insight.recipe | 27 ++++++++++++++---- resources/recipes/science_news.recipe | 38 +++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/resources/recipes/hbr.recipe b/resources/recipes/hbr.recipe index 7c91837529..f90db55539 100644 --- a/resources/recipes/hbr.recipe +++ b/resources/recipes/hbr.recipe @@ -6,10 +6,10 @@ class HBR(BasicNewsRecipe): title = 'Harvard Business Review' description = 'To subscribe go to http://hbr.harvardbusiness.org' needs_subscription = True - __author__ = 'Kovid Goyal' + __author__ = 'Kovid Goyal and Sujata Raman' timefmt = ' [%B %Y]' - no_stylesheets = True - + no_stylesheets = True + LOGIN_URL = 'http://hbr.harvardbusiness.org/login?request_url=/' INDEX = 'http://hbr.harvardbusiness.org/current' @@ -20,14 +20,14 @@ class HBR(BasicNewsRecipe): 'contentRight', 'summaryLink']), dict(name='form'), ] - + extra_css = ''' a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; } .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;} h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; } h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; } #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;} - #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} + #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} ''' def get_browser(self): @@ -100,10 +100,10 @@ class HBR(BasicNewsRecipe): index = 'http://hbr.harvardbusiness.org/current' soup = self.index_to_soup(index) link_item = soup.find('img', alt=re.compile("HBR Cover Image"), src=True) - + if link_item: cover_url = 'http://hbr.harvardbusiness.org' + link_item['src'] - + return cover_url - - + + diff --git a/resources/recipes/kellog_insight.recipe b/resources/recipes/kellog_insight.recipe index 7a97e4cdd6..db5c7265b9 100644 --- a/resources/recipes/kellog_insight.recipe +++ b/resources/recipes/kellog_insight.recipe @@ -12,20 +12,29 @@ from calibre.web.feeds.news import BasicNewsRecipe class KellogInsight(BasicNewsRecipe): title = 'Kellog Insight' - __author__ = 'Kovid Goyal' + __author__ = 'Kovid Goyal and Sujata Raman' description = 'Articles from the Kellog School of Management' no_stylesheets = True encoding = 'utf-8' language = 'en' oldest_article = 60 - remove_tags_before = {'name':'h1'} - remove_tags_after = {'class':'col-two-text'} + keep_only_tags = [dict(name='div', attrs={'id':['print_no_comments']})] + remove_tags = [dict(name='div', attrs={'class':'col-three'})] - feeds = [('Articles', - 'http://insight.kellogg.northwestern.edu/index.php/Kellogg/RSS')] + extra_css = ''' + h1{font-family:arial; font-size:medium; color:#333333;} + .col-one{font-family:arial; font-size:xx-small;} + .col-two{font-family:arial; font-size:x-small; } + h2{font-family:arial; font-size:small; color:#666666;} + h3{font-family:arial; font-size:small; color:#333333;text-transform: uppercase; font-weight:normal;} + h4{color:#660000;font-family:arial; font-size:x-small;} + .col-two-text{font-family:arial; font-size:x-small; color:#333333;} + ''' + + feeds = [('Articles', 'http://insight.kellogg.northwestern.edu/index.php/Kellogg/RSS')] def get_article_url(self, article): # Get only article not blog links @@ -34,3 +43,11 @@ class KellogInsight(BasicNewsRecipe): return link self.log('Skipping non-article', link) return None + + def preprocess_html(self, soup): + + for tag in soup.findAll(name=['span']): + tag.nextSibling.name = 'h4' + + return soup + diff --git a/resources/recipes/science_news.recipe b/resources/recipes/science_news.recipe index b5867ef837..b1862e9112 100644 --- a/resources/recipes/science_news.recipe +++ b/resources/recipes/science_news.recipe @@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Sciencenews(BasicNewsRecipe): title = u'ScienceNews' - __author__ = u'Darko Miletic' + __author__ = u'Darko Miletic and Sujata Raman' description = u"Science News is an award-winning weekly newsmagazine covering the most important research in all fields of science. Its 16 pages each week are packed with short, accurate articles that appeal to both general readers and scientists. Published since 1922, the magazine now reaches about 150,000 subscribers and more than 1 million readers. These are the latest News Items from Science News." oldest_article = 30 language = 'en' @@ -17,13 +17,45 @@ class Sciencenews(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - timefmt = ' [%A, %d %B, %Y]' + timefmt = ' [%A, %d %B, %Y]' + + extra_css = ''' + .content_description{font-family:georgia ;font-size:x-large; color:#646464 ; font-weight:bold;} + .content_summary{font-family:georgia ;font-size:small ;color:#585858 ; font-weight:bold;} + .content_authors{font-family:helvetica,arial ;font-size: xx-small ;color:#14487E ;} + .content_edition{font-family:helvetica,arial ;font-size: xx-small ;} + .exclusive{color:#FF0000 ;} + .anonymous{color:#14487E ;} + .content_content{font-family:helvetica,arial ;font-size: x-small ; color:#000000;} + .description{color:#585858;font-family:helvetica,arial ;font-size: xx-small ;} + .credit{color:#A6A6A6;font-family:helvetica,arial ;font-size: xx-small ;} + ''' keep_only_tags = [ dict(name='div', attrs={'id':'column_action'}) ] remove_tags_after = dict(name='ul', attrs={'id':'content_functions_bottom'}) remove_tags = [ dict(name='ul', attrs={'id':'content_functions_bottom'}) - ,dict(name='div', attrs={'id':'content_functions_top'}) + ,dict(name='div', attrs={'id':['content_functions_top','breadcrumb_content']}) + ,dict(name='img', attrs={'class':'icon'}) + ,dict(name='div', attrs={'class': 'embiggen'}) ] feeds = [(u"Science News / News Items", u'http://sciencenews.org/view/feed/type/news/name/news.rss')] + + def get_cover_url(self): + cover_url = None + index = 'http://www.sciencenews.org/view/home' + soup = self.index_to_soup(index) + link_item = soup.find(name = 'img',alt = "issue") + print link_item + if link_item: + cover_url = 'http://www.sciencenews.org' + link_item['src'] + '.jpg' + + return cover_url + + def preprocess_html(self, soup): + + for tag in soup.findAll(name=['span']): + tag.name = 'div' + + return soup