diff --git a/resources/recipes/barrons.recipe b/resources/recipes/barrons.recipe
index d1297a5684..e7df57c704 100644
--- a/resources/recipes/barrons.recipe
+++ b/resources/recipes/barrons.recipe
@@ -17,7 +17,7 @@ class Barrons(BasicNewsRecipe):
     needs_subscription = True
     language = 'en'
 
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal and Sujata Raman'
     description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
     timefmt = ' [%a, %b %d, %Y]'
     use_embedded_content = False
@@ -75,14 +75,14 @@ class Barrons(BasicNewsRecipe):
     def print_version(self, url):
         main, sep, rest = url.rpartition('?')
         return main + '#printmode'
-
+
     def postprocess_html(self, soup, first):
         for tag in soup.findAll(name=['ul', 'li']):
             tag.name = 'div'
 
         for tag in soup.findAll(name ='div', attrs={'id': "articleThumbnail_1"}):
            tag.extract()
-
+
         return soup
 
     ## Comment out the feeds you don't want retrieved.
@@ -98,7 +98,7 @@ class Barrons(BasicNewsRecipe):
                   ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
                ]
 
-
+
     def get_cover_url(self):
         cover_url = None
         index = 'http://online.barrons.com/home-page'
@@ -108,7 +108,7 @@ class Barrons(BasicNewsRecipe):
             cover_url = link_item.img['src']
 
         return cover_url
-
+
     ## Logout of website
     ## NOT CURRENTLY WORKING
     # def cleanup(self):
diff --git a/resources/recipes/spiegel_int.recipe b/resources/recipes/spiegel_int.recipe
index ce54342cd0..7af5c8a41e 100644
--- a/resources/recipes/spiegel_int.recipe
+++ b/resources/recipes/spiegel_int.recipe
@@ -8,9 +8,10 @@ spiegel.de
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
+
 class Spiegel_int(BasicNewsRecipe):
     title = 'Spiegel Online International'
-    __author__ = 'Darko Miletic'
+    __author__ = 'Darko Miletic and Sujata Raman'
     description = "News and POV from Europe's largest newsmagazine"
     oldest_article = 7
     max_articles_per_feed = 100
@@ -21,8 +22,9 @@ class Spiegel_int(BasicNewsRecipe):
     publisher = 'SPIEGEL ONLINE GmbH'
     category = 'news, politics, Germany'
     lang = 'en'
-
-    conversion_options = {
+    recursions = 1
+    match_regexps = [r'http://www.spiegel.de/.*-[1-9],00.html']
+    conversion_options = {
                           'comments' : description
                          ,'tags' : category
                          ,'language' : lang
@@ -30,11 +32,63 @@ class Spiegel_int(BasicNewsRecipe):
                          ,'pretty_print': True
                       }
 
-    remove_tags_after = dict(name='div', attrs={'id':'spArticleBody'})
+    extra_css = '''
+        #spArticleColumn{font-family:verdana,arial,helvetica,geneva,sans-serif ; }
+        h1{color:#666666; font-weight:bold;}
+        h2{color:#990000;}
+        h3{color:#990000;}
+        h4 {color:#990000;}
+        a{color:#990000;}
+        .spAuthor{font-style:italic;}
+        #spIntroTeaser{font-weight:bold;}
+        .spCredit{color:#666666; font-size:x-small;}
+        .spShortDate{font-size:x-small;}
+        .spArticleImageBox {font-size:x-small;}
+        .spPhotoGallery{font-size:x-small; color:#990000 ;}
+        '''
+
+    keep_only_tags = [
+        dict(name ='div', attrs={'id': ['spArticleImageBox spAssetAlignleft','spArticleColumn']}),
+        ]
+
+    remove_tags = [
+        dict(name='div', attrs={'id':['spSocialBookmark','spArticleFunctions','spMultiPagerHeadlines',]}),
+        dict(name='div', attrs={'class':['spCommercial spM520','spArticleCredit','spPicZoom']}),
+        ]
 
     feeds = [(u'Spiegel Online', u'http://www.spiegel.de/schlagzeilen/rss/0,5291,676,00.xml')]
 
-    def print_version(self, url):
-        main, sep, rest = url.rpartition(',')
-        rmain, rsep, rrest = main.rpartition(',')
-        return rmain + ',druck-' + rrest + ',' + rest
+    def postprocess_html(self, soup, first):
+
+        for tag in soup.findAll(name='div', attrs={'id':"spMultiPagerControl"}):
+            tag.extract()
+
+        p = soup.find(name = 'p', attrs={'id':'spIntroTeaser'})
+
+        if p.string is not None:
+            t = p.string.rpartition(':')[0]
+
+            if 'Part' in t:
+                if soup.h1 is not None:
+                    soup.h1.extract()
+                if soup.h2 is not None:
+                    soup.h2.extract()
+                functag = soup.find(name= 'div', attrs={'id':"spArticleFunctions"})
+                if functag is not None:
+                    functag.extract()
+                auttag = soup.find(name= 'p', attrs={'class':"spAuthor"})
+                if auttag is not None:
+                    auttag.extract()
+
+                pictag = soup.find(name= 'div', attrs={'id':"spArticleTopAsset"})
+                if pictag is not None:
+                    pictag.extract()
+
+
+        return soup
+
+    # def print_version(self, url):
+    #     main, sep, rest = url.rpartition(',')
+    #     rmain, rsep, rrest = main.rpartition(',')
+    #     return rmain + ',druck-' + rrest + ',' + rest
+
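For reference, a minimal standalone sketch (not part of the patch) of how the new `recursions = 1` / `match_regexps` pair in spiegel_int.recipe is meant to behave: during recursion calibre only follows links whose URL matches one of the expressions, which for Spiegel selects the `-2`, `-3`, … continuation pages of a multi-page article rather than the first page; the new `postprocess_html` then strips the repeated "Part N:" headline, author line and top image from those follow-on pages. The URLs below are made-up examples in the Spiegel URL format, not taken from the feed.

```python
# Sketch only: shows which (hypothetical) Spiegel URLs the match_regexps
# pattern from the patch above would allow calibre to recurse into.
import re

pattern = re.compile(r'http://www.spiegel.de/.*-[1-9],00.html')

urls = [
    'http://www.spiegel.de/international/europe/0,1518,123456,00.html',    # first page: no match
    'http://www.spiegel.de/international/europe/0,1518,123456-2,00.html',  # "page 2" URL: match
]
for url in urls:
    print(url, '->', bool(pattern.search(url)))
```

With `recursions = 1` only one level of such links is fetched from each article page, so the stitched result is the article plus its matching continuation pages with their duplicated header blocks removed.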