From f50122805cd26020cbcaf6875cb468025f93d24f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Jan 2012 08:50:27 +0530 Subject: [PATCH] Update New Scientist --- recipes/new_scientist.recipe | 55 ++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/recipes/new_scientist.recipe b/recipes/new_scientist.recipe index 434c41f525..1bfe27685f 100644 --- a/recipes/new_scientist.recipe +++ b/recipes/new_scientist.recipe @@ -1,16 +1,35 @@ -__license__ = 'GPL v3' -__copyright__ = '2008-2010, AprilHare, Darko Miletic ' +## +## Title: New Scientist RSS recipe +## Contact: AprilHare, Darko Miletic +## +## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html +## Copyright: 2008-2010, AprilHare, Darko Miletic +## +## Written: 2008 +## Last Edited: Jan 2012 +## + +''' +01-19-2012: Added GrayScale Image conversion and duplicate article removals +''' + +__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' +__copyright__ = '2008-2012, AprilHare, Darko Miletic ' +__version__ = 'v0.5.0' +__date__ = '2012-01-19' +__author__ = 'Darko Miletic' + ''' newscientist.com ''' import re import urllib +from calibre.utils.magick import Image from calibre.web.feeds.news import BasicNewsRecipe class NewScientist(BasicNewsRecipe): title = 'New Scientist - Online News w. subscription' - __author__ = 'Darko Miletic' description = 'Science news and science articles from New Scientist.' language = 'en' publisher = 'Reed Business Information Ltd.' @@ -39,10 +58,19 @@ class NewScientist(BasicNewsRecipe): keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})] + # Whether to omit duplicates of articles (typically arising when articles are indexed in + # more than one section). If True, only the first occurrence will be downloaded. + filterDuplicates = True + + # Whether to convert images to grayscale for eInk readers. 
+ Convert_Grayscale = False + + url_list = [] # This list is used to check if an article had already been included. + def get_browser(self): br = BasicNewsRecipe.get_browser() br.open('http://www.newscientist.com/') - if self.username is not None and self.password is not None: + if self.username is not None and self.password is not None: br.open('https://www.newscientist.com/user/login') data = urllib.urlencode({ 'source':'form' ,'redirectURL':'' @@ -80,6 +108,10 @@ class NewScientist(BasicNewsRecipe): return article.get('guid', None) def print_version(self, url): + if self.filterDuplicates: + if url in self.url_list: + return + self.url_list.append(url) return url + '?full=true&print=true' def preprocess_html(self, soup): @@ -91,7 +123,7 @@ class NewScientist(BasicNewsRecipe): item.name='p' for item in soup.findAll(['xref','figref']): tstr = item.string - item.replaceWith(tstr) + item.replaceWith(tstr) for tg in soup.findAll('a'): if tg.string == 'Home': tg.parent.extract() @@ -101,3 +133,16 @@ class NewScientist(BasicNewsRecipe): tg.replaceWith(tstr) return soup + # Converts images to Gray Scale + def postprocess_html(self, soup, first): + if self.Convert_Grayscale: + #process all the images + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + iurl = tag['src'] + img = Image() + img.open(iurl) + if img < 0: + raise RuntimeError('Out of memory') + img.type = "GrayscaleType" + img.save(iurl) + return soup